contextpilot 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextpilot-0.3.2/PKG-INFO +239 -0
- contextpilot-0.3.2/README.md +198 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/__init__.py +4 -1
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/context_index/index_construction.py +81 -9
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/context_ordering/inter_scheduler.py +10 -9
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/retriever/mem0_retriever.py +1 -1
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/server/http_client.py +37 -82
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/server/http_server.py +175 -112
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/server/live_index.py +324 -55
- contextpilot-0.3.2/contextpilot.egg-info/PKG-INFO +239 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/pyproject.toml +1 -1
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_context_ordering.py +23 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_cpu_distances.py +0 -38
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_multi_turn_e2e.py +41 -36
- contextpilot-0.3.0/PKG-INFO +0 -180
- contextpilot-0.3.0/README.md +0 -139
- contextpilot-0.3.0/contextpilot.egg-info/PKG-INFO +0 -180
- {contextpilot-0.3.0 → contextpilot-0.3.2}/LICENSE +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/context_index/__init__.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/context_index/compute_distance_cpu.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/context_index/compute_distance_gpu.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/context_index/tree_nodes.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/context_ordering/__init__.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/context_ordering/intra_ordering.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/pipeline/__init__.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/pipeline/components.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/pipeline/multi_turn.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/pipeline/rag_pipeline.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/retriever/__init__.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/retriever/bm25.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/retriever/faiss_embedding.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/retriever/pageindex_retriever.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/server/__init__.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/server/conversation_tracker.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/server/eviction_heap.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/server/metadata.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/utils/__init__.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/utils/eval_metrics.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/utils/prompt_generator.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot/utils/tools.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot.egg-info/SOURCES.txt +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot.egg-info/dependency_links.txt +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot.egg-info/requires.txt +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/contextpilot.egg-info/top_level.txt +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/requirements.txt +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/setup.cfg +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_context_index.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_deduplication.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_gpu_distance_performance.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_gpu_distances.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_group_prefix_sharing.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_incremental_build.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_live_index.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_mem0_integration.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_multi_turn.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_pageindex_integration.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_performance.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_pipeline.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_server_integration.py +0 -0
- {contextpilot-0.3.0 → contextpilot-0.3.2}/tests/test_utils.py +0 -0
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: contextpilot
|
|
3
|
+
Version: 0.3.2
|
|
4
|
+
Summary: Efficient Retrieval-Augmented Generation with Accuracy-Preserving Context Reuse
|
|
5
|
+
Author: Yinsicheng Jiang, Chivier Humber
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/SecretSettler/ContextPilot
|
|
8
|
+
Project-URL: Repository, https://github.com/SecretSettler/ContextPilot
|
|
9
|
+
Project-URL: Issues, https://github.com/SecretSettler/ContextPilot/issues
|
|
10
|
+
Keywords: rag,llm,context-reuse,kv-cache,retrieval-augmented-generation
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: datasets
|
|
24
|
+
Requires-Dist: transformers
|
|
25
|
+
Requires-Dist: elasticsearch==8.18.1
|
|
26
|
+
Requires-Dist: aiohttp
|
|
27
|
+
Requires-Dist: ujson
|
|
28
|
+
Requires-Dist: scipy
|
|
29
|
+
Requires-Dist: fastapi[all]
|
|
30
|
+
Requires-Dist: cupy-cuda12x
|
|
31
|
+
Requires-Dist: pytest
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: black; extra == "dev"
|
|
34
|
+
Requires-Dist: bumpver; extra == "dev"
|
|
35
|
+
Requires-Dist: isort; extra == "dev"
|
|
36
|
+
Requires-Dist: pip-tools; extra == "dev"
|
|
37
|
+
Requires-Dist: pytest; extra == "dev"
|
|
38
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
39
|
+
Requires-Dist: ipython; extra == "dev"
|
|
40
|
+
Dynamic: license-file
|
|
41
|
+
|
|
42
|
+
<div align="center">
|
|
43
|
+
<img src="assets/about.png" alt="ContextPilot Logo" width="800"/>
|
|
44
|
+
|
|
45
|
+
<h1><strong>ContextPilot: Efficient Long Context Inference with Context Reuse</strong></h1>
|
|
46
|
+
|
|
47
|
+
[Python](https://www.python.org/)
|
|
48
|
+
[PyPI](https://pypi.org/project/contextpilot/)
|
|
49
|
+
[License](LICENSE)
|
|
50
|
+
|
|
51
|
+
</div>
|
|
52
|
+
|
|
53
|
+
--------------------------------------------------------------------------------
|
|
54
|
+
|
|
55
|
+
| [**Documentation**](docs/README.md) | [**Examples**](examples/) | [**Benchmarks**](docs/reference/benchmarks.md) |
|
|
56
|
+
|
|
57
|
+
## News
|
|
58
|
+
|
|
59
|
+
- [2026/02] ContextPilot v0.3.2 released, supporting [PageIndex](https://github.com/VectifyAI/PageIndex) and [Mem0](https://github.com/mem0ai/mem0).
|
|
60
|
+
- [2026/01] ContextPilot has been accepted to MLSys 2026 🎉! See you in Bellevue, WA, USA.
|
|
61
|
+
- [2025/12] ContextPilot v0.2.0 released.
|
|
62
|
+
|
|
63
|
+
## About
|
|
64
|
+
|
|
65
|
+
ContextPilot is a fast optimization system that operates at the context-engineering layer for agentic workloads:
|
|
66
|
+
1. **High Throughput & Cache Hit Ratio**: Boosting prefill throughput and prefix cache hit ratio with intelligent context reuse.
|
|
67
|
+
2. **Strong Compatibility**: Works with popular RAG libraries (PageIndex), agentic memory layers (Mem0), KV cache optimization engines (LMCache), and inference engines (vLLM and SGLang).
|
|
68
|
+
3. **Negligible Accuracy Loss**: Achieving significant performance improvements with minimal to no accuracy degradation across various benchmarks.
|
|
69
|
+
4. **Widely Tested**: Tested with a wide range of RAG and Agentic AI applications.
|
|
70
|
+
|
|
71
|
+
## Target Workloads
|
|
72
|
+
|
|
73
|
+
1. **Trending Topic QA** — Search and generation for breaking news and hot topics beyond model knowledge
|
|
74
|
+
2. **Closed-Domain Long-Context QA** — QA over specialized corpora (novels, financial reports, legal documents) with retrieval or in-context search
|
|
75
|
+
3. **Large-Batch Long-Context Execution** — High-throughput inference where many requests share overlapping contexts; ContextPilot maximizes prefix reuse regardless of the search method
|
|
76
|
+
4. **Multi-Turn Conversations with Long-Term Memory** — Persistent context reuse across turns (e.g. [Mem0](https://github.com/mem0ai/mem0))
|
|
77
|
+
|
|
78
|
+
## Benchmark and Performance
|
|
79
|
+
|
|
80
|
+
### System Performance
|
|
81
|
+
|
|
82
|
+
<div align="center">
|
|
83
|
+
<img src="assets/deepseek_r1_results.png" alt="Benchmark Results" width="600"/>
|
|
84
|
+
</div>
|
|
85
|
+
|
|
86
|
+
ContextPilot (Stateless) on DeepSeek-R1 maintains accuracy compared to SGLang, achieving 64.68% vs 64.15% F1 on MultihopRAG and 41.08% vs 40.20% F1 on NarrativeQA.
|
|
87
|
+
|
|
88
|
+
### Accuracy on MT-RAG Benchmark (Online Scheduling)
|
|
89
|
+
|
|
90
|
+
<div align="center">
|
|
91
|
+
|
|
92
|
+
| Method | Qwen3-4B | Llama3.1-8B | Qwen3-30B-A3B |
|
|
93
|
+
|--------|----------|-------------|-----------|
|
|
94
|
+
| LMCache | 62.56 | **68.46** | 75.12 |
|
|
95
|
+
| CacheBlend | 50.33 | 56.52 | X |
|
|
96
|
+
| RadixCache | 62.56 | **68.46** | 75.12 |
|
|
97
|
+
| **ContextPilot** | **64.27** | 68.12 | **75.81** |
|
|
98
|
+
|
|
99
|
+
</div>
|
|
100
|
+
|
|
101
|
+
ContextPilot delivers **4-13x** improvements in cache hit rates and **1.5-3.5x** reductions in prefill latency for large-batch RAG workloads, while maintaining or improving accuracy.
|
|
102
|
+
|
|
103
|
+
**Furthermore**, ContextPilot has been tested to reduce input token costs by around **36%** with GPT-5.2.
|
|
104
|
+
|
|
105
|
+
See [Benchmarks](docs/reference/benchmarks.md) in the documentation for GPU vs CPU performance analysis and detailed benchmark methodology.
|
|
106
|
+
|
|
107
|
+
## Getting Started
|
|
108
|
+
|
|
109
|
+
### Installation
|
|
110
|
+
|
|
111
|
+
**Requirements:** Python >= 3.10
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
pip install contextpilot
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
Or install from source:
|
|
118
|
+
```bash
|
|
119
|
+
git clone https://github.com/Edinburgh-AgenticAI/ContextPilot.git
|
|
120
|
+
cd ContextPilot
|
|
121
|
+
pip install -e .
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
More [detailed installation instructions](docs/getting_started/installation.md) are available in the docs.
|
|
125
|
+
|
|
126
|
+
### Quick Start
|
|
127
|
+
|
|
128
|
+
**Offline / Online Stateless** — build index & schedule in one shot:
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
from openai import OpenAI
|
|
132
|
+
import contextpilot as cp
|
|
133
|
+
|
|
134
|
+
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...") # Your inference engine URL and API key
|
|
135
|
+
|
|
136
|
+
queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
|
|
137
|
+
all_contexts = [
|
|
138
|
+
["Doc about AI", "Doc about ML", "Doc about computing"],
|
|
139
|
+
["Doc about neural nets", "Doc about deep learning"],
|
|
140
|
+
["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
|
|
141
|
+
]
|
|
142
|
+
|
|
143
|
+
# Build index and schedule for prefix sharing
|
|
144
|
+
index = cp.build_context_index(all_contexts, use_gpu=False)
|
|
145
|
+
reordered, _, order, _ = cp.InterContextScheduler().schedule_contexts(index)
|
|
146
|
+
|
|
147
|
+
# Send in optimized order — shared prefixes hit KV cache
|
|
148
|
+
for ctx, orig_idx in zip(reordered, order):
|
|
149
|
+
docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
|
|
150
|
+
# Importance ranking restores original retrieval order for the model
|
|
151
|
+
importance_ranking = ">".join(
|
|
152
|
+
str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
|
|
153
|
+
)
|
|
154
|
+
response = client.chat.completions.create(
|
|
155
|
+
model="Qwen/Qwen3-4B",
|
|
156
|
+
messages=[
|
|
157
|
+
{"role": "system", "content": (
|
|
158
|
+
f"Answer the question based on the provided documents.\n\n"
|
|
159
|
+
f"<documents>\n{docs_section}\n</documents>\n\n"
|
|
160
|
+
f"Read the documents in this importance ranking: {importance_ranking}\n"
|
|
161
|
+
f"Prioritize information from higher-ranked documents."
|
|
162
|
+
)},
|
|
163
|
+
{"role": "user", "content": queries[orig_idx]},
|
|
164
|
+
],
|
|
165
|
+
)
|
|
166
|
+
print(f"Q: {queries[orig_idx]}\nA: {response.choices[0].message.content}\n")
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
> For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
|
|
170
|
+
|
|
171
|
+
**Stateful** — `LiveContextIndex` tracks cached state:
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
from openai import OpenAI
|
|
175
|
+
import contextpilot as cp
|
|
176
|
+
|
|
177
|
+
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
|
|
178
|
+
live = cp.LiveContextIndex(use_gpu=False)
|
|
179
|
+
|
|
180
|
+
# Simulate multi-turn: each turn has batch_size=1
|
|
181
|
+
turns = [
|
|
182
|
+
{
|
|
183
|
+
"query": "What is AI?",
|
|
184
|
+
"contexts": [["Doc about AI", "Doc about ML", "Doc about computing"]],
|
|
185
|
+
},
|
|
186
|
+
{
|
|
187
|
+
"query": "Compare supervised and unsupervised learning",
|
|
188
|
+
# 2 of 3 docs overlap with Turn 1 ("Doc about AI", "Doc about ML"), different order + 1 new doc
|
|
189
|
+
"contexts": [["Doc about ML", "Doc about clustering", "Doc about AI"]],
|
|
190
|
+
},
|
|
191
|
+
]
|
|
192
|
+
|
|
193
|
+
for turn_idx, turn in enumerate(turns):
|
|
194
|
+
contexts = turn["contexts"]
|
|
195
|
+
query = turn["query"]
|
|
196
|
+
|
|
197
|
+
# build_incremental handles both cold start and incremental turns
|
|
198
|
+
result = live.build_incremental(contexts)
|
|
199
|
+
reordered = result['reordered_contexts']
|
|
200
|
+
# Turn 2: reordered to ["Doc about AI", "Doc about ML", "Doc about clustering"]
|
|
201
|
+
# ^— shared prefix from Turn 1 —^ ^— new doc appended
|
|
202
|
+
|
|
203
|
+
ctx = reordered[0]
|
|
204
|
+
docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
|
|
205
|
+
importance_ranking = ">".join(
|
|
206
|
+
str(ctx.index(doc) + 1) for doc in contexts[0] if doc in ctx
|
|
207
|
+
)
|
|
208
|
+
response = client.chat.completions.create(
|
|
209
|
+
model="Qwen/Qwen3-4B",
|
|
210
|
+
messages=[
|
|
211
|
+
{"role": "system", "content": (
|
|
212
|
+
f"Answer the question based on the provided documents.\n\n"
|
|
213
|
+
f"<documents>\n{docs_section}\n</documents>\n\n"
|
|
214
|
+
f"Read the documents in this importance ranking: {importance_ranking}\n"
|
|
215
|
+
f"Prioritize information from higher-ranked documents."
|
|
216
|
+
)},
|
|
217
|
+
{"role": "user", "content": query},
|
|
218
|
+
],
|
|
219
|
+
)
|
|
220
|
+
print(f"[Turn {turn_idx+1}] Q: {query}")
|
|
221
|
+
print(f"A: {response.choices[0].message.content}\n")
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
> **Note:** Stateful mode works without eviction sync — `LiveContextIndex` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited storage size where the KV cache may evict entries, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
|
|
225
|
+
|
|
226
|
+
## Documentation
|
|
227
|
+
|
|
228
|
+
Check out the ContextPilot [documentation](docs/README.md) for comprehensive guides.
|
|
229
|
+
|
|
230
|
+
## Examples
|
|
231
|
+
|
|
232
|
+
Go hands-on with our [examples](examples/), demonstrating how to address different use cases with ContextPilot.
|
|
233
|
+
|
|
234
|
+
## Contributing
|
|
235
|
+
|
|
236
|
+
We welcome and value all contributions! Please feel free to submit issues and pull requests.
|
|
237
|
+
|
|
238
|
+
## Citation
|
|
239
|
+
We will include the paper citation soon!
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<img src="assets/about.png" alt="ContextPilot Logo" width="800"/>
|
|
3
|
+
|
|
4
|
+
<h1><strong>ContextPilot: Efficient Long Context Inference with Context Reuse</strong></h1>
|
|
5
|
+
|
|
6
|
+
[Python](https://www.python.org/)
|
|
7
|
+
[PyPI](https://pypi.org/project/contextpilot/)
|
|
8
|
+
[License](LICENSE)
|
|
9
|
+
|
|
10
|
+
</div>
|
|
11
|
+
|
|
12
|
+
--------------------------------------------------------------------------------
|
|
13
|
+
|
|
14
|
+
| [**Documentation**](docs/README.md) | [**Examples**](examples/) | [**Benchmarks**](docs/reference/benchmarks.md) |
|
|
15
|
+
|
|
16
|
+
## News
|
|
17
|
+
|
|
18
|
+
- [2026/02] ContextPilot v0.3.2 released, supporting [PageIndex](https://github.com/VectifyAI/PageIndex) and [Mem0](https://github.com/mem0ai/mem0).
|
|
19
|
+
- [2026/01] ContextPilot has been accepted to MLSys 2026 🎉! See you in Bellevue, WA, USA.
|
|
20
|
+
- [2025/12] ContextPilot v0.2.0 released.
|
|
21
|
+
|
|
22
|
+
## About
|
|
23
|
+
|
|
24
|
+
ContextPilot is a fast optimization system that operates at the context-engineering layer for agentic workloads:
|
|
25
|
+
1. **High Throughput & Cache Hit Ratio**: Boosting prefill throughput and prefix cache hit ratio with intelligent context reuse.
|
|
26
|
+
2. **Strong Compatibility**: Works with popular RAG libraries (PageIndex), agentic memory layers (Mem0), KV cache optimization engines (LMCache), and inference engines (vLLM and SGLang).
|
|
27
|
+
3. **Negligible Accuracy Loss**: Achieving significant performance improvements with minimal to no accuracy degradation across various benchmarks.
|
|
28
|
+
4. **Widely Tested**: Tested with a wide range of RAG and Agentic AI applications.
|
|
29
|
+
|
|
30
|
+
## Target Workloads
|
|
31
|
+
|
|
32
|
+
1. **Trending Topic QA** — Search and generation for breaking news and hot topics beyond model knowledge
|
|
33
|
+
2. **Closed-Domain Long-Context QA** — QA over specialized corpora (novels, financial reports, legal documents) with retrieval or in-context search
|
|
34
|
+
3. **Large-Batch Long-Context Execution** — High-throughput inference where many requests share overlapping contexts; ContextPilot maximizes prefix reuse regardless of the search method
|
|
35
|
+
4. **Multi-Turn Conversations with Long-Term Memory** — Persistent context reuse across turns (e.g. [Mem0](https://github.com/mem0ai/mem0))
|
|
36
|
+
|
|
37
|
+
## Benchmark and Performance
|
|
38
|
+
|
|
39
|
+
### System Performance
|
|
40
|
+
|
|
41
|
+
<div align="center">
|
|
42
|
+
<img src="assets/deepseek_r1_results.png" alt="Benchmark Results" width="600"/>
|
|
43
|
+
</div>
|
|
44
|
+
|
|
45
|
+
ContextPilot (Stateless) on DeepSeek-R1 maintains accuracy compared to SGLang, achieving 64.68% vs 64.15% F1 on MultihopRAG and 41.08% vs 40.20% F1 on NarrativeQA.
|
|
46
|
+
|
|
47
|
+
### Accuracy on MT-RAG Benchmark (Online Scheduling)
|
|
48
|
+
|
|
49
|
+
<div align="center">
|
|
50
|
+
|
|
51
|
+
| Method | Qwen3-4B | Llama3.1-8B | Qwen3-30B-A3B |
|
|
52
|
+
|--------|----------|-------------|-----------|
|
|
53
|
+
| LMCache | 62.56 | **68.46** | 75.12 |
|
|
54
|
+
| CacheBlend | 50.33 | 56.52 | X |
|
|
55
|
+
| RadixCache | 62.56 | **68.46** | 75.12 |
|
|
56
|
+
| **ContextPilot** | **64.27** | 68.12 | **75.81** |
|
|
57
|
+
|
|
58
|
+
</div>
|
|
59
|
+
|
|
60
|
+
ContextPilot delivers **4-13x** improvements in cache hit rates and **1.5-3.5x** reductions in prefill latency for large-batch RAG workloads, while maintaining or improving accuracy.
|
|
61
|
+
|
|
62
|
+
**Furthermore**, ContextPilot has been tested to reduce input token costs by around **36%** with GPT-5.2.
|
|
63
|
+
|
|
64
|
+
See [Benchmarks](docs/reference/benchmarks.md) in the documentation for GPU vs CPU performance analysis and detailed benchmark methodology.
|
|
65
|
+
|
|
66
|
+
## Getting Started
|
|
67
|
+
|
|
68
|
+
### Installation
|
|
69
|
+
|
|
70
|
+
**Requirements:** Python >= 3.10
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install contextpilot
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Or install from source:
|
|
77
|
+
```bash
|
|
78
|
+
git clone https://github.com/Edinburgh-AgenticAI/ContextPilot.git
|
|
79
|
+
cd ContextPilot
|
|
80
|
+
pip install -e .
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
More [detailed installation instructions](docs/getting_started/installation.md) are available in the docs.
|
|
84
|
+
|
|
85
|
+
### Quick Start
|
|
86
|
+
|
|
87
|
+
**Offline / Online Stateless** — build index & schedule in one shot:
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from openai import OpenAI
|
|
91
|
+
import contextpilot as cp
|
|
92
|
+
|
|
93
|
+
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...") # Your inference engine URL and API key
|
|
94
|
+
|
|
95
|
+
queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
|
|
96
|
+
all_contexts = [
|
|
97
|
+
["Doc about AI", "Doc about ML", "Doc about computing"],
|
|
98
|
+
["Doc about neural nets", "Doc about deep learning"],
|
|
99
|
+
["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
|
|
100
|
+
]
|
|
101
|
+
|
|
102
|
+
# Build index and schedule for prefix sharing
|
|
103
|
+
index = cp.build_context_index(all_contexts, use_gpu=False)
|
|
104
|
+
reordered, _, order, _ = cp.InterContextScheduler().schedule_contexts(index)
|
|
105
|
+
|
|
106
|
+
# Send in optimized order — shared prefixes hit KV cache
|
|
107
|
+
for ctx, orig_idx in zip(reordered, order):
|
|
108
|
+
docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
|
|
109
|
+
# Importance ranking restores original retrieval order for the model
|
|
110
|
+
importance_ranking = ">".join(
|
|
111
|
+
str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
|
|
112
|
+
)
|
|
113
|
+
response = client.chat.completions.create(
|
|
114
|
+
model="Qwen/Qwen3-4B",
|
|
115
|
+
messages=[
|
|
116
|
+
{"role": "system", "content": (
|
|
117
|
+
f"Answer the question based on the provided documents.\n\n"
|
|
118
|
+
f"<documents>\n{docs_section}\n</documents>\n\n"
|
|
119
|
+
f"Read the documents in this importance ranking: {importance_ranking}\n"
|
|
120
|
+
f"Prioritize information from higher-ranked documents."
|
|
121
|
+
)},
|
|
122
|
+
{"role": "user", "content": queries[orig_idx]},
|
|
123
|
+
],
|
|
124
|
+
)
|
|
125
|
+
print(f"Q: {queries[orig_idx]}\nA: {response.choices[0].message.content}\n")
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
> For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
|
|
129
|
+
|
|
130
|
+
**Stateful** — `LiveContextIndex` tracks cached state:
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from openai import OpenAI
|
|
134
|
+
import contextpilot as cp
|
|
135
|
+
|
|
136
|
+
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
|
|
137
|
+
live = cp.LiveContextIndex(use_gpu=False)
|
|
138
|
+
|
|
139
|
+
# Simulate multi-turn: each turn has batch_size=1
|
|
140
|
+
turns = [
|
|
141
|
+
{
|
|
142
|
+
"query": "What is AI?",
|
|
143
|
+
"contexts": [["Doc about AI", "Doc about ML", "Doc about computing"]],
|
|
144
|
+
},
|
|
145
|
+
{
|
|
146
|
+
"query": "Compare supervised and unsupervised learning",
|
|
147
|
+
# 2 of 3 docs overlap with Turn 1 ("Doc about AI", "Doc about ML"), different order + 1 new doc
|
|
148
|
+
"contexts": [["Doc about ML", "Doc about clustering", "Doc about AI"]],
|
|
149
|
+
},
|
|
150
|
+
]
|
|
151
|
+
|
|
152
|
+
for turn_idx, turn in enumerate(turns):
|
|
153
|
+
contexts = turn["contexts"]
|
|
154
|
+
query = turn["query"]
|
|
155
|
+
|
|
156
|
+
# build_incremental handles both cold start and incremental turns
|
|
157
|
+
result = live.build_incremental(contexts)
|
|
158
|
+
reordered = result['reordered_contexts']
|
|
159
|
+
# Turn 2: reordered to ["Doc about AI", "Doc about ML", "Doc about clustering"]
|
|
160
|
+
# ^— shared prefix from Turn 1 —^ ^— new doc appended
|
|
161
|
+
|
|
162
|
+
ctx = reordered[0]
|
|
163
|
+
docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
|
|
164
|
+
importance_ranking = ">".join(
|
|
165
|
+
str(ctx.index(doc) + 1) for doc in contexts[0] if doc in ctx
|
|
166
|
+
)
|
|
167
|
+
response = client.chat.completions.create(
|
|
168
|
+
model="Qwen/Qwen3-4B",
|
|
169
|
+
messages=[
|
|
170
|
+
{"role": "system", "content": (
|
|
171
|
+
f"Answer the question based on the provided documents.\n\n"
|
|
172
|
+
f"<documents>\n{docs_section}\n</documents>\n\n"
|
|
173
|
+
f"Read the documents in this importance ranking: {importance_ranking}\n"
|
|
174
|
+
f"Prioritize information from higher-ranked documents."
|
|
175
|
+
)},
|
|
176
|
+
{"role": "user", "content": query},
|
|
177
|
+
],
|
|
178
|
+
)
|
|
179
|
+
print(f"[Turn {turn_idx+1}] Q: {query}")
|
|
180
|
+
print(f"A: {response.choices[0].message.content}\n")
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
> **Note:** Stateful mode works without eviction sync — `LiveContextIndex` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited storage size where the KV cache may evict entries, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
|
|
184
|
+
|
|
185
|
+
## Documentation
|
|
186
|
+
|
|
187
|
+
Check out the ContextPilot [documentation](docs/README.md) for comprehensive guides.
|
|
188
|
+
|
|
189
|
+
## Examples
|
|
190
|
+
|
|
191
|
+
Go hands-on with our [examples](examples/), demonstrating how to address different use cases with ContextPilot.
|
|
192
|
+
|
|
193
|
+
## Contributing
|
|
194
|
+
|
|
195
|
+
We welcome and value all contributions! Please feel free to submit issues and pull requests.
|
|
196
|
+
|
|
197
|
+
## Citation
|
|
198
|
+
We will include the paper citation soon!
|
|
@@ -38,6 +38,8 @@ from .context_ordering import (
|
|
|
38
38
|
InterContextScheduler,
|
|
39
39
|
)
|
|
40
40
|
|
|
41
|
+
from .server.live_index import LiveContextIndex
|
|
42
|
+
|
|
41
43
|
from .retriever import (
|
|
42
44
|
BM25Retriever,
|
|
43
45
|
FAISSRetriever,
|
|
@@ -47,7 +49,7 @@ from .retriever import (
|
|
|
47
49
|
MEM0_AVAILABLE,
|
|
48
50
|
)
|
|
49
51
|
|
|
50
|
-
__version__ = "0.2
|
|
52
|
+
__version__ = "0.3.2"
|
|
51
53
|
|
|
52
54
|
__all__ = [
|
|
53
55
|
# High-level pipeline API
|
|
@@ -63,6 +65,7 @@ __all__ = [
|
|
|
63
65
|
'build_context_index',
|
|
64
66
|
'IntraContextOrderer',
|
|
65
67
|
'InterContextScheduler',
|
|
68
|
+
'LiveContextIndex',
|
|
66
69
|
|
|
67
70
|
# Retrievers
|
|
68
71
|
'BM25Retriever',
|
|
@@ -96,6 +96,12 @@ class ContextIndex:
|
|
|
96
96
|
self.node_manager = NodeManager()
|
|
97
97
|
self.context_orderer = IntraContextOrderer()
|
|
98
98
|
|
|
99
|
+
# String-to-int mapping (auto-populated when string inputs are given)
|
|
100
|
+
self._str_to_id: dict = {}
|
|
101
|
+
self._id_to_str: dict = {}
|
|
102
|
+
self._next_str_id: int = 0
|
|
103
|
+
self._is_string_input: bool = False
|
|
104
|
+
|
|
99
105
|
if self.use_gpu:
|
|
100
106
|
print("Using GPU for distance computation")
|
|
101
107
|
else:
|
|
@@ -104,16 +110,48 @@ class ContextIndex:
|
|
|
104
110
|
else:
|
|
105
111
|
print("Using CPU for distance computation")
|
|
106
112
|
|
|
107
|
-
def
|
|
113
|
+
def _convert_to_int(self, contexts):
|
|
114
|
+
"""Convert string contexts to integer IDs if needed."""
|
|
115
|
+
if not contexts or not contexts[0]:
|
|
116
|
+
return contexts
|
|
117
|
+
if isinstance(contexts[0][0], str):
|
|
118
|
+
self._is_string_input = True
|
|
119
|
+
converted = []
|
|
120
|
+
for ctx in contexts:
|
|
121
|
+
converted_ctx = []
|
|
122
|
+
for item in ctx:
|
|
123
|
+
sid = self._str_to_id.get(item)
|
|
124
|
+
if sid is None:
|
|
125
|
+
sid = self._next_str_id
|
|
126
|
+
self._str_to_id[item] = sid
|
|
127
|
+
self._id_to_str[sid] = item
|
|
128
|
+
self._next_str_id += 1
|
|
129
|
+
converted_ctx.append(sid)
|
|
130
|
+
converted.append(converted_ctx)
|
|
131
|
+
return converted
|
|
132
|
+
return contexts
|
|
133
|
+
|
|
134
|
+
def _convert_to_str(self, contexts):
|
|
135
|
+
"""Convert integer contexts back to strings if input was strings."""
|
|
136
|
+
if not self._is_string_input or not contexts:
|
|
137
|
+
return contexts
|
|
138
|
+
# Skip if already converted (e.g. from fit_transform output)
|
|
139
|
+
if contexts[0] and isinstance(contexts[0][0], str):
|
|
140
|
+
return contexts
|
|
141
|
+
return [[self._id_to_str[i] for i in ctx] for ctx in contexts]
|
|
142
|
+
|
|
143
|
+
def fit_transform(self, contexts) -> IndexResult:
|
|
108
144
|
"""
|
|
109
145
|
Perform clustering and return results.
|
|
110
146
|
|
|
111
147
|
Args:
|
|
112
|
-
contexts: List of contexts, where each
|
|
148
|
+
contexts: List of contexts, where each context is a list of chunk IDs (int) or strings.
|
|
149
|
+
String inputs are automatically converted to integer IDs.
|
|
113
150
|
|
|
114
151
|
Returns:
|
|
115
152
|
IndexResult object containing clustering results
|
|
116
153
|
"""
|
|
154
|
+
contexts = self._convert_to_int(contexts)
|
|
117
155
|
n = len(contexts)
|
|
118
156
|
|
|
119
157
|
if n < 2:
|
|
@@ -194,14 +232,41 @@ class ContextIndex:
|
|
|
194
232
|
)
|
|
195
233
|
|
|
196
234
|
def _handle_single_prompt(self, contexts: List[List[int]]) -> IndexResult:
|
|
197
|
-
"""Handle case with less than 2 contexts.
|
|
235
|
+
"""Handle case with less than 2 contexts.
|
|
236
|
+
|
|
237
|
+
Always creates an empty root node above the leaf(s) so that
|
|
238
|
+
leaf.is_root is never True. This prevents the root-exclusion
|
|
239
|
+
guard in build_incremental from skipping legitimate matches.
|
|
240
|
+
"""
|
|
198
241
|
for i, prompt in enumerate(contexts):
|
|
199
|
-
self.node_manager.create_leaf_node(i, prompt)
|
|
242
|
+
node = self.node_manager.create_leaf_node(i, prompt)
|
|
243
|
+
# ClusterNode.__init__ sets doc_ids = sorted(content).
|
|
244
|
+
# Override to preserve the original context order so that
|
|
245
|
+
# build_incremental can use it as a correct prefix for Turn 2.
|
|
246
|
+
node.doc_ids = list(prompt)
|
|
247
|
+
|
|
248
|
+
# Wrap leaf node(s) under an empty root so that no leaf is the root.
|
|
249
|
+
# This mirrors the virtual-root logic in update_search_paths for forests,
|
|
250
|
+
# but applies it even for a single leaf.
|
|
251
|
+
leaf_ids = list(self.node_manager.unique_nodes.keys())
|
|
252
|
+
virtual_root_id = max(leaf_ids) + 1 if leaf_ids else 0
|
|
253
|
+
virtual_root = ClusterNode(
|
|
254
|
+
node_id=virtual_root_id,
|
|
255
|
+
content=set(),
|
|
256
|
+
original_indices=set(),
|
|
257
|
+
distance=0.0,
|
|
258
|
+
children=leaf_ids,
|
|
259
|
+
parent=None,
|
|
260
|
+
frequency=sum(self.node_manager.unique_nodes[nid].frequency for nid in leaf_ids)
|
|
261
|
+
)
|
|
262
|
+
self.node_manager.unique_nodes[virtual_root_id] = virtual_root
|
|
263
|
+
for nid in leaf_ids:
|
|
264
|
+
self.node_manager.unique_nodes[nid].parent = virtual_root_id
|
|
200
265
|
|
|
201
|
-
# Update search paths
|
|
266
|
+
# Update search paths (now a proper rooted tree)
|
|
202
267
|
self.node_manager.update_search_paths()
|
|
203
268
|
|
|
204
|
-
# For single context, extract search paths
|
|
269
|
+
# For single context, extract search paths
|
|
205
270
|
search_paths = self.context_orderer.extract_search_paths(
|
|
206
271
|
self.node_manager.unique_nodes, len(contexts)
|
|
207
272
|
)
|
|
@@ -233,7 +298,7 @@ class ContextIndex:
|
|
|
233
298
|
|
|
234
299
|
|
|
235
300
|
# Convenience function for backward compatibility
|
|
236
|
-
def build_context_index(contexts
|
|
301
|
+
def build_context_index(contexts,
|
|
237
302
|
linkage_method: str = "average",
|
|
238
303
|
use_gpu: bool = True,
|
|
239
304
|
alpha: float = 0.005,
|
|
@@ -243,7 +308,7 @@ def build_context_index(contexts: List[List[int]],
|
|
|
243
308
|
Convenience function for building a context index.
|
|
244
309
|
|
|
245
310
|
Args:
|
|
246
|
-
contexts: List of contexts, where each
|
|
311
|
+
contexts: List of contexts, where each context is a list of chunk IDs (int) or strings
|
|
247
312
|
linkage_method: Linkage method for hierarchical clustering
|
|
248
313
|
use_gpu: Whether to use GPU for distance computation
|
|
249
314
|
alpha: Weight for position term in distance calculation
|
|
@@ -260,4 +325,11 @@ def build_context_index(contexts: List[List[int]],
|
|
|
260
325
|
num_workers=num_workers,
|
|
261
326
|
batch_size=batch_size
|
|
262
327
|
)
|
|
263
|
-
|
|
328
|
+
result = indexer.fit_transform(contexts)
|
|
329
|
+
# Convert back to strings at the API boundary if input was strings
|
|
330
|
+
if indexer._is_string_input:
|
|
331
|
+
result.reordered_contexts = indexer._convert_to_str(result.reordered_contexts)
|
|
332
|
+
result.original_contexts = indexer._convert_to_str(result.original_contexts)
|
|
333
|
+
result.reordered_prompts = result.reordered_contexts
|
|
334
|
+
result.original_prompts = result.original_contexts
|
|
335
|
+
return result
|
|
@@ -4,7 +4,7 @@ Inter-Context Scheduler (ContextPilot Paper Algorithm)
|
|
|
4
4
|
This module implements the scheduling algorithm described in the ContextPilot paper:
|
|
5
5
|
1. Reuses search paths obtained during context ordering (no redundant tree lookups)
|
|
6
6
|
2. Groups contexts by the first element of their search path, naturally separating cache regions
|
|
7
|
-
3. Sorts contexts within each group by path length
|
|
7
|
+
3. Sorts contexts within each group by path length descending, with lexicographic tiebreaker
|
|
8
8
|
|
|
9
9
|
This avoids the O(N log M) tree rescanning overhead of existing methods.
|
|
10
10
|
"""
|
|
@@ -20,7 +20,7 @@ class InterContextScheduler:
|
|
|
20
20
|
This scheduler:
|
|
21
21
|
- Reuses search paths obtained during context ordering (no redundant tree lookups)
|
|
22
22
|
- Groups contexts by the first element of their search path
|
|
23
|
-
- Sorts contexts within each group by path length descending
|
|
23
|
+
- Sorts contexts within each group by path length descending, with a lexicographic tiebreaker
|
|
24
24
|
|
|
25
25
|
Time complexity: O(N) grouping + O(N * L * log N) sorting over N contexts (L = tree depth)
|
|
26
26
|
(Independent of tree size M, unlike traditional O(N log M) + O(N log N) methods)
|
|
@@ -109,17 +109,19 @@ class InterContextScheduler:
|
|
|
109
109
|
contexts: List[List[int]]
|
|
110
110
|
) -> List[List[int]]:
|
|
111
111
|
"""
|
|
112
|
-
Sort contexts within each group by path length
|
|
112
|
+
Sort contexts within each group by path length descending,
|
|
113
|
+
with lexicographic tiebreaker for equal-length paths.
|
|
113
114
|
|
|
114
|
-
|
|
115
|
-
|
|
115
|
+
Primary key (length descending): longer prefix matches first.
|
|
116
|
+
Secondary key (lexicographic): among equal-length paths, groups
|
|
117
|
+
contexts sharing deeper path prefixes adjacently to maximize the longest common prefix (LCP).
|
|
116
118
|
|
|
117
|
-
Total complexity: O(N log N) across all groups
|
|
119
|
+
Total complexity: O(N * L * log N) across all groups (L = tree depth)
|
|
118
120
|
|
|
119
121
|
Args:
|
|
120
122
|
groups_by_root: Groups of context indices by root prefix
|
|
121
123
|
search_paths: Search paths for each context
|
|
122
|
-
contexts: Reordered contexts (unused
|
|
124
|
+
contexts: Reordered contexts (unused)
|
|
123
125
|
|
|
124
126
|
Returns:
|
|
125
127
|
List of sorted groups
|
|
@@ -127,10 +129,9 @@ class InterContextScheduler:
|
|
|
127
129
|
sorted_groups = []
|
|
128
130
|
|
|
129
131
|
for root_prefix, group_indices in groups_by_root.items():
|
|
130
|
-
# Sort by path length in descending order, with index as tiebreaker
|
|
131
132
|
sorted_group = sorted(
|
|
132
133
|
group_indices,
|
|
133
|
-
key=lambda idx: (-len(search_paths[idx]), idx)
|
|
134
|
+
key=lambda idx: (-len(search_paths[idx]), search_paths[idx], idx)
|
|
134
135
|
)
|
|
135
136
|
sorted_groups.append(sorted_group)
|
|
136
137
|
|