contextpilot 0.3.1__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. contextpilot-0.3.2/PKG-INFO +239 -0
  2. contextpilot-0.3.2/README.md +198 -0
  3. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/__init__.py +4 -1
  4. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_index/index_construction.py +81 -9
  5. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/server/http_client.py +31 -17
  6. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/server/http_server.py +2 -1
  7. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/server/live_index.py +324 -55
  8. contextpilot-0.3.2/contextpilot.egg-info/PKG-INFO +239 -0
  9. {contextpilot-0.3.1 → contextpilot-0.3.2}/pyproject.toml +1 -1
  10. contextpilot-0.3.1/PKG-INFO +0 -137
  11. contextpilot-0.3.1/README.md +0 -96
  12. contextpilot-0.3.1/contextpilot.egg-info/PKG-INFO +0 -137
  13. {contextpilot-0.3.1 → contextpilot-0.3.2}/LICENSE +0 -0
  14. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_index/__init__.py +0 -0
  15. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_index/compute_distance_cpu.py +0 -0
  16. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_index/compute_distance_gpu.py +0 -0
  17. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_index/tree_nodes.py +0 -0
  18. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_ordering/__init__.py +0 -0
  19. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_ordering/inter_scheduler.py +0 -0
  20. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/context_ordering/intra_ordering.py +0 -0
  21. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/pipeline/__init__.py +0 -0
  22. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/pipeline/components.py +0 -0
  23. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/pipeline/multi_turn.py +0 -0
  24. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/pipeline/rag_pipeline.py +0 -0
  25. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/retriever/__init__.py +0 -0
  26. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/retriever/bm25.py +0 -0
  27. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/retriever/faiss_embedding.py +0 -0
  28. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/retriever/mem0_retriever.py +0 -0
  29. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/retriever/pageindex_retriever.py +0 -0
  30. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/server/__init__.py +0 -0
  31. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/server/conversation_tracker.py +0 -0
  32. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/server/eviction_heap.py +0 -0
  33. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/server/metadata.py +0 -0
  34. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/utils/__init__.py +0 -0
  35. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/utils/eval_metrics.py +0 -0
  36. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/utils/prompt_generator.py +0 -0
  37. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot/utils/tools.py +0 -0
  38. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot.egg-info/SOURCES.txt +0 -0
  39. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot.egg-info/dependency_links.txt +0 -0
  40. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot.egg-info/requires.txt +0 -0
  41. {contextpilot-0.3.1 → contextpilot-0.3.2}/contextpilot.egg-info/top_level.txt +0 -0
  42. {contextpilot-0.3.1 → contextpilot-0.3.2}/requirements.txt +0 -0
  43. {contextpilot-0.3.1 → contextpilot-0.3.2}/setup.cfg +0 -0
  44. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_context_index.py +0 -0
  45. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_context_ordering.py +0 -0
  46. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_cpu_distances.py +0 -0
  47. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_deduplication.py +0 -0
  48. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_gpu_distance_performance.py +0 -0
  49. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_gpu_distances.py +0 -0
  50. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_group_prefix_sharing.py +0 -0
  51. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_incremental_build.py +0 -0
  52. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_live_index.py +0 -0
  53. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_mem0_integration.py +0 -0
  54. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_multi_turn.py +0 -0
  55. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_multi_turn_e2e.py +0 -0
  56. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_pageindex_integration.py +0 -0
  57. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_performance.py +0 -0
  58. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_pipeline.py +0 -0
  59. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_server_integration.py +0 -0
  60. {contextpilot-0.3.1 → contextpilot-0.3.2}/tests/test_utils.py +0 -0
@@ -0,0 +1,239 @@
1
+ Metadata-Version: 2.4
2
+ Name: contextpilot
3
+ Version: 0.3.2
4
+ Summary: Efficient Retrieval-Augmented Generation with Accuracy-Preserving Context Reuse
5
+ Author: Yinsicheng Jiang, Chivier Humber
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/SecretSettler/ContextPilot
8
+ Project-URL: Repository, https://github.com/SecretSettler/ContextPilot
9
+ Project-URL: Issues, https://github.com/SecretSettler/ContextPilot/issues
10
+ Keywords: rag,llm,context-reuse,kv-cache,retrieval-augmented-generation
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: datasets
24
+ Requires-Dist: transformers
25
+ Requires-Dist: elasticsearch==8.18.1
26
+ Requires-Dist: aiohttp
27
+ Requires-Dist: ujson
28
+ Requires-Dist: scipy
29
+ Requires-Dist: fastapi[all]
30
+ Requires-Dist: cupy-cuda12x
31
+ Requires-Dist: pytest
32
+ Provides-Extra: dev
33
+ Requires-Dist: black; extra == "dev"
34
+ Requires-Dist: bumpver; extra == "dev"
35
+ Requires-Dist: isort; extra == "dev"
36
+ Requires-Dist: pip-tools; extra == "dev"
37
+ Requires-Dist: pytest; extra == "dev"
38
+ Requires-Dist: pytest-cov; extra == "dev"
39
+ Requires-Dist: ipython; extra == "dev"
40
+ Dynamic: license-file
41
+
42
+ <div align="center">
43
+ <img src="assets/about.png" alt="ContextPilot Logo" width="800"/>
44
+
45
+ <h1><strong>ContextPilot: Efficient Long Context Inference with Context Reuse</strong></h1>
46
+
47
+ [![Python](https://img.shields.io/badge/python-≥3.10-blue)](https://www.python.org/)
48
+ [![PyPI](https://img.shields.io/pypi/v/contextpilot)](https://pypi.org/project/contextpilot/)
49
+ [![License](https://img.shields.io/badge/license-Apache%202.0-green)](LICENSE)
50
+
51
+ </div>
52
+
53
+ --------------------------------------------------------------------------------
54
+
55
+ | [**Documentation**](docs/README.md) | [**Examples**](examples/) | [**Benchmarks**](docs/reference/benchmarks.md) |
56
+
57
+ ## News
58
+
59
+ - [2026/02] ContextPilot v0.3.2 released, supporting [PageIndex](https://github.com/VectifyAI/PageIndex) and [Mem0](https://github.com/mem0ai/mem0).
60
+ - [2026/01] ContextPilot has been accepted to MLSys 2026 🎉! See you in Bellevue, WA, USA.
61
+ - [2025/12] ContextPilot v0.2.0 released.
62
+
63
+ ## About
64
+
65
+ ContextPilot is a fast optimization system at the context engineering layer for agentic workloads:
66
+ 1. **High Throughput & Cache Hit Ratio**: Boosting prefill throughput and prefix cache hit ratio with intelligent context reuse.
67
+ 2. **Strong Compatibility**: Strong compatibility with existing popular RAG libraries (PageIndex), Agentic memory layer (Mem0), KV cache optimization engine (LMCache), and Inference engines (vLLM and SGLang).
68
+ 3. **Negligible Accuracy Loss**: Achieving significant performance improvements with minimal to no accuracy degradation across various benchmarks.
69
+ 4. **Widely Tested**: Tested with a wide range of RAG and Agentic AI applications.
70
+
71
+ ## Target Workloads
72
+
73
+ 1. **Trending Topic QA** — Search and generation for breaking news and hot topics beyond model knowledge
74
+ 2. **Closed-Domain Long-Context QA** — QA over specialized corpora (novels, financial reports, legal documents) with retrieval or in-context search
75
+ 3. **Large-Batch Long-Context Execution** — High-throughput inference where many requests share overlapping contexts; ContextPilot maximizes prefix reuse regardless of the search method
76
+ 4. **Multi-Turn Conversations with Long-Term Memory** — Persistent context reuse across turns (e.g. [Mem0](https://github.com/mem0ai/mem0))
77
+
78
+ ## Benchmark and Performance
79
+
80
+ ### System Performance
81
+
82
+ <div align="center">
83
+ <img src="assets/deepseek_r1_results.png" alt="Benchmark Results" width="600"/>
84
+ </div>
85
+
86
+ ContextPilot (Stateless) on DeepSeek-R1 maintains accuracy compared to SGLang, achieving 64.68% vs 64.15% F1 on MultihopRAG and 41.08% vs 40.20% F1 on NarrativeQA.
87
+
88
+ ### Accuracy on MT-RAG Benchmark (Online Scheduling)
89
+
90
+ <div align="center">
91
+
92
+ | Method | Qwen3-4B | Llama3.1-8B | Qwen3-30B-A3B |
93
+ |--------|----------|-------------|-----------|
94
+ | LMCache | 62.56 | **68.46** | 75.12 |
95
+ | CacheBlend | 50.33 | 56.52 | X |
96
+ | RadixCache | 62.56 | **68.46** | 75.12 |
97
+ | **ContextPilot** | **64.27** | 68.12 | **75.81** |
98
+
99
+ </div>
100
+
101
+ ContextPilot delivers **4-13x** improvements in cache hit rates and **1.5-3.5x** reductions in prefill latency for large-batch RAG workloads, while maintaining or improving accuracy.
102
+
103
+ **Furthermore**, ContextPilot has been tested to reduce input token costs by around **36%** with GPT-5.2.
104
+
105
+ See [Benchmarks](docs/reference/benchmarks.md) in the documentation for GPU vs CPU performance analysis and detailed benchmark methodology.
106
+
107
+ ## Getting Started
108
+
109
+ ### Installation
110
+
111
+ **Requirements:** Python >= 3.10
112
+
113
+ ```bash
114
+ pip install contextpilot
115
+ ```
116
+
117
+ Or install from source:
118
+ ```bash
119
+ git clone https://github.com/Edinburgh-AgenticAI/ContextPilot.git
120
+ cd ContextPilot
121
+ pip install -e .
122
+ ```
123
+
124
+ More [detailed installation instructions](docs/getting_started/installation.md) are available in the docs.
125
+
126
+ ### Quick Start
127
+
128
+ **Offline / Online Stateless** — build index & schedule in one shot:
129
+
130
+ ```python
131
+ from openai import OpenAI
132
+ import contextpilot as cp
133
+
134
+ client = OpenAI(base_url="http://localhost:30000/v1", api_key="...") # Your inference engine URL and API key
135
+
136
+ queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
137
+ all_contexts = [
138
+ ["Doc about AI", "Doc about ML", "Doc about computing"],
139
+ ["Doc about neural nets", "Doc about deep learning"],
140
+ ["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
141
+ ]
142
+
143
+ # Build index and schedule for prefix sharing
144
+ index = cp.build_context_index(all_contexts, use_gpu=False)
145
+ reordered, _, order, _ = cp.InterContextScheduler().schedule_contexts(index)
146
+
147
+ # Send in optimized order — shared prefixes hit KV cache
148
+ for ctx, orig_idx in zip(reordered, order):
149
+ docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
150
+ # Importance ranking restores original retrieval order for the model
151
+ importance_ranking = ">".join(
152
+ str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
153
+ )
154
+ response = client.chat.completions.create(
155
+ model="Qwen/Qwen3-4B",
156
+ messages=[
157
+ {"role": "system", "content": (
158
+ f"Answer the question based on the provided documents.\n\n"
159
+ f"<documents>\n{docs_section}\n</documents>\n\n"
160
+ f"Read the documents in this importance ranking: {importance_ranking}\n"
161
+ f"Prioritize information from higher-ranked documents."
162
+ )},
163
+ {"role": "user", "content": queries[orig_idx]},
164
+ ],
165
+ )
166
+ print(f"Q: {queries[orig_idx]}\nA: {response.choices[0].message.content}\n")
167
+ ```
168
+
169
+ > For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
170
+
171
+ **Stateful** — `LiveContextIndex` tracks cached state:
172
+
173
+ ```python
174
+ from openai import OpenAI
175
+ import contextpilot as cp
176
+
177
+ client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
178
+ live = cp.LiveContextIndex(use_gpu=False)
179
+
180
+ # Simulate multi-turn: each turn has batch_size=1
181
+ turns = [
182
+ {
183
+ "query": "What is AI?",
184
+ "contexts": [["Doc about AI", "Doc about ML", "Doc about computing"]],
185
+ },
186
+ {
187
+ "query": "Compare supervised and unsupervised learning",
188
+ # 2 of 3 docs overlap with Turn 1 ("Doc about AI", "Doc about ML"), different order + 1 new doc
189
+ "contexts": [["Doc about ML", "Doc about clustering", "Doc about AI"]],
190
+ },
191
+ ]
192
+
193
+ for turn_idx, turn in enumerate(turns):
194
+ contexts = turn["contexts"]
195
+ query = turn["query"]
196
+
197
+ # build_incremental handles both cold start and incremental turns
198
+ result = live.build_incremental(contexts)
199
+ reordered = result['reordered_contexts']
200
+ # Turn 2: reordered to ["Doc about AI", "Doc about ML", "Doc about clustering"]
201
+ # ^— shared prefix from Turn 1 —^ ^— new doc appended
202
+
203
+ ctx = reordered[0]
204
+ docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
205
+ importance_ranking = ">".join(
206
+ str(ctx.index(doc) + 1) for doc in contexts[0] if doc in ctx
207
+ )
208
+ response = client.chat.completions.create(
209
+ model="Qwen/Qwen3-4B",
210
+ messages=[
211
+ {"role": "system", "content": (
212
+ f"Answer the question based on the provided documents.\n\n"
213
+ f"<documents>\n{docs_section}\n</documents>\n\n"
214
+ f"Read the documents in this importance ranking: {importance_ranking}\n"
215
+ f"Prioritize information from higher-ranked documents."
216
+ )},
217
+ {"role": "user", "content": query},
218
+ ],
219
+ )
220
+ print(f"[Turn {turn_idx+1}] Q: {query}")
221
+ print(f"A: {response.choices[0].message.content}\n")
222
+ ```
223
+
224
+ > **Note:** Stateful mode works without eviction sync — `LiveContextIndex` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited storage size where the KV cache may evict entries, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
225
+
226
+ ## Documentation
227
+
228
+ Check out the ContextPilot [documentation](docs/README.md) for comprehensive guides.
229
+
230
+ ## Examples
231
+
232
+ Go hands-on with our [examples](examples/), demonstrating how to address different use cases with ContextPilot.
233
+
234
+ ## Contributing
235
+
236
+ We welcome and value all contributions! Please feel free to submit issues and pull requests.
237
+
238
+ ## Citation
239
+ We will include the paper citation soon!
@@ -0,0 +1,198 @@
1
+ <div align="center">
2
+ <img src="assets/about.png" alt="ContextPilot Logo" width="800"/>
3
+
4
+ <h1><strong>ContextPilot: Efficient Long Context Inference with Context Reuse</strong></h1>
5
+
6
+ [![Python](https://img.shields.io/badge/python-≥3.10-blue)](https://www.python.org/)
7
+ [![PyPI](https://img.shields.io/pypi/v/contextpilot)](https://pypi.org/project/contextpilot/)
8
+ [![License](https://img.shields.io/badge/license-Apache%202.0-green)](LICENSE)
9
+
10
+ </div>
11
+
12
+ --------------------------------------------------------------------------------
13
+
14
+ | [**Documentation**](docs/README.md) | [**Examples**](examples/) | [**Benchmarks**](docs/reference/benchmarks.md) |
15
+
16
+ ## News
17
+
18
+ - [2026/02] ContextPilot v0.3.2 released, supporting [PageIndex](https://github.com/VectifyAI/PageIndex) and [Mem0](https://github.com/mem0ai/mem0).
19
+ - [2026/01] ContextPilot has been accepted to MLSys 2026 🎉! See you in Bellevue, WA, USA.
20
+ - [2025/12] ContextPilot v0.2.0 released.
21
+
22
+ ## About
23
+
24
+ ContextPilot is a fast optimization system on context engineering layer for agentic workloads:
25
+ 1. **High Throughput & Cache Hit Ratio**: Boosting prefill throughput and prefix cache hit ratio with intelligent context reuse.
26
+ 2. **Strong Compatibility**: Strong compatibility with existing popular RAG libraries (PageIndex), Agentic memory layer (Mem0), KV cache optimization engine (LMCache), and Inference engines (vLLM and SGLang).
27
+ 3. **Negligible Accuracy Loss**: Achieving significant performance improvements with minimal to no accuracy degradation across various benchmarks.
28
+ 3. **Widely Tested**: Tested with a wide range of RAG and Agentic AI applications.
29
+
30
+ ## Target Workloads
31
+
32
+ 1. **Trending Topic QA** — Search and generation for breaking news and hot topics beyond model knowledge
33
+ 2. **Closed-Domain Long-Context QA** — QA over specialized corpora (novels, financial reports, legal documents) with retrieval or in-context search
34
+ 3. **Large-Batch Long-Context Execution** — High-throughput inference where many requests share overlapping contexts; ContextPilot maximizes prefix reuse regardless of the search method
35
+ 4. **Multi-Turn Conversations with Long-Term Memory** — Persistent context reuse across turns (e.g. [Mem0](https://github.com/mem0ai/mem0))
36
+
37
+ ## Benchmark and Performance
38
+
39
+ ### System Performance
40
+
41
+ <div align="center">
42
+ <img src="assets/deepseek_r1_results.png" alt="Benchmark Results" width="600"/>
43
+ </div>
44
+
45
+ ContextPilot (Stateless) on DeepSeek-R1 maintains accuracy compared to SGLang, achieving 64.68% vs 64.15% F1 on MultihopRAG and 41.08% vs 40.20% F1 on NarrativeQA.
46
+
47
+ ### Accuracy on MT-RAG Benchmark (Online Scheduling)
48
+
49
+ <div align="center">
50
+
51
+ | Method | Qwen3-4B | Llama3.1-8B | Qwen3-30B-A3B |
52
+ |--------|----------|-------------|-----------|
53
+ | LMCache | 62.56 | **68.46** | 75.12 |
54
+ | CacheBlend | 50.33 | 56.52 | X |
55
+ | RadixCache | 62.56 | **68.46** | 75.12 |
56
+ | **ContextPilot** | **64.27** | 68.12 | **75.81** |
57
+
58
+ </div>
59
+
60
+ ContextPilot delivers **4-13x** improvements in cache hit rates and **1.5-3.5x** reductions in prefill latency for large-batch RAG workloads, while maintaining or improving accuracy.
61
+
62
+ **Furthermore**, ContextPilot has been tested to reduce input token costs by around **36%** with GPT-5.2.
63
+
64
+ See [Benchmarks](docs/reference/benchmarks.md) in the documentation for GPU vs CPU performance analysis and detailed benchmark methodology.
65
+
66
+ ## Getting Started
67
+
68
+ ### Installation
69
+
70
+ **Requirements:** Python >= 3.10
71
+
72
+ ```bash
73
+ pip install contextpilot
74
+ ```
75
+
76
+ Or install from source:
77
+ ```bash
78
+ git clone https://github.com/Edinburgh-AgenticAI/ContextPilot.git
79
+ cd ContextPilot
80
+ pip install -e .
81
+ ```
82
+
83
+ More [detailed installation instructions](docs/getting_started/installation.md) are available in the docs.
84
+
85
+ ### Quick Start
86
+
87
+ **Offline / Online Stateless** — build index & schedule in one shot:
88
+
89
+ ```python
90
+ from openai import OpenAI
91
+ import contextpilot as cp
92
+
93
+ client = OpenAI(base_url="http://localhost:30000/v1", api_key="...") # Your inference engine URL and API key
94
+
95
+ queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
96
+ all_contexts = [
97
+ ["Doc about AI", "Doc about ML", "Doc about computing"],
98
+ ["Doc about neural nets", "Doc about deep learning"],
99
+ ["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
100
+ ]
101
+
102
+ # Build index and schedule for prefix sharing
103
+ index = cp.build_context_index(all_contexts, use_gpu=False)
104
+ reordered, _, order, _ = cp.InterContextScheduler().schedule_contexts(index)
105
+
106
+ # Send in optimized order — shared prefixes hit KV cache
107
+ for ctx, orig_idx in zip(reordered, order):
108
+ docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
109
+ # Importance ranking restores original retrieval order for the model
110
+ importance_ranking = ">".join(
111
+ str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
112
+ )
113
+ response = client.chat.completions.create(
114
+ model="Qwen/Qwen3-4B",
115
+ messages=[
116
+ {"role": "system", "content": (
117
+ f"Answer the question based on the provided documents.\n\n"
118
+ f"<documents>\n{docs_section}\n</documents>\n\n"
119
+ f"Read the documents in this importance ranking: {importance_ranking}\n"
120
+ f"Prioritize information from higher-ranked documents."
121
+ )},
122
+ {"role": "user", "content": queries[orig_idx]},
123
+ ],
124
+ )
125
+ print(f"Q: {queries[orig_idx]}\nA: {response.choices[0].message.content}\n")
126
+ ```
127
+
128
+ > For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
129
+
130
+ **Stateful** — `LiveContextIndex` tracks cached state:
131
+
132
+ ```python
133
+ from openai import OpenAI
134
+ import contextpilot as cp
135
+
136
+ client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
137
+ live = cp.LiveContextIndex(use_gpu=False)
138
+
139
+ # Simulate multi-turn: each turn has batch_size=1
140
+ turns = [
141
+ {
142
+ "query": "What is AI?",
143
+ "contexts": [["Doc about AI", "Doc about ML", "Doc about computing"]],
144
+ },
145
+ {
146
+ "query": "Compare supervised and unsupervised learning",
147
+ # 2 of 3 docs overlap with Turn 1 ("Doc about AI", "Doc about ML"), different order + 1 new doc
148
+ "contexts": [["Doc about ML", "Doc about clustering", "Doc about AI"]],
149
+ },
150
+ ]
151
+
152
+ for turn_idx, turn in enumerate(turns):
153
+ contexts = turn["contexts"]
154
+ query = turn["query"]
155
+
156
+ # build_incremental handles both cold start and incremental turns
157
+ result = live.build_incremental(contexts)
158
+ reordered = result['reordered_contexts']
159
+ # Turn 2: reordered to ["Doc about AI", "Doc about ML", "Doc about clustering"]
160
+ # ^— shared prefix from Turn 1 —^ ^— new doc appended
161
+
162
+ ctx = reordered[0]
163
+ docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
164
+ importance_ranking = ">".join(
165
+ str(ctx.index(doc) + 1) for doc in contexts[0] if doc in ctx
166
+ )
167
+ response = client.chat.completions.create(
168
+ model="Qwen/Qwen3-4B",
169
+ messages=[
170
+ {"role": "system", "content": (
171
+ f"Answer the question based on the provided documents.\n\n"
172
+ f"<documents>\n{docs_section}\n</documents>\n\n"
173
+ f"Read the documents in this importance ranking: {importance_ranking}\n"
174
+ f"Prioritize information from higher-ranked documents."
175
+ )},
176
+ {"role": "user", "content": query},
177
+ ],
178
+ )
179
+ print(f"[Turn {turn_idx+1}] Q: {query}")
180
+ print(f"A: {response.choices[0].message.content}\n")
181
+ ```
182
+
183
+ > **Note:** Stateful mode works without eviction sync — `LiveContextIndex` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited storage size where the KV cache may evict entries, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
184
+
185
+ ## Documentation
186
+
187
+ Check out the ContextPilot [documentation](docs/README.md) for comprehensive guides.
188
+
189
+ ## Examples
190
+
191
+ Go hands-on with our [examples](examples/), demonstrating how to address different use cases with ContextPilot.
192
+
193
+ ## Contributing
194
+
195
+ We welcome and value all contributions! Please feel free to submit issues and pull requests.
196
+
197
+ ## Citation
198
+ We will include the paper citation soon!
@@ -38,6 +38,8 @@ from .context_ordering import (
38
38
  InterContextScheduler,
39
39
  )
40
40
 
41
+ from .server.live_index import LiveContextIndex
42
+
41
43
  from .retriever import (
42
44
  BM25Retriever,
43
45
  FAISSRetriever,
@@ -47,7 +49,7 @@ from .retriever import (
47
49
  MEM0_AVAILABLE,
48
50
  )
49
51
 
50
- __version__ = "0.3.1"
52
+ __version__ = "0.3.2"
51
53
 
52
54
  __all__ = [
53
55
  # High-level pipeline API
@@ -63,6 +65,7 @@ __all__ = [
63
65
  'build_context_index',
64
66
  'IntraContextOrderer',
65
67
  'InterContextScheduler',
68
+ 'LiveContextIndex',
66
69
 
67
70
  # Retrievers
68
71
  'BM25Retriever',
@@ -96,6 +96,12 @@ class ContextIndex:
96
96
  self.node_manager = NodeManager()
97
97
  self.context_orderer = IntraContextOrderer()
98
98
 
99
+ # String-to-int mapping (auto-populated when string inputs are given)
100
+ self._str_to_id: dict = {}
101
+ self._id_to_str: dict = {}
102
+ self._next_str_id: int = 0
103
+ self._is_string_input: bool = False
104
+
99
105
  if self.use_gpu:
100
106
  print("Using GPU for distance computation")
101
107
  else:
@@ -104,16 +110,48 @@ class ContextIndex:
104
110
  else:
105
111
  print("Using CPU for distance computation")
106
112
 
107
- def fit_transform(self, contexts: List[List[int]]) -> IndexResult:
113
+ def _convert_to_int(self, contexts):
114
+ """Convert string contexts to integer IDs if needed."""
115
+ if not contexts or not contexts[0]:
116
+ return contexts
117
+ if isinstance(contexts[0][0], str):
118
+ self._is_string_input = True
119
+ converted = []
120
+ for ctx in contexts:
121
+ converted_ctx = []
122
+ for item in ctx:
123
+ sid = self._str_to_id.get(item)
124
+ if sid is None:
125
+ sid = self._next_str_id
126
+ self._str_to_id[item] = sid
127
+ self._id_to_str[sid] = item
128
+ self._next_str_id += 1
129
+ converted_ctx.append(sid)
130
+ converted.append(converted_ctx)
131
+ return converted
132
+ return contexts
133
+
134
+ def _convert_to_str(self, contexts):
135
+ """Convert integer contexts back to strings if input was strings."""
136
+ if not self._is_string_input or not contexts:
137
+ return contexts
138
+ # Skip if already converted (e.g. from fit_transform output)
139
+ if contexts[0] and isinstance(contexts[0][0], str):
140
+ return contexts
141
+ return [[self._id_to_str[i] for i in ctx] for ctx in contexts]
142
+
143
+ def fit_transform(self, contexts) -> IndexResult:
108
144
  """
109
145
  Perform clustering and return results.
110
146
 
111
147
  Args:
112
- contexts: List of contexts, where each prompt is a list of chunk IDs
148
+ contexts: List of contexts, where each context is a list of chunk IDs (int) or strings.
149
+ String inputs are automatically converted to integer IDs.
113
150
 
114
151
  Returns:
115
152
  IndexResult object containing clustering results
116
153
  """
154
+ contexts = self._convert_to_int(contexts)
117
155
  n = len(contexts)
118
156
 
119
157
  if n < 2:
@@ -194,14 +232,41 @@ class ContextIndex:
194
232
  )
195
233
 
196
234
  def _handle_single_prompt(self, contexts: List[List[int]]) -> IndexResult:
197
- """Handle case with less than 2 contexts."""
235
+ """Handle case with less than 2 contexts.
236
+
237
+ Always creates an empty root node above the leaf(s) so that
238
+ leaf.is_root is never True. This prevents the root-exclusion
239
+ guard in build_incremental from skipping legitimate matches.
240
+ """
198
241
  for i, prompt in enumerate(contexts):
199
- self.node_manager.create_leaf_node(i, prompt)
242
+ node = self.node_manager.create_leaf_node(i, prompt)
243
+ # ClusterNode.__init__ sets doc_ids = sorted(content).
244
+ # Override to preserve the original context order so that
245
+ # build_incremental can use it as a correct prefix for Turn 2.
246
+ node.doc_ids = list(prompt)
247
+
248
+ # Wrap leaf node(s) under an empty root so that no leaf is the root.
249
+ # This mirrors the virtual-root logic in update_search_paths for forests,
250
+ # but applies it even for a single leaf.
251
+ leaf_ids = list(self.node_manager.unique_nodes.keys())
252
+ virtual_root_id = max(leaf_ids) + 1 if leaf_ids else 0
253
+ virtual_root = ClusterNode(
254
+ node_id=virtual_root_id,
255
+ content=set(),
256
+ original_indices=set(),
257
+ distance=0.0,
258
+ children=leaf_ids,
259
+ parent=None,
260
+ frequency=sum(self.node_manager.unique_nodes[nid].frequency for nid in leaf_ids)
261
+ )
262
+ self.node_manager.unique_nodes[virtual_root_id] = virtual_root
263
+ for nid in leaf_ids:
264
+ self.node_manager.unique_nodes[nid].parent = virtual_root_id
200
265
 
201
- # Update search paths even for single nodes
266
+ # Update search paths (now a proper rooted tree)
202
267
  self.node_manager.update_search_paths()
203
268
 
204
- # For single context, extract search paths (will be empty for root-only tree)
269
+ # For single context, extract search paths
205
270
  search_paths = self.context_orderer.extract_search_paths(
206
271
  self.node_manager.unique_nodes, len(contexts)
207
272
  )
@@ -233,7 +298,7 @@ class ContextIndex:
233
298
 
234
299
 
235
300
  # Convenience function for backward compatibility
236
- def build_context_index(contexts: List[List[int]],
301
+ def build_context_index(contexts,
237
302
  linkage_method: str = "average",
238
303
  use_gpu: bool = True,
239
304
  alpha: float = 0.005,
@@ -243,7 +308,7 @@ def build_context_index(contexts: List[List[int]],
243
308
  Convenience function for building a context index.
244
309
 
245
310
  Args:
246
- contexts: List of contexts, where each prompt is a list of chunk IDs
311
+ contexts: List of contexts, where each context is a list of chunk IDs (int) or strings
247
312
  linkage_method: Linkage method for hierarchical clustering
248
313
  use_gpu: Whether to use GPU for distance computation
249
314
  alpha: Weight for position term in distance calculation
@@ -260,4 +325,11 @@ def build_context_index(contexts: List[List[int]],
260
325
  num_workers=num_workers,
261
326
  batch_size=batch_size
262
327
  )
263
- return indexer.fit_transform(contexts)
328
+ result = indexer.fit_transform(contexts)
329
+ # Convert back to strings at the API boundary if input was strings
330
+ if indexer._is_string_input:
331
+ result.reordered_contexts = indexer._convert_to_str(result.reordered_contexts)
332
+ result.original_contexts = indexer._convert_to_str(result.original_contexts)
333
+ result.reordered_prompts = result.reordered_contexts
334
+ result.original_prompts = result.original_contexts
335
+ return result
@@ -28,11 +28,11 @@ class ContextPilotIndexClient:
28
28
  Example usage in SGLang:
29
29
  # In scheduler initialization:
30
30
  self.contextpilot_client = ContextPilotIndexClient("http://localhost:8765")
31
-
32
- # In eviction code:
33
- def evict_tokens(self, num_tokens):
34
- self.tree_cache.evict(num_tokens)
35
- self.contextpilot_client.evict(num_tokens) # Sync with index
31
+
32
+ # In eviction callback:
33
+ def on_cache_evict(self, evicted_request_ids):
34
+ # Sync eviction with ContextPilot index
35
+ self.contextpilot_client.evict(evicted_request_ids)
36
36
  """
37
37
 
38
38
  def __init__(
@@ -92,19 +92,23 @@ class ContextPilotIndexClient:
92
92
  logger.warning(f"ContextPilot index request failed: {e}")
93
93
  return None
94
94
 
95
- def evict(self, num_tokens: int) -> Optional[Dict[str, Any]]:
95
+ def evict(self, request_ids: List[str]) -> Optional[Dict[str, Any]]:
96
96
  """
97
- Evict tokens from the index.
98
-
97
+ Evict requests from the index.
98
+
99
99
  THIS IS THE MAIN METHOD THAT SGLANG SHOULD CALL FOR EVICTION SYNC.
100
-
100
+
101
101
  Args:
102
- num_tokens: Number of tokens to evict (same as SGLang's eviction)
103
-
102
+ request_ids: List of request IDs to evict (from SGLang's cache eviction)
103
+
104
104
  Returns:
105
- Dictionary with eviction results, or None if request failed
105
+ Dictionary with eviction results:
106
+ - removed_count: Number of requests successfully removed
107
+ - not_found: List of request IDs that were not in the index
108
+ - conversations_cleared: Number of conversation chains cleared
109
+ Returns None if request failed
106
110
  """
107
- return self._post("/evict", {"num_tokens": num_tokens})
111
+ return self._post("/evict", {"request_ids": request_ids})
108
112
 
109
113
  def search(
110
114
  self,
@@ -327,16 +331,26 @@ class ContextPilotIndexClient:
327
331
 
328
332
  # Convenience functions for simple usage
329
333
 
330
- def evict_tokens(num_tokens: int, server_url: str = "http://localhost:8765"):
334
+ def evict_requests(
335
+ request_ids: List[str],
336
+ server_url: str = "http://localhost:8765"
337
+ ) -> Optional[Dict[str, Any]]:
331
338
  """
332
- Simple function to evict tokens.
333
-
339
+ Simple function to evict requests from the index.
340
+
334
341
  For one-off calls without maintaining a client instance.
342
+
343
+ Args:
344
+ request_ids: List of request IDs to evict
345
+ server_url: ContextPilot server URL
346
+
347
+ Returns:
348
+ Dictionary with removed_count, not_found, conversations_cleared
335
349
  """
336
350
  try:
337
351
  response = requests.post(
338
352
  f"{server_url}/evict",
339
- json={"num_tokens": num_tokens},
353
+ json={"request_ids": request_ids},
340
354
  timeout=1.0
341
355
  )
342
356
  response.raise_for_status()