contextpilot 0.3.2.tar.gz → 0.3.3.tar.gz

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (57)
  1. {contextpilot-0.3.2 → contextpilot-0.3.3}/PKG-INFO +57 -53
  2. {contextpilot-0.3.2 → contextpilot-0.3.3}/README.md +56 -52
  3. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/__init__.py +4 -8
  4. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/__init__.py +1 -1
  5. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/compute_distance_cpu.py +3 -3
  6. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/index_construction.py +4 -2
  7. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/pipeline/rag_pipeline.py +1 -1
  8. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/__init__.py +2 -2
  9. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/http_client.py +133 -106
  10. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/http_server.py +211 -248
  11. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/live_index.py +164 -11
  12. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot.egg-info/PKG-INFO +57 -53
  13. {contextpilot-0.3.2 → contextpilot-0.3.3}/pyproject.toml +1 -1
  14. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_context_index.py +8 -8
  15. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_cpu_distances.py +3 -3
  16. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_gpu_distance_performance.py +4 -4
  17. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_gpu_distances.py +1 -1
  18. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_group_prefix_sharing.py +3 -3
  19. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_live_index.py +205 -16
  20. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_performance.py +18 -18
  21. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_pipeline.py +2 -2
  22. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_server_integration.py +1 -1
  23. {contextpilot-0.3.2 → contextpilot-0.3.3}/LICENSE +0 -0
  24. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/compute_distance_gpu.py +0 -0
  25. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/tree_nodes.py +0 -0
  26. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_ordering/__init__.py +0 -0
  27. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_ordering/inter_scheduler.py +0 -0
  28. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_ordering/intra_ordering.py +0 -0
  29. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/pipeline/__init__.py +0 -0
  30. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/pipeline/components.py +0 -0
  31. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/pipeline/multi_turn.py +0 -0
  32. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/retriever/__init__.py +0 -0
  33. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/retriever/bm25.py +0 -0
  34. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/retriever/faiss_embedding.py +0 -0
  35. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/retriever/mem0_retriever.py +0 -0
  36. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/retriever/pageindex_retriever.py +0 -0
  37. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/conversation_tracker.py +0 -0
  38. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/eviction_heap.py +0 -0
  39. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/metadata.py +0 -0
  40. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/utils/__init__.py +0 -0
  41. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/utils/eval_metrics.py +0 -0
  42. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/utils/prompt_generator.py +0 -0
  43. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/utils/tools.py +0 -0
  44. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot.egg-info/SOURCES.txt +0 -0
  45. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot.egg-info/dependency_links.txt +0 -0
  46. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot.egg-info/requires.txt +0 -0
  47. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot.egg-info/top_level.txt +0 -0
  48. {contextpilot-0.3.2 → contextpilot-0.3.3}/requirements.txt +0 -0
  49. {contextpilot-0.3.2 → contextpilot-0.3.3}/setup.cfg +0 -0
  50. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_context_ordering.py +0 -0
  51. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_deduplication.py +0 -0
  52. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_incremental_build.py +0 -0
  53. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_mem0_integration.py +0 -0
  54. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_multi_turn.py +0 -0
  55. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_multi_turn_e2e.py +0 -0
  56. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_pageindex_integration.py +0 -0
  57. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_utils.py +0 -0
{contextpilot-0.3.2 → contextpilot-0.3.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: contextpilot
-Version: 0.3.2
+Version: 0.3.3
 Summary: Efficient Retrieval-Augmented Generation with Accuracy-Preserving Context Reuse
 Author: Yinsicheng Jiang, Chivier Humber
 License: Apache-2.0
@@ -125,31 +125,35 @@ More [detailed installation instructions](docs/getting_started/installation.md)
 
 ### Quick Start
 
-**Offline / Online Stateless** — build index & schedule in one shot:
+**Stateful** — `ContextPilot` tracks cached state across turns so
+overlapping documents are moved to the prefix for KV-cache reuse:
 
 ```python
 from openai import OpenAI
 import contextpilot as cp
 
-client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")  # Your inference engine URL and API key
-
-queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
-all_contexts = [
-    ["Doc about AI", "Doc about ML", "Doc about computing"],
-    ["Doc about neural nets", "Doc about deep learning"],
-    ["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
+client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
+cp_live = cp.ContextPilot(use_gpu=False)
+
+# Simulated per-turn memory search (e.g. from mem0)
+# Each turn retrieves different but partially overlapping documents
+turn_memories = [
+    ["Transformers use self-attention", "GPT is based on transformers", "BERT is bidirectional"],
+    ["RNNs use hidden states", "GPT is based on transformers", "LSTMs solve vanishing gradients"],
+    ["Attention computes QKV", "Transformers use self-attention", "GPT is based on transformers"],
 ]
+queries = ["What are transformers?", "How do RNNs compare?", "Explain attention in detail."]
 
-# Build index and schedule for prefix sharing
-index = cp.build_context_index(all_contexts, use_gpu=False)
-reordered, _, order, _ = cp.InterContextScheduler().schedule_contexts(index)
+for turn_idx, (query, mems) in enumerate(zip(queries, turn_memories)):
+    # 1. Reorder for prefix sharing (handles cold start & incremental)
+    [ctx], order = cp_live.reorder([mems])  # single request per turn
+    # Turn 2: "GPT is based on transformers" ← moved to prefix (shared with turn 1)
+    # Turn 3: "Transformers …", "GPT …" ← both moved to prefix
 
-# Send in optimized order shared prefixes hit KV cache
-for ctx, orig_idx in zip(reordered, order):
+    # 2. Generate answer with reordered context
     docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
-    # Importance ranking restores original retrieval order for the model
     importance_ranking = ">".join(
-        str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
+        str(ctx.index(doc) + 1) for doc in mems if doc in ctx
     )
     response = client.chat.completions.create(
         model="Qwen/Qwen3-4B",
@@ -160,68 +164,68 @@ for ctx, orig_idx in zip(reordered, order):
                 f"Read the documents in this importance ranking: {importance_ranking}\n"
                 f"Prioritize information from higher-ranked documents."
             )},
-            {"role": "user", "content": queries[orig_idx]},
+            {"role": "user", "content": query},
         ],
     )
-    print(f"Q: {queries[orig_idx]}\nA: {response.choices[0].message.content}\n")
+    print(f"[Turn {turn_idx+1}] Q: {query}")
+    print(f"A: {response.choices[0].message.content}\n")
 ```
 
-> For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
+> **Note:** Stateful mode works without eviction sync — `ContextPilot` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited KV-cache capacity, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
 
-**Stateful** — `LiveContextIndex` tracks cached state:
+**Offline / Online Stateless** — same API, just pass the full batch at once:
 
 ```python
 from openai import OpenAI
 import contextpilot as cp
 
-client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
-live = cp.LiveContextIndex(use_gpu=False)
-
-# Simulate multi-turn: each turn has batch_size=1
-turns = [
-    {
-        "query": "What is AI?",
-        "contexts": [["Doc about AI", "Doc about ML", "Doc about computing"]],
-    },
-    {
-        "query": "Compare supervised and unsupervised learning",
-        # 2 of 3 docs overlap with Turn 1 ("Doc about AI", "Doc about ML"), different order + 1 new doc
-        "contexts": [["Doc about ML", "Doc about clustering", "Doc about AI"]],
-    },
-]
+client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")  # Your inference engine URL and API key
+cp_batch = cp.ContextPilot(use_gpu=False)
 
-for turn_idx, turn in enumerate(turns):
-    contexts = turn["contexts"]
-    query = turn["query"]
+queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
+all_contexts = [
+    ["Doc about AI", "Doc about ML", "Doc about computing"],
+    ["Doc about neural nets", "Doc about deep learning"],
+    ["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
+]
 
-    # build_incremental handles both cold start and incremental turns
-    result = live.build_incremental(contexts)
-    reordered = result['reordered_contexts']
-    # Turn 2: reordered to ["Doc about AI", "Doc about ML", "Doc about clustering"]
-    #          ^— shared prefix from Turn 1 —^   ^— new doc appended
+# One call: builds index, reorders docs for prefix sharing, and schedules execution order
+reordered, order = cp_batch.reorder(all_contexts)
 
-    ctx = reordered[0]
+# Build all prompts in optimized order
+messages_batch = []
+for ctx, orig_idx in zip(reordered, order):
     docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
     importance_ranking = ">".join(
-        str(ctx.index(doc) + 1) for doc in contexts[0] if doc in ctx
+        str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
     )
-    response = client.chat.completions.create(
-        model="Qwen/Qwen3-4B",
-        messages=[
+    messages_batch.append({
+        "model": "Qwen/Qwen3-4B",
+        "messages": [
            {"role": "system", "content": (
                f"Answer the question based on the provided documents.\n\n"
                f"<documents>\n{docs_section}\n</documents>\n\n"
                f"Read the documents in this importance ranking: {importance_ranking}\n"
                f"Prioritize information from higher-ranked documents."
            )},
-            {"role": "user", "content": query},
+            {"role": "user", "content": queries[orig_idx]},
         ],
-    )
-    print(f"[Turn {turn_idx+1}] Q: {query}")
-    print(f"A: {response.choices[0].message.content}\n")
+    })
+
+# Send concurrently — inference engine processes them in order for max cache reuse
+import asyncio, openai
+
+async def generate_all(batch):
+    aclient = openai.AsyncOpenAI(base_url="http://localhost:30000/v1", api_key="...")
+    tasks = [aclient.chat.completions.create(**req) for req in batch]
+    return await asyncio.gather(*tasks)
+
+responses = asyncio.run(generate_all(messages_batch))
+for resp, orig_idx in zip(responses, order):
+    print(f"Q: {queries[orig_idx]}\nA: {resp.choices[0].message.content}\n")
 ```
 
-> **Note:** Stateful mode works without eviction sync — `LiveContextIndex` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited storage size where the KV cache may evict entries, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
+> For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
 
 ## Documentation
 
{contextpilot-0.3.2 → contextpilot-0.3.3}/README.md

(The README.md hunks — @@ -84,31 +84,35 @@ and @@ -119,68 +123,68 @@ — are identical to the PKG-INFO body changes shown above; PKG-INFO embeds the README, so its hunks are simply offset by the 41 metadata lines.)
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/__init__.py

@@ -16,7 +16,7 @@ Quick Start:
     >>>
     >>> results = pipeline.run(queries=["What is AI?"])
 
-See docs/PIPELINE_API.md for detailed documentation.
+See docs/reference/api.md for detailed documentation.
 """
 
 from .pipeline import (
@@ -30,15 +30,13 @@ from .pipeline import (
 from .context_index import (
     ContextIndex,
     IndexResult,
-    build_context_index,
 )
 
 from .context_ordering import (
     IntraContextOrderer,
-    InterContextScheduler,
 )
 
-from .server.live_index import LiveContextIndex
+from .server.live_index import ContextPilot
 
 from .retriever import (
     BM25Retriever,
@@ -49,7 +47,7 @@ from .retriever import (
     MEM0_AVAILABLE,
 )
 
-__version__ = "0.3.2"
+__version__ = "0.3.3"
 
 __all__ = [
     # High-level pipeline API
@@ -62,10 +60,8 @@ __all__ = [
     # Core components
     'ContextIndex',
     'IndexResult',
-    'build_context_index',
     'IntraContextOrderer',
-    'InterContextScheduler',
-    'LiveContextIndex',
+    'ContextPilot',
 
     # Retrievers
     'BM25Retriever',
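Taken together, the `contextpilot/__init__.py` hunks are an API consolidation: the separate 0.3.2 entry points (`build_context_index` plus `InterContextScheduler` for batch scheduling, `LiveContextIndex` for stateful use) leave the top-level namespace in favor of the single `ContextPilot` class. A minimal before/after sketch, assembled from the README changes above; only the `(reordered_contexts, order)` return shape of `reorder()` is visible in this diff, so anything beyond that is an assumption:

```python
import contextpilot as cp

contexts = [["doc A", "doc B"], ["doc B", "doc C"]]

# 0.3.2 (batch): build an index, then schedule it
# index = cp.build_context_index(contexts, use_gpu=False)
# reordered, _, order, _ = cp.InterContextScheduler().schedule_contexts(index)
# 0.3.2 (stateful): incremental builds on a live index
# live = cp.LiveContextIndex(use_gpu=False)
# reordered = live.build_incremental(contexts)["reordered_contexts"]

# 0.3.3: one class covers both; pass the full batch at once,
# or call reorder() once per turn to reuse state across turns
pilot = cp.ContextPilot(use_gpu=False)
reordered, order = pilot.reorder(contexts)
```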
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/__init__.py

@@ -15,7 +15,7 @@ from .tree_nodes import (
 from .index_construction import (
     ContextIndex,
     IndexResult,
-    build_context_index
+    build_context_index,
 )
 
 # Import distance computation
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/compute_distance_cpu.py

@@ -9,7 +9,7 @@ from multiprocessing import Pool, cpu_count
 from typing import List
 
 
-def compute_distance_single(context_a: List[int], context_b: List[int], alpha: float = 0.005) -> float:
+def compute_distance_single(context_a: List[int], context_b: List[int], alpha: float = 0.001) -> float:
     """
     Compute distance between two contexts using our metric:
     distance = (1 - overlap/max_size) + alpha * avg_position_diff
@@ -49,7 +49,7 @@ def compute_distance_single(context_a: List[int], context_b: List[int], alpha: f
 
 def compute_distances_batch(queries: List[List[int]],
                             targets: List[List[int]],
-                            alpha: float = 0.005,
+                            alpha: float = 0.001,
                             num_workers: int = None) -> np.ndarray:
     """
     Compute distances from multiple query contexts to multiple target contexts.
@@ -233,7 +233,7 @@ def compute_batch_worker(args):
 
 
 def compute_distance_matrix_cpu_optimized(contexts: List[List[int]],
-                                          alpha: float = 0.005,
+                                          alpha: float = 0.001,
                                           num_workers: int = None,
                                           batch_size: int = 1000) -> np.ndarray:
     """
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/index_construction.py

@@ -74,7 +74,7 @@ class ContextIndex:
     def __init__(self,
                  linkage_method: str = "average",
                  use_gpu: bool = True,
-                 alpha: float = 0.005,
+                 alpha: float = 0.001,
                  num_workers: Optional[int] = None,
                  batch_size: int = 1000):
         """
@@ -301,7 +301,7 @@ class ContextIndex:
 def build_context_index(contexts,
                         linkage_method: str = "average",
                         use_gpu: bool = True,
-                        alpha: float = 0.005,
+                        alpha: float = 0.001,
                         num_workers: Optional[int] = None,
                         batch_size: int = 1000) -> IndexResult:
     """
@@ -333,3 +333,5 @@ def build_context_index(contexts,
     result.reordered_prompts = result.reordered_contexts
     result.original_prompts = result.original_contexts
     return result
+
+
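Since `alpha` remains a keyword argument on both `ContextIndex` and `build_context_index`, callers that depend on the 0.3.2 clustering behavior can pin the old default explicitly. A sketch using the signatures shown above:

```python
from contextpilot.context_index import build_context_index

contexts = [[1, 2, 3], [3, 2, 4]]  # toy token-ID contexts
# Pin the previous default rather than inheriting the new 0.001
result = build_context_index(contexts, use_gpu=False, alpha=0.005)
```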
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/pipeline/rag_pipeline.py

@@ -840,7 +840,7 @@ class RAGPipeline:
             json={
                 "contexts": contexts,
                 "initial_tokens_per_context": 100,
-                "alpha": 0.005,
+                "alpha": 0.001,
                 "use_gpu": False,
                 "linkage_method": "average",
                 "incremental": incremental
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/__init__.py

@@ -9,7 +9,7 @@ Includes HTTP server/client for remote index access from SGLang.
 
 from .metadata import NodeMetadata
 from .eviction_heap import EvictionHeap
-from .live_index import LiveContextIndex
+from .live_index import ContextPilot
 
 # HTTP server/client (optional - requires fastapi/requests)
 try:
@@ -25,7 +25,7 @@ except ImportError:
 __all__ = [
     'NodeMetadata',
     'EvictionHeap',
-    'LiveContextIndex',
+    'ContextPilot',
     'ContextPilotIndexClient',
     'evict_tokens',
     'http_app',
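For downstream code, the net effect of the `server/__init__.py` change is the renamed export. A quick sketch of the 0.3.3 import surface implied by the `__all__` above; the exact failure mode when the optional fastapi/requests extras are missing is an assumption:

```python
# Core exports from contextpilot.server in 0.3.3
from contextpilot.server import NodeMetadata, EvictionHeap, ContextPilot  # was LiveContextIndex

# HTTP server/client pieces are optional, guarded by the try/except shown above
try:
    from contextpilot.server import ContextPilotIndexClient, evict_tokens, http_app
except ImportError:
    pass  # assumed: unavailable without the fastapi/requests extras
```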