contextpilot 0.3.2__tar.gz → 0.3.3.post2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/PKG-INFO +60 -53
  2. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/README.md +59 -52
  3. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/__init__.py +4 -8
  4. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_index/__init__.py +1 -1
  5. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_index/compute_distance_cpu.py +3 -3
  6. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_index/index_construction.py +4 -2
  7. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/pipeline/rag_pipeline.py +1 -1
  8. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/server/__init__.py +2 -2
  9. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/server/http_client.py +133 -106
  10. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/server/http_server.py +211 -248
  11. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/server/live_index.py +170 -11
  12. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot.egg-info/PKG-INFO +60 -53
  13. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/pyproject.toml +1 -1
  14. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_context_index.py +8 -8
  15. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_cpu_distances.py +3 -3
  16. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_gpu_distance_performance.py +4 -4
  17. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_gpu_distances.py +1 -1
  18. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_group_prefix_sharing.py +3 -3
  19. contextpilot-0.3.3.post2/tests/test_live_index.py +468 -0
  20. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_performance.py +18 -18
  21. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_pipeline.py +2 -2
  22. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_server_integration.py +1 -1
  23. contextpilot-0.3.2/tests/test_live_index.py +0 -256
  24. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/LICENSE +0 -0
  25. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_index/compute_distance_gpu.py +0 -0
  26. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_index/tree_nodes.py +0 -0
  27. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_ordering/__init__.py +0 -0
  28. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_ordering/inter_scheduler.py +0 -0
  29. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_ordering/intra_ordering.py +0 -0
  30. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/pipeline/__init__.py +0 -0
  31. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/pipeline/components.py +0 -0
  32. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/pipeline/multi_turn.py +0 -0
  33. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/retriever/__init__.py +0 -0
  34. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/retriever/bm25.py +0 -0
  35. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/retriever/faiss_embedding.py +0 -0
  36. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/retriever/mem0_retriever.py +0 -0
  37. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/retriever/pageindex_retriever.py +0 -0
  38. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/server/conversation_tracker.py +0 -0
  39. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/server/eviction_heap.py +0 -0
  40. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/server/metadata.py +0 -0
  41. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/utils/__init__.py +0 -0
  42. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/utils/eval_metrics.py +0 -0
  43. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/utils/prompt_generator.py +0 -0
  44. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/utils/tools.py +0 -0
  45. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot.egg-info/SOURCES.txt +0 -0
  46. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot.egg-info/dependency_links.txt +0 -0
  47. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot.egg-info/requires.txt +0 -0
  48. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot.egg-info/top_level.txt +0 -0
  49. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/requirements.txt +0 -0
  50. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/setup.cfg +0 -0
  51. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_context_ordering.py +0 -0
  52. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_deduplication.py +0 -0
  53. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_incremental_build.py +0 -0
  54. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_mem0_integration.py +0 -0
  55. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_multi_turn.py +0 -0
  56. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_multi_turn_e2e.py +0 -0
  57. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_pageindex_integration.py +0 -0
  58. {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: contextpilot
3
- Version: 0.3.2
3
+ Version: 0.3.3.post2
4
4
  Summary: Efficient Retrieval-Augmented Generation with Accuracy-Preserving Context Reuse
5
5
  Author: Yinsicheng Jiang, Chivier Humber
6
6
  License: Apache-2.0
@@ -125,31 +125,37 @@ More [detailed installation instructions](docs/getting_started/installation.md)
125
125
 
126
126
  ### Quick Start
127
127
 
128
- **Offline / Online Stateless** — build index & schedule in one shot:
128
+ **Stateful** — `ContextPilot` tracks cached state across turns so
129
+ overlapping documents are moved to the prefix for KV-cache reuse:
129
130
 
130
131
  ```python
131
132
  from openai import OpenAI
132
133
  import contextpilot as cp
133
134
 
134
- client = OpenAI(base_url="http://localhost:30000/v1", api_key="...") # Your inference engine URL and API key
135
-
136
- queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
137
- all_contexts = [
138
- ["Doc about AI", "Doc about ML", "Doc about computing"],
139
- ["Doc about neural nets", "Doc about deep learning"],
140
- ["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
135
+ client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
136
+ cp_live = cp.ContextPilot(use_gpu=False)
137
+
138
+ # Simulated per-turn memory search (e.g. from mem0)
139
+ # Each turn retrieves different but partially overlapping documents
140
+ turn_memories = [
141
+ ["Transformers use self-attention", "GPT is based on transformers", "BERT is bidirectional"],
142
+ ["RNNs use hidden states", "GPT is based on transformers", "LSTMs solve vanishing gradients"],
143
+ ["Attention computes QKV", "Transformers use self-attention", "GPT is based on transformers"],
141
144
  ]
145
+ queries = ["What are transformers?", "How do RNNs compare?", "Explain attention in detail."]
142
146
 
143
- # Build index and schedule for prefix sharing
144
- index = cp.build_context_index(all_contexts, use_gpu=False)
145
- reordered, _, order, _ = cp.InterContextScheduler().schedule_contexts(index)
147
+ for turn_idx, (query, mems) in enumerate(zip(queries, turn_memories)):
148
+ # 1. Reorder for prefix sharing (handles cold start & incremental)
149
+ # .reorder() accepts a single list or list-of-lists
150
+ reordered, indices = cp_live.reorder(mems)
151
+ ctx = reordered[0] # single context per turn
152
+ # Turn 2: "GPT is based on transformers" ← moved to prefix (shared with turn 1)
153
+ # Turn 3: "Transformers …", "GPT …" ← both moved to prefix
146
154
 
147
- # Send in optimized order — shared prefixes hit KV cache
148
- for ctx, orig_idx in zip(reordered, order):
155
+ # 2. Generate answer with reordered context
149
156
  docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
150
- # Importance ranking restores original retrieval order for the model
151
157
  importance_ranking = ">".join(
152
- str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
158
+ str(ctx.index(doc) + 1) for doc in mems if doc in ctx
153
159
  )
154
160
  response = client.chat.completions.create(
155
161
  model="Qwen/Qwen3-4B",
@@ -160,68 +166,69 @@ for ctx, orig_idx in zip(reordered, order):
160
166
  f"Read the documents in this importance ranking: {importance_ranking}\n"
161
167
  f"Prioritize information from higher-ranked documents."
162
168
  )},
163
- {"role": "user", "content": queries[orig_idx]},
169
+ {"role": "user", "content": query},
164
170
  ],
165
171
  )
166
- print(f"Q: {queries[orig_idx]}\nA: {response.choices[0].message.content}\n")
172
+ print(f"[Turn {turn_idx+1}] Q: {query}")
173
+ print(f"A: {response.choices[0].message.content}\n")
167
174
  ```
168
175
 
169
- > For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
176
+ > **Note:** Stateful mode works without eviction sync — `ContextPilot` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited KV-cache capacity, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
170
177
 
171
- **Stateful** — `LiveContextIndex` tracks cached state:
178
+ **Offline / Online Stateless** — same API, just pass the full batch at once:
172
179
 
173
180
  ```python
174
181
  from openai import OpenAI
175
182
  import contextpilot as cp
176
183
 
177
- client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
178
- live = cp.LiveContextIndex(use_gpu=False)
179
-
180
- # Simulate multi-turn: each turn has batch_size=1
181
- turns = [
182
- {
183
- "query": "What is AI?",
184
- "contexts": [["Doc about AI", "Doc about ML", "Doc about computing"]],
185
- },
186
- {
187
- "query": "Compare supervised and unsupervised learning",
188
- # 2 of 3 docs overlap with Turn 1 ("Doc about AI", "Doc about ML"), different order + 1 new doc
189
- "contexts": [["Doc about ML", "Doc about clustering", "Doc about AI"]],
190
- },
191
- ]
184
+ client = OpenAI(base_url="http://localhost:30000/v1", api_key="...") # Your inference engine URL and API key
185
+ cp_batch = cp.ContextPilot(use_gpu=False)
192
186
 
193
- for turn_idx, turn in enumerate(turns):
194
- contexts = turn["contexts"]
195
- query = turn["query"]
187
+ queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
188
+ all_contexts = [
189
+ ["Doc about AI", "Doc about ML", "Doc about computing"],
190
+ ["Doc about neural nets", "Doc about deep learning"],
191
+ ["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
192
+ ]
196
193
 
197
- # build_incremental handles both cold start and incremental turns
198
- result = live.build_incremental(contexts)
199
- reordered = result['reordered_contexts']
200
- # Turn 2: reordered to ["Doc about AI", "Doc about ML", "Doc about clustering"]
201
- # ^— shared prefix from Turn 1 —^ ^— new doc appended
194
+ # One call: builds index, reorders docs for prefix sharing, and schedules execution order
195
+ # .reorder() returns (reordered_contexts, original_indices)
196
+ reordered_ctx, order = cp_batch.reorder(all_contexts)
202
197
 
203
- ctx = reordered[0]
198
+ # Build all prompts in optimized order
199
+ messages_batch = []
200
+ for ctx, orig_idx in zip(reordered_ctx, order):
204
201
  docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
205
202
  importance_ranking = ">".join(
206
- str(ctx.index(doc) + 1) for doc in contexts[0] if doc in ctx
203
+ str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
207
204
  )
208
- response = client.chat.completions.create(
209
- model="Qwen/Qwen3-4B",
210
- messages=[
205
+ messages_batch.append({
206
+ "model": "Qwen/Qwen3-4B",
207
+ "messages": [
211
208
  {"role": "system", "content": (
212
209
  f"Answer the question based on the provided documents.\n\n"
213
210
  f"<documents>\n{docs_section}\n</documents>\n\n"
214
211
  f"Read the documents in this importance ranking: {importance_ranking}\n"
215
212
  f"Prioritize information from higher-ranked documents."
216
213
  )},
217
- {"role": "user", "content": query},
214
+ {"role": "user", "content": queries[orig_idx]},
218
215
  ],
219
- )
220
- print(f"[Turn {turn_idx+1}] Q: {query}")
221
- print(f"A: {response.choices[0].message.content}\n")
216
+ })
217
+
218
+ # Send concurrently — inference engine processes them in order for max cache reuse
219
+ import asyncio, openai
220
+
221
+ async def generate_all(batch):
222
+ aclient = openai.AsyncOpenAI(base_url="http://localhost:30000/v1", api_key="...")
223
+ tasks = [aclient.chat.completions.create(**req) for req in batch]
224
+ return await asyncio.gather(*tasks)
225
+
226
+ responses = asyncio.run(generate_all(messages_batch))
227
+ for resp, orig_idx in zip(responses, order):
228
+ print(f"Q: {queries[orig_idx]}\nA: {resp.choices[0].message.content}\n")
222
229
  ```
223
230
 
224
- > **Note:** Stateful mode works without eviction sync — `LiveContextIndex` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited storage size where the KV cache may evict entries, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
231
+ > For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
225
232
 
226
233
  ## Documentation
227
234
 
@@ -84,31 +84,37 @@ More [detailed installation instructions](docs/getting_started/installation.md)
84
84
 
85
85
  ### Quick Start
86
86
 
87
- **Offline / Online Stateless** — build index & schedule in one shot:
87
+ **Stateful** — `ContextPilot` tracks cached state across turns so
88
+ overlapping documents are moved to the prefix for KV-cache reuse:
88
89
 
89
90
  ```python
90
91
  from openai import OpenAI
91
92
  import contextpilot as cp
92
93
 
93
- client = OpenAI(base_url="http://localhost:30000/v1", api_key="...") # Your inference engine URL and API key
94
-
95
- queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
96
- all_contexts = [
97
- ["Doc about AI", "Doc about ML", "Doc about computing"],
98
- ["Doc about neural nets", "Doc about deep learning"],
99
- ["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
94
+ client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
95
+ cp_live = cp.ContextPilot(use_gpu=False)
96
+
97
+ # Simulated per-turn memory search (e.g. from mem0)
98
+ # Each turn retrieves different but partially overlapping documents
99
+ turn_memories = [
100
+ ["Transformers use self-attention", "GPT is based on transformers", "BERT is bidirectional"],
101
+ ["RNNs use hidden states", "GPT is based on transformers", "LSTMs solve vanishing gradients"],
102
+ ["Attention computes QKV", "Transformers use self-attention", "GPT is based on transformers"],
100
103
  ]
104
+ queries = ["What are transformers?", "How do RNNs compare?", "Explain attention in detail."]
101
105
 
102
- # Build index and schedule for prefix sharing
103
- index = cp.build_context_index(all_contexts, use_gpu=False)
104
- reordered, _, order, _ = cp.InterContextScheduler().schedule_contexts(index)
106
+ for turn_idx, (query, mems) in enumerate(zip(queries, turn_memories)):
107
+ # 1. Reorder for prefix sharing (handles cold start & incremental)
108
+ # .reorder() accepts a single list or list-of-lists
109
+ reordered, indices = cp_live.reorder(mems)
110
+ ctx = reordered[0] # single context per turn
111
+ # Turn 2: "GPT is based on transformers" ← moved to prefix (shared with turn 1)
112
+ # Turn 3: "Transformers …", "GPT …" ← both moved to prefix
105
113
 
106
- # Send in optimized order — shared prefixes hit KV cache
107
- for ctx, orig_idx in zip(reordered, order):
114
+ # 2. Generate answer with reordered context
108
115
  docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
109
- # Importance ranking restores original retrieval order for the model
110
116
  importance_ranking = ">".join(
111
- str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
117
+ str(ctx.index(doc) + 1) for doc in mems if doc in ctx
112
118
  )
113
119
  response = client.chat.completions.create(
114
120
  model="Qwen/Qwen3-4B",
@@ -119,68 +125,69 @@ for ctx, orig_idx in zip(reordered, order):
119
125
  f"Read the documents in this importance ranking: {importance_ranking}\n"
120
126
  f"Prioritize information from higher-ranked documents."
121
127
  )},
122
- {"role": "user", "content": queries[orig_idx]},
128
+ {"role": "user", "content": query},
123
129
  ],
124
130
  )
125
- print(f"Q: {queries[orig_idx]}\nA: {response.choices[0].message.content}\n")
131
+ print(f"[Turn {turn_idx+1}] Q: {query}")
132
+ print(f"A: {response.choices[0].message.content}\n")
126
133
  ```
127
134
 
128
- > For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
135
+ > **Note:** Stateful mode works without eviction sync — `ContextPilot` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited KV-cache capacity, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
129
136
 
130
- **Stateful** — `LiveContextIndex` tracks cached state:
137
+ **Offline / Online Stateless** — same API, just pass the full batch at once:
131
138
 
132
139
  ```python
133
140
  from openai import OpenAI
134
141
  import contextpilot as cp
135
142
 
136
- client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
137
- live = cp.LiveContextIndex(use_gpu=False)
138
-
139
- # Simulate multi-turn: each turn has batch_size=1
140
- turns = [
141
- {
142
- "query": "What is AI?",
143
- "contexts": [["Doc about AI", "Doc about ML", "Doc about computing"]],
144
- },
145
- {
146
- "query": "Compare supervised and unsupervised learning",
147
- # 2 of 3 docs overlap with Turn 1 ("Doc about AI", "Doc about ML"), different order + 1 new doc
148
- "contexts": [["Doc about ML", "Doc about clustering", "Doc about AI"]],
149
- },
150
- ]
143
+ client = OpenAI(base_url="http://localhost:30000/v1", api_key="...") # Your inference engine URL and API key
144
+ cp_batch = cp.ContextPilot(use_gpu=False)
151
145
 
152
- for turn_idx, turn in enumerate(turns):
153
- contexts = turn["contexts"]
154
- query = turn["query"]
146
+ queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
147
+ all_contexts = [
148
+ ["Doc about AI", "Doc about ML", "Doc about computing"],
149
+ ["Doc about neural nets", "Doc about deep learning"],
150
+ ["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
151
+ ]
155
152
 
156
- # build_incremental handles both cold start and incremental turns
157
- result = live.build_incremental(contexts)
158
- reordered = result['reordered_contexts']
159
- # Turn 2: reordered to ["Doc about AI", "Doc about ML", "Doc about clustering"]
160
- # ^— shared prefix from Turn 1 —^ ^— new doc appended
153
+ # One call: builds index, reorders docs for prefix sharing, and schedules execution order
154
+ # .reorder() returns (reordered_contexts, original_indices)
155
+ reordered_ctx, order = cp_batch.reorder(all_contexts)
161
156
 
162
- ctx = reordered[0]
157
+ # Build all prompts in optimized order
158
+ messages_batch = []
159
+ for ctx, orig_idx in zip(reordered_ctx, order):
163
160
  docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
164
161
  importance_ranking = ">".join(
165
- str(ctx.index(doc) + 1) for doc in contexts[0] if doc in ctx
162
+ str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
166
163
  )
167
- response = client.chat.completions.create(
168
- model="Qwen/Qwen3-4B",
169
- messages=[
164
+ messages_batch.append({
165
+ "model": "Qwen/Qwen3-4B",
166
+ "messages": [
170
167
  {"role": "system", "content": (
171
168
  f"Answer the question based on the provided documents.\n\n"
172
169
  f"<documents>\n{docs_section}\n</documents>\n\n"
173
170
  f"Read the documents in this importance ranking: {importance_ranking}\n"
174
171
  f"Prioritize information from higher-ranked documents."
175
172
  )},
176
- {"role": "user", "content": query},
173
+ {"role": "user", "content": queries[orig_idx]},
177
174
  ],
178
- )
179
- print(f"[Turn {turn_idx+1}] Q: {query}")
180
- print(f"A: {response.choices[0].message.content}\n")
175
+ })
176
+
177
+ # Send concurrently — inference engine processes them in order for max cache reuse
178
+ import asyncio, openai
179
+
180
+ async def generate_all(batch):
181
+ aclient = openai.AsyncOpenAI(base_url="http://localhost:30000/v1", api_key="...")
182
+ tasks = [aclient.chat.completions.create(**req) for req in batch]
183
+ return await asyncio.gather(*tasks)
184
+
185
+ responses = asyncio.run(generate_all(messages_batch))
186
+ for resp, orig_idx in zip(responses, order):
187
+ print(f"Q: {queries[orig_idx]}\nA: {resp.choices[0].message.content}\n")
181
188
  ```
182
189
 
183
- > **Note:** Stateful mode works without eviction sync — `LiveContextIndex` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited storage size where the KV cache may evict entries, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
190
+ > For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
184
191
 
185
192
  ## Documentation
186
193
 
@@ -16,7 +16,7 @@ Quick Start:
16
16
  >>>
17
17
  >>> results = pipeline.run(queries=["What is AI?"])
18
18
 
19
- See docs/PIPELINE_API.md for detailed documentation.
19
+ See docs/reference/api.md for detailed documentation.
20
20
  """
21
21
 
22
22
  from .pipeline import (
@@ -30,15 +30,13 @@ from .pipeline import (
30
30
  from .context_index import (
31
31
  ContextIndex,
32
32
  IndexResult,
33
- build_context_index,
34
33
  )
35
34
 
36
35
  from .context_ordering import (
37
36
  IntraContextOrderer,
38
- InterContextScheduler,
39
37
  )
40
38
 
41
- from .server.live_index import LiveContextIndex
39
+ from .server.live_index import ContextPilot
42
40
 
43
41
  from .retriever import (
44
42
  BM25Retriever,
@@ -49,7 +47,7 @@ from .retriever import (
49
47
  MEM0_AVAILABLE,
50
48
  )
51
49
 
52
- __version__ = "0.3.2"
50
+ __version__ = "0.3.3.post2"
53
51
 
54
52
  __all__ = [
55
53
  # High-level pipeline API
@@ -62,10 +60,8 @@ __all__ = [
62
60
  # Core components
63
61
  'ContextIndex',
64
62
  'IndexResult',
65
- 'build_context_index',
66
63
  'IntraContextOrderer',
67
- 'InterContextScheduler',
68
- 'LiveContextIndex',
64
+ 'ContextPilot',
69
65
 
70
66
  # Retrievers
71
67
  'BM25Retriever',
@@ -15,7 +15,7 @@ from .tree_nodes import (
15
15
  from .index_construction import (
16
16
  ContextIndex,
17
17
  IndexResult,
18
- build_context_index
18
+ build_context_index,
19
19
  )
20
20
 
21
21
  # Import distance computation
@@ -9,7 +9,7 @@ from multiprocessing import Pool, cpu_count
9
9
  from typing import List
10
10
 
11
11
 
12
- def compute_distance_single(context_a: List[int], context_b: List[int], alpha: float = 0.005) -> float:
12
+ def compute_distance_single(context_a: List[int], context_b: List[int], alpha: float = 0.001) -> float:
13
13
  """
14
14
  Compute distance between two contexts using our metric:
15
15
  distance = (1 - overlap/max_size) + alpha * avg_position_diff
@@ -49,7 +49,7 @@ def compute_distance_single(context_a: List[int], context_b: List[int], alpha: f
49
49
 
50
50
  def compute_distances_batch(queries: List[List[int]],
51
51
  targets: List[List[int]],
52
- alpha: float = 0.005,
52
+ alpha: float = 0.001,
53
53
  num_workers: int = None) -> np.ndarray:
54
54
  """
55
55
  Compute distances from multiple query contexts to multiple target contexts.
@@ -233,7 +233,7 @@ def compute_batch_worker(args):
233
233
 
234
234
 
235
235
  def compute_distance_matrix_cpu_optimized(contexts: List[List[int]],
236
- alpha: float = 0.005,
236
+ alpha: float = 0.001,
237
237
  num_workers: int = None,
238
238
  batch_size: int = 1000) -> np.ndarray:
239
239
  """
@@ -74,7 +74,7 @@ class ContextIndex:
74
74
  def __init__(self,
75
75
  linkage_method: str = "average",
76
76
  use_gpu: bool = True,
77
- alpha: float = 0.005,
77
+ alpha: float = 0.001,
78
78
  num_workers: Optional[int] = None,
79
79
  batch_size: int = 1000):
80
80
  """
@@ -301,7 +301,7 @@ class ContextIndex:
301
301
  def build_context_index(contexts,
302
302
  linkage_method: str = "average",
303
303
  use_gpu: bool = True,
304
- alpha: float = 0.005,
304
+ alpha: float = 0.001,
305
305
  num_workers: Optional[int] = None,
306
306
  batch_size: int = 1000) -> IndexResult:
307
307
  """
@@ -333,3 +333,5 @@ def build_context_index(contexts,
333
333
  result.reordered_prompts = result.reordered_contexts
334
334
  result.original_prompts = result.original_contexts
335
335
  return result
336
+
337
+
@@ -840,7 +840,7 @@ class RAGPipeline:
840
840
  json={
841
841
  "contexts": contexts,
842
842
  "initial_tokens_per_context": 100,
843
- "alpha": 0.005,
843
+ "alpha": 0.001,
844
844
  "use_gpu": False,
845
845
  "linkage_method": "average",
846
846
  "incremental": incremental
@@ -9,7 +9,7 @@ Includes HTTP server/client for remote index access from SGLang.
9
9
 
10
10
  from .metadata import NodeMetadata
11
11
  from .eviction_heap import EvictionHeap
12
- from .live_index import LiveContextIndex
12
+ from .live_index import ContextPilot
13
13
 
14
14
  # HTTP server/client (optional - requires fastapi/requests)
15
15
  try:
@@ -25,7 +25,7 @@ except ImportError:
25
25
  __all__ = [
26
26
  'NodeMetadata',
27
27
  'EvictionHeap',
28
- 'LiveContextIndex',
28
+ 'ContextPilot',
29
29
  'ContextPilotIndexClient',
30
30
  'evict_tokens',
31
31
  'http_app',