contextpilot 0.3.2__tar.gz → 0.3.3.post2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/PKG-INFO +60 -53
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/README.md +59 -52
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/__init__.py +4 -8
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_index/__init__.py +1 -1
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_index/compute_distance_cpu.py +3 -3
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_index/index_construction.py +4 -2
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/pipeline/rag_pipeline.py +1 -1
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/server/__init__.py +2 -2
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/server/http_client.py +133 -106
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/server/http_server.py +211 -248
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/server/live_index.py +170 -11
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot.egg-info/PKG-INFO +60 -53
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/pyproject.toml +1 -1
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_context_index.py +8 -8
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_cpu_distances.py +3 -3
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_gpu_distance_performance.py +4 -4
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_gpu_distances.py +1 -1
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_group_prefix_sharing.py +3 -3
- contextpilot-0.3.3.post2/tests/test_live_index.py +468 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_performance.py +18 -18
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_pipeline.py +2 -2
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_server_integration.py +1 -1
- contextpilot-0.3.2/tests/test_live_index.py +0 -256
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/LICENSE +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_index/compute_distance_gpu.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_index/tree_nodes.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_ordering/__init__.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_ordering/inter_scheduler.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_ordering/intra_ordering.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/pipeline/__init__.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/pipeline/components.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/pipeline/multi_turn.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/retriever/__init__.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/retriever/bm25.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/retriever/faiss_embedding.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/retriever/mem0_retriever.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/retriever/pageindex_retriever.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/server/conversation_tracker.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/server/eviction_heap.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/server/metadata.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/utils/__init__.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/utils/eval_metrics.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/utils/prompt_generator.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/utils/tools.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot.egg-info/SOURCES.txt +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot.egg-info/dependency_links.txt +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot.egg-info/requires.txt +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot.egg-info/top_level.txt +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/requirements.txt +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/setup.cfg +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_context_ordering.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_deduplication.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_incremental_build.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_mem0_integration.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_multi_turn.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_multi_turn_e2e.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_pageindex_integration.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3.post2}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: contextpilot
|
|
3
|
-
Version: 0.3.2
|
|
3
|
+
Version: 0.3.3.post2
|
|
4
4
|
Summary: Efficient Retrieval-Augmented Generation with Accuracy-Preserving Context Reuse
|
|
5
5
|
Author: Yinsicheng Jiang, Chivier Humber
|
|
6
6
|
License: Apache-2.0
|
|
@@ -125,31 +125,37 @@ More [detailed installation instructions](docs/getting_started/installation.md)
|
|
|
125
125
|
|
|
126
126
|
### Quick Start
|
|
127
127
|
|
|
128
|
-
**
|
|
128
|
+
**Stateful** — `ContextPilot` tracks cached state across turns so
|
|
129
|
+
overlapping documents are moved to the prefix for KV-cache reuse:
|
|
129
130
|
|
|
130
131
|
```python
|
|
131
132
|
from openai import OpenAI
|
|
132
133
|
import contextpilot as cp
|
|
133
134
|
|
|
134
|
-
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
["
|
|
135
|
+
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
|
|
136
|
+
cp_live = cp.ContextPilot(use_gpu=False)
|
|
137
|
+
|
|
138
|
+
# Simulated per-turn memory search (e.g. from mem0)
|
|
139
|
+
# Each turn retrieves different but partially overlapping documents
|
|
140
|
+
turn_memories = [
|
|
141
|
+
["Transformers use self-attention", "GPT is based on transformers", "BERT is bidirectional"],
|
|
142
|
+
["RNNs use hidden states", "GPT is based on transformers", "LSTMs solve vanishing gradients"],
|
|
143
|
+
["Attention computes QKV", "Transformers use self-attention", "GPT is based on transformers"],
|
|
141
144
|
]
|
|
145
|
+
queries = ["What are transformers?", "How do RNNs compare?", "Explain attention in detail."]
|
|
142
146
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
147
|
+
for turn_idx, (query, mems) in enumerate(zip(queries, turn_memories)):
|
|
148
|
+
# 1. Reorder for prefix sharing (handles cold start & incremental)
|
|
149
|
+
# .reorder() accepts a single list or list-of-lists
|
|
150
|
+
reordered, indices = cp_live.reorder(mems)
|
|
151
|
+
ctx = reordered[0] # single context per turn
|
|
152
|
+
# Turn 2: "GPT is based on transformers" ← moved to prefix (shared with turn 1)
|
|
153
|
+
# Turn 3: "Transformers …", "GPT …" ← both moved to prefix
|
|
146
154
|
|
|
147
|
-
#
|
|
148
|
-
for ctx, orig_idx in zip(reordered, order):
|
|
155
|
+
# 2. Generate answer with reordered context
|
|
149
156
|
docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
|
|
150
|
-
# Importance ranking restores original retrieval order for the model
|
|
151
157
|
importance_ranking = ">".join(
|
|
152
|
-
str(ctx.index(doc) + 1) for doc in
|
|
158
|
+
str(ctx.index(doc) + 1) for doc in mems if doc in ctx
|
|
153
159
|
)
|
|
154
160
|
response = client.chat.completions.create(
|
|
155
161
|
model="Qwen/Qwen3-4B",
|
|
@@ -160,68 +166,69 @@ for ctx, orig_idx in zip(reordered, order):
|
|
|
160
166
|
f"Read the documents in this importance ranking: {importance_ranking}\n"
|
|
161
167
|
f"Prioritize information from higher-ranked documents."
|
|
162
168
|
)},
|
|
163
|
-
{"role": "user", "content":
|
|
169
|
+
{"role": "user", "content": query},
|
|
164
170
|
],
|
|
165
171
|
)
|
|
166
|
-
print(f"
|
|
172
|
+
print(f"[Turn {turn_idx+1}] Q: {query}")
|
|
173
|
+
print(f"A: {response.choices[0].message.content}\n")
|
|
167
174
|
```
|
|
168
175
|
|
|
169
|
-
> For
|
|
176
|
+
> **Note:** Stateful mode works without eviction sync — `ContextPilot` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited KV-cache capacity, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
|
|
170
177
|
|
|
171
|
-
**
|
|
178
|
+
**Offline / Online Stateless** — same API, just pass the full batch at once:
|
|
172
179
|
|
|
173
180
|
```python
|
|
174
181
|
from openai import OpenAI
|
|
175
182
|
import contextpilot as cp
|
|
176
183
|
|
|
177
|
-
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
# Simulate multi-turn: each turn has batch_size=1
|
|
181
|
-
turns = [
|
|
182
|
-
{
|
|
183
|
-
"query": "What is AI?",
|
|
184
|
-
"contexts": [["Doc about AI", "Doc about ML", "Doc about computing"]],
|
|
185
|
-
},
|
|
186
|
-
{
|
|
187
|
-
"query": "Compare supervised and unsupervised learning",
|
|
188
|
-
# 2 of 3 docs overlap with Turn 1 ("Doc about AI", "Doc about ML"), different order + 1 new doc
|
|
189
|
-
"contexts": [["Doc about ML", "Doc about clustering", "Doc about AI"]],
|
|
190
|
-
},
|
|
191
|
-
]
|
|
184
|
+
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...") # Your inference engine URL and API key
|
|
185
|
+
cp_batch = cp.ContextPilot(use_gpu=False)
|
|
192
186
|
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
187
|
+
queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
|
|
188
|
+
all_contexts = [
|
|
189
|
+
["Doc about AI", "Doc about ML", "Doc about computing"],
|
|
190
|
+
["Doc about neural nets", "Doc about deep learning"],
|
|
191
|
+
["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
|
|
192
|
+
]
|
|
196
193
|
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
# Turn 2: reordered to ["Doc about AI", "Doc about ML", "Doc about clustering"]
|
|
201
|
-
# ^— shared prefix from Turn 1 —^ ^— new doc appended
|
|
194
|
+
# One call: builds index, reorders docs for prefix sharing, and schedules execution order
|
|
195
|
+
# .reorder() returns (reordered_contexts, original_indices)
|
|
196
|
+
reordered_ctx, order = cp_batch.reorder(all_contexts)
|
|
202
197
|
|
|
203
|
-
|
|
198
|
+
# Build all prompts in optimized order
|
|
199
|
+
messages_batch = []
|
|
200
|
+
for ctx, orig_idx in zip(reordered_ctx, order):
|
|
204
201
|
docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
|
|
205
202
|
importance_ranking = ">".join(
|
|
206
|
-
str(ctx.index(doc) + 1) for doc in
|
|
203
|
+
str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
|
|
207
204
|
)
|
|
208
|
-
|
|
209
|
-
model
|
|
210
|
-
messages
|
|
205
|
+
messages_batch.append({
|
|
206
|
+
"model": "Qwen/Qwen3-4B",
|
|
207
|
+
"messages": [
|
|
211
208
|
{"role": "system", "content": (
|
|
212
209
|
f"Answer the question based on the provided documents.\n\n"
|
|
213
210
|
f"<documents>\n{docs_section}\n</documents>\n\n"
|
|
214
211
|
f"Read the documents in this importance ranking: {importance_ranking}\n"
|
|
215
212
|
f"Prioritize information from higher-ranked documents."
|
|
216
213
|
)},
|
|
217
|
-
{"role": "user", "content":
|
|
214
|
+
{"role": "user", "content": queries[orig_idx]},
|
|
218
215
|
],
|
|
219
|
-
)
|
|
220
|
-
|
|
221
|
-
|
|
216
|
+
})
|
|
217
|
+
|
|
218
|
+
# Send concurrently — inference engine processes them in order for max cache reuse
|
|
219
|
+
import asyncio, openai
|
|
220
|
+
|
|
221
|
+
async def generate_all(batch):
|
|
222
|
+
aclient = openai.AsyncOpenAI(base_url="http://localhost:30000/v1", api_key="...")
|
|
223
|
+
tasks = [aclient.chat.completions.create(**req) for req in batch]
|
|
224
|
+
return await asyncio.gather(*tasks)
|
|
225
|
+
|
|
226
|
+
responses = asyncio.run(generate_all(messages_batch))
|
|
227
|
+
for resp, orig_idx in zip(responses, order):
|
|
228
|
+
print(f"Q: {queries[orig_idx]}\nA: {resp.choices[0].message.content}\n")
|
|
222
229
|
```
|
|
223
230
|
|
|
224
|
-
>
|
|
231
|
+
> For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
|
|
225
232
|
|
|
226
233
|
## Documentation
|
|
227
234
|
|
|
@@ -84,31 +84,37 @@ More [detailed installation instructions](docs/getting_started/installation.md)
|
|
|
84
84
|
|
|
85
85
|
### Quick Start
|
|
86
86
|
|
|
87
|
-
**
|
|
87
|
+
**Stateful** — `ContextPilot` tracks cached state across turns so
|
|
88
|
+
overlapping documents are moved to the prefix for KV-cache reuse:
|
|
88
89
|
|
|
89
90
|
```python
|
|
90
91
|
from openai import OpenAI
|
|
91
92
|
import contextpilot as cp
|
|
92
93
|
|
|
93
|
-
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
["
|
|
94
|
+
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
|
|
95
|
+
cp_live = cp.ContextPilot(use_gpu=False)
|
|
96
|
+
|
|
97
|
+
# Simulated per-turn memory search (e.g. from mem0)
|
|
98
|
+
# Each turn retrieves different but partially overlapping documents
|
|
99
|
+
turn_memories = [
|
|
100
|
+
["Transformers use self-attention", "GPT is based on transformers", "BERT is bidirectional"],
|
|
101
|
+
["RNNs use hidden states", "GPT is based on transformers", "LSTMs solve vanishing gradients"],
|
|
102
|
+
["Attention computes QKV", "Transformers use self-attention", "GPT is based on transformers"],
|
|
100
103
|
]
|
|
104
|
+
queries = ["What are transformers?", "How do RNNs compare?", "Explain attention in detail."]
|
|
101
105
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
106
|
+
for turn_idx, (query, mems) in enumerate(zip(queries, turn_memories)):
|
|
107
|
+
# 1. Reorder for prefix sharing (handles cold start & incremental)
|
|
108
|
+
# .reorder() accepts a single list or list-of-lists
|
|
109
|
+
reordered, indices = cp_live.reorder(mems)
|
|
110
|
+
ctx = reordered[0] # single context per turn
|
|
111
|
+
# Turn 2: "GPT is based on transformers" ← moved to prefix (shared with turn 1)
|
|
112
|
+
# Turn 3: "Transformers …", "GPT …" ← both moved to prefix
|
|
105
113
|
|
|
106
|
-
#
|
|
107
|
-
for ctx, orig_idx in zip(reordered, order):
|
|
114
|
+
# 2. Generate answer with reordered context
|
|
108
115
|
docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
|
|
109
|
-
# Importance ranking restores original retrieval order for the model
|
|
110
116
|
importance_ranking = ">".join(
|
|
111
|
-
str(ctx.index(doc) + 1) for doc in
|
|
117
|
+
str(ctx.index(doc) + 1) for doc in mems if doc in ctx
|
|
112
118
|
)
|
|
113
119
|
response = client.chat.completions.create(
|
|
114
120
|
model="Qwen/Qwen3-4B",
|
|
@@ -119,68 +125,69 @@ for ctx, orig_idx in zip(reordered, order):
|
|
|
119
125
|
f"Read the documents in this importance ranking: {importance_ranking}\n"
|
|
120
126
|
f"Prioritize information from higher-ranked documents."
|
|
121
127
|
)},
|
|
122
|
-
{"role": "user", "content":
|
|
128
|
+
{"role": "user", "content": query},
|
|
123
129
|
],
|
|
124
130
|
)
|
|
125
|
-
print(f"
|
|
131
|
+
print(f"[Turn {turn_idx+1}] Q: {query}")
|
|
132
|
+
print(f"A: {response.choices[0].message.content}\n")
|
|
126
133
|
```
|
|
127
134
|
|
|
128
|
-
> For
|
|
135
|
+
> **Note:** Stateful mode works without eviction sync — `ContextPilot` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited KV-cache capacity, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
|
|
129
136
|
|
|
130
|
-
**
|
|
137
|
+
**Offline / Online Stateless** — same API, just pass the full batch at once:
|
|
131
138
|
|
|
132
139
|
```python
|
|
133
140
|
from openai import OpenAI
|
|
134
141
|
import contextpilot as cp
|
|
135
142
|
|
|
136
|
-
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
# Simulate multi-turn: each turn has batch_size=1
|
|
140
|
-
turns = [
|
|
141
|
-
{
|
|
142
|
-
"query": "What is AI?",
|
|
143
|
-
"contexts": [["Doc about AI", "Doc about ML", "Doc about computing"]],
|
|
144
|
-
},
|
|
145
|
-
{
|
|
146
|
-
"query": "Compare supervised and unsupervised learning",
|
|
147
|
-
# 2 of 3 docs overlap with Turn 1 ("Doc about AI", "Doc about ML"), different order + 1 new doc
|
|
148
|
-
"contexts": [["Doc about ML", "Doc about clustering", "Doc about AI"]],
|
|
149
|
-
},
|
|
150
|
-
]
|
|
143
|
+
client = OpenAI(base_url="http://localhost:30000/v1", api_key="...") # Your inference engine URL and API key
|
|
144
|
+
cp_batch = cp.ContextPilot(use_gpu=False)
|
|
151
145
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
146
|
+
queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
|
|
147
|
+
all_contexts = [
|
|
148
|
+
["Doc about AI", "Doc about ML", "Doc about computing"],
|
|
149
|
+
["Doc about neural nets", "Doc about deep learning"],
|
|
150
|
+
["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
|
|
151
|
+
]
|
|
155
152
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
# Turn 2: reordered to ["Doc about AI", "Doc about ML", "Doc about clustering"]
|
|
160
|
-
# ^— shared prefix from Turn 1 —^ ^— new doc appended
|
|
153
|
+
# One call: builds index, reorders docs for prefix sharing, and schedules execution order
|
|
154
|
+
# .reorder() returns (reordered_contexts, original_indices)
|
|
155
|
+
reordered_ctx, order = cp_batch.reorder(all_contexts)
|
|
161
156
|
|
|
162
|
-
|
|
157
|
+
# Build all prompts in optimized order
|
|
158
|
+
messages_batch = []
|
|
159
|
+
for ctx, orig_idx in zip(reordered_ctx, order):
|
|
163
160
|
docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
|
|
164
161
|
importance_ranking = ">".join(
|
|
165
|
-
str(ctx.index(doc) + 1) for doc in
|
|
162
|
+
str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
|
|
166
163
|
)
|
|
167
|
-
|
|
168
|
-
model
|
|
169
|
-
messages
|
|
164
|
+
messages_batch.append({
|
|
165
|
+
"model": "Qwen/Qwen3-4B",
|
|
166
|
+
"messages": [
|
|
170
167
|
{"role": "system", "content": (
|
|
171
168
|
f"Answer the question based on the provided documents.\n\n"
|
|
172
169
|
f"<documents>\n{docs_section}\n</documents>\n\n"
|
|
173
170
|
f"Read the documents in this importance ranking: {importance_ranking}\n"
|
|
174
171
|
f"Prioritize information from higher-ranked documents."
|
|
175
172
|
)},
|
|
176
|
-
{"role": "user", "content":
|
|
173
|
+
{"role": "user", "content": queries[orig_idx]},
|
|
177
174
|
],
|
|
178
|
-
)
|
|
179
|
-
|
|
180
|
-
|
|
175
|
+
})
|
|
176
|
+
|
|
177
|
+
# Send concurrently — inference engine processes them in order for max cache reuse
|
|
178
|
+
import asyncio, openai
|
|
179
|
+
|
|
180
|
+
async def generate_all(batch):
|
|
181
|
+
aclient = openai.AsyncOpenAI(base_url="http://localhost:30000/v1", api_key="...")
|
|
182
|
+
tasks = [aclient.chat.completions.create(**req) for req in batch]
|
|
183
|
+
return await asyncio.gather(*tasks)
|
|
184
|
+
|
|
185
|
+
responses = asyncio.run(generate_all(messages_batch))
|
|
186
|
+
for resp, orig_idx in zip(responses, order):
|
|
187
|
+
print(f"Q: {queries[orig_idx]}\nA: {resp.choices[0].message.content}\n")
|
|
181
188
|
```
|
|
182
189
|
|
|
183
|
-
>
|
|
190
|
+
> For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
|
|
184
191
|
|
|
185
192
|
## Documentation
|
|
186
193
|
|
|
@@ -16,7 +16,7 @@ Quick Start:
|
|
|
16
16
|
>>>
|
|
17
17
|
>>> results = pipeline.run(queries=["What is AI?"])
|
|
18
18
|
|
|
19
|
-
See docs/
|
|
19
|
+
See docs/reference/api.md for detailed documentation.
|
|
20
20
|
"""
|
|
21
21
|
|
|
22
22
|
from .pipeline import (
|
|
@@ -30,15 +30,13 @@ from .pipeline import (
|
|
|
30
30
|
from .context_index import (
|
|
31
31
|
ContextIndex,
|
|
32
32
|
IndexResult,
|
|
33
|
-
build_context_index,
|
|
34
33
|
)
|
|
35
34
|
|
|
36
35
|
from .context_ordering import (
|
|
37
36
|
IntraContextOrderer,
|
|
38
|
-
InterContextScheduler,
|
|
39
37
|
)
|
|
40
38
|
|
|
41
|
-
from .server.live_index import LiveContextIndex
|
|
39
|
+
from .server.live_index import ContextPilot
|
|
42
40
|
|
|
43
41
|
from .retriever import (
|
|
44
42
|
BM25Retriever,
|
|
@@ -49,7 +47,7 @@ from .retriever import (
|
|
|
49
47
|
MEM0_AVAILABLE,
|
|
50
48
|
)
|
|
51
49
|
|
|
52
|
-
__version__ = "0.3.2"
|
|
50
|
+
__version__ = "0.3.3.post2"
|
|
53
51
|
|
|
54
52
|
__all__ = [
|
|
55
53
|
# High-level pipeline API
|
|
@@ -62,10 +60,8 @@ __all__ = [
|
|
|
62
60
|
# Core components
|
|
63
61
|
'ContextIndex',
|
|
64
62
|
'IndexResult',
|
|
65
|
-
'build_context_index',
|
|
66
63
|
'IntraContextOrderer',
|
|
67
|
-
'InterContextScheduler',
|
|
68
|
-
'LiveContextIndex',
|
|
64
|
+
'ContextPilot',
|
|
69
65
|
|
|
70
66
|
# Retrievers
|
|
71
67
|
'BM25Retriever',
|
{contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_index/compute_distance_cpu.py
RENAMED
|
@@ -9,7 +9,7 @@ from multiprocessing import Pool, cpu_count
|
|
|
9
9
|
from typing import List
|
|
10
10
|
|
|
11
11
|
|
|
12
|
-
def compute_distance_single(context_a: List[int], context_b: List[int], alpha: float = 0.
|
|
12
|
+
def compute_distance_single(context_a: List[int], context_b: List[int], alpha: float = 0.001) -> float:
|
|
13
13
|
"""
|
|
14
14
|
Compute distance between two contexts using our metric:
|
|
15
15
|
distance = (1 - overlap/max_size) + alpha * avg_position_diff
|
|
@@ -49,7 +49,7 @@ def compute_distance_single(context_a: List[int], context_b: List[int], alpha: f
|
|
|
49
49
|
|
|
50
50
|
def compute_distances_batch(queries: List[List[int]],
|
|
51
51
|
targets: List[List[int]],
|
|
52
|
-
alpha: float = 0.
|
|
52
|
+
alpha: float = 0.001,
|
|
53
53
|
num_workers: int = None) -> np.ndarray:
|
|
54
54
|
"""
|
|
55
55
|
Compute distances from multiple query contexts to multiple target contexts.
|
|
@@ -233,7 +233,7 @@ def compute_batch_worker(args):
|
|
|
233
233
|
|
|
234
234
|
|
|
235
235
|
def compute_distance_matrix_cpu_optimized(contexts: List[List[int]],
|
|
236
|
-
alpha: float = 0.
|
|
236
|
+
alpha: float = 0.001,
|
|
237
237
|
num_workers: int = None,
|
|
238
238
|
batch_size: int = 1000) -> np.ndarray:
|
|
239
239
|
"""
|
{contextpilot-0.3.2 → contextpilot-0.3.3.post2}/contextpilot/context_index/index_construction.py
RENAMED
|
@@ -74,7 +74,7 @@ class ContextIndex:
|
|
|
74
74
|
def __init__(self,
|
|
75
75
|
linkage_method: str = "average",
|
|
76
76
|
use_gpu: bool = True,
|
|
77
|
-
alpha: float = 0.
|
|
77
|
+
alpha: float = 0.001,
|
|
78
78
|
num_workers: Optional[int] = None,
|
|
79
79
|
batch_size: int = 1000):
|
|
80
80
|
"""
|
|
@@ -301,7 +301,7 @@ class ContextIndex:
|
|
|
301
301
|
def build_context_index(contexts,
|
|
302
302
|
linkage_method: str = "average",
|
|
303
303
|
use_gpu: bool = True,
|
|
304
|
-
alpha: float = 0.
|
|
304
|
+
alpha: float = 0.001,
|
|
305
305
|
num_workers: Optional[int] = None,
|
|
306
306
|
batch_size: int = 1000) -> IndexResult:
|
|
307
307
|
"""
|
|
@@ -333,3 +333,5 @@ def build_context_index(contexts,
|
|
|
333
333
|
result.reordered_prompts = result.reordered_contexts
|
|
334
334
|
result.original_prompts = result.original_contexts
|
|
335
335
|
return result
|
|
336
|
+
|
|
337
|
+
|
|
@@ -9,7 +9,7 @@ Includes HTTP server/client for remote index access from SGLang.
|
|
|
9
9
|
|
|
10
10
|
from .metadata import NodeMetadata
|
|
11
11
|
from .eviction_heap import EvictionHeap
|
|
12
|
-
from .live_index import LiveContextIndex
|
|
12
|
+
from .live_index import ContextPilot
|
|
13
13
|
|
|
14
14
|
# HTTP server/client (optional - requires fastapi/requests)
|
|
15
15
|
try:
|
|
@@ -25,7 +25,7 @@ except ImportError:
|
|
|
25
25
|
__all__ = [
|
|
26
26
|
'NodeMetadata',
|
|
27
27
|
'EvictionHeap',
|
|
28
|
-
'LiveContextIndex',
|
|
28
|
+
'ContextPilot',
|
|
29
29
|
'ContextPilotIndexClient',
|
|
30
30
|
'evict_tokens',
|
|
31
31
|
'http_app',
|