contextpilot 0.3.2.tar.gz → 0.3.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {contextpilot-0.3.2 → contextpilot-0.3.3}/PKG-INFO +57 -53
- {contextpilot-0.3.2 → contextpilot-0.3.3}/README.md +56 -52
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/__init__.py +4 -8
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/__init__.py +1 -1
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/compute_distance_cpu.py +3 -3
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/index_construction.py +4 -2
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/pipeline/rag_pipeline.py +1 -1
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/__init__.py +2 -2
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/http_client.py +133 -106
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/http_server.py +211 -248
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/live_index.py +164 -11
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot.egg-info/PKG-INFO +57 -53
- {contextpilot-0.3.2 → contextpilot-0.3.3}/pyproject.toml +1 -1
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_context_index.py +8 -8
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_cpu_distances.py +3 -3
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_gpu_distance_performance.py +4 -4
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_gpu_distances.py +1 -1
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_group_prefix_sharing.py +3 -3
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_live_index.py +205 -16
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_performance.py +18 -18
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_pipeline.py +2 -2
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_server_integration.py +1 -1
- {contextpilot-0.3.2 → contextpilot-0.3.3}/LICENSE +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/compute_distance_gpu.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/tree_nodes.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_ordering/__init__.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_ordering/inter_scheduler.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_ordering/intra_ordering.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/pipeline/__init__.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/pipeline/components.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/pipeline/multi_turn.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/retriever/__init__.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/retriever/bm25.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/retriever/faiss_embedding.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/retriever/mem0_retriever.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/retriever/pageindex_retriever.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/conversation_tracker.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/eviction_heap.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/metadata.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/utils/__init__.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/utils/eval_metrics.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/utils/prompt_generator.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/utils/tools.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot.egg-info/SOURCES.txt +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot.egg-info/dependency_links.txt +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot.egg-info/requires.txt +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot.egg-info/top_level.txt +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/requirements.txt +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/setup.cfg +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_context_ordering.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_deduplication.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_incremental_build.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_mem0_integration.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_multi_turn.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_multi_turn_e2e.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_pageindex_integration.py +0 -0
- {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_utils.py +0 -0
{contextpilot-0.3.2 → contextpilot-0.3.3}/PKG-INFO
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: contextpilot
-Version: 0.3.2
+Version: 0.3.3
 Summary: Efficient Retrieval-Augmented Generation with Accuracy-Preserving Context Reuse
 Author: Yinsicheng Jiang, Chivier Humber
 License: Apache-2.0

@@ -125,31 +125,35 @@ More [detailed installation instructions](docs/getting_started/installation.md)

 ### Quick Start

-**
+**Stateful** — `ContextPilot` tracks cached state across turns so
+overlapping documents are moved to the prefix for KV-cache reuse:

 ```python
 from openai import OpenAI
 import contextpilot as cp

-client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
-
-
-
-
-
-["
+client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
+cp_live = cp.ContextPilot(use_gpu=False)
+
+# Simulated per-turn memory search (e.g. from mem0)
+# Each turn retrieves different but partially overlapping documents
+turn_memories = [
+    ["Transformers use self-attention", "GPT is based on transformers", "BERT is bidirectional"],
+    ["RNNs use hidden states", "GPT is based on transformers", "LSTMs solve vanishing gradients"],
+    ["Attention computes QKV", "Transformers use self-attention", "GPT is based on transformers"],
 ]
+queries = ["What are transformers?", "How do RNNs compare?", "Explain attention in detail."]

-
-
-
+for turn_idx, (query, mems) in enumerate(zip(queries, turn_memories)):
+    # 1. Reorder for prefix sharing (handles cold start & incremental)
+    [ctx], order = cp_live.reorder([mems])  # single request per turn
+    # Turn 2: "GPT is based on transformers" ← moved to prefix (shared with turn 1)
+    # Turn 3: "Transformers …", "GPT …" ← both moved to prefix

-#
-for ctx, orig_idx in zip(reordered, order):
+    # 2. Generate answer with reordered context
     docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
-    # Importance ranking restores original retrieval order for the model
     importance_ranking = ">".join(
-        str(ctx.index(doc) + 1) for doc in
+        str(ctx.index(doc) + 1) for doc in mems if doc in ctx
     )
     response = client.chat.completions.create(
         model="Qwen/Qwen3-4B",

@@ -160,68 +164,68 @@ for ctx, orig_idx in zip(reordered, order):
                 f"Read the documents in this importance ranking: {importance_ranking}\n"
                 f"Prioritize information from higher-ranked documents."
             )},
-{"role": "user", "content":
+            {"role": "user", "content": query},
         ],
     )
-print(f"
+    print(f"[Turn {turn_idx+1}] Q: {query}")
+    print(f"A: {response.choices[0].message.content}\n")
 ```

-> For
+> **Note:** Stateful mode works without eviction sync — `ContextPilot` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited KV-cache capacity, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.

-**
+**Offline / Online Stateless** — same API, just pass the full batch at once:

 ```python
 from openai import OpenAI
 import contextpilot as cp

-client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
-
-
-# Simulate multi-turn: each turn has batch_size=1
-turns = [
-    {
-        "query": "What is AI?",
-        "contexts": [["Doc about AI", "Doc about ML", "Doc about computing"]],
-    },
-    {
-        "query": "Compare supervised and unsupervised learning",
-        # 2 of 3 docs overlap with Turn 1 ("Doc about AI", "Doc about ML"), different order + 1 new doc
-        "contexts": [["Doc about ML", "Doc about clustering", "Doc about AI"]],
-    },
-]
+client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")  # Your inference engine URL and API key
+cp_batch = cp.ContextPilot(use_gpu=False)

-
-
-
+queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
+all_contexts = [
+    ["Doc about AI", "Doc about ML", "Doc about computing"],
+    ["Doc about neural nets", "Doc about deep learning"],
+    ["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
+]

-
-
-reordered = result['reordered_contexts']
-# Turn 2: reordered to ["Doc about AI", "Doc about ML", "Doc about clustering"]
-#          ^— shared prefix from Turn 1 —^  ^— new doc appended
+# One call: builds index, reorders docs for prefix sharing, and schedules execution order
+reordered, order = cp_batch.reorder(all_contexts)

-
+# Build all prompts in optimized order
+messages_batch = []
+for ctx, orig_idx in zip(reordered, order):
     docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
     importance_ranking = ">".join(
-        str(ctx.index(doc) + 1) for doc in
+        str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
     )
-
-model
-messages
+    messages_batch.append({
+        "model": "Qwen/Qwen3-4B",
+        "messages": [
            {"role": "system", "content": (
                f"Answer the question based on the provided documents.\n\n"
                f"<documents>\n{docs_section}\n</documents>\n\n"
                f"Read the documents in this importance ranking: {importance_ranking}\n"
                f"Prioritize information from higher-ranked documents."
            )},
-{"role": "user", "content":
+            {"role": "user", "content": queries[orig_idx]},
        ],
-)
-
-
+    })
+
+# Send concurrently — inference engine processes them in order for max cache reuse
+import asyncio, openai
+
+async def generate_all(batch):
+    aclient = openai.AsyncOpenAI(base_url="http://localhost:30000/v1", api_key="...")
+    tasks = [aclient.chat.completions.create(**req) for req in batch]
+    return await asyncio.gather(*tasks)
+
+responses = asyncio.run(generate_all(messages_batch))
+for resp, orig_idx in zip(responses, order):
+    print(f"Q: {queries[orig_idx]}\nA: {resp.choices[0].message.content}\n")
 ```

->
+> For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).

 ## Documentation
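The `importance_ranking` expression in the Quick Start code is easy to misread, so here is a tiny standalone sketch of just that expression. The `ctx` ordering below is a hypothetical `reorder()` output, not taken from a real run:

```python
# Hypothetical example: the second turn's memories and a reordered context
# where the doc shared with turn 1 has been moved to the prefix.
mems = ["RNNs use hidden states", "GPT is based on transformers",
        "LSTMs solve vanishing gradients"]
ctx = ["GPT is based on transformers", "RNNs use hidden states",
       "LSTMs solve vanishing gradients"]  # assumed reorder() output

# 1-based positions in the reordered context, listed in retrieval order,
# so the prompt can still tell the model which documents matter most.
importance_ranking = ">".join(
    str(ctx.index(doc) + 1) for doc in mems if doc in ctx
)
print(importance_ranking)  # prints "2>1>3"
```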
{contextpilot-0.3.2 → contextpilot-0.3.3}/README.md
RENAMED

@@ -84,31 +84,35 @@ More [detailed installation instructions](docs/getting_started/installation.md)
@@ -119,68 +123,68 @@ for ctx, orig_idx in zip(reordered, order):

(The README.md hunks are identical to the PKG-INFO Quick Start hunks above; only the line offsets differ, because PKG-INFO embeds README.md after its metadata header.)
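Both Quick Start snippets lean on the same return contract: `reorder()` gives back each request's documents rearranged for prefix sharing, plus the original request index for every returned position. A minimal sketch of how the two values line up, assuming only what the README code itself uses:

```python
import contextpilot as cp

cp_batch = cp.ContextPilot(use_gpu=False)
all_contexts = [
    ["Doc about AI", "Doc about ML", "Doc about computing"],
    ["Doc about neural nets", "Doc about deep learning"],
    ["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
]
reordered, order = cp_batch.reorder(all_contexts)

# order[i] is the original request index of reordered[i]; requests 0 and 2
# share two docs, so they should come out adjacent with the shared docs
# aligned at the front of each context. The exact schedule is up to the library.
for ctx, orig_idx in zip(reordered, order):
    print(orig_idx, ctx)
```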
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/__init__.py
RENAMED

@@ -16,7 +16,7 @@ Quick Start:
 >>>
 >>> results = pipeline.run(queries=["What is AI?"])

-See docs/
+See docs/reference/api.md for detailed documentation.
 """

 from .pipeline import (
@@ -30,15 +30,13 @@ from .pipeline import (
 from .context_index import (
     ContextIndex,
     IndexResult,
-    build_context_index,
 )

 from .context_ordering import (
     IntraContextOrderer,
-    InterContextScheduler,
 )

-from .server.live_index import LiveContextIndex
+from .server.live_index import ContextPilot

 from .retriever import (
     BM25Retriever,
@@ -49,7 +47,7 @@ from .retriever import (
     MEM0_AVAILABLE,
 )

-__version__ = "0.3.2"
+__version__ = "0.3.3"

 __all__ = [
     # High-level pipeline API
@@ -62,10 +60,8 @@ __all__ = [
     # Core components
     'ContextIndex',
     'IndexResult',
-    'build_context_index',
     'IntraContextOrderer',
-    'InterContextScheduler',
-    'LiveContextIndex',
+    'ContextPilot',

     # Retrievers
     'BM25Retriever',
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/compute_distance_cpu.py
RENAMED
@@ -9,7 +9,7 @@ from multiprocessing import Pool, cpu_count
 from typing import List


-def compute_distance_single(context_a: List[int], context_b: List[int], alpha: float = 0.
+def compute_distance_single(context_a: List[int], context_b: List[int], alpha: float = 0.001) -> float:
     """
     Compute distance between two contexts using our metric:
     distance = (1 - overlap/max_size) + alpha * avg_position_diff
@@ -49,7 +49,7 @@ def compute_distance_single(context_a: List[int], context_b: List[int], alpha: f

 def compute_distances_batch(queries: List[List[int]],
                             targets: List[List[int]],
-                            alpha: float = 0.
+                            alpha: float = 0.001,
                             num_workers: int = None) -> np.ndarray:
     """
     Compute distances from multiple query contexts to multiple target contexts.
@@ -233,7 +233,7 @@ def compute_batch_worker(args):


 def compute_distance_matrix_cpu_optimized(contexts: List[List[int]],
-                                           alpha: float = 0.
+                                           alpha: float = 0.001,
                                            num_workers: int = None,
                                            batch_size: int = 1000) -> np.ndarray:
     """
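The only change in this file is the `alpha` default. Per the docstring shown above, `alpha` weights the position term in `distance = (1 - overlap/max_size) + alpha * avg_position_diff`. A rough standalone sketch of that formula; the exact definition of `avg_position_diff` is not visible in this diff, so averaging absolute position differences over shared tokens is an assumption:

```python
from typing import List

def context_distance(a: List[int], b: List[int], alpha: float = 0.001) -> float:
    # distance = (1 - overlap/max_size) + alpha * avg_position_diff
    pos_a = {tok: i for i, tok in enumerate(a)}
    pos_b = {tok: i for i, tok in enumerate(b)}
    shared = pos_a.keys() & pos_b.keys()
    if not shared:
        return 1.0  # no overlap: maximum base distance
    overlap_term = 1 - len(shared) / max(len(a), len(b))
    avg_position_diff = sum(abs(pos_a[t] - pos_b[t]) for t in shared) / len(shared)
    return overlap_term + alpha * avg_position_diff

# With alpha this small, overlap dominates and positions mostly break ties:
print(context_distance([1, 2, 3], [3, 2, 4]))  # ~0.334
```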
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/index_construction.py
RENAMED

@@ -74,7 +74,7 @@ class ContextIndex:
     def __init__(self,
                  linkage_method: str = "average",
                  use_gpu: bool = True,
-                 alpha: float = 0.
+                 alpha: float = 0.001,
                  num_workers: Optional[int] = None,
                  batch_size: int = 1000):
         """
@@ -301,7 +301,7 @@ class ContextIndex:
 def build_context_index(contexts,
                         linkage_method: str = "average",
                         use_gpu: bool = True,
-                        alpha: float = 0.
+                        alpha: float = 0.001,
                         num_workers: Optional[int] = None,
                         batch_size: int = 1000) -> IndexResult:
     """
@@ -333,3 +333,5 @@ def build_context_index(contexts,
     result.reordered_prompts = result.reordered_contexts
     result.original_prompts = result.original_contexts
     return result
+
+
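For orientation, the touched signature can be exercised as below. This is a sketch: the token-ID contexts are made up, and it assumes `build_context_index` accepts plain lists of ints, as the distance helpers' type hints suggest. The two assertions mirror the aliasing in the final hunk:

```python
from contextpilot.context_index.index_construction import build_context_index

contexts = [
    [101, 7, 42, 13],   # hypothetical token-ID contexts
    [7, 42, 99],
    [42, 7, 13, 55],
]

# alpha now defaults to 0.001; passed explicitly here for clarity.
result = build_context_index(contexts, use_gpu=False, alpha=0.001)

# Per the @@ -333 hunk above, the prompt-named fields alias the context fields.
assert result.reordered_prompts is result.reordered_contexts
assert result.original_prompts is result.original_contexts
```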
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/__init__.py
RENAMED

@@ -9,7 +9,7 @@ Includes HTTP server/client for remote index access from SGLang.

 from .metadata import NodeMetadata
 from .eviction_heap import EvictionHeap
-from .live_index import LiveContextIndex
+from .live_index import ContextPilot

 # HTTP server/client (optional - requires fastapi/requests)
 try:
@@ -25,7 +25,7 @@ except ImportError:
 __all__ = [
     'NodeMetadata',
     'EvictionHeap',
-    'LiveContextIndex',
+    'ContextPilot',
     'ContextPilotIndexClient',
     'evict_tokens',
     'http_app',