contextpilot 0.3.2.tar.gz → 0.3.3.tar.gz

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (57)
  1. {contextpilot-0.3.2 → contextpilot-0.3.3}/PKG-INFO +57 -53
  2. {contextpilot-0.3.2 → contextpilot-0.3.3}/README.md +56 -52
  3. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/__init__.py +4 -8
  4. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/__init__.py +1 -1
  5. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/compute_distance_cpu.py +3 -3
  6. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/index_construction.py +4 -2
  7. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/pipeline/rag_pipeline.py +1 -1
  8. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/__init__.py +2 -2
  9. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/http_client.py +133 -106
  10. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/http_server.py +211 -248
  11. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/live_index.py +164 -11
  12. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot.egg-info/PKG-INFO +57 -53
  13. {contextpilot-0.3.2 → contextpilot-0.3.3}/pyproject.toml +1 -1
  14. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_context_index.py +8 -8
  15. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_cpu_distances.py +3 -3
  16. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_gpu_distance_performance.py +4 -4
  17. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_gpu_distances.py +1 -1
  18. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_group_prefix_sharing.py +3 -3
  19. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_live_index.py +205 -16
  20. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_performance.py +18 -18
  21. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_pipeline.py +2 -2
  22. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_server_integration.py +1 -1
  23. {contextpilot-0.3.2 → contextpilot-0.3.3}/LICENSE +0 -0
  24. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/compute_distance_gpu.py +0 -0
  25. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/tree_nodes.py +0 -0
  26. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_ordering/__init__.py +0 -0
  27. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_ordering/inter_scheduler.py +0 -0
  28. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_ordering/intra_ordering.py +0 -0
  29. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/pipeline/__init__.py +0 -0
  30. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/pipeline/components.py +0 -0
  31. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/pipeline/multi_turn.py +0 -0
  32. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/retriever/__init__.py +0 -0
  33. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/retriever/bm25.py +0 -0
  34. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/retriever/faiss_embedding.py +0 -0
  35. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/retriever/mem0_retriever.py +0 -0
  36. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/retriever/pageindex_retriever.py +0 -0
  37. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/conversation_tracker.py +0 -0
  38. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/eviction_heap.py +0 -0
  39. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/metadata.py +0 -0
  40. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/utils/__init__.py +0 -0
  41. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/utils/eval_metrics.py +0 -0
  42. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/utils/prompt_generator.py +0 -0
  43. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/utils/tools.py +0 -0
  44. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot.egg-info/SOURCES.txt +0 -0
  45. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot.egg-info/dependency_links.txt +0 -0
  46. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot.egg-info/requires.txt +0 -0
  47. {contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot.egg-info/top_level.txt +0 -0
  48. {contextpilot-0.3.2 → contextpilot-0.3.3}/requirements.txt +0 -0
  49. {contextpilot-0.3.2 → contextpilot-0.3.3}/setup.cfg +0 -0
  50. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_context_ordering.py +0 -0
  51. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_deduplication.py +0 -0
  52. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_incremental_build.py +0 -0
  53. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_mem0_integration.py +0 -0
  54. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_multi_turn.py +0 -0
  55. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_multi_turn_e2e.py +0 -0
  56. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_pageindex_integration.py +0 -0
  57. {contextpilot-0.3.2 → contextpilot-0.3.3}/tests/test_utils.py +0 -0
{contextpilot-0.3.2 → contextpilot-0.3.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: contextpilot
-Version: 0.3.2
+Version: 0.3.3
 Summary: Efficient Retrieval-Augmented Generation with Accuracy-Preserving Context Reuse
 Author: Yinsicheng Jiang, Chivier Humber
 License: Apache-2.0
@@ -125,31 +125,35 @@ More [detailed installation instructions](docs/getting_started/installation.md)
 
 ### Quick Start
 
-**Offline / Online Stateless** — build index & schedule in one shot:
+**Stateful** — `ContextPilot` tracks cached state across turns so
+overlapping documents are moved to the prefix for KV-cache reuse:
 
 ```python
 from openai import OpenAI
 import contextpilot as cp
 
-client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")  # Your inference engine URL and API key
-
-queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
-all_contexts = [
-    ["Doc about AI", "Doc about ML", "Doc about computing"],
-    ["Doc about neural nets", "Doc about deep learning"],
-    ["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
+client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
+cp_live = cp.ContextPilot(use_gpu=False)
+
+# Simulated per-turn memory search (e.g. from mem0)
+# Each turn retrieves different but partially overlapping documents
+turn_memories = [
+    ["Transformers use self-attention", "GPT is based on transformers", "BERT is bidirectional"],
+    ["RNNs use hidden states", "GPT is based on transformers", "LSTMs solve vanishing gradients"],
+    ["Attention computes QKV", "Transformers use self-attention", "GPT is based on transformers"],
 ]
+queries = ["What are transformers?", "How do RNNs compare?", "Explain attention in detail."]
 
-# Build index and schedule for prefix sharing
-index = cp.build_context_index(all_contexts, use_gpu=False)
-reordered, _, order, _ = cp.InterContextScheduler().schedule_contexts(index)
+for turn_idx, (query, mems) in enumerate(zip(queries, turn_memories)):
+    # 1. Reorder for prefix sharing (handles cold start & incremental)
+    [ctx], order = cp_live.reorder([mems])  # single request per turn
+    # Turn 2: "GPT is based on transformers" ← moved to prefix (shared with turn 1)
+    # Turn 3: "Transformers …", "GPT …" ← both moved to prefix
 
-# Send in optimized order shared prefixes hit KV cache
-for ctx, orig_idx in zip(reordered, order):
+    # 2. Generate answer with reordered context
     docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
-    # Importance ranking restores original retrieval order for the model
     importance_ranking = ">".join(
-        str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
+        str(ctx.index(doc) + 1) for doc in mems if doc in ctx
     )
     response = client.chat.completions.create(
         model="Qwen/Qwen3-4B",
@@ -160,68 +164,68 @@ for ctx, orig_idx in zip(reordered, order):
                 f"Read the documents in this importance ranking: {importance_ranking}\n"
                 f"Prioritize information from higher-ranked documents."
             )},
-            {"role": "user", "content": queries[orig_idx]},
+            {"role": "user", "content": query},
         ],
     )
-    print(f"Q: {queries[orig_idx]}\nA: {response.choices[0].message.content}\n")
+    print(f"[Turn {turn_idx+1}] Q: {query}")
+    print(f"A: {response.choices[0].message.content}\n")
 ```
 
-> For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
+> **Note:** Stateful mode works without eviction sync — `ContextPilot` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited KV-cache capacity, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
 
-**Stateful** — `LiveContextIndex` tracks cached state:
+**Offline / Online Stateless** — same API, just pass the full batch at once:
 
 ```python
 from openai import OpenAI
 import contextpilot as cp
 
-client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")
-live = cp.LiveContextIndex(use_gpu=False)
-
-# Simulate multi-turn: each turn has batch_size=1
-turns = [
-    {
-        "query": "What is AI?",
-        "contexts": [["Doc about AI", "Doc about ML", "Doc about computing"]],
-    },
-    {
-        "query": "Compare supervised and unsupervised learning",
-        # 2 of 3 docs overlap with Turn 1 ("Doc about AI", "Doc about ML"), different order + 1 new doc
-        "contexts": [["Doc about ML", "Doc about clustering", "Doc about AI"]],
-    },
-]
+client = OpenAI(base_url="http://localhost:30000/v1", api_key="...")  # Your inference engine URL and API key
+cp_batch = cp.ContextPilot(use_gpu=False)
 
-for turn_idx, turn in enumerate(turns):
-    contexts = turn["contexts"]
-    query = turn["query"]
+queries = ["What is AI?", "Explain neural networks", "What is deep learning?"]
+all_contexts = [
+    ["Doc about AI", "Doc about ML", "Doc about computing"],
+    ["Doc about neural nets", "Doc about deep learning"],
+    ["Doc about ML", "Doc about AI", "Doc about deep learning basics"],
+]
 
-    # build_incremental handles both cold start and incremental turns
-    result = live.build_incremental(contexts)
-    reordered = result['reordered_contexts']
-    # Turn 2: reordered to ["Doc about AI", "Doc about ML", "Doc about clustering"]
-    #          ^— shared prefix from Turn 1 —^   ^— new doc appended
+# One call: builds index, reorders docs for prefix sharing, and schedules execution order
+reordered, order = cp_batch.reorder(all_contexts)
 
-    ctx = reordered[0]
+# Build all prompts in optimized order
+messages_batch = []
+for ctx, orig_idx in zip(reordered, order):
     docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
     importance_ranking = ">".join(
-        str(ctx.index(doc) + 1) for doc in contexts[0] if doc in ctx
+        str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
     )
-    response = client.chat.completions.create(
-        model="Qwen/Qwen3-4B",
-        messages=[
+    messages_batch.append({
+        "model": "Qwen/Qwen3-4B",
+        "messages": [
            {"role": "system", "content": (
                f"Answer the question based on the provided documents.\n\n"
                f"<documents>\n{docs_section}\n</documents>\n\n"
                f"Read the documents in this importance ranking: {importance_ranking}\n"
                f"Prioritize information from higher-ranked documents."
            )},
-            {"role": "user", "content": query},
+            {"role": "user", "content": queries[orig_idx]},
         ],
-    )
-    print(f"[Turn {turn_idx+1}] Q: {query}")
-    print(f"A: {response.choices[0].message.content}\n")
+    })
+
+# Send concurrently — inference engine processes them in order for max cache reuse
+import asyncio, openai
+
+async def generate_all(batch):
+    aclient = openai.AsyncOpenAI(base_url="http://localhost:30000/v1", api_key="...")
+    tasks = [aclient.chat.completions.create(**req) for req in batch]
+    return await asyncio.gather(*tasks)
+
+responses = asyncio.run(generate_all(messages_batch))
+for resp, orig_idx in zip(responses, order):
+    print(f"Q: {queries[orig_idx]}\nA: {resp.choices[0].message.content}\n")
 ```
 
-> **Note:** Stateful mode works without eviction sync — `LiveContextIndex` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited storage size where the KV cache may evict entries, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
+> For online stateless scheduling via HTTP server, see the [online usage guide](docs/guides/online_usage.md).
 
 ## Documentation
 
{contextpilot-0.3.2 → contextpilot-0.3.3}/README.md

(The README.md hunks — @@ -84,31 +84,35 @@ and @@ -119,68 +123,68 @@ — are identical to the PKG-INFO body changes shown above; PKG-INFO embeds the README, so its hunks are simply offset by the 41 metadata lines.)
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/__init__.py

@@ -16,7 +16,7 @@ Quick Start:
     >>>
     >>> results = pipeline.run(queries=["What is AI?"])
 
-See docs/PIPELINE_API.md for detailed documentation.
+See docs/reference/api.md for detailed documentation.
 """
 
 from .pipeline import (
@@ -30,15 +30,13 @@ from .pipeline import (
 from .context_index import (
     ContextIndex,
     IndexResult,
-    build_context_index,
 )
 
 from .context_ordering import (
     IntraContextOrderer,
-    InterContextScheduler,
 )
 
-from .server.live_index import LiveContextIndex
+from .server.live_index import ContextPilot
 
 from .retriever import (
     BM25Retriever,
@@ -49,7 +47,7 @@ from .retriever import (
     MEM0_AVAILABLE,
 )
 
-__version__ = "0.3.2"
+__version__ = "0.3.3"
 
 __all__ = [
     # High-level pipeline API
@@ -62,10 +60,8 @@ __all__ = [
     # Core components
     'ContextIndex',
     'IndexResult',
-    'build_context_index',
     'IntraContextOrderer',
-    'InterContextScheduler',
-    'LiveContextIndex',
+    'ContextPilot',
 
     # Retrievers
     'BM25Retriever',
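Taken together, the `contextpilot/__init__.py` hunks are an API consolidation: the separate 0.3.2 entry points (`build_context_index` plus `InterContextScheduler` for batch scheduling, `LiveContextIndex` for stateful use) leave the top-level namespace in favor of the single `ContextPilot` class. A minimal before/after sketch, assembled from the README changes above; only the `(reordered_contexts, order)` return shape of `reorder()` is visible in this diff, so anything beyond that is an assumption:

```python
import contextpilot as cp

contexts = [["doc A", "doc B"], ["doc B", "doc C"]]

# 0.3.2 (batch): build an index, then schedule it
# index = cp.build_context_index(contexts, use_gpu=False)
# reordered, _, order, _ = cp.InterContextScheduler().schedule_contexts(index)
# 0.3.2 (stateful): incremental builds on a live index
# live = cp.LiveContextIndex(use_gpu=False)
# reordered = live.build_incremental(contexts)["reordered_contexts"]

# 0.3.3: one class covers both; pass the full batch at once,
# or call reorder() once per turn to reuse state across turns
pilot = cp.ContextPilot(use_gpu=False)
reordered, order = pilot.reorder(contexts)
```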
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/__init__.py

@@ -15,7 +15,7 @@ from .tree_nodes import (
 from .index_construction import (
     ContextIndex,
     IndexResult,
-    build_context_index
+    build_context_index,
 )
 
 # Import distance computation
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/compute_distance_cpu.py

@@ -9,7 +9,7 @@ from multiprocessing import Pool, cpu_count
 from typing import List
 
 
-def compute_distance_single(context_a: List[int], context_b: List[int], alpha: float = 0.005) -> float:
+def compute_distance_single(context_a: List[int], context_b: List[int], alpha: float = 0.001) -> float:
     """
     Compute distance between two contexts using our metric:
     distance = (1 - overlap/max_size) + alpha * avg_position_diff
@@ -49,7 +49,7 @@ def compute_distance_single(context_a: List[int], context_b: List[int], alpha: f
 
 def compute_distances_batch(queries: List[List[int]],
                             targets: List[List[int]],
-                            alpha: float = 0.005,
+                            alpha: float = 0.001,
                             num_workers: int = None) -> np.ndarray:
     """
     Compute distances from multiple query contexts to multiple target contexts.
@@ -233,7 +233,7 @@ def compute_batch_worker(args):
 
 
 def compute_distance_matrix_cpu_optimized(contexts: List[List[int]],
-                                          alpha: float = 0.005,
+                                          alpha: float = 0.001,
                                           num_workers: int = None,
                                           batch_size: int = 1000) -> np.ndarray:
     """
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/context_index/index_construction.py

@@ -74,7 +74,7 @@ class ContextIndex:
     def __init__(self,
                  linkage_method: str = "average",
                  use_gpu: bool = True,
-                 alpha: float = 0.005,
+                 alpha: float = 0.001,
                  num_workers: Optional[int] = None,
                  batch_size: int = 1000):
         """
@@ -301,7 +301,7 @@ class ContextIndex:
 def build_context_index(contexts,
                         linkage_method: str = "average",
                         use_gpu: bool = True,
-                        alpha: float = 0.005,
+                        alpha: float = 0.001,
                         num_workers: Optional[int] = None,
                         batch_size: int = 1000) -> IndexResult:
     """
@@ -333,3 +333,5 @@ def build_context_index(contexts,
     result.reordered_prompts = result.reordered_contexts
     result.original_prompts = result.original_contexts
     return result
+
+
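Since `alpha` remains a keyword argument on both `ContextIndex` and `build_context_index`, callers that depend on the 0.3.2 clustering behavior can pin the old default explicitly. A sketch using the signatures shown above:

```python
from contextpilot.context_index import build_context_index

contexts = [[1, 2, 3], [3, 2, 4]]  # toy token-ID contexts
# Pin the previous default rather than inheriting the new 0.001
result = build_context_index(contexts, use_gpu=False, alpha=0.005)
```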
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/pipeline/rag_pipeline.py

@@ -840,7 +840,7 @@ class RAGPipeline:
             json={
                 "contexts": contexts,
                 "initial_tokens_per_context": 100,
-                "alpha": 0.005,
+                "alpha": 0.001,
                 "use_gpu": False,
                 "linkage_method": "average",
                 "incremental": incremental
{contextpilot-0.3.2 → contextpilot-0.3.3}/contextpilot/server/__init__.py

@@ -9,7 +9,7 @@ Includes HTTP server/client for remote index access from SGLang.
 
 from .metadata import NodeMetadata
 from .eviction_heap import EvictionHeap
-from .live_index import LiveContextIndex
+from .live_index import ContextPilot
 
 # HTTP server/client (optional - requires fastapi/requests)
 try:
@@ -25,7 +25,7 @@ except ImportError:
 __all__ = [
     'NodeMetadata',
     'EvictionHeap',
-    'LiveContextIndex',
+    'ContextPilot',
     'ContextPilotIndexClient',
     'evict_tokens',
     'http_app',
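For downstream code, the net effect of the `server/__init__.py` change is the renamed export. A quick sketch of the 0.3.3 import surface implied by the `__all__` above; the exact failure mode when the optional fastapi/requests extras are missing is an assumption:

```python
# Core exports from contextpilot.server in 0.3.3
from contextpilot.server import NodeMetadata, EvictionHeap, ContextPilot  # was LiveContextIndex

# HTTP server/client pieces are optional, guarded by the try/except shown above
try:
    from contextpilot.server import ContextPilotIndexClient, evict_tokens, http_app
except ImportError:
    pass  # assumed: unavailable without the fastapi/requests extras
```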