PyPI - contextpilot - Versions diffs - 0.3.3__tar.gz → 0.3.4__tar.gz - Mend

contextpilot-0.3.3.tar.gz → contextpilot-0.3.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58)

{contextpilot-0.3.3 → contextpilot-0.3.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: contextpilot
-Version: 0.3.3
+Version: 0.3.4
 Summary: Efficient Retrieval-Augmented Generation with Accuracy-Preserving Context Reuse
 Author: Yinsicheng Jiang, Chivier Humber
 License: Apache-2.0
@@ -42,7 +42,7 @@ Dynamic: license-file
 <div align="center">
   <img src="assets/about.png" alt="ContextPilot Logo" width="800"/>
-  <h1><strong>ContextPilot: Efficient Long Context Inference with Context Reuse</strong></h1>
+  <h1><strong>ContextPilot: Fast Long-Context Inference via Context Reuse</strong></h1>
   [![Python](https://img.shields.io/badge/python-≥3.10-blue)](https://www.python.org/)
   [![PyPI](https://img.shields.io/pypi/v/contextpilot)](https://pypi.org/project/contextpilot/)
@@ -80,7 +80,7 @@ ContextPilot is a fast optimization system on context engineering layer for agen
 ### System Performance
 <div align="center">
-<img src="assets/deepseek_r1_results.png" alt="Benchmark Results" width="600"/>
+<img src="assets/ds_r1_result_horizontal.png" alt="Benchmark Results" width="800"/>
 </div>
 ContextPilot (Stateless) on DeepSeek-R1 maintains accuracy compared to SGLang, achieving 64.68% vs 64.15% F1 on MultihopRAG and 41.08% vs 40.20% F1 on NarrativeQA.
@@ -146,15 +146,18 @@ queries = ["What are transformers?", "How do RNNs compare?", "Explain attention
 for turn_idx, (query, mems) in enumerate(zip(queries, turn_memories)):
     # 1. Reorder for prefix sharing (handles cold start & incremental)
-    [ctx], order = cp_live.reorder([mems])   # single request per turn
+    # .reorder() accepts a single list or list-of-lists
+    reordered, indices = cp_live.reorder(mems)
+    ctx = reordered[0]  # single context per turn
     # Turn 2: "GPT is based on transformers" ← moved to prefix (shared with turn 1)
     # Turn 3: "Transformers …", "GPT …"     ← both moved to prefix
     # 2. Generate answer with reordered context
     docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
-    importance_ranking = ">".join(
-        str(ctx.index(doc) + 1) for doc in mems if doc in ctx
-    )
+    # Map original importance order (mems) → 1-based positions in reordered ctx
+    pos = {doc: i + 1 for i, doc in enumerate(ctx)}
+    importance_ranking = ">".join(str(pos[doc]) for doc in mems if doc in pos)
+    # System prompt = documents + importance ranking (after </documents>, doesn't affect prefix sharing)
     response = client.chat.completions.create(
         model="Qwen/Qwen3-4B",
         messages=[
@@ -171,7 +174,7 @@ for turn_idx, (query, mems) in enumerate(zip(queries, turn_memories)):
     print(f"A: {response.choices[0].message.content}\n")
 ```
-> **Note:** Stateful mode works without eviction sync — `ContextPilot` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited KV-cache capacity, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
+> **Note:** Stateful mode works without eviction sync — `ContextPilot` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited KV-cache capacity, install the eviction patch for your inference engine ([SGLang](docs/guides/online_usage.md#sglang-integration) or [vLLM](docs/guides/online_usage.md#vllm-integration)) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
 **Offline / Online Stateless** — same API, just pass the full batch at once:
@@ -190,15 +193,18 @@ all_contexts = [
 ]
 # One call: builds index, reorders docs for prefix sharing, and schedules execution order
-reordered, order = cp_batch.reorder(all_contexts)
+# .reorder() returns (reordered_contexts, original_indices)
+reordered_ctx, order = cp_batch.reorder(all_contexts)
 # Build all prompts in optimized order
 messages_batch = []
-for ctx, orig_idx in zip(reordered, order):
+for ctx, orig_idx in zip(reordered_ctx, order):
     docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
+    pos = {doc: i + 1 for i, doc in enumerate(ctx)}
     importance_ranking = ">".join(
-        str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
+        str(pos[doc]) for doc in all_contexts[orig_idx] if doc in pos
     )
+    # System prompt = documents + importance ranking (after </documents>, doesn't affect prefix sharing)
     messages_batch.append({
         "model": "Qwen/Qwen3-4B",
         "messages": [

{contextpilot-0.3.3 → contextpilot-0.3.4}/README.md RENAMED Viewed

@@ -1,7 +1,7 @@
 <div align="center">
   <img src="assets/about.png" alt="ContextPilot Logo" width="800"/>
-  <h1><strong>ContextPilot: Efficient Long Context Inference with Context Reuse</strong></h1>
+  <h1><strong>ContextPilot: Fast Long-Context Inference via Context Reuse</strong></h1>
   [![Python](https://img.shields.io/badge/python-≥3.10-blue)](https://www.python.org/)
   [![PyPI](https://img.shields.io/pypi/v/contextpilot)](https://pypi.org/project/contextpilot/)
@@ -39,7 +39,7 @@ ContextPilot is a fast optimization system on context engineering layer for agen
 ### System Performance
 <div align="center">
-<img src="assets/deepseek_r1_results.png" alt="Benchmark Results" width="600"/>
+<img src="assets/ds_r1_result_horizontal.png" alt="Benchmark Results" width="800"/>
 </div>
 ContextPilot (Stateless) on DeepSeek-R1 maintains accuracy compared to SGLang, achieving 64.68% vs 64.15% F1 on MultihopRAG and 41.08% vs 40.20% F1 on NarrativeQA.
@@ -105,15 +105,18 @@ queries = ["What are transformers?", "How do RNNs compare?", "Explain attention
 for turn_idx, (query, mems) in enumerate(zip(queries, turn_memories)):
     # 1. Reorder for prefix sharing (handles cold start & incremental)
-    [ctx], order = cp_live.reorder([mems])   # single request per turn
+    # .reorder() accepts a single list or list-of-lists
+    reordered, indices = cp_live.reorder(mems)
+    ctx = reordered[0]  # single context per turn
     # Turn 2: "GPT is based on transformers" ← moved to prefix (shared with turn 1)
     # Turn 3: "Transformers …", "GPT …"     ← both moved to prefix
     # 2. Generate answer with reordered context
     docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
-    importance_ranking = ">".join(
-        str(ctx.index(doc) + 1) for doc in mems if doc in ctx
-    )
+    # Map original importance order (mems) → 1-based positions in reordered ctx
+    pos = {doc: i + 1 for i, doc in enumerate(ctx)}
+    importance_ranking = ">".join(str(pos[doc]) for doc in mems if doc in pos)
+    # System prompt = documents + importance ranking (after </documents>, doesn't affect prefix sharing)
     response = client.chat.completions.create(
         model="Qwen/Qwen3-4B",
         messages=[
@@ -130,7 +133,7 @@ for turn_idx, (query, mems) in enumerate(zip(queries, turn_memories)):
     print(f"A: {response.choices[0].message.content}\n")
 ```
-> **Note:** Stateful mode works without eviction sync — `ContextPilot` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited KV-cache capacity, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
+> **Note:** Stateful mode works without eviction sync — `ContextPilot` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited KV-cache capacity, install the eviction patch for your inference engine ([SGLang](docs/guides/online_usage.md#sglang-integration) or [vLLM](docs/guides/online_usage.md#vllm-integration)) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
 **Offline / Online Stateless** — same API, just pass the full batch at once:
@@ -149,15 +152,18 @@ all_contexts = [
 ]
 # One call: builds index, reorders docs for prefix sharing, and schedules execution order
-reordered, order = cp_batch.reorder(all_contexts)
+# .reorder() returns (reordered_contexts, original_indices)
+reordered_ctx, order = cp_batch.reorder(all_contexts)
 # Build all prompts in optimized order
 messages_batch = []
-for ctx, orig_idx in zip(reordered, order):
+for ctx, orig_idx in zip(reordered_ctx, order):
     docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
+    pos = {doc: i + 1 for i, doc in enumerate(ctx)}
     importance_ranking = ">".join(
-        str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
+        str(pos[doc]) for doc in all_contexts[orig_idx] if doc in pos
     )
+    # System prompt = documents + importance ranking (after </documents>, doesn't affect prefix sharing)
     messages_batch.append({
         "model": "Qwen/Qwen3-4B",
         "messages": [

{contextpilot-0.3.3 → contextpilot-0.3.4}/contextpilot/__init__.py RENAMED Viewed

@@ -47,7 +47,7 @@ from .retriever import (
     MEM0_AVAILABLE,
 )
-__version__ = "0.3.3"
+__version__ = "0.3.4"
 __all__ = [
     # High-level pipeline API

{contextpilot-0.3.3 → contextpilot-0.3.4}/contextpilot/pipeline/rag_pipeline.py RENAMED Viewed

@@ -566,10 +566,11 @@ class RAGPipeline:
                 **extra_request_body,
             }
-            # Add rid for request tracking in SGLang's radix cache
-            # SGLang uses 'rid' field to identify requests
             if request_id:
-                payload["rid"] = request_id
+                if self.inference_config.backend == "vllm":
+                    payload["request_id"] = request_id
+                else:
+                    payload["rid"] = request_id  # SGLang field name
             output = {
                 "generated_text": "",

{contextpilot-0.3.3 → contextpilot-0.3.4}/contextpilot/server/http_server.py RENAMED Viewed

@@ -20,6 +20,7 @@ import logging
 import time
 import asyncio
 import os
+import re
 import uuid
 from typing import List, Dict, Any, Optional
 from contextlib import asynccontextmanager
@@ -70,6 +71,19 @@ _str_to_id: Dict[str, int] = {}
 _id_to_str: Dict[int, str] = {}
 _next_str_id: int = 0
+# Request ID normalization (engine -> ContextPilot canonical IDs)
+_ENGINE_REQ_ID_PREFIX = re.compile(r"^(cmpl-|chatcmpl-|batch-)")
+_VLLM_REQ_SUFFIX = re.compile(r"^(req-[^-]+)-\d+-[0-9a-f]+$")
+def _normalize_request_id(request_id: str) -> str:
+    """Normalize engine-specific request IDs to ContextPilot canonical form."""
+    rid = _ENGINE_REQ_ID_PREFIX.sub("", request_id or "")
+    m = _VLLM_REQ_SUFFIX.match(rid)
+    if m:
+        return m.group(1)
+    return rid
 def _init_config():
     """Initialize config from environment variables."""
@@ -607,24 +621,16 @@ async def evict(request: EvictRequest):
     THIS IS THE MAIN ENDPOINT THAT THE INFERENCE ENGINE'S EVICTION CALLBACK SHOULD CALL.
-    When the inference engine's cache evicts nodes, it collects the request_ids
-    from the evicted nodes and invokes the registered callback. That callback
-    should call this endpoint to remove the corresponding entries from ContextPilot.
-    Integration example (SGLang):
-        def eviction_callback(evicted_request_ids: set):
-            if evicted_request_ids:
-                try:
-                    requests.post(
-                        "http://localhost:8765/evict",
-                        json={"request_ids": list(evicted_request_ids)},
-                        timeout=1.0
-                    )
-                except Exception as e:
-                    logger.warning(f"ContextPilot eviction sync failed: {e}")
-        # Register callback when initializing radix cache
-        tree_cache.set_eviction_callback(eviction_callback)
+    When the inference engine's cache evicts entries, it collects the request_ids
+    from the evicted entries and invokes the registered callback. That callback
+    calls this endpoint to remove the corresponding entries from ContextPilot.
+    Supported engines:
+        - SGLang: patches/sglang/ patches the radix cache to fire callbacks on eviction
+        - vLLM:   patches/vllm/ patches the block pool to fire callbacks on eviction
+    Both use the same protocol:
+        POST /evict  {"request_ids": ["req-1", "req-2", ...]}
     """
     # Check if index is initialized
     if _index is None:
@@ -633,14 +639,25 @@ async def evict(request: EvictRequest):
         )
     try:
+        normalized_ids = [
+            _normalize_request_id(rid)
+            for rid in request.request_ids
+        ]
+        normalized_ids = [
+            rid for rid in normalized_ids
+            if rid and not rid.startswith("HEALTH_CHECK")
+        ]
+        # Deduplicate while preserving order for deterministic logs/responses.
+        normalized_ids = list(dict.fromkeys(normalized_ids))
         # Remove the evicted requests from our index
-        result = _index.remove_requests(request.request_ids)
+        result = _index.remove_requests(normalized_ids)
         # Also clear conversation history for evicted requests
         # This ensures ConversationTracker stays in sync with the engine's cache
         tracker = get_conversation_tracker()
         conversations_cleared = 0
-        for req_id in request.request_ids:
+        for req_id in normalized_ids:
             cleared = tracker.clear_conversation(req_id)
             conversations_cleared += cleared
@@ -648,12 +665,14 @@ async def evict(request: EvictRequest):
         logger.info(
             f"Eviction: removed {result['removed_count']} requests from index, "
             f"cleared {conversations_cleared} conversation entries, "
-            f"not_found={len(result['not_found'])}"
+            f"not_found={len(result['not_found'])}, "
+            f"incoming={len(request.request_ids)}, normalized={len(normalized_ids)}"
         )
         return {
             "status": "success",
             "conversations_cleared": conversations_cleared,
+            "normalized_request_ids": normalized_ids,
             **result,
         }
@@ -916,8 +935,9 @@ async def proxy_completions(request: Request):
         # Pass request_id to inference engine so it can use the same ID for request tracking
         # Engine will notify ContextPilot via /evict callback when this request is evicted
         if request_id:
-            body["rid"] = request_id
-            logger.info(f"Proxy: forwarding request with rid={request_id}")
+            body["rid"] = request_id          # SGLang
+            body["request_id"] = request_id   # vLLM
+            logger.info(f"Proxy: forwarding request with request_id={request_id}")
         else:
             logger.info("Proxy: forwarding request without rid (no ContextPilot tracking)")

{contextpilot-0.3.3 → contextpilot-0.3.4}/contextpilot/server/live_index.py RENAMED Viewed

@@ -224,8 +224,10 @@ class ContextPilot(ContextIndex):
         transparently — callers never need to distinguish between them.
         Args:
-            contexts: ``List[List[int]]`` or ``List[List[str]]`` — each
-                inner list is one context (document IDs or text strings).
+            contexts: A single context (``List[int]`` / ``List[str]``)
+                or a batch of contexts (``List[List[int]]`` /
+                ``List[List[str]]``).  A single list is automatically
+                wrapped into ``[contexts]``.
             initial_tokens_per_context: Initial token budget per context
                 (used for eviction tracking; 0 to ignore).
             conversation_id: Conversation key for multi-turn
@@ -243,6 +245,10 @@ class ContextPilot(ContextIndex):
               ``reordered_contexts[i]`` corresponds to
               ``contexts[original_indices[i]]``.
         """
+        # Accept a single list and wrap it
+        if contexts and not isinstance(contexts[0], list):
+            contexts = [contexts]
         result = self.build_incremental(contexts, initial_tokens_per_context)
         reordered = result["reordered_contexts"]
@@ -958,25 +964,11 @@ class ContextPilot(ContextIndex):
         return request_id_mapping, request_ids_ordered
     # =========================================================================
-    # Request Eviction (Called by SGLang's radix cache callback)
+    # Request Eviction (Called by inference engine's eviction callback)
     # =========================================================================
     def remove_requests(self, request_ids: Set[str]) -> Dict[str, Any]:
-        """
-        Remove requests from the context index.
-        THIS IS THE METHOD CALLED BY SGLANG'S EVICTION CALLBACK.
-        When SGLang's radix cache evicts requests, it calls a callback
-        with the set of evicted request_ids. That callback should invoke
-        this method to keep the context index in sync.
-        Args:
-            request_ids: Set of request IDs to remove (from SGLang callback)
-        Returns:
-            Dictionary with eviction results
-        """
+        """Remove requests from the context index (called by engine eviction callback)."""
         evicted_nodes = []
         not_found = []

{contextpilot-0.3.3 → contextpilot-0.3.4}/contextpilot.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: contextpilot
-Version: 0.3.3
+Version: 0.3.4
 Summary: Efficient Retrieval-Augmented Generation with Accuracy-Preserving Context Reuse
 Author: Yinsicheng Jiang, Chivier Humber
 License: Apache-2.0
@@ -42,7 +42,7 @@ Dynamic: license-file
 <div align="center">
   <img src="assets/about.png" alt="ContextPilot Logo" width="800"/>
-  <h1><strong>ContextPilot: Efficient Long Context Inference with Context Reuse</strong></h1>
+  <h1><strong>ContextPilot: Fast Long-Context Inference via Context Reuse</strong></h1>
   [![Python](https://img.shields.io/badge/python-≥3.10-blue)](https://www.python.org/)
   [![PyPI](https://img.shields.io/pypi/v/contextpilot)](https://pypi.org/project/contextpilot/)
@@ -80,7 +80,7 @@ ContextPilot is a fast optimization system on context engineering layer for agen
 ### System Performance
 <div align="center">
-<img src="assets/deepseek_r1_results.png" alt="Benchmark Results" width="600"/>
+<img src="assets/ds_r1_result_horizontal.png" alt="Benchmark Results" width="800"/>
 </div>
 ContextPilot (Stateless) on DeepSeek-R1 maintains accuracy compared to SGLang, achieving 64.68% vs 64.15% F1 on MultihopRAG and 41.08% vs 40.20% F1 on NarrativeQA.
@@ -146,15 +146,18 @@ queries = ["What are transformers?", "How do RNNs compare?", "Explain attention
 for turn_idx, (query, mems) in enumerate(zip(queries, turn_memories)):
     # 1. Reorder for prefix sharing (handles cold start & incremental)
-    [ctx], order = cp_live.reorder([mems])   # single request per turn
+    # .reorder() accepts a single list or list-of-lists
+    reordered, indices = cp_live.reorder(mems)
+    ctx = reordered[0]  # single context per turn
     # Turn 2: "GPT is based on transformers" ← moved to prefix (shared with turn 1)
     # Turn 3: "Transformers …", "GPT …"     ← both moved to prefix
     # 2. Generate answer with reordered context
     docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
-    importance_ranking = ">".join(
-        str(ctx.index(doc) + 1) for doc in mems if doc in ctx
-    )
+    # Map original importance order (mems) → 1-based positions in reordered ctx
+    pos = {doc: i + 1 for i, doc in enumerate(ctx)}
+    importance_ranking = ">".join(str(pos[doc]) for doc in mems if doc in pos)
+    # System prompt = documents + importance ranking (after </documents>, doesn't affect prefix sharing)
     response = client.chat.completions.create(
         model="Qwen/Qwen3-4B",
         messages=[
@@ -171,7 +174,7 @@ for turn_idx, (query, mems) in enumerate(zip(queries, turn_memories)):
     print(f"A: {response.choices[0].message.content}\n")
 ```
-> **Note:** Stateful mode works without eviction sync — `ContextPilot` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited KV-cache capacity, install the [SGLang eviction patch](docs/guides/online_usage.md#sglang-integration) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
+> **Note:** Stateful mode works without eviction sync — `ContextPilot` tracks the previous ordering and reorders new contexts to maximize prefix cache hits. For production deployments with limited KV-cache capacity, install the eviction patch for your inference engine ([SGLang](docs/guides/online_usage.md#sglang-integration) or [vLLM](docs/guides/online_usage.md#vllm-integration)) to keep the index in sync. See the [online usage guide](docs/guides/online_usage.md) for HTTP server setup.
 **Offline / Online Stateless** — same API, just pass the full batch at once:
@@ -190,15 +193,18 @@ all_contexts = [
 ]
 # One call: builds index, reorders docs for prefix sharing, and schedules execution order
-reordered, order = cp_batch.reorder(all_contexts)
+# .reorder() returns (reordered_contexts, original_indices)
+reordered_ctx, order = cp_batch.reorder(all_contexts)
 # Build all prompts in optimized order
 messages_batch = []
-for ctx, orig_idx in zip(reordered, order):
+for ctx, orig_idx in zip(reordered_ctx, order):
     docs_section = "\n".join(f"[{i+1}] {doc}" for i, doc in enumerate(ctx))
+    pos = {doc: i + 1 for i, doc in enumerate(ctx)}
     importance_ranking = ">".join(
-        str(ctx.index(doc) + 1) for doc in all_contexts[orig_idx] if doc in ctx
+        str(pos[doc]) for doc in all_contexts[orig_idx] if doc in pos
     )
+    # System prompt = documents + importance ranking (after </documents>, doesn't affect prefix sharing)
     messages_batch.append({
         "model": "Qwen/Qwen3-4B",
         "messages": [

{contextpilot-0.3.3 → contextpilot-0.3.4}/contextpilot.egg-info/SOURCES.txt RENAMED Viewed

@@ -52,4 +52,5 @@ tests/test_pageindex_integration.py
 tests/test_performance.py
 tests/test_pipeline.py
 tests/test_server_integration.py
-tests/test_utils.py
+tests/test_utils.py
+tests/test_vllm_patch.py

{contextpilot-0.3.3 → contextpilot-0.3.4}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "contextpilot"
-version = "0.3.3"
+version = "0.3.4"
 description = "Efficient Retrieval-Augmented Generation with Accuracy-Preserving Context Reuse"
 readme = "README.md"
 requires-python = ">=3.10"

{contextpilot-0.3.3 → contextpilot-0.3.4}/tests/test_live_index.py RENAMED Viewed

@@ -255,6 +255,29 @@ class TestLiveIndexRequestTracking:
         assert 'request_ids' in result
         assert len(result['request_ids']) == len(contexts)
+    def test_reorder_single_list(self):
+        """reorder() should accept a single list and auto-wrap it."""
+        from contextpilot import ContextPilot
+        engine = ContextPilot(use_gpu=False)
+        # Pass a flat list instead of list-of-lists
+        reordered, indices = engine.reorder([1, 2, 3])
+        assert len(reordered) == 1
+        assert set(reordered[0]) == {1, 2, 3}
+        assert indices == [0]
+    def test_reorder_single_list_strings(self):
+        """reorder() should accept a single list of strings."""
+        from contextpilot import ContextPilot
+        engine = ContextPilot(use_gpu=False)
+        reordered, indices = engine.reorder(["doc_a", "doc_b", "doc_c"])
+        assert len(reordered) == 1
+        assert set(reordered[0]) == {"doc_a", "doc_b", "doc_c"}
+        assert indices == [0]
 class TestDeduplication:
     """Test ContextPilot.deduplicate() for multi-turn deduplication."""

contextpilot-0.3.4/tests/test_vllm_patch.py ADDED Viewed

@@ -0,0 +1,493 @@
+"""
+Tests for vLLM block_pool.py eviction sync patch.
+Tests the ContextPilot tracking dicts and eviction callback logic
+without requiring a vLLM installation — all vLLM internals are mocked.
+"""
+import pytest
+from unittest.mock import MagicMock, patch
+from dataclasses import dataclass, field
+from typing import Optional
+# ---------------------------------------------------------------------------
+# Mock vLLM types so we can import/test block_pool logic without vLLM
+# ---------------------------------------------------------------------------
+@dataclass
+class MockKVCacheBlock:
+    block_id: int
+    block_hash: Optional[bytes] = None
+    ref_cnt: int = 0
+    is_null: bool = False
+    prev_free_block: Optional["MockKVCacheBlock"] = field(default=None, repr=False)
+    next_free_block: Optional["MockKVCacheBlock"] = field(default=None, repr=False)
+    def reset_hash(self):
+        self.block_hash = None
+class MockFreeKVCacheBlockQueue:
+    """Simplified free block queue for testing."""
+    def __init__(self, blocks):
+        self._blocks = list(blocks)
+        self.num_free_blocks = len(self._blocks)
+    def popleft(self):
+        self.num_free_blocks -= 1
+        return self._blocks.pop(0)
+    def popleft_n(self, n):
+        result = self._blocks[:n]
+        self._blocks = self._blocks[n:]
+        self.num_free_blocks -= n
+        return result
+    def remove(self, block):
+        if block in self._blocks:
+            self._blocks.remove(block)
+            self.num_free_blocks -= 1
+    def append_n(self, blocks):
+        self._blocks.extend(blocks)
+        self.num_free_blocks += len(blocks)
+class MockRequest:
+    def __init__(self, request_id, block_hashes, all_token_ids=None):
+        self.request_id = request_id
+        self.block_hashes = block_hashes
+        self.all_token_ids = all_token_ids or []
+        self.lora_request = None
+class MockBlockHashToBlockMap:
+    """Mirrors the real BlockHashToBlockMap for testing."""
+    def __init__(self):
+        self._cache = {}
+    def get_one_block(self, key):
+        blocks = self._cache.get(key)
+        if blocks is None:
+            return None
+        if isinstance(blocks, MockKVCacheBlock):
+            return blocks
+        if isinstance(blocks, dict):
+            return next(iter(blocks.values()))
+        raise AssertionError(f"Invalid cache block type: {type(blocks)}")
+    def insert(self, key, block):
+        blocks = self._cache.get(key)
+        if blocks is None:
+            self._cache[key] = block
+        elif isinstance(blocks, MockKVCacheBlock):
+            self._cache[key] = {
+                blocks.block_id: blocks,
+                block.block_id: block,
+            }
+        elif isinstance(blocks, dict):
+            blocks[block.block_id] = block
+        else:
+            raise AssertionError(f"Invalid cache block type: {type(blocks)}")
+    def pop(self, key, block_id):
+        blocks = self._cache.pop(key, None)
+        if blocks is None:
+            return None
+        if isinstance(blocks, MockKVCacheBlock):
+            if blocks.block_id == block_id:
+                return blocks
+            self._cache[key] = blocks
+            return None
+        if isinstance(blocks, dict):
+            block = blocks.pop(block_id, None)
+            if blocks:
+                self._cache[key] = blocks
+            return block
+        self._cache[key] = blocks
+        return None
+    def __len__(self):
+        return len(self._cache)
+# ---------------------------------------------------------------------------
+# BlockPool under test — extracted logic (no vLLM imports needed)
+# ---------------------------------------------------------------------------
+class TestableBlockPool:
+    """ContextPilot-patched BlockPool logic, mocked for testing."""
+    def __init__(self, num_blocks=10, eviction_callback=None):
+        self.blocks = [MockKVCacheBlock(i) for i in range(num_blocks)]
+        self.free_block_queue = MockFreeKVCacheBlockQueue(list(self.blocks))
+        self.cached_block_hash_to_block = MockBlockHashToBlockMap()
+        self.enable_caching = True
+        self.num_gpu_blocks = num_blocks
+        self.metrics_collector = None
+        # Null block
+        self.null_block = self.free_block_queue.popleft()
+        self.null_block.is_null = True
+        # ContextPilot tracking
+        self._block_to_requests: dict[bytes, set[str]] = {}
+        self._request_to_blocks: dict[str, set[bytes]] = {}
+        self.eviction_callback = eviction_callback
+    def cache_full_blocks_simple(self, request_id, block_indices, block_hashes):
+        for idx, bh in zip(block_indices, block_hashes):
+            blk = self.blocks[idx]
+            blk.block_hash = bh
+            self.cached_block_hash_to_block.insert(bh, blk)
+            if self.eviction_callback is not None:
+                self._block_to_requests.setdefault(bh, set()).add(request_id)
+                self._request_to_blocks.setdefault(request_id, set()).add(bh)
+    def _maybe_evict_cached_block(self, block) -> set:
+        fully_evicted = set()
+        block_hash = block.block_hash
+        if block_hash is None:
+            return fully_evicted
+        if self.cached_block_hash_to_block.pop(block_hash, block.block_id) is None:
+            return fully_evicted
+        if self.cached_block_hash_to_block.get_one_block(block_hash) is None:
+            request_ids = self._block_to_requests.pop(block_hash, None)
+            if request_ids:
+                for rid in request_ids:
+                    blocks_set = self._request_to_blocks.get(rid)
+                    if blocks_set is not None:
+                        blocks_set.discard(block_hash)
+                        if not blocks_set:
+                            fully_evicted.add(rid)
+                            del self._request_to_blocks[rid]
+        block.reset_hash()
+        return fully_evicted
+    def get_new_blocks(self, num_blocks):
+        ret = self.free_block_queue.popleft_n(num_blocks)
+        fully_evicted = set()
+        if self.enable_caching:
+            for block in ret:
+                evicted = self._maybe_evict_cached_block(block)
+                fully_evicted.update(evicted)
+                block.ref_cnt += 1
+        else:
+            for block in ret:
+                block.ref_cnt += 1
+        if fully_evicted and self.eviction_callback is not None:
+            try:
+                self.eviction_callback(fully_evicted)
+            except Exception:
+                pass
+        return ret
+    def free_blocks(self, blocks):
+        blocks_list = list(blocks)
+        for block in blocks_list:
+            block.ref_cnt -= 1
+        self.free_block_queue.append_n(
+            [b for b in blocks_list if b.ref_cnt == 0 and not b.is_null]
+        )
+    def touch(self, blocks):
+        if not blocks:
+            return
+        if isinstance(blocks[0], MockKVCacheBlock):
+            block_iter = blocks
+        else:
+            block_iter = (b for group in blocks for b in group)
+        for block in block_iter:
+            if block.ref_cnt == 0 and not block.is_null:
+                self.free_block_queue.remove(block)
+            block.ref_cnt += 1
+    def evict_blocks(self, block_ids):
+        """Evict the cached content of the given blocks and report evicted requests.
+        Each id is looked up in ``self.blocks`` and passed to
+        ``_maybe_evict_cached_block``; the request ids returned across all
+        blocks are pooled and sent to ``eviction_callback`` in one call.
+        Args:
+            block_ids: Iterable of indices into ``self.blocks``.
+        """
+        fully_evicted = set()
+        for block_id in block_ids:
+            block = self.blocks[block_id]
+            fully_evicted.update(self._maybe_evict_cached_block(block))
+        if not fully_evicted or self.eviction_callback is None:
+            return
+        try:
+            self.eviction_callback(fully_evicted)
+        except Exception:
+            # Best-effort notification: eviction has already happened, so a
+            # failing callback is logged rather than raised or silently dropped.
+            import logging
+            logging.getLogger(__name__).exception(
+                "eviction_callback failed for %d request id(s)",
+                len(fully_evicted),
+            )
+    def reset_prefix_cache(self):
+        """Drop all prefix-cache state, reporting every tracked request as evicted.
+        If any request is tracked and a callback is configured, the callback
+        receives the full set of tracked request ids before the tracking dicts
+        are cleared. Afterwards the hash-to-block map is replaced with a fresh
+        ``MockBlockHashToBlockMap`` and every block's hash is reset.
+        """
+        if self._request_to_blocks and self.eviction_callback is not None:
+            all_requests = set(self._request_to_blocks.keys())
+            try:
+                self.eviction_callback(all_requests)
+            except Exception:
+                # Best-effort notification: the reset must still complete, so
+                # the failure is logged instead of raised or silently dropped.
+                import logging
+                logging.getLogger(__name__).exception(
+                    "eviction_callback failed for %d request id(s)",
+                    len(all_requests),
+                )
+        self._block_to_requests.clear()
+        self._request_to_blocks.clear()
+        self.cached_block_hash_to_block = MockBlockHashToBlockMap()
+        for block in self.blocks:
+            block.reset_hash()
+    def get_tracked_request_ids(self):
+        """Return a new set holding the ids of all requests currently tracked."""
+        tracked = self._request_to_blocks
+        return {request_id for request_id in tracked}
+    def is_request_in_cache(self, request_id):
+        """Return True when ``request_id`` is a key of ``_request_to_blocks``."""
+        tracked = self._request_to_blocks
+        return request_id in tracked
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+class TestTrackingDicts:
+    """Checks on the request <-> block-hash bookkeeping dicts of the pool."""
+    def test_cache_records_mapping(self):
+        """Caching blocks for a request fills both tracking dicts."""
+        hashes = (b"h1", b"h2", b"h3")
+        on_evict = MagicMock()
+        block_pool = TestableBlockPool(eviction_callback=on_evict)
+        block_pool.cache_full_blocks_simple("req-1", [1, 2, 3], list(hashes))
+        assert block_pool.is_request_in_cache("req-1")
+        assert block_pool._request_to_blocks["req-1"] == set(hashes)
+        for block_hash in hashes:
+            assert "req-1" in block_pool._block_to_requests[block_hash]
+    def test_shared_blocks_track_both_requests(self):
+        """A hash cached by two requests lists both of them as owners."""
+        on_evict = MagicMock()
+        block_pool = TestableBlockPool(num_blocks=20, eviction_callback=on_evict)
+        # Both requests cache hash h1, each through its own block id.
+        block_pool.cache_full_blocks_simple("req-A", [1, 2], [b"h1", b"h2"])
+        block_pool.cache_full_blocks_simple("req-B", [3, 4], [b"h1", b"h3"])
+        owners_of_h1 = block_pool._block_to_requests[b"h1"]
+        assert owners_of_h1 == {"req-A", "req-B"}
+        hashes_by_request = block_pool._request_to_blocks
+        assert hashes_by_request["req-A"] == {b"h1", b"h2"}
+        assert hashes_by_request["req-B"] == {b"h1", b"h3"}
+    def test_no_tracking_when_callback_is_none(self):
+        """Without a callback the tracking dicts stay empty."""
+        block_pool = TestableBlockPool(eviction_callback=None)
+        block_pool.cache_full_blocks_simple("req-1", [1, 2], [b"h1", b"h2"])
+        for tracking in (block_pool._block_to_requests, block_pool._request_to_blocks):
+            assert len(tracking) == 0
+class TestEvictionCallback:
+    """Behaviour of ``eviction_callback`` when blocks are evicted explicitly."""
+    def test_full_eviction_fires_callback(self):
+        """Evicting every block of a request triggers exactly one callback call."""
+        on_evict = MagicMock()
+        block_pool = TestableBlockPool(num_blocks=10, eviction_callback=on_evict)
+        # req-1 is cached on blocks 1, 2 and 3.
+        block_pool.cache_full_blocks_simple(
+            "req-1", [1, 2, 3], [b"h1", b"h2", b"h3"]
+        )
+        # Drop all three in one call.
+        block_pool.evict_blocks({1, 2, 3})
+        on_evict.assert_called_once()
+        reported = on_evict.call_args.args[0]
+        assert "req-1" in reported
+    def test_partial_eviction_does_not_fire_callback(self):
+        """A request that still has a cached hash is not reported."""
+        on_evict = MagicMock()
+        block_pool = TestableBlockPool(num_blocks=10, eviction_callback=on_evict)
+        block_pool.cache_full_blocks_simple(
+            "req-1", [1, 2, 3], [b"h1", b"h2", b"h3"]
+        )
+        # Two of the three blocks go away; h3 remains cached for the request.
+        block_pool.evict_blocks({1, 2})
+        on_evict.assert_not_called()
+        assert block_pool.is_request_in_cache("req-1")
+        remaining = block_pool._request_to_blocks["req-1"]
+        assert remaining == {b"h3"}
+    def test_evict_last_block_fires_callback(self):
+        """The callback fires only once the final block of a request is evicted."""
+        on_evict = MagicMock()
+        block_pool = TestableBlockPool(num_blocks=10, eviction_callback=on_evict)
+        block_pool.cache_full_blocks_simple(
+            "req-1", [1, 2, 3], [b"h1", b"h2", b"h3"]
+        )
+        # First two blocks: nothing reported yet.
+        block_pool.evict_blocks({1, 2})
+        on_evict.assert_not_called()
+        # Last block: the request is now reported.
+        block_pool.evict_blocks({3})
+        on_evict.assert_called_once()
+        reported = on_evict.call_args.args[0]
+        assert "req-1" in reported
+        assert not block_pool.is_request_in_cache("req-1")
+    def test_shared_hash_not_evicted_until_last_copy_removed(self):
+        """A hash held by several blocks keeps its requests alive until the last copy goes."""
+        on_evict = MagicMock()
+        block_pool = TestableBlockPool(num_blocks=12, eviction_callback=on_evict)
+        # req-A caches a shared and a private hash; req-B caches only the shared one.
+        block_pool.cache_full_blocks_simple("req-A", [1, 2], [b"h_shared", b"h_a"])
+        block_pool.cache_full_blocks_simple("req-B", [3], [b"h_shared"])
+        # Evict one copy of the shared hash plus req-A's private block;
+        # the shared hash is still reachable through req-B's block.
+        block_pool.evict_blocks({1, 2})
+        on_evict.assert_not_called()
+        for request_id in ("req-A", "req-B"):
+            assert block_pool.is_request_in_cache(request_id)
+        # Evicting the last shared copy fully evicts both requests.
+        block_pool.evict_blocks({3})
+        on_evict.assert_called_once()
+        reported = on_evict.call_args.args[0]
+        assert reported == {"req-A", "req-B"}
+    def test_multiple_requests_evicted_together(self):
+        """Requests evicted by the same call are reported in a single callback."""
+        on_evict = MagicMock()
+        block_pool = TestableBlockPool(num_blocks=10, eviction_callback=on_evict)
+        block_pool.cache_full_blocks_simple("req-A", [1], [b"hA"])
+        block_pool.cache_full_blocks_simple("req-B", [2], [b"hB"])
+        block_pool.evict_blocks({1, 2})
+        on_evict.assert_called_once()
+        reported = on_evict.call_args.args[0]
+        assert reported == {"req-A", "req-B"}
+    def test_callback_not_called_when_none(self):
+        """Evicting with no callback configured completes without raising."""
+        block_pool = TestableBlockPool(eviction_callback=None)
+        block_pool.cache_full_blocks_simple("req-1", [1], [b"h1"])
+        # The call itself is the assertion: it must not raise.
+        block_pool.evict_blocks({1})
+    def test_callback_exception_is_swallowed(self):
+        """An exception raised by the callback does not escape ``evict_blocks``."""
+        failing_callback = MagicMock(side_effect=Exception("network error"))
+        block_pool = TestableBlockPool(
+            num_blocks=10, eviction_callback=failing_callback
+        )
+        block_pool.cache_full_blocks_simple("req-1", [1], [b"h1"])
+        # Must not raise although the callback throws.
+        block_pool.evict_blocks({1})
+        failing_callback.assert_called_once()
+class TestGetNewBlocksEviction:
+    """Eviction reporting triggered implicitly through ``get_new_blocks``."""
+    def test_allocating_cached_blocks_fires_callback(self):
+        """Re-allocating freed cached blocks notifies the eviction callback."""
+        on_evict = MagicMock()
+        block_pool = TestableBlockPool(num_blocks=10, eviction_callback=on_evict)
+        # Allocate, cache, then free three blocks so they become eviction candidates.
+        allocated = block_pool.get_new_blocks(3)
+        allocated_ids = []
+        for blk in allocated:
+            allocated_ids.append(blk.block_id)
+        block_pool.cache_full_blocks_simple(
+            "req-X", allocated_ids, [b"h1", b"h2", b"h3"]
+        )
+        block_pool.free_blocks(allocated)
+        assert block_pool.is_request_in_cache("req-X")
+        # Drain the whole free queue so the cached blocks are certain to be popped.
+        free_count = block_pool.free_block_queue.num_free_blocks
+        block_pool.get_new_blocks(free_count)
+        on_evict.assert_called_once()
+        reported = on_evict.call_args.args[0]
+        assert "req-X" in reported
+        assert not block_pool.is_request_in_cache("req-X")
+class TestTouchCompatibility:
+    """``touch`` must accept the grouped block layout as well as a flat one."""
+    def test_touch_accepts_grouped_blocks(self):
+        """Blocks wrapped in a one-element tuple of groups each gain one reference."""
+        block_pool = TestableBlockPool(num_blocks=8, eviction_callback=None)
+        allocated = block_pool.get_new_blocks(2)
+        block_pool.free_blocks(allocated)
+        # Upstream style: tuple[Sequence[KVCacheBlock], ...]
+        grouped = (allocated,)
+        block_pool.touch(grouped)
+        first, second = allocated[0], allocated[1]
+        assert first.ref_cnt == 1
+        assert second.ref_cnt == 1
+class TestResetPrefixCache:
+    """Callback and bookkeeping behaviour of ``reset_prefix_cache``."""
+    def test_reset_fires_callback_for_all(self):
+        """A reset reports every tracked request once and empties the tracking dicts."""
+        on_evict = MagicMock()
+        block_pool = TestableBlockPool(num_blocks=10, eviction_callback=on_evict)
+        cached = (("req-A", 1, b"hA"), ("req-B", 2, b"hB"), ("req-C", 3, b"hC"))
+        for request_id, block_id, block_hash in cached:
+            block_pool.cache_full_blocks_simple(request_id, [block_id], [block_hash])
+        block_pool.reset_prefix_cache()
+        on_evict.assert_called_once()
+        reported = on_evict.call_args.args[0]
+        assert reported == {"req-A", "req-B", "req-C"}
+        # Both tracking dicts must be empty after the reset.
+        assert len(block_pool._block_to_requests) == 0
+        assert len(block_pool._request_to_blocks) == 0
+    def test_reset_with_no_tracked_requests(self):
+        """Resetting a pool that tracks nothing leaves the callback untouched."""
+        on_evict = MagicMock()
+        block_pool = TestableBlockPool(num_blocks=10, eviction_callback=on_evict)
+        block_pool.reset_prefix_cache()
+        on_evict.assert_not_called()
+class TestCallbackPrefixStripping:
+    """Checks a locally defined regex that strips API prefixes from request ids."""
+    @staticmethod
+    def _strip_prefixes(request_ids):
+        """Return the ids with a leading ``cmpl-``/``chatcmpl-``/``batch-`` removed."""
+        import re
+        pattern = re.compile(r"^(cmpl-|chatcmpl-|batch-)")
+        return {pattern.sub("", request_id) for request_id in request_ids}
+    def test_strips_cmpl_prefix(self):
+        """Each known prefix is removed; an id without one passes through."""
+        raw_ids = {"cmpl-req-123", "chatcmpl-req-456", "batch-req-789", "plain-id"}
+        expected = {"req-123", "req-456", "req-789", "plain-id"}
+        assert self._strip_prefixes(raw_ids) == expected
+    def test_no_prefix_unchanged(self):
+        """Ids that do not start with a known prefix are left as they are."""
+        raw_ids = {"my-request-1", "another-req"}
+        assert self._strip_prefixes(raw_ids) == {"my-request-1", "another-req"}
+class TestHelperMethods:
+    """Checks for the small query helpers exposed by the pool."""
+    def test_get_tracked_request_ids(self):
+        """All requests that cached blocks show up in the tracked-id set."""
+        on_evict = MagicMock()
+        block_pool = TestableBlockPool(num_blocks=10, eviction_callback=on_evict)
+        for request_id, block_id, block_hash in (("req-A", 1, b"hA"), ("req-B", 2, b"hB")):
+            block_pool.cache_full_blocks_simple(request_id, [block_id], [block_hash])
+        tracked = block_pool.get_tracked_request_ids()
+        assert tracked == {"req-A", "req-B"}
+    def test_is_request_in_cache(self):
+        """Membership is true only for a cached request and false again after eviction."""
+        on_evict = MagicMock()
+        block_pool = TestableBlockPool(num_blocks=10, eviction_callback=on_evict)
+        block_pool.cache_full_blocks_simple("req-A", [1], [b"hA"])
+        assert block_pool.is_request_in_cache("req-A")
+        assert not block_pool.is_request_in_cache("req-B")
+        # Evicting the only block of req-A removes it from the cache view.
+        block_pool.evict_blocks({1})
+        assert not block_pool.is_request_in_cache("req-A")