hindsight-api 0.2.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
Files changed (88)
  1. hindsight_api/admin/__init__.py +1 -0
  2. hindsight_api/admin/cli.py +311 -0
  3. hindsight_api/alembic/versions/f1a2b3c4d5e6_add_memory_links_composite_index.py +44 -0
  4. hindsight_api/alembic/versions/g2a3b4c5d6e7_add_tags_column.py +48 -0
  5. hindsight_api/alembic/versions/h3c4d5e6f7g8_mental_models_v4.py +112 -0
  6. hindsight_api/alembic/versions/i4d5e6f7g8h9_delete_opinions.py +41 -0
  7. hindsight_api/alembic/versions/j5e6f7g8h9i0_mental_model_versions.py +95 -0
  8. hindsight_api/alembic/versions/k6f7g8h9i0j1_add_directive_subtype.py +58 -0
  9. hindsight_api/alembic/versions/l7g8h9i0j1k2_add_worker_columns.py +109 -0
  10. hindsight_api/alembic/versions/m8h9i0j1k2l3_mental_model_id_to_text.py +41 -0
  11. hindsight_api/alembic/versions/n9i0j1k2l3m4_learnings_and_pinned_reflections.py +134 -0
  12. hindsight_api/alembic/versions/o0j1k2l3m4n5_migrate_mental_models_data.py +113 -0
  13. hindsight_api/alembic/versions/p1k2l3m4n5o6_new_knowledge_architecture.py +194 -0
  14. hindsight_api/alembic/versions/q2l3m4n5o6p7_fix_mental_model_fact_type.py +50 -0
  15. hindsight_api/alembic/versions/r3m4n5o6p7q8_add_reflect_response_to_reflections.py +47 -0
  16. hindsight_api/alembic/versions/s4n5o6p7q8r9_add_consolidated_at_to_memory_units.py +53 -0
  17. hindsight_api/alembic/versions/t5o6p7q8r9s0_rename_mental_models_to_observations.py +134 -0
  18. hindsight_api/alembic/versions/u6p7q8r9s0t1_mental_models_text_id.py +41 -0
  19. hindsight_api/alembic/versions/v7q8r9s0t1u2_add_max_tokens_to_mental_models.py +50 -0
  20. hindsight_api/api/http.py +1406 -118
  21. hindsight_api/api/mcp.py +11 -196
  22. hindsight_api/config.py +359 -27
  23. hindsight_api/engine/consolidation/__init__.py +5 -0
  24. hindsight_api/engine/consolidation/consolidator.py +859 -0
  25. hindsight_api/engine/consolidation/prompts.py +69 -0
  26. hindsight_api/engine/cross_encoder.py +706 -88
  27. hindsight_api/engine/db_budget.py +284 -0
  28. hindsight_api/engine/db_utils.py +11 -0
  29. hindsight_api/engine/directives/__init__.py +5 -0
  30. hindsight_api/engine/directives/models.py +37 -0
  31. hindsight_api/engine/embeddings.py +553 -29
  32. hindsight_api/engine/entity_resolver.py +8 -5
  33. hindsight_api/engine/interface.py +40 -17
  34. hindsight_api/engine/llm_wrapper.py +744 -68
  35. hindsight_api/engine/memory_engine.py +2505 -1017
  36. hindsight_api/engine/mental_models/__init__.py +14 -0
  37. hindsight_api/engine/mental_models/models.py +53 -0
  38. hindsight_api/engine/query_analyzer.py +4 -3
  39. hindsight_api/engine/reflect/__init__.py +18 -0
  40. hindsight_api/engine/reflect/agent.py +933 -0
  41. hindsight_api/engine/reflect/models.py +109 -0
  42. hindsight_api/engine/reflect/observations.py +186 -0
  43. hindsight_api/engine/reflect/prompts.py +483 -0
  44. hindsight_api/engine/reflect/tools.py +437 -0
  45. hindsight_api/engine/reflect/tools_schema.py +250 -0
  46. hindsight_api/engine/response_models.py +168 -4
  47. hindsight_api/engine/retain/bank_utils.py +79 -201
  48. hindsight_api/engine/retain/fact_extraction.py +424 -195
  49. hindsight_api/engine/retain/fact_storage.py +35 -12
  50. hindsight_api/engine/retain/link_utils.py +29 -24
  51. hindsight_api/engine/retain/orchestrator.py +24 -43
  52. hindsight_api/engine/retain/types.py +11 -2
  53. hindsight_api/engine/search/graph_retrieval.py +43 -14
  54. hindsight_api/engine/search/link_expansion_retrieval.py +391 -0
  55. hindsight_api/engine/search/mpfp_retrieval.py +362 -117
  56. hindsight_api/engine/search/reranking.py +2 -2
  57. hindsight_api/engine/search/retrieval.py +848 -201
  58. hindsight_api/engine/search/tags.py +172 -0
  59. hindsight_api/engine/search/think_utils.py +42 -141
  60. hindsight_api/engine/search/trace.py +12 -1
  61. hindsight_api/engine/search/tracer.py +26 -6
  62. hindsight_api/engine/search/types.py +21 -3
  63. hindsight_api/engine/task_backend.py +113 -106
  64. hindsight_api/engine/utils.py +1 -152
  65. hindsight_api/extensions/__init__.py +10 -1
  66. hindsight_api/extensions/builtin/tenant.py +5 -1
  67. hindsight_api/extensions/context.py +10 -1
  68. hindsight_api/extensions/operation_validator.py +81 -4
  69. hindsight_api/extensions/tenant.py +26 -0
  70. hindsight_api/main.py +69 -6
  71. hindsight_api/mcp_local.py +12 -53
  72. hindsight_api/mcp_tools.py +494 -0
  73. hindsight_api/metrics.py +433 -48
  74. hindsight_api/migrations.py +141 -1
  75. hindsight_api/models.py +3 -3
  76. hindsight_api/pg0.py +53 -0
  77. hindsight_api/server.py +39 -2
  78. hindsight_api/worker/__init__.py +11 -0
  79. hindsight_api/worker/main.py +296 -0
  80. hindsight_api/worker/poller.py +486 -0
  81. {hindsight_api-0.2.1.dist-info → hindsight_api-0.4.0.dist-info}/METADATA +16 -6
  82. hindsight_api-0.4.0.dist-info/RECORD +112 -0
  83. {hindsight_api-0.2.1.dist-info → hindsight_api-0.4.0.dist-info}/entry_points.txt +2 -0
  84. hindsight_api/engine/retain/observation_regeneration.py +0 -254
  85. hindsight_api/engine/search/observation_utils.py +0 -125
  86. hindsight_api/engine/search/scoring.py +0 -159
  87. hindsight_api-0.2.1.dist-info/RECORD +0 -75
  88. {hindsight_api-0.2.1.dist-info → hindsight_api-0.4.0.dist-info}/WHEEL +0 -0
hindsight_api/engine/cross_encoder.py

@@ -6,17 +6,38 @@ Provides an interface for reranking with different backends.
 Configuration via environment variables - see hindsight_api.config for all env var names.
 """

+import asyncio
 import logging
 import os
 from abc import ABC, abstractmethod
+from concurrent.futures import ThreadPoolExecutor

 import httpx

 from ..config import (
+    DEFAULT_LITELLM_API_BASE,
+    DEFAULT_RERANKER_COHERE_MODEL,
+    DEFAULT_RERANKER_FLASHRANK_CACHE_DIR,
+    DEFAULT_RERANKER_FLASHRANK_MODEL,
+    DEFAULT_RERANKER_LITELLM_MODEL,
+    DEFAULT_RERANKER_LOCAL_MAX_CONCURRENT,
     DEFAULT_RERANKER_LOCAL_MODEL,
     DEFAULT_RERANKER_PROVIDER,
+    DEFAULT_RERANKER_TEI_BATCH_SIZE,
+    DEFAULT_RERANKER_TEI_MAX_CONCURRENT,
+    ENV_COHERE_API_KEY,
+    ENV_LITELLM_API_BASE,
+    ENV_LITELLM_API_KEY,
+    ENV_RERANKER_COHERE_BASE_URL,
+    ENV_RERANKER_COHERE_MODEL,
+    ENV_RERANKER_FLASHRANK_CACHE_DIR,
+    ENV_RERANKER_FLASHRANK_MODEL,
+    ENV_RERANKER_LITELLM_MODEL,
+    ENV_RERANKER_LOCAL_MAX_CONCURRENT,
     ENV_RERANKER_LOCAL_MODEL,
     ENV_RERANKER_PROVIDER,
+    ENV_RERANKER_TEI_BATCH_SIZE,
+    ENV_RERANKER_TEI_MAX_CONCURRENT,
     ENV_RERANKER_TEI_URL,
 )

@@ -47,7 +68,7 @@ class CrossEncoderModel(ABC):
         pass

     @abstractmethod
-    def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
+    async def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
         """
         Score query-document pairs for relevance.

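The hunk above is the central interface change in 0.4.0: CrossEncoderModel.predict is now an async coroutine, so every backend implements it with async def and every caller must await it. As a rough, hypothetical sketch (not part of the package), a minimal custom backend conforming to the new contract could look like the following, assuming the base class also exposes the provider_name property and the async initialize() seen in the built-in providers:

    import asyncio

    from hindsight_api.engine.cross_encoder import CrossEncoderModel


    class LengthPriorCrossEncoder(CrossEncoderModel):
        """Toy backend that scores by document length; only illustrates the async contract."""

        @property
        def provider_name(self) -> str:
            return "length-prior"

        async def initialize(self) -> None:
            pass  # nothing to load for this toy backend

        async def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
            # Real backends await network I/O here or offload CPU-bound scoring to a
            # thread pool, as the local and TEI providers further down in this diff do.
            return [min(len(doc) / 1000.0, 1.0) for _, doc in pairs]


    async def demo() -> None:
        model = LengthPriorCrossEncoder()
        await model.initialize()
        print(await model.predict([("query", "a short document")]))


    asyncio.run(demo())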
@@ -70,25 +91,34 @@ class LocalSTCrossEncoder(CrossEncoderModel):
     - Fast inference (~80ms for 100 pairs on CPU)
     - Small model (80MB)
     - Trained for passage re-ranking
+
+    Uses a dedicated thread pool to limit concurrent CPU-bound work.
     """

-    def __init__(self, model_name: str | None = None):
+    # Shared executor across all instances (one model loaded anyway)
+    _executor: ThreadPoolExecutor | None = None
+    _max_concurrent: int = 4  # Limit concurrent CPU-bound reranking calls
+
+    def __init__(self, model_name: str | None = None, max_concurrent: int = 4):
         """
         Initialize local SentenceTransformers cross-encoder.

         Args:
             model_name: Name of the CrossEncoder model to use.
                 Default: cross-encoder/ms-marco-MiniLM-L-6-v2
+            max_concurrent: Maximum concurrent reranking calls (default: 2).
+                Higher values may cause CPU thrashing under load.
         """
         self.model_name = model_name or DEFAULT_RERANKER_LOCAL_MODEL
         self._model = None
+        LocalSTCrossEncoder._max_concurrent = max_concurrent

     @property
     def provider_name(self) -> str:
         return "local"

     async def initialize(self) -> None:
-        """Load the cross-encoder model."""
+        """Load the cross-encoder model and initialize the executor."""
         if self._model is not None:
             return

@@ -101,13 +131,134 @@ class LocalSTCrossEncoder(CrossEncoderModel):
             )

         logger.info(f"Reranker: initializing local provider with model {self.model_name}")
-        self._model = CrossEncoder(self.model_name)
-        logger.info("Reranker: local provider initialized")

-    def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
+        # Determine device based on hardware availability.
+        # We always set low_cpu_mem_usage=False to prevent lazy loading (meta tensors)
+        # which can cause issues when accelerate is installed but no GPU is available.
+        # Note: We do NOT use device_map because CrossEncoder internally calls .to(device)
+        # after loading, which conflicts with accelerate's device_map handling.
+        import torch
+
+        # Check for GPU (CUDA) or Apple Silicon (MPS)
+        has_gpu = torch.cuda.is_available() or (hasattr(torch.backends, "mps") and torch.backends.mps.is_available())
+
+        if has_gpu:
+            device = None  # Let sentence-transformers auto-detect GPU/MPS
+        else:
+            device = "cpu"
+
+        self._model = CrossEncoder(
+            self.model_name,
+            device=device,
+            model_kwargs={"low_cpu_mem_usage": False},
+        )
+
+        # Initialize shared executor (limited workers naturally limits concurrency)
+        if LocalSTCrossEncoder._executor is None:
+            LocalSTCrossEncoder._executor = ThreadPoolExecutor(
+                max_workers=LocalSTCrossEncoder._max_concurrent,
+                thread_name_prefix="reranker",
+            )
+            logger.info(f"Reranker: local provider initialized (max_concurrent={LocalSTCrossEncoder._max_concurrent})")
+        else:
+            logger.info("Reranker: local provider initialized (using existing executor)")
+
+    def _is_xpc_error(self, error: Exception) -> bool:
+        """
+        Check if an error is an XPC connection error (macOS daemon issue).
+
+        On macOS, long-running daemons can lose XPC connections to system services
+        when the process is idle for extended periods.
+        """
+        error_str = str(error).lower()
+        return "xpc_error_connection_invalid" in error_str or "xpc error" in error_str
+
+    def _reinitialize_model_sync(self) -> None:
+        """
+        Clear and reinitialize the cross-encoder model synchronously.
+
+        This is used to recover from XPC errors on macOS where the
+        PyTorch/MPS backend loses its connection to system services.
+        """
+        logger.warning(f"Reinitializing reranker model {self.model_name} due to backend error")
+
+        # Clear existing model
+        self._model = None
+
+        # Force garbage collection to free resources
+        import gc
+
+        import torch
+
+        gc.collect()
+
+        # If using CUDA/MPS, clear the cache
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            try:
+                torch.mps.empty_cache()
+            except AttributeError:
+                pass  # Method might not exist in all PyTorch versions
+
+        # Reinitialize the model
+        try:
+            from sentence_transformers import CrossEncoder
+        except ImportError:
+            raise ImportError(
+                "sentence-transformers is required for LocalSTCrossEncoder. "
+                "Install it with: pip install sentence-transformers"
+            )
+
+        # Determine device based on hardware availability
+        has_gpu = torch.cuda.is_available() or (hasattr(torch.backends, "mps") and torch.backends.mps.is_available())
+
+        if has_gpu:
+            device = None  # Let sentence-transformers auto-detect GPU/MPS
+        else:
+            device = "cpu"
+
+        self._model = CrossEncoder(
+            self.model_name,
+            device=device,
+            model_kwargs={"low_cpu_mem_usage": False},
+        )
+
+        logger.info("Reranker: local provider reinitialized successfully")
+
+    def _predict_with_recovery(self, pairs: list[tuple[str, str]]) -> list[float]:
+        """
+        Predict with automatic recovery from XPC errors.
+
+        This runs synchronously in the thread pool.
+        """
+        max_retries = 1
+        for attempt in range(max_retries + 1):
+            try:
+                scores = self._model.predict(pairs, show_progress_bar=False)
+                return scores.tolist() if hasattr(scores, "tolist") else list(scores)
+            except Exception as e:
+                # Check if this is an XPC error (macOS daemon issue)
+                if self._is_xpc_error(e) and attempt < max_retries:
+                    logger.warning(f"XPC error detected in reranker (attempt {attempt + 1}): {e}")
+                    try:
+                        self._reinitialize_model_sync()
+                        logger.info("Reranker reinitialized successfully, retrying prediction")
+                        continue
+                    except Exception as reinit_error:
+                        logger.error(f"Failed to reinitialize reranker: {reinit_error}")
+                        raise Exception(f"Failed to recover from XPC error: {str(e)}")
+                else:
+                    # Not an XPC error or out of retries
+                    raise
+
+    async def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
         """
         Score query-document pairs for relevance.

+        Uses a dedicated thread pool with limited workers to prevent CPU thrashing.
+        Automatically recovers from XPC errors on macOS by reinitializing the model.
+
         Args:
             pairs: List of (query, document) tuples to score

@@ -116,8 +267,14 @@ class LocalSTCrossEncoder(CrossEncoderModel):
         """
         if self._model is None:
             raise RuntimeError("Reranker not initialized. Call initialize() first.")
-        scores = self._model.predict(pairs, show_progress_bar=False)
-        return scores.tolist() if hasattr(scores, "tolist") else list(scores)
+
+        # Use dedicated executor - limited workers naturally limits concurrency
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(
+            LocalSTCrossEncoder._executor,
+            self._predict_with_recovery,
+            pairs,
+        )


 class RemoteTEICrossEncoder(CrossEncoderModel):
@@ -128,13 +285,21 @@ class RemoteTEICrossEncoder(CrossEncoderModel):
     See: https://github.com/huggingface/text-embeddings-inference

     Note: The TEI server must be running a cross-encoder/reranker model.
+
+    Requests are made in parallel with configurable batch size and max concurrency (backpressure).
+    Uses a GLOBAL semaphore to limit concurrent requests across ALL recall operations.
     """

+    # Global semaphore shared across all instances and calls to prevent thundering herd
+    _global_semaphore: asyncio.Semaphore | None = None
+    _global_max_concurrent: int = DEFAULT_RERANKER_TEI_MAX_CONCURRENT
+
     def __init__(
         self,
         base_url: str,
         timeout: float = 30.0,
-        batch_size: int = 32,
+        batch_size: int = DEFAULT_RERANKER_TEI_BATCH_SIZE,
+        max_concurrent: int = DEFAULT_RERANKER_TEI_MAX_CONCURRENT,
         max_retries: int = 3,
         retry_delay: float = 0.5,
     ):
@@ -144,80 +309,246 @@ class RemoteTEICrossEncoder(CrossEncoderModel):
         Args:
             base_url: Base URL of the TEI server (e.g., "http://localhost:8080")
             timeout: Request timeout in seconds (default: 30.0)
-            batch_size: Maximum batch size for rerank requests (default: 32)
+            batch_size: Maximum batch size for rerank requests (default: 128)
+            max_concurrent: Maximum concurrent requests for backpressure (default: 8).
+                This is a GLOBAL limit across all parallel recall operations.
             max_retries: Maximum number of retries for failed requests (default: 3)
             retry_delay: Initial delay between retries in seconds, doubles each retry (default: 0.5)
         """
         self.base_url = base_url.rstrip("/")
         self.timeout = timeout
         self.batch_size = batch_size
+        self.max_concurrent = max_concurrent
         self.max_retries = max_retries
         self.retry_delay = retry_delay
-        self._client: httpx.Client | None = None
+        self._async_client: httpx.AsyncClient | None = None
         self._model_id: str | None = None

+        # Update global semaphore if max_concurrent changed
+        if (
+            RemoteTEICrossEncoder._global_semaphore is None
+            or RemoteTEICrossEncoder._global_max_concurrent != max_concurrent
+        ):
+            RemoteTEICrossEncoder._global_max_concurrent = max_concurrent
+            RemoteTEICrossEncoder._global_semaphore = asyncio.Semaphore(max_concurrent)
+
     @property
     def provider_name(self) -> str:
         return "tei"

-    def _request_with_retry(self, method: str, url: str, **kwargs) -> httpx.Response:
-        """Make an HTTP request with automatic retries on transient errors."""
-        import time
-
+    async def _async_request_with_retry(
+        self,
+        client: httpx.AsyncClient,
+        semaphore: asyncio.Semaphore,
+        method: str,
+        url: str,
+        **kwargs,
+    ) -> httpx.Response:
+        """Make an async HTTP request with automatic retries on transient errors and semaphore for backpressure."""
         last_error = None
         delay = self.retry_delay

-        for attempt in range(self.max_retries + 1):
-            try:
-                if method == "GET":
-                    response = self._client.get(url, **kwargs)
-                else:
-                    response = self._client.post(url, **kwargs)
-                response.raise_for_status()
-                return response
-            except (httpx.ConnectError, httpx.ReadTimeout, httpx.WriteTimeout) as e:
-                last_error = e
-                if attempt < self.max_retries:
-                    logger.warning(
-                        f"TEI request failed (attempt {attempt + 1}/{self.max_retries + 1}): {e}. Retrying in {delay}s..."
-                    )
-                    time.sleep(delay)
-                    delay *= 2  # Exponential backoff
-            except httpx.HTTPStatusError as e:
-                # Retry on 5xx server errors
-                if e.response.status_code >= 500 and attempt < self.max_retries:
+        async with semaphore:
+            for attempt in range(self.max_retries + 1):
+                try:
+                    if method == "GET":
+                        response = await client.get(url, **kwargs)
+                    else:
+                        response = await client.post(url, **kwargs)
+                    response.raise_for_status()
+                    return response
+                except (httpx.ConnectError, httpx.ReadTimeout, httpx.WriteTimeout) as e:
                     last_error = e
-                    logger.warning(
-                        f"TEI server error (attempt {attempt + 1}/{self.max_retries + 1}): {e}. Retrying in {delay}s..."
-                    )
-                    time.sleep(delay)
-                    delay *= 2
-                else:
-                    raise
+                    if attempt < self.max_retries:
+                        logger.warning(
+                            f"TEI request failed (attempt {attempt + 1}/{self.max_retries + 1}): {e}. "
+                            f"Retrying in {delay}s..."
+                        )
+                        await asyncio.sleep(delay)
+                        delay *= 2  # Exponential backoff
+                except httpx.HTTPStatusError as e:
+                    # Retry on 5xx server errors
+                    if e.response.status_code >= 500 and attempt < self.max_retries:
+                        last_error = e
+                        logger.warning(
+                            f"TEI server error (attempt {attempt + 1}/{self.max_retries + 1}): {e}. "
+                            f"Retrying in {delay}s..."
+                        )
+                        await asyncio.sleep(delay)
+                        delay *= 2
+                    else:
+                        raise

         raise last_error

     async def initialize(self) -> None:
         """Initialize the HTTP client and verify server connectivity."""
-        if self._client is not None:
+        if self._async_client is not None:
             return

-        logger.info(f"Reranker: initializing TEI provider at {self.base_url}")
-        self._client = httpx.Client(timeout=self.timeout)
+        logger.info(
+            f"Reranker: initializing TEI provider at {self.base_url} "
+            f"(batch_size={self.batch_size}, max_concurrent={self.max_concurrent})"
+        )
+        self._async_client = httpx.AsyncClient(timeout=self.timeout)

         # Verify server is reachable and get model info
+        # Use a temporary semaphore for initialization
+        init_semaphore = asyncio.Semaphore(1)
         try:
-            response = self._request_with_retry("GET", f"{self.base_url}/info")
+            response = await self._async_request_with_retry(
+                self._async_client, init_semaphore, "GET", f"{self.base_url}/info"
+            )
             info = response.json()
             self._model_id = info.get("model_id", "unknown")
             logger.info(f"Reranker: TEI provider initialized (model: {self._model_id})")
         except httpx.HTTPError as e:
+            self._async_client = None
             raise RuntimeError(f"Failed to connect to TEI server at {self.base_url}: {e}")

-    def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
+    async def _rerank_query_group(
+        self,
+        client: httpx.AsyncClient,
+        semaphore: asyncio.Semaphore,
+        query: str,
+        texts: list[str],
+    ) -> list[tuple[int, float]]:
+        """Rerank a single query group and return list of (original_index, score) tuples."""
+        try:
+            response = await self._async_request_with_retry(
+                client,
+                semaphore,
+                "POST",
+                f"{self.base_url}/rerank",
+                json={
+                    "query": query,
+                    "texts": texts,
+                    "return_text": False,
+                },
+            )
+            results = response.json()
+            # TEI returns results sorted by score descending, with original index
+            return [(result["index"], result["score"]) for result in results]
+        except httpx.HTTPError as e:
+            raise RuntimeError(f"TEI rerank request failed: {e}")
+
+    async def _predict_async(self, pairs: list[tuple[str, str]]) -> list[float]:
+        """Async implementation of predict that runs requests in parallel with backpressure."""
+        if not pairs:
+            return []
+
+        # Group all pairs by query
+        query_groups: dict[str, list[tuple[int, str]]] = {}
+        for idx, (query, text) in enumerate(pairs):
+            if query not in query_groups:
+                query_groups[query] = []
+            query_groups[query].append((idx, text))
+
+        # Split each query group into batches
+        tasks_info: list[tuple[str, list[int], list[str]]] = []  # (query, indices, texts)
+        for query, indexed_texts in query_groups.items():
+            indices = [idx for idx, _ in indexed_texts]
+            texts = [text for _, text in indexed_texts]
+
+            # Split into batches
+            for i in range(0, len(texts), self.batch_size):
+                batch_indices = indices[i : i + self.batch_size]
+                batch_texts = texts[i : i + self.batch_size]
+                tasks_info.append((query, batch_indices, batch_texts))
+
+        # Run all requests in parallel with GLOBAL semaphore for backpressure
+        # This ensures max_concurrent is respected across ALL parallel recall operations
+        all_scores = [0.0] * len(pairs)
+        semaphore = RemoteTEICrossEncoder._global_semaphore
+
+        tasks = [
+            self._rerank_query_group(self._async_client, semaphore, query, texts) for query, _, texts in tasks_info
+        ]
+        results = await asyncio.gather(*tasks)
+
+        # Map scores back to original positions
+        for (_, indices, _), result_scores in zip(tasks_info, results):
+            for original_idx_in_batch, score in result_scores:
+                global_idx = indices[original_idx_in_batch]
+                all_scores[global_idx] = score
+
+        return all_scores
+
+    async def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
         """
         Score query-document pairs using the remote TEI reranker.

+        Requests are made in parallel with configurable backpressure.
+
+        Args:
+            pairs: List of (query, document) tuples to score
+
+        Returns:
+            List of relevance scores
+        """
+        if self._async_client is None:
+            raise RuntimeError("Reranker not initialized. Call initialize() first.")
+
+        return await self._predict_async(pairs)
+
+
+class CohereCrossEncoder(CrossEncoderModel):
+    """
+    Cohere cross-encoder implementation using the Cohere Rerank API.
+
+    Supports rerank-english-v3.0 and rerank-multilingual-v3.0 models.
+    """
+
+    def __init__(
+        self,
+        api_key: str,
+        model: str = DEFAULT_RERANKER_COHERE_MODEL,
+        base_url: str | None = None,
+        timeout: float = 60.0,
+    ):
+        """
+        Initialize Cohere cross-encoder client.
+
+        Args:
+            api_key: Cohere API key
+            model: Cohere rerank model name (default: rerank-english-v3.0)
+            base_url: Custom base URL for Cohere-compatible API (e.g., Azure-hosted endpoint)
+            timeout: Request timeout in seconds (default: 60.0)
+        """
+        self.api_key = api_key
+        self.model = model
+        self.base_url = base_url
+        self.timeout = timeout
+        self._client = None
+
+    @property
+    def provider_name(self) -> str:
+        return "cohere"
+
+    async def initialize(self) -> None:
+        """Initialize the Cohere client."""
+        if self._client is not None:
+            return
+
+        try:
+            import cohere
+        except ImportError:
+            raise ImportError("cohere is required for CohereCrossEncoder. Install it with: pip install cohere")
+
+        base_url_msg = f" at {self.base_url}" if self.base_url else ""
+        logger.info(f"Reranker: initializing Cohere provider with model {self.model}{base_url_msg}")
+
+        # Build client kwargs, only including base_url if set (for Azure or custom endpoints)
+        client_kwargs = {"api_key": self.api_key, "timeout": self.timeout}
+        if self.base_url:
+            client_kwargs["base_url"] = self.base_url
+        self._client = cohere.Client(**client_kwargs)
+        logger.info("Reranker: Cohere provider initialized")
+
+    async def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
+        """
+        Score query-document pairs using the Cohere Rerank API.
+
         Args:
             pairs: List of (query, document) tuples to score

@@ -230,50 +561,312 @@ class RemoteTEICrossEncoder(CrossEncoderModel):
         if not pairs:
             return []

-        all_scores = []
+        # Run sync Cohere API calls in thread pool
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(None, self._predict_sync, pairs)
+
+    def _predict_sync(self, pairs: list[tuple[str, str]]) -> list[float]:
+        """Synchronous predict implementation for Cohere API."""
+        # Group pairs by query for efficient batching
+        # Cohere rerank expects one query with multiple documents
+        query_groups: dict[str, list[tuple[int, str]]] = {}
+        for idx, (query, text) in enumerate(pairs):
+            if query not in query_groups:
+                query_groups[query] = []
+            query_groups[query].append((idx, text))
+
+        all_scores = [0.0] * len(pairs)
+
+        for query, indexed_texts in query_groups.items():
+            texts = [text for _, text in indexed_texts]
+            indices = [idx for idx, _ in indexed_texts]
+
+            response = self._client.rerank(
+                query=query,
+                documents=texts,
+                model=self.model,
+                return_documents=False,
+            )

-        # Process in batches
-        for i in range(0, len(pairs), self.batch_size):
-            batch = pairs[i : i + self.batch_size]
+            # Map scores back to original positions
+            for result in response.results:
+                original_idx = result.index
+                score = result.relevance_score
+                all_scores[indices[original_idx]] = score

-            # TEI rerank endpoint expects query and texts separately
-            # All pairs in a batch should have the same query for optimal performance
-            # but we handle mixed queries by making separate requests per unique query
-            query_groups: dict[str, list[tuple[int, str]]] = {}
-            for idx, (query, text) in enumerate(batch):
-                if query not in query_groups:
-                    query_groups[query] = []
-                query_groups[query].append((idx, text))
+        return all_scores

-            batch_scores = [0.0] * len(batch)

-            for query, indexed_texts in query_groups.items():
-                texts = [text for _, text in indexed_texts]
-                indices = [idx for idx, _ in indexed_texts]
+class RRFPassthroughCrossEncoder(CrossEncoderModel):
+    """
+    Passthrough cross-encoder that preserves RRF scores without neural reranking.

-                try:
-                    response = self._request_with_retry(
-                        "POST",
-                        f"{self.base_url}/rerank",
-                        json={
-                            "query": query,
-                            "texts": texts,
-                            "return_text": False,
-                        },
-                    )
-                    results = response.json()
-
-                    # TEI returns results sorted by score descending, with original index
-                    for result in results:
-                        original_idx = result["index"]
-                        score = result["score"]
-                        # Map back to batch position
-                        batch_scores[indices[original_idx]] = score
-
-                except httpx.HTTPError as e:
-                    raise RuntimeError(f"TEI rerank request failed: {e}")
-
-            all_scores.extend(batch_scores)
+    This is useful for:
+    - Testing retrieval quality without reranking overhead
+    - Deployments where reranking latency is unacceptable
+    - Debugging to isolate retrieval vs reranking issues
+    """
+
+    def __init__(self):
+        """Initialize RRF passthrough cross-encoder."""
+        pass
+
+    @property
+    def provider_name(self) -> str:
+        return "rrf"
+
+    async def initialize(self) -> None:
+        """No initialization needed."""
+        logger.info("Reranker: RRF passthrough provider initialized (neural reranking disabled)")
+
+    async def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
+        """
+        Return neutral scores - actual ranking uses RRF scores from retrieval.
+
+        Args:
+            pairs: List of (query, document) tuples (ignored)
+
+        Returns:
+            List of 0.5 scores (neutral, lets RRF scores dominate)
+        """
+        # Return neutral scores so RRF ranking is preserved
+        return [0.5] * len(pairs)
+
+
+class FlashRankCrossEncoder(CrossEncoderModel):
+    """
+    FlashRank cross-encoder implementation.
+
+    FlashRank is an ultra-lite reranking library that runs on CPU without
+    requiring PyTorch or Transformers. It's ideal for serverless deployments
+    with minimal cold-start overhead.
+
+    Available models:
+    - ms-marco-TinyBERT-L-2-v2: Fastest, ~4MB
+    - ms-marco-MiniLM-L-12-v2: Best quality, ~34MB (default)
+    - rank-T5-flan: Best zero-shot, ~110MB
+    - ms-marco-MultiBERT-L-12: Multi-lingual, ~150MB
+    """
+
+    # Shared executor for CPU-bound reranking
+    _executor: ThreadPoolExecutor | None = None
+    _max_concurrent: int = 4
+
+    def __init__(
+        self,
+        model_name: str | None = None,
+        cache_dir: str | None = None,
+        max_length: int = 512,
+        max_concurrent: int = 4,
+    ):
+        """
+        Initialize FlashRank cross-encoder.
+
+        Args:
+            model_name: FlashRank model name. Default: ms-marco-MiniLM-L-12-v2
+            cache_dir: Directory to cache downloaded models. Default: system cache
+            max_length: Maximum sequence length for reranking. Default: 512
+            max_concurrent: Maximum concurrent reranking calls. Default: 4
+        """
+        self.model_name = model_name or DEFAULT_RERANKER_FLASHRANK_MODEL
+        self.cache_dir = cache_dir or DEFAULT_RERANKER_FLASHRANK_CACHE_DIR
+        self.max_length = max_length
+        self._ranker = None
+        FlashRankCrossEncoder._max_concurrent = max_concurrent
+
+    @property
+    def provider_name(self) -> str:
+        return "flashrank"
+
+    async def initialize(self) -> None:
+        """Load the FlashRank model."""
+        if self._ranker is not None:
+            return
+
+        try:
+            from flashrank import Ranker  # type: ignore[import-untyped]
+        except ImportError:
+            raise ImportError("flashrank is required for FlashRankCrossEncoder. Install it with: pip install flashrank")
+
+        logger.info(f"Reranker: initializing FlashRank provider with model {self.model_name}")
+
+        # Initialize ranker with optional cache directory
+        ranker_kwargs = {"model_name": self.model_name, "max_length": self.max_length}
+        if self.cache_dir:
+            ranker_kwargs["cache_dir"] = self.cache_dir
+
+        self._ranker = Ranker(**ranker_kwargs)
+
+        # Initialize shared executor
+        if FlashRankCrossEncoder._executor is None:
+            FlashRankCrossEncoder._executor = ThreadPoolExecutor(
+                max_workers=FlashRankCrossEncoder._max_concurrent,
+                thread_name_prefix="flashrank",
+            )
+            logger.info(
+                f"Reranker: FlashRank provider initialized (max_concurrent={FlashRankCrossEncoder._max_concurrent})"
+            )
+        else:
+            logger.info("Reranker: FlashRank provider initialized (using existing executor)")
+
+    def _predict_sync(self, pairs: list[tuple[str, str]]) -> list[float]:
+        """Synchronous predict - processes each query group."""
+        from flashrank import RerankRequest  # type: ignore[import-untyped]
+
+        if not pairs:
+            return []
+
+        # Group pairs by query
+        query_groups: dict[str, list[tuple[int, str]]] = {}
+        for idx, (query, text) in enumerate(pairs):
+            if query not in query_groups:
+                query_groups[query] = []
+            query_groups[query].append((idx, text))
+
+        all_scores = [0.0] * len(pairs)
+
+        for query, indexed_texts in query_groups.items():
+            # Build passages list for FlashRank
+            passages = [{"id": i, "text": text} for i, (_, text) in enumerate(indexed_texts)]
+            global_indices = [idx for idx, _ in indexed_texts]
+
+            # Create rerank request
+            request = RerankRequest(query=query, passages=passages)
+            results = self._ranker.rerank(request)
+
+            # Map scores back to original positions
+            for result in results:
+                local_idx = result["id"]
+                score = result["score"]
+                global_idx = global_indices[local_idx]
+                all_scores[global_idx] = score
+
+        return all_scores
+
+    async def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
+        """
+        Score query-document pairs using FlashRank.
+
+        Args:
+            pairs: List of (query, document) tuples to score
+
+        Returns:
+            List of relevance scores (higher = more relevant)
+        """
+        if self._ranker is None:
+            raise RuntimeError("Reranker not initialized. Call initialize() first.")
+
+        # Run in thread pool to avoid blocking event loop
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(FlashRankCrossEncoder._executor, self._predict_sync, pairs)
+
+
+class LiteLLMCrossEncoder(CrossEncoderModel):
+    """
+    LiteLLM cross-encoder implementation using LiteLLM proxy's /rerank endpoint.
+
+    LiteLLM provides a unified interface for multiple reranking providers via
+    the Cohere-compatible /rerank endpoint.
+    See: https://docs.litellm.ai/docs/rerank
+
+    Supported providers via LiteLLM:
+    - Cohere (rerank-english-v3.0, etc.) - prefix with cohere/
+    - Together AI - prefix with together_ai/
+    - Azure AI - prefix with azure_ai/
+    - Jina AI - prefix with jina_ai/
+    - AWS Bedrock - prefix with bedrock/
+    - Voyage AI - prefix with voyage/
+    """
+
+    def __init__(
+        self,
+        api_base: str = DEFAULT_LITELLM_API_BASE,
+        api_key: str | None = None,
+        model: str = DEFAULT_RERANKER_LITELLM_MODEL,
+        timeout: float = 60.0,
+    ):
+        """
+        Initialize LiteLLM cross-encoder client.
+
+        Args:
+            api_base: Base URL of the LiteLLM proxy (default: http://localhost:4000)
+            api_key: API key for the LiteLLM proxy (optional, depends on proxy config)
+            model: Reranking model name (default: cohere/rerank-english-v3.0)
+                Use provider prefix (e.g., cohere/, together_ai/, voyage/)
+            timeout: Request timeout in seconds (default: 60.0)
+        """
+        self.api_base = api_base.rstrip("/")
+        self.api_key = api_key
+        self.model = model
+        self.timeout = timeout
+        self._async_client: httpx.AsyncClient | None = None
+
+    @property
+    def provider_name(self) -> str:
+        return "litellm"
+
+    async def initialize(self) -> None:
+        """Initialize the async HTTP client."""
+        if self._async_client is not None:
+            return
+
+        logger.info(f"Reranker: initializing LiteLLM provider at {self.api_base} with model {self.model}")
+
+        headers = {"Content-Type": "application/json"}
+        if self.api_key:
+            headers["Authorization"] = f"Bearer {self.api_key}"
+
+        self._async_client = httpx.AsyncClient(timeout=self.timeout, headers=headers)
+        logger.info("Reranker: LiteLLM provider initialized")
+
+    async def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
+        """
+        Score query-document pairs using the LiteLLM proxy's /rerank endpoint.
+
+        Args:
+            pairs: List of (query, document) tuples to score
+
+        Returns:
+            List of relevance scores
+        """
+        if self._async_client is None:
+            raise RuntimeError("Reranker not initialized. Call initialize() first.")
+
+        if not pairs:
+            return []
+
+        # Group pairs by query (LiteLLM rerank expects one query with multiple documents)
+        query_groups: dict[str, list[tuple[int, str]]] = {}
+        for idx, (query, text) in enumerate(pairs):
+            if query not in query_groups:
+                query_groups[query] = []
+            query_groups[query].append((idx, text))
+
+        all_scores = [0.0] * len(pairs)
+
+        for query, indexed_texts in query_groups.items():
+            texts = [text for _, text in indexed_texts]
+            indices = [idx for idx, _ in indexed_texts]
+
+            # LiteLLM /rerank follows Cohere API format
+            response = await self._async_client.post(
+                f"{self.api_base}/rerank",
+                json={
+                    "model": self.model,
+                    "query": query,
+                    "documents": texts,
+                    "top_n": len(texts),  # Return all scores
+                },
+            )
+            response.raise_for_status()
+            result = response.json()
+
+            # Map scores back to original positions
+            # Response format: {"results": [{"index": 0, "relevance_score": 0.9}, ...]}
+            for item in result.get("results", []):
+                original_idx = item["index"]
+                score = item.get("relevance_score", item.get("score", 0.0))
+                all_scores[indices[original_idx]] = score

         return all_scores

@@ -293,10 +886,35 @@ def create_cross_encoder_from_env() -> CrossEncoderModel:
         url = os.environ.get(ENV_RERANKER_TEI_URL)
         if not url:
             raise ValueError(f"{ENV_RERANKER_TEI_URL} is required when {ENV_RERANKER_PROVIDER} is 'tei'")
-        return RemoteTEICrossEncoder(base_url=url)
+        batch_size = int(os.environ.get(ENV_RERANKER_TEI_BATCH_SIZE, str(DEFAULT_RERANKER_TEI_BATCH_SIZE)))
+        max_concurrent = int(os.environ.get(ENV_RERANKER_TEI_MAX_CONCURRENT, str(DEFAULT_RERANKER_TEI_MAX_CONCURRENT)))
+        return RemoteTEICrossEncoder(base_url=url, batch_size=batch_size, max_concurrent=max_concurrent)
     elif provider == "local":
         model = os.environ.get(ENV_RERANKER_LOCAL_MODEL)
         model_name = model or DEFAULT_RERANKER_LOCAL_MODEL
-        return LocalSTCrossEncoder(model_name=model_name)
+        max_concurrent = int(
+            os.environ.get(ENV_RERANKER_LOCAL_MAX_CONCURRENT, str(DEFAULT_RERANKER_LOCAL_MAX_CONCURRENT))
+        )
+        return LocalSTCrossEncoder(model_name=model_name, max_concurrent=max_concurrent)
+    elif provider == "cohere":
+        api_key = os.environ.get(ENV_COHERE_API_KEY)
+        if not api_key:
+            raise ValueError(f"{ENV_COHERE_API_KEY} is required when {ENV_RERANKER_PROVIDER} is 'cohere'")
+        model = os.environ.get(ENV_RERANKER_COHERE_MODEL, DEFAULT_RERANKER_COHERE_MODEL)
+        base_url = os.environ.get(ENV_RERANKER_COHERE_BASE_URL) or None
+        return CohereCrossEncoder(api_key=api_key, model=model, base_url=base_url)
+    elif provider == "flashrank":
+        model = os.environ.get(ENV_RERANKER_FLASHRANK_MODEL, DEFAULT_RERANKER_FLASHRANK_MODEL)
+        cache_dir = os.environ.get(ENV_RERANKER_FLASHRANK_CACHE_DIR, DEFAULT_RERANKER_FLASHRANK_CACHE_DIR)
+        return FlashRankCrossEncoder(model_name=model, cache_dir=cache_dir)
+    elif provider == "litellm":
+        api_base = os.environ.get(ENV_LITELLM_API_BASE, DEFAULT_LITELLM_API_BASE)
+        api_key = os.environ.get(ENV_LITELLM_API_KEY)
+        model = os.environ.get(ENV_RERANKER_LITELLM_MODEL, DEFAULT_RERANKER_LITELLM_MODEL)
+        return LiteLLMCrossEncoder(api_base=api_base, api_key=api_key, model=model)
+    elif provider == "rrf":
+        return RRFPassthroughCrossEncoder()
     else:
-        raise ValueError(f"Unknown reranker provider: {provider}. Supported: 'local', 'tei'")
+        raise ValueError(
+            f"Unknown reranker provider: {provider}. Supported: 'local', 'tei', 'cohere', 'flashrank', 'litellm', 'rrf'"
+        )
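With this last hunk the factory recognizes six providers: local, tei, cohere, flashrank, litellm, and rrf. The sketch below is a hedged usage example, not taken from the package documentation: it assumes the factory reads the provider name from the environment variable whose name is stored in the ENV_RERANKER_PROVIDER constant of hindsight_api.config (the literal variable names are not shown in this diff, so the sketch sets them through the config constants), and that the chosen provider's optional dependency (for example the flashrank package) is installed.

    # Hedged sketch: configure a reranker provider via the environment and score
    # pairs through the now-async predict(). Constant names come from
    # hindsight_api.config; the example provider choice is an assumption.
    import asyncio
    import os

    from hindsight_api import config
    from hindsight_api.engine.cross_encoder import create_cross_encoder_from_env


    async def main() -> None:
        # Pick a provider; "flashrank" requires `pip install flashrank`.
        os.environ[config.ENV_RERANKER_PROVIDER] = "flashrank"

        reranker = create_cross_encoder_from_env()
        await reranker.initialize()

        scores = await reranker.predict(
            [
                ("what is hindsight?", "Hindsight is a memory engine."),
                ("what is hindsight?", "Bananas are yellow."),
            ]
        )
        print(scores)  # higher score = more relevant document


    asyncio.run(main())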