hindsight-api 0.4.0__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/.gitignore +4 -1
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/PKG-INFO +1 -1
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/__init__.py +1 -1
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/api/http.py +3 -2
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/config.py +29 -1
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/consolidation/consolidator.py +114 -47
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/consolidation/prompts.py +21 -13
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/cross_encoder.py +50 -24
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/embeddings.py +45 -19
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/memory_engine.py +11 -5
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/reflect/tools.py +1 -1
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/extensions/builtin/tenant.py +8 -5
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/main.py +12 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/pyproject.toml +1 -1
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/README.md +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/admin/__init__.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/admin/cli.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/README +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/env.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/script.py.mako +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/5a366d414dce_initial_schema.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/b7c4d8e9f1a2_add_chunks_table.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/c8e5f2a3b4d1_add_retain_params_to_documents.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/d9f6a3b4c5e2_rename_bank_to_interactions.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/e0a1b2c3d4e5_disposition_to_3_traits.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/f1a2b3c4d5e6_add_memory_links_composite_index.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/g2a3b4c5d6e7_add_tags_column.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/h3c4d5e6f7g8_mental_models_v4.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/i4d5e6f7g8h9_delete_opinions.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/j5e6f7g8h9i0_mental_model_versions.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/k6f7g8h9i0j1_add_directive_subtype.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/l7g8h9i0j1k2_add_worker_columns.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/m8h9i0j1k2l3_mental_model_id_to_text.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/n9i0j1k2l3m4_learnings_and_pinned_reflections.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/o0j1k2l3m4n5_migrate_mental_models_data.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/p1k2l3m4n5o6_new_knowledge_architecture.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/q2l3m4n5o6p7_fix_mental_model_fact_type.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/r3m4n5o6p7q8_add_reflect_response_to_reflections.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/rename_personality_to_disposition.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/s4n5o6p7q8r9_add_consolidated_at_to_memory_units.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/t5o6p7q8r9s0_rename_mental_models_to_observations.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/u6p7q8r9s0t1_mental_models_text_id.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/alembic/versions/v7q8r9s0t1u2_add_max_tokens_to_mental_models.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/api/__init__.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/api/mcp.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/banner.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/daemon.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/__init__.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/consolidation/__init__.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/db_budget.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/db_utils.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/directives/__init__.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/directives/models.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/entity_resolver.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/interface.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/llm_wrapper.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/mental_models/__init__.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/mental_models/models.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/query_analyzer.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/reflect/__init__.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/reflect/agent.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/reflect/models.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/reflect/observations.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/reflect/prompts.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/reflect/tools_schema.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/response_models.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/retain/__init__.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/retain/bank_utils.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/retain/chunk_storage.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/retain/deduplication.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/retain/embedding_processing.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/retain/embedding_utils.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/retain/entity_processing.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/retain/fact_extraction.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/retain/fact_storage.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/retain/link_creation.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/retain/link_utils.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/retain/orchestrator.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/retain/types.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/search/__init__.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/search/fusion.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/search/graph_retrieval.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/search/link_expansion_retrieval.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/search/mpfp_retrieval.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/search/reranking.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/search/retrieval.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/search/tags.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/search/temporal_extraction.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/search/think_utils.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/search/trace.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/search/tracer.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/search/types.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/task_backend.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/utils.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/extensions/__init__.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/extensions/base.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/extensions/builtin/__init__.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/extensions/context.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/extensions/http.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/extensions/loader.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/extensions/operation_validator.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/extensions/tenant.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/mcp_local.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/mcp_tools.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/metrics.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/migrations.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/models.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/pg0.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/server.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/worker/__init__.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/worker/main.py +0 -0
- {hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/worker/poller.py +0 -0
{hindsight_api-0.4.0 → hindsight_api-0.4.1}/.gitignore

```diff
@@ -45,9 +45,12 @@ hindsight-docs/static/llms-full.txt
 
 hindsight-dev/benchmarks/locomo/results/
 hindsight-dev/benchmarks/longmemeval/results/
+hindsight-dev/benchmarks/consolidation/results/
+benchmarks/results/
 hindsight-cli/target
 hindsight-clients/rust/target
 .claude
 whats-next.md
 TASK.md
-
+# Changelog is now tracked in hindsight-docs/src/pages/changelog.md
+# CHANGELOG.md
```
{hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/api/http.py

```diff
@@ -1323,7 +1323,7 @@ class VersionResponse(BaseModel):
     model_config = ConfigDict(
         json_schema_extra={
             "example": {
-                "api_version": "
+                "api_version": "0.4.0",
                 "features": {
                     "observations": False,
                     "mcp": True,
@@ -1567,11 +1567,12 @@ def _register_routes(app: FastAPI):
         Returns version info and feature flags that can be used by clients
         to determine which capabilities are available.
         """
+        from hindsight_api import __version__
        from hindsight_api.config import get_config
 
        config = get_config()
        return VersionResponse(
-            api_version=
+            api_version=__version__,
            features=FeaturesInfo(
                observations=config.enable_observations,
                mcp=config.mcp_enabled,
```
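The version endpoint now derives `api_version` from the installed package instead of a hardcoded string. A hedged client-side probe follows: the response field names (`api_version`, `features.observations`, `features.mcp`) come from the schema above, but the route path, port, and base URL are assumptions, not confirmed by this diff.

```python
import httpx

def fetch_features(base_url: str) -> dict:
    # "/v1/version" is a hypothetical route; only the response fields are
    # grounded in the VersionResponse model shown in the diff above.
    resp = httpx.get(f"{base_url}/v1/version", timeout=5.0)
    resp.raise_for_status()
    data = resp.json()
    print(f"hindsight-api {data['api_version']}")
    return data.get("features", {})

features = fetch_features("http://localhost:8000")  # assumed port
if features.get("observations"):
    print("observation consolidation is enabled server-side")
```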
{hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/config.py

```diff
@@ -20,6 +20,7 @@ logger = logging.getLogger(__name__)
 
 # Environment variable names
 ENV_DATABASE_URL = "HINDSIGHT_API_DATABASE_URL"
+ENV_DATABASE_SCHEMA = "HINDSIGHT_API_DATABASE_SCHEMA"
 ENV_LLM_PROVIDER = "HINDSIGHT_API_LLM_PROVIDER"
 ENV_LLM_API_KEY = "HINDSIGHT_API_LLM_API_KEY"
 ENV_LLM_MODEL = "HINDSIGHT_API_LLM_MODEL"
@@ -46,6 +47,7 @@ ENV_CONSOLIDATION_LLM_BASE_URL = "HINDSIGHT_API_CONSOLIDATION_LLM_BASE_URL"
 
 ENV_EMBEDDINGS_PROVIDER = "HINDSIGHT_API_EMBEDDINGS_PROVIDER"
 ENV_EMBEDDINGS_LOCAL_MODEL = "HINDSIGHT_API_EMBEDDINGS_LOCAL_MODEL"
+ENV_EMBEDDINGS_LOCAL_FORCE_CPU = "HINDSIGHT_API_EMBEDDINGS_LOCAL_FORCE_CPU"
 ENV_EMBEDDINGS_TEI_URL = "HINDSIGHT_API_EMBEDDINGS_TEI_URL"
 ENV_EMBEDDINGS_OPENAI_API_KEY = "HINDSIGHT_API_EMBEDDINGS_OPENAI_API_KEY"
 ENV_EMBEDDINGS_OPENAI_MODEL = "HINDSIGHT_API_EMBEDDINGS_OPENAI_MODEL"
@@ -65,6 +67,7 @@ ENV_RERANKER_LITELLM_MODEL = "HINDSIGHT_API_RERANKER_LITELLM_MODEL"
 
 ENV_RERANKER_PROVIDER = "HINDSIGHT_API_RERANKER_PROVIDER"
 ENV_RERANKER_LOCAL_MODEL = "HINDSIGHT_API_RERANKER_LOCAL_MODEL"
+ENV_RERANKER_LOCAL_FORCE_CPU = "HINDSIGHT_API_RERANKER_LOCAL_FORCE_CPU"
 ENV_RERANKER_LOCAL_MAX_CONCURRENT = "HINDSIGHT_API_RERANKER_LOCAL_MAX_CONCURRENT"
 ENV_RERANKER_TEI_URL = "HINDSIGHT_API_RERANKER_TEI_URL"
 ENV_RERANKER_TEI_BATCH_SIZE = "HINDSIGHT_API_RERANKER_TEI_BATCH_SIZE"
@@ -98,6 +101,7 @@ ENV_RETAIN_OBSERVATIONS_ASYNC = "HINDSIGHT_API_RETAIN_OBSERVATIONS_ASYNC"
 # Observations settings (consolidated knowledge from facts)
 ENV_ENABLE_OBSERVATIONS = "HINDSIGHT_API_ENABLE_OBSERVATIONS"
 ENV_CONSOLIDATION_BATCH_SIZE = "HINDSIGHT_API_CONSOLIDATION_BATCH_SIZE"
+ENV_CONSOLIDATION_MAX_TOKENS = "HINDSIGHT_API_CONSOLIDATION_MAX_TOKENS"
 
 # Optimization flags
 ENV_SKIP_LLM_VERIFICATION = "HINDSIGHT_API_SKIP_LLM_VERIFICATION"
@@ -125,6 +129,7 @@ ENV_REFLECT_MAX_ITERATIONS = "HINDSIGHT_API_REFLECT_MAX_ITERATIONS"
 
 # Default values
 DEFAULT_DATABASE_URL = "pg0"
+DEFAULT_DATABASE_SCHEMA = "public"
 DEFAULT_LLM_PROVIDER = "openai"
 DEFAULT_LLM_MODEL = "gpt-5-mini"
 DEFAULT_LLM_MAX_CONCURRENT = 32
@@ -132,11 +137,13 @@ DEFAULT_LLM_TIMEOUT = 120.0  # seconds
 
 DEFAULT_EMBEDDINGS_PROVIDER = "local"
 DEFAULT_EMBEDDINGS_LOCAL_MODEL = "BAAI/bge-small-en-v1.5"
+DEFAULT_EMBEDDINGS_LOCAL_FORCE_CPU = False  # Force CPU mode for local embeddings (avoids MPS/XPC issues on macOS)
 DEFAULT_EMBEDDINGS_OPENAI_MODEL = "text-embedding-3-small"
 DEFAULT_EMBEDDING_DIMENSION = 384
 
 DEFAULT_RERANKER_PROVIDER = "local"
 DEFAULT_RERANKER_LOCAL_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+DEFAULT_RERANKER_LOCAL_FORCE_CPU = False  # Force CPU mode for local reranker (avoids MPS/XPC issues on macOS)
 DEFAULT_RERANKER_LOCAL_MAX_CONCURRENT = 4  # Limit concurrent CPU-bound reranking to prevent thrashing
 DEFAULT_RERANKER_TEI_BATCH_SIZE = 128
 DEFAULT_RERANKER_TEI_MAX_CONCURRENT = 8
@@ -177,6 +184,7 @@ DEFAULT_RETAIN_OBSERVATIONS_ASYNC = False  # Run observation generation async (a
 # Observations defaults (consolidated knowledge from facts)
 DEFAULT_ENABLE_OBSERVATIONS = True  # Observations enabled by default
 DEFAULT_CONSOLIDATION_BATCH_SIZE = 50  # Memories to load per batch (internal memory optimization)
+DEFAULT_CONSOLIDATION_MAX_TOKENS = 1024  # Max tokens for recall when finding related observations
 
 # Database migrations
 DEFAULT_RUN_MIGRATIONS_ON_STARTUP = True
```
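Taken together, 0.4.1 introduces four new configuration knobs, all read from the environment. A quick sketch of setting them from Python before the engine loads its config (names and defaults are taken verbatim from the diff; that the config is read once at startup is an assumption):

```python
import os

# New in 0.4.1; the values shown are the shipped defaults.
os.environ["HINDSIGHT_API_DATABASE_SCHEMA"] = "public"            # Postgres schema to use
os.environ["HINDSIGHT_API_EMBEDDINGS_LOCAL_FORCE_CPU"] = "false"  # "true"/"1" forces CPU
os.environ["HINDSIGHT_API_RERANKER_LOCAL_FORCE_CPU"] = "false"    # "true"/"1" forces CPU
os.environ["HINDSIGHT_API_CONSOLIDATION_MAX_TOKENS"] = "1024"     # recall token budget
```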
```diff
@@ -270,6 +278,7 @@ class HindsightConfig:
 
     # Database
     database_url: str
+    database_schema: str
 
     # LLM (default, used as fallback for per-operation config)
     llm_provider: str
@@ -298,6 +307,7 @@ class HindsightConfig:
     # Embeddings
     embeddings_provider: str
     embeddings_local_model: str
+    embeddings_local_force_cpu: bool
     embeddings_tei_url: str | None
     embeddings_openai_base_url: str | None
     embeddings_cohere_base_url: str | None
@@ -305,6 +315,8 @@ class HindsightConfig:
     # Reranker
     reranker_provider: str
     reranker_local_model: str
+    reranker_local_force_cpu: bool
+    reranker_local_max_concurrent: int
     reranker_tei_url: str | None
     reranker_tei_batch_size: int
     reranker_tei_max_concurrent: int
@@ -336,6 +348,7 @@ class HindsightConfig:
     # Observations settings (consolidated knowledge from facts)
     enable_observations: bool
     consolidation_batch_size: int
+    consolidation_max_tokens: int
 
     # Optimization flags
     skip_llm_verification: bool
@@ -367,6 +380,7 @@ class HindsightConfig:
         return cls(
             # Database
             database_url=os.getenv(ENV_DATABASE_URL, DEFAULT_DATABASE_URL),
+            database_schema=os.getenv(ENV_DATABASE_SCHEMA, DEFAULT_DATABASE_SCHEMA),
             # LLM
             llm_provider=os.getenv(ENV_LLM_PROVIDER, DEFAULT_LLM_PROVIDER),
             llm_api_key=os.getenv(ENV_LLM_API_KEY),
@@ -390,12 +404,23 @@ class HindsightConfig:
             # Embeddings
             embeddings_provider=os.getenv(ENV_EMBEDDINGS_PROVIDER, DEFAULT_EMBEDDINGS_PROVIDER),
             embeddings_local_model=os.getenv(ENV_EMBEDDINGS_LOCAL_MODEL, DEFAULT_EMBEDDINGS_LOCAL_MODEL),
+            embeddings_local_force_cpu=os.getenv(
+                ENV_EMBEDDINGS_LOCAL_FORCE_CPU, str(DEFAULT_EMBEDDINGS_LOCAL_FORCE_CPU)
+            ).lower()
+            in ("true", "1"),
             embeddings_tei_url=os.getenv(ENV_EMBEDDINGS_TEI_URL),
             embeddings_openai_base_url=os.getenv(ENV_EMBEDDINGS_OPENAI_BASE_URL) or None,
             embeddings_cohere_base_url=os.getenv(ENV_EMBEDDINGS_COHERE_BASE_URL) or None,
             # Reranker
             reranker_provider=os.getenv(ENV_RERANKER_PROVIDER, DEFAULT_RERANKER_PROVIDER),
             reranker_local_model=os.getenv(ENV_RERANKER_LOCAL_MODEL, DEFAULT_RERANKER_LOCAL_MODEL),
+            reranker_local_force_cpu=os.getenv(
+                ENV_RERANKER_LOCAL_FORCE_CPU, str(DEFAULT_RERANKER_LOCAL_FORCE_CPU)
+            ).lower()
+            in ("true", "1"),
+            reranker_local_max_concurrent=int(
+                os.getenv(ENV_RERANKER_LOCAL_MAX_CONCURRENT, str(DEFAULT_RERANKER_LOCAL_MAX_CONCURRENT))
+            ),
             reranker_tei_url=os.getenv(ENV_RERANKER_TEI_URL),
             reranker_tei_batch_size=int(os.getenv(ENV_RERANKER_TEI_BATCH_SIZE, str(DEFAULT_RERANKER_TEI_BATCH_SIZE))),
             reranker_tei_max_concurrent=int(
```
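One subtlety in the flag parsing above: only the strings `true` and `1` (case-insensitive) count as true, so `str(False).lower()` ("false") correctly defaults the flag off, and unrecognized values like `yes` are silently treated as false. The same pattern in isolation:

```python
import os

def env_flag(name: str, default: bool = False) -> bool:
    # Mirrors config.py: only "true" or "1" (any case) enable the flag;
    # everything else, including "yes" or "on", reads as False.
    return os.getenv(name, str(default)).lower() in ("true", "1")

os.environ["HINDSIGHT_API_RERANKER_LOCAL_FORCE_CPU"] = "1"
assert env_flag("HINDSIGHT_API_RERANKER_LOCAL_FORCE_CPU") is True
assert env_flag("HINDSIGHT_API_EMBEDDINGS_LOCAL_FORCE_CPU") is False  # unset -> default
```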
```diff
@@ -444,6 +469,9 @@ class HindsightConfig:
             consolidation_batch_size=int(
                 os.getenv(ENV_CONSOLIDATION_BATCH_SIZE, str(DEFAULT_CONSOLIDATION_BATCH_SIZE))
             ),
+            consolidation_max_tokens=int(
+                os.getenv(ENV_CONSOLIDATION_MAX_TOKENS, str(DEFAULT_CONSOLIDATION_MAX_TOKENS))
+            ),
             # Database migrations
             run_migrations_on_startup=os.getenv(ENV_RUN_MIGRATIONS_ON_STARTUP, "true").lower() == "true",
             # Database connection pool
@@ -515,7 +543,7 @@ class HindsightConfig:
 
     def log_config(self) -> None:
         """Log the current configuration (without sensitive values)."""
-        logger.info(f"Database: {self.database_url}")
+        logger.info(f"Database: {self.database_url} (schema: {self.database_schema})")
         logger.info(f"LLM: provider={self.llm_provider}, model={self.llm_model}")
         if self.retain_llm_provider or self.retain_llm_model:
             retain_provider = self.retain_llm_provider or self.llm_provider
```
{hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/consolidation/consolidator.py
RENAMED

```diff
@@ -639,28 +639,27 @@ async def _find_related_observations(
     request_context: "RequestContext",
 ) -> list[dict[str, Any]]:
     """
-    Find observations related to the given query using
+    Find observations related to the given query using optimized recall.
 
     IMPORTANT: We do NOT filter by tags here. Consolidation needs to see ALL
     potentially related observations regardless of scope, so the LLM can
     decide on tag routing (same scope update vs cross-scope create).
 
-
-
-    - BM25 text search (keyword matching)
-    - Entity-based retrieval (shared entities)
-    - Graph traversal (connected via entity links)
+    Uses max_tokens to naturally limit observations (no artificial count limit).
+    Includes source memories with dates for LLM context.
 
     Returns:
-        List of related observations with their tags
+        List of related observations with their tags, source memories, and dates
     """
-    # Use recall to find related observations
-    #
-
+    # Use recall to find related observations with token budget
+    # max_tokens naturally limits how many observations are returned
+    from ...config import get_config
+
+    config = get_config()
     recall_result = await memory_engine.recall_async(
         bank_id=bank_id,
         query=query,
-        max_tokens=
+        max_tokens=config.consolidation_max_tokens,  # Token budget for observations (configurable)
         fact_type=["observation"],  # Only retrieve observations
         request_context=request_context,
         _quiet=True,  # Suppress logging
@@ -668,43 +667,82 @@ async def _find_related_observations(
     )
 
     # If no observations returned, return empty list
-    # When fact_type=["observation"], results come back in `results` field
     if not recall_result.results:
         return []
 
-    #
+    # Batch fetch all observations in a single query (no artificial limit)
+    observation_ids = [uuid.UUID(obs.id) for obs in recall_result.results]
+
+    rows = await conn.fetch(
+        f"""
+        SELECT id, text, proof_count, history, tags, source_memory_ids, created_at, updated_at,
+               occurred_start, occurred_end, mentioned_at
+        FROM {fq_table("memory_units")}
+        WHERE id = ANY($1) AND bank_id = $2 AND fact_type = 'observation'
+        """,
+        observation_ids,
+        bank_id,
+    )
+
+    # Build results list preserving recall order
+    id_to_row = {row["id"]: row for row in rows}
     results = []
-    for obs in recall_result.results:
-        # Fetch full observation data from DB to get history, source_memory_ids, tags
-        row = await conn.fetchrow(
-            f"""
-            SELECT id, text, proof_count, history, tags, source_memory_ids, created_at, updated_at
-            FROM {fq_table("memory_units")}
-            WHERE id = $1 AND bank_id = $2 AND fact_type = 'observation'
-            """,
-            uuid.UUID(obs.id),
-            bank_id,
-        )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    for obs in recall_result.results:
+        obs_id = uuid.UUID(obs.id)
+        if obs_id not in id_to_row:
+            continue
+
+        row = id_to_row[obs_id]
+        history = row["history"]
+        if isinstance(history, str):
+            history = json.loads(history)
+        elif history is None:
+            history = []
+
+        # Fetch source memories to include their text and dates
+        source_memory_ids = row["source_memory_ids"] or []
+        source_memories = []
+
+        if source_memory_ids:
+            source_rows = await conn.fetch(
+                f"""
+                SELECT text, occurred_start, occurred_end, mentioned_at, event_date
+                FROM {fq_table("memory_units")}
+                WHERE id = ANY($1) AND bank_id = $2
+                ORDER BY created_at ASC
+                LIMIT 5
+                """,
+                source_memory_ids[:5],  # Limit to first 5 source memories for token efficiency
+                bank_id,
            )
 
+            for src_row in source_rows:
+                source_memories.append(
+                    {
+                        "text": src_row["text"],
+                        "occurred_start": src_row["occurred_start"],
+                        "occurred_end": src_row["occurred_end"],
+                        "mentioned_at": src_row["mentioned_at"],
+                        "event_date": src_row["event_date"],
+                    }
+                )
+
+        results.append(
+            {
+                "id": row["id"],
+                "text": row["text"],
+                "proof_count": row["proof_count"] or 1,
+                "tags": row["tags"] or [],
+                "source_memories": source_memories,
+                "occurred_start": row["occurred_start"],
+                "occurred_end": row["occurred_end"],
+                "mentioned_at": row["mentioned_at"],
+                "created_at": row["created_at"],
+                "updated_at": row["updated_at"],
+            }
+        )
+
     return results
 
 
```
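The substantive change here is replacing a per-observation `fetchrow` loop (a classic N+1 query pattern) with a single `id = ANY($1)` fetch, then restoring recall order in Python. The same pattern in isolation, as a sketch with asyncpg (the table and columns are illustrative, not the package's exact schema):

```python
import uuid
import asyncpg

async def fetch_preserving_order(conn: asyncpg.Connection, ids: list[uuid.UUID]) -> list:
    # One round-trip instead of len(ids) fetchrow() calls. Postgres does not
    # guarantee row order for "id = ANY($1)", so order is rebuilt via a dict.
    rows = await conn.fetch(
        "SELECT id, text FROM memory_units WHERE id = ANY($1)",  # illustrative table
        ids,
    )
    id_to_row = {row["id"]: row for row in rows}
    # Drop ids the query filtered out, just as the consolidator skips
    # rows that are not observations.
    return [id_to_row[i] for i in ids if i in id_to_row]
```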
```diff
@@ -732,14 +770,43 @@ async def _consolidate_with_llm(
     - {"action": "create", "text": "...", "reason": "..."}
     - [] if fact is purely ephemeral (no durable knowledge)
     """
-    # Format observations
+    # Format observations as JSON with source memories and dates
     if observations:
-
-
-
-
+        obs_list = []
+        for obs in observations:
+            obs_data = {
+                "id": str(obs["id"]),
+                "text": obs["text"],
+                "proof_count": obs["proof_count"],
+                "tags": obs["tags"],
+                "created_at": obs["created_at"].isoformat() if obs.get("created_at") else None,
+                "updated_at": obs["updated_at"].isoformat() if obs.get("updated_at") else None,
+            }
+
+            # Include temporal info if available
+            if obs.get("occurred_start"):
+                obs_data["occurred_start"] = obs["occurred_start"].isoformat()
+            if obs.get("occurred_end"):
+                obs_data["occurred_end"] = obs["occurred_end"].isoformat()
+            if obs.get("mentioned_at"):
+                obs_data["mentioned_at"] = obs["mentioned_at"].isoformat()
+
+            # Include source memories (up to 3 for brevity)
+            if obs.get("source_memories"):
+                obs_data["source_memories"] = [
+                    {
+                        "text": sm["text"],
+                        "event_date": sm["event_date"].isoformat() if sm.get("event_date") else None,
+                        "occurred_start": sm["occurred_start"].isoformat() if sm.get("occurred_start") else None,
+                    }
+                    for sm in obs["source_memories"][:3]  # Limit to 3 for token efficiency
+                ]
+
+            obs_list.append(obs_data)
+
+        observations_text = json.dumps(obs_list, indent=2)
     else:
-        observations_text = "
+        observations_text = "[]"
 
     # Only include mission section if mission is set and not the default
     mission_section = ""
```
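For reference, the serialization above yields an `observations_text` shaped roughly like the literal below. Every value here is invented for illustration; only the key names are grounded in the code.

```python
example_observations_payload = [
    {
        "id": "7b0c9a1e-0000-0000-0000-000000000000",  # invented
        "text": "User prefers morning meetings",        # invented
        "proof_count": 3,
        "tags": ["scope:example"],                      # invented
        "created_at": "2025-01-10T09:00:00",
        "updated_at": "2025-02-01T14:30:00",
        "occurred_start": "2025-01-05T00:00:00",
        "source_memories": [
            {
                "text": "Asked to move standup to 9am",  # invented
                "event_date": "2025-01-05T00:00:00",
                "occurred_start": None,
            }
        ],
    }
]
```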
{hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/consolidation/prompts.py

```diff
@@ -47,23 +47,31 @@ CONSOLIDATION_USER_PROMPT = """Analyze this new fact and consolidate into knowle
 {mission_section}
 NEW FACT: {fact_text}
 
-EXISTING OBSERVATIONS:
+EXISTING OBSERVATIONS (JSON array with source memories and dates):
 {observations_text}
 
-
-
-
-
-
+Each observation includes:
+- id: unique identifier for updating
+- text: the observation content
+- proof_count: number of supporting memories
+- tags: visibility scope (handled automatically)
+- created_at/updated_at: when observation was created/modified
+- occurred_start/occurred_end: temporal range of source facts
+- source_memories: array of supporting facts with their text and dates
 
-
+Instructions:
+1. Extract DURABLE KNOWLEDGE from the new fact (not ephemeral state)
+2. Review source_memories in existing observations to understand evidence
+3. Check dates to detect contradictions or updates
+4. Compare with observations:
+   - Same topic → UPDATE with learning_id
+   - New topic → CREATE new observation
+   - Purely ephemeral → return []
+
+Output JSON array of actions:
 [
-{{"action": "update", "learning_id": "uuid", "text": "updated
+{{"action": "update", "learning_id": "uuid-from-observations", "text": "updated knowledge", "reason": "..."}},
 {{"action": "create", "text": "new durable knowledge", "reason": "..."}}
 ]
 
-
-[]
-
-If no observations exist and fact contains durable knowledge:
-[{{"action": "create", "text": "durable knowledge text", "reason": "new topic"}}]"""
+Return [] if fact contains no durable knowledge."""
```
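The rewritten prompt pins the model to a strict contract: a JSON array of `update`/`create` actions, or `[]` for ephemeral facts. A defensive consumer-side parser, sketched (the function and its strictness are illustrative; only the action shape comes from the prompt):

```python
import json

def parse_consolidation_actions(raw: str) -> list[dict]:
    # Contract from CONSOLIDATION_USER_PROMPT: a JSON array where each item is
    # {"action": "update", "learning_id": ..., "text": ..., "reason": ...} or
    # {"action": "create", "text": ..., "reason": ...}; [] means ephemeral.
    actions = json.loads(raw)
    if not isinstance(actions, list):
        raise ValueError("expected a JSON array of actions")
    for action in actions:
        kind = action.get("action")
        if kind not in ("update", "create"):
            raise ValueError(f"unknown action: {kind!r}")
        if kind == "update" and "learning_id" not in action:
            raise ValueError("update action is missing learning_id")
    return actions
```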
{hindsight_api-0.4.0 → hindsight_api-0.4.1}/hindsight_api/engine/cross_encoder.py

```diff
@@ -20,6 +20,7 @@ from ..config import (
     DEFAULT_RERANKER_FLASHRANK_CACHE_DIR,
     DEFAULT_RERANKER_FLASHRANK_MODEL,
     DEFAULT_RERANKER_LITELLM_MODEL,
+    DEFAULT_RERANKER_LOCAL_FORCE_CPU,
     DEFAULT_RERANKER_LOCAL_MAX_CONCURRENT,
     DEFAULT_RERANKER_LOCAL_MODEL,
     DEFAULT_RERANKER_PROVIDER,
@@ -33,6 +34,7 @@ from ..config import (
     ENV_RERANKER_FLASHRANK_CACHE_DIR,
     ENV_RERANKER_FLASHRANK_MODEL,
     ENV_RERANKER_LITELLM_MODEL,
+    ENV_RERANKER_LOCAL_FORCE_CPU,
     ENV_RERANKER_LOCAL_MAX_CONCURRENT,
     ENV_RERANKER_LOCAL_MODEL,
     ENV_RERANKER_PROVIDER,
@@ -99,7 +101,7 @@ class LocalSTCrossEncoder(CrossEncoderModel):
     _executor: ThreadPoolExecutor | None = None
     _max_concurrent: int = 4  # Limit concurrent CPU-bound reranking calls
 
-    def __init__(self, model_name: str | None = None, max_concurrent: int = 4):
+    def __init__(self, model_name: str | None = None, max_concurrent: int = 4, force_cpu: bool = False):
         """
         Initialize local SentenceTransformers cross-encoder.
 
@@ -108,8 +110,11 @@ class LocalSTCrossEncoder(CrossEncoderModel):
                 Default: cross-encoder/ms-marco-MiniLM-L-6-v2
             max_concurrent: Maximum concurrent reranking calls (default: 2).
                 Higher values may cause CPU thrashing under load.
+            force_cpu: Force CPU mode (avoids MPS/XPC issues on macOS in daemon mode).
+                Default: False
         """
         self.model_name = model_name or DEFAULT_RERANKER_LOCAL_MODEL
+        self.force_cpu = force_cpu
         self._model = None
         LocalSTCrossEncoder._max_concurrent = max_concurrent
 
@@ -139,13 +144,23 @@ class LocalSTCrossEncoder(CrossEncoderModel):
         # after loading, which conflicts with accelerate's device_map handling.
         import torch
 
-        #
-
-
-        if has_gpu:
-            device = None  # Let sentence-transformers auto-detect GPU/MPS
-        else:
+        # Force CPU mode if configured (used in daemon mode to avoid MPS/XPC issues on macOS)
+        if self.force_cpu:
             device = "cpu"
+            logger.info("Reranker: forcing CPU mode (HINDSIGHT_API_RERANKER_LOCAL_FORCE_CPU=1)")
+        else:
+            # Check for GPU (CUDA) or Apple Silicon (MPS)
+            # Wrap in try-except to gracefully handle any device detection issues
+            # (e.g., in CI environments or when PyTorch is built without GPU support)
+            device = "cpu"  # Default to CPU
+            try:
+                has_gpu = torch.cuda.is_available() or (
+                    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+                )
+                if has_gpu:
+                    device = None  # Let sentence-transformers auto-detect GPU/MPS
+            except Exception as e:
+                logger.warning(f"Failed to detect GPU/MPS, falling back to CPU: {e}")
 
         self._model = CrossEncoder(
             self.model_name,
```
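The guarded detection logic is worth seeing on its own: pin to CPU when forced, otherwise try CUDA then MPS, and treat any detection failure as CPU. The same decision in isolation (a sketch, not the package's exact code path):

```python
import torch

def pick_device(force_cpu: bool = False) -> str | None:
    # Returns "cpu" to pin the model, or None so sentence-transformers
    # auto-selects CUDA/MPS. Detection failures (e.g. a torch build without
    # GPU support, or a sandboxed macOS daemon) degrade to CPU, not a crash.
    if force_cpu:
        return "cpu"
    try:
        if torch.cuda.is_available() or (
            hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        ):
            return None
    except Exception:
        pass
    return "cpu"
```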
```diff
@@ -211,12 +226,19 @@ class LocalSTCrossEncoder(CrossEncoderModel):
         )
 
         # Determine device based on hardware availability
-
-
-        if has_gpu:
-            device = None  # Let sentence-transformers auto-detect GPU/MPS
-        else:
+        if self.force_cpu:
             device = "cpu"
+        else:
+            # Wrap in try-except to gracefully handle any device detection issues
+            device = "cpu"  # Default to CPU
+            try:
+                has_gpu = torch.cuda.is_available() or (
+                    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+                )
+                if has_gpu:
+                    device = None  # Let sentence-transformers auto-detect GPU/MPS
+            except Exception as e:
+                logger.warning(f"Failed to detect GPU/MPS during reinit, falling back to CPU: {e}")
 
         self._model = CrossEncoder(
             self.model_name,
@@ -873,29 +895,33 @@ class LiteLLMCrossEncoder(CrossEncoderModel):
 
 def create_cross_encoder_from_env() -> CrossEncoderModel:
     """
-    Create a CrossEncoderModel instance based on
+    Create a CrossEncoderModel instance based on configuration.
 
-
+    Reads configuration via get_config() to ensure consistency across the codebase.
 
     Returns:
         Configured CrossEncoderModel instance
     """
-
+    from ..config import get_config
+
+    config = get_config()
+    provider = config.reranker_provider.lower()
 
     if provider == "tei":
-        url =
+        url = config.reranker_tei_url
         if not url:
             raise ValueError(f"{ENV_RERANKER_TEI_URL} is required when {ENV_RERANKER_PROVIDER} is 'tei'")
-
-
-
+        return RemoteTEICrossEncoder(
+            base_url=url,
+            batch_size=config.reranker_tei_batch_size,
+            max_concurrent=config.reranker_tei_max_concurrent,
+        )
     elif provider == "local":
-
-
-
-
+        return LocalSTCrossEncoder(
+            model_name=config.reranker_local_model,
+            max_concurrent=config.reranker_local_max_concurrent,
+            force_cpu=config.reranker_local_force_cpu,
         )
-        return LocalSTCrossEncoder(model_name=model_name, max_concurrent=max_concurrent)
     elif provider == "cohere":
         api_key = os.environ.get(ENV_COHERE_API_KEY)
         if not api_key:
```