hindsight-api 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -178,108 +178,16 @@ class LocalSTCrossEncoder(CrossEncoderModel):
         else:
             logger.info("Reranker: local provider initialized (using existing executor)")
 
-    def _is_xpc_error(self, error: Exception) -> bool:
-        """
-        Check if an error is an XPC connection error (macOS daemon issue).
-
-        On macOS, long-running daemons can lose XPC connections to system services
-        when the process is idle for extended periods.
-        """
-        error_str = str(error).lower()
-        return "xpc_error_connection_invalid" in error_str or "xpc error" in error_str
-
-    def _reinitialize_model_sync(self) -> None:
-        """
-        Clear and reinitialize the cross-encoder model synchronously.
-
-        This is used to recover from XPC errors on macOS where the
-        PyTorch/MPS backend loses its connection to system services.
-        """
-        logger.warning(f"Reinitializing reranker model {self.model_name} due to backend error")
-
-        # Clear existing model
-        self._model = None
-
-        # Force garbage collection to free resources
-        import gc
-
-        import torch
-
-        gc.collect()
-
-        # If using CUDA/MPS, clear the cache
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-            try:
-                torch.mps.empty_cache()
-            except AttributeError:
-                pass  # Method might not exist in all PyTorch versions
-
-        # Reinitialize the model
-        try:
-            from sentence_transformers import CrossEncoder
-        except ImportError:
-            raise ImportError(
-                "sentence-transformers is required for LocalSTCrossEncoder. "
-                "Install it with: pip install sentence-transformers"
-            )
-
-        # Determine device based on hardware availability
-        if self.force_cpu:
-            device = "cpu"
-        else:
-            # Wrap in try-except to gracefully handle any device detection issues
-            device = "cpu"  # Default to CPU
-            try:
-                has_gpu = torch.cuda.is_available() or (
-                    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
-                )
-                if has_gpu:
-                    device = None  # Let sentence-transformers auto-detect GPU/MPS
-            except Exception as e:
-                logger.warning(f"Failed to detect GPU/MPS during reinit, falling back to CPU: {e}")
-
-        self._model = CrossEncoder(
-            self.model_name,
-            device=device,
-            model_kwargs={"low_cpu_mem_usage": False},
-        )
-
-        logger.info("Reranker: local provider reinitialized successfully")
-
-    def _predict_with_recovery(self, pairs: list[tuple[str, str]]) -> list[float]:
-        """
-        Predict with automatic recovery from XPC errors.
-
-        This runs synchronously in the thread pool.
-        """
-        max_retries = 1
-        for attempt in range(max_retries + 1):
-            try:
-                scores = self._model.predict(pairs, show_progress_bar=False)
-                return scores.tolist() if hasattr(scores, "tolist") else list(scores)
-            except Exception as e:
-                # Check if this is an XPC error (macOS daemon issue)
-                if self._is_xpc_error(e) and attempt < max_retries:
-                    logger.warning(f"XPC error detected in reranker (attempt {attempt + 1}): {e}")
-                    try:
-                        self._reinitialize_model_sync()
-                        logger.info("Reranker reinitialized successfully, retrying prediction")
-                        continue
-                    except Exception as reinit_error:
-                        logger.error(f"Failed to reinitialize reranker: {reinit_error}")
-                        raise Exception(f"Failed to recover from XPC error: {str(e)}")
-                else:
-                    # Not an XPC error or out of retries
-                    raise
+    def _predict_sync(self, pairs: list[tuple[str, str]]) -> list[float]:
+        """Synchronous prediction wrapper for thread pool execution."""
+        scores = self._model.predict(pairs, show_progress_bar=False)
+        return scores.tolist() if hasattr(scores, "tolist") else list(scores)
 
     async def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
         """
         Score query-document pairs for relevance.
 
         Uses a dedicated thread pool with limited workers to prevent CPU thrashing.
-        Automatically recovers from XPC errors on macOS by reinitializing the model.
 
         Args:
             pairs: List of (query, document) tuples to score
@@ -294,7 +202,7 @@ class LocalSTCrossEncoder(CrossEncoderModel):
         loop = asyncio.get_event_loop()
         return await loop.run_in_executor(
             LocalSTCrossEncoder._executor,
-            self._predict_with_recovery,
+            self._predict_sync,
             pairs,
         )
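
With the XPC recovery path removed, what remains is a plain hand-off of the blocking cross-encoder call to a shared thread pool. A minimal, self-contained sketch of that pattern follows; the executor size, helper names, and module-level layout are illustrative assumptions, not the package's actual structure.

import asyncio
from concurrent.futures import ThreadPoolExecutor

_executor = ThreadPoolExecutor(max_workers=2)  # assumed size; the real class manages its own executor

def _predict_sync(model, pairs):
    # Blocking sentence-transformers CrossEncoder.predict call, kept off the event loop
    scores = model.predict(pairs, show_progress_bar=False)
    return scores.tolist() if hasattr(scores, "tolist") else list(scores)

async def predict(model, pairs):
    loop = asyncio.get_event_loop()
    return await loop.run_in_executor(_executor, _predict_sync, model, pairs)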
 
@@ -706,7 +614,7 @@ class FlashRankCrossEncoder(CrossEncoderModel):
             return
 
         try:
-            from flashrank import Ranker  # type: ignore[import-untyped]
+            from flashrank import Ranker
         except ImportError:
             raise ImportError("flashrank is required for FlashRankCrossEncoder. Install it with: pip install flashrank")
 
@@ -733,7 +641,7 @@ class FlashRankCrossEncoder(CrossEncoderModel):
 
     def _predict_sync(self, pairs: list[tuple[str, str]]) -> list[float]:
         """Synchronous predict - processes each query group."""
-        from flashrank import RerankRequest  # type: ignore[import-untyped]
+        from flashrank import RerankRequest
 
         if not pairs:
             return []
@@ -166,82 +166,10 @@ class LocalSTEmbeddings(Embeddings):
         self._dimension = self._model.get_sentence_embedding_dimension()
         logger.info(f"Embeddings: local provider initialized (dim: {self._dimension})")
 
-    def _is_xpc_error(self, error: Exception) -> bool:
-        """
-        Check if an error is an XPC connection error (macOS daemon issue).
-
-        On macOS, long-running daemons can lose XPC connections to system services
-        when the process is idle for extended periods.
-        """
-        error_str = str(error).lower()
-        return "xpc_error_connection_invalid" in error_str or "xpc error" in error_str
-
-    def _reinitialize_model_sync(self) -> None:
-        """
-        Clear and reinitialize the embedding model synchronously.
-
-        This is used to recover from XPC errors on macOS where the
-        PyTorch/MPS backend loses its connection to system services.
-        """
-        logger.warning(f"Reinitializing embedding model {self.model_name} due to backend error")
-
-        # Clear existing model
-        self._model = None
-
-        # Force garbage collection to free resources
-        import gc
-
-        import torch
-
-        gc.collect()
-
-        # If using CUDA/MPS, clear the cache
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-            try:
-                torch.mps.empty_cache()
-            except AttributeError:
-                pass  # Method might not exist in all PyTorch versions
-
-        # Reinitialize the model (inline version of initialize() but synchronous)
-        try:
-            from sentence_transformers import SentenceTransformer
-        except ImportError:
-            raise ImportError(
-                "sentence-transformers is required for LocalSTEmbeddings. "
-                "Install it with: pip install sentence-transformers"
-            )
-
-        # Determine device based on hardware availability
-        if self.force_cpu:
-            device = "cpu"
-        else:
-            # Wrap in try-except to gracefully handle any device detection issues
-            device = "cpu"  # Default to CPU
-            try:
-                has_gpu = torch.cuda.is_available() or (
-                    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
-                )
-                if has_gpu:
-                    device = None  # Let sentence-transformers auto-detect GPU/MPS
-            except Exception as e:
-                logger.warning(f"Failed to detect GPU/MPS during reinit, falling back to CPU: {e}")
-
-        self._model = SentenceTransformer(
-            self.model_name,
-            device=device,
-            model_kwargs={"low_cpu_mem_usage": False},
-        )
-
-        logger.info("Embeddings: local provider reinitialized successfully")
-
     def encode(self, texts: list[str]) -> list[list[float]]:
         """
         Generate embeddings for a list of texts.
 
-        Automatically recovers from XPC errors on macOS by reinitializing the model.
-
         Args:
             texts: List of text strings to encode
 
@@ -251,26 +179,8 @@ class LocalSTEmbeddings(Embeddings):
         if self._model is None:
             raise RuntimeError("Embeddings not initialized. Call initialize() first.")
 
-        # Try encoding with automatic recovery from XPC errors
-        max_retries = 1
-        for attempt in range(max_retries + 1):
-            try:
-                embeddings = self._model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
-                return [emb.tolist() for emb in embeddings]
-            except Exception as e:
-                # Check if this is an XPC error (macOS daemon issue)
-                if self._is_xpc_error(e) and attempt < max_retries:
-                    logger.warning(f"XPC error detected in embedding generation (attempt {attempt + 1}): {e}")
-                    try:
-                        self._reinitialize_model_sync()
-                        logger.info("Model reinitialized successfully, retrying embedding generation")
-                        continue
-                    except Exception as reinit_error:
-                        logger.error(f"Failed to reinitialize model: {reinit_error}")
-                        raise Exception(f"Failed to recover from XPC error: {str(e)}")
-                else:
-                    # Not an XPC error or out of retries
-                    raise
+        embeddings = self._model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+        return [emb.tolist() for emb in embeddings]
 
 
 class RemoteTEIEmbeddings(Embeddings):
@@ -635,7 +545,7 @@ class CohereEmbeddings(Embeddings):
             model=self.model,
             input_type=self.input_type,
         )
-        if response.embeddings:
+        if response.embeddings and isinstance(response.embeddings, list):
            self._dimension = len(response.embeddings[0])
 
        logger.info(f"Embeddings: Cohere provider initialized (model: {self.model}, dim: {self._dimension})")
@@ -442,49 +442,6 @@ class MemoryEngineInterface(ABC):
         """
         ...
 
-    @abstractmethod
-    async def get_entity_observations(
-        self,
-        bank_id: str,
-        entity_id: str,
-        *,
-        limit: int = 10,
-        request_context: "RequestContext",
-    ) -> list[Any]:
-        """
-        Get observations for an entity.
-
-        Args:
-            bank_id: The memory bank ID.
-            entity_id: The entity ID.
-            limit: Maximum observations.
-            request_context: Request context for authentication.
-
-        Returns:
-            List of EntityObservation objects.
-        """
-        ...
-
-    @abstractmethod
-    async def regenerate_entity_observations(
-        self,
-        bank_id: str,
-        entity_id: str,
-        entity_name: str,
-        *,
-        request_context: "RequestContext",
-    ) -> None:
-        """
-        Regenerate observations for an entity.
-
-        Args:
-            bank_id: The memory bank ID.
-            entity_id: The entity ID.
-            entity_name: The entity's canonical name.
-            request_context: Request context for authentication.
-        """
-        ...
-
     # =========================================================================
     # Statistics & Operations
     # =========================================================================
@@ -16,6 +16,15 @@ from google.genai import errors as genai_errors
 from google.genai import types as genai_types
 from openai import APIConnectionError, APIStatusError, AsyncOpenAI, LengthFinishReasonError
 
+# Vertex AI imports (conditional)
+try:
+    import google.auth
+    from google.oauth2 import service_account
+
+    VERTEXAI_AVAILABLE = True
+except ImportError:
+    VERTEXAI_AVAILABLE = False
+
 from ..config import (
     DEFAULT_LLM_MAX_CONCURRENT,
     DEFAULT_LLM_TIMEOUT,
@@ -88,7 +97,7 @@ class LLMProvider:
         self.groq_service_tier = groq_service_tier or os.getenv(ENV_LLM_GROQ_SERVICE_TIER, "auto")
 
         # Validate provider
-        valid_providers = ["openai", "groq", "ollama", "gemini", "anthropic", "lmstudio", "mock"]
+        valid_providers = ["openai", "groq", "ollama", "gemini", "anthropic", "lmstudio", "vertexai", "mock"]
         if self.provider not in valid_providers:
             raise ValueError(f"Invalid LLM provider: {self.provider}. Must be one of: {', '.join(valid_providers)}")
 
@@ -105,8 +114,51 @@ class LLMProvider:
         elif self.provider == "lmstudio":
             self.base_url = "http://localhost:1234/v1"
 
-        # Validate API key (not needed for ollama, lmstudio, or mock)
-        if self.provider not in ("ollama", "lmstudio", "mock") and not self.api_key:
+        # Vertex AI config stored for client creation below
+        self._vertexai_project_id: str | None = None
+        self._vertexai_region: str | None = None
+        self._vertexai_credentials: Any = None
+
+        if self.provider == "vertexai":
+            from ..config import get_config
+
+            config = get_config()
+
+            self._vertexai_project_id = config.llm_vertexai_project_id
+            if not self._vertexai_project_id:
+                raise ValueError(
+                    "HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID is required for Vertex AI provider. "
+                    "Set it to your GCP project ID."
+                )
+
+            self._vertexai_region = config.llm_vertexai_region or "us-central1"
+            service_account_key = config.llm_vertexai_service_account_key
+
+            # Load explicit service account credentials if provided
+            if service_account_key:
+                if not VERTEXAI_AVAILABLE:
+                    raise ValueError(
+                        "Vertex AI service account auth requires 'google-auth' package. "
+                        "Install with: pip install google-auth"
+                    )
+                self._vertexai_credentials = service_account.Credentials.from_service_account_file(
+                    service_account_key,
+                    scopes=["https://www.googleapis.com/auth/cloud-platform"],
+                )
+                logger.info(f"Vertex AI: Using service account key: {service_account_key}")
+
+            # Strip google/ prefix from model name — native SDK uses bare names
+            # e.g. "google/gemini-2.0-flash-lite-001" -> "gemini-2.0-flash-lite-001"
+            if self.model.startswith("google/"):
+                self.model = self.model[len("google/") :]
+
+            logger.info(
+                f"Vertex AI: project={self._vertexai_project_id}, region={self._vertexai_region}, "
+                f"model={self.model}, auth={'service_account' if service_account_key else 'ADC'}"
+            )
+
+        # Validate API key (not needed for ollama, lmstudio, vertexai, or mock)
+        if self.provider not in ("ollama", "lmstudio", "vertexai", "mock") and not self.api_key:
             raise ValueError(f"API key not found for {self.provider}")
 
         # Get timeout config (set HINDSIGHT_API_LLM_TIMEOUT for local LLMs that need longer timeouts)
@@ -132,6 +184,17 @@ class LLMProvider:
             if self.timeout:
                 anthropic_kwargs["timeout"] = self.timeout
             self._anthropic_client = AsyncAnthropic(**anthropic_kwargs)
+        elif self.provider == "vertexai":
+            # Native genai SDK with Vertex AI — handles ADC automatically,
+            # or uses explicit service account credentials if provided
+            client_kwargs = {
+                "vertexai": True,
+                "project": self._vertexai_project_id,
+                "location": self._vertexai_region,
+            }
+            if self._vertexai_credentials is not None:
+                client_kwargs["credentials"] = self._vertexai_credentials
+            self._gemini_client = genai.Client(**client_kwargs)
         elif self.provider in ("ollama", "lmstudio"):
             # Use dummy key if not provided for local
             api_key = self.api_key or "local"
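
For reference, a standalone sketch of the client setup this new branch performs with the google-genai SDK. The project ID, region, and key path below are placeholders, and the credentials argument can be omitted to fall back to Application Default Credentials.

from google import genai
from google.oauth2 import service_account

# Optional: explicit service-account credentials (otherwise ADC is used)
credentials = service_account.Credentials.from_service_account_file(
    "/path/to/key.json",  # placeholder path
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

client = genai.Client(
    vertexai=True,
    project="my-gcp-project",  # placeholder project ID
    location="us-central1",
    credentials=credentials,
)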
@@ -223,8 +286,8 @@ class LLMProvider:
                 return_usage,
             )
 
-        # Handle Gemini provider separately
-        if self.provider == "gemini":
+        # Handle Gemini and Vertex AI providers (both use native genai SDK)
+        if self.provider in ("gemini", "vertexai"):
             return await self._call_gemini(
                 messages,
                 response_format,
@@ -342,11 +405,13 @@ class LLMProvider:
                 schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
 
                 if call_params["messages"] and call_params["messages"][0].get("role") == "system":
-                    call_params["messages"][0]["content"] += schema_msg
+                    first_msg = call_params["messages"][0]
+                    if isinstance(first_msg, dict) and isinstance(first_msg.get("content"), str):
+                        first_msg["content"] += schema_msg
                 elif call_params["messages"]:
-                    call_params["messages"][0]["content"] = (
-                        schema_msg + "\n\n" + call_params["messages"][0]["content"]
-                    )
+                    first_msg = call_params["messages"][0]
+                    if isinstance(first_msg, dict) and isinstance(first_msg.get("content"), str):
+                        first_msg["content"] = schema_msg + "\n\n" + first_msg["content"]
                 if self.provider not in ("lmstudio", "ollama"):
                     # LM Studio and Ollama don't support json_object response format reliably
                     # We rely on the schema in the system message instead
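
Taken out of its surrounding call, the guarded schema injection behaves roughly like the sketch below; the helper name and the plain-dict, OpenAI-style message shape are assumptions for illustration.

import json

def inject_schema(messages: list[dict], schema: dict) -> None:
    """Append the schema instruction to a system message, or prepend it to the first message otherwise."""
    if not messages:
        return
    first_msg = messages[0]
    # Only mutate plain string content; structured content blocks are left alone
    if not (isinstance(first_msg, dict) and isinstance(first_msg.get("content"), str)):
        return
    schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
    if first_msg.get("role") == "system":
        first_msg["content"] += schema_msg
    else:
        first_msg["content"] = schema_msg + "\n\n" + first_msg["content"]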
@@ -586,8 +651,8 @@ class LLMProvider:
                 messages, tools, max_completion_tokens, max_retries, initial_backoff, max_backoff, start_time, scope
             )
 
-        # Handle Gemini (convert to Gemini tool format)
-        if self.provider == "gemini":
+        # Handle Gemini and Vertex AI (convert to Gemini tool format)
+        if self.provider in ("gemini", "vertexai"):
             return await self._call_with_tools_gemini(
                 messages, tools, max_retries, initial_backoff, max_backoff, start_time, scope
             )
@@ -917,18 +982,20 @@ class LLMProvider:
         tool_calls: list[LLMToolCall] = []
 
         if response.candidates and response.candidates[0].content:
-            for part in response.candidates[0].content.parts:
-                if hasattr(part, "text") and part.text:
-                    content = part.text
-                if hasattr(part, "function_call") and part.function_call:
-                    fc = part.function_call
-                    tool_calls.append(
-                        LLMToolCall(
-                            id=f"gemini_{len(tool_calls)}",
-                            name=fc.name,
-                            arguments=dict(fc.args) if fc.args else {},
+            parts = response.candidates[0].content.parts
+            if parts:
+                for part in parts:
+                    if hasattr(part, "text") and part.text:
+                        content = part.text
+                    if hasattr(part, "function_call") and part.function_call:
+                        fc = part.function_call
+                        tool_calls.append(
+                            LLMToolCall(
+                                id=f"gemini_{len(tool_calls)}",
+                                name=fc.name,
+                                arguments=dict(fc.args) if fc.args else {},
+                            )
                         )
-                    )
 
         finish_reason = "tool_calls" if tool_calls else "stop"
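
The reshaped block mostly adds a check that parts is not None before iterating. An isolated sketch of that parsing follows, with the package's LLMToolCall simplified to a plain dict for illustration.

def extract_gemini_parts(response):
    """Collect text and function calls, tolerating candidates whose content has no parts."""
    content = ""
    tool_calls = []
    if response.candidates and response.candidates[0].content:
        parts = response.candidates[0].content.parts
        if parts:  # parts can be None in the genai SDK
            for part in parts:
                if getattr(part, "text", None):
                    content = part.text
                fc = getattr(part, "function_call", None)
                if fc:
                    tool_calls.append({
                        "id": f"gemini_{len(tool_calls)}",
                        "name": fc.name,
                        "arguments": dict(fc.args) if fc.args else {},
                    })
    return content, tool_calls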
 
@@ -1504,6 +1571,10 @@ class LLMProvider:
         """Clear the recorded mock calls."""
         self._mock_calls = []
 
+    async def cleanup(self) -> None:
+        """Clean up resources."""
+        pass
+
     @classmethod
     def for_memory(cls) -> "LLMProvider":
         """Create provider for memory operations from environment variables."""