hindsight-api 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (63)
  1. hindsight_api/__init__.py +10 -9
  2. hindsight_api/alembic/env.py +5 -8
  3. hindsight_api/alembic/versions/5a366d414dce_initial_schema.py +266 -180
  4. hindsight_api/alembic/versions/b7c4d8e9f1a2_add_chunks_table.py +32 -32
  5. hindsight_api/alembic/versions/c8e5f2a3b4d1_add_retain_params_to_documents.py +11 -11
  6. hindsight_api/alembic/versions/d9f6a3b4c5e2_rename_bank_to_interactions.py +7 -12
  7. hindsight_api/alembic/versions/e0a1b2c3d4e5_disposition_to_3_traits.py +23 -15
  8. hindsight_api/alembic/versions/rename_personality_to_disposition.py +30 -21
  9. hindsight_api/api/__init__.py +10 -10
  10. hindsight_api/api/http.py +575 -593
  11. hindsight_api/api/mcp.py +31 -33
  12. hindsight_api/banner.py +13 -6
  13. hindsight_api/config.py +17 -12
  14. hindsight_api/engine/__init__.py +9 -9
  15. hindsight_api/engine/cross_encoder.py +23 -27
  16. hindsight_api/engine/db_utils.py +5 -4
  17. hindsight_api/engine/embeddings.py +22 -21
  18. hindsight_api/engine/entity_resolver.py +81 -75
  19. hindsight_api/engine/llm_wrapper.py +74 -88
  20. hindsight_api/engine/memory_engine.py +663 -673
  21. hindsight_api/engine/query_analyzer.py +100 -97
  22. hindsight_api/engine/response_models.py +105 -106
  23. hindsight_api/engine/retain/__init__.py +9 -16
  24. hindsight_api/engine/retain/bank_utils.py +34 -58
  25. hindsight_api/engine/retain/chunk_storage.py +4 -12
  26. hindsight_api/engine/retain/deduplication.py +9 -28
  27. hindsight_api/engine/retain/embedding_processing.py +4 -11
  28. hindsight_api/engine/retain/embedding_utils.py +3 -4
  29. hindsight_api/engine/retain/entity_processing.py +7 -17
  30. hindsight_api/engine/retain/fact_extraction.py +155 -165
  31. hindsight_api/engine/retain/fact_storage.py +11 -23
  32. hindsight_api/engine/retain/link_creation.py +11 -39
  33. hindsight_api/engine/retain/link_utils.py +166 -95
  34. hindsight_api/engine/retain/observation_regeneration.py +39 -52
  35. hindsight_api/engine/retain/orchestrator.py +72 -62
  36. hindsight_api/engine/retain/types.py +49 -43
  37. hindsight_api/engine/search/__init__.py +15 -1
  38. hindsight_api/engine/search/fusion.py +6 -15
  39. hindsight_api/engine/search/graph_retrieval.py +234 -0
  40. hindsight_api/engine/search/mpfp_retrieval.py +438 -0
  41. hindsight_api/engine/search/observation_utils.py +9 -16
  42. hindsight_api/engine/search/reranking.py +4 -7
  43. hindsight_api/engine/search/retrieval.py +388 -193
  44. hindsight_api/engine/search/scoring.py +5 -7
  45. hindsight_api/engine/search/temporal_extraction.py +8 -11
  46. hindsight_api/engine/search/think_utils.py +115 -39
  47. hindsight_api/engine/search/trace.py +68 -38
  48. hindsight_api/engine/search/tracer.py +49 -35
  49. hindsight_api/engine/search/types.py +22 -16
  50. hindsight_api/engine/task_backend.py +21 -26
  51. hindsight_api/engine/utils.py +25 -10
  52. hindsight_api/main.py +21 -40
  53. hindsight_api/mcp_local.py +190 -0
  54. hindsight_api/metrics.py +44 -30
  55. hindsight_api/migrations.py +10 -8
  56. hindsight_api/models.py +60 -72
  57. hindsight_api/pg0.py +64 -337
  58. hindsight_api/server.py +3 -6
  59. {hindsight_api-0.1.4.dist-info → hindsight_api-0.1.6.dist-info}/METADATA +6 -5
  60. hindsight_api-0.1.6.dist-info/RECORD +64 -0
  61. {hindsight_api-0.1.4.dist-info → hindsight_api-0.1.6.dist-info}/entry_points.txt +1 -0
  62. hindsight_api-0.1.4.dist-info/RECORD +0 -61
  63. {hindsight_api-0.1.4.dist-info → hindsight_api-0.1.6.dist-info}/WHEEL +0 -0
hindsight_api/engine/llm_wrapper.py
@@ -1,15 +1,17 @@
  """
  LLM wrapper for unified configuration across providers.
  """
+
+ import asyncio
+ import logging
  import os
  import time
- import asyncio
- from typing import Optional, Any, Dict, List
- from openai import AsyncOpenAI, RateLimitError, APIError, APIStatusError, APIConnectionError, LengthFinishReasonError
+ from typing import Any
+
  from google import genai
- from google.genai import types as genai_types
  from google.genai import errors as genai_errors
- import logging
+ from google.genai import types as genai_types
+ from openai import APIConnectionError, APIStatusError, AsyncOpenAI, LengthFinishReasonError

  # Seed applied to every Groq request for deterministic behavior.
  DEFAULT_LLM_SEED = 4242
@@ -31,6 +33,7 @@ class OutputTooLongError(Exception):
  to allow callers to handle output length issues without depending on
  provider-specific implementations.
  """
+
  pass


@@ -68,9 +71,7 @@ class LLMProvider:
  # Validate provider
  valid_providers = ["openai", "groq", "ollama", "gemini"]
  if self.provider not in valid_providers:
- raise ValueError(
- f"Invalid LLM provider: {self.provider}. Must be one of: {', '.join(valid_providers)}"
- )
+ raise ValueError(f"Invalid LLM provider: {self.provider}. Must be one of: {', '.join(valid_providers)}")

  # Set default base URLs
  if not self.base_url:
@@ -106,7 +107,9 @@ class LLMProvider:
  RuntimeError: If the connection test fails.
  """
  try:
- logger.info(f"Verifying LLM: provider={self.provider}, model={self.model}, base_url={self.base_url or 'default'}...")
+ logger.info(
+ f"Verifying LLM: provider={self.provider}, model={self.model}, base_url={self.base_url or 'default'}..."
+ )
  await self.call(
  messages=[{"role": "user", "content": "Say 'ok'"}],
  max_completion_tokens=10,
@@ -117,16 +120,14 @@ class LLMProvider:
  # If we get here without exception, the connection is working
  logger.info(f"LLM verified: {self.provider}/{self.model}")
  except Exception as e:
- raise RuntimeError(
- f"LLM connection verification failed for {self.provider}/{self.model}: {e}"
- ) from e
+ raise RuntimeError(f"LLM connection verification failed for {self.provider}/{self.model}: {e}") from e

  async def call(
  self,
- messages: List[Dict[str, str]],
- response_format: Optional[Any] = None,
- max_completion_tokens: Optional[int] = None,
- temperature: Optional[float] = None,
+ messages: list[dict[str, str]],
+ response_format: Any | None = None,
+ max_completion_tokens: int | None = None,
+ temperature: float | None = None,
  scope: str = "memory",
  max_retries: int = 10,
  initial_backoff: float = 1.0,
@@ -161,8 +162,7 @@ class LLMProvider:
  # Handle Gemini provider separately
  if self.provider == "gemini":
  return await self._call_gemini(
- messages, response_format, max_retries, initial_backoff,
- max_backoff, skip_validation, start_time
+ messages, response_format, max_retries, initial_backoff, max_backoff, skip_validation, start_time
  )

  call_params = {
@@ -175,9 +175,13 @@ class LLMProvider:
  is_reasoning_model = any(x in model_lower for x in ["gpt-5", "o1", "o3"])

  # For GPT-4 and GPT-4.1 models, cap max_completion_tokens to 32000
+ # For GPT-4o models, cap to 16384
  is_gpt4_model = any(x in model_lower for x in ["gpt-4.1", "gpt-4-"])
+ is_gpt4o_model = "gpt-4o" in model_lower
  if max_completion_tokens is not None:
- if is_gpt4_model and max_completion_tokens > 32000:
+ if is_gpt4o_model and max_completion_tokens > 16384:
+ max_completion_tokens = 16384
+ elif is_gpt4_model and max_completion_tokens > 32000:
  max_completion_tokens = 32000
  # For reasoning models, max_completion_tokens includes reasoning + output tokens
  # Enforce minimum of 16000 to ensure enough space for both
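A rough sketch, outside the diff, of what the new capping rules amount to; the standalone helper name is hypothetical:

# Sketch only: mirrors the capping rules added above (GPT-4o capped at 16384,
# GPT-4/4.1 at 32000); this helper does not exist in the package.
def cap_completion_tokens(model: str, requested: int | None) -> int | None:
    if requested is None:
        return None
    model_lower = model.lower()
    if "gpt-4o" in model_lower:
        return min(requested, 16384)
    if any(x in model_lower for x in ["gpt-4.1", "gpt-4-"]):
        return min(requested, 32000)
    return requested

# e.g. cap_completion_tokens("gpt-4o-mini", 32000) -> 16384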
@@ -209,16 +213,18 @@ class LLMProvider:
  try:
  if response_format is not None:
  # Add schema to system message for JSON mode
- if hasattr(response_format, 'model_json_schema'):
+ if hasattr(response_format, "model_json_schema"):
  schema = response_format.model_json_schema()
  schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"

- if call_params['messages'] and call_params['messages'][0].get('role') == 'system':
- call_params['messages'][0]['content'] += schema_msg
- elif call_params['messages']:
- call_params['messages'][0]['content'] = schema_msg + "\n\n" + call_params['messages'][0]['content']
+ if call_params["messages"] and call_params["messages"][0].get("role") == "system":
+ call_params["messages"][0]["content"] += schema_msg
+ elif call_params["messages"]:
+ call_params["messages"][0]["content"] = (
+ schema_msg + "\n\n" + call_params["messages"][0]["content"]
+ )

- call_params['response_format'] = {"type": "json_object"}
+ call_params["response_format"] = {"type": "json_object"}
  response = await self._client.chat.completions.create(**call_params)

  content = response.choices[0].message.content
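A minimal sketch of the JSON-mode schema injection shown above, assuming a hypothetical Pydantic model (ExtractedFacts); the package's actual response models live in hindsight_api/engine/response_models.py:

# Sketch only: the schema-injection pattern used above, with a made-up model.
import json
from pydantic import BaseModel

class ExtractedFacts(BaseModel):  # hypothetical example model
    facts: list[str]

schema = ExtractedFacts.model_json_schema()
schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"

messages = [{"role": "system", "content": "Extract facts."}, {"role": "user", "content": "..."}]
if messages and messages[0].get("role") == "system":
    messages[0]["content"] += schema_msg  # append schema to the existing system prompt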
@@ -238,8 +244,8 @@ class LLMProvider:
  if duration > 10.0:
  ratio = max(1, usage.completion_tokens) / usage.prompt_tokens
  cached_tokens = 0
- if hasattr(usage, 'prompt_tokens_details') and usage.prompt_tokens_details:
- cached_tokens = getattr(usage.prompt_tokens_details, 'cached_tokens', 0) or 0
+ if hasattr(usage, "prompt_tokens_details") and usage.prompt_tokens_details:
+ cached_tokens = getattr(usage.prompt_tokens_details, "cached_tokens", 0) or 0
  cache_info = f", cached_tokens={cached_tokens}" if cached_tokens > 0 else ""
  logger.info(
  f"slow llm call: model={self.provider}/{self.model}, "
@@ -252,15 +258,19 @@ class LLMProvider:
  except LengthFinishReasonError as e:
  logger.warning(f"LLM output exceeded token limits: {str(e)}")
  raise OutputTooLongError(
- f"LLM output exceeded token limits. Input may need to be split into smaller chunks."
+ "LLM output exceeded token limits. Input may need to be split into smaller chunks."
  ) from e

  except APIConnectionError as e:
  last_exception = e
  if attempt < max_retries:
- status_code = getattr(e, 'status_code', None) or getattr(getattr(e, 'response', None), 'status_code', None)
- logger.warning(f"Connection error, retrying... (attempt {attempt + 1}/{max_retries + 1}) - status_code={status_code}, message={e}")
- backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+ status_code = getattr(e, "status_code", None) or getattr(
+ getattr(e, "response", None), "status_code", None
+ )
+ logger.warning(
+ f"Connection error, retrying... (attempt {attempt + 1}/{max_retries + 1}) - status_code={status_code}, message={e}"
+ )
+ backoff = min(initial_backoff * (2**attempt), max_backoff)
  await asyncio.sleep(backoff)
  continue
  else:
@@ -268,14 +278,14 @@ class LLMProvider:
  raise

  except APIStatusError as e:
- # Fast fail on 4xx client errors (except 429 rate limit and 498 which is treated as server error)
- if 400 <= e.status_code < 500 and e.status_code not in (429, 498):
- logger.error(f"Client error (HTTP {e.status_code}), not retrying: {str(e)}")
+ # Fast fail only on 401 (unauthorized) and 403 (forbidden) - these won't recover with retries
+ if e.status_code in (401, 403):
+ logger.error(f"Auth error (HTTP {e.status_code}), not retrying: {str(e)}")
  raise

  last_exception = e
  if attempt < max_retries:
- backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+ backoff = min(initial_backoff * (2**attempt), max_backoff)
  jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
  sleep_time = backoff + jitter
  await asyncio.sleep(sleep_time)
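A minimal sketch of the backoff-plus-jitter arithmetic used in these retry branches; the 60-second cap is assumed for illustration, since the actual max_backoff default is not shown in this diff:

import time

# Sketch only: exponential backoff with +/-20% jitter derived from the
# fractional part of time.time(), as in the retry branches above.
def next_sleep(attempt: int, initial_backoff: float = 1.0, max_backoff: float = 60.0) -> float:
    backoff = min(initial_backoff * (2**attempt), max_backoff)
    jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
    return backoff + jitter

# attempts 0, 1, 2, 3 -> roughly 1s, 2s, 4s, 8s (each +/-20%), capped at max_backoff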
@@ -289,12 +299,12 @@ class LLMProvider:

  if last_exception:
  raise last_exception
- raise RuntimeError(f"LLM call failed after all retries with no exception captured")
+ raise RuntimeError("LLM call failed after all retries with no exception captured")

  async def _call_gemini(
  self,
- messages: List[Dict[str, str]],
- response_format: Optional[Any],
+ messages: list[dict[str, str]],
+ response_format: Any | None,
  max_retries: int,
  initial_backoff: float,
  max_backoff: float,
@@ -309,27 +319,21 @@ class LLMProvider:
  gemini_contents = []

  for msg in messages:
- role = msg.get('role', 'user')
- content = msg.get('content', '')
+ role = msg.get("role", "user")
+ content = msg.get("content", "")

- if role == 'system':
+ if role == "system":
  if system_instruction:
  system_instruction += "\n\n" + content
  else:
  system_instruction = content
- elif role == 'assistant':
- gemini_contents.append(genai_types.Content(
- role="model",
- parts=[genai_types.Part(text=content)]
- ))
+ elif role == "assistant":
+ gemini_contents.append(genai_types.Content(role="model", parts=[genai_types.Part(text=content)]))
  else:
- gemini_contents.append(genai_types.Content(
- role="user",
- parts=[genai_types.Part(text=content)]
- ))
+ gemini_contents.append(genai_types.Content(role="user", parts=[genai_types.Part(text=content)]))

  # Add JSON schema instruction if response_format is provided
- if response_format is not None and hasattr(response_format, 'model_json_schema'):
+ if response_format is not None and hasattr(response_format, "model_json_schema"):
  schema = response_format.model_json_schema()
  schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
  if system_instruction:
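A minimal sketch, assuming the google-genai types used above, of how OpenAI-style chat messages are mapped for Gemini (system messages are folded into the system instruction, assistant becomes role "model", everything else role "user"):

# Sketch only: the role mapping performed in the loop above.
from google.genai import types as genai_types

messages = [
    {"role": "system", "content": "You are terse."},
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi"},
]

system_instruction = ""
gemini_contents = []
for msg in messages:
    role, content = msg.get("role", "user"), msg.get("content", "")
    if role == "system":
        # concatenate multiple system messages into one instruction
        system_instruction = (system_instruction + "\n\n" + content) if system_instruction else content
    else:
        gemini_role = "model" if role == "assistant" else "user"
        gemini_contents.append(genai_types.Content(role=gemini_role, parts=[genai_types.Part(text=content)]))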
@@ -340,10 +344,10 @@ class LLMProvider:
  # Build generation config
  config_kwargs = {}
  if system_instruction:
- config_kwargs['system_instruction'] = system_instruction
+ config_kwargs["system_instruction"] = system_instruction
  if response_format is not None:
- config_kwargs['response_mime_type'] = 'application/json'
- config_kwargs['response_schema'] = response_format
+ config_kwargs["response_mime_type"] = "application/json"
+ config_kwargs["response_schema"] = response_format

  generation_config = genai_types.GenerateContentConfig(**config_kwargs) if config_kwargs else None

@@ -362,14 +366,14 @@ class LLMProvider:
  # Handle empty response
  if content is None:
  block_reason = None
- if hasattr(response, 'candidates') and response.candidates:
+ if hasattr(response, "candidates") and response.candidates:
  candidate = response.candidates[0]
- if hasattr(candidate, 'finish_reason'):
+ if hasattr(candidate, "finish_reason"):
  block_reason = candidate.finish_reason

  if attempt < max_retries:
  logger.warning(f"Gemini returned empty response (reason: {block_reason}), retrying...")
- backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+ backoff = min(initial_backoff * (2**attempt), max_backoff)
  await asyncio.sleep(backoff)
  continue
  else:
@@ -386,7 +390,7 @@ class LLMProvider:

  # Log slow calls
  duration = time.time() - start_time
- if duration > 10.0 and hasattr(response, 'usage_metadata') and response.usage_metadata:
+ if duration > 10.0 and hasattr(response, "usage_metadata") and response.usage_metadata:
  usage = response.usage_metadata
  logger.info(
  f"slow llm call: model={self.provider}/{self.model}, "
@@ -399,8 +403,8 @@ class LLMProvider:
  except json.JSONDecodeError as e:
  last_exception = e
  if attempt < max_retries:
- logger.warning(f"Gemini returned invalid JSON, retrying...")
- backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+ logger.warning("Gemini returned invalid JSON, retrying...")
+ backoff = min(initial_backoff * (2**attempt), max_backoff)
  await asyncio.sleep(backoff)
  continue
  else:
@@ -408,16 +412,16 @@ class LLMProvider:
  raise

  except genai_errors.APIError as e:
- # Fast fail on 4xx client errors (except 429 rate limit)
- if e.code and 400 <= e.code < 500 and e.code != 429:
- logger.error(f"Gemini client error (HTTP {e.code}), not retrying: {str(e)}")
+ # Fast fail only on 401 (unauthorized) and 403 (forbidden) - these won't recover with retries
+ if e.code in (401, 403):
+ logger.error(f"Gemini auth error (HTTP {e.code}), not retrying: {str(e)}")
  raise

- # Retry on 429 and 5xx
- if e.code in (429, 500, 502, 503, 504):
+ # Retry on retryable errors (rate limits, server errors, and other client errors like 400)
+ if e.code in (400, 429, 500, 502, 503, 504) or (e.code and e.code >= 500):
  last_exception = e
  if attempt < max_retries:
- backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+ backoff = min(initial_backoff * (2**attempt), max_backoff)
  jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
  await asyncio.sleep(backoff + jitter)
  else:
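A minimal sketch of the effective Gemini retry decision after this change, written as a hypothetical helper: auth errors fail fast, anything plausibly transient (including 400 and 429) is retried, and unknown codes are not.

# Sketch only: summarizes the branch logic above; this helper is not in the package.
def should_retry(status: int | None) -> bool:
    if status in (401, 403):
        return False  # auth errors won't recover with retries
    return status in (400, 429, 500, 502, 503, 504) or (status is not None and status >= 500)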
@@ -433,7 +437,7 @@ class LLMProvider:

  if last_exception:
  raise last_exception
- raise RuntimeError(f"Gemini call failed after all retries")
+ raise RuntimeError("Gemini call failed after all retries")

  @classmethod
  def for_memory(cls) -> "LLMProvider":
@@ -443,13 +447,7 @@ class LLMProvider:
  base_url = os.getenv("HINDSIGHT_API_LLM_BASE_URL", "")
  model = os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b")

- return cls(
- provider=provider,
- api_key=api_key,
- base_url=base_url,
- model=model,
- reasoning_effort="low"
- )
+ return cls(provider=provider, api_key=api_key, base_url=base_url, model=model, reasoning_effort="low")

  @classmethod
  def for_answer_generation(cls) -> "LLMProvider":
@@ -459,13 +457,7 @@ class LLMProvider:
  base_url = os.getenv("HINDSIGHT_API_ANSWER_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL", ""))
  model = os.getenv("HINDSIGHT_API_ANSWER_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))

- return cls(
- provider=provider,
- api_key=api_key,
- base_url=base_url,
- model=model,
- reasoning_effort="high"
- )
+ return cls(provider=provider, api_key=api_key, base_url=base_url, model=model, reasoning_effort="high")

  @classmethod
  def for_judge(cls) -> "LLMProvider":
@@ -475,13 +467,7 @@ class LLMProvider:
  base_url = os.getenv("HINDSIGHT_API_JUDGE_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL", ""))
  model = os.getenv("HINDSIGHT_API_JUDGE_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))

- return cls(
- provider=provider,
- api_key=api_key,
- base_url=base_url,
- model=model,
- reasoning_effort="high"
- )
+ return cls(provider=provider, api_key=api_key, base_url=base_url, model=model, reasoning_effort="high")


  # Backwards compatibility alias
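A minimal usage sketch for the factory classmethods above, using only the environment variables visible in this diff; the provider and API-key variable names are not shown here and are therefore omitted:

import os
from hindsight_api.engine.llm_wrapper import LLMProvider

# Sketch only: values are illustrative; defaults shown match the getenv fallbacks above.
os.environ["HINDSIGHT_API_LLM_MODEL"] = "openai/gpt-oss-120b"
os.environ["HINDSIGHT_API_JUDGE_LLM_MODEL"] = "openai/gpt-oss-120b"  # falls back to HINDSIGHT_API_LLM_MODEL if unset

memory_llm = LLMProvider.for_memory()  # reasoning_effort="low"
judge_llm = LLMProvider.for_judge()    # reasoning_effort="high"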