hindsight-api 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +2 -0
- hindsight_api/alembic/env.py +24 -1
- hindsight_api/alembic/versions/d9f6a3b4c5e2_rename_bank_to_interactions.py +14 -4
- hindsight_api/alembic/versions/e0a1b2c3d4e5_disposition_to_3_traits.py +54 -13
- hindsight_api/alembic/versions/rename_personality_to_disposition.py +18 -7
- hindsight_api/api/http.py +253 -230
- hindsight_api/api/mcp.py +14 -3
- hindsight_api/config.py +11 -0
- hindsight_api/daemon.py +204 -0
- hindsight_api/engine/__init__.py +12 -1
- hindsight_api/engine/entity_resolver.py +38 -37
- hindsight_api/engine/interface.py +592 -0
- hindsight_api/engine/llm_wrapper.py +176 -6
- hindsight_api/engine/memory_engine.py +1092 -293
- hindsight_api/engine/retain/bank_utils.py +13 -12
- hindsight_api/engine/retain/chunk_storage.py +3 -2
- hindsight_api/engine/retain/fact_storage.py +10 -7
- hindsight_api/engine/retain/link_utils.py +17 -16
- hindsight_api/engine/retain/observation_regeneration.py +17 -16
- hindsight_api/engine/retain/orchestrator.py +2 -3
- hindsight_api/engine/retain/types.py +25 -8
- hindsight_api/engine/search/graph_retrieval.py +6 -5
- hindsight_api/engine/search/mpfp_retrieval.py +8 -7
- hindsight_api/engine/search/reranking.py +17 -0
- hindsight_api/engine/search/retrieval.py +12 -11
- hindsight_api/engine/search/think_utils.py +1 -1
- hindsight_api/engine/search/tracer.py +1 -1
- hindsight_api/engine/task_backend.py +32 -0
- hindsight_api/extensions/__init__.py +66 -0
- hindsight_api/extensions/base.py +81 -0
- hindsight_api/extensions/builtin/__init__.py +18 -0
- hindsight_api/extensions/builtin/tenant.py +33 -0
- hindsight_api/extensions/context.py +110 -0
- hindsight_api/extensions/http.py +89 -0
- hindsight_api/extensions/loader.py +125 -0
- hindsight_api/extensions/operation_validator.py +325 -0
- hindsight_api/extensions/tenant.py +63 -0
- hindsight_api/main.py +97 -17
- hindsight_api/mcp_local.py +7 -1
- hindsight_api/migrations.py +54 -10
- hindsight_api/models.py +15 -0
- hindsight_api/pg0.py +1 -1
- {hindsight_api-0.1.11.dist-info → hindsight_api-0.1.13.dist-info}/METADATA +1 -1
- hindsight_api-0.1.13.dist-info/RECORD +75 -0
- hindsight_api-0.1.11.dist-info/RECORD +0 -64
- {hindsight_api-0.1.11.dist-info → hindsight_api-0.1.13.dist-info}/WHEEL +0 -0
- {hindsight_api-0.1.11.dist-info → hindsight_api-0.1.13.dist-info}/entry_points.txt +0 -0
hindsight_api/engine/llm_wrapper.py

@@ -3,11 +3,13 @@ LLM wrapper for unified configuration across providers.
 """
 
 import asyncio
+import json
 import logging
 import os
 import time
 from typing import Any
 
+import httpx
 from google import genai
 from google.genai import errors as genai_errors
 from google.genai import types as genai_types
@@ -96,7 +98,7 @@ class LLMProvider:
         client_kwargs = {"api_key": self.api_key, "max_retries": 0}
         if self.base_url:
             client_kwargs["base_url"] = self.base_url
-        self._client = AsyncOpenAI(**client_kwargs)
+        self._client = AsyncOpenAI(**client_kwargs) # type: ignore[invalid-argument-type] - dict kwargs
         self._gemini_client = None
 
     async def verify_connection(self) -> None:
@@ -112,7 +114,7 @@ class LLMProvider:
         )
         await self.call(
             messages=[{"role": "user", "content": "Say 'ok'"}],
-            max_completion_tokens=
+            max_completion_tokens=100,
             max_retries=2,
             initial_backoff=0.5,
             max_backoff=2.0,
@@ -157,7 +159,6 @@ class LLMProvider:
         """
         async with _global_llm_semaphore:
             start_time = time.time()
-            import json
 
             # Handle Gemini provider separately
             if self.provider == "gemini":
@@ -165,6 +166,20 @@ class LLMProvider:
                     messages, response_format, max_retries, initial_backoff, max_backoff, skip_validation, start_time
                 )
 
+            # Handle Ollama with native API for structured output (better schema enforcement)
+            if self.provider == "ollama" and response_format is not None:
+                return await self._call_ollama_native(
+                    messages,
+                    response_format,
+                    max_completion_tokens,
+                    temperature,
+                    max_retries,
+                    initial_backoff,
+                    max_backoff,
+                    skip_validation,
+                    start_time,
+                )
+
             call_params = {
                 "model": self.model,
                 "messages": messages,
@@ -227,7 +242,31 @@ class LLMProvider:
                     response = await self._client.chat.completions.create(**call_params)
 
                     content = response.choices[0].message.content
-
+
+                    # Log raw LLM response for debugging JSON parse issues
+                    try:
+                        json_data = json.loads(content)
+                    except json.JSONDecodeError as json_err:
+                        # Truncate content for logging (first 500 and last 200 chars)
+                        content_preview = content[:500] if content else "<empty>"
+                        if content and len(content) > 700:
+                            content_preview = f"{content[:500]}...TRUNCATED...{content[-200:]}"
+                        logger.warning(
+                            f"JSON parse error from LLM response (attempt {attempt + 1}/{max_retries + 1}): {json_err}\n"
+                            f" Model: {self.provider}/{self.model}\n"
+                            f" Content length: {len(content) if content else 0} chars\n"
+                            f" Content preview: {content_preview!r}\n"
+                            f" Finish reason: {response.choices[0].finish_reason if response.choices else 'unknown'}"
+                        )
+                        # Retry on JSON parse errors - LLM may return valid JSON on next attempt
+                        if attempt < max_retries:
+                            backoff = min(initial_backoff * (2**attempt), max_backoff)
+                            await asyncio.sleep(backoff)
+                            last_exception = json_err
+                            continue
+                        else:
+                            logger.error(f"JSON parse error after {max_retries + 1} attempts, giving up")
+                            raise
 
                     if skip_validation:
                         result = json_data
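For context on the retry logic added above: the backoff doubles with each attempt and is capped at max_backoff, so with the defaults used by verify_connection (initial_backoff=0.5, max_backoff=2.0) the sleeps run roughly 0.5s, 1.0s, 2.0s, 2.0s. A minimal standalone sketch (the helper name is illustrative, not part of the package):

# Sketch of the exponential backoff used in the retry loop above.
def backoff_schedule(max_retries: int, initial_backoff: float = 0.5, max_backoff: float = 2.0) -> list[float]:
    # Same formula as the diff: min(initial_backoff * 2**attempt, max_backoff)
    return [min(initial_backoff * (2 ** attempt), max_backoff) for attempt in range(max_retries)]

print(backoff_schedule(4))  # [0.5, 1.0, 2.0, 2.0]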
@@ -300,6 +339,129 @@ class LLMProvider:
                 raise last_exception
             raise RuntimeError("LLM call failed after all retries with no exception captured")
 
+    async def _call_ollama_native(
+        self,
+        messages: list[dict[str, str]],
+        response_format: Any,
+        max_completion_tokens: int | None,
+        temperature: float | None,
+        max_retries: int,
+        initial_backoff: float,
+        max_backoff: float,
+        skip_validation: bool,
+        start_time: float,
+    ) -> Any:
+        """
+        Call Ollama using native API with JSON schema enforcement.
+
+        Ollama's native API supports passing a full JSON schema in the 'format' parameter,
+        which provides better structured output control than the OpenAI-compatible API.
+        """
+        # Get the JSON schema from the Pydantic model
+        schema = response_format.model_json_schema() if hasattr(response_format, "model_json_schema") else None
+
+        # Build the base URL for Ollama's native API
+        # Default OpenAI-compatible URL is http://localhost:11434/v1
+        # Native API is at http://localhost:11434/api/chat
+        base_url = self.base_url or "http://localhost:11434/v1"
+        if base_url.endswith("/v1"):
+            native_url = base_url[:-3] + "/api/chat"
+        else:
+            native_url = base_url.rstrip("/") + "/api/chat"
+
+        # Build request payload
+        payload = {
+            "model": self.model,
+            "messages": messages,
+            "stream": False,
+        }
+
+        # Add schema as format parameter for structured output
+        if schema:
+            payload["format"] = schema
+
+        # Add optional parameters with optimized defaults for Ollama
+        # Benchmarking shows num_ctx=16384 + num_batch=512 is optimal
+        options = {
+            "num_ctx": 16384, # 16k context window for larger prompts
+            "num_batch": 512, # Optimal batch size for prompt processing
+        }
+        if max_completion_tokens:
+            options["num_predict"] = max_completion_tokens
+        if temperature is not None:
+            options["temperature"] = temperature
+        payload["options"] = options
+
+        last_exception = None
+
+        async with httpx.AsyncClient(timeout=300.0) as client:
+            for attempt in range(max_retries + 1):
+                try:
+                    response = await client.post(native_url, json=payload)
+                    response.raise_for_status()
+
+                    result = response.json()
+                    content = result.get("message", {}).get("content", "")
+
+                    # Parse JSON response
+                    try:
+                        json_data = json.loads(content)
+                    except json.JSONDecodeError as json_err:
+                        content_preview = content[:500] if content else "<empty>"
+                        if content and len(content) > 700:
+                            content_preview = f"{content[:500]}...TRUNCATED...{content[-200:]}"
+                        logger.warning(
+                            f"Ollama JSON parse error (attempt {attempt + 1}/{max_retries + 1}): {json_err}\n"
+                            f" Model: ollama/{self.model}\n"
+                            f" Content length: {len(content) if content else 0} chars\n"
+                            f" Content preview: {content_preview!r}"
+                        )
+                        if attempt < max_retries:
+                            backoff = min(initial_backoff * (2**attempt), max_backoff)
+                            await asyncio.sleep(backoff)
+                            last_exception = json_err
+                            continue
+                        else:
+                            raise
+
+                    # Validate against Pydantic model or return raw JSON
+                    if skip_validation:
+                        return json_data
+                    else:
+                        return response_format.model_validate(json_data)
+
+                except httpx.HTTPStatusError as e:
+                    last_exception = e
+                    if attempt < max_retries:
+                        logger.warning(
+                            f"Ollama HTTP error (attempt {attempt + 1}/{max_retries + 1}): {e.response.status_code}"
+                        )
+                        backoff = min(initial_backoff * (2**attempt), max_backoff)
+                        await asyncio.sleep(backoff)
+                        continue
+                    else:
+                        logger.error(f"Ollama HTTP error after {max_retries + 1} attempts: {e}")
+                        raise
+
+                except httpx.RequestError as e:
+                    last_exception = e
+                    if attempt < max_retries:
+                        logger.warning(f"Ollama connection error (attempt {attempt + 1}/{max_retries + 1}): {e}")
+                        backoff = min(initial_backoff * (2**attempt), max_backoff)
+                        await asyncio.sleep(backoff)
+                        continue
+                    else:
+                        logger.error(f"Ollama connection error after {max_retries + 1} attempts: {e}")
+                        raise
+
+                except Exception as e:
+                    logger.error(f"Unexpected error during Ollama call: {type(e).__name__}: {e}")
+                    raise
+
+        if last_exception:
+            raise last_exception
+        raise RuntimeError("Ollama call failed after all retries")
+
     async def _call_gemini(
         self,
         messages: list[dict[str, str]],
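For reference, a self-contained sketch of the native Ollama call that _call_ollama_native wraps. The payload shape ('format' carrying a full JSON schema, the 'options' dict, and the message/content response fields) mirrors the code above; the model name, prompt, and CityFact schema are illustrative assumptions, and a local Ollama server is assumed to be running.

# Standalone sketch of Ollama's native /api/chat structured-output call.
import asyncio
import json

import httpx
from pydantic import BaseModel


class CityFact(BaseModel):
    city: str
    fact: str


async def main() -> None:
    payload = {
        "model": "llama3.1",  # illustrative model name
        "messages": [{"role": "user", "content": "Give one fact about Paris as JSON."}],
        "stream": False,
        "format": CityFact.model_json_schema(),  # full JSON schema, not just "json"
        "options": {"num_ctx": 16384, "num_batch": 512},
    }
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post("http://localhost:11434/api/chat", json=payload)
        resp.raise_for_status()
        content = resp.json().get("message", {}).get("content", "")
        print(CityFact.model_validate(json.loads(content)))


asyncio.run(main())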
@@ -311,8 +473,6 @@ class LLMProvider:
         start_time: float,
     ) -> Any:
         """Handle Gemini-specific API calls."""
-        import json
-
         # Convert OpenAI-style messages to Gemini format
         system_instruction = None
         gemini_contents = []
@@ -443,6 +603,8 @@ class LLMProvider:
         """Create provider for memory operations from environment variables."""
         provider = os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq")
         api_key = os.getenv("HINDSIGHT_API_LLM_API_KEY")
+        if not api_key:
+            raise ValueError("HINDSIGHT_API_LLM_API_KEY environment variable is required")
         base_url = os.getenv("HINDSIGHT_API_LLM_BASE_URL", "")
         model = os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b")
 
@@ -453,6 +615,10 @@ class LLMProvider:
         """Create provider for answer generation. Falls back to memory config if not set."""
         provider = os.getenv("HINDSIGHT_API_ANSWER_LLM_PROVIDER", os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq"))
         api_key = os.getenv("HINDSIGHT_API_ANSWER_LLM_API_KEY", os.getenv("HINDSIGHT_API_LLM_API_KEY"))
+        if not api_key:
+            raise ValueError(
+                "HINDSIGHT_API_LLM_API_KEY or HINDSIGHT_API_ANSWER_LLM_API_KEY environment variable is required"
+            )
         base_url = os.getenv("HINDSIGHT_API_ANSWER_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL", ""))
         model = os.getenv("HINDSIGHT_API_ANSWER_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))
 
@@ -463,6 +629,10 @@ class LLMProvider:
         """Create provider for judge/evaluator operations. Falls back to memory config if not set."""
         provider = os.getenv("HINDSIGHT_API_JUDGE_LLM_PROVIDER", os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq"))
         api_key = os.getenv("HINDSIGHT_API_JUDGE_LLM_API_KEY", os.getenv("HINDSIGHT_API_LLM_API_KEY"))
+        if not api_key:
+            raise ValueError(
+                "HINDSIGHT_API_LLM_API_KEY or HINDSIGHT_API_JUDGE_LLM_API_KEY environment variable is required"
+            )
         base_url = os.getenv("HINDSIGHT_API_JUDGE_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL", ""))
         model = os.getenv("HINDSIGHT_API_JUDGE_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))
 
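Taken together, the three factory hunks above mean a missing API key now fails fast at provider construction instead of surfacing later as a request error. A minimal sketch of the environment these factories read; the variable names, defaults, and fallback chain come from the diff, while the values are placeholders:

# Illustrative configuration for the env-var-driven factories above.
import os

os.environ["HINDSIGHT_API_LLM_PROVIDER"] = "ollama"                     # default is "groq"
os.environ["HINDSIGHT_API_LLM_API_KEY"] = "placeholder-key"             # now required, else ValueError
os.environ["HINDSIGHT_API_LLM_BASE_URL"] = "http://localhost:11434/v1"  # optional
os.environ["HINDSIGHT_API_LLM_MODEL"] = "llama3.1"                      # default is "openai/gpt-oss-120b"

# The answer and judge roles fall back to the base HINDSIGHT_API_LLM_* values
# unless HINDSIGHT_API_ANSWER_LLM_* / HINDSIGHT_API_JUDGE_LLM_* overrides are set.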