hindsight-api 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +10 -2
- hindsight_api/alembic/README +1 -0
- hindsight_api/alembic/env.py +146 -0
- hindsight_api/alembic/script.py.mako +28 -0
- hindsight_api/alembic/versions/5a366d414dce_initial_schema.py +274 -0
- hindsight_api/alembic/versions/b7c4d8e9f1a2_add_chunks_table.py +70 -0
- hindsight_api/alembic/versions/c8e5f2a3b4d1_add_retain_params_to_documents.py +39 -0
- hindsight_api/alembic/versions/d9f6a3b4c5e2_rename_bank_to_interactions.py +48 -0
- hindsight_api/alembic/versions/e0a1b2c3d4e5_disposition_to_3_traits.py +62 -0
- hindsight_api/alembic/versions/rename_personality_to_disposition.py +65 -0
- hindsight_api/api/http.py +84 -86
- hindsight_api/config.py +154 -0
- hindsight_api/engine/__init__.py +7 -2
- hindsight_api/engine/cross_encoder.py +219 -15
- hindsight_api/engine/embeddings.py +192 -18
- hindsight_api/engine/llm_wrapper.py +88 -139
- hindsight_api/engine/memory_engine.py +71 -51
- hindsight_api/engine/retain/bank_utils.py +2 -2
- hindsight_api/engine/retain/fact_extraction.py +1 -1
- hindsight_api/engine/search/reranking.py +6 -10
- hindsight_api/engine/search/tracer.py +1 -1
- hindsight_api/main.py +201 -0
- hindsight_api/migrations.py +7 -7
- hindsight_api/server.py +43 -0
- {hindsight_api-0.1.0.dist-info → hindsight_api-0.1.1.dist-info}/METADATA +1 -1
- {hindsight_api-0.1.0.dist-info → hindsight_api-0.1.1.dist-info}/RECORD +28 -19
- hindsight_api-0.1.1.dist-info/entry_points.txt +2 -0
- hindsight_api/cli.py +0 -127
- hindsight_api/web/__init__.py +0 -12
- hindsight_api/web/server.py +0 -109
- hindsight_api-0.1.0.dist-info/entry_points.txt +0 -2
- {hindsight_api-0.1.0.dist-info → hindsight_api-0.1.1.dist-info}/WHEEL +0 -0
hindsight_api/engine/llm_wrapper.py

@@ -34,8 +34,12 @@ class OutputTooLongError(Exception):
     pass


-class LLMConfig:
-    """
+class LLMProvider:
+    """
+    Unified LLM provider.
+
+    Supports OpenAI, Groq, Ollama (OpenAI-compatible), and Gemini.
+    """

     def __init__(
         self,
@@ -43,16 +47,17 @@ class LLMConfig:
         api_key: str,
         base_url: str,
         model: str,
-
+        reasoning_effort: str = "low",
     ):
         """
-        Initialize LLM
+        Initialize LLM provider.

         Args:
-            provider: Provider name ("openai", "groq", "ollama").
-            api_key: API key.
-            base_url: Base URL
-            model: Model name.
+            provider: Provider name ("openai", "groq", "ollama", "gemini").
+            api_key: API key.
+            base_url: Base URL for the API.
+            model: Model name.
+            reasoning_effort: Reasoning effort level for supported providers.
         """
         self.provider = provider.lower()
         self.api_key = api_key
@@ -61,9 +66,10 @@ class LLMConfig:
         self.reasoning_effort = reasoning_effort

         # Validate provider
-
+        valid_providers = ["openai", "groq", "ollama", "gemini"]
+        if self.provider not in valid_providers:
             raise ValueError(
-                f"Invalid LLM provider: {self.provider}. Must be
+                f"Invalid LLM provider: {self.provider}. Must be one of: {', '.join(valid_providers)}"
             )

         # Set default base URLs
@@ -74,24 +80,18 @@ class LLMConfig:
             self.base_url = "http://localhost:11434/v1"

         # Validate API key (not needed for ollama)
-        if self.provider
-            raise ValueError(
-                f"API key not found for {self.provider}"
-            )
+        if self.provider != "ollama" and not self.api_key:
+            raise ValueError(f"API key not found for {self.provider}")

-        # Create client
-        # Disable automatic retries - we handle retries in the call() method
+        # Create client based on provider
         if self.provider == "gemini":
             self._gemini_client = genai.Client(api_key=self.api_key)
-            self._client = None
+            self._client = None
         elif self.provider == "ollama":
             self._client = AsyncOpenAI(api_key="ollama", base_url=self.base_url, max_retries=0)
             self._gemini_client = None
-        elif self.base_url:
-            self._client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url, max_retries=0)
-            self._gemini_client = None
         else:
-            self._client = AsyncOpenAI(api_key=self.api_key, max_retries=0)
+            self._client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url, max_retries=0)
             self._gemini_client = None

         logger.info(
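
Taken together, the constructor changes mean the provider name is now validated against an explicit list and the OpenAI-compatible client is always built with an explicit base_url. A minimal usage sketch against the new `__init__` signature; the argument values below are placeholders, only the parameter names, defaults, and the groq base URL come from this diff:

```python
from hindsight_api.engine.llm_wrapper import LLMProvider

provider = LLMProvider(
    provider="groq",
    api_key="gsk_example",                      # required for every provider except "ollama"
    base_url="https://api.groq.com/openai/v1",  # per-provider default applied when empty
    model="openai/gpt-oss-120b",
    reasoning_effort="low",                     # new parameter in 0.1.1
)

# An unknown provider now fails fast:
# LLMProvider(provider="bedrock", ...) raises
# ValueError: Invalid LLM provider: bedrock. Must be one of: openai, groq, ollama, gemini
```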
@@ -102,101 +102,99 @@ class LLMConfig:
         self,
         messages: List[Dict[str, str]],
         response_format: Optional[Any] = None,
+        max_completion_tokens: Optional[int] = None,
+        temperature: Optional[float] = None,
         scope: str = "memory",
         max_retries: int = 10,
         initial_backoff: float = 1.0,
         max_backoff: float = 60.0,
         skip_validation: bool = False,
-        **kwargs
     ) -> Any:
         """
-        Make an LLM API call with
+        Make an LLM API call with retry logic.

         Args:
-            messages: List of message dicts with 'role' and 'content'
-            response_format: Optional Pydantic model for structured output
-
-
-
-
-
+            messages: List of message dicts with 'role' and 'content'.
+            response_format: Optional Pydantic model for structured output.
+            max_completion_tokens: Maximum tokens in response.
+            temperature: Sampling temperature (0.0-2.0).
+            scope: Scope identifier for tracking.
+            max_retries: Maximum retry attempts.
+            initial_backoff: Initial backoff time in seconds.
+            max_backoff: Maximum backoff time in seconds.
+            skip_validation: Return raw JSON without Pydantic validation.

         Returns:
-            Parsed response if response_format is provided, otherwise
+            Parsed response if response_format is provided, otherwise text content.

         Raises:
-
+            OutputTooLongError: If output exceeds token limits.
+            Exception: Re-raises API errors after retries exhausted.
         """
-        # Use global semaphore to limit concurrent requests
         async with _global_llm_semaphore:
             start_time = time.time()
             import json

             # Handle Gemini provider separately
             if self.provider == "gemini":
-                return await self._call_gemini(
+                return await self._call_gemini(
+                    messages, response_format, max_retries, initial_backoff,
+                    max_backoff, skip_validation, start_time
+                )

             call_params = {
                 "model": self.model,
                 "messages": messages,
-                **kwargs
             }

+            if max_completion_tokens is not None:
+                call_params["max_completion_tokens"] = max_completion_tokens
+            if temperature is not None:
+                call_params["temperature"] = temperature
+
+            # Provider-specific parameters
             if self.provider == "groq":
                 call_params["seed"] = DEFAULT_LLM_SEED
-
-            if self.provider == "groq":
                 call_params["extra_body"] = {
                     "service_tier": "auto",
                     "reasoning_effort": self.reasoning_effort,
-                    "include_reasoning": False,
+                    "include_reasoning": False,
                 }

             last_exception = None

             for attempt in range(max_retries + 1):
                 try:
-                    # Use the appropriate response format
                     if response_format is not None:
-                        #
-                        # This allows the LLM to omit optional fields without validation errors
-
-                        # Add schema to the system message
+                        # Add schema to system message for JSON mode
                         if hasattr(response_format, 'model_json_schema'):
                             schema = response_format.model_json_schema()
                             schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"

-                            # Add schema to the system message if present, otherwise prepend as user message
                             if call_params['messages'] and call_params['messages'][0].get('role') == 'system':
                                 call_params['messages'][0]['content'] += schema_msg
-
-
-                            if call_params['messages']:
-                                call_params['messages'][0]['content'] = schema_msg + "\n\n" + call_params['messages'][0]['content']
+                            elif call_params['messages']:
+                                call_params['messages'][0]['content'] = schema_msg + "\n\n" + call_params['messages'][0]['content']

                         call_params['response_format'] = {"type": "json_object"}
                         response = await self._client.chat.completions.create(**call_params)

-                        # Parse the JSON response
                         content = response.choices[0].message.content
                         json_data = json.loads(content)

-                        # Return raw JSON if skip_validation is True, otherwise validate with Pydantic
                         if skip_validation:
                             result = json_data
                         else:
                             result = response_format.model_validate(json_data)
                     else:
-                        # Standard completion and return text content
                         response = await self._client.chat.completions.create(**call_params)
                         result = response.choices[0].message.content

-                    # Log
+                    # Log slow calls
                     duration = time.time() - start_time
                     usage = response.usage
                     if duration > 10.0:
                         ratio = max(1, usage.completion_tokens) / usage.prompt_tokens
-                        # Check for cached tokens (OpenAI/Groq may include this)
                         cached_tokens = 0
                         if hasattr(usage, 'prompt_tokens_details') and usage.prompt_tokens_details:
                             cached_tokens = getattr(usage.prompt_tokens_details, 'cached_tokens', 0) or 0
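
The `**kwargs` pass-through is gone: callers now pass `max_completion_tokens` and `temperature` explicitly, and structured output is requested with a Pydantic model whose schema is appended to the system message. A hedged sketch of a caller under the new signature; the `Facts` model, message text, and values are illustrative, only the parameter names come from the diff:

```python
from typing import List
from pydantic import BaseModel

class Facts(BaseModel):
    # Illustrative response model; not part of the package.
    facts: List[str]

async def extract_facts(provider) -> Facts:
    # response_format enables JSON mode: the schema is appended to the system
    # message and the reply is validated with Facts.model_validate().
    return await provider.call(
        messages=[
            {"role": "system", "content": "Extract facts from the user's message."},
            {"role": "user", "content": "Alice moved to Berlin in 2021."},
        ],
        response_format=Facts,
        max_completion_tokens=512,   # explicit in 0.1.1, previously passed via **kwargs
        temperature=0.0,
        skip_validation=False,       # True would return the raw JSON dict instead
    )
```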
@@ -210,14 +208,12 @@ class LLMConfig:
                    return result

                except LengthFinishReasonError as e:
-                    # Output exceeded token limits - raise bridge exception for caller to handle
                    logger.warning(f"LLM output exceeded token limits: {str(e)}")
                    raise OutputTooLongError(
                        f"LLM output exceeded token limits. Input may need to be split into smaller chunks."
                    ) from e

                except APIConnectionError as e:
-                    # Handle connection errors (server disconnected, network issues) with retry
                    last_exception = e
                    if attempt < max_retries:
                        logger.warning(f"Connection error, retrying... (attempt {attempt + 1}/{max_retries + 1})")
@@ -229,19 +225,18 @@ class LLMConfig:
                        raise

                except APIStatusError as e:
+                    # Fast fail on 4xx client errors (except 429 rate limit and 498 which is treated as server error)
+                    if 400 <= e.status_code < 500 and e.status_code not in (429, 498):
+                        logger.error(f"Client error (HTTP {e.status_code}), not retrying: {str(e)}")
+                        raise
+
                    last_exception = e
                    if attempt < max_retries:
-                        # Calculate exponential backoff with jitter
                        backoff = min(initial_backoff * (2 ** attempt), max_backoff)
-                        # Add jitter (±20%)
                        jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
                        sleep_time = backoff + jitter
-
-                        # Only log if it's a non-retryable error or final attempt
-                        # Silent retry for common transient errors like capacity exceeded
                        await asyncio.sleep(sleep_time)
                    else:
-                        # Log only on final failed attempt
                        logger.error(f"API error after {max_retries + 1} attempts: {str(e)}")
                        raise

@@ -249,7 +244,6 @@ class LLMConfig:
                    logger.error(f"Unexpected error during LLM call: {type(e).__name__}: {str(e)}")
                    raise

-            # This should never be reached, but just in case
            if last_exception:
                raise last_exception
            raise RuntimeError(f"LLM call failed after all retries with no exception captured")
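
The retry policy is now easier to read off the diff: 4xx client errors fail immediately (except 429, and 498 which the code treats as a server-side error), while everything else retries with exponential backoff and roughly ±20% jitter. A standalone sketch of that policy using only the formulas visible above; the function names are illustrative:

```python
import asyncio
import time

def should_retry(status_code: int) -> bool:
    # Mirrors the new APIStatusError handling: fast-fail on 4xx,
    # except 429 (rate limit) and 498 (treated as a server-side error).
    if 400 <= status_code < 500 and status_code not in (429, 498):
        return False
    return True

async def backoff_sleep(attempt: int, initial_backoff: float = 1.0, max_backoff: float = 60.0) -> None:
    # Exponential backoff capped at max_backoff, with +/-20% jitter,
    # matching backoff = min(initial_backoff * 2**attempt, max_backoff).
    backoff = min(initial_backoff * (2 ** attempt), max_backoff)
    jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)  # same pseudo-jitter as the diff
    await asyncio.sleep(backoff + jitter)
```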
@@ -263,13 +257,11 @@ class LLMConfig:
        max_backoff: float,
        skip_validation: bool,
        start_time: float,
-
-
-        """Handle Gemini-specific API calls using google-genai SDK."""
+    ) -> Any:
+        """Handle Gemini-specific API calls."""
        import json

        # Convert OpenAI-style messages to Gemini format
-        # Gemini uses 'user' and 'model' roles, and system instructions are separate
        system_instruction = None
        gemini_contents = []

@@ -278,7 +270,6 @@ class LLMConfig:
            content = msg.get('content', '')

            if role == 'system':
-                # Accumulate system messages as system instruction
                if system_instruction:
                    system_instruction += "\n\n" + content
                else:
@@ -288,7 +279,7 @@ class LLMConfig:
                    role="model",
                    parts=[genai_types.Part(text=content)]
                ))
-            else:
+            else:
                gemini_contents.append(genai_types.Content(
                    role="user",
                    parts=[genai_types.Part(text=content)]
@@ -307,13 +298,8 @@ class LLMConfig:
        config_kwargs = {}
        if system_instruction:
            config_kwargs['system_instruction'] = system_instruction
-        if 'temperature' in kwargs:
-            config_kwargs['temperature'] = kwargs['temperature']
-        if 'max_tokens' in kwargs:
-            config_kwargs['max_output_tokens'] = kwargs['max_tokens']
        if response_format is not None:
            config_kwargs['response_mime_type'] = 'application/json'
-            # Pass the Pydantic model directly as response_schema for structured output
            config_kwargs['response_schema'] = response_format

        generation_config = genai_types.GenerateContentConfig(**config_kwargs) if config_kwargs else None
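
For Gemini, OpenAI-style messages are converted before the call: system messages are folded into a single system instruction, assistant-style messages map to role "model", everything else to role "user", and structured output is requested via `response_mime_type` plus `response_schema`. A hedged sketch of that conversion; the helper name and the `role == "assistant"` condition are assumptions (the branch condition is not shown in the diff), while the genai_types calls and config keys come from it:

```python
from google.genai import types as genai_types

def to_gemini(messages, response_format=None):
    # Illustrative helper modeled on _call_gemini's conversion logic.
    system_instruction = None
    contents = []
    for msg in messages:
        role, content = msg.get("role"), msg.get("content", "")
        if role == "system":
            # System messages accumulate into one system instruction.
            system_instruction = content if system_instruction is None else system_instruction + "\n\n" + content
        elif role == "assistant":
            contents.append(genai_types.Content(role="model", parts=[genai_types.Part(text=content)]))
        else:
            contents.append(genai_types.Content(role="user", parts=[genai_types.Part(text=content)]))

    config_kwargs = {}
    if system_instruction:
        config_kwargs["system_instruction"] = system_instruction
    if response_format is not None:
        # Structured output: the Pydantic model is passed directly as response_schema.
        config_kwargs["response_mime_type"] = "application/json"
        config_kwargs["response_schema"] = response_format
    config = genai_types.GenerateContentConfig(**config_kwargs) if config_kwargs else None
    return contents, config
```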
@@ -330,9 +316,8 @@ class LLMConfig:

                content = response.text

-                # Handle empty
+                # Handle empty response
                if content is None:
-                    # Check if there's a block reason
                    block_reason = None
                    if hasattr(response, 'candidates') and response.candidates:
                        candidate = response.candidates[0]
@@ -340,18 +325,15 @@ class LLMConfig:
                            block_reason = candidate.finish_reason

                    if attempt < max_retries:
-                        logger.warning(f"Gemini returned empty response (reason: {block_reason}), retrying...
+                        logger.warning(f"Gemini returned empty response (reason: {block_reason}), retrying...")
                        backoff = min(initial_backoff * (2 ** attempt), max_backoff)
                        await asyncio.sleep(backoff)
                        continue
                    else:
-                        raise RuntimeError(f"Gemini returned empty response after {max_retries + 1} attempts
+                        raise RuntimeError(f"Gemini returned empty response after {max_retries + 1} attempts")

                if response_format is not None:
-                    # Parse the JSON response
                    json_data = json.loads(content)
-
-                    # Return raw JSON if skip_validation is True, otherwise validate with Pydantic
                    if skip_validation:
                        result = json_data
                    else:
@@ -359,42 +341,42 @@ class LLMConfig:
                else:
                    result = content

-                # Log
+                # Log slow calls
                duration = time.time() - start_time
                if duration > 10.0 and hasattr(response, 'usage_metadata') and response.usage_metadata:
                    usage = response.usage_metadata
-                    # Check for cached tokens (Gemini uses cached_content_token_count)
-                    cached_tokens = getattr(usage, 'cached_content_token_count', 0) or 0
-                    cache_info = f", cached_tokens={cached_tokens}" if cached_tokens > 0 else ""
                    logger.info(
                        f"slow llm call: model={self.provider}/{self.model}, "
-                        f"input_tokens={usage.prompt_token_count}, output_tokens={usage.candidates_token_count}
+                        f"input_tokens={usage.prompt_token_count}, output_tokens={usage.candidates_token_count}, "
                        f"time={duration:.3f}s"
                    )

                return result

            except json.JSONDecodeError as e:
-                # Handle truncated JSON responses (often from MAX_TOKENS) with retry
                last_exception = e
                if attempt < max_retries:
-                    logger.warning(f"Gemini returned invalid JSON
+                    logger.warning(f"Gemini returned invalid JSON, retrying...")
                    backoff = min(initial_backoff * (2 ** attempt), max_backoff)
                    await asyncio.sleep(backoff)
                    continue
                else:
-                    logger.error(f"Gemini returned invalid JSON after {max_retries + 1} attempts
+                    logger.error(f"Gemini returned invalid JSON after {max_retries + 1} attempts")
                    raise

            except genai_errors.APIError as e:
-                #
-                if e.code
+                # Fast fail on 4xx client errors (except 429 rate limit)
+                if e.code and 400 <= e.code < 500 and e.code != 429:
+                    logger.error(f"Gemini client error (HTTP {e.code}), not retrying: {str(e)}")
+                    raise
+
+                # Retry on 429 and 5xx
+                if e.code in (429, 500, 502, 503, 504):
                    last_exception = e
                    if attempt < max_retries:
                        backoff = min(initial_backoff * (2 ** attempt), max_backoff)
                        jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
-
-                        await asyncio.sleep(sleep_time)
+                        await asyncio.sleep(backoff + jitter)
                    else:
                        logger.error(f"Gemini API error after {max_retries + 1} attempts: {str(e)}")
                        raise
@@ -408,25 +390,16 @@ class LLMConfig:

        if last_exception:
            raise last_exception
-        raise RuntimeError(f"Gemini call failed after all retries
+        raise RuntimeError(f"Gemini call failed after all retries")

    @classmethod
-    def for_memory(cls) -> "LLMConfig":
-        """Create
+    def for_memory(cls) -> "LLMProvider":
+        """Create provider for memory operations from environment variables."""
        provider = os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq")
        api_key = os.getenv("HINDSIGHT_API_LLM_API_KEY")
-        base_url = os.getenv("HINDSIGHT_API_LLM_BASE_URL")
+        base_url = os.getenv("HINDSIGHT_API_LLM_BASE_URL", "")
        model = os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b")

-        # Set default base URL if not provided
-        if not base_url:
-            if provider == "groq":
-                base_url = "https://api.groq.com/openai/v1"
-            elif provider == "ollama":
-                base_url = "http://localhost:11434/v1"
-            else:
-                base_url = ""
-
        return cls(
            provider=provider,
            api_key=api_key,
@@ -436,27 +409,13 @@ class LLMConfig:
        )

    @classmethod
-    def for_answer_generation(cls) -> "LLMConfig":
-        """
-        Create configuration for answer generation operations from environment variables.
-
-        Falls back to memory LLM config if answer-specific config not set.
-        """
-        # Check if answer-specific config exists, otherwise fall back to memory config
+    def for_answer_generation(cls) -> "LLMProvider":
+        """Create provider for answer generation. Falls back to memory config if not set."""
        provider = os.getenv("HINDSIGHT_API_ANSWER_LLM_PROVIDER", os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq"))
        api_key = os.getenv("HINDSIGHT_API_ANSWER_LLM_API_KEY", os.getenv("HINDSIGHT_API_LLM_API_KEY"))
-        base_url = os.getenv("HINDSIGHT_API_ANSWER_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL"))
+        base_url = os.getenv("HINDSIGHT_API_ANSWER_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL", ""))
        model = os.getenv("HINDSIGHT_API_ANSWER_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))

-        # Set default base URL if not provided
-        if not base_url:
-            if provider == "groq":
-                base_url = "https://api.groq.com/openai/v1"
-            elif provider == "ollama":
-                base_url = "http://localhost:11434/v1"
-            else:
-                base_url = ""
-
        return cls(
            provider=provider,
            api_key=api_key,
@@ -466,27 +425,13 @@ class LLMConfig:
        )

    @classmethod
-    def for_judge(cls) -> "LLMConfig":
-        """
-        Create configuration for judge/evaluator operations from environment variables.
-
-        Falls back to memory LLM config if judge-specific config not set.
-        """
-        # Check if judge-specific config exists, otherwise fall back to memory config
+    def for_judge(cls) -> "LLMProvider":
+        """Create provider for judge/evaluator operations. Falls back to memory config if not set."""
        provider = os.getenv("HINDSIGHT_API_JUDGE_LLM_PROVIDER", os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq"))
        api_key = os.getenv("HINDSIGHT_API_JUDGE_LLM_API_KEY", os.getenv("HINDSIGHT_API_LLM_API_KEY"))
-        base_url = os.getenv("HINDSIGHT_API_JUDGE_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL"))
+        base_url = os.getenv("HINDSIGHT_API_JUDGE_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL", ""))
        model = os.getenv("HINDSIGHT_API_JUDGE_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))

-        # Set default base URL if not provided
-        if not base_url:
-            if provider == "groq":
-                base_url = "https://api.groq.com/openai/v1"
-            elif provider == "ollama":
-                base_url = "http://localhost:11434/v1"
-            else:
-                base_url = ""
-
        return cls(
            provider=provider,
            api_key=api_key,
@@ -494,3 +439,7 class LLMConfig:
            model=model,
            reasoning_effort="high"
        )
+
+
+# Backwards compatibility alias
+LLMConfig = LLMProvider
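
The three factory classmethods now read their settings purely from environment variables, with per-role `HINDSIGHT_API_ANSWER_*` / `HINDSIGHT_API_JUDGE_*` values falling back to the base `HINDSIGHT_API_LLM_*` settings, and the old class name keeps working through the `LLMConfig = LLMProvider` alias. A hedged wiring sketch; the environment values and the judge model override are placeholders, only the variable names, factory names, and alias come from this diff:

```python
import os

# Base configuration used by all roles unless overridden (placeholder values).
os.environ["HINDSIGHT_API_LLM_PROVIDER"] = "groq"
os.environ["HINDSIGHT_API_LLM_API_KEY"] = "gsk_example"
os.environ["HINDSIGHT_API_LLM_MODEL"] = "openai/gpt-oss-120b"

# Role-specific override: only the judge uses a different (hypothetical) model.
os.environ["HINDSIGHT_API_JUDGE_LLM_MODEL"] = "openai/gpt-oss-20b"

from hindsight_api.engine.llm_wrapper import LLMProvider, LLMConfig

memory_llm = LLMProvider.for_memory()        # base provider and model
judge_llm = LLMProvider.for_judge()          # judge override, reasoning_effort="high"
answer_llm = LLMConfig.for_answer_generation()  # old name still resolves to the same class
assert LLMConfig is LLMProvider
```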