hindsight-api 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. hindsight_api/admin/__init__.py +1 -0
  2. hindsight_api/admin/cli.py +252 -0
  3. hindsight_api/alembic/versions/f1a2b3c4d5e6_add_memory_links_composite_index.py +44 -0
  4. hindsight_api/alembic/versions/g2a3b4c5d6e7_add_tags_column.py +48 -0
  5. hindsight_api/api/http.py +282 -20
  6. hindsight_api/api/mcp.py +47 -52
  7. hindsight_api/config.py +238 -6
  8. hindsight_api/engine/cross_encoder.py +599 -86
  9. hindsight_api/engine/db_budget.py +284 -0
  10. hindsight_api/engine/db_utils.py +11 -0
  11. hindsight_api/engine/embeddings.py +453 -26
  12. hindsight_api/engine/entity_resolver.py +8 -5
  13. hindsight_api/engine/interface.py +8 -4
  14. hindsight_api/engine/llm_wrapper.py +241 -27
  15. hindsight_api/engine/memory_engine.py +609 -122
  16. hindsight_api/engine/query_analyzer.py +4 -3
  17. hindsight_api/engine/response_models.py +38 -0
  18. hindsight_api/engine/retain/fact_extraction.py +388 -192
  19. hindsight_api/engine/retain/fact_storage.py +34 -8
  20. hindsight_api/engine/retain/link_utils.py +24 -16
  21. hindsight_api/engine/retain/orchestrator.py +52 -17
  22. hindsight_api/engine/retain/types.py +9 -0
  23. hindsight_api/engine/search/graph_retrieval.py +42 -13
  24. hindsight_api/engine/search/link_expansion_retrieval.py +256 -0
  25. hindsight_api/engine/search/mpfp_retrieval.py +362 -117
  26. hindsight_api/engine/search/reranking.py +2 -2
  27. hindsight_api/engine/search/retrieval.py +847 -200
  28. hindsight_api/engine/search/tags.py +172 -0
  29. hindsight_api/engine/search/think_utils.py +1 -1
  30. hindsight_api/engine/search/trace.py +12 -0
  31. hindsight_api/engine/search/tracer.py +24 -1
  32. hindsight_api/engine/search/types.py +21 -0
  33. hindsight_api/engine/task_backend.py +109 -18
  34. hindsight_api/engine/utils.py +1 -1
  35. hindsight_api/extensions/context.py +10 -1
  36. hindsight_api/main.py +56 -4
  37. hindsight_api/metrics.py +433 -48
  38. hindsight_api/migrations.py +141 -1
  39. hindsight_api/models.py +3 -1
  40. hindsight_api/pg0.py +53 -0
  41. hindsight_api/server.py +39 -2
  42. {hindsight_api-0.2.1.dist-info → hindsight_api-0.3.0.dist-info}/METADATA +5 -1
  43. hindsight_api-0.3.0.dist-info/RECORD +82 -0
  44. {hindsight_api-0.2.1.dist-info → hindsight_api-0.3.0.dist-info}/entry_points.txt +1 -0
  45. hindsight_api-0.2.1.dist-info/RECORD +0 -75
  46. {hindsight_api-0.2.1.dist-info → hindsight_api-0.3.0.dist-info}/WHEEL +0 -0
hindsight_api/engine/llm_wrapper.py
@@ -19,9 +19,12 @@ from openai import APIConnectionError, APIStatusError, AsyncOpenAI, LengthFinish
 from ..config import (
     DEFAULT_LLM_MAX_CONCURRENT,
     DEFAULT_LLM_TIMEOUT,
+    ENV_LLM_GROQ_SERVICE_TIER,
     ENV_LLM_MAX_CONCURRENT,
     ENV_LLM_TIMEOUT,
 )
+from ..metrics import get_metrics_collector
+from .response_models import TokenUsage

 # Seed applied to every Groq request for deterministic behavior.
 DEFAULT_LLM_SEED = 4242
@@ -63,6 +66,7 @@ class LLMProvider:
         base_url: str,
         model: str,
         reasoning_effort: str = "low",
+        groq_service_tier: str | None = None,
     ):
         """
         Initialize LLM provider.
@@ -73,18 +77,25 @@ class LLMProvider:
             base_url: Base URL for the API.
             model: Model name.
             reasoning_effort: Reasoning effort level for supported providers.
+            groq_service_tier: Groq service tier ("on_demand", "flex", "auto"). Default: None (uses Groq's default).
         """
         self.provider = provider.lower()
         self.api_key = api_key
         self.base_url = base_url
         self.model = model
         self.reasoning_effort = reasoning_effort
+        # Default to 'auto' for best performance, users can override to 'on_demand' for free tier
+        self.groq_service_tier = groq_service_tier or os.getenv(ENV_LLM_GROQ_SERVICE_TIER, "auto")

         # Validate provider
-        valid_providers = ["openai", "groq", "ollama", "gemini", "anthropic", "lmstudio"]
+        valid_providers = ["openai", "groq", "ollama", "gemini", "anthropic", "lmstudio", "mock"]
         if self.provider not in valid_providers:
             raise ValueError(f"Invalid LLM provider: {self.provider}. Must be one of: {', '.join(valid_providers)}")

+        # Mock provider tracking (for testing)
+        self._mock_calls: list[dict] = []
+        self._mock_response: Any = None
+
         # Set default base URLs
         if not self.base_url:
             if self.provider == "groq":
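In practice the new parameter is passed straight through the constructor. A minimal construction sketch based only on the signature shown above; the API key and model name are placeholders, and keyword usage is assumed:

    # Hypothetical usage sketch; only the parameter names come from the diff above.
    provider = LLMProvider(
        provider="groq",
        api_key="gsk_placeholder",
        base_url="",                      # empty -> falls back to the built-in Groq base URL
        model="example-model",            # placeholder model name
        reasoning_effort="low",
        groq_service_tier="on_demand",    # pass None to defer to the env var, then "auto"
    )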
@@ -94,8 +105,8 @@ class LLMProvider:
             elif self.provider == "lmstudio":
                 self.base_url = "http://localhost:1234/v1"

-        # Validate API key (not needed for ollama or lmstudio)
-        if self.provider not in ("ollama", "lmstudio") and not self.api_key:
+        # Validate API key (not needed for ollama, lmstudio, or mock)
+        if self.provider not in ("ollama", "lmstudio", "mock") and not self.api_key:
             raise ValueError(f"API key not found for {self.provider}")

         # Get timeout config (set HINDSIGHT_API_LLM_TIMEOUT for local LLMs that need longer timeouts)
@@ -106,7 +117,10 @@ class LLMProvider:
         self._gemini_client = None
         self._anthropic_client = None

-        if self.provider == "gemini":
+        if self.provider == "mock":
+            # Mock provider - no client needed
+            pass
+        elif self.provider == "gemini":
             self._gemini_client = genai.Client(api_key=self.api_key)
         elif self.provider == "anthropic":
             from anthropic import AsyncAnthropic
@@ -169,6 +183,7 @@ class LLMProvider:
         max_backoff: float = 60.0,
         skip_validation: bool = False,
         strict_schema: bool = False,
+        return_usage: bool = False,
     ) -> Any:
         """
         Make an LLM API call with retry logic.
@@ -184,21 +199,43 @@ class LLMProvider:
             max_backoff: Maximum backoff time in seconds.
             skip_validation: Return raw JSON without Pydantic validation.
             strict_schema: Use strict JSON schema enforcement (OpenAI only). Guarantees all required fields.
+            return_usage: If True, return tuple (result, TokenUsage) instead of just result.

         Returns:
-            Parsed response if response_format is provided, otherwise text content.
+            If return_usage=False: Parsed response if response_format is provided, otherwise text content.
+            If return_usage=True: Tuple of (result, TokenUsage) with token counts from the LLM call.

         Raises:
             OutputTooLongError: If output exceeds token limits.
             Exception: Re-raises API errors after retries exhausted.
         """
+        queue_start_time = time.time()
         async with _global_llm_semaphore:
             start_time = time.time()
+            semaphore_wait_time = start_time - queue_start_time
+
+            # Handle Mock provider (for testing)
+            if self.provider == "mock":
+                return await self._call_mock(
+                    messages,
+                    response_format,
+                    scope,
+                    return_usage,
+                )

             # Handle Gemini provider separately
             if self.provider == "gemini":
                 return await self._call_gemini(
-                    messages, response_format, max_retries, initial_backoff, max_backoff, skip_validation, start_time
+                    messages,
+                    response_format,
+                    max_retries,
+                    initial_backoff,
+                    max_backoff,
+                    skip_validation,
+                    start_time,
+                    scope,
+                    return_usage,
+                    semaphore_wait_time,
                 )

             # Handle Anthropic provider separately
@@ -212,6 +249,9 @@ class LLMProvider:
                    max_backoff,
                    skip_validation,
                    start_time,
+                    scope,
+                    return_usage,
+                    semaphore_wait_time,
                 )

             # Handle Ollama with native API for structured output (better schema enforcement)
@@ -226,6 +266,9 @@ class LLMProvider:
                    max_backoff,
                    skip_validation,
                    start_time,
+                    scope,
+                    return_usage,
+                    semaphore_wait_time,
                 )

             call_params = {
@@ -263,11 +306,15 @@ class LLMProvider:
             # Provider-specific parameters
             if self.provider == "groq":
                 call_params["seed"] = DEFAULT_LLM_SEED
-                extra_body = {"service_tier": "auto"}
-                # Only add reasoning parameters for reasoning models
+                extra_body: dict[str, Any] = {}
+                # Add service_tier if configured (requires paid plan for flex/auto)
+                if self.groq_service_tier:
+                    extra_body["service_tier"] = self.groq_service_tier
+                # Add reasoning parameters for reasoning models
                 if is_reasoning_model:
                     extra_body["include_reasoning"] = False
-                call_params["extra_body"] = extra_body
+                if extra_body:
+                    call_params["extra_body"] = extra_body

             last_exception = None

@@ -370,21 +417,46 @@ class LLMProvider:
                    response = await self._client.chat.completions.create(**call_params)
                     result = response.choices[0].message.content

-                    # Log slow calls
+                    # Record token usage metrics
                     duration = time.time() - start_time
                     usage = response.usage
-                    if duration > 10.0:
-                        ratio = max(1, usage.completion_tokens) / usage.prompt_tokens
+                    input_tokens = usage.prompt_tokens or 0 if usage else 0
+                    output_tokens = usage.completion_tokens or 0 if usage else 0
+                    total_tokens = usage.total_tokens or 0 if usage else 0
+
+                    # Record LLM metrics
+                    metrics = get_metrics_collector()
+                    metrics.record_llm_call(
+                        provider=self.provider,
+                        model=self.model,
+                        scope=scope,
+                        duration=duration,
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        success=True,
+                    )
+
+                    # Log slow calls
+                    if duration > 10.0 and usage:
+                        ratio = max(1, output_tokens) / max(1, input_tokens)
                         cached_tokens = 0
                         if hasattr(usage, "prompt_tokens_details") and usage.prompt_tokens_details:
                             cached_tokens = getattr(usage.prompt_tokens_details, "cached_tokens", 0) or 0
                         cache_info = f", cached_tokens={cached_tokens}" if cached_tokens > 0 else ""
+                        wait_info = f", wait={semaphore_wait_time:.3f}s" if semaphore_wait_time > 0.1 else ""
                         logger.info(
-                            f"slow llm call: model={self.provider}/{self.model}, "
-                            f"input_tokens={usage.prompt_tokens}, output_tokens={usage.completion_tokens}, "
-                            f"total_tokens={usage.total_tokens}{cache_info}, time={duration:.3f}s, ratio out/in={ratio:.2f}"
+                            f"slow llm call: scope={scope}, model={self.provider}/{self.model}, "
+                            f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
+                            f"total_tokens={total_tokens}{cache_info}, time={duration:.3f}s{wait_info}, ratio out/in={ratio:.2f}"
                         )

+                    if return_usage:
+                        token_usage = TokenUsage(
+                            input_tokens=input_tokens,
+                            output_tokens=output_tokens,
+                            total_tokens=total_tokens,
+                        )
+                        return result, token_usage
                     return result

                 except LengthFinishReasonError as e:
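TokenUsage is imported at the top of this file from .response_models and is only ever constructed with the three token counts. A plausible sketch of its shape, inferred from that usage rather than taken from response_models.py (which is not shown in this excerpt):

    # Inferred sketch of TokenUsage; the authoritative definition lives in
    # hindsight_api/engine/response_models.py, not reproduced in this diff.
    from pydantic import BaseModel

    class TokenUsage(BaseModel):
        input_tokens: int = 0
        output_tokens: int = 0
        total_tokens: int = 0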
@@ -443,6 +515,9 @@ class LLMProvider:
         max_backoff: float,
         skip_validation: bool,
         start_time: float,
+        scope: str = "memory",
+        return_usage: bool = False,
+        semaphore_wait_time: float = 0.0,
     ) -> Any:
         """Handle Anthropic-specific API calls."""
         from anthropic import APIConnectionError, APIStatusError, RateLimitError
@@ -515,17 +590,40 @@ class LLMProvider:
                else:
                     result = content

-                # Log slow calls
+                # Record metrics and log slow calls
                 duration = time.time() - start_time
+                input_tokens = response.usage.input_tokens or 0 if response.usage else 0
+                output_tokens = response.usage.output_tokens or 0 if response.usage else 0
+                total_tokens = input_tokens + output_tokens
+
+                # Record LLM metrics
+                metrics = get_metrics_collector()
+                metrics.record_llm_call(
+                    provider=self.provider,
+                    model=self.model,
+                    scope=scope,
+                    duration=duration,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                    success=True,
+                )
+
+                # Log slow calls
                 if duration > 10.0:
-                    input_tokens = response.usage.input_tokens
-                    output_tokens = response.usage.output_tokens
+                    wait_info = f", wait={semaphore_wait_time:.3f}s" if semaphore_wait_time > 0.1 else ""
                     logger.info(
-                        f"slow llm call: model={self.provider}/{self.model}, "
+                        f"slow llm call: scope={scope}, model={self.provider}/{self.model}, "
                         f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
-                        f"time={duration:.3f}s"
+                        f"time={duration:.3f}s{wait_info}"
                     )

+                if return_usage:
+                    token_usage = TokenUsage(
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        total_tokens=total_tokens,
+                    )
+                    return result, token_usage
                 return result

             except json.JSONDecodeError as e:
@@ -580,6 +678,9 @@ class LLMProvider:
         max_backoff: float,
         skip_validation: bool,
         start_time: float,
+        scope: str = "memory",
+        return_usage: bool = False,
+        semaphore_wait_time: float = 0.0,
     ) -> Any:
         """
         Call Ollama using native API with JSON schema enforcement.
@@ -654,11 +755,39 @@ class LLMProvider:
                    else:
                         raise

+                # Extract token usage from Ollama response
+                # Ollama returns prompt_eval_count (input) and eval_count (output)
+                duration = time.time() - start_time
+                input_tokens = result.get("prompt_eval_count", 0) or 0
+                output_tokens = result.get("eval_count", 0) or 0
+                total_tokens = input_tokens + output_tokens
+
+                # Record LLM metrics
+                metrics = get_metrics_collector()
+                metrics.record_llm_call(
+                    provider=self.provider,
+                    model=self.model,
+                    scope=scope,
+                    duration=duration,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                    success=True,
+                )
+
                 # Validate against Pydantic model or return raw JSON
                 if skip_validation:
-                    return json_data
+                    validated_result = json_data
                 else:
-                    return response_format.model_validate(json_data)
+                    validated_result = response_format.model_validate(json_data)
+
+                if return_usage:
+                    token_usage = TokenUsage(
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        total_tokens=total_tokens,
+                    )
+                    return validated_result, token_usage
+                return validated_result

             except httpx.HTTPStatusError as e:
                 last_exception = e
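For reference, the Ollama native chat endpoint reports token counts in prompt_eval_count and eval_count, which is what the extraction above reads. An abridged, illustrative response shape (values invented):

    # Abridged illustration of the Ollama /api/chat JSON this code reads; values are made up.
    result = {
        "message": {"role": "assistant", "content": "{\"answer\": 42}"},
        "prompt_eval_count": 512,   # input tokens
        "eval_count": 128,          # output tokens
        "done": True,
    }
    input_tokens = result.get("prompt_eval_count", 0) or 0   # -> 512
    output_tokens = result.get("eval_count", 0) or 0         # -> 128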
@@ -701,6 +830,9 @@ class LLMProvider:
         max_backoff: float,
         skip_validation: bool,
         start_time: float,
+        scope: str = "memory",
+        return_usage: bool = False,
+        semaphore_wait_time: float = 0.0,
     ) -> Any:
         """Handle Gemini-specific API calls."""
         # Convert OpenAI-style messages to Gemini format
@@ -777,16 +909,43 @@ class LLMProvider:
                else:
                     result = content

-                # Log slow calls
+                # Record metrics and log slow calls
                 duration = time.time() - start_time
-                if duration > 10.0 and hasattr(response, "usage_metadata") and response.usage_metadata:
+                input_tokens = 0
+                output_tokens = 0
+                if hasattr(response, "usage_metadata") and response.usage_metadata:
                     usage = response.usage_metadata
+                    input_tokens = usage.prompt_token_count or 0
+                    output_tokens = usage.candidates_token_count or 0
+
+                # Record LLM metrics
+                metrics = get_metrics_collector()
+                metrics.record_llm_call(
+                    provider=self.provider,
+                    model=self.model,
+                    scope=scope,
+                    duration=duration,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                    success=True,
+                )
+
+                # Log slow calls
+                if duration > 10.0 and input_tokens > 0:
+                    wait_info = f", wait={semaphore_wait_time:.3f}s" if semaphore_wait_time > 0.1 else ""
                     logger.info(
-                        f"slow llm call: model={self.provider}/{self.model}, "
-                        f"input_tokens={usage.prompt_token_count}, output_tokens={usage.candidates_token_count}, "
-                        f"time={duration:.3f}s"
+                        f"slow llm call: scope={scope}, model={self.provider}/{self.model}, "
+                        f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
+                        f"time={duration:.3f}s{wait_info}"
                     )

+                if return_usage:
+                    token_usage = TokenUsage(
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        total_tokens=input_tokens + output_tokens,
+                    )
+                    return result, token_usage
                 return result

             except json.JSONDecodeError as e:
@@ -828,6 +987,61 @@ class LLMProvider:
            raise last_exception
         raise RuntimeError("Gemini call failed after all retries")

+    async def _call_mock(
+        self,
+        messages: list[dict[str, str]],
+        response_format: Any | None,
+        scope: str,
+        return_usage: bool,
+    ) -> Any:
+        """
+        Handle mock provider calls for testing.
+
+        Records the call and returns a configurable mock response.
+        """
+        # Record the call for test verification
+        call_record = {
+            "provider": self.provider,
+            "model": self.model,
+            "messages": messages,
+            "response_format": response_format.__name__
+            if response_format and hasattr(response_format, "__name__")
+            else str(response_format),
+            "scope": scope,
+        }
+        self._mock_calls.append(call_record)
+        logger.debug(f"Mock LLM call recorded: scope={scope}, model={self.model}")
+
+        # Return mock response
+        if self._mock_response is not None:
+            result = self._mock_response
+        elif response_format is not None:
+            # Try to create a minimal valid instance of the response format
+            try:
+                # For Pydantic models, try to create with minimal valid data
+                result = {"mock": True}
+            except Exception:
+                result = {"mock": True}
+        else:
+            result = "mock response"
+
+        if return_usage:
+            token_usage = TokenUsage(input_tokens=10, output_tokens=5, total_tokens=15)
+            return result, token_usage
+        return result
+
+    def set_mock_response(self, response: Any) -> None:
+        """Set the response to return from mock calls."""
+        self._mock_response = response
+
+    def get_mock_calls(self) -> list[dict]:
+        """Get the list of recorded mock calls."""
+        return self._mock_calls
+
+    def clear_mock_calls(self) -> None:
+        """Clear the recorded mock calls."""
+        self._mock_calls = []
+
     @classmethod
     def for_memory(cls) -> "LLMProvider":
         """Create provider for memory operations from environment variables."""