hindsight-api 0.2.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. hindsight_api/admin/__init__.py +1 -0
  2. hindsight_api/admin/cli.py +311 -0
  3. hindsight_api/alembic/versions/f1a2b3c4d5e6_add_memory_links_composite_index.py +44 -0
  4. hindsight_api/alembic/versions/g2a3b4c5d6e7_add_tags_column.py +48 -0
  5. hindsight_api/alembic/versions/h3c4d5e6f7g8_mental_models_v4.py +112 -0
  6. hindsight_api/alembic/versions/i4d5e6f7g8h9_delete_opinions.py +41 -0
  7. hindsight_api/alembic/versions/j5e6f7g8h9i0_mental_model_versions.py +95 -0
  8. hindsight_api/alembic/versions/k6f7g8h9i0j1_add_directive_subtype.py +58 -0
  9. hindsight_api/alembic/versions/l7g8h9i0j1k2_add_worker_columns.py +109 -0
  10. hindsight_api/alembic/versions/m8h9i0j1k2l3_mental_model_id_to_text.py +41 -0
  11. hindsight_api/alembic/versions/n9i0j1k2l3m4_learnings_and_pinned_reflections.py +134 -0
  12. hindsight_api/alembic/versions/o0j1k2l3m4n5_migrate_mental_models_data.py +113 -0
  13. hindsight_api/alembic/versions/p1k2l3m4n5o6_new_knowledge_architecture.py +194 -0
  14. hindsight_api/alembic/versions/q2l3m4n5o6p7_fix_mental_model_fact_type.py +50 -0
  15. hindsight_api/alembic/versions/r3m4n5o6p7q8_add_reflect_response_to_reflections.py +47 -0
  16. hindsight_api/alembic/versions/s4n5o6p7q8r9_add_consolidated_at_to_memory_units.py +53 -0
  17. hindsight_api/alembic/versions/t5o6p7q8r9s0_rename_mental_models_to_observations.py +134 -0
  18. hindsight_api/alembic/versions/u6p7q8r9s0t1_mental_models_text_id.py +41 -0
  19. hindsight_api/alembic/versions/v7q8r9s0t1u2_add_max_tokens_to_mental_models.py +50 -0
  20. hindsight_api/api/http.py +1406 -118
  21. hindsight_api/api/mcp.py +11 -196
  22. hindsight_api/config.py +359 -27
  23. hindsight_api/engine/consolidation/__init__.py +5 -0
  24. hindsight_api/engine/consolidation/consolidator.py +859 -0
  25. hindsight_api/engine/consolidation/prompts.py +69 -0
  26. hindsight_api/engine/cross_encoder.py +706 -88
  27. hindsight_api/engine/db_budget.py +284 -0
  28. hindsight_api/engine/db_utils.py +11 -0
  29. hindsight_api/engine/directives/__init__.py +5 -0
  30. hindsight_api/engine/directives/models.py +37 -0
  31. hindsight_api/engine/embeddings.py +553 -29
  32. hindsight_api/engine/entity_resolver.py +8 -5
  33. hindsight_api/engine/interface.py +40 -17
  34. hindsight_api/engine/llm_wrapper.py +744 -68
  35. hindsight_api/engine/memory_engine.py +2505 -1017
  36. hindsight_api/engine/mental_models/__init__.py +14 -0
  37. hindsight_api/engine/mental_models/models.py +53 -0
  38. hindsight_api/engine/query_analyzer.py +4 -3
  39. hindsight_api/engine/reflect/__init__.py +18 -0
  40. hindsight_api/engine/reflect/agent.py +933 -0
  41. hindsight_api/engine/reflect/models.py +109 -0
  42. hindsight_api/engine/reflect/observations.py +186 -0
  43. hindsight_api/engine/reflect/prompts.py +483 -0
  44. hindsight_api/engine/reflect/tools.py +437 -0
  45. hindsight_api/engine/reflect/tools_schema.py +250 -0
  46. hindsight_api/engine/response_models.py +168 -4
  47. hindsight_api/engine/retain/bank_utils.py +79 -201
  48. hindsight_api/engine/retain/fact_extraction.py +424 -195
  49. hindsight_api/engine/retain/fact_storage.py +35 -12
  50. hindsight_api/engine/retain/link_utils.py +29 -24
  51. hindsight_api/engine/retain/orchestrator.py +24 -43
  52. hindsight_api/engine/retain/types.py +11 -2
  53. hindsight_api/engine/search/graph_retrieval.py +43 -14
  54. hindsight_api/engine/search/link_expansion_retrieval.py +391 -0
  55. hindsight_api/engine/search/mpfp_retrieval.py +362 -117
  56. hindsight_api/engine/search/reranking.py +2 -2
  57. hindsight_api/engine/search/retrieval.py +848 -201
  58. hindsight_api/engine/search/tags.py +172 -0
  59. hindsight_api/engine/search/think_utils.py +42 -141
  60. hindsight_api/engine/search/trace.py +12 -1
  61. hindsight_api/engine/search/tracer.py +26 -6
  62. hindsight_api/engine/search/types.py +21 -3
  63. hindsight_api/engine/task_backend.py +113 -106
  64. hindsight_api/engine/utils.py +1 -152
  65. hindsight_api/extensions/__init__.py +10 -1
  66. hindsight_api/extensions/builtin/tenant.py +5 -1
  67. hindsight_api/extensions/context.py +10 -1
  68. hindsight_api/extensions/operation_validator.py +81 -4
  69. hindsight_api/extensions/tenant.py +26 -0
  70. hindsight_api/main.py +69 -6
  71. hindsight_api/mcp_local.py +12 -53
  72. hindsight_api/mcp_tools.py +494 -0
  73. hindsight_api/metrics.py +433 -48
  74. hindsight_api/migrations.py +141 -1
  75. hindsight_api/models.py +3 -3
  76. hindsight_api/pg0.py +53 -0
  77. hindsight_api/server.py +39 -2
  78. hindsight_api/worker/__init__.py +11 -0
  79. hindsight_api/worker/main.py +296 -0
  80. hindsight_api/worker/poller.py +486 -0
  81. {hindsight_api-0.2.1.dist-info → hindsight_api-0.4.0.dist-info}/METADATA +16 -6
  82. hindsight_api-0.4.0.dist-info/RECORD +112 -0
  83. {hindsight_api-0.2.1.dist-info → hindsight_api-0.4.0.dist-info}/entry_points.txt +2 -0
  84. hindsight_api/engine/retain/observation_regeneration.py +0 -254
  85. hindsight_api/engine/search/observation_utils.py +0 -125
  86. hindsight_api/engine/search/scoring.py +0 -159
  87. hindsight_api-0.2.1.dist-info/RECORD +0 -75
  88. {hindsight_api-0.2.1.dist-info → hindsight_api-0.4.0.dist-info}/WHEEL +0 -0
hindsight_api/engine/llm_wrapper.py
@@ -19,9 +19,12 @@ from openai import APIConnectionError, APIStatusError, AsyncOpenAI, LengthFinish
 from ..config import (
     DEFAULT_LLM_MAX_CONCURRENT,
     DEFAULT_LLM_TIMEOUT,
+    ENV_LLM_GROQ_SERVICE_TIER,
     ENV_LLM_MAX_CONCURRENT,
     ENV_LLM_TIMEOUT,
 )
+from ..metrics import get_metrics_collector
+from .response_models import TokenUsage
 
 # Seed applied to every Groq request for deterministic behavior.
 DEFAULT_LLM_SEED = 4242
@@ -63,6 +66,7 @@ class LLMProvider:
         base_url: str,
         model: str,
         reasoning_effort: str = "low",
+        groq_service_tier: str | None = None,
     ):
         """
         Initialize LLM provider.
@@ -73,18 +77,25 @@ class LLMProvider:
             base_url: Base URL for the API.
             model: Model name.
             reasoning_effort: Reasoning effort level for supported providers.
+            groq_service_tier: Groq service tier ("on_demand", "flex", "auto"). Default: None (uses Groq's default).
         """
         self.provider = provider.lower()
         self.api_key = api_key
         self.base_url = base_url
         self.model = model
         self.reasoning_effort = reasoning_effort
+        # Default to 'auto' for best performance, users can override to 'on_demand' for free tier
+        self.groq_service_tier = groq_service_tier or os.getenv(ENV_LLM_GROQ_SERVICE_TIER, "auto")
 
         # Validate provider
-        valid_providers = ["openai", "groq", "ollama", "gemini", "anthropic", "lmstudio"]
+        valid_providers = ["openai", "groq", "ollama", "gemini", "anthropic", "lmstudio", "mock"]
         if self.provider not in valid_providers:
             raise ValueError(f"Invalid LLM provider: {self.provider}. Must be one of: {', '.join(valid_providers)}")
 
+        # Mock provider tracking (for testing)
+        self._mock_calls: list[dict] = []
+        self._mock_response: Any = None
+
         # Set default base URLs
         if not self.base_url:
             if self.provider == "groq":
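The service tier can also be pinned per instance when constructing the provider. A minimal sketch based on the signature above; the API key and model name are placeholders, and the keyword names of the first two arguments are inferred from the assignments shown:

    provider = LLMProvider(
        provider="groq",
        api_key="YOUR_GROQ_API_KEY",      # placeholder
        base_url="",                      # empty string falls through to the default Groq base URL
        model="llama-3.3-70b-versatile",  # placeholder model name
        groq_service_tier="on_demand",    # "on_demand", "flex", or "auto"; when omitted, the
                                          # ENV_LLM_GROQ_SERVICE_TIER setting applies, then "auto"
    )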
@@ -94,8 +105,8 @@ class LLMProvider:
             elif self.provider == "lmstudio":
                 self.base_url = "http://localhost:1234/v1"
 
-        # Validate API key (not needed for ollama or lmstudio)
-        if self.provider not in ("ollama", "lmstudio") and not self.api_key:
+        # Validate API key (not needed for ollama, lmstudio, or mock)
+        if self.provider not in ("ollama", "lmstudio", "mock") and not self.api_key:
             raise ValueError(f"API key not found for {self.provider}")
 
         # Get timeout config (set HINDSIGHT_API_LLM_TIMEOUT for local LLMs that need longer timeouts)
@@ -106,7 +117,10 @@ class LLMProvider:
         self._gemini_client = None
         self._anthropic_client = None
 
-        if self.provider == "gemini":
+        if self.provider == "mock":
+            # Mock provider - no client needed
+            pass
+        elif self.provider == "gemini":
             self._gemini_client = genai.Client(api_key=self.api_key)
         elif self.provider == "anthropic":
             from anthropic import AsyncAnthropic
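The new "mock" provider skips both client construction and API-key validation, so tests can drive it directly. A rough sketch using the set_mock_response/get_mock_calls helpers added further down in this diff:

    mock = LLMProvider(provider="mock", api_key="", base_url="", model="mock-model")
    mock.set_mock_response({"facts": []})     # whatever shape the code under test expects
    # ... exercise code that calls this provider ...
    calls = mock.get_mock_calls()             # each record holds provider, model, messages, response_format, scope
    assert calls and calls[0]["model"] == "mock-model"
    mock.clear_mock_calls()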
@@ -169,6 +183,7 @@ class LLMProvider:
         max_backoff: float = 60.0,
         skip_validation: bool = False,
         strict_schema: bool = False,
+        return_usage: bool = False,
     ) -> Any:
         """
         Make an LLM API call with retry logic.
@@ -184,21 +199,43 @@ class LLMProvider:
             max_backoff: Maximum backoff time in seconds.
             skip_validation: Return raw JSON without Pydantic validation.
             strict_schema: Use strict JSON schema enforcement (OpenAI only). Guarantees all required fields.
+            return_usage: If True, return tuple (result, TokenUsage) instead of just result.
 
         Returns:
-            Parsed response if response_format is provided, otherwise text content.
+            If return_usage=False: Parsed response if response_format is provided, otherwise text content.
+            If return_usage=True: Tuple of (result, TokenUsage) with token counts from the LLM call.
 
         Raises:
             OutputTooLongError: If output exceeds token limits.
             Exception: Re-raises API errors after retries exhausted.
         """
+        semaphore_start = time.time()
         async with _global_llm_semaphore:
+            semaphore_wait_time = time.time() - semaphore_start
             start_time = time.time()
 
+            # Handle Mock provider (for testing)
+            if self.provider == "mock":
+                return await self._call_mock(
+                    messages,
+                    response_format,
+                    scope,
+                    return_usage,
+                )
+
             # Handle Gemini provider separately
             if self.provider == "gemini":
                 return await self._call_gemini(
-                    messages, response_format, max_retries, initial_backoff, max_backoff, skip_validation, start_time
+                    messages,
+                    response_format,
+                    max_retries,
+                    initial_backoff,
+                    max_backoff,
+                    skip_validation,
+                    start_time,
+                    scope,
+                    return_usage,
+                    semaphore_wait_time,
                 )
 
             # Handle Anthropic provider separately
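With return_usage=True the structured call now yields the parsed result plus a TokenUsage record. A sketch of the calling convention implied by the docstring above; the method name `call` and the schema are illustrative, only the keyword arguments and the TokenUsage fields are taken from the diff:

    result, usage = await provider.call(
        messages=[{"role": "user", "content": "Summarize the last session."}],
        response_format=SessionSummary,   # any Pydantic model (illustrative)
        return_usage=True,
    )
    print(usage.input_tokens, usage.output_tokens, usage.total_tokens)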
@@ -212,6 +249,9 @@ class LLMProvider:
                     max_backoff,
                     skip_validation,
                     start_time,
+                    scope,
+                    return_usage,
+                    semaphore_wait_time,
                 )
 
             # Handle Ollama with native API for structured output (better schema enforcement)
@@ -226,6 +266,9 @@ class LLMProvider:
                     max_backoff,
                     skip_validation,
                     start_time,
+                    scope,
+                    return_usage,
+                    semaphore_wait_time,
                 )
 
             call_params = {
@@ -263,51 +306,56 @@ class LLMProvider:
             # Provider-specific parameters
             if self.provider == "groq":
                 call_params["seed"] = DEFAULT_LLM_SEED
-                extra_body = {"service_tier": "auto"}
-                # Only add reasoning parameters for reasoning models
+                extra_body: dict[str, Any] = {}
+                # Add service_tier if configured (requires paid plan for flex/auto)
+                if self.groq_service_tier:
+                    extra_body["service_tier"] = self.groq_service_tier
+                # Add reasoning parameters for reasoning models
                 if is_reasoning_model:
                     extra_body["include_reasoning"] = False
-                call_params["extra_body"] = extra_body
+                if extra_body:
+                    call_params["extra_body"] = extra_body
 
             last_exception = None
 
+            # Prepare response format ONCE before the retry loop
+            # (to avoid appending schema to messages on every retry)
+            if response_format is not None:
+                schema = None
+                if hasattr(response_format, "model_json_schema"):
+                    schema = response_format.model_json_schema()
+
+                if strict_schema and schema is not None:
+                    # Use OpenAI's strict JSON schema enforcement
+                    # This guarantees all required fields are returned
+                    call_params["response_format"] = {
+                        "type": "json_schema",
+                        "json_schema": {
+                            "name": "response",
+                            "strict": True,
+                            "schema": schema,
+                        },
+                    }
+                else:
+                    # Soft enforcement: add schema to prompt and use json_object mode
+                    if schema is not None:
+                        schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
+
+                        if call_params["messages"] and call_params["messages"][0].get("role") == "system":
+                            call_params["messages"][0]["content"] += schema_msg
+                        elif call_params["messages"]:
+                            call_params["messages"][0]["content"] = (
+                                schema_msg + "\n\n" + call_params["messages"][0]["content"]
+                            )
+                    if self.provider not in ("lmstudio", "ollama"):
+                        # LM Studio and Ollama don't support json_object response format reliably
+                        # We rely on the schema in the system message instead
+                        call_params["response_format"] = {"type": "json_object"}
+
             for attempt in range(max_retries + 1):
                 try:
                     if response_format is not None:
-                        schema = None
-                        if hasattr(response_format, "model_json_schema"):
-                            schema = response_format.model_json_schema()
-
-                        if strict_schema and schema is not None:
-                            # Use OpenAI's strict JSON schema enforcement
-                            # This guarantees all required fields are returned
-                            call_params["response_format"] = {
-                                "type": "json_schema",
-                                "json_schema": {
-                                    "name": "response",
-                                    "strict": True,
-                                    "schema": schema,
-                                },
-                            }
-                        else:
-                            # Soft enforcement: add schema to prompt and use json_object mode
-                            if schema is not None:
-                                schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
-
-                                if call_params["messages"] and call_params["messages"][0].get("role") == "system":
-                                    call_params["messages"][0]["content"] += schema_msg
-                                elif call_params["messages"]:
-                                    call_params["messages"][0]["content"] = (
-                                        schema_msg + "\n\n" + call_params["messages"][0]["content"]
-                                    )
-                            if self.provider not in ("lmstudio", "ollama"):
-                                # LM Studio and Ollama don't support json_object response format reliably
-                                # We rely on the schema in the system message instead
-                                call_params["response_format"] = {"type": "json_object"}
-
-                        logger.debug(f"Sending request to {self.provider}/{self.model} (timeout={self.timeout})")
                         response = await self._client.chat.completions.create(**call_params)
-                        logger.debug(f"Received response from {self.provider}/{self.model}")
 
                         content = response.choices[0].message.content
 
@@ -370,21 +418,46 @@ class LLMProvider:
                         response = await self._client.chat.completions.create(**call_params)
                         result = response.choices[0].message.content
 
-                        # Log slow calls
+                        # Record token usage metrics
                         duration = time.time() - start_time
                         usage = response.usage
-                        if duration > 10.0:
-                            ratio = max(1, usage.completion_tokens) / usage.prompt_tokens
+                        input_tokens = usage.prompt_tokens or 0 if usage else 0
+                        output_tokens = usage.completion_tokens or 0 if usage else 0
+                        total_tokens = usage.total_tokens or 0 if usage else 0
+
+                        # Record LLM metrics
+                        metrics = get_metrics_collector()
+                        metrics.record_llm_call(
+                            provider=self.provider,
+                            model=self.model,
+                            scope=scope,
+                            duration=duration,
+                            input_tokens=input_tokens,
+                            output_tokens=output_tokens,
+                            success=True,
+                        )
+
+                        # Log slow calls
+                        if duration > 10.0 and usage:
+                            ratio = max(1, output_tokens) / max(1, input_tokens)
                             cached_tokens = 0
                             if hasattr(usage, "prompt_tokens_details") and usage.prompt_tokens_details:
                                 cached_tokens = getattr(usage.prompt_tokens_details, "cached_tokens", 0) or 0
                             cache_info = f", cached_tokens={cached_tokens}" if cached_tokens > 0 else ""
+                            wait_info = f", wait={semaphore_wait_time:.3f}s" if semaphore_wait_time > 0.1 else ""
                             logger.info(
-                                f"slow llm call: model={self.provider}/{self.model}, "
-                                f"input_tokens={usage.prompt_tokens}, output_tokens={usage.completion_tokens}, "
-                                f"total_tokens={usage.total_tokens}{cache_info}, time={duration:.3f}s, ratio out/in={ratio:.2f}"
+                                f"slow llm call: scope={scope}, model={self.provider}/{self.model}, "
+                                f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
+                                f"total_tokens={total_tokens}{cache_info}, time={duration:.3f}s{wait_info}, ratio out/in={ratio:.2f}"
                             )
 
+                        if return_usage:
+                            token_usage = TokenUsage(
+                                input_tokens=input_tokens,
+                                output_tokens=output_tokens,
+                                total_tokens=total_tokens,
+                            )
+                            return result, token_usage
                         return result
 
                 except LengthFinishReasonError as e:
@@ -395,13 +468,11 @@ class LLMProvider:
 
                 except APIConnectionError as e:
                     last_exception = e
+                    status_code = getattr(e, "status_code", None) or getattr(
+                        getattr(e, "response", None), "status_code", None
+                    )
+                    logger.warning(f"APIConnectionError (HTTP {status_code}), attempt {attempt + 1}: {str(e)[:200]}")
                     if attempt < max_retries:
-                        status_code = getattr(e, "status_code", None) or getattr(
-                            getattr(e, "response", None), "status_code", None
-                        )
-                        logger.warning(
-                            f"Connection error, retrying... (attempt {attempt + 1}/{max_retries + 1}) - status_code={status_code}, message={e}"
-                        )
                         backoff = min(initial_backoff * (2**attempt), max_backoff)
                         await asyncio.sleep(backoff)
                         continue
@@ -415,6 +486,45 @@ class LLMProvider:
                         logger.error(f"Auth error (HTTP {e.status_code}), not retrying: {str(e)}")
                         raise
 
+                    # Handle tool_use_failed error - model outputted in tool call format
+                    # Convert to expected JSON format and continue
+                    if e.status_code == 400 and response_format is not None:
+                        try:
+                            error_body = e.body if hasattr(e, "body") else {}
+                            if isinstance(error_body, dict):
+                                error_info: dict[str, Any] = error_body.get("error") or {}
+                                if error_info.get("code") == "tool_use_failed":
+                                    failed_gen = error_info.get("failed_generation", "")
+                                    if failed_gen:
+                                        # Parse the tool call format and convert to actions format
+                                        tool_call = json.loads(failed_gen)
+                                        tool_name = tool_call.get("name", "")
+                                        tool_args = tool_call.get("arguments", {})
+                                        # Convert to actions format: {"actions": [{"tool": "name", ...args}]}
+                                        converted = {"actions": [{"tool": tool_name, **tool_args}]}
+                                        if skip_validation:
+                                            result = converted
+                                        else:
+                                            result = response_format.model_validate(converted)
+
+                                        # Record metrics for this successful recovery
+                                        duration = time.time() - start_time
+                                        metrics = get_metrics_collector()
+                                        metrics.record_llm_call(
+                                            provider=self.provider,
+                                            model=self.model,
+                                            scope=scope,
+                                            duration=duration,
+                                            input_tokens=0,
+                                            output_tokens=0,
+                                            success=True,
+                                        )
+                                        if return_usage:
+                                            return result, TokenUsage(input_tokens=0, output_tokens=0, total_tokens=0)
+                                        return result
+                        except (json.JSONDecodeError, KeyError, TypeError):
+                            pass  # Failed to parse tool_use_failed, continue with normal retry
+
                     last_exception = e
                     if attempt < max_retries:
                         backoff = min(initial_backoff * (2**attempt), max_backoff)
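Concretely, the recovery path above reshapes a Groq tool_use_failed payload into the actions format the schema path expects, for example (tool name invented for illustration):

    # failed_generation: {"name": "search_memories", "arguments": {"query": "acme", "limit": 5}}
    # converted to:      {"actions": [{"tool": "search_memories", "query": "acme", "limit": 5}]}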
@@ -425,14 +535,438 @@ class LLMProvider:
                     logger.error(f"API error after {max_retries + 1} attempts: {str(e)}")
                     raise
 
-                except Exception as e:
-                    logger.error(f"Unexpected error during LLM call: {type(e).__name__}: {str(e)}")
+                except Exception:
                     raise
 
             if last_exception:
                 raise last_exception
             raise RuntimeError("LLM call failed after all retries with no exception captured")
 
+    async def call_with_tools(
+        self,
+        messages: list[dict[str, Any]],
+        tools: list[dict[str, Any]],
+        max_completion_tokens: int | None = None,
+        temperature: float | None = None,
+        scope: str = "tools",
+        max_retries: int = 5,
+        initial_backoff: float = 1.0,
+        max_backoff: float = 30.0,
+        tool_choice: str | dict[str, Any] = "auto",
+    ) -> "LLMToolCallResult":
+        """
+        Make an LLM API call with tool/function calling support.
+
+        Args:
+            messages: List of message dicts. Can include tool results with role='tool'.
+            tools: List of tool definitions in OpenAI format.
+            max_completion_tokens: Maximum tokens in response.
+            temperature: Sampling temperature (0.0-2.0).
+            scope: Scope identifier for tracking.
+            max_retries: Maximum retry attempts.
+            initial_backoff: Initial backoff time in seconds.
+            max_backoff: Maximum backoff time in seconds.
+            tool_choice: How to choose tools - "auto", "none", "required", or {"type": "function", "function": {"name": "..."}}
+
+        Returns:
+            LLMToolCallResult with content and/or tool_calls.
+        """
+        from .response_models import LLMToolCall, LLMToolCallResult
+
+        async with _global_llm_semaphore:
+            start_time = time.time()
+
+            # Handle Mock provider
+            if self.provider == "mock":
+                return await self._call_with_tools_mock(messages, tools, scope)
+
+            # Handle Anthropic separately (uses different tool format)
+            if self.provider == "anthropic":
+                return await self._call_with_tools_anthropic(
+                    messages, tools, max_completion_tokens, max_retries, initial_backoff, max_backoff, start_time, scope
+                )
+
+            # Handle Gemini (convert to Gemini tool format)
+            if self.provider == "gemini":
+                return await self._call_with_tools_gemini(
+                    messages, tools, max_retries, initial_backoff, max_backoff, start_time, scope
+                )
+
+            # OpenAI-compatible providers (OpenAI, Groq, Ollama, LMStudio)
+            call_params: dict[str, Any] = {
+                "model": self.model,
+                "messages": messages,
+                "tools": tools,
+                "tool_choice": tool_choice,
+            }
+
+            if max_completion_tokens is not None:
+                call_params["max_completion_tokens"] = max_completion_tokens
+            if temperature is not None:
+                call_params["temperature"] = temperature
+
+            # Provider-specific parameters
+            if self.provider == "groq":
+                call_params["seed"] = DEFAULT_LLM_SEED
+
+            last_exception = None
+
+            for attempt in range(max_retries + 1):
+                try:
+                    response = await self._client.chat.completions.create(**call_params)
+
+                    message = response.choices[0].message
+                    finish_reason = response.choices[0].finish_reason
+
+                    # Extract tool calls if present
+                    tool_calls: list[LLMToolCall] = []
+                    if message.tool_calls:
+                        for tc in message.tool_calls:
+                            try:
+                                args = json.loads(tc.function.arguments) if tc.function.arguments else {}
+                            except json.JSONDecodeError:
+                                args = {"_raw": tc.function.arguments}
+                            tool_calls.append(LLMToolCall(id=tc.id, name=tc.function.name, arguments=args))
+
+                    content = message.content
+
+                    # Record metrics
+                    duration = time.time() - start_time
+                    usage = response.usage
+                    input_tokens = usage.prompt_tokens or 0 if usage else 0
+                    output_tokens = usage.completion_tokens or 0 if usage else 0
+
+                    metrics = get_metrics_collector()
+                    metrics.record_llm_call(
+                        provider=self.provider,
+                        model=self.model,
+                        scope=scope,
+                        duration=duration,
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        success=True,
+                    )
+
+                    return LLMToolCallResult(
+                        content=content,
+                        tool_calls=tool_calls,
+                        finish_reason=finish_reason,
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                    )
+
+                except APIConnectionError as e:
+                    last_exception = e
+                    if attempt < max_retries:
+                        await asyncio.sleep(min(initial_backoff * (2**attempt), max_backoff))
+                        continue
+                    raise
+
+                except APIStatusError as e:
+                    if e.status_code in (401, 403):
+                        raise
+                    last_exception = e
+                    if attempt < max_retries:
+                        await asyncio.sleep(min(initial_backoff * (2**attempt), max_backoff))
+                        continue
+                    raise
+
+                except Exception:
+                    raise
+
+            if last_exception:
+                raise last_exception
+            raise RuntimeError("Tool call failed after all retries")
+
+    async def _call_with_tools_mock(
+        self,
+        messages: list[dict[str, Any]],
+        tools: list[dict[str, Any]],
+        scope: str,
+    ) -> "LLMToolCallResult":
+        """Handle mock tool calls for testing."""
+        from .response_models import LLMToolCallResult
+
+        call_record = {
+            "provider": self.provider,
+            "model": self.model,
+            "messages": messages,
+            "tools": [t.get("function", {}).get("name") for t in tools],
+            "scope": scope,
+        }
+        self._mock_calls.append(call_record)
+
+        if self._mock_response is not None:
+            if isinstance(self._mock_response, LLMToolCallResult):
+                return self._mock_response
+            # Allow setting just tool calls as a list
+            if isinstance(self._mock_response, list):
+                from .response_models import LLMToolCall
+
+                return LLMToolCallResult(
+                    tool_calls=[
+                        LLMToolCall(id=f"mock_{i}", name=tc["name"], arguments=tc.get("arguments", {}))
+                        for i, tc in enumerate(self._mock_response)
+                    ],
+                    finish_reason="tool_calls",
+                )
+
+        return LLMToolCallResult(content="mock response", finish_reason="stop")
+
+    async def _call_with_tools_anthropic(
+        self,
+        messages: list[dict[str, Any]],
+        tools: list[dict[str, Any]],
+        max_completion_tokens: int | None,
+        max_retries: int,
+        initial_backoff: float,
+        max_backoff: float,
+        start_time: float,
+        scope: str,
+    ) -> "LLMToolCallResult":
+        """Handle Anthropic tool calling."""
+        from anthropic import APIConnectionError, APIStatusError
+
+        from .response_models import LLMToolCall, LLMToolCallResult
+
+        # Convert OpenAI tool format to Anthropic format
+        anthropic_tools = []
+        for tool in tools:
+            func = tool.get("function", {})
+            anthropic_tools.append(
+                {
+                    "name": func.get("name", ""),
+                    "description": func.get("description", ""),
+                    "input_schema": func.get("parameters", {"type": "object", "properties": {}}),
+                }
+            )
+
+        # Convert messages - handle tool results
+        system_prompt = None
+        anthropic_messages = []
+        for msg in messages:
+            role = msg.get("role", "user")
+            content = msg.get("content", "")
+
+            if role == "system":
+                system_prompt = (system_prompt + "\n\n" + content) if system_prompt else content
+            elif role == "tool":
+                # Anthropic uses tool_result blocks
+                anthropic_messages.append(
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "tool_result", "tool_use_id": msg.get("tool_call_id", ""), "content": content}
+                        ],
+                    }
+                )
+            elif role == "assistant" and msg.get("tool_calls"):
+                # Convert assistant tool calls
+                tool_use_blocks = []
+                for tc in msg["tool_calls"]:
+                    tool_use_blocks.append(
+                        {
+                            "type": "tool_use",
+                            "id": tc.get("id", ""),
+                            "name": tc.get("function", {}).get("name", ""),
+                            "input": json.loads(tc.get("function", {}).get("arguments", "{}")),
+                        }
+                    )
+                anthropic_messages.append({"role": "assistant", "content": tool_use_blocks})
+            else:
+                anthropic_messages.append({"role": role, "content": content})
+
+        call_params: dict[str, Any] = {
+            "model": self.model,
+            "messages": anthropic_messages,
+            "tools": anthropic_tools,
+            "max_tokens": max_completion_tokens or 4096,
+        }
+        if system_prompt:
+            call_params["system"] = system_prompt
+
+        last_exception = None
+        for attempt in range(max_retries + 1):
+            try:
+                response = await self._anthropic_client.messages.create(**call_params)
+
+                # Extract content and tool calls
+                content_parts = []
+                tool_calls: list[LLMToolCall] = []
+
+                for block in response.content:
+                    if block.type == "text":
+                        content_parts.append(block.text)
+                    elif block.type == "tool_use":
+                        tool_calls.append(LLMToolCall(id=block.id, name=block.name, arguments=block.input or {}))
+
+                content = "".join(content_parts) if content_parts else None
+                finish_reason = "tool_calls" if tool_calls else "stop"
+
+                # Extract token usage
+                input_tokens = response.usage.input_tokens or 0
+                output_tokens = response.usage.output_tokens or 0
+
+                # Record metrics
+                metrics = get_metrics_collector()
+                metrics.record_llm_call(
+                    provider=self.provider,
+                    model=self.model,
+                    scope=scope,
+                    duration=time.time() - start_time,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                    success=True,
+                )
+
+                return LLMToolCallResult(
+                    content=content,
+                    tool_calls=tool_calls,
+                    finish_reason=finish_reason,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                )
+
+            except (APIConnectionError, APIStatusError) as e:
+                if isinstance(e, APIStatusError) and e.status_code in (401, 403):
+                    raise
+                last_exception = e
+                if attempt < max_retries:
+                    await asyncio.sleep(min(initial_backoff * (2**attempt), max_backoff))
+                    continue
+                raise
+
+        if last_exception:
+            raise last_exception
+        raise RuntimeError("Anthropic tool call failed")
+
+    async def _call_with_tools_gemini(
+        self,
+        messages: list[dict[str, Any]],
+        tools: list[dict[str, Any]],
+        max_retries: int,
+        initial_backoff: float,
+        max_backoff: float,
+        start_time: float,
+        scope: str,
+    ) -> "LLMToolCallResult":
+        """Handle Gemini tool calling."""
+        from .response_models import LLMToolCall, LLMToolCallResult
+
+        # Convert tools to Gemini format
+        gemini_tools = []
+        for tool in tools:
+            func = tool.get("function", {})
+            gemini_tools.append(
+                genai_types.Tool(
+                    function_declarations=[
+                        genai_types.FunctionDeclaration(
+                            name=func.get("name", ""),
+                            description=func.get("description", ""),
+                            parameters=func.get("parameters"),
+                        )
+                    ]
+                )
+            )
+
+        # Convert messages
+        system_instruction = None
+        gemini_contents = []
+        for msg in messages:
+            role = msg.get("role", "user")
+            content = msg.get("content", "")
+
+            if role == "system":
+                system_instruction = (system_instruction + "\n\n" + content) if system_instruction else content
+            elif role == "tool":
+                # Gemini uses function_response
+                gemini_contents.append(
+                    genai_types.Content(
+                        role="user",
+                        parts=[
+                            genai_types.Part(
+                                function_response=genai_types.FunctionResponse(
+                                    name=msg.get("name", ""),
+                                    response={"result": content},
+                                )
+                            )
+                        ],
+                    )
+                )
+            elif role == "assistant":
+                gemini_contents.append(genai_types.Content(role="model", parts=[genai_types.Part(text=content)]))
+            else:
+                gemini_contents.append(genai_types.Content(role="user", parts=[genai_types.Part(text=content)]))
+
+        config = genai_types.GenerateContentConfig(
+            system_instruction=system_instruction,
+            tools=gemini_tools,
+        )
+
+        last_exception = None
+        for attempt in range(max_retries + 1):
+            try:
+                response = await self._gemini_client.aio.models.generate_content(
+                    model=self.model,
+                    contents=gemini_contents,
+                    config=config,
+                )
+
+                # Extract content and tool calls
+                content = None
+                tool_calls: list[LLMToolCall] = []
+
+                if response.candidates and response.candidates[0].content:
+                    for part in response.candidates[0].content.parts:
+                        if hasattr(part, "text") and part.text:
+                            content = part.text
+                        if hasattr(part, "function_call") and part.function_call:
+                            fc = part.function_call
+                            tool_calls.append(
+                                LLMToolCall(
+                                    id=f"gemini_{len(tool_calls)}",
+                                    name=fc.name,
+                                    arguments=dict(fc.args) if fc.args else {},
+                                )
+                            )
+
+                finish_reason = "tool_calls" if tool_calls else "stop"
+
+                # Record metrics
+                metrics = get_metrics_collector()
+                input_tokens = response.usage_metadata.prompt_token_count if response.usage_metadata else 0
+                output_tokens = response.usage_metadata.candidates_token_count if response.usage_metadata else 0
+                metrics.record_llm_call(
+                    provider=self.provider,
+                    model=self.model,
+                    scope=scope,
+                    duration=time.time() - start_time,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                    success=True,
+                )
+
+                return LLMToolCallResult(
+                    content=content,
+                    tool_calls=tool_calls,
+                    finish_reason=finish_reason,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                )
+
+            except genai_errors.APIError as e:
+                if e.code in (401, 403):
+                    raise
+                last_exception = e
+                if attempt < max_retries:
+                    await asyncio.sleep(min(initial_backoff * (2**attempt), max_backoff))
+                    continue
+                raise
+
+        if last_exception:
+            raise last_exception
+        raise RuntimeError("Gemini tool call failed")
+
     async def _call_anthropic(
         self,
         messages: list[dict[str, str]],
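The new call_with_tools entry point accepts OpenAI-style tool definitions for every provider and normalizes the reply into an LLMToolCallResult. A sketch under that signature (the tool itself is invented for illustration):

    tools = [{
        "type": "function",
        "function": {
            "name": "lookup_entity",  # illustrative tool
            "description": "Look up what is known about an entity.",
            "parameters": {
                "type": "object",
                "properties": {"name": {"type": "string"}},
                "required": ["name"],
            },
        },
    }]
    result = await provider.call_with_tools(
        messages=[{"role": "user", "content": "What do we know about Acme Corp?"}],
        tools=tools,
        tool_choice="auto",
    )
    if result.finish_reason == "tool_calls":
        for tc in result.tool_calls:
            print(tc.name, tc.arguments)  # LLMToolCall carries id, name, arguments
    else:
        print(result.content)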
@@ -443,6 +977,9 @@ class LLMProvider:
         max_backoff: float,
         skip_validation: bool,
         start_time: float,
+        scope: str = "memory",
+        return_usage: bool = False,
+        semaphore_wait_time: float = 0.0,
     ) -> Any:
         """Handle Anthropic-specific API calls."""
         from anthropic import APIConnectionError, APIStatusError, RateLimitError
@@ -515,17 +1052,40 @@ class LLMProvider:
                 else:
                     result = content
 
-                # Log slow calls
+                # Record metrics and log slow calls
                 duration = time.time() - start_time
+                input_tokens = response.usage.input_tokens or 0 if response.usage else 0
+                output_tokens = response.usage.output_tokens or 0 if response.usage else 0
+                total_tokens = input_tokens + output_tokens
+
+                # Record LLM metrics
+                metrics = get_metrics_collector()
+                metrics.record_llm_call(
+                    provider=self.provider,
+                    model=self.model,
+                    scope=scope,
+                    duration=duration,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                    success=True,
+                )
+
+                # Log slow calls
                 if duration > 10.0:
-                    input_tokens = response.usage.input_tokens
-                    output_tokens = response.usage.output_tokens
+                    wait_info = f", wait={semaphore_wait_time:.3f}s" if semaphore_wait_time > 0.1 else ""
                     logger.info(
-                        f"slow llm call: model={self.provider}/{self.model}, "
+                        f"slow llm call: scope={scope}, model={self.provider}/{self.model}, "
                         f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
-                        f"time={duration:.3f}s"
+                        f"time={duration:.3f}s{wait_info}"
                     )
 
+                if return_usage:
+                    token_usage = TokenUsage(
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        total_tokens=total_tokens,
+                    )
+                    return result, token_usage
                 return result
 
             except json.JSONDecodeError as e:
@@ -580,6 +1140,9 @@ class LLMProvider:
         max_backoff: float,
         skip_validation: bool,
         start_time: float,
+        scope: str = "memory",
+        return_usage: bool = False,
+        semaphore_wait_time: float = 0.0,
     ) -> Any:
         """
         Call Ollama using native API with JSON schema enforcement.
@@ -654,11 +1217,39 @@ class LLMProvider:
                     else:
                         raise
 
+                # Extract token usage from Ollama response
+                # Ollama returns prompt_eval_count (input) and eval_count (output)
+                duration = time.time() - start_time
+                input_tokens = result.get("prompt_eval_count", 0) or 0
+                output_tokens = result.get("eval_count", 0) or 0
+                total_tokens = input_tokens + output_tokens
+
+                # Record LLM metrics
+                metrics = get_metrics_collector()
+                metrics.record_llm_call(
+                    provider=self.provider,
+                    model=self.model,
+                    scope=scope,
+                    duration=duration,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                    success=True,
+                )
+
                 # Validate against Pydantic model or return raw JSON
                 if skip_validation:
-                    return json_data
+                    validated_result = json_data
                 else:
-                    return response_format.model_validate(json_data)
+                    validated_result = response_format.model_validate(json_data)
+
+                if return_usage:
+                    token_usage = TokenUsage(
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        total_tokens=total_tokens,
+                    )
+                    return validated_result, token_usage
+                return validated_result
 
             except httpx.HTTPStatusError as e:
                 last_exception = e
@@ -701,6 +1292,9 @@ class LLMProvider:
         max_backoff: float,
         skip_validation: bool,
         start_time: float,
+        scope: str = "memory",
+        return_usage: bool = False,
+        semaphore_wait_time: float = 0.0,
     ) -> Any:
         """Handle Gemini-specific API calls."""
         # Convert OpenAI-style messages to Gemini format
@@ -777,16 +1371,43 @@ class LLMProvider:
                 else:
                     result = content
 
-                # Log slow calls
+                # Record metrics and log slow calls
                 duration = time.time() - start_time
-                if duration > 10.0 and hasattr(response, "usage_metadata") and response.usage_metadata:
+                input_tokens = 0
+                output_tokens = 0
+                if hasattr(response, "usage_metadata") and response.usage_metadata:
                     usage = response.usage_metadata
+                    input_tokens = usage.prompt_token_count or 0
+                    output_tokens = usage.candidates_token_count or 0
+
+                # Record LLM metrics
+                metrics = get_metrics_collector()
+                metrics.record_llm_call(
+                    provider=self.provider,
+                    model=self.model,
+                    scope=scope,
+                    duration=duration,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                    success=True,
+                )
+
+                # Log slow calls
+                if duration > 10.0 and input_tokens > 0:
+                    wait_info = f", wait={semaphore_wait_time:.3f}s" if semaphore_wait_time > 0.1 else ""
                     logger.info(
-                        f"slow llm call: model={self.provider}/{self.model}, "
-                        f"input_tokens={usage.prompt_token_count}, output_tokens={usage.candidates_token_count}, "
-                        f"time={duration:.3f}s"
+                        f"slow llm call: scope={scope}, model={self.provider}/{self.model}, "
+                        f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
+                        f"time={duration:.3f}s{wait_info}"
                     )
 
+                if return_usage:
+                    token_usage = TokenUsage(
+                        input_tokens=input_tokens,
+                        output_tokens=output_tokens,
+                        total_tokens=input_tokens + output_tokens,
+                    )
+                    return result, token_usage
                 return result
 
             except json.JSONDecodeError as e:
@@ -828,6 +1449,61 @@ class LLMProvider:
             raise last_exception
         raise RuntimeError("Gemini call failed after all retries")
 
+    async def _call_mock(
+        self,
+        messages: list[dict[str, str]],
+        response_format: Any | None,
+        scope: str,
+        return_usage: bool,
+    ) -> Any:
+        """
+        Handle mock provider calls for testing.
+
+        Records the call and returns a configurable mock response.
+        """
+        # Record the call for test verification
+        call_record = {
+            "provider": self.provider,
+            "model": self.model,
+            "messages": messages,
+            "response_format": response_format.__name__
+            if response_format and hasattr(response_format, "__name__")
+            else str(response_format),
+            "scope": scope,
+        }
+        self._mock_calls.append(call_record)
+        logger.debug(f"Mock LLM call recorded: scope={scope}, model={self.model}")
+
+        # Return mock response
+        if self._mock_response is not None:
+            result = self._mock_response
+        elif response_format is not None:
+            # Try to create a minimal valid instance of the response format
+            try:
+                # For Pydantic models, try to create with minimal valid data
+                result = {"mock": True}
+            except Exception:
+                result = {"mock": True}
+        else:
+            result = "mock response"
+
+        if return_usage:
+            token_usage = TokenUsage(input_tokens=10, output_tokens=5, total_tokens=15)
+            return result, token_usage
+        return result
+
+    def set_mock_response(self, response: Any) -> None:
+        """Set the response to return from mock calls."""
+        self._mock_response = response
+
+    def get_mock_calls(self) -> list[dict]:
+        """Get the list of recorded mock calls."""
+        return self._mock_calls
+
+    def clear_mock_calls(self) -> None:
+        """Clear the recorded mock calls."""
+        self._mock_calls = []
+
     @classmethod
     def for_memory(cls) -> "LLMProvider":
         """Create provider for memory operations from environment variables."""