PyPI - hindsight-api - Versions diffs - 0.1.16__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

hindsight-api 0.1.16__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

hindsight_api/api/__init__.py +38 -14
hindsight_api/api/http.py +100 -9
hindsight_api/api/mcp.py +203 -52
hindsight_api/config.py +27 -0
hindsight_api/engine/interface.py +4 -0
hindsight_api/engine/llm_wrapper.py +275 -45
hindsight_api/engine/memory_engine.py +69 -16
hindsight_api/engine/response_models.py +7 -1
hindsight_api/engine/retain/entity_processing.py +37 -8
hindsight_api/engine/retain/fact_extraction.py +49 -6
hindsight_api/engine/retain/observation_regeneration.py +4 -2
hindsight_api/engine/retain/orchestrator.py +12 -1
hindsight_api/engine/retain/types.py +7 -0
hindsight_api/extensions/context.py +8 -1
hindsight_api/extensions/operation_validator.py +6 -4
hindsight_api/main.py +29 -1
hindsight_api/models.py +3 -0
{hindsight_api-0.1.16.dist-info → hindsight_api-0.2.0.dist-info}/METADATA +3 -2
{hindsight_api-0.1.16.dist-info → hindsight_api-0.2.0.dist-info}/RECORD +21 -21
{hindsight_api-0.1.16.dist-info → hindsight_api-0.2.0.dist-info}/WHEEL +0 -0
{hindsight_api-0.1.16.dist-info → hindsight_api-0.2.0.dist-info}/entry_points.txt +0 -0

hindsight_api/engine/llm_wrapper.py CHANGED Viewed

@@ -6,6 +6,7 @@ import asyncio
 import json
 import logging
 import os
+import re
 import time
 from typing import Any
@@ -15,6 +16,13 @@ from google.genai import errors as genai_errors
 from google.genai import types as genai_types
 from openai import APIConnectionError, APIStatusError, AsyncOpenAI, LengthFinishReasonError
+from ..config import (
+    DEFAULT_LLM_MAX_CONCURRENT,
+    DEFAULT_LLM_TIMEOUT,
+    ENV_LLM_MAX_CONCURRENT,
+    ENV_LLM_TIMEOUT,
+)
 # Seed applied to every Groq request for deterministic behavior.
 DEFAULT_LLM_SEED = 4242
@@ -24,7 +32,9 @@ logger = logging.getLogger(__name__)
 logging.getLogger("httpx").setLevel(logging.WARNING)
 # Global semaphore to limit concurrent LLM requests across all instances
-_global_llm_semaphore = asyncio.Semaphore(32)
+# Set HINDSIGHT_API_LLM_MAX_CONCURRENT=1 for local LLMs (LM Studio, Ollama)
+_llm_max_concurrent = int(os.getenv(ENV_LLM_MAX_CONCURRENT, str(DEFAULT_LLM_MAX_CONCURRENT)))
+_global_llm_semaphore = asyncio.Semaphore(_llm_max_concurrent)
 class OutputTooLongError(Exception):
@@ -58,7 +68,7 @@ class LLMProvider:
         Initialize LLM provider.
         Args:
-            provider: Provider name ("openai", "groq", "ollama", "gemini").
+            provider: Provider name ("openai", "groq", "ollama", "gemini", "anthropic", "lmstudio").
             api_key: API key.
             base_url: Base URL for the API.
             model: Model name.
@@ -71,7 +81,7 @@ class LLMProvider:
         self.reasoning_effort = reasoning_effort
         # Validate provider
-        valid_providers = ["openai", "groq", "ollama", "gemini"]
+        valid_providers = ["openai", "groq", "ollama", "gemini", "anthropic", "lmstudio"]
         if self.provider not in valid_providers:
             raise ValueError(f"Invalid LLM provider: {self.provider}. Must be one of: {', '.join(valid_providers)}")
@@ -81,25 +91,48 @@ class LLMProvider:
                 self.base_url = "https://api.groq.com/openai/v1"
             elif self.provider == "ollama":
                 self.base_url = "http://localhost:11434/v1"
+            elif self.provider == "lmstudio":
+                self.base_url = "http://localhost:1234/v1"
-        # Validate API key (not needed for ollama)
-        if self.provider != "ollama" and not self.api_key:
+        # Validate API key (not needed for ollama or lmstudio)
+        if self.provider not in ("ollama", "lmstudio") and not self.api_key:
             raise ValueError(f"API key not found for {self.provider}")
+        # Get timeout config (set HINDSIGHT_API_LLM_TIMEOUT for local LLMs that need longer timeouts)
+        self.timeout = float(os.getenv(ENV_LLM_TIMEOUT, str(DEFAULT_LLM_TIMEOUT)))
         # Create client based on provider
+        self._client = None
+        self._gemini_client = None
+        self._anthropic_client = None
         if self.provider == "gemini":
             self._gemini_client = genai.Client(api_key=self.api_key)
-            self._client = None
-        elif self.provider == "ollama":
-            self._client = AsyncOpenAI(api_key="ollama", base_url=self.base_url, max_retries=0)
-            self._gemini_client = None
+        elif self.provider == "anthropic":
+            from anthropic import AsyncAnthropic
+            # Only pass base_url if it's set (Anthropic uses default URL otherwise)
+            anthropic_kwargs = {"api_key": self.api_key}
+            if self.base_url:
+                anthropic_kwargs["base_url"] = self.base_url
+            if self.timeout:
+                anthropic_kwargs["timeout"] = self.timeout
+            self._anthropic_client = AsyncAnthropic(**anthropic_kwargs)
+        elif self.provider in ("ollama", "lmstudio"):
+            # Use dummy key if not provided for local
+            api_key = self.api_key or "local"
+            client_kwargs = {"api_key": api_key, "base_url": self.base_url, "max_retries": 0}
+            if self.timeout:
+                client_kwargs["timeout"] = self.timeout
+            self._client = AsyncOpenAI(**client_kwargs)
         else:
             # Only pass base_url if it's set (OpenAI uses default URL otherwise)
             client_kwargs = {"api_key": self.api_key, "max_retries": 0}
             if self.base_url:
                 client_kwargs["base_url"] = self.base_url
-            self._client = AsyncOpenAI(**client_kwargs)  # type: ignore[invalid-argument-type] - dict kwargs
-            self._gemini_client = None
+            if self.timeout:
+                client_kwargs["timeout"] = self.timeout
+            self._client = AsyncOpenAI(**client_kwargs)
     async def verify_connection(self) -> None:
         """
@@ -135,6 +168,7 @@ class LLMProvider:
         initial_backoff: float = 1.0,
         max_backoff: float = 60.0,
         skip_validation: bool = False,
+        strict_schema: bool = False,
     ) -> Any:
         """
         Make an LLM API call with retry logic.
@@ -149,6 +183,7 @@ class LLMProvider:
             initial_backoff: Initial backoff time in seconds.
             max_backoff: Maximum backoff time in seconds.
             skip_validation: Return raw JSON without Pydantic validation.
+            strict_schema: Use strict JSON schema enforcement (OpenAI only). Guarantees all required fields.
         Returns:
             Parsed response if response_format is provided, otherwise text content.
@@ -166,6 +201,19 @@ class LLMProvider:
                     messages, response_format, max_retries, initial_backoff, max_backoff, skip_validation, start_time
                 )
+            # Handle Anthropic provider separately
+            if self.provider == "anthropic":
+                return await self._call_anthropic(
+                    messages,
+                    response_format,
+                    max_completion_tokens,
+                    max_retries,
+                    initial_backoff,
+                    max_backoff,
+                    skip_validation,
+                    start_time,
+                )
             # Handle Ollama with native API for structured output (better schema enforcement)
             if self.provider == "ollama" and response_format is not None:
                 return await self._call_ollama_native(
@@ -226,47 +274,93 @@ class LLMProvider:
             for attempt in range(max_retries + 1):
                 try:
                     if response_format is not None:
-                        # Add schema to system message for JSON mode
+                        schema = None
                         if hasattr(response_format, "model_json_schema"):
                             schema = response_format.model_json_schema()
-                            schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
-                            if call_params["messages"] and call_params["messages"][0].get("role") == "system":
-                                call_params["messages"][0]["content"] += schema_msg
-                            elif call_params["messages"]:
-                                call_params["messages"][0]["content"] = (
-                                    schema_msg + "\n\n" + call_params["messages"][0]["content"]
-                                )
-                        call_params["response_format"] = {"type": "json_object"}
+                        if strict_schema and schema is not None:
+                            # Use OpenAI's strict JSON schema enforcement
+                            # This guarantees all required fields are returned
+                            call_params["response_format"] = {
+                                "type": "json_schema",
+                                "json_schema": {
+                                    "name": "response",
+                                    "strict": True,
+                                    "schema": schema,
+                                },
+                            }
+                        else:
+                            # Soft enforcement: add schema to prompt and use json_object mode
+                            if schema is not None:
+                                schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
+                                if call_params["messages"] and call_params["messages"][0].get("role") == "system":
+                                    call_params["messages"][0]["content"] += schema_msg
+                                elif call_params["messages"]:
+                                    call_params["messages"][0]["content"] = (
+                                        schema_msg + "\n\n" + call_params["messages"][0]["content"]
+                                    )
+                            if self.provider not in ("lmstudio", "ollama"):
+                                # LM Studio and Ollama don't support json_object response format reliably
+                                # We rely on the schema in the system message instead
+                                call_params["response_format"] = {"type": "json_object"}
+                        logger.debug(f"Sending request to {self.provider}/{self.model} (timeout={self.timeout})")
                         response = await self._client.chat.completions.create(**call_params)
+                        logger.debug(f"Received response from {self.provider}/{self.model}")
                         content = response.choices[0].message.content
-                        # Log raw LLM response for debugging JSON parse issues
-                        try:
-                            json_data = json.loads(content)
-                        except json.JSONDecodeError as json_err:
-                            # Truncate content for logging (first 500 and last 200 chars)
-                            content_preview = content[:500] if content else "<empty>"
-                            if content and len(content) > 700:
-                                content_preview = f"{content[:500]}...TRUNCATED...{content[-200:]}"
-                            logger.warning(
-                                f"JSON parse error from LLM response (attempt {attempt + 1}/{max_retries + 1}): {json_err}\n"
-                                f"  Model: {self.provider}/{self.model}\n"
-                                f"  Content length: {len(content) if content else 0} chars\n"
-                                f"  Content preview: {content_preview!r}\n"
-                                f"  Finish reason: {response.choices[0].finish_reason if response.choices else 'unknown'}"
-                            )
-                            # Retry on JSON parse errors - LLM may return valid JSON on next attempt
-                            if attempt < max_retries:
-                                backoff = min(initial_backoff * (2**attempt), max_backoff)
-                                await asyncio.sleep(backoff)
-                                last_exception = json_err
-                                continue
-                            else:
-                                logger.error(f"JSON parse error after {max_retries + 1} attempts, giving up")
-                                raise
+                        # Strip reasoning model thinking tags
+                        # Supports: <think>, <thinking>, <reasoning>, |startthink|/|endthink|
+                        # for reasoning models that embed thinking in their output (e.g., Qwen3, DeepSeek)
+                        if content:
+                            original_len = len(content)
+                            content = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL)
+                            content = re.sub(r"<thinking>.*?</thinking>", "", content, flags=re.DOTALL)
+                            content = re.sub(r"<reasoning>.*?</reasoning>", "", content, flags=re.DOTALL)
+                            content = re.sub(r"\|startthink\|.*?\|endthink\|", "", content, flags=re.DOTALL)
+                            content = content.strip()
+                            if len(content) < original_len:
+                                logger.debug(f"Stripped {original_len - len(content)} chars of reasoning tokens")
+                        # For local models, they may wrap JSON in markdown code blocks
+                        if self.provider in ("lmstudio", "ollama"):
+                            clean_content = content
+                            if "```json" in content:
+                                clean_content = content.split("```json")[1].split("```")[0].strip()
+                            elif "```" in content:
+                                clean_content = content.split("```")[1].split("```")[0].strip()
+                            try:
+                                json_data = json.loads(clean_content)
+                            except json.JSONDecodeError:
+                                # Fallback to parsing raw content
+                                json_data = json.loads(content)
+                        else:
+                            # Log raw LLM response for debugging JSON parse issues
+                            try:
+                                json_data = json.loads(content)
+                            except json.JSONDecodeError as json_err:
+                                # Truncate content for logging (first 500 and last 200 chars)
+                                content_preview = content[:500] if content else "<empty>"
+                                if content and len(content) > 700:
+                                    content_preview = f"{content[:500]}...TRUNCATED...{content[-200:]}"
+                                logger.warning(
+                                    f"JSON parse error from LLM response (attempt {attempt + 1}/{max_retries + 1}): {json_err}\n"
+                                    f"  Model: {self.provider}/{self.model}\n"
+                                    f"  Content length: {len(content) if content else 0} chars\n"
+                                    f"  Content preview: {content_preview!r}\n"
+                                    f"  Finish reason: {response.choices[0].finish_reason if response.choices else 'unknown'}"
+                                )
+                                # Retry on JSON parse errors - LLM may return valid JSON on next attempt
+                                if attempt < max_retries:
+                                    backoff = min(initial_backoff * (2**attempt), max_backoff)
+                                    await asyncio.sleep(backoff)
+                                    last_exception = json_err
+                                    continue
+                                else:
+                                    logger.error(f"JSON parse error after {max_retries + 1} attempts, giving up")
+                                    raise
                         if skip_validation:
                             result = json_data
@@ -339,6 +433,142 @@ class LLMProvider:
                 raise last_exception
             raise RuntimeError("LLM call failed after all retries with no exception captured")
+    async def _call_anthropic(
+        self,
+        messages: list[dict[str, str]],
+        response_format: Any | None,
+        max_completion_tokens: int | None,
+        max_retries: int,
+        initial_backoff: float,
+        max_backoff: float,
+        skip_validation: bool,
+        start_time: float,
+    ) -> Any:
+        """Handle Anthropic-specific API calls."""
+        from anthropic import APIConnectionError, APIStatusError, RateLimitError
+        # Convert OpenAI-style messages to Anthropic format
+        system_prompt = None
+        anthropic_messages = []
+        for msg in messages:
+            role = msg.get("role", "user")
+            content = msg.get("content", "")
+            if role == "system":
+                if system_prompt:
+                    system_prompt += "\n\n" + content
+                else:
+                    system_prompt = content
+            else:
+                anthropic_messages.append({"role": role, "content": content})
+        # Add JSON schema instruction if response_format is provided
+        if response_format is not None and hasattr(response_format, "model_json_schema"):
+            schema = response_format.model_json_schema()
+            schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
+            if system_prompt:
+                system_prompt += schema_msg
+            else:
+                system_prompt = schema_msg
+        # Prepare parameters
+        call_params = {
+            "model": self.model,
+            "messages": anthropic_messages,
+            "max_tokens": max_completion_tokens if max_completion_tokens is not None else 4096,
+        }
+        if system_prompt:
+            call_params["system"] = system_prompt
+        last_exception = None
+        for attempt in range(max_retries + 1):
+            try:
+                response = await self._anthropic_client.messages.create(**call_params)
+                # Anthropic response content is a list of blocks
+                content = ""
+                for block in response.content:
+                    if block.type == "text":
+                        content += block.text
+                if response_format is not None:
+                    # Models may wrap JSON in markdown code blocks
+                    clean_content = content
+                    if "```json" in content:
+                        clean_content = content.split("```json")[1].split("```")[0].strip()
+                    elif "```" in content:
+                        clean_content = content.split("```")[1].split("```")[0].strip()
+                    try:
+                        json_data = json.loads(clean_content)
+                    except json.JSONDecodeError:
+                        # Fallback to parsing raw content if markdown stripping failed
+                        json_data = json.loads(content)
+                    if skip_validation:
+                        result = json_data
+                    else:
+                        result = response_format.model_validate(json_data)
+                else:
+                    result = content
+                # Log slow calls
+                duration = time.time() - start_time
+                if duration > 10.0:
+                    input_tokens = response.usage.input_tokens
+                    output_tokens = response.usage.output_tokens
+                    logger.info(
+                        f"slow llm call: model={self.provider}/{self.model}, "
+                        f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
+                        f"time={duration:.3f}s"
+                    )
+                return result
+            except json.JSONDecodeError as e:
+                last_exception = e
+                if attempt < max_retries:
+                    logger.warning("Anthropic returned invalid JSON, retrying...")
+                    backoff = min(initial_backoff * (2**attempt), max_backoff)
+                    await asyncio.sleep(backoff)
+                    continue
+                else:
+                    logger.error(f"Anthropic returned invalid JSON after {max_retries + 1} attempts")
+                    raise
+            except (APIConnectionError, RateLimitError, APIStatusError) as e:
+                # Fast fail on 401/403
+                if isinstance(e, APIStatusError) and e.status_code in (401, 403):
+                    logger.error(f"Anthropic auth error (HTTP {e.status_code}), not retrying: {str(e)}")
+                    raise
+                last_exception = e
+                if attempt < max_retries:
+                    # Check if it's a rate limit or server error
+                    should_retry = isinstance(e, (APIConnectionError, RateLimitError)) or (
+                        isinstance(e, APIStatusError) and e.status_code >= 500
+                    )
+                    if should_retry:
+                        backoff = min(initial_backoff * (2**attempt), max_backoff)
+                        jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
+                        await asyncio.sleep(backoff + jitter)
+                        continue
+                logger.error(f"Anthropic API error after {max_retries + 1} attempts: {str(e)}")
+                raise
+            except Exception as e:
+                logger.error(f"Unexpected error during Anthropic call: {type(e).__name__}: {str(e)}")
+                raise
+        if last_exception:
+            raise last_exception
+        raise RuntimeError("Anthropic call failed after all retries")
     async def _call_ollama_native(
         self,
         messages: list[dict[str, str]],

hindsight_api/engine/memory_engine.py CHANGED Viewed

@@ -17,6 +17,8 @@ import uuid
 from datetime import UTC, datetime, timedelta
 from typing import TYPE_CHECKING, Any
+from ..config import get_config
 # Context variable for current schema (async-safe, per-task isolation)
 _current_schema: contextvars.ContextVar[str] = contextvars.ContextVar("current_schema", default="public")
@@ -372,7 +374,7 @@ class MemoryEngine(MemoryEngineInterface):
         result = await validation_coro
         if not result.allowed:
-            raise OperationValidationError(result.reason or "Operation not allowed")
+            raise OperationValidationError(result.reason or "Operation not allowed", result.status_code)
     async def _authenticate_tenant(self, request_context: "RequestContext | None") -> str:
         """
@@ -399,7 +401,9 @@ class MemoryEngine(MemoryEngineInterface):
         if request_context is None:
             raise AuthenticationError("RequestContext is required when tenant extension is configured")
+        # Let AuthenticationError propagate - HTTP layer will convert to 401
         tenant_context = await self._tenant_extension.authenticate(request_context)
         _current_schema.set(tenant_context.schema_name)
         return tenant_context.schema_name
@@ -2825,13 +2829,16 @@ Guidelines:
         Handler for form opinion tasks.
         Args:
-            task_dict: Dict with keys: 'bank_id', 'answer_text', 'query'
+            task_dict: Dict with keys: 'bank_id', 'answer_text', 'query', 'tenant_id'
         """
         bank_id = task_dict["bank_id"]
         answer_text = task_dict["answer_text"]
         query = task_dict["query"]
+        tenant_id = task_dict.get("tenant_id")
-        await self._extract_and_store_opinions_async(bank_id=bank_id, answer_text=answer_text, query=query)
+        await self._extract_and_store_opinions_async(
+            bank_id=bank_id, answer_text=answer_text, query=query, tenant_id=tenant_id
+        )
     async def _handle_reinforce_opinion(self, task_dict: dict[str, Any]):
         """
@@ -3076,6 +3083,8 @@ Guidelines:
         *,
         budget: Budget | None = None,
         context: str | None = None,
+        max_tokens: int = 4096,
+        response_schema: dict | None = None,
         request_context: "RequestContext",
     ) -> ReflectResult:
         """
@@ -3087,19 +3096,22 @@ Guidelines:
         3. Retrieves existing opinions (bank's formed perspectives)
         4. Uses LLM to formulate an answer
         5. Extracts and stores any new opinions formed during reflection
-        6. Returns plain text answer and the facts used
+        6. Optionally generates structured output based on response_schema
+        7. Returns plain text answer and the facts used
         Args:
             bank_id: bank identifier
             query: Question to answer
             budget: Budget level for memory exploration (low=100, mid=300, high=600 units)
             context: Additional context string to include in LLM prompt (not used in recall)
+            response_schema: Optional JSON Schema for structured output
         Returns:
             ReflectResult containing:
                 - text: Plain text answer (no markdown)
                 - based_on: Dict with 'world', 'experience', and 'opinion' fact lists (MemoryFact objects)
                 - new_opinions: List of newly formed opinions
+                - structured_output: Optional dict if response_schema was provided
         """
         # Use cached LLM config
         if self._llm_config is None:
@@ -3177,21 +3189,53 @@ Guidelines:
         log_buffer.append(f"[REFLECT {reflect_id}] Prompt: {len(prompt)} chars")
         system_message = think_utils.get_system_message(disposition)
+        messages = [{"role": "system", "content": system_message}, {"role": "user", "content": prompt}]
+        # Prepare response_format if schema provided
+        response_format = None
+        if response_schema is not None:
+            # Wrapper class to provide Pydantic-like interface for raw JSON schemas
+            class JsonSchemaWrapper:
+                def __init__(self, schema: dict):
+                    self._schema = schema
+                def model_json_schema(self):
+                    return self._schema
+            response_format = JsonSchemaWrapper(response_schema)
         llm_start = time.time()
-        answer_text = await self._llm_config.call(
-            messages=[{"role": "system", "content": system_message}, {"role": "user", "content": prompt}],
-            scope="memory_think",
-            temperature=0.9,
-            max_completion_tokens=1000,
+        result = await self._llm_config.call(
+            messages=messages,
+            scope="memory_reflect",
+            max_completion_tokens=max_tokens,
+            response_format=response_format,
+            skip_validation=True if response_format else False,
+            # Don't enforce strict_schema - not all providers support it and may retry forever
+            # Soft enforcement (schema in prompt + json_object mode) is sufficient
+            strict_schema=False,
         )
         llm_time = time.time() - llm_start
-        answer_text = answer_text.strip()
+        # Handle response based on whether structured output was requested
+        if response_schema is not None:
+            structured_output = result
+            answer_text = ""  # Empty for backward compatibility
+            log_buffer.append(f"[REFLECT {reflect_id}] Structured output generated")
+        else:
+            structured_output = None
+            answer_text = result.strip()
         # Submit form_opinion task for background processing
+        # Pass tenant_id from request context for internal authentication in background task
         await self._task_backend.submit_task(
-            {"type": "form_opinion", "bank_id": bank_id, "answer_text": answer_text, "query": query}
+            {
+                "type": "form_opinion",
+                "bank_id": bank_id,
+                "answer_text": answer_text,
+                "query": query,
+                "tenant_id": getattr(request_context, "tenant_id", None) if request_context else None,
+            }
         )
         total_time = time.time() - reflect_start
@@ -3205,6 +3249,7 @@ Guidelines:
             text=answer_text,
             based_on={"world": world_results, "experience": agent_results, "opinion": opinion_results},
             new_opinions=[],  # Opinions are being extracted asynchronously
+            structured_output=structured_output,
         )
         # Call post-operation hook if validator is configured
@@ -3228,7 +3273,9 @@ Guidelines:
         return result
-    async def _extract_and_store_opinions_async(self, bank_id: str, answer_text: str, query: str):
+    async def _extract_and_store_opinions_async(
+        self, bank_id: str, answer_text: str, query: str, tenant_id: str | None = None
+    ):
         """
         Background task to extract and store opinions from think response.
@@ -3238,6 +3285,7 @@ Guidelines:
             bank_id: bank IDentifier
             answer_text: The generated answer text
             query: The original query
+            tenant_id: Tenant identifier for internal authentication
         """
         try:
             # Extract opinions from the answer
@@ -3248,10 +3296,11 @@ Guidelines:
                 from datetime import datetime
                 current_time = datetime.now(UTC)
-                # Use internal request context for background tasks
+                # Use internal context with tenant_id for background authentication
+                # Extension can check internal=True to bypass normal auth
                 from hindsight_api.models import RequestContext
-                internal_context = RequestContext()
+                internal_context = RequestContext(tenant_id=tenant_id, internal=True)
                 for opinion in new_opinions:
                     await self.retain_async(
                         bank_id=bank_id,
@@ -3572,7 +3621,7 @@ Guidelines:
         self,
         bank_id: str,
         entity_ids: list[str],
-        min_facts: int = 5,
+        min_facts: int | None = None,
         conn=None,
         request_context: "RequestContext | None" = None,
     ) -> None:
@@ -3584,12 +3633,16 @@ Guidelines:
         Args:
             bank_id: Bank identifier
             entity_ids: List of entity IDs to process
-            min_facts: Minimum facts required to regenerate observations
+            min_facts: Minimum facts required to regenerate observations (uses config default if None)
             conn: Optional database connection (for transactional atomicity)
         """
         if not bank_id or not entity_ids:
             return
+        # Use config default if min_facts not specified
+        if min_facts is None:
+            min_facts = get_config().observation_min_facts
         # Convert to UUIDs
         entity_uuids = [uuid.UUID(eid) if isinstance(eid, str) else eid for eid in entity_ids]

hindsight_api/engine/response_models.py CHANGED Viewed

@@ -123,7 +123,8 @@ class ReflectResult(BaseModel):
     Result from a reflect operation.
     Contains the formulated answer, the facts it was based on (organized by type),
-    and any new opinions that were formed during the reflection process.
+    any new opinions that were formed during the reflection process, and optionally
+    structured output if a response schema was provided.
     """
     model_config = ConfigDict(
@@ -145,6 +146,7 @@ class ReflectResult(BaseModel):
                     "opinion": [],
                 },
                 "new_opinions": ["Machine learning has great potential in healthcare"],
+                "structured_output": {"summary": "ML in healthcare", "confidence": 0.9},
             }
         }
     )
@@ -154,6 +156,10 @@ class ReflectResult(BaseModel):
         description="Facts used to formulate the answer, organized by type (world, experience, opinion)"
     )
     new_opinions: list[str] = Field(default_factory=list, description="List of newly formed opinions during reflection")
+    structured_output: dict[str, Any] | None = Field(
+        default=None,
+        description="Structured output parsed according to the provided response schema. Only present when response_schema was provided.",
+    )
 class Opinion(BaseModel):

hindsight-api 0.1.16__py3-none-any.whl → 0.2.0__py3-none-any.whl

hindsight-api 0.1.16__py3-none-any.whl → 0.2.0__py3-none-any.whl