hindsight-api 0.0.21__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,18 +17,17 @@ def create_app(
     http_api_enabled: bool = True,
     mcp_api_enabled: bool = False,
     mcp_mount_path: str = "/mcp",
-    run_migrations: bool = True,
     initialize_memory: bool = True
 ) -> FastAPI:
     """
     Create and configure the unified Hindsight API application.
 
     Args:
-        memory: MemoryEngine instance (already initialized with required parameters)
+        memory: MemoryEngine instance (already initialized with required parameters).
+            Migrations are controlled by the MemoryEngine's run_migrations parameter.
         http_api_enabled: Whether to enable HTTP REST API endpoints (default: True)
         mcp_api_enabled: Whether to enable MCP server (default: False)
         mcp_mount_path: Path to mount MCP server (default: /mcp)
-        run_migrations: Whether to run database migrations on startup (default: True)
         initialize_memory: Whether to initialize memory system on startup (default: True)
 
     Returns:
@@ -50,7 +49,6 @@ def create_app(
         from .http import create_app as create_http_app
         app = create_http_app(
             memory=memory,
-            run_migrations=run_migrations,
             initialize_memory=initialize_memory
         )
         logger.info("HTTP REST API enabled")
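For callers upgrading from 0.0.21, migration control moves from create_app to the engine itself. A minimal sketch of the new wiring, assuming the MemoryEngine constructor accepts a run_migrations flag as the updated docstring describes (the db_url keyword shown here is illustrative, not a verified signature):

    from hindsight_api import MemoryEngine

    # Migrations are now requested on the engine, not on create_app (assumed keyword).
    memory = MemoryEngine(db_url="postgresql://localhost/hindsight", run_migrations=True)

    app = create_app(
        memory=memory,
        http_api_enabled=True,
        mcp_api_enabled=False,
        mcp_mount_path="/mcp",
        initialize_memory=True,
    )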
hindsight_api/api/http.py CHANGED
@@ -36,27 +36,13 @@ from pydantic import BaseModel, Field, ConfigDict
 from hindsight_api import MemoryEngine
 from hindsight_api.engine.memory_engine import Budget
 from hindsight_api.engine.db_utils import acquire_with_retry
+from hindsight_api.engine.response_models import VALID_RECALL_FACT_TYPES
 from hindsight_api.metrics import get_metrics_collector, initialize_metrics, create_metrics_collector
 
 
 logger = logging.getLogger(__name__)
 
 
-class MetadataFilter(BaseModel):
-    """Filter for metadata fields. Matches records where (key=value) OR (key not set) when match_unset=True."""
-    model_config = ConfigDict(json_schema_extra={
-        "example": {
-            "key": "source",
-            "value": "slack",
-            "match_unset": True
-        }
-    })
-
-    key: str = Field(description="Metadata key to filter on")
-    value: Optional[str] = Field(default=None, description="Value to match. If None with match_unset=True, matches any record where key is not set.")
-    match_unset: bool = Field(default=True, description="If True, also match records where this metadata key is not set")
-
-
 class EntityIncludeOptions(BaseModel):
     """Options for including entity observations in recall results."""
     max_tokens: int = Field(default=500, description="Maximum tokens for entity observations")
@@ -89,7 +75,6 @@ class RecallRequest(BaseModel):
             "max_tokens": 4096,
             "trace": True,
             "query_timestamp": "2023-05-30T23:40:00",
-            "filters": [{"key": "source", "value": "slack", "match_unset": True}],
             "include": {
                 "entities": {
                     "max_tokens": 500
@@ -104,7 +89,6 @@ class RecallRequest(BaseModel):
     max_tokens: int = 4096
     trace: bool = False
     query_timestamp: Optional[str] = Field(default=None, description="ISO format date string (e.g., '2023-05-30T23:40:00')")
-    filters: Optional[List[MetadataFilter]] = Field(default=None, description="Filter by metadata. Multiple filters are ANDed together.")
    include: IncludeOptions = Field(default_factory=IncludeOptions, description="Options for including additional data (entities are included by default)")
 
 
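Under the 0.1.0 schema the documented recall example therefore loses its filters key; a request body built only from fields shown in this hunk looks like (values are the documented example values, not defaults):

    {
        "max_tokens": 4096,
        "trace": true,
        "query_timestamp": "2023-05-30T23:40:00",
        "include": {
            "entities": {"max_tokens": 500}
        }
    }

Any remaining required fields of RecallRequest (such as the query itself) are untouched by this hunk and omitted here.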
@@ -362,7 +346,6 @@ class ReflectRequest(BaseModel):
             "query": "What do you think about artificial intelligence?",
             "budget": "low",
             "context": "This is for a research paper on AI ethics",
-            "filters": [{"key": "source", "value": "slack", "match_unset": True}],
             "include": {
                 "facts": {}
             }
@@ -372,7 +355,6 @@ class ReflectRequest(BaseModel):
     query: str
     budget: Budget = Budget.LOW
     context: Optional[str] = None
-    filters: Optional[List[MetadataFilter]] = Field(default=None, description="Filter by metadata. Multiple filters are ANDed together.")
     include: ReflectIncludeOptions = Field(default_factory=ReflectIncludeOptions, description="Options for including additional data (disabled by default)")
 
 
@@ -439,24 +421,18 @@ class BanksResponse(BaseModel):
 
 
 class DispositionTraits(BaseModel):
-    """Disposition traits based on Big Five model."""
+    """Disposition traits that influence how memories are formed and interpreted."""
     model_config = ConfigDict(json_schema_extra={
         "example": {
-            "openness": 0.8,
-            "conscientiousness": 0.6,
-            "extraversion": 0.5,
-            "agreeableness": 0.7,
-            "neuroticism": 0.3,
-            "bias_strength": 0.7
+            "skepticism": 3,
+            "literalism": 3,
+            "empathy": 3
         }
     })
 
-    openness: float = Field(ge=0.0, le=1.0, description="Openness to experience (0-1)")
-    conscientiousness: float = Field(ge=0.0, le=1.0, description="Conscientiousness (0-1)")
-    extraversion: float = Field(ge=0.0, le=1.0, description="Extraversion (0-1)")
-    agreeableness: float = Field(ge=0.0, le=1.0, description="Agreeableness (0-1)")
-    neuroticism: float = Field(ge=0.0, le=1.0, description="Neuroticism (0-1)")
-    bias_strength: float = Field(ge=0.0, le=1.0, description="How strongly disposition influences opinions (0-1)")
+    skepticism: int = Field(ge=1, le=5, description="How skeptical vs trusting (1=trusting, 5=skeptical)")
+    literalism: int = Field(ge=1, le=5, description="How literally to interpret information (1=flexible, 5=literal)")
+    empathy: int = Field(ge=1, le=5, description="How much to consider emotional context (1=detached, 5=empathetic)")
 
 
 class BankProfileResponse(BaseModel):
@@ -466,12 +442,9 @@ class BankProfileResponse(BaseModel):
             "bank_id": "user123",
             "name": "Alice",
             "disposition": {
-                "openness": 0.8,
-                "conscientiousness": 0.6,
-                "extraversion": 0.5,
-                "agreeableness": 0.7,
-                "neuroticism": 0.3,
-                "bias_strength": 0.7
+                "skepticism": 3,
+                "literalism": 3,
+                "empathy": 3
             },
             "background": "I am a software engineer with 10 years of experience in startups"
         }
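A DispositionTraits payload in 0.1.0 is therefore three integers on a 1-5 scale rather than six floats on a 0-1 scale; an illustrative (non-default) example:

    {
        "skepticism": 4,
        "literalism": 2,
        "empathy": 5
    }

Clients still sending openness/conscientiousness/extraversion/agreeableness/neuroticism/bias_strength no longer match the documented schema.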
@@ -500,7 +473,7 @@ class AddBackgroundRequest(BaseModel):
     content: str = Field(description="New background information to add or merge")
     update_disposition: bool = Field(
         default=True,
-        description="If true, infer Big Five disposition traits from the merged background (default: true)"
+        description="If true, infer disposition traits from the merged background (default: true)"
     )
 
 
@@ -510,12 +483,9 @@ class BackgroundResponse(BaseModel):
         "example": {
             "background": "I was born in Texas. I am a software engineer with 10 years of experience.",
             "disposition": {
-                "openness": 0.7,
-                "conscientiousness": 0.6,
-                "extraversion": 0.5,
-                "agreeableness": 0.8,
-                "neuroticism": 0.4,
-                "bias_strength": 0.6
+                "skepticism": 3,
+                "literalism": 3,
+                "empathy": 3
             }
         }
     })
@@ -543,12 +513,9 @@ class BankListResponse(BaseModel):
             "bank_id": "user123",
             "name": "Alice",
             "disposition": {
-                "openness": 0.5,
-                "conscientiousness": 0.5,
-                "extraversion": 0.5,
-                "agreeableness": 0.5,
-                "neuroticism": 0.5,
-                "bias_strength": 0.5
+                "skepticism": 3,
+                "literalism": 3,
+                "empathy": 3
             },
             "background": "I am a software engineer",
             "created_at": "2024-01-15T10:30:00Z",
@@ -567,12 +534,9 @@ class CreateBankRequest(BaseModel):
         "example": {
             "name": "Alice",
             "disposition": {
-                "openness": 0.8,
-                "conscientiousness": 0.6,
-                "extraversion": 0.5,
-                "agreeableness": 0.7,
-                "neuroticism": 0.3,
-                "bias_strength": 0.7
+                "skepticism": 3,
+                "literalism": 3,
+                "empathy": 3
             },
             "background": "I am a creative software engineer with 10 years of experience"
         }
@@ -715,13 +679,13 @@ class DeleteResponse(BaseModel):
     success: bool
 
 
-def create_app(memory: MemoryEngine, run_migrations: bool = True, initialize_memory: bool = True) -> FastAPI:
+def create_app(memory: MemoryEngine, initialize_memory: bool = True) -> FastAPI:
     """
     Create and configure the FastAPI application.
 
     Args:
-        memory: MemoryEngine instance (already initialized with required parameters)
-        run_migrations: Whether to run database migrations on startup (default: True)
+        memory: MemoryEngine instance (already initialized with required parameters).
+            Migrations are controlled by the MemoryEngine's run_migrations parameter.
         initialize_memory: Whether to initialize memory system on startup (default: True)
 
     Returns:
@@ -752,16 +716,11 @@ def create_app(memory: MemoryEngine, run_migrations: bool = True, initialize_mem
         app.state.prometheus_reader = None
         # Metrics collector is already initialized as no-op by default
 
-        # Startup: Initialize database and memory system
+        # Startup: Initialize database and memory system (migrations run inside initialize if enabled)
         if initialize_memory:
             await memory.initialize()
             logging.info("Memory system initialized")
 
-        if run_migrations:
-            from hindsight_api.migrations import run_migrations as do_migrations
-            do_migrations(memory.db_url)
-            logging.info("Database migrations applied")
-
 
 
         yield
@@ -913,17 +872,8 @@ def _register_routes(app: FastAPI):
         metrics = get_metrics_collector()
 
         try:
-            # Validate types
-            valid_fact_types = ["world", "experience", "opinion"]
-
             # Default to world, experience, opinion if not specified (exclude observation by default)
-            fact_types = request.types if request.types else ["world", "experience", "opinion"]
-            for ft in fact_types:
-                if ft not in valid_fact_types:
-                    raise HTTPException(
-                        status_code=400,
-                        detail=f"Invalid type '{ft}'. Must be one of: {', '.join(valid_fact_types)}"
-                    )
+            fact_types = request.types if request.types else list(VALID_RECALL_FACT_TYPES)
 
             # Parse query_timestamp if provided
             question_date = None
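A sketch of what the shared constant presumably provides, assuming VALID_RECALL_FACT_TYPES mirrors the literal list it replaces (the exact container type in hindsight_api.engine.response_models is not shown in this diff):

    # Assumed contents; "observation" remains excluded from recall defaults.
    VALID_RECALL_FACT_TYPES = ("world", "experience", "opinion")

    # Route default when request.types is not provided:
    fact_types = list(VALID_RECALL_FACT_TYPES)  # ["world", "experience", "opinion"]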
@@ -1605,7 +1555,7 @@ This operation cannot be undone.
         "/v1/default/banks/{bank_id}/profile",
         response_model=BankProfileResponse,
         summary="Update memory bank disposition",
-        description="Update bank's Big Five disposition traits and bias strength",
+        description="Update bank's disposition traits (skepticism, literalism, empathy)",
         operation_id="update_bank_disposition"
     )
     async def api_update_bank_disposition(bank_id: str,
@@ -1852,7 +1802,7 @@ This operation cannot be undone.
         "/v1/default/banks/{bank_id}/memories",
         response_model=DeleteResponse,
         summary="Clear memory bank memories",
-        description="Delete memory units for a memory bank. Optionally filter by type (world, experience, opinion) to delete only specific types. This is a destructive operation that cannot be undone. The bank profile (personality and background) will be preserved.",
+        description="Delete memory units for a memory bank. Optionally filter by type (world, experience, opinion) to delete only specific types. This is a destructive operation that cannot be undone. The bank profile (disposition and background) will be preserved.",
         operation_id="clear_bank_memories"
     )
     async def api_clear_bank_memories(bank_id: str,
hindsight_api/api/mcp.py CHANGED
@@ -8,6 +8,7 @@ from typing import Optional
 
 from fastmcp import FastMCP
 from hindsight_api import MemoryEngine
+from hindsight_api.engine.response_models import VALID_RECALL_FACT_TYPES
 
 # Configure logging from HINDSIGHT_API_LOG_LEVEL environment variable
 _log_level_str = os.environ.get("HINDSIGHT_API_LOG_LEVEL", "info").lower()
@@ -90,7 +91,7 @@ def create_mcp_server(memory: MemoryEngine) -> FastMCP:
         search_result = await memory.recall_async(
             bank_id=bank_id,
             query=query,
-            fact_type=["world", "experience", "opinion"],
+            fact_type=list(VALID_RECALL_FACT_TYPES),
             budget=Budget.LOW
         )
 
hindsight_api/cli.py CHANGED
@@ -102,7 +102,6 @@ def main():
         http_api_enabled=True,
         mcp_api_enabled=True,
         mcp_mount_path="/mcp",
-        run_migrations=True,
         initialize_memory=True,
     )
 
@@ -78,7 +78,12 @@ class SentenceTransformersCrossEncoder(CrossEncoderModel):
             )
 
         logger.info(f"Loading cross-encoder model: {self.model_name}...")
-        self._model = CrossEncoder(self.model_name)
+        # Disable lazy loading (meta tensors) which causes issues with newer transformers/accelerate
+        # Setting low_cpu_mem_usage=False and device_map=None ensures tensors are fully materialized
+        self._model = CrossEncoder(
+            self.model_name,
+            model_kwargs={"low_cpu_mem_usage": False, "device_map": None},
+        )
         logger.info("Cross-encoder model loaded")
 
     def predict(self, pairs: List[Tuple[str, str]]) -> List[float]:
@@ -84,7 +84,12 @@ class SentenceTransformersEmbeddings(Embeddings):
             )
 
         logger.info(f"Loading embedding model: {self.model_name}...")
-        self._model = SentenceTransformer(self.model_name)
+        # Disable lazy loading (meta tensors) which causes issues with newer transformers/accelerate
+        # Setting low_cpu_mem_usage=False and device_map=None ensures tensors are fully materialized
+        self._model = SentenceTransformer(
+            self.model_name,
+            model_kwargs={"low_cpu_mem_usage": False, "device_map": None},
+        )
 
         # Validate dimension matches database schema
         model_dim = self._model.get_sentence_embedding_dimension()
@@ -126,18 +126,20 @@ class EntityResolver:
 
         # Resolve each entity using pre-fetched candidates
         entity_ids = [None] * len(entities_data)
-        entities_to_update = []  # (entity_id, unit_event_date)
-        entities_to_create = []  # (idx, entity_data)
+        entities_to_update = []  # (entity_id, event_date)
+        entities_to_create = []  # (idx, entity_data, event_date)
 
         for idx, entity_data in enumerate(entities_data):
             entity_text = entity_data['text']
             nearby_entities = entity_data.get('nearby_entities', [])
+            # Use per-entity date if available, otherwise fall back to batch-level date
+            entity_event_date = entity_data.get('event_date', unit_event_date)
 
             candidates = all_candidates.get(entity_text, [])
 
             if not candidates:
                 # Will create new entity
-                entities_to_create.append((idx, entity_data))
+                entities_to_create.append((idx, entity_data, entity_event_date))
                 continue
 
             # Score candidates
@@ -165,9 +167,9 @@ class EntityResolver:
                 score += co_entity_score * 0.3
 
             # 3. Temporal proximity (0-0.2)
-            if last_seen:
+            if last_seen and entity_event_date:
                 # Normalize timezone awareness for comparison
-                event_date_utc = unit_event_date if unit_event_date.tzinfo else unit_event_date.replace(tzinfo=timezone.utc)
+                event_date_utc = entity_event_date if entity_event_date.tzinfo else entity_event_date.replace(tzinfo=timezone.utc)
                 last_seen_utc = last_seen if last_seen.tzinfo else last_seen.replace(tzinfo=timezone.utc)
                 days_diff = abs((event_date_utc - last_seen_utc).total_seconds() / 86400)
                 if days_diff < 7:
@@ -183,9 +185,9 @@ class EntityResolver:
 
             if best_score > threshold:
                 entity_ids[idx] = best_candidate
-                entities_to_update.append((best_candidate, unit_event_date))
+                entities_to_update.append((best_candidate, entity_event_date))
             else:
-                entities_to_create.append((idx, entity_data))
+                entities_to_create.append((idx, entity_data, entity_event_date))
 
         # Batch update existing entities
         if entities_to_update:
@@ -199,29 +201,54 @@ class EntityResolver:
                 entities_to_update
             )
 
-        # Create new entities using INSERT ... ON CONFLICT to handle race conditions
-        # This ensures that if two concurrent transactions try to create the same entity,
-        # only one succeeds and the other gets the existing ID
+        # Batch create new entities using COPY + INSERT for maximum speed
+        # This handles duplicates via ON CONFLICT and returns all IDs
         if entities_to_create:
-            for idx, entity_data in entities_to_create:
-                # Use INSERT ... ON CONFLICT to atomically get-or-create
-                # The unique index is on (bank_id, LOWER(canonical_name))
-                row = await conn.fetchrow(
-                    """
-                    INSERT INTO entities (bank_id, canonical_name, first_seen, last_seen, mention_count)
-                    VALUES ($1, $2, $3, $4, 1)
-                    ON CONFLICT (bank_id, LOWER(canonical_name))
-                    DO UPDATE SET
-                        mention_count = entities.mention_count + 1,
-                        last_seen = EXCLUDED.last_seen
-                    RETURNING id
-                    """,
-                    bank_id,
-                    entity_data['text'],
-                    unit_event_date,
-                    unit_event_date
-                )
-                entity_ids[idx] = row['id']
+            # Group entities by canonical name (lowercase) to handle duplicates within batch
+            # For duplicates, we only insert once and reuse the ID
+            unique_entities = {}  # lowercase_name -> (entity_data, event_date, [indices])
+            for idx, entity_data, event_date in entities_to_create:
+                name_lower = entity_data['text'].lower()
+                if name_lower not in unique_entities:
+                    unique_entities[name_lower] = (entity_data, event_date, [idx])
+                else:
+                    # Same entity appears multiple times - add index to list
+                    unique_entities[name_lower][2].append(idx)
+
+            # Batch insert unique entities and get their IDs
+            # Use a single query with unnest for speed
+            entity_names = []
+            entity_dates = []
+            indices_map = []  # Maps result index -> list of original indices
+
+            for name_lower, (entity_data, event_date, indices) in unique_entities.items():
+                entity_names.append(entity_data['text'])
+                entity_dates.append(event_date)
+                indices_map.append(indices)
+
+            # Batch INSERT ... ON CONFLICT with RETURNING
+            # This is much faster than individual inserts
+            rows = await conn.fetch(
+                """
+                INSERT INTO entities (bank_id, canonical_name, first_seen, last_seen, mention_count)
+                SELECT $1, name, event_date, event_date, 1
+                FROM unnest($2::text[], $3::timestamptz[]) AS t(name, event_date)
+                ON CONFLICT (bank_id, LOWER(canonical_name))
+                DO UPDATE SET
+                    mention_count = entities.mention_count + 1,
+                    last_seen = EXCLUDED.last_seen
+                RETURNING id
+                """,
+                bank_id,
+                entity_names,
+                entity_dates
+            )
+
+            # Map returned IDs back to original indices
+            for result_idx, row in enumerate(rows):
+                entity_id = row['id']
+                for original_idx in indices_map[result_idx]:
+                    entity_ids[original_idx] = entity_id
 
         return entity_ids
 
@@ -5,12 +5,15 @@ import os
 import time
 import asyncio
 from typing import Optional, Any, Dict, List
-from openai import AsyncOpenAI, RateLimitError, APIError, APIStatusError, LengthFinishReasonError
+from openai import AsyncOpenAI, RateLimitError, APIError, APIStatusError, APIConnectionError, LengthFinishReasonError
 from google import genai
 from google.genai import types as genai_types
 from google.genai import errors as genai_errors
 import logging
 
+# Seed applied to every Groq request for deterministic behavior.
+DEFAULT_LLM_SEED = 4242
+
 logger = logging.getLogger(__name__)
 
 # Disable httpx logging
@@ -40,6 +43,7 @@ class LLMConfig:
         api_key: str,
         base_url: str,
         model: str,
+        reasoning_effort: str = "low",
     ):
         """
         Initialize LLM configuration.
@@ -54,6 +58,7 @@ class LLMConfig:
        self.api_key = api_key
        self.base_url = base_url
        self.model = model
+       self.reasoning_effort = reasoning_effort
 
        # Validate provider
        if self.provider not in ["openai", "groq", "ollama", "gemini"]:
@@ -136,10 +141,14 @@ class LLMConfig:
             "messages": messages,
             **kwargs
         }
+
+        if self.provider == "groq":
+            call_params["seed"] = DEFAULT_LLM_SEED
+
         if self.provider == "groq":
             call_params["extra_body"] = {
                 "service_tier": "auto",
-                "reasoning_effort": "low",  # Reduce reasoning overhead
+                "reasoning_effort": self.reasoning_effort,
                 "include_reasoning": False,  # Disable hidden reasoning tokens
             }
 
@@ -187,10 +196,15 @@ class LLMConfig:
             usage = response.usage
             if duration > 10.0:
                 ratio = max(1, usage.completion_tokens) / usage.prompt_tokens
+                # Check for cached tokens (OpenAI/Groq may include this)
+                cached_tokens = 0
+                if hasattr(usage, 'prompt_tokens_details') and usage.prompt_tokens_details:
+                    cached_tokens = getattr(usage.prompt_tokens_details, 'cached_tokens', 0) or 0
+                cache_info = f", cached_tokens={cached_tokens}" if cached_tokens > 0 else ""
                 logger.info(
                     f"slow llm call: model={self.provider}/{self.model}, "
                     f"input_tokens={usage.prompt_tokens}, output_tokens={usage.completion_tokens}, "
-                    f"total_tokens={usage.total_tokens}, time={duration:.3f}s, ratio out/in={ratio:.2f}"
+                    f"total_tokens={usage.total_tokens}{cache_info}, time={duration:.3f}s, ratio out/in={ratio:.2f}"
                 )
 
             return result
@@ -202,6 +216,18 @@ class LLMConfig:
                     f"LLM output exceeded token limits. Input may need to be split into smaller chunks."
                 ) from e
 
+            except APIConnectionError as e:
+                # Handle connection errors (server disconnected, network issues) with retry
+                last_exception = e
+                if attempt < max_retries:
+                    logger.warning(f"Connection error, retrying... (attempt {attempt + 1}/{max_retries + 1})")
+                    backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+                    await asyncio.sleep(backoff)
+                    continue
+                else:
+                    logger.error(f"Connection error after {max_retries + 1} attempts: {str(e)}")
+                    raise
+
             except APIStatusError as e:
                 last_exception = e
                 if attempt < max_retries:
@@ -238,7 +264,7 @@ class LLMConfig:
         skip_validation: bool,
         start_time: float,
         **kwargs
-    ) -> Any:
+    ) -> Any:
         """Handle Gemini-specific API calls using google-genai SDK."""
         import json
 
@@ -287,6 +313,8 @@ class LLMConfig:
             config_kwargs['max_output_tokens'] = kwargs['max_tokens']
         if response_format is not None:
             config_kwargs['response_mime_type'] = 'application/json'
+            # Pass the Pydantic model directly as response_schema for structured output
+            config_kwargs['response_schema'] = response_format
 
         generation_config = genai_types.GenerateContentConfig(**config_kwargs) if config_kwargs else None
 
@@ -302,6 +330,23 @@ class LLMConfig:
 
                 content = response.text
 
+                # Handle empty/None response (can happen with content filtering or timeouts)
+                if content is None:
+                    # Check if there's a block reason
+                    block_reason = None
+                    if hasattr(response, 'candidates') and response.candidates:
+                        candidate = response.candidates[0]
+                        if hasattr(candidate, 'finish_reason'):
+                            block_reason = candidate.finish_reason
+
+                    if attempt < max_retries:
+                        logger.warning(f"Gemini returned empty response (reason: {block_reason}), retrying... (attempt {attempt + 1}/{max_retries + 1})")
+                        backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+                        await asyncio.sleep(backoff)
+                        continue
+                    else:
+                        raise RuntimeError(f"Gemini returned empty response after {max_retries + 1} attempts (reason: {block_reason})")
+
                 if response_format is not None:
                     # Parse the JSON response
                     json_data = json.loads(content)
@@ -318,14 +363,29 @@ class LLMConfig:
                 duration = time.time() - start_time
                 if duration > 10.0 and hasattr(response, 'usage_metadata') and response.usage_metadata:
                     usage = response.usage_metadata
+                    # Check for cached tokens (Gemini uses cached_content_token_count)
+                    cached_tokens = getattr(usage, 'cached_content_token_count', 0) or 0
+                    cache_info = f", cached_tokens={cached_tokens}" if cached_tokens > 0 else ""
                     logger.info(
                         f"slow llm call: model={self.provider}/{self.model}, "
-                        f"input_tokens={usage.prompt_token_count}, output_tokens={usage.candidates_token_count}, "
+                        f"input_tokens={usage.prompt_token_count}, output_tokens={usage.candidates_token_count}{cache_info}, "
                         f"time={duration:.3f}s"
                     )
 
                 return result
 
+            except json.JSONDecodeError as e:
+                # Handle truncated JSON responses (often from MAX_TOKENS) with retry
+                last_exception = e
+                if attempt < max_retries:
+                    logger.warning(f"Gemini returned invalid JSON (truncated response?), retrying... (attempt {attempt + 1}/{max_retries + 1})")
+                    backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+                    await asyncio.sleep(backoff)
+                    continue
+                else:
+                    logger.error(f"Gemini returned invalid JSON after {max_retries + 1} attempts: {str(e)}")
+                    raise
+
             except genai_errors.APIError as e:
                 # Handle rate limits and server errors with retry
                 if e.code in (429, 503, 500):
@@ -372,6 +432,37 @@ class LLMConfig:
             api_key=api_key,
             base_url=base_url,
             model=model,
+            reasoning_effort="low"
+        )
+
+    @classmethod
+    def for_answer_generation(cls) -> "LLMConfig":
+        """
+        Create configuration for answer generation operations from environment variables.
+
+        Falls back to memory LLM config if answer-specific config not set.
+        """
+        # Check if answer-specific config exists, otherwise fall back to memory config
+        provider = os.getenv("HINDSIGHT_API_ANSWER_LLM_PROVIDER", os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq"))
+        api_key = os.getenv("HINDSIGHT_API_ANSWER_LLM_API_KEY", os.getenv("HINDSIGHT_API_LLM_API_KEY"))
+        base_url = os.getenv("HINDSIGHT_API_ANSWER_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL"))
+        model = os.getenv("HINDSIGHT_API_ANSWER_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))
+
+        # Set default base URL if not provided
+        if not base_url:
+            if provider == "groq":
+                base_url = "https://api.groq.com/openai/v1"
+            elif provider == "ollama":
+                base_url = "http://localhost:11434/v1"
+            else:
+                base_url = ""
+
+        return cls(
+            provider=provider,
+            api_key=api_key,
+            base_url=base_url,
+            model=model,
+            reasoning_effort="high"
         )
 
     @classmethod
@@ -401,4 +492,5 @@ class LLMConfig:
         api_key=api_key,
         base_url=base_url,
         model=model,
+        reasoning_effort="high"
     )
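The new for_answer_generation hook lets the answer-generation model be tuned separately from the memory model via environment variables. A sketch using only variable names and defaults that appear in this diff (values illustrative; the LLMConfig import path is not shown here):

    import os

    # Base memory LLM settings (existing variables, also used as the fallback)
    os.environ["HINDSIGHT_API_LLM_PROVIDER"] = "groq"
    os.environ["HINDSIGHT_API_LLM_API_KEY"] = "your-api-key"
    os.environ["HINDSIGHT_API_LLM_MODEL"] = "openai/gpt-oss-120b"

    # Optional answer-specific override introduced in 0.1.0
    os.environ["HINDSIGHT_API_ANSWER_LLM_MODEL"] = "openai/gpt-oss-120b"

    # Builds a config with reasoning_effort="high", falling back to the
    # HINDSIGHT_API_LLM_* values wherever no ANSWER-specific variable is set.
    config = LLMConfig.for_answer_generation()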