hindsight-api 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hindsight_api/__init__.py CHANGED
@@ -46,4 +46,4 @@ __all__ = [
  "RemoteTEICrossEncoder",
  "LLMConfig",
  ]
- __version__ = "0.4.1"
+ __version__ = "0.4.3"
hindsight_api/api/http.py CHANGED
@@ -92,8 +92,7 @@ class RecallRequest(BaseModel):
  query: str
  types: list[str] | None = Field(
  default=None,
- description="List of fact types to recall: 'world', 'experience', 'observation'. Defaults to world and experience if not specified. "
- "Note: 'opinion' is accepted but ignored (opinions are excluded from recall).",
+ description="List of fact types to recall: 'world', 'experience', 'observation'. Defaults to world and experience if not specified.",
  )
  budget: Budget = Budget.MID
  max_tokens: int = 4096
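
Under the 0.4.3 contract, 'opinion' is no longer listed or silently filtered; only the three types above are meaningful. A minimal recall payload might look like the sketch below (field names come from RecallRequest above; the values, the endpoint URL, and the use of httpx are illustrative assumptions):

```python
# Sketch only: field names match RecallRequest; URL and values are assumed.
import httpx

payload = {
    "query": "what did the team decide about the rollout?",
    "types": ["world", "experience"],  # omitting 'types' yields this same default
    "max_tokens": 4096,
}
# resp = httpx.post("http://localhost:8888/<recall-endpoint>", json=payload)
```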
@@ -504,13 +503,6 @@ class ReflectRequest(BaseModel):
  )


- class OpinionItem(BaseModel):
- """Model for an opinion with confidence score."""
-
- text: str
- confidence: float
-
-
  class ReflectFact(BaseModel):
  """A fact used in think response."""

@@ -529,7 +521,7 @@ class ReflectFact(BaseModel):

  id: str | None = None
  text: str
- type: str | None = None  # fact type: world, experience, opinion
+ type: str | None = None  # fact type: world, experience, observation
  context: str | None = None
  occurred_start: str | None = None
  occurred_end: str | None = None
@@ -1412,9 +1404,10 @@ def create_app(
  worker_id=worker_id,
  executor=memory.execute_task,
  poll_interval_ms=config.worker_poll_interval_ms,
- batch_size=config.worker_batch_size,
  max_retries=config.worker_max_retries,
  tenant_extension=getattr(memory, "_tenant_extension", None),
+ max_slots=config.worker_max_slots,
+ consolidation_max_slots=config.worker_consolidation_max_slots,
  )
  poller_task = asyncio.create_task(poller.run())
  logging.info(f"Worker poller started (worker_id={worker_id})")
@@ -1707,9 +1700,7 @@ def _register_routes(app: FastAPI):
  description="Recall memory using semantic similarity and spreading activation.\n\n"
  "The type parameter is optional and must be one of:\n"
  "- `world`: General knowledge about people, places, events, and things that happen\n"
- "- `experience`: Memories about experience, conversations, actions taken, and tasks performed\n"
- "- `opinion`: The bank's formed beliefs, perspectives, and viewpoints\n\n"
- "Set `include_entities=true` to get entity observations alongside recall results.",
+ "- `experience`: Memories about experience, conversations, actions taken, and tasks performed",
  operation_id="recall_memories",
  tags=["Memory"],
  )
@@ -1723,10 +1714,8 @@ def _register_routes(app: FastAPI):
  metrics = get_metrics_collector()

  try:
- # Default to world and experience if not specified (exclude observation and opinion)
- # Filter out 'opinion' even if requested - opinions are excluded from recall
+ # Default to world and experience if not specified (exclude observation)
  fact_types = request.types if request.types else list(VALID_RECALL_FACT_TYPES)
- fact_types = [ft for ft in fact_types if ft != "opinion"]

  # Parse query_timestamp if provided
  question_date = None
@@ -1858,8 +1847,7 @@ def _register_routes(app: FastAPI):
  "2. Retrieves world facts relevant to the query\n"
  "3. Retrieves existing opinions (bank's perspectives)\n"
  "4. Uses LLM to formulate a contextual answer\n"
- "5. Extracts and stores any new opinions formed\n"
- "6. Returns plain text answer, the facts used, and new opinions",
+ "5. Returns plain text answer and the facts used",
  operation_id="reflect",
  tags=["Memory"],
  )
hindsight_api/api/mcp.py CHANGED
@@ -29,15 +29,26 @@ logger = logging.getLogger(__name__)
  # Default bank_id from environment variable
  DEFAULT_BANK_ID = os.environ.get("HINDSIGHT_MCP_BANK_ID", "default")

+ # MCP authentication token (optional - if set, Bearer token auth is required)
+ MCP_AUTH_TOKEN = os.environ.get("HINDSIGHT_API_MCP_AUTH_TOKEN")
+
  # Context variable to hold the current bank_id
  _current_bank_id: ContextVar[str | None] = ContextVar("current_bank_id", default=None)

+ # Context variable to hold the current API key (for tenant auth propagation)
+ _current_api_key: ContextVar[str | None] = ContextVar("current_api_key", default=None)
+

  def get_current_bank_id() -> str | None:
  """Get the current bank_id from context."""
  return _current_bank_id.get()


+ def get_current_api_key() -> str | None:
+ """Get the current API key from context."""
+ return _current_api_key.get()
+
+
  def create_mcp_server(memory: MemoryEngine) -> FastMCP:
  """
  Create and configure the Hindsight MCP server.
@@ -54,6 +65,7 @@ def create_mcp_server(memory: MemoryEngine) -> FastMCP:
  # Configure and register tools using shared module
  config = MCPToolsConfig(
  bank_id_resolver=get_current_bank_id,
+ api_key_resolver=get_current_api_key,  # Propagate API key for tenant auth
  include_bank_id_param=True,  # HTTP MCP supports multi-bank via parameter
  tools=None,  # All tools
  retain_fire_and_forget=False,  # HTTP MCP supports sync/async modes
@@ -65,7 +77,11 @@ def create_mcp_server(memory: MemoryEngine) -> FastMCP:


  class MCPMiddleware:
- """ASGI middleware that extracts bank_id from header or path and sets context.
+ """ASGI middleware that handles authentication and extracts bank_id from header or path.
+
+ Authentication:
+ If HINDSIGHT_API_MCP_AUTH_TOKEN is set, all requests must include a valid
+ Authorization header with Bearer token or direct token matching the configured value.

  Bank ID can be provided via:
  1. X-Bank-Id header (recommended for Claude Code)
@@ -74,7 +90,7 @@ class MCPMiddleware:

  For Claude Code, configure with:
  claude mcp add --transport http hindsight http://localhost:8888/mcp \\
- --header "X-Bank-Id: my-bank"
+ --header "X-Bank-Id: my-bank" --header "Authorization: Bearer <token>"
  """

  def __init__(self, app, memory: MemoryEngine):
@@ -98,6 +114,22 @@ class MCPMiddleware:
  await self.mcp_app(scope, receive, send)
  return

+ # Extract auth token from header (for tenant auth propagation)
+ auth_header = self._get_header(scope, "Authorization")
+ auth_token: str | None = None
+ if auth_header:
+ # Support both "Bearer <token>" and direct token
+ auth_token = auth_header[7:].strip() if auth_header.startswith("Bearer ") else auth_header.strip()
+
+ # Authenticate if MCP_AUTH_TOKEN is configured
+ if MCP_AUTH_TOKEN:
+ if not auth_token:
+ await self._send_error(send, 401, "Authorization header required")
+ return
+ if auth_token != MCP_AUTH_TOKEN:
+ await self._send_error(send, 401, "Invalid authentication token")
+ return
+
  path = scope.get("path", "")

  # Strip any mount prefix (e.g., /mcp) that FastAPI might not have stripped
@@ -132,8 +164,10 @@ class MCPMiddleware:
  bank_id = DEFAULT_BANK_ID
  logger.debug(f"Using default bank_id: {bank_id}")

- # Set bank_id context
- token = _current_bank_id.set(bank_id)
+ # Set bank_id and api_key context
+ bank_id_token = _current_bank_id.set(bank_id)
+ # Store the auth token for tenant extension to validate
+ api_key_token = _current_api_key.set(auth_token) if auth_token else None
  try:
  new_scope = scope.copy()
  new_scope["path"] = new_path
@@ -152,7 +186,9 @@ class MCPMiddleware:

  await self.mcp_app(new_scope, receive, send_wrapper)
  finally:
- _current_bank_id.reset(token)
+ _current_bank_id.reset(bank_id_token)
+ if api_key_token is not None:
+ _current_api_key.reset(api_key_token)

  async def _send_error(self, send, status: int, message: str):
  """Send an error response."""
@@ -176,6 +212,10 @@ def create_mcp_app(memory: MemoryEngine):
  """
  Create an ASGI app that handles MCP requests.

+ Authentication:
+ Set HINDSIGHT_API_MCP_AUTH_TOKEN to require Bearer token authentication.
+ If not set, MCP endpoint is open (for local development).
+
  Bank ID can be provided via:
  1. X-Bank-Id header: claude mcp add --transport http hindsight http://localhost:8888/mcp --header "X-Bank-Id: my-bank"
  2. URL path: /mcp/{bank_id}/
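
The middleware accepts either "Bearer <token>" or a bare token in the Authorization header. The parsing rule is small enough to restate standalone; extract_token is a hypothetical name, and the body mirrors the middleware lines above:

```python
def extract_token(auth_header: str | None) -> str | None:
    """Accept 'Bearer <token>' or a bare token, mirroring MCPMiddleware."""
    if not auth_header:
        return None
    return auth_header[7:].strip() if auth_header.startswith("Bearer ") else auth_header.strip()

assert extract_token("Bearer s3cret") == "s3cret"
assert extract_token("s3cret") == "s3cret"
assert extract_token(None) is None
```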
hindsight_api/config.py CHANGED
@@ -26,6 +26,9 @@ ENV_LLM_API_KEY = "HINDSIGHT_API_LLM_API_KEY"
  ENV_LLM_MODEL = "HINDSIGHT_API_LLM_MODEL"
  ENV_LLM_BASE_URL = "HINDSIGHT_API_LLM_BASE_URL"
  ENV_LLM_MAX_CONCURRENT = "HINDSIGHT_API_LLM_MAX_CONCURRENT"
+ ENV_LLM_MAX_RETRIES = "HINDSIGHT_API_LLM_MAX_RETRIES"
+ ENV_LLM_INITIAL_BACKOFF = "HINDSIGHT_API_LLM_INITIAL_BACKOFF"
+ ENV_LLM_MAX_BACKOFF = "HINDSIGHT_API_LLM_MAX_BACKOFF"
  ENV_LLM_TIMEOUT = "HINDSIGHT_API_LLM_TIMEOUT"
  ENV_LLM_GROQ_SERVICE_TIER = "HINDSIGHT_API_LLM_GROQ_SERVICE_TIER"

@@ -34,16 +37,31 @@ ENV_RETAIN_LLM_PROVIDER = "HINDSIGHT_API_RETAIN_LLM_PROVIDER"
  ENV_RETAIN_LLM_API_KEY = "HINDSIGHT_API_RETAIN_LLM_API_KEY"
  ENV_RETAIN_LLM_MODEL = "HINDSIGHT_API_RETAIN_LLM_MODEL"
  ENV_RETAIN_LLM_BASE_URL = "HINDSIGHT_API_RETAIN_LLM_BASE_URL"
+ ENV_RETAIN_LLM_MAX_CONCURRENT = "HINDSIGHT_API_RETAIN_LLM_MAX_CONCURRENT"
+ ENV_RETAIN_LLM_MAX_RETRIES = "HINDSIGHT_API_RETAIN_LLM_MAX_RETRIES"
+ ENV_RETAIN_LLM_INITIAL_BACKOFF = "HINDSIGHT_API_RETAIN_LLM_INITIAL_BACKOFF"
+ ENV_RETAIN_LLM_MAX_BACKOFF = "HINDSIGHT_API_RETAIN_LLM_MAX_BACKOFF"
+ ENV_RETAIN_LLM_TIMEOUT = "HINDSIGHT_API_RETAIN_LLM_TIMEOUT"

  ENV_REFLECT_LLM_PROVIDER = "HINDSIGHT_API_REFLECT_LLM_PROVIDER"
  ENV_REFLECT_LLM_API_KEY = "HINDSIGHT_API_REFLECT_LLM_API_KEY"
  ENV_REFLECT_LLM_MODEL = "HINDSIGHT_API_REFLECT_LLM_MODEL"
  ENV_REFLECT_LLM_BASE_URL = "HINDSIGHT_API_REFLECT_LLM_BASE_URL"
+ ENV_REFLECT_LLM_MAX_CONCURRENT = "HINDSIGHT_API_REFLECT_LLM_MAX_CONCURRENT"
+ ENV_REFLECT_LLM_MAX_RETRIES = "HINDSIGHT_API_REFLECT_LLM_MAX_RETRIES"
+ ENV_REFLECT_LLM_INITIAL_BACKOFF = "HINDSIGHT_API_REFLECT_LLM_INITIAL_BACKOFF"
+ ENV_REFLECT_LLM_MAX_BACKOFF = "HINDSIGHT_API_REFLECT_LLM_MAX_BACKOFF"
+ ENV_REFLECT_LLM_TIMEOUT = "HINDSIGHT_API_REFLECT_LLM_TIMEOUT"

  ENV_CONSOLIDATION_LLM_PROVIDER = "HINDSIGHT_API_CONSOLIDATION_LLM_PROVIDER"
  ENV_CONSOLIDATION_LLM_API_KEY = "HINDSIGHT_API_CONSOLIDATION_LLM_API_KEY"
  ENV_CONSOLIDATION_LLM_MODEL = "HINDSIGHT_API_CONSOLIDATION_LLM_MODEL"
  ENV_CONSOLIDATION_LLM_BASE_URL = "HINDSIGHT_API_CONSOLIDATION_LLM_BASE_URL"
+ ENV_CONSOLIDATION_LLM_MAX_CONCURRENT = "HINDSIGHT_API_CONSOLIDATION_LLM_MAX_CONCURRENT"
+ ENV_CONSOLIDATION_LLM_MAX_RETRIES = "HINDSIGHT_API_CONSOLIDATION_LLM_MAX_RETRIES"
+ ENV_CONSOLIDATION_LLM_INITIAL_BACKOFF = "HINDSIGHT_API_CONSOLIDATION_LLM_INITIAL_BACKOFF"
+ ENV_CONSOLIDATION_LLM_MAX_BACKOFF = "HINDSIGHT_API_CONSOLIDATION_LLM_MAX_BACKOFF"
+ ENV_CONSOLIDATION_LLM_TIMEOUT = "HINDSIGHT_API_CONSOLIDATION_LLM_TIMEOUT"

  ENV_EMBEDDINGS_PROVIDER = "HINDSIGHT_API_EMBEDDINGS_PROVIDER"
  ENV_EMBEDDINGS_LOCAL_MODEL = "HINDSIGHT_API_EMBEDDINGS_LOCAL_MODEL"
@@ -90,13 +108,17 @@ ENV_MCP_LOCAL_BANK_ID = "HINDSIGHT_API_MCP_LOCAL_BANK_ID"
  ENV_MCP_INSTRUCTIONS = "HINDSIGHT_API_MCP_INSTRUCTIONS"
  ENV_MENTAL_MODEL_REFRESH_CONCURRENCY = "HINDSIGHT_API_MENTAL_MODEL_REFRESH_CONCURRENCY"

+ # Vertex AI configuration
+ ENV_LLM_VERTEXAI_PROJECT_ID = "HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID"
+ ENV_LLM_VERTEXAI_REGION = "HINDSIGHT_API_LLM_VERTEXAI_REGION"
+ ENV_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY = "HINDSIGHT_API_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY"
+
  # Retain settings
  ENV_RETAIN_MAX_COMPLETION_TOKENS = "HINDSIGHT_API_RETAIN_MAX_COMPLETION_TOKENS"
  ENV_RETAIN_CHUNK_SIZE = "HINDSIGHT_API_RETAIN_CHUNK_SIZE"
  ENV_RETAIN_EXTRACT_CAUSAL_LINKS = "HINDSIGHT_API_RETAIN_EXTRACT_CAUSAL_LINKS"
  ENV_RETAIN_EXTRACTION_MODE = "HINDSIGHT_API_RETAIN_EXTRACTION_MODE"
  ENV_RETAIN_CUSTOM_INSTRUCTIONS = "HINDSIGHT_API_RETAIN_CUSTOM_INSTRUCTIONS"
- ENV_RETAIN_OBSERVATIONS_ASYNC = "HINDSIGHT_API_RETAIN_OBSERVATIONS_ASYNC"

  # Observations settings (consolidated knowledge from facts)
  ENV_ENABLE_OBSERVATIONS = "HINDSIGHT_API_ENABLE_OBSERVATIONS"
@@ -121,8 +143,9 @@ ENV_WORKER_ENABLED = "HINDSIGHT_API_WORKER_ENABLED"
  ENV_WORKER_ID = "HINDSIGHT_API_WORKER_ID"
  ENV_WORKER_POLL_INTERVAL_MS = "HINDSIGHT_API_WORKER_POLL_INTERVAL_MS"
  ENV_WORKER_MAX_RETRIES = "HINDSIGHT_API_WORKER_MAX_RETRIES"
- ENV_WORKER_BATCH_SIZE = "HINDSIGHT_API_WORKER_BATCH_SIZE"
  ENV_WORKER_HTTP_PORT = "HINDSIGHT_API_WORKER_HTTP_PORT"
+ ENV_WORKER_MAX_SLOTS = "HINDSIGHT_API_WORKER_MAX_SLOTS"
+ ENV_WORKER_CONSOLIDATION_MAX_SLOTS = "HINDSIGHT_API_WORKER_CONSOLIDATION_MAX_SLOTS"

  # Reflect agent settings
  ENV_REFLECT_MAX_ITERATIONS = "HINDSIGHT_API_REFLECT_MAX_ITERATIONS"
@@ -133,8 +156,16 @@ DEFAULT_DATABASE_SCHEMA = "public"
  DEFAULT_LLM_PROVIDER = "openai"
  DEFAULT_LLM_MODEL = "gpt-5-mini"
  DEFAULT_LLM_MAX_CONCURRENT = 32
+ DEFAULT_LLM_MAX_RETRIES = 10  # Max retry attempts for LLM API calls
+ DEFAULT_LLM_INITIAL_BACKOFF = 1.0  # Initial backoff in seconds for retry exponential backoff
+ DEFAULT_LLM_MAX_BACKOFF = 60.0  # Max backoff cap in seconds for retry exponential backoff
  DEFAULT_LLM_TIMEOUT = 120.0  # seconds

+ # Vertex AI defaults
+ DEFAULT_LLM_VERTEXAI_PROJECT_ID = None  # Required for Vertex AI
+ DEFAULT_LLM_VERTEXAI_REGION = "us-central1"
+ DEFAULT_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY = None  # Optional, uses ADC if not set
+
  DEFAULT_EMBEDDINGS_PROVIDER = "local"
  DEFAULT_EMBEDDINGS_LOCAL_MODEL = "BAAI/bge-small-en-v1.5"
  DEFAULT_EMBEDDINGS_LOCAL_FORCE_CPU = False  # Force CPU mode for local embeddings (avoids MPS/XPC issues on macOS)
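
The three new retry knobs read like a standard exponential backoff schedule: double the delay each attempt, starting at DEFAULT_LLM_INITIAL_BACKOFF and capping at DEFAULT_LLM_MAX_BACKOFF. A sketch under that assumption; the actual retry loop is not part of this diff:

```python
def backoff_delay(attempt: int, initial: float = 1.0, cap: float = 60.0) -> float:
    """Assumed schedule: initial * 2**attempt, capped. 1s, 2s, 4s, ..., 60s."""
    return min(initial * (2 ** attempt), cap)

# With the 0.4.3 defaults (initial=1.0, cap=60.0):
assert [backoff_delay(n) for n in range(8)] == [1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 60.0, 60.0]
```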
@@ -179,7 +210,6 @@ DEFAULT_RETAIN_EXTRACT_CAUSAL_LINKS = True # Extract causal links between facts
  DEFAULT_RETAIN_EXTRACTION_MODE = "concise"  # Extraction mode: "concise", "verbose", or "custom"
  RETAIN_EXTRACTION_MODES = ("concise", "verbose", "custom")  # Allowed extraction modes
  DEFAULT_RETAIN_CUSTOM_INSTRUCTIONS = None  # Custom extraction guidelines (only used when mode="custom")
- DEFAULT_RETAIN_OBSERVATIONS_ASYNC = False  # Run observation generation async (after retain completes)

  # Observations defaults (consolidated knowledge from facts)
  DEFAULT_ENABLE_OBSERVATIONS = True  # Observations enabled by default
@@ -200,8 +230,9 @@ DEFAULT_WORKER_ENABLED = True # API runs worker by default (standalone mode)
  DEFAULT_WORKER_ID = None  # Will use hostname if not specified
  DEFAULT_WORKER_POLL_INTERVAL_MS = 500  # Poll database every 500ms
  DEFAULT_WORKER_MAX_RETRIES = 3  # Max retries before marking task failed
- DEFAULT_WORKER_BATCH_SIZE = 10  # Tasks to claim per poll cycle
  DEFAULT_WORKER_HTTP_PORT = 8889  # HTTP port for worker metrics/health
+ DEFAULT_WORKER_MAX_SLOTS = 10  # Total concurrent tasks per worker
+ DEFAULT_WORKER_CONSOLIDATION_MAX_SLOTS = 2  # Max concurrent consolidation tasks per worker

  # Reflect agent settings
  DEFAULT_REFLECT_MAX_ITERATIONS = 10  # Max tool call iterations before forcing response
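
For upgraders: HINDSIGHT_API_WORKER_BATCH_SIZE is gone; worker concurrency is now bounded by two slot limits instead of a claim-batch size. A sketch of the new tuning (values are illustrative; the import path and the loader name from_env are assumptions, though the loader's body appears later in this diff):

```python
import os

# batch_size is removed in 0.4.3; concurrency is governed by slots instead.
os.environ["HINDSIGHT_API_WORKER_MAX_SLOTS"] = "16"               # total concurrent tasks (default 10)
os.environ["HINDSIGHT_API_WORKER_CONSOLIDATION_MAX_SLOTS"] = "4"  # consolidation share (default 2)

# from hindsight_api.config import HindsightConfig   # import path assumed
# config = HindsightConfig.from_env()                # loader name assumed
```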
@@ -286,23 +317,46 @@ class HindsightConfig:
  llm_model: str
  llm_base_url: str | None
  llm_max_concurrent: int
+ llm_max_retries: int
+ llm_initial_backoff: float
+ llm_max_backoff: float
  llm_timeout: float

+ # Vertex AI configuration
+ llm_vertexai_project_id: str | None
+ llm_vertexai_region: str
+ llm_vertexai_service_account_key: str | None
+
  # Per-operation LLM configuration (None = use default LLM config)
  retain_llm_provider: str | None
  retain_llm_api_key: str | None
  retain_llm_model: str | None
  retain_llm_base_url: str | None
+ retain_llm_max_concurrent: int | None
+ retain_llm_max_retries: int | None
+ retain_llm_initial_backoff: float | None
+ retain_llm_max_backoff: float | None
+ retain_llm_timeout: float | None

  reflect_llm_provider: str | None
  reflect_llm_api_key: str | None
  reflect_llm_model: str | None
  reflect_llm_base_url: str | None
+ reflect_llm_max_concurrent: int | None
+ reflect_llm_max_retries: int | None
+ reflect_llm_initial_backoff: float | None
+ reflect_llm_max_backoff: float | None
+ reflect_llm_timeout: float | None

  consolidation_llm_provider: str | None
  consolidation_llm_api_key: str | None
  consolidation_llm_model: str | None
  consolidation_llm_base_url: str | None
+ consolidation_llm_max_concurrent: int | None
+ consolidation_llm_max_retries: int | None
+ consolidation_llm_initial_backoff: float | None
+ consolidation_llm_max_backoff: float | None
+ consolidation_llm_timeout: float | None

  # Embeddings
  embeddings_provider: str
@@ -343,7 +397,6 @@ class HindsightConfig:
  retain_extract_causal_links: bool
  retain_extraction_mode: str
  retain_custom_instructions: str | None
- retain_observations_async: bool

  # Observations settings (consolidated knowledge from facts)
  enable_observations: bool
@@ -368,8 +421,9 @@ class HindsightConfig:
  worker_id: str | None
  worker_poll_interval_ms: int
  worker_max_retries: int
- worker_batch_size: int
  worker_http_port: int
+ worker_max_slots: int
+ worker_consolidation_max_slots: int

  # Reflect agent settings
  reflect_max_iterations: int
@@ -387,20 +441,71 @@ class HindsightConfig:
  llm_model=os.getenv(ENV_LLM_MODEL, DEFAULT_LLM_MODEL),
  llm_base_url=os.getenv(ENV_LLM_BASE_URL) or None,
  llm_max_concurrent=int(os.getenv(ENV_LLM_MAX_CONCURRENT, str(DEFAULT_LLM_MAX_CONCURRENT))),
+ llm_max_retries=int(os.getenv(ENV_LLM_MAX_RETRIES, str(DEFAULT_LLM_MAX_RETRIES))),
+ llm_initial_backoff=float(os.getenv(ENV_LLM_INITIAL_BACKOFF, str(DEFAULT_LLM_INITIAL_BACKOFF))),
+ llm_max_backoff=float(os.getenv(ENV_LLM_MAX_BACKOFF, str(DEFAULT_LLM_MAX_BACKOFF))),
  llm_timeout=float(os.getenv(ENV_LLM_TIMEOUT, str(DEFAULT_LLM_TIMEOUT))),
+ # Vertex AI
+ llm_vertexai_project_id=os.getenv(ENV_LLM_VERTEXAI_PROJECT_ID) or DEFAULT_LLM_VERTEXAI_PROJECT_ID,
+ llm_vertexai_region=os.getenv(ENV_LLM_VERTEXAI_REGION, DEFAULT_LLM_VERTEXAI_REGION),
+ llm_vertexai_service_account_key=os.getenv(ENV_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY)
+ or DEFAULT_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY,
  # Per-operation LLM config (None = use default)
  retain_llm_provider=os.getenv(ENV_RETAIN_LLM_PROVIDER) or None,
  retain_llm_api_key=os.getenv(ENV_RETAIN_LLM_API_KEY) or None,
  retain_llm_model=os.getenv(ENV_RETAIN_LLM_MODEL) or None,
  retain_llm_base_url=os.getenv(ENV_RETAIN_LLM_BASE_URL) or None,
+ retain_llm_max_concurrent=int(os.getenv(ENV_RETAIN_LLM_MAX_CONCURRENT))
+ if os.getenv(ENV_RETAIN_LLM_MAX_CONCURRENT)
+ else None,
+ retain_llm_max_retries=int(os.getenv(ENV_RETAIN_LLM_MAX_RETRIES))
+ if os.getenv(ENV_RETAIN_LLM_MAX_RETRIES)
+ else None,
+ retain_llm_initial_backoff=float(os.getenv(ENV_RETAIN_LLM_INITIAL_BACKOFF))
+ if os.getenv(ENV_RETAIN_LLM_INITIAL_BACKOFF)
+ else None,
+ retain_llm_max_backoff=float(os.getenv(ENV_RETAIN_LLM_MAX_BACKOFF))
+ if os.getenv(ENV_RETAIN_LLM_MAX_BACKOFF)
+ else None,
+ retain_llm_timeout=float(os.getenv(ENV_RETAIN_LLM_TIMEOUT)) if os.getenv(ENV_RETAIN_LLM_TIMEOUT) else None,
  reflect_llm_provider=os.getenv(ENV_REFLECT_LLM_PROVIDER) or None,
  reflect_llm_api_key=os.getenv(ENV_REFLECT_LLM_API_KEY) or None,
  reflect_llm_model=os.getenv(ENV_REFLECT_LLM_MODEL) or None,
  reflect_llm_base_url=os.getenv(ENV_REFLECT_LLM_BASE_URL) or None,
+ reflect_llm_max_concurrent=int(os.getenv(ENV_REFLECT_LLM_MAX_CONCURRENT))
+ if os.getenv(ENV_REFLECT_LLM_MAX_CONCURRENT)
+ else None,
+ reflect_llm_max_retries=int(os.getenv(ENV_REFLECT_LLM_MAX_RETRIES))
+ if os.getenv(ENV_REFLECT_LLM_MAX_RETRIES)
+ else None,
+ reflect_llm_initial_backoff=float(os.getenv(ENV_REFLECT_LLM_INITIAL_BACKOFF))
+ if os.getenv(ENV_REFLECT_LLM_INITIAL_BACKOFF)
+ else None,
+ reflect_llm_max_backoff=float(os.getenv(ENV_REFLECT_LLM_MAX_BACKOFF))
+ if os.getenv(ENV_REFLECT_LLM_MAX_BACKOFF)
+ else None,
+ reflect_llm_timeout=float(os.getenv(ENV_REFLECT_LLM_TIMEOUT))
+ if os.getenv(ENV_REFLECT_LLM_TIMEOUT)
+ else None,
  consolidation_llm_provider=os.getenv(ENV_CONSOLIDATION_LLM_PROVIDER) or None,
  consolidation_llm_api_key=os.getenv(ENV_CONSOLIDATION_LLM_API_KEY) or None,
  consolidation_llm_model=os.getenv(ENV_CONSOLIDATION_LLM_MODEL) or None,
  consolidation_llm_base_url=os.getenv(ENV_CONSOLIDATION_LLM_BASE_URL) or None,
+ consolidation_llm_max_concurrent=int(os.getenv(ENV_CONSOLIDATION_LLM_MAX_CONCURRENT))
+ if os.getenv(ENV_CONSOLIDATION_LLM_MAX_CONCURRENT)
+ else None,
+ consolidation_llm_max_retries=int(os.getenv(ENV_CONSOLIDATION_LLM_MAX_RETRIES))
+ if os.getenv(ENV_CONSOLIDATION_LLM_MAX_RETRIES)
+ else None,
+ consolidation_llm_initial_backoff=float(os.getenv(ENV_CONSOLIDATION_LLM_INITIAL_BACKOFF))
+ if os.getenv(ENV_CONSOLIDATION_LLM_INITIAL_BACKOFF)
+ else None,
+ consolidation_llm_max_backoff=float(os.getenv(ENV_CONSOLIDATION_LLM_MAX_BACKOFF))
+ if os.getenv(ENV_CONSOLIDATION_LLM_MAX_BACKOFF)
+ else None,
+ consolidation_llm_timeout=float(os.getenv(ENV_CONSOLIDATION_LLM_TIMEOUT))
+ if os.getenv(ENV_CONSOLIDATION_LLM_TIMEOUT)
+ else None,
  # Embeddings
  embeddings_provider=os.getenv(ENV_EMBEDDINGS_PROVIDER, DEFAULT_EMBEDDINGS_PROVIDER),
  embeddings_local_model=os.getenv(ENV_EMBEDDINGS_LOCAL_MODEL, DEFAULT_EMBEDDINGS_LOCAL_MODEL),
@@ -460,10 +565,6 @@ class HindsightConfig:
  os.getenv(ENV_RETAIN_EXTRACTION_MODE, DEFAULT_RETAIN_EXTRACTION_MODE)
  ),
  retain_custom_instructions=os.getenv(ENV_RETAIN_CUSTOM_INSTRUCTIONS) or DEFAULT_RETAIN_CUSTOM_INSTRUCTIONS,
- retain_observations_async=os.getenv(
- ENV_RETAIN_OBSERVATIONS_ASYNC, str(DEFAULT_RETAIN_OBSERVATIONS_ASYNC)
- ).lower()
- == "true",
  # Observations settings (consolidated knowledge from facts)
  enable_observations=os.getenv(ENV_ENABLE_OBSERVATIONS, str(DEFAULT_ENABLE_OBSERVATIONS)).lower() == "true",
  consolidation_batch_size=int(
@@ -484,8 +585,11 @@ class HindsightConfig:
  worker_id=os.getenv(ENV_WORKER_ID) or DEFAULT_WORKER_ID,
  worker_poll_interval_ms=int(os.getenv(ENV_WORKER_POLL_INTERVAL_MS, str(DEFAULT_WORKER_POLL_INTERVAL_MS))),
  worker_max_retries=int(os.getenv(ENV_WORKER_MAX_RETRIES, str(DEFAULT_WORKER_MAX_RETRIES))),
- worker_batch_size=int(os.getenv(ENV_WORKER_BATCH_SIZE, str(DEFAULT_WORKER_BATCH_SIZE))),
  worker_http_port=int(os.getenv(ENV_WORKER_HTTP_PORT, str(DEFAULT_WORKER_HTTP_PORT))),
+ worker_max_slots=int(os.getenv(ENV_WORKER_MAX_SLOTS, str(DEFAULT_WORKER_MAX_SLOTS))),
+ worker_consolidation_max_slots=int(
+ os.getenv(ENV_WORKER_CONSOLIDATION_MAX_SLOTS, str(DEFAULT_WORKER_CONSOLIDATION_MAX_SLOTS))
+ ),
  # Reflect agent settings
  reflect_max_iterations=int(os.getenv(ENV_REFLECT_MAX_ITERATIONS, str(DEFAULT_REFLECT_MAX_ITERATIONS))),
  )
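
Each per-operation override above repeats the same parse-if-set idiom (int(os.getenv(X)) if os.getenv(X) else None). One way to read it, factored into a helper; optional_env is a sketch only, not part of the package:

```python
import os
from typing import Callable, TypeVar

T = TypeVar("T")

def optional_env(name: str, parse: Callable[[str], T]) -> T | None:
    """Return parse(value) when the variable is set and non-empty, else None."""
    raw = os.getenv(name)
    return parse(raw) if raw else None

# Equivalent to the inline pattern above, e.g.:
# retain_llm_timeout=optional_env("HINDSIGHT_API_RETAIN_LLM_TIMEOUT", float)
```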
hindsight_api/daemon.py CHANGED
@@ -52,7 +52,10 @@ class IdleTimeoutMiddleware:
  logger.info(f"Idle timeout reached ({self.idle_timeout}s), shutting down daemon")
  # Give a moment for any in-flight requests
  await asyncio.sleep(1)
- os._exit(0)
+ # Send SIGTERM to ourselves to trigger graceful shutdown
+ import signal
+
+ os.kill(os.getpid(), signal.SIGTERM)


  class DaemonLock:
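
The idle-timeout change swaps an abrupt exit for a self-delivered SIGTERM. The practical difference, as a standalone sketch (illustrative, not package code): os._exit() terminates immediately, skipping finally blocks, atexit hooks, and the ASGI server's lifespan shutdown, while SIGTERM lets a server such as uvicorn catch the signal and drain connections first.

```python
import os
import signal

def hard_stop() -> None:
    os._exit(0)  # immediate: no finally blocks, atexit hooks, or server shutdown run

def graceful_stop() -> None:
    # The server's SIGTERM handler (e.g. uvicorn's) runs its normal shutdown path.
    os.kill(os.getpid(), signal.SIGTERM)
```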
@@ -144,10 +144,14 @@ async def run_consolidation_job(
  }

  batch_num = 0
+ last_progress_timings = {}  # Track timings at last progress log
  while True:
  batch_num += 1
  batch_start = time.time()

+ # Snapshot timings at batch start for per-batch calculation
+ batch_start_timings = perf.timings.copy()
+
  # Fetch next batch of unconsolidated memories
  async with pool.acquire() as conn:
  t0 = time.time()
@@ -217,19 +221,44 @@ async def run_consolidation_job(
  elif action == "skipped":
  stats["skipped"] += 1

- # Log progress periodically
+ # Log progress periodically with timing breakdown
  if stats["memories_processed"] % 10 == 0:
+ # Calculate timing deltas since last progress log
+ timing_parts = []
+ for key in ["recall", "llm", "embedding", "db_write"]:
+ if key in perf.timings:
+ delta = perf.timings[key] - last_progress_timings.get(key, 0)
+ timing_parts.append(f"{key}={delta:.2f}s")
+
+ timing_str = f" | {', '.join(timing_parts)}" if timing_parts else ""
  logger.info(
  f"[CONSOLIDATION] bank={bank_id} progress: "
- f"{stats['memories_processed']}/{total_count} memories processed"
+ f"{stats['memories_processed']}/{total_count} memories processed{timing_str}"
  )

+ # Update last progress snapshot
+ last_progress_timings = perf.timings.copy()
+
  batch_time = time.time() - batch_start
  perf.log(
  f"[2] Batch {batch_num}: {len(memories)} memories in {batch_time:.3f}s "
  f"(avg {batch_time / len(memories):.3f}s/memory)"
  )

+ # Log timing breakdown after each batch (delta from batch start)
+ timing_parts = []
+ for key in ["recall", "llm", "embedding", "db_write"]:
+ if key in perf.timings:
+ delta = perf.timings[key] - batch_start_timings.get(key, 0)
+ timing_parts.append(f"{key}={delta:.3f}s")
+
+ if timing_parts:
+ avg_per_memory = batch_time / len(memories) if memories else 0
+ logger.info(
+ f"[CONSOLIDATION] bank={bank_id} batch {batch_num}/{len(memories)} memories: "
+ f"{', '.join(timing_parts)} | avg={avg_per_memory:.3f}s/memory"
+ )
+
  # Build summary
  perf.log(
  f"[3] Results: {stats['memories_processed']} memories -> "
@@ -836,7 +865,14 @@ Focus on DURABLE knowledge that serves this mission, not ephemeral state.
  )
  # Parse JSON response - should be an array
  if isinstance(result, str):
- result = json.loads(result)
+ # Strip markdown code fences (some models wrap JSON in ```json ... ```)
+ clean = result.strip()
+ if clean.startswith("```"):
+ clean = clean.split("\n", 1)[1] if "\n" in clean else clean[3:]
+ if clean.endswith("```"):
+ clean = clean[:-3]
+ clean = clean.strip()
+ result = json.loads(clean)
  # Ensure result is a list
  if isinstance(result, list):
  return result
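
The unfencing logic drops an opening fence line (which may carry a language tag such as json after the backticks) and a trailing fence before parsing. Lifted into a reusable helper for testing; strip_code_fences is a hypothetical name, and the body mirrors the diff above:

```python
import json

def strip_code_fences(text: str) -> str:
    """Remove a surrounding markdown code fence, if present, then strip whitespace."""
    clean = text.strip()
    if clean.startswith("```"):
        # Drop the opening fence line, including any language tag like ```json
        clean = clean.split("\n", 1)[1] if "\n" in clean else clean[3:]
    if clean.endswith("```"):
        clean = clean[:-3]
    return clean.strip()

assert json.loads(strip_code_fences('```json\n[{"a": 1}]\n```')) == [{"a": 1}]
assert json.loads(strip_code_fences('[{"a": 1}]')) == [{"a": 1}]
```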