hindsight-api 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hindsight_api/__init__.py CHANGED
@@ -46,4 +46,4 @@ __all__ = [
  "RemoteTEICrossEncoder",
  "LLMConfig",
  ]
- __version__ = "0.4.1"
+ __version__ = "0.4.3"
hindsight_api/api/http.py CHANGED
@@ -92,8 +92,7 @@ class RecallRequest(BaseModel):
  query: str
  types: list[str] | None = Field(
  default=None,
- description="List of fact types to recall: 'world', 'experience', 'observation'. Defaults to world and experience if not specified. "
- "Note: 'opinion' is accepted but ignored (opinions are excluded from recall).",
+ description="List of fact types to recall: 'world', 'experience', 'observation'. Defaults to world and experience if not specified.",
  )
  budget: Budget = Budget.MID
  max_tokens: int = 4096
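
Under the 0.4.3 contract, 'opinion' is no longer listed or silently filtered; only the three types above are meaningful. A minimal recall payload might look like the sketch below (field names come from RecallRequest above; the values, the endpoint URL, and the use of httpx are illustrative assumptions):

```python
# Sketch only: field names match RecallRequest; URL and values are assumed.
import httpx

payload = {
    "query": "what did the team decide about the rollout?",
    "types": ["world", "experience"],  # omitting 'types' yields this same default
    "max_tokens": 4096,
}
# resp = httpx.post("http://localhost:8888/<recall-endpoint>", json=payload)
```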
@@ -504,13 +503,6 @@ class ReflectRequest(BaseModel):
  )


- class OpinionItem(BaseModel):
- """Model for an opinion with confidence score."""
-
- text: str
- confidence: float
-
-
  class ReflectFact(BaseModel):
  """A fact used in think response."""

@@ -529,7 +521,7 @@ class ReflectFact(BaseModel):

  id: str | None = None
  text: str
- type: str | None = None  # fact type: world, experience, opinion
+ type: str | None = None  # fact type: world, experience, observation
  context: str | None = None
  occurred_start: str | None = None
  occurred_end: str | None = None
@@ -1412,9 +1404,10 @@ def create_app(
  worker_id=worker_id,
  executor=memory.execute_task,
  poll_interval_ms=config.worker_poll_interval_ms,
- batch_size=config.worker_batch_size,
  max_retries=config.worker_max_retries,
  tenant_extension=getattr(memory, "_tenant_extension", None),
+ max_slots=config.worker_max_slots,
+ consolidation_max_slots=config.worker_consolidation_max_slots,
  )
  poller_task = asyncio.create_task(poller.run())
  logging.info(f"Worker poller started (worker_id={worker_id})")
@@ -1707,9 +1700,7 @@ def _register_routes(app: FastAPI):
  description="Recall memory using semantic similarity and spreading activation.\n\n"
  "The type parameter is optional and must be one of:\n"
  "- `world`: General knowledge about people, places, events, and things that happen\n"
- "- `experience`: Memories about experience, conversations, actions taken, and tasks performed\n"
- "- `opinion`: The bank's formed beliefs, perspectives, and viewpoints\n\n"
- "Set `include_entities=true` to get entity observations alongside recall results.",
+ "- `experience`: Memories about experience, conversations, actions taken, and tasks performed",
  operation_id="recall_memories",
  tags=["Memory"],
  )
@@ -1723,10 +1714,8 @@ def _register_routes(app: FastAPI):
  metrics = get_metrics_collector()

  try:
- # Default to world and experience if not specified (exclude observation and opinion)
- # Filter out 'opinion' even if requested - opinions are excluded from recall
+ # Default to world and experience if not specified (exclude observation)
  fact_types = request.types if request.types else list(VALID_RECALL_FACT_TYPES)
- fact_types = [ft for ft in fact_types if ft != "opinion"]

  # Parse query_timestamp if provided
  question_date = None
@@ -1858,8 +1847,7 @@ def _register_routes(app: FastAPI):
  "2. Retrieves world facts relevant to the query\n"
  "3. Retrieves existing opinions (bank's perspectives)\n"
  "4. Uses LLM to formulate a contextual answer\n"
- "5. Extracts and stores any new opinions formed\n"
- "6. Returns plain text answer, the facts used, and new opinions",
+ "5. Returns plain text answer and the facts used",
  operation_id="reflect",
  tags=["Memory"],
  )
hindsight_api/api/mcp.py CHANGED
@@ -29,15 +29,26 @@ logger = logging.getLogger(__name__)
  # Default bank_id from environment variable
  DEFAULT_BANK_ID = os.environ.get("HINDSIGHT_MCP_BANK_ID", "default")

+ # MCP authentication token (optional - if set, Bearer token auth is required)
+ MCP_AUTH_TOKEN = os.environ.get("HINDSIGHT_API_MCP_AUTH_TOKEN")
+
  # Context variable to hold the current bank_id
  _current_bank_id: ContextVar[str | None] = ContextVar("current_bank_id", default=None)

+ # Context variable to hold the current API key (for tenant auth propagation)
+ _current_api_key: ContextVar[str | None] = ContextVar("current_api_key", default=None)
+

  def get_current_bank_id() -> str | None:
  """Get the current bank_id from context."""
  return _current_bank_id.get()


+ def get_current_api_key() -> str | None:
+ """Get the current API key from context."""
+ return _current_api_key.get()
+
+
  def create_mcp_server(memory: MemoryEngine) -> FastMCP:
  """
  Create and configure the Hindsight MCP server.
@@ -54,6 +65,7 @@ def create_mcp_server(memory: MemoryEngine) -> FastMCP:
  # Configure and register tools using shared module
  config = MCPToolsConfig(
  bank_id_resolver=get_current_bank_id,
+ api_key_resolver=get_current_api_key,  # Propagate API key for tenant auth
  include_bank_id_param=True,  # HTTP MCP supports multi-bank via parameter
  tools=None,  # All tools
  retain_fire_and_forget=False,  # HTTP MCP supports sync/async modes
@@ -65,7 +77,11 @@ def create_mcp_server(memory: MemoryEngine) -> FastMCP:


  class MCPMiddleware:
- """ASGI middleware that extracts bank_id from header or path and sets context.
+ """ASGI middleware that handles authentication and extracts bank_id from header or path.
+
+ Authentication:
+ If HINDSIGHT_API_MCP_AUTH_TOKEN is set, all requests must include a valid
+ Authorization header with Bearer token or direct token matching the configured value.

  Bank ID can be provided via:
  1. X-Bank-Id header (recommended for Claude Code)
@@ -74,7 +90,7 @@ class MCPMiddleware:

  For Claude Code, configure with:
  claude mcp add --transport http hindsight http://localhost:8888/mcp \\
- --header "X-Bank-Id: my-bank"
+ --header "X-Bank-Id: my-bank" --header "Authorization: Bearer <token>"
  """

  def __init__(self, app, memory: MemoryEngine):
@@ -98,6 +114,22 @@ class MCPMiddleware:
  await self.mcp_app(scope, receive, send)
  return

+ # Extract auth token from header (for tenant auth propagation)
+ auth_header = self._get_header(scope, "Authorization")
+ auth_token: str | None = None
+ if auth_header:
+ # Support both "Bearer <token>" and direct token
+ auth_token = auth_header[7:].strip() if auth_header.startswith("Bearer ") else auth_header.strip()
+
+ # Authenticate if MCP_AUTH_TOKEN is configured
+ if MCP_AUTH_TOKEN:
+ if not auth_token:
+ await self._send_error(send, 401, "Authorization header required")
+ return
+ if auth_token != MCP_AUTH_TOKEN:
+ await self._send_error(send, 401, "Invalid authentication token")
+ return
+
  path = scope.get("path", "")

  # Strip any mount prefix (e.g., /mcp) that FastAPI might not have stripped
@@ -132,8 +164,10 @@ class MCPMiddleware:
  bank_id = DEFAULT_BANK_ID
  logger.debug(f"Using default bank_id: {bank_id}")

- # Set bank_id context
- token = _current_bank_id.set(bank_id)
+ # Set bank_id and api_key context
+ bank_id_token = _current_bank_id.set(bank_id)
+ # Store the auth token for tenant extension to validate
+ api_key_token = _current_api_key.set(auth_token) if auth_token else None
  try:
  new_scope = scope.copy()
  new_scope["path"] = new_path
@@ -152,7 +186,9 @@ class MCPMiddleware:

  await self.mcp_app(new_scope, receive, send_wrapper)
  finally:
- _current_bank_id.reset(token)
+ _current_bank_id.reset(bank_id_token)
+ if api_key_token is not None:
+ _current_api_key.reset(api_key_token)

  async def _send_error(self, send, status: int, message: str):
  """Send an error response."""
@@ -176,6 +212,10 @@ def create_mcp_app(memory: MemoryEngine):
  """
  Create an ASGI app that handles MCP requests.

+ Authentication:
+ Set HINDSIGHT_API_MCP_AUTH_TOKEN to require Bearer token authentication.
+ If not set, MCP endpoint is open (for local development).
+
  Bank ID can be provided via:
  1. X-Bank-Id header: claude mcp add --transport http hindsight http://localhost:8888/mcp --header "X-Bank-Id: my-bank"
  2. URL path: /mcp/{bank_id}/
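
The middleware accepts either "Bearer <token>" or a bare token in the Authorization header. The parsing rule is small enough to restate standalone; extract_token is a hypothetical name, and the body mirrors the middleware lines above:

```python
def extract_token(auth_header: str | None) -> str | None:
    """Accept 'Bearer <token>' or a bare token, mirroring MCPMiddleware."""
    if not auth_header:
        return None
    return auth_header[7:].strip() if auth_header.startswith("Bearer ") else auth_header.strip()

assert extract_token("Bearer s3cret") == "s3cret"
assert extract_token("s3cret") == "s3cret"
assert extract_token(None) is None
```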
hindsight_api/config.py CHANGED
@@ -26,6 +26,9 @@ ENV_LLM_API_KEY = "HINDSIGHT_API_LLM_API_KEY"
  ENV_LLM_MODEL = "HINDSIGHT_API_LLM_MODEL"
  ENV_LLM_BASE_URL = "HINDSIGHT_API_LLM_BASE_URL"
  ENV_LLM_MAX_CONCURRENT = "HINDSIGHT_API_LLM_MAX_CONCURRENT"
+ ENV_LLM_MAX_RETRIES = "HINDSIGHT_API_LLM_MAX_RETRIES"
+ ENV_LLM_INITIAL_BACKOFF = "HINDSIGHT_API_LLM_INITIAL_BACKOFF"
+ ENV_LLM_MAX_BACKOFF = "HINDSIGHT_API_LLM_MAX_BACKOFF"
  ENV_LLM_TIMEOUT = "HINDSIGHT_API_LLM_TIMEOUT"
  ENV_LLM_GROQ_SERVICE_TIER = "HINDSIGHT_API_LLM_GROQ_SERVICE_TIER"

@@ -34,16 +37,31 @@ ENV_RETAIN_LLM_PROVIDER = "HINDSIGHT_API_RETAIN_LLM_PROVIDER"
  ENV_RETAIN_LLM_API_KEY = "HINDSIGHT_API_RETAIN_LLM_API_KEY"
  ENV_RETAIN_LLM_MODEL = "HINDSIGHT_API_RETAIN_LLM_MODEL"
  ENV_RETAIN_LLM_BASE_URL = "HINDSIGHT_API_RETAIN_LLM_BASE_URL"
+ ENV_RETAIN_LLM_MAX_CONCURRENT = "HINDSIGHT_API_RETAIN_LLM_MAX_CONCURRENT"
+ ENV_RETAIN_LLM_MAX_RETRIES = "HINDSIGHT_API_RETAIN_LLM_MAX_RETRIES"
+ ENV_RETAIN_LLM_INITIAL_BACKOFF = "HINDSIGHT_API_RETAIN_LLM_INITIAL_BACKOFF"
+ ENV_RETAIN_LLM_MAX_BACKOFF = "HINDSIGHT_API_RETAIN_LLM_MAX_BACKOFF"
+ ENV_RETAIN_LLM_TIMEOUT = "HINDSIGHT_API_RETAIN_LLM_TIMEOUT"

  ENV_REFLECT_LLM_PROVIDER = "HINDSIGHT_API_REFLECT_LLM_PROVIDER"
  ENV_REFLECT_LLM_API_KEY = "HINDSIGHT_API_REFLECT_LLM_API_KEY"
  ENV_REFLECT_LLM_MODEL = "HINDSIGHT_API_REFLECT_LLM_MODEL"
  ENV_REFLECT_LLM_BASE_URL = "HINDSIGHT_API_REFLECT_LLM_BASE_URL"
+ ENV_REFLECT_LLM_MAX_CONCURRENT = "HINDSIGHT_API_REFLECT_LLM_MAX_CONCURRENT"
+ ENV_REFLECT_LLM_MAX_RETRIES = "HINDSIGHT_API_REFLECT_LLM_MAX_RETRIES"
+ ENV_REFLECT_LLM_INITIAL_BACKOFF = "HINDSIGHT_API_REFLECT_LLM_INITIAL_BACKOFF"
+ ENV_REFLECT_LLM_MAX_BACKOFF = "HINDSIGHT_API_REFLECT_LLM_MAX_BACKOFF"
+ ENV_REFLECT_LLM_TIMEOUT = "HINDSIGHT_API_REFLECT_LLM_TIMEOUT"

  ENV_CONSOLIDATION_LLM_PROVIDER = "HINDSIGHT_API_CONSOLIDATION_LLM_PROVIDER"
  ENV_CONSOLIDATION_LLM_API_KEY = "HINDSIGHT_API_CONSOLIDATION_LLM_API_KEY"
  ENV_CONSOLIDATION_LLM_MODEL = "HINDSIGHT_API_CONSOLIDATION_LLM_MODEL"
  ENV_CONSOLIDATION_LLM_BASE_URL = "HINDSIGHT_API_CONSOLIDATION_LLM_BASE_URL"
+ ENV_CONSOLIDATION_LLM_MAX_CONCURRENT = "HINDSIGHT_API_CONSOLIDATION_LLM_MAX_CONCURRENT"
+ ENV_CONSOLIDATION_LLM_MAX_RETRIES = "HINDSIGHT_API_CONSOLIDATION_LLM_MAX_RETRIES"
+ ENV_CONSOLIDATION_LLM_INITIAL_BACKOFF = "HINDSIGHT_API_CONSOLIDATION_LLM_INITIAL_BACKOFF"
+ ENV_CONSOLIDATION_LLM_MAX_BACKOFF = "HINDSIGHT_API_CONSOLIDATION_LLM_MAX_BACKOFF"
+ ENV_CONSOLIDATION_LLM_TIMEOUT = "HINDSIGHT_API_CONSOLIDATION_LLM_TIMEOUT"

  ENV_EMBEDDINGS_PROVIDER = "HINDSIGHT_API_EMBEDDINGS_PROVIDER"
  ENV_EMBEDDINGS_LOCAL_MODEL = "HINDSIGHT_API_EMBEDDINGS_LOCAL_MODEL"
@@ -90,13 +108,17 @@ ENV_MCP_LOCAL_BANK_ID = "HINDSIGHT_API_MCP_LOCAL_BANK_ID"
  ENV_MCP_INSTRUCTIONS = "HINDSIGHT_API_MCP_INSTRUCTIONS"
  ENV_MENTAL_MODEL_REFRESH_CONCURRENCY = "HINDSIGHT_API_MENTAL_MODEL_REFRESH_CONCURRENCY"

+ # Vertex AI configuration
+ ENV_LLM_VERTEXAI_PROJECT_ID = "HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID"
+ ENV_LLM_VERTEXAI_REGION = "HINDSIGHT_API_LLM_VERTEXAI_REGION"
+ ENV_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY = "HINDSIGHT_API_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY"
+
  # Retain settings
  ENV_RETAIN_MAX_COMPLETION_TOKENS = "HINDSIGHT_API_RETAIN_MAX_COMPLETION_TOKENS"
  ENV_RETAIN_CHUNK_SIZE = "HINDSIGHT_API_RETAIN_CHUNK_SIZE"
  ENV_RETAIN_EXTRACT_CAUSAL_LINKS = "HINDSIGHT_API_RETAIN_EXTRACT_CAUSAL_LINKS"
  ENV_RETAIN_EXTRACTION_MODE = "HINDSIGHT_API_RETAIN_EXTRACTION_MODE"
  ENV_RETAIN_CUSTOM_INSTRUCTIONS = "HINDSIGHT_API_RETAIN_CUSTOM_INSTRUCTIONS"
- ENV_RETAIN_OBSERVATIONS_ASYNC = "HINDSIGHT_API_RETAIN_OBSERVATIONS_ASYNC"

  # Observations settings (consolidated knowledge from facts)
  ENV_ENABLE_OBSERVATIONS = "HINDSIGHT_API_ENABLE_OBSERVATIONS"
@@ -121,8 +143,9 @@ ENV_WORKER_ENABLED = "HINDSIGHT_API_WORKER_ENABLED"
  ENV_WORKER_ID = "HINDSIGHT_API_WORKER_ID"
  ENV_WORKER_POLL_INTERVAL_MS = "HINDSIGHT_API_WORKER_POLL_INTERVAL_MS"
  ENV_WORKER_MAX_RETRIES = "HINDSIGHT_API_WORKER_MAX_RETRIES"
- ENV_WORKER_BATCH_SIZE = "HINDSIGHT_API_WORKER_BATCH_SIZE"
  ENV_WORKER_HTTP_PORT = "HINDSIGHT_API_WORKER_HTTP_PORT"
+ ENV_WORKER_MAX_SLOTS = "HINDSIGHT_API_WORKER_MAX_SLOTS"
+ ENV_WORKER_CONSOLIDATION_MAX_SLOTS = "HINDSIGHT_API_WORKER_CONSOLIDATION_MAX_SLOTS"

  # Reflect agent settings
  ENV_REFLECT_MAX_ITERATIONS = "HINDSIGHT_API_REFLECT_MAX_ITERATIONS"
@@ -133,8 +156,16 @@ DEFAULT_DATABASE_SCHEMA = "public"
  DEFAULT_LLM_PROVIDER = "openai"
  DEFAULT_LLM_MODEL = "gpt-5-mini"
  DEFAULT_LLM_MAX_CONCURRENT = 32
+ DEFAULT_LLM_MAX_RETRIES = 10  # Max retry attempts for LLM API calls
+ DEFAULT_LLM_INITIAL_BACKOFF = 1.0  # Initial backoff in seconds for retry exponential backoff
+ DEFAULT_LLM_MAX_BACKOFF = 60.0  # Max backoff cap in seconds for retry exponential backoff
  DEFAULT_LLM_TIMEOUT = 120.0  # seconds

+ # Vertex AI defaults
+ DEFAULT_LLM_VERTEXAI_PROJECT_ID = None  # Required for Vertex AI
+ DEFAULT_LLM_VERTEXAI_REGION = "us-central1"
+ DEFAULT_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY = None  # Optional, uses ADC if not set
+
  DEFAULT_EMBEDDINGS_PROVIDER = "local"
  DEFAULT_EMBEDDINGS_LOCAL_MODEL = "BAAI/bge-small-en-v1.5"
  DEFAULT_EMBEDDINGS_LOCAL_FORCE_CPU = False  # Force CPU mode for local embeddings (avoids MPS/XPC issues on macOS)
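
The three new retry knobs read like a standard exponential backoff schedule: double the delay each attempt, starting at DEFAULT_LLM_INITIAL_BACKOFF and capping at DEFAULT_LLM_MAX_BACKOFF. A sketch under that assumption; the actual retry loop is not part of this diff:

```python
def backoff_delay(attempt: int, initial: float = 1.0, cap: float = 60.0) -> float:
    """Assumed schedule: initial * 2**attempt, capped. 1s, 2s, 4s, ..., 60s."""
    return min(initial * (2 ** attempt), cap)

# With the 0.4.3 defaults (initial=1.0, cap=60.0):
assert [backoff_delay(n) for n in range(8)] == [1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 60.0, 60.0]
```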
@@ -179,7 +210,6 @@ DEFAULT_RETAIN_EXTRACT_CAUSAL_LINKS = True # Extract causal links between facts
  DEFAULT_RETAIN_EXTRACTION_MODE = "concise"  # Extraction mode: "concise", "verbose", or "custom"
  RETAIN_EXTRACTION_MODES = ("concise", "verbose", "custom")  # Allowed extraction modes
  DEFAULT_RETAIN_CUSTOM_INSTRUCTIONS = None  # Custom extraction guidelines (only used when mode="custom")
- DEFAULT_RETAIN_OBSERVATIONS_ASYNC = False  # Run observation generation async (after retain completes)

  # Observations defaults (consolidated knowledge from facts)
  DEFAULT_ENABLE_OBSERVATIONS = True  # Observations enabled by default
@@ -200,8 +230,9 @@ DEFAULT_WORKER_ENABLED = True # API runs worker by default (standalone mode)
  DEFAULT_WORKER_ID = None  # Will use hostname if not specified
  DEFAULT_WORKER_POLL_INTERVAL_MS = 500  # Poll database every 500ms
  DEFAULT_WORKER_MAX_RETRIES = 3  # Max retries before marking task failed
- DEFAULT_WORKER_BATCH_SIZE = 10  # Tasks to claim per poll cycle
  DEFAULT_WORKER_HTTP_PORT = 8889  # HTTP port for worker metrics/health
+ DEFAULT_WORKER_MAX_SLOTS = 10  # Total concurrent tasks per worker
+ DEFAULT_WORKER_CONSOLIDATION_MAX_SLOTS = 2  # Max concurrent consolidation tasks per worker

  # Reflect agent settings
  DEFAULT_REFLECT_MAX_ITERATIONS = 10  # Max tool call iterations before forcing response
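
For upgraders: HINDSIGHT_API_WORKER_BATCH_SIZE is gone; worker concurrency is now bounded by two slot limits instead of a claim-batch size. A sketch of the new tuning (values are illustrative; the import path and the loader name from_env are assumptions, though the loader's body appears later in this diff):

```python
import os

# batch_size is removed in 0.4.3; concurrency is governed by slots instead.
os.environ["HINDSIGHT_API_WORKER_MAX_SLOTS"] = "16"               # total concurrent tasks (default 10)
os.environ["HINDSIGHT_API_WORKER_CONSOLIDATION_MAX_SLOTS"] = "4"  # consolidation share (default 2)

# from hindsight_api.config import HindsightConfig   # import path assumed
# config = HindsightConfig.from_env()                # loader name assumed
```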
@@ -286,23 +317,46 @@ class HindsightConfig:
  llm_model: str
  llm_base_url: str | None
  llm_max_concurrent: int
+ llm_max_retries: int
+ llm_initial_backoff: float
+ llm_max_backoff: float
  llm_timeout: float

+ # Vertex AI configuration
+ llm_vertexai_project_id: str | None
+ llm_vertexai_region: str
+ llm_vertexai_service_account_key: str | None
+
  # Per-operation LLM configuration (None = use default LLM config)
  retain_llm_provider: str | None
  retain_llm_api_key: str | None
  retain_llm_model: str | None
  retain_llm_base_url: str | None
+ retain_llm_max_concurrent: int | None
+ retain_llm_max_retries: int | None
+ retain_llm_initial_backoff: float | None
+ retain_llm_max_backoff: float | None
+ retain_llm_timeout: float | None

  reflect_llm_provider: str | None
  reflect_llm_api_key: str | None
  reflect_llm_model: str | None
  reflect_llm_base_url: str | None
+ reflect_llm_max_concurrent: int | None
+ reflect_llm_max_retries: int | None
+ reflect_llm_initial_backoff: float | None
+ reflect_llm_max_backoff: float | None
+ reflect_llm_timeout: float | None

  consolidation_llm_provider: str | None
  consolidation_llm_api_key: str | None
  consolidation_llm_model: str | None
  consolidation_llm_base_url: str | None
+ consolidation_llm_max_concurrent: int | None
+ consolidation_llm_max_retries: int | None
+ consolidation_llm_initial_backoff: float | None
+ consolidation_llm_max_backoff: float | None
+ consolidation_llm_timeout: float | None

  # Embeddings
  embeddings_provider: str
@@ -343,7 +397,6 @@ class HindsightConfig:
  retain_extract_causal_links: bool
  retain_extraction_mode: str
  retain_custom_instructions: str | None
- retain_observations_async: bool

  # Observations settings (consolidated knowledge from facts)
  enable_observations: bool
@@ -368,8 +421,9 @@ class HindsightConfig:
  worker_id: str | None
  worker_poll_interval_ms: int
  worker_max_retries: int
- worker_batch_size: int
  worker_http_port: int
+ worker_max_slots: int
+ worker_consolidation_max_slots: int

  # Reflect agent settings
  reflect_max_iterations: int
@@ -387,20 +441,71 @@ class HindsightConfig:
  llm_model=os.getenv(ENV_LLM_MODEL, DEFAULT_LLM_MODEL),
  llm_base_url=os.getenv(ENV_LLM_BASE_URL) or None,
  llm_max_concurrent=int(os.getenv(ENV_LLM_MAX_CONCURRENT, str(DEFAULT_LLM_MAX_CONCURRENT))),
+ llm_max_retries=int(os.getenv(ENV_LLM_MAX_RETRIES, str(DEFAULT_LLM_MAX_RETRIES))),
+ llm_initial_backoff=float(os.getenv(ENV_LLM_INITIAL_BACKOFF, str(DEFAULT_LLM_INITIAL_BACKOFF))),
+ llm_max_backoff=float(os.getenv(ENV_LLM_MAX_BACKOFF, str(DEFAULT_LLM_MAX_BACKOFF))),
  llm_timeout=float(os.getenv(ENV_LLM_TIMEOUT, str(DEFAULT_LLM_TIMEOUT))),
+ # Vertex AI
+ llm_vertexai_project_id=os.getenv(ENV_LLM_VERTEXAI_PROJECT_ID) or DEFAULT_LLM_VERTEXAI_PROJECT_ID,
+ llm_vertexai_region=os.getenv(ENV_LLM_VERTEXAI_REGION, DEFAULT_LLM_VERTEXAI_REGION),
+ llm_vertexai_service_account_key=os.getenv(ENV_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY)
+ or DEFAULT_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY,
  # Per-operation LLM config (None = use default)
  retain_llm_provider=os.getenv(ENV_RETAIN_LLM_PROVIDER) or None,
  retain_llm_api_key=os.getenv(ENV_RETAIN_LLM_API_KEY) or None,
  retain_llm_model=os.getenv(ENV_RETAIN_LLM_MODEL) or None,
  retain_llm_base_url=os.getenv(ENV_RETAIN_LLM_BASE_URL) or None,
+ retain_llm_max_concurrent=int(os.getenv(ENV_RETAIN_LLM_MAX_CONCURRENT))
+ if os.getenv(ENV_RETAIN_LLM_MAX_CONCURRENT)
+ else None,
+ retain_llm_max_retries=int(os.getenv(ENV_RETAIN_LLM_MAX_RETRIES))
+ if os.getenv(ENV_RETAIN_LLM_MAX_RETRIES)
+ else None,
+ retain_llm_initial_backoff=float(os.getenv(ENV_RETAIN_LLM_INITIAL_BACKOFF))
+ if os.getenv(ENV_RETAIN_LLM_INITIAL_BACKOFF)
+ else None,
+ retain_llm_max_backoff=float(os.getenv(ENV_RETAIN_LLM_MAX_BACKOFF))
+ if os.getenv(ENV_RETAIN_LLM_MAX_BACKOFF)
+ else None,
+ retain_llm_timeout=float(os.getenv(ENV_RETAIN_LLM_TIMEOUT)) if os.getenv(ENV_RETAIN_LLM_TIMEOUT) else None,
  reflect_llm_provider=os.getenv(ENV_REFLECT_LLM_PROVIDER) or None,
  reflect_llm_api_key=os.getenv(ENV_REFLECT_LLM_API_KEY) or None,
  reflect_llm_model=os.getenv(ENV_REFLECT_LLM_MODEL) or None,
  reflect_llm_base_url=os.getenv(ENV_REFLECT_LLM_BASE_URL) or None,
+ reflect_llm_max_concurrent=int(os.getenv(ENV_REFLECT_LLM_MAX_CONCURRENT))
+ if os.getenv(ENV_REFLECT_LLM_MAX_CONCURRENT)
+ else None,
+ reflect_llm_max_retries=int(os.getenv(ENV_REFLECT_LLM_MAX_RETRIES))
+ if os.getenv(ENV_REFLECT_LLM_MAX_RETRIES)
+ else None,
+ reflect_llm_initial_backoff=float(os.getenv(ENV_REFLECT_LLM_INITIAL_BACKOFF))
+ if os.getenv(ENV_REFLECT_LLM_INITIAL_BACKOFF)
+ else None,
+ reflect_llm_max_backoff=float(os.getenv(ENV_REFLECT_LLM_MAX_BACKOFF))
+ if os.getenv(ENV_REFLECT_LLM_MAX_BACKOFF)
+ else None,
+ reflect_llm_timeout=float(os.getenv(ENV_REFLECT_LLM_TIMEOUT))
+ if os.getenv(ENV_REFLECT_LLM_TIMEOUT)
+ else None,
  consolidation_llm_provider=os.getenv(ENV_CONSOLIDATION_LLM_PROVIDER) or None,
  consolidation_llm_api_key=os.getenv(ENV_CONSOLIDATION_LLM_API_KEY) or None,
  consolidation_llm_model=os.getenv(ENV_CONSOLIDATION_LLM_MODEL) or None,
  consolidation_llm_base_url=os.getenv(ENV_CONSOLIDATION_LLM_BASE_URL) or None,
+ consolidation_llm_max_concurrent=int(os.getenv(ENV_CONSOLIDATION_LLM_MAX_CONCURRENT))
+ if os.getenv(ENV_CONSOLIDATION_LLM_MAX_CONCURRENT)
+ else None,
+ consolidation_llm_max_retries=int(os.getenv(ENV_CONSOLIDATION_LLM_MAX_RETRIES))
+ if os.getenv(ENV_CONSOLIDATION_LLM_MAX_RETRIES)
+ else None,
+ consolidation_llm_initial_backoff=float(os.getenv(ENV_CONSOLIDATION_LLM_INITIAL_BACKOFF))
+ if os.getenv(ENV_CONSOLIDATION_LLM_INITIAL_BACKOFF)
+ else None,
+ consolidation_llm_max_backoff=float(os.getenv(ENV_CONSOLIDATION_LLM_MAX_BACKOFF))
+ if os.getenv(ENV_CONSOLIDATION_LLM_MAX_BACKOFF)
+ else None,
+ consolidation_llm_timeout=float(os.getenv(ENV_CONSOLIDATION_LLM_TIMEOUT))
+ if os.getenv(ENV_CONSOLIDATION_LLM_TIMEOUT)
+ else None,
  # Embeddings
  embeddings_provider=os.getenv(ENV_EMBEDDINGS_PROVIDER, DEFAULT_EMBEDDINGS_PROVIDER),
  embeddings_local_model=os.getenv(ENV_EMBEDDINGS_LOCAL_MODEL, DEFAULT_EMBEDDINGS_LOCAL_MODEL),
@@ -460,10 +565,6 @@ class HindsightConfig:
  os.getenv(ENV_RETAIN_EXTRACTION_MODE, DEFAULT_RETAIN_EXTRACTION_MODE)
  ),
  retain_custom_instructions=os.getenv(ENV_RETAIN_CUSTOM_INSTRUCTIONS) or DEFAULT_RETAIN_CUSTOM_INSTRUCTIONS,
- retain_observations_async=os.getenv(
- ENV_RETAIN_OBSERVATIONS_ASYNC, str(DEFAULT_RETAIN_OBSERVATIONS_ASYNC)
- ).lower()
- == "true",
  # Observations settings (consolidated knowledge from facts)
  enable_observations=os.getenv(ENV_ENABLE_OBSERVATIONS, str(DEFAULT_ENABLE_OBSERVATIONS)).lower() == "true",
  consolidation_batch_size=int(
@@ -484,8 +585,11 @@ class HindsightConfig:
  worker_id=os.getenv(ENV_WORKER_ID) or DEFAULT_WORKER_ID,
  worker_poll_interval_ms=int(os.getenv(ENV_WORKER_POLL_INTERVAL_MS, str(DEFAULT_WORKER_POLL_INTERVAL_MS))),
  worker_max_retries=int(os.getenv(ENV_WORKER_MAX_RETRIES, str(DEFAULT_WORKER_MAX_RETRIES))),
- worker_batch_size=int(os.getenv(ENV_WORKER_BATCH_SIZE, str(DEFAULT_WORKER_BATCH_SIZE))),
  worker_http_port=int(os.getenv(ENV_WORKER_HTTP_PORT, str(DEFAULT_WORKER_HTTP_PORT))),
+ worker_max_slots=int(os.getenv(ENV_WORKER_MAX_SLOTS, str(DEFAULT_WORKER_MAX_SLOTS))),
+ worker_consolidation_max_slots=int(
+ os.getenv(ENV_WORKER_CONSOLIDATION_MAX_SLOTS, str(DEFAULT_WORKER_CONSOLIDATION_MAX_SLOTS))
+ ),
  # Reflect agent settings
  reflect_max_iterations=int(os.getenv(ENV_REFLECT_MAX_ITERATIONS, str(DEFAULT_REFLECT_MAX_ITERATIONS))),
  )
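
Each per-operation override above repeats the same parse-if-set idiom (int(os.getenv(X)) if os.getenv(X) else None). One way to read it, factored into a helper; optional_env is a sketch only, not part of the package:

```python
import os
from typing import Callable, TypeVar

T = TypeVar("T")

def optional_env(name: str, parse: Callable[[str], T]) -> T | None:
    """Return parse(value) when the variable is set and non-empty, else None."""
    raw = os.getenv(name)
    return parse(raw) if raw else None

# Equivalent to the inline pattern above, e.g.:
# retain_llm_timeout=optional_env("HINDSIGHT_API_RETAIN_LLM_TIMEOUT", float)
```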
hindsight_api/daemon.py CHANGED
@@ -52,7 +52,10 @@ class IdleTimeoutMiddleware:
  logger.info(f"Idle timeout reached ({self.idle_timeout}s), shutting down daemon")
  # Give a moment for any in-flight requests
  await asyncio.sleep(1)
- os._exit(0)
+ # Send SIGTERM to ourselves to trigger graceful shutdown
+ import signal
+
+ os.kill(os.getpid(), signal.SIGTERM)


  class DaemonLock:
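
The idle-timeout change swaps an abrupt exit for a self-delivered SIGTERM. The practical difference, as a standalone sketch (illustrative, not package code): os._exit() terminates immediately, skipping finally blocks, atexit hooks, and the ASGI server's lifespan shutdown, while SIGTERM lets a server such as uvicorn catch the signal and drain connections first.

```python
import os
import signal

def hard_stop() -> None:
    os._exit(0)  # immediate: no finally blocks, atexit hooks, or server shutdown run

def graceful_stop() -> None:
    # The server's SIGTERM handler (e.g. uvicorn's) runs its normal shutdown path.
    os.kill(os.getpid(), signal.SIGTERM)
```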
@@ -144,10 +144,14 @@ async def run_consolidation_job(
  }

  batch_num = 0
+ last_progress_timings = {}  # Track timings at last progress log
  while True:
  batch_num += 1
  batch_start = time.time()

+ # Snapshot timings at batch start for per-batch calculation
+ batch_start_timings = perf.timings.copy()
+
  # Fetch next batch of unconsolidated memories
  async with pool.acquire() as conn:
  t0 = time.time()
@@ -217,19 +221,44 @@ async def run_consolidation_job(
  elif action == "skipped":
  stats["skipped"] += 1

- # Log progress periodically
+ # Log progress periodically with timing breakdown
  if stats["memories_processed"] % 10 == 0:
+ # Calculate timing deltas since last progress log
+ timing_parts = []
+ for key in ["recall", "llm", "embedding", "db_write"]:
+ if key in perf.timings:
+ delta = perf.timings[key] - last_progress_timings.get(key, 0)
+ timing_parts.append(f"{key}={delta:.2f}s")
+
+ timing_str = f" | {', '.join(timing_parts)}" if timing_parts else ""
  logger.info(
  f"[CONSOLIDATION] bank={bank_id} progress: "
- f"{stats['memories_processed']}/{total_count} memories processed"
+ f"{stats['memories_processed']}/{total_count} memories processed{timing_str}"
  )

+ # Update last progress snapshot
+ last_progress_timings = perf.timings.copy()
+
  batch_time = time.time() - batch_start
  perf.log(
  f"[2] Batch {batch_num}: {len(memories)} memories in {batch_time:.3f}s "
  f"(avg {batch_time / len(memories):.3f}s/memory)"
  )

+ # Log timing breakdown after each batch (delta from batch start)
+ timing_parts = []
+ for key in ["recall", "llm", "embedding", "db_write"]:
+ if key in perf.timings:
+ delta = perf.timings[key] - batch_start_timings.get(key, 0)
+ timing_parts.append(f"{key}={delta:.3f}s")
+
+ if timing_parts:
+ avg_per_memory = batch_time / len(memories) if memories else 0
+ logger.info(
+ f"[CONSOLIDATION] bank={bank_id} batch {batch_num}/{len(memories)} memories: "
+ f"{', '.join(timing_parts)} | avg={avg_per_memory:.3f}s/memory"
+ )
+
  # Build summary
  perf.log(
  f"[3] Results: {stats['memories_processed']} memories -> "
@@ -836,7 +865,14 @@ Focus on DURABLE knowledge that serves this mission, not ephemeral state.
  )
  # Parse JSON response - should be an array
  if isinstance(result, str):
- result = json.loads(result)
+ # Strip markdown code fences (some models wrap JSON in ```json ... ```)
+ clean = result.strip()
+ if clean.startswith("```"):
+ clean = clean.split("\n", 1)[1] if "\n" in clean else clean[3:]
+ if clean.endswith("```"):
+ clean = clean[:-3]
+ clean = clean.strip()
+ result = json.loads(clean)
  # Ensure result is a list
  if isinstance(result, list):
  return result
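
The unfencing logic drops an opening fence line (which may carry a language tag such as json after the backticks) and a trailing fence before parsing. Lifted into a reusable helper for testing; strip_code_fences is a hypothetical name, and the body mirrors the diff above:

```python
import json

def strip_code_fences(text: str) -> str:
    """Remove a surrounding markdown code fence, if present, then strip whitespace."""
    clean = text.strip()
    if clean.startswith("```"):
        # Drop the opening fence line, including any language tag like ```json
        clean = clean.split("\n", 1)[1] if "\n" in clean else clean[3:]
    if clean.endswith("```"):
        clean = clean[:-3]
    return clean.strip()

assert json.loads(strip_code_fences('```json\n[{"a": 1}]\n```')) == [{"a": 1}]
assert json.loads(strip_code_fences('[{"a": 1}]')) == [{"a": 1}]
```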