hindsight-api 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +1 -1
- hindsight_api/api/http.py +7 -19
- hindsight_api/api/mcp.py +45 -5
- hindsight_api/config.py +115 -11
- hindsight_api/daemon.py +4 -1
- hindsight_api/engine/consolidation/consolidator.py +39 -3
- hindsight_api/engine/cross_encoder.py +7 -99
- hindsight_api/engine/embeddings.py +3 -93
- hindsight_api/engine/interface.py +0 -43
- hindsight_api/engine/llm_wrapper.py +93 -22
- hindsight_api/engine/memory_engine.py +37 -138
- hindsight_api/engine/response_models.py +1 -21
- hindsight_api/engine/retain/fact_extraction.py +19 -23
- hindsight_api/engine/retain/orchestrator.py +1 -4
- hindsight_api/engine/utils.py +0 -3
- hindsight_api/main.py +27 -12
- hindsight_api/mcp_tools.py +31 -12
- hindsight_api/metrics.py +3 -3
- hindsight_api/pg0.py +1 -1
- hindsight_api/worker/main.py +11 -11
- hindsight_api/worker/poller.py +226 -97
- {hindsight_api-0.4.1.dist-info → hindsight_api-0.4.3.dist-info}/METADATA +2 -1
- {hindsight_api-0.4.1.dist-info → hindsight_api-0.4.3.dist-info}/RECORD +25 -25
- {hindsight_api-0.4.1.dist-info → hindsight_api-0.4.3.dist-info}/WHEEL +0 -0
- {hindsight_api-0.4.1.dist-info → hindsight_api-0.4.3.dist-info}/entry_points.txt +0 -0
hindsight_api/__init__.py
CHANGED
hindsight_api/api/http.py
CHANGED
@@ -92,8 +92,7 @@ class RecallRequest(BaseModel):
     query: str
     types: list[str] | None = Field(
         default=None,
-        description="List of fact types to recall: 'world', 'experience', 'observation'. Defaults to world and experience if not specified. "
-        "Note: 'opinion' is accepted but ignored (opinions are excluded from recall).",
+        description="List of fact types to recall: 'world', 'experience', 'observation'. Defaults to world and experience if not specified.",
     )
     budget: Budget = Budget.MID
     max_tokens: int = 4096
@@ -504,13 +503,6 @@ class ReflectRequest(BaseModel):
     )
 
 
-class OpinionItem(BaseModel):
-    """Model for an opinion with confidence score."""
-
-    text: str
-    confidence: float
-
-
 class ReflectFact(BaseModel):
     """A fact used in think response."""
 
@@ -529,7 +521,7 @@ class ReflectFact(BaseModel):
 
     id: str | None = None
     text: str
-    type: str | None = None  # fact type: world, experience,
+    type: str | None = None  # fact type: world, experience, observation
     context: str | None = None
     occurred_start: str | None = None
     occurred_end: str | None = None
@@ -1412,9 +1404,10 @@ def create_app(
             worker_id=worker_id,
             executor=memory.execute_task,
             poll_interval_ms=config.worker_poll_interval_ms,
-            batch_size=config.worker_batch_size,
             max_retries=config.worker_max_retries,
             tenant_extension=getattr(memory, "_tenant_extension", None),
+            max_slots=config.worker_max_slots,
+            consolidation_max_slots=config.worker_consolidation_max_slots,
         )
         poller_task = asyncio.create_task(poller.run())
         logging.info(f"Worker poller started (worker_id={worker_id})")
@@ -1707,9 +1700,7 @@ def _register_routes(app: FastAPI):
         description="Recall memory using semantic similarity and spreading activation.\n\n"
         "The type parameter is optional and must be one of:\n"
         "- `world`: General knowledge about people, places, events, and things that happen\n"
-        "- `experience`: Memories about experience, conversations, actions taken, and tasks performed\n"
-        "- `opinion`: The bank's formed beliefs, perspectives, and viewpoints\n\n"
-        "Set `include_entities=true` to get entity observations alongside recall results.",
+        "- `experience`: Memories about experience, conversations, actions taken, and tasks performed",
         operation_id="recall_memories",
         tags=["Memory"],
     )
@@ -1723,10 +1714,8 @@ def _register_routes(app: FastAPI):
         metrics = get_metrics_collector()
 
         try:
-            # Default to world and experience if not specified (exclude observation)
-            # Filter out 'opinion' even if requested - opinions are excluded from recall
+            # Default to world and experience if not specified (exclude observation)
             fact_types = request.types if request.types else list(VALID_RECALL_FACT_TYPES)
-            fact_types = [ft for ft in fact_types if ft != "opinion"]
 
             # Parse query_timestamp if provided
             question_date = None
@@ -1858,8 +1847,7 @@ def _register_routes(app: FastAPI):
         "2. Retrieves world facts relevant to the query\n"
         "3. Retrieves existing opinions (bank's perspectives)\n"
         "4. Uses LLM to formulate a contextual answer\n"
-        "5. 
-        "6. Returns plain text answer, the facts used, and new opinions",
+        "5. Returns plain text answer and the facts used",
         operation_id="reflect",
         tags=["Memory"],
     )
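Note: with 0.4.3 the 'opinion' recall type is gone end to end: the request schema, the endpoint description, and the server-side filter all drop it, and the unused OpinionItem model is removed. A minimal sketch of the resulting request shape, assuming RecallRequest and Budget import from hindsight_api.api.http as defined above:

    from hindsight_api.api.http import Budget, RecallRequest

    # Omitting `types` lets the server default to world + experience.
    default_req = RecallRequest(query="What does Alice work on?")

    # Explicit types may now only name world/experience/observation.
    explicit_req = RecallRequest(
        query="What does Alice work on?",
        types=["world", "observation"],
        budget=Budget.MID,
        max_tokens=2048,
    )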
hindsight_api/api/mcp.py
CHANGED
@@ -29,15 +29,26 @@ logger = logging.getLogger(__name__)
 # Default bank_id from environment variable
 DEFAULT_BANK_ID = os.environ.get("HINDSIGHT_MCP_BANK_ID", "default")
 
+# MCP authentication token (optional - if set, Bearer token auth is required)
+MCP_AUTH_TOKEN = os.environ.get("HINDSIGHT_API_MCP_AUTH_TOKEN")
+
 # Context variable to hold the current bank_id
 _current_bank_id: ContextVar[str | None] = ContextVar("current_bank_id", default=None)
 
+# Context variable to hold the current API key (for tenant auth propagation)
+_current_api_key: ContextVar[str | None] = ContextVar("current_api_key", default=None)
+
 
 def get_current_bank_id() -> str | None:
     """Get the current bank_id from context."""
     return _current_bank_id.get()
 
 
+def get_current_api_key() -> str | None:
+    """Get the current API key from context."""
+    return _current_api_key.get()
+
+
 def create_mcp_server(memory: MemoryEngine) -> FastMCP:
     """
     Create and configure the Hindsight MCP server.
@@ -54,6 +65,7 @@ def create_mcp_server(memory: MemoryEngine) -> FastMCP:
     # Configure and register tools using shared module
     config = MCPToolsConfig(
         bank_id_resolver=get_current_bank_id,
+        api_key_resolver=get_current_api_key,  # Propagate API key for tenant auth
         include_bank_id_param=True,  # HTTP MCP supports multi-bank via parameter
         tools=None,  # All tools
        retain_fire_and_forget=False,  # HTTP MCP supports sync/async modes
@@ -65,7 +77,11 @@ def create_mcp_server(memory: MemoryEngine) -> FastMCP:
 
 
 class MCPMiddleware:
-    """ASGI middleware that extracts bank_id from header or path
+    """ASGI middleware that handles authentication and extracts bank_id from header or path.
+
+    Authentication:
+        If HINDSIGHT_API_MCP_AUTH_TOKEN is set, all requests must include a valid
+        Authorization header with Bearer token or direct token matching the configured value.
 
     Bank ID can be provided via:
     1. X-Bank-Id header (recommended for Claude Code)
@@ -74,7 +90,7 @@ class MCPMiddleware:
 
     For Claude Code, configure with:
         claude mcp add --transport http hindsight http://localhost:8888/mcp \\
-            --header "X-Bank-Id: my-bank"
+            --header "X-Bank-Id: my-bank" --header "Authorization: Bearer <token>"
     """
 
     def __init__(self, app, memory: MemoryEngine):
@@ -98,6 +114,22 @@ class MCPMiddleware:
             await self.mcp_app(scope, receive, send)
             return
 
+        # Extract auth token from header (for tenant auth propagation)
+        auth_header = self._get_header(scope, "Authorization")
+        auth_token: str | None = None
+        if auth_header:
+            # Support both "Bearer <token>" and direct token
+            auth_token = auth_header[7:].strip() if auth_header.startswith("Bearer ") else auth_header.strip()
+
+        # Authenticate if MCP_AUTH_TOKEN is configured
+        if MCP_AUTH_TOKEN:
+            if not auth_token:
+                await self._send_error(send, 401, "Authorization header required")
+                return
+            if auth_token != MCP_AUTH_TOKEN:
+                await self._send_error(send, 401, "Invalid authentication token")
+                return
+
         path = scope.get("path", "")
 
         # Strip any mount prefix (e.g., /mcp) that FastAPI might not have stripped
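The token extraction above accepts either a standard Bearer header or a bare token value. The same parsing as a standalone sketch (a hypothetical helper, not a function in the package):

    def parse_auth_token(auth_header: str | None) -> str | None:
        # Accept both "Bearer <token>" and a direct token value.
        if not auth_header:
            return None
        if auth_header.startswith("Bearer "):
            return auth_header[7:].strip()
        return auth_header.strip()

    assert parse_auth_token("Bearer abc123") == "abc123"
    assert parse_auth_token("abc123") == "abc123"
    assert parse_auth_token(None) is None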
@@ -132,8 +164,10 @@ class MCPMiddleware:
             bank_id = DEFAULT_BANK_ID
             logger.debug(f"Using default bank_id: {bank_id}")
 
-        # Set bank_id context
-        token = _current_bank_id.set(bank_id)
+        # Set bank_id and api_key context
+        bank_id_token = _current_bank_id.set(bank_id)
+        # Store the auth token for tenant extension to validate
+        api_key_token = _current_api_key.set(auth_token) if auth_token else None
         try:
             new_scope = scope.copy()
             new_scope["path"] = new_path
@@ -152,7 +186,9 @@ class MCPMiddleware:
 
             await self.mcp_app(new_scope, receive, send_wrapper)
         finally:
-            _current_bank_id.reset(token)
+            _current_bank_id.reset(bank_id_token)
+            if api_key_token is not None:
+                _current_api_key.reset(api_key_token)
 
     async def _send_error(self, send, status: int, message: str):
         """Send an error response."""
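The set/reset pairing matters: ContextVar.set() returns a Token, and resetting with that token in a finally block restores the previous value even when requests interleave across asyncio tasks. In miniature (a sketch of the stdlib pattern the middleware relies on):

    from contextvars import ContextVar

    _current_api_key: ContextVar[str | None] = ContextVar("current_api_key", default=None)

    async def handle_request(api_key: str | None):
        reset_token = _current_api_key.set(api_key)
        try:
            ...  # serve the request; tools call _current_api_key.get()
        finally:
            # Restores whatever value this task saw before set().
            _current_api_key.reset(reset_token)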
@@ -176,6 +212,10 @@ def create_mcp_app(memory: MemoryEngine):
     """
     Create an ASGI app that handles MCP requests.
 
+    Authentication:
+        Set HINDSIGHT_API_MCP_AUTH_TOKEN to require Bearer token authentication.
+        If not set, MCP endpoint is open (for local development).
+
     Bank ID can be provided via:
     1. X-Bank-Id header: claude mcp add --transport http hindsight http://localhost:8888/mcp --header "X-Bank-Id: my-bank"
     2. URL path: /mcp/{bank_id}/
hindsight_api/config.py
CHANGED
@@ -26,6 +26,9 @@ ENV_LLM_API_KEY = "HINDSIGHT_API_LLM_API_KEY"
 ENV_LLM_MODEL = "HINDSIGHT_API_LLM_MODEL"
 ENV_LLM_BASE_URL = "HINDSIGHT_API_LLM_BASE_URL"
 ENV_LLM_MAX_CONCURRENT = "HINDSIGHT_API_LLM_MAX_CONCURRENT"
+ENV_LLM_MAX_RETRIES = "HINDSIGHT_API_LLM_MAX_RETRIES"
+ENV_LLM_INITIAL_BACKOFF = "HINDSIGHT_API_LLM_INITIAL_BACKOFF"
+ENV_LLM_MAX_BACKOFF = "HINDSIGHT_API_LLM_MAX_BACKOFF"
 ENV_LLM_TIMEOUT = "HINDSIGHT_API_LLM_TIMEOUT"
 ENV_LLM_GROQ_SERVICE_TIER = "HINDSIGHT_API_LLM_GROQ_SERVICE_TIER"
 
@@ -34,16 +37,31 @@ ENV_RETAIN_LLM_PROVIDER = "HINDSIGHT_API_RETAIN_LLM_PROVIDER"
 ENV_RETAIN_LLM_API_KEY = "HINDSIGHT_API_RETAIN_LLM_API_KEY"
 ENV_RETAIN_LLM_MODEL = "HINDSIGHT_API_RETAIN_LLM_MODEL"
 ENV_RETAIN_LLM_BASE_URL = "HINDSIGHT_API_RETAIN_LLM_BASE_URL"
+ENV_RETAIN_LLM_MAX_CONCURRENT = "HINDSIGHT_API_RETAIN_LLM_MAX_CONCURRENT"
+ENV_RETAIN_LLM_MAX_RETRIES = "HINDSIGHT_API_RETAIN_LLM_MAX_RETRIES"
+ENV_RETAIN_LLM_INITIAL_BACKOFF = "HINDSIGHT_API_RETAIN_LLM_INITIAL_BACKOFF"
+ENV_RETAIN_LLM_MAX_BACKOFF = "HINDSIGHT_API_RETAIN_LLM_MAX_BACKOFF"
+ENV_RETAIN_LLM_TIMEOUT = "HINDSIGHT_API_RETAIN_LLM_TIMEOUT"
 
 ENV_REFLECT_LLM_PROVIDER = "HINDSIGHT_API_REFLECT_LLM_PROVIDER"
 ENV_REFLECT_LLM_API_KEY = "HINDSIGHT_API_REFLECT_LLM_API_KEY"
 ENV_REFLECT_LLM_MODEL = "HINDSIGHT_API_REFLECT_LLM_MODEL"
 ENV_REFLECT_LLM_BASE_URL = "HINDSIGHT_API_REFLECT_LLM_BASE_URL"
+ENV_REFLECT_LLM_MAX_CONCURRENT = "HINDSIGHT_API_REFLECT_LLM_MAX_CONCURRENT"
+ENV_REFLECT_LLM_MAX_RETRIES = "HINDSIGHT_API_REFLECT_LLM_MAX_RETRIES"
+ENV_REFLECT_LLM_INITIAL_BACKOFF = "HINDSIGHT_API_REFLECT_LLM_INITIAL_BACKOFF"
+ENV_REFLECT_LLM_MAX_BACKOFF = "HINDSIGHT_API_REFLECT_LLM_MAX_BACKOFF"
+ENV_REFLECT_LLM_TIMEOUT = "HINDSIGHT_API_REFLECT_LLM_TIMEOUT"
 
 ENV_CONSOLIDATION_LLM_PROVIDER = "HINDSIGHT_API_CONSOLIDATION_LLM_PROVIDER"
 ENV_CONSOLIDATION_LLM_API_KEY = "HINDSIGHT_API_CONSOLIDATION_LLM_API_KEY"
 ENV_CONSOLIDATION_LLM_MODEL = "HINDSIGHT_API_CONSOLIDATION_LLM_MODEL"
 ENV_CONSOLIDATION_LLM_BASE_URL = "HINDSIGHT_API_CONSOLIDATION_LLM_BASE_URL"
+ENV_CONSOLIDATION_LLM_MAX_CONCURRENT = "HINDSIGHT_API_CONSOLIDATION_LLM_MAX_CONCURRENT"
+ENV_CONSOLIDATION_LLM_MAX_RETRIES = "HINDSIGHT_API_CONSOLIDATION_LLM_MAX_RETRIES"
+ENV_CONSOLIDATION_LLM_INITIAL_BACKOFF = "HINDSIGHT_API_CONSOLIDATION_LLM_INITIAL_BACKOFF"
+ENV_CONSOLIDATION_LLM_MAX_BACKOFF = "HINDSIGHT_API_CONSOLIDATION_LLM_MAX_BACKOFF"
+ENV_CONSOLIDATION_LLM_TIMEOUT = "HINDSIGHT_API_CONSOLIDATION_LLM_TIMEOUT"
 
 ENV_EMBEDDINGS_PROVIDER = "HINDSIGHT_API_EMBEDDINGS_PROVIDER"
 ENV_EMBEDDINGS_LOCAL_MODEL = "HINDSIGHT_API_EMBEDDINGS_LOCAL_MODEL"
@@ -90,13 +108,17 @@ ENV_MCP_LOCAL_BANK_ID = "HINDSIGHT_API_MCP_LOCAL_BANK_ID"
 ENV_MCP_INSTRUCTIONS = "HINDSIGHT_API_MCP_INSTRUCTIONS"
 ENV_MENTAL_MODEL_REFRESH_CONCURRENCY = "HINDSIGHT_API_MENTAL_MODEL_REFRESH_CONCURRENCY"
 
+# Vertex AI configuration
+ENV_LLM_VERTEXAI_PROJECT_ID = "HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID"
+ENV_LLM_VERTEXAI_REGION = "HINDSIGHT_API_LLM_VERTEXAI_REGION"
+ENV_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY = "HINDSIGHT_API_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY"
+
 # Retain settings
 ENV_RETAIN_MAX_COMPLETION_TOKENS = "HINDSIGHT_API_RETAIN_MAX_COMPLETION_TOKENS"
 ENV_RETAIN_CHUNK_SIZE = "HINDSIGHT_API_RETAIN_CHUNK_SIZE"
 ENV_RETAIN_EXTRACT_CAUSAL_LINKS = "HINDSIGHT_API_RETAIN_EXTRACT_CAUSAL_LINKS"
 ENV_RETAIN_EXTRACTION_MODE = "HINDSIGHT_API_RETAIN_EXTRACTION_MODE"
 ENV_RETAIN_CUSTOM_INSTRUCTIONS = "HINDSIGHT_API_RETAIN_CUSTOM_INSTRUCTIONS"
-ENV_RETAIN_OBSERVATIONS_ASYNC = "HINDSIGHT_API_RETAIN_OBSERVATIONS_ASYNC"
 
 # Observations settings (consolidated knowledge from facts)
 ENV_ENABLE_OBSERVATIONS = "HINDSIGHT_API_ENABLE_OBSERVATIONS"
|
|
|
121
143
|
ENV_WORKER_ID = "HINDSIGHT_API_WORKER_ID"
|
|
122
144
|
ENV_WORKER_POLL_INTERVAL_MS = "HINDSIGHT_API_WORKER_POLL_INTERVAL_MS"
|
|
123
145
|
ENV_WORKER_MAX_RETRIES = "HINDSIGHT_API_WORKER_MAX_RETRIES"
|
|
124
|
-
ENV_WORKER_BATCH_SIZE = "HINDSIGHT_API_WORKER_BATCH_SIZE"
|
|
125
146
|
ENV_WORKER_HTTP_PORT = "HINDSIGHT_API_WORKER_HTTP_PORT"
|
|
147
|
+
ENV_WORKER_MAX_SLOTS = "HINDSIGHT_API_WORKER_MAX_SLOTS"
|
|
148
|
+
ENV_WORKER_CONSOLIDATION_MAX_SLOTS = "HINDSIGHT_API_WORKER_CONSOLIDATION_MAX_SLOTS"
|
|
126
149
|
|
|
127
150
|
# Reflect agent settings
|
|
128
151
|
ENV_REFLECT_MAX_ITERATIONS = "HINDSIGHT_API_REFLECT_MAX_ITERATIONS"
|
|
@@ -133,8 +156,16 @@ DEFAULT_DATABASE_SCHEMA = "public"
 DEFAULT_LLM_PROVIDER = "openai"
 DEFAULT_LLM_MODEL = "gpt-5-mini"
 DEFAULT_LLM_MAX_CONCURRENT = 32
+DEFAULT_LLM_MAX_RETRIES = 10  # Max retry attempts for LLM API calls
+DEFAULT_LLM_INITIAL_BACKOFF = 1.0  # Initial backoff in seconds for retry exponential backoff
+DEFAULT_LLM_MAX_BACKOFF = 60.0  # Max backoff cap in seconds for retry exponential backoff
 DEFAULT_LLM_TIMEOUT = 120.0  # seconds
 
+# Vertex AI defaults
+DEFAULT_LLM_VERTEXAI_PROJECT_ID = None  # Required for Vertex AI
+DEFAULT_LLM_VERTEXAI_REGION = "us-central1"
+DEFAULT_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY = None  # Optional, uses ADC if not set
+
 DEFAULT_EMBEDDINGS_PROVIDER = "local"
 DEFAULT_EMBEDDINGS_LOCAL_MODEL = "BAAI/bge-small-en-v1.5"
 DEFAULT_EMBEDDINGS_LOCAL_FORCE_CPU = False  # Force CPU mode for local embeddings (avoids MPS/XPC issues on macOS)
|
|
|
179
210
|
DEFAULT_RETAIN_EXTRACTION_MODE = "concise" # Extraction mode: "concise", "verbose", or "custom"
|
|
180
211
|
RETAIN_EXTRACTION_MODES = ("concise", "verbose", "custom") # Allowed extraction modes
|
|
181
212
|
DEFAULT_RETAIN_CUSTOM_INSTRUCTIONS = None # Custom extraction guidelines (only used when mode="custom")
|
|
182
|
-
DEFAULT_RETAIN_OBSERVATIONS_ASYNC = False # Run observation generation async (after retain completes)
|
|
183
213
|
|
|
184
214
|
# Observations defaults (consolidated knowledge from facts)
|
|
185
215
|
DEFAULT_ENABLE_OBSERVATIONS = True # Observations enabled by default
|
|
@@ -200,8 +230,9 @@ DEFAULT_WORKER_ENABLED = True  # API runs worker by default (standalone mode)
 DEFAULT_WORKER_ID = None  # Will use hostname if not specified
 DEFAULT_WORKER_POLL_INTERVAL_MS = 500  # Poll database every 500ms
 DEFAULT_WORKER_MAX_RETRIES = 3  # Max retries before marking task failed
-DEFAULT_WORKER_BATCH_SIZE = 10  # Tasks to claim per poll cycle
 DEFAULT_WORKER_HTTP_PORT = 8889  # HTTP port for worker metrics/health
+DEFAULT_WORKER_MAX_SLOTS = 10  # Total concurrent tasks per worker
+DEFAULT_WORKER_CONSOLIDATION_MAX_SLOTS = 2  # Max concurrent consolidation tasks per worker
 
 # Reflect agent settings
 DEFAULT_REFLECT_MAX_ITERATIONS = 10  # Max tool call iterations before forcing response
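The slot defaults replace the old batch_size knob: rather than claiming 10 tasks per poll cycle, the worker now bounds concurrency, with consolidation capped separately so it cannot starve other task types. A minimal sketch of that slot model using asyncio semaphores (illustrative only; the real logic is in worker/poller.py):

    import asyncio

    total_slots = asyncio.Semaphore(10)          # DEFAULT_WORKER_MAX_SLOTS
    consolidation_slots = asyncio.Semaphore(2)   # DEFAULT_WORKER_CONSOLIDATION_MAX_SLOTS

    async def run_task(task_type: str, coro):
        # Every task takes a general slot; consolidation tasks also need
        # one of the two consolidation slots.
        async with total_slots:
            if task_type == "consolidation":
                async with consolidation_slots:
                    return await coro
            return await coro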
@@ -286,23 +317,46 @@ class HindsightConfig:
     llm_model: str
     llm_base_url: str | None
     llm_max_concurrent: int
+    llm_max_retries: int
+    llm_initial_backoff: float
+    llm_max_backoff: float
     llm_timeout: float
 
+    # Vertex AI configuration
+    llm_vertexai_project_id: str | None
+    llm_vertexai_region: str
+    llm_vertexai_service_account_key: str | None
+
     # Per-operation LLM configuration (None = use default LLM config)
     retain_llm_provider: str | None
     retain_llm_api_key: str | None
     retain_llm_model: str | None
     retain_llm_base_url: str | None
+    retain_llm_max_concurrent: int | None
+    retain_llm_max_retries: int | None
+    retain_llm_initial_backoff: float | None
+    retain_llm_max_backoff: float | None
+    retain_llm_timeout: float | None
 
     reflect_llm_provider: str | None
     reflect_llm_api_key: str | None
     reflect_llm_model: str | None
     reflect_llm_base_url: str | None
+    reflect_llm_max_concurrent: int | None
+    reflect_llm_max_retries: int | None
+    reflect_llm_initial_backoff: float | None
+    reflect_llm_max_backoff: float | None
+    reflect_llm_timeout: float | None
 
     consolidation_llm_provider: str | None
     consolidation_llm_api_key: str | None
     consolidation_llm_model: str | None
     consolidation_llm_base_url: str | None
+    consolidation_llm_max_concurrent: int | None
+    consolidation_llm_max_retries: int | None
+    consolidation_llm_initial_backoff: float | None
+    consolidation_llm_max_backoff: float | None
+    consolidation_llm_timeout: float | None
 
     # Embeddings
     embeddings_provider: str
@@ -343,7 +397,6 @@ class HindsightConfig:
     retain_extract_causal_links: bool
     retain_extraction_mode: str
     retain_custom_instructions: str | None
-    retain_observations_async: bool
 
     # Observations settings (consolidated knowledge from facts)
     enable_observations: bool
@@ -368,8 +421,9 @@ class HindsightConfig:
     worker_id: str | None
     worker_poll_interval_ms: int
     worker_max_retries: int
-    worker_batch_size: int
     worker_http_port: int
+    worker_max_slots: int
+    worker_consolidation_max_slots: int
 
     # Reflect agent settings
     reflect_max_iterations: int
@@ -387,20 +441,71 @@ class HindsightConfig:
             llm_model=os.getenv(ENV_LLM_MODEL, DEFAULT_LLM_MODEL),
             llm_base_url=os.getenv(ENV_LLM_BASE_URL) or None,
             llm_max_concurrent=int(os.getenv(ENV_LLM_MAX_CONCURRENT, str(DEFAULT_LLM_MAX_CONCURRENT))),
+            llm_max_retries=int(os.getenv(ENV_LLM_MAX_RETRIES, str(DEFAULT_LLM_MAX_RETRIES))),
+            llm_initial_backoff=float(os.getenv(ENV_LLM_INITIAL_BACKOFF, str(DEFAULT_LLM_INITIAL_BACKOFF))),
+            llm_max_backoff=float(os.getenv(ENV_LLM_MAX_BACKOFF, str(DEFAULT_LLM_MAX_BACKOFF))),
             llm_timeout=float(os.getenv(ENV_LLM_TIMEOUT, str(DEFAULT_LLM_TIMEOUT))),
+            # Vertex AI
+            llm_vertexai_project_id=os.getenv(ENV_LLM_VERTEXAI_PROJECT_ID) or DEFAULT_LLM_VERTEXAI_PROJECT_ID,
+            llm_vertexai_region=os.getenv(ENV_LLM_VERTEXAI_REGION, DEFAULT_LLM_VERTEXAI_REGION),
+            llm_vertexai_service_account_key=os.getenv(ENV_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY)
+            or DEFAULT_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY,
             # Per-operation LLM config (None = use default)
             retain_llm_provider=os.getenv(ENV_RETAIN_LLM_PROVIDER) or None,
             retain_llm_api_key=os.getenv(ENV_RETAIN_LLM_API_KEY) or None,
             retain_llm_model=os.getenv(ENV_RETAIN_LLM_MODEL) or None,
             retain_llm_base_url=os.getenv(ENV_RETAIN_LLM_BASE_URL) or None,
+            retain_llm_max_concurrent=int(os.getenv(ENV_RETAIN_LLM_MAX_CONCURRENT))
+            if os.getenv(ENV_RETAIN_LLM_MAX_CONCURRENT)
+            else None,
+            retain_llm_max_retries=int(os.getenv(ENV_RETAIN_LLM_MAX_RETRIES))
+            if os.getenv(ENV_RETAIN_LLM_MAX_RETRIES)
+            else None,
+            retain_llm_initial_backoff=float(os.getenv(ENV_RETAIN_LLM_INITIAL_BACKOFF))
+            if os.getenv(ENV_RETAIN_LLM_INITIAL_BACKOFF)
+            else None,
+            retain_llm_max_backoff=float(os.getenv(ENV_RETAIN_LLM_MAX_BACKOFF))
+            if os.getenv(ENV_RETAIN_LLM_MAX_BACKOFF)
+            else None,
+            retain_llm_timeout=float(os.getenv(ENV_RETAIN_LLM_TIMEOUT)) if os.getenv(ENV_RETAIN_LLM_TIMEOUT) else None,
             reflect_llm_provider=os.getenv(ENV_REFLECT_LLM_PROVIDER) or None,
             reflect_llm_api_key=os.getenv(ENV_REFLECT_LLM_API_KEY) or None,
             reflect_llm_model=os.getenv(ENV_REFLECT_LLM_MODEL) or None,
             reflect_llm_base_url=os.getenv(ENV_REFLECT_LLM_BASE_URL) or None,
+            reflect_llm_max_concurrent=int(os.getenv(ENV_REFLECT_LLM_MAX_CONCURRENT))
+            if os.getenv(ENV_REFLECT_LLM_MAX_CONCURRENT)
+            else None,
+            reflect_llm_max_retries=int(os.getenv(ENV_REFLECT_LLM_MAX_RETRIES))
+            if os.getenv(ENV_REFLECT_LLM_MAX_RETRIES)
+            else None,
+            reflect_llm_initial_backoff=float(os.getenv(ENV_REFLECT_LLM_INITIAL_BACKOFF))
+            if os.getenv(ENV_REFLECT_LLM_INITIAL_BACKOFF)
+            else None,
+            reflect_llm_max_backoff=float(os.getenv(ENV_REFLECT_LLM_MAX_BACKOFF))
+            if os.getenv(ENV_REFLECT_LLM_MAX_BACKOFF)
+            else None,
+            reflect_llm_timeout=float(os.getenv(ENV_REFLECT_LLM_TIMEOUT))
+            if os.getenv(ENV_REFLECT_LLM_TIMEOUT)
+            else None,
             consolidation_llm_provider=os.getenv(ENV_CONSOLIDATION_LLM_PROVIDER) or None,
             consolidation_llm_api_key=os.getenv(ENV_CONSOLIDATION_LLM_API_KEY) or None,
             consolidation_llm_model=os.getenv(ENV_CONSOLIDATION_LLM_MODEL) or None,
             consolidation_llm_base_url=os.getenv(ENV_CONSOLIDATION_LLM_BASE_URL) or None,
+            consolidation_llm_max_concurrent=int(os.getenv(ENV_CONSOLIDATION_LLM_MAX_CONCURRENT))
+            if os.getenv(ENV_CONSOLIDATION_LLM_MAX_CONCURRENT)
+            else None,
+            consolidation_llm_max_retries=int(os.getenv(ENV_CONSOLIDATION_LLM_MAX_RETRIES))
+            if os.getenv(ENV_CONSOLIDATION_LLM_MAX_RETRIES)
+            else None,
+            consolidation_llm_initial_backoff=float(os.getenv(ENV_CONSOLIDATION_LLM_INITIAL_BACKOFF))
+            if os.getenv(ENV_CONSOLIDATION_LLM_INITIAL_BACKOFF)
+            else None,
+            consolidation_llm_max_backoff=float(os.getenv(ENV_CONSOLIDATION_LLM_MAX_BACKOFF))
+            if os.getenv(ENV_CONSOLIDATION_LLM_MAX_BACKOFF)
+            else None,
+            consolidation_llm_timeout=float(os.getenv(ENV_CONSOLIDATION_LLM_TIMEOUT))
+            if os.getenv(ENV_CONSOLIDATION_LLM_TIMEOUT)
+            else None,
             # Embeddings
             embeddings_provider=os.getenv(ENV_EMBEDDINGS_PROVIDER, DEFAULT_EMBEDDINGS_PROVIDER),
             embeddings_local_model=os.getenv(ENV_EMBEDDINGS_LOCAL_MODEL, DEFAULT_EMBEDDINGS_LOCAL_MODEL),
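Every optional override above repeats the same `int(os.getenv(X)) if os.getenv(X) else None` shape. The pattern compresses to a pair of small helpers (a sketch, not code from the package):

    import os

    def _opt_int(name: str) -> int | None:
        # None when the variable is unset or empty, else the parsed value.
        raw = os.getenv(name)
        return int(raw) if raw else None

    def _opt_float(name: str) -> float | None:
        raw = os.getenv(name)
        return float(raw) if raw else None

    # e.g. retain_llm_max_retries=_opt_int(ENV_RETAIN_LLM_MAX_RETRIES)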
@@ -460,10 +565,6 @@ class HindsightConfig:
                 os.getenv(ENV_RETAIN_EXTRACTION_MODE, DEFAULT_RETAIN_EXTRACTION_MODE)
             ),
             retain_custom_instructions=os.getenv(ENV_RETAIN_CUSTOM_INSTRUCTIONS) or DEFAULT_RETAIN_CUSTOM_INSTRUCTIONS,
-            retain_observations_async=os.getenv(
-                ENV_RETAIN_OBSERVATIONS_ASYNC, str(DEFAULT_RETAIN_OBSERVATIONS_ASYNC)
-            ).lower()
-            == "true",
             # Observations settings (consolidated knowledge from facts)
             enable_observations=os.getenv(ENV_ENABLE_OBSERVATIONS, str(DEFAULT_ENABLE_OBSERVATIONS)).lower() == "true",
             consolidation_batch_size=int(
@@ -484,8 +585,11 @@ class HindsightConfig:
             worker_id=os.getenv(ENV_WORKER_ID) or DEFAULT_WORKER_ID,
             worker_poll_interval_ms=int(os.getenv(ENV_WORKER_POLL_INTERVAL_MS, str(DEFAULT_WORKER_POLL_INTERVAL_MS))),
             worker_max_retries=int(os.getenv(ENV_WORKER_MAX_RETRIES, str(DEFAULT_WORKER_MAX_RETRIES))),
-            worker_batch_size=int(os.getenv(ENV_WORKER_BATCH_SIZE, str(DEFAULT_WORKER_BATCH_SIZE))),
             worker_http_port=int(os.getenv(ENV_WORKER_HTTP_PORT, str(DEFAULT_WORKER_HTTP_PORT))),
+            worker_max_slots=int(os.getenv(ENV_WORKER_MAX_SLOTS, str(DEFAULT_WORKER_MAX_SLOTS))),
+            worker_consolidation_max_slots=int(
+                os.getenv(ENV_WORKER_CONSOLIDATION_MAX_SLOTS, str(DEFAULT_WORKER_CONSOLIDATION_MAX_SLOTS))
+            ),
             # Reflect agent settings
             reflect_max_iterations=int(os.getenv(ENV_REFLECT_MAX_ITERATIONS, str(DEFAULT_REFLECT_MAX_ITERATIONS))),
         )
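Taken together, the 0.4.3 config surface adds retry/backoff tuning (globally and per operation), Vertex AI settings, and the worker slot limits. A sketch of exercising the new knobs via the environment before the process reads its config:

    import os

    os.environ["HINDSIGHT_API_LLM_MAX_RETRIES"] = "5"
    os.environ["HINDSIGHT_API_LLM_MAX_BACKOFF"] = "30.0"
    os.environ["HINDSIGHT_API_WORKER_MAX_SLOTS"] = "4"
    os.environ["HINDSIGHT_API_WORKER_CONSOLIDATION_MAX_SLOTS"] = "1"
    # Per-operation override: retain gets a tighter retry budget.
    os.environ["HINDSIGHT_API_RETAIN_LLM_MAX_RETRIES"] = "2"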
hindsight_api/daemon.py
CHANGED
@@ -52,7 +52,10 @@ class IdleTimeoutMiddleware:
                 logger.info(f"Idle timeout reached ({self.idle_timeout}s), shutting down daemon")
                 # Give a moment for any in-flight requests
                 await asyncio.sleep(1)
-
+                # Send SIGTERM to ourselves to trigger graceful shutdown
+                import signal
+
+                os.kill(os.getpid(), signal.SIGTERM)
 
 
 class DaemonLock:
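Sending SIGTERM to the daemon's own PID routes shutdown through the server's normal signal handling (lifespan hooks, connection draining) instead of hard-exiting from inside the event loop. The mechanism in isolation (a sketch, not the daemon's actual structure):

    import asyncio
    import os
    import signal

    async def shutdown_after_idle(idle_timeout: float):
        # After the idle window, deliver SIGTERM to ourselves; the running
        # server's SIGTERM handler then performs a graceful shutdown.
        await asyncio.sleep(idle_timeout)
        os.kill(os.getpid(), signal.SIGTERM)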
hindsight_api/engine/consolidation/consolidator.py
CHANGED

@@ -144,10 +144,14 @@ async def run_consolidation_job(
     }
 
     batch_num = 0
+    last_progress_timings = {}  # Track timings at last progress log
     while True:
         batch_num += 1
         batch_start = time.time()
 
+        # Snapshot timings at batch start for per-batch calculation
+        batch_start_timings = perf.timings.copy()
+
         # Fetch next batch of unconsolidated memories
         async with pool.acquire() as conn:
             t0 = time.time()
@@ -217,19 +221,44 @@ async def run_consolidation_job(
                 elif action == "skipped":
                     stats["skipped"] += 1
 
-                # Log progress periodically
+                # Log progress periodically with timing breakdown
                 if stats["memories_processed"] % 10 == 0:
+                    # Calculate timing deltas since last progress log
+                    timing_parts = []
+                    for key in ["recall", "llm", "embedding", "db_write"]:
+                        if key in perf.timings:
+                            delta = perf.timings[key] - last_progress_timings.get(key, 0)
+                            timing_parts.append(f"{key}={delta:.2f}s")
+
+                    timing_str = f" | {', '.join(timing_parts)}" if timing_parts else ""
                     logger.info(
                         f"[CONSOLIDATION] bank={bank_id} progress: "
-                        f"{stats['memories_processed']}/{total_count} memories processed"
+                        f"{stats['memories_processed']}/{total_count} memories processed{timing_str}"
                     )
 
+                    # Update last progress snapshot
+                    last_progress_timings = perf.timings.copy()
+
         batch_time = time.time() - batch_start
         perf.log(
             f"[2] Batch {batch_num}: {len(memories)} memories in {batch_time:.3f}s "
             f"(avg {batch_time / len(memories):.3f}s/memory)"
         )
 
+        # Log timing breakdown after each batch (delta from batch start)
+        timing_parts = []
+        for key in ["recall", "llm", "embedding", "db_write"]:
+            if key in perf.timings:
+                delta = perf.timings[key] - batch_start_timings.get(key, 0)
+                timing_parts.append(f"{key}={delta:.3f}s")
+
+        if timing_parts:
+            avg_per_memory = batch_time / len(memories) if memories else 0
+            logger.info(
+                f"[CONSOLIDATION] bank={bank_id} batch {batch_num}/{len(memories)} memories: "
+                f"{', '.join(timing_parts)} | avg={avg_per_memory:.3f}s/memory"
+            )
+
         # Build summary
         perf.log(
             f"[3] Results: {stats['memories_processed']} memories -> "
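The logging here relies on a snapshot-and-delta pattern over cumulative timing counters: copy the dict at a checkpoint, then report only what the window since that checkpoint added. In isolation:

    timings = {"recall": 0.0, "llm": 0.0}  # cumulative seconds per phase

    snapshot = timings.copy()
    timings["llm"] += 1.25      # ... work accumulates time ...
    timings["recall"] += 0.40

    deltas = {k: timings[k] - snapshot.get(k, 0) for k in timings}
    print(", ".join(f"{k}={v:.2f}s" for k, v in deltas.items()))
    # recall=0.40s, llm=1.25s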
@@ -836,7 +865,14 @@ Focus on DURABLE knowledge that serves this mission, not ephemeral state.
     )
     # Parse JSON response - should be an array
     if isinstance(result, str):
-        result = json.loads(result)
+        # Strip markdown code fences (some models wrap JSON in ```json ... ```)
+        clean = result.strip()
+        if clean.startswith("```"):
+            clean = clean.split("\n", 1)[1] if "\n" in clean else clean[3:]
+        if clean.endswith("```"):
+            clean = clean[:-3]
+        clean = clean.strip()
+        result = json.loads(clean)
     # Ensure result is a list
     if isinstance(result, list):
         return result
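The fence-stripping logic, extracted as a standalone helper for clarity (a sketch mirroring the diff, not the package's actual function):

    def strip_code_fences(text: str) -> str:
        # Drop a leading ``` or ```json line and a trailing ``` so that
        # json.loads sees bare JSON.
        clean = text.strip()
        if clean.startswith("```"):
            clean = clean.split("\n", 1)[1] if "\n" in clean else clean[3:]
        if clean.endswith("```"):
            clean = clean[:-3]
        return clean.strip()

    assert strip_code_fences('```json\n[{"a": 1}]\n```') == '[{"a": 1}]'
    assert strip_code_fences('[{"a": 1}]') == '[{"a": 1}]'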