hindsight-api 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +1 -1
- hindsight_api/admin/cli.py +59 -0
- hindsight_api/alembic/versions/h3c4d5e6f7g8_mental_models_v4.py +112 -0
- hindsight_api/alembic/versions/i4d5e6f7g8h9_delete_opinions.py +41 -0
- hindsight_api/alembic/versions/j5e6f7g8h9i0_mental_model_versions.py +95 -0
- hindsight_api/alembic/versions/k6f7g8h9i0j1_add_directive_subtype.py +58 -0
- hindsight_api/alembic/versions/l7g8h9i0j1k2_add_worker_columns.py +109 -0
- hindsight_api/alembic/versions/m8h9i0j1k2l3_mental_model_id_to_text.py +41 -0
- hindsight_api/alembic/versions/n9i0j1k2l3m4_learnings_and_pinned_reflections.py +134 -0
- hindsight_api/alembic/versions/o0j1k2l3m4n5_migrate_mental_models_data.py +113 -0
- hindsight_api/alembic/versions/p1k2l3m4n5o6_new_knowledge_architecture.py +194 -0
- hindsight_api/alembic/versions/q2l3m4n5o6p7_fix_mental_model_fact_type.py +50 -0
- hindsight_api/alembic/versions/r3m4n5o6p7q8_add_reflect_response_to_reflections.py +47 -0
- hindsight_api/alembic/versions/s4n5o6p7q8r9_add_consolidated_at_to_memory_units.py +53 -0
- hindsight_api/alembic/versions/t5o6p7q8r9s0_rename_mental_models_to_observations.py +134 -0
- hindsight_api/alembic/versions/u6p7q8r9s0t1_mental_models_text_id.py +41 -0
- hindsight_api/alembic/versions/v7q8r9s0t1u2_add_max_tokens_to_mental_models.py +50 -0
- hindsight_api/api/http.py +1120 -93
- hindsight_api/api/mcp.py +11 -191
- hindsight_api/config.py +174 -46
- hindsight_api/engine/consolidation/__init__.py +5 -0
- hindsight_api/engine/consolidation/consolidator.py +926 -0
- hindsight_api/engine/consolidation/prompts.py +77 -0
- hindsight_api/engine/cross_encoder.py +153 -22
- hindsight_api/engine/directives/__init__.py +5 -0
- hindsight_api/engine/directives/models.py +37 -0
- hindsight_api/engine/embeddings.py +136 -13
- hindsight_api/engine/interface.py +32 -13
- hindsight_api/engine/llm_wrapper.py +505 -43
- hindsight_api/engine/memory_engine.py +2101 -1094
- hindsight_api/engine/mental_models/__init__.py +14 -0
- hindsight_api/engine/mental_models/models.py +53 -0
- hindsight_api/engine/reflect/__init__.py +18 -0
- hindsight_api/engine/reflect/agent.py +933 -0
- hindsight_api/engine/reflect/models.py +109 -0
- hindsight_api/engine/reflect/observations.py +186 -0
- hindsight_api/engine/reflect/prompts.py +483 -0
- hindsight_api/engine/reflect/tools.py +437 -0
- hindsight_api/engine/reflect/tools_schema.py +250 -0
- hindsight_api/engine/response_models.py +130 -4
- hindsight_api/engine/retain/bank_utils.py +79 -201
- hindsight_api/engine/retain/fact_extraction.py +81 -48
- hindsight_api/engine/retain/fact_storage.py +5 -8
- hindsight_api/engine/retain/link_utils.py +5 -8
- hindsight_api/engine/retain/orchestrator.py +1 -55
- hindsight_api/engine/retain/types.py +2 -2
- hindsight_api/engine/search/graph_retrieval.py +2 -2
- hindsight_api/engine/search/link_expansion_retrieval.py +164 -29
- hindsight_api/engine/search/mpfp_retrieval.py +1 -1
- hindsight_api/engine/search/retrieval.py +14 -14
- hindsight_api/engine/search/think_utils.py +41 -140
- hindsight_api/engine/search/trace.py +0 -1
- hindsight_api/engine/search/tracer.py +2 -5
- hindsight_api/engine/search/types.py +0 -3
- hindsight_api/engine/task_backend.py +112 -196
- hindsight_api/engine/utils.py +0 -151
- hindsight_api/extensions/__init__.py +10 -1
- hindsight_api/extensions/builtin/tenant.py +11 -4
- hindsight_api/extensions/operation_validator.py +81 -4
- hindsight_api/extensions/tenant.py +26 -0
- hindsight_api/main.py +28 -5
- hindsight_api/mcp_local.py +12 -53
- hindsight_api/mcp_tools.py +494 -0
- hindsight_api/models.py +0 -2
- hindsight_api/worker/__init__.py +11 -0
- hindsight_api/worker/main.py +296 -0
- hindsight_api/worker/poller.py +486 -0
- {hindsight_api-0.3.0.dist-info → hindsight_api-0.4.1.dist-info}/METADATA +12 -6
- hindsight_api-0.4.1.dist-info/RECORD +112 -0
- {hindsight_api-0.3.0.dist-info → hindsight_api-0.4.1.dist-info}/entry_points.txt +1 -0
- hindsight_api/engine/retain/observation_regeneration.py +0 -254
- hindsight_api/engine/search/observation_utils.py +0 -125
- hindsight_api/engine/search/scoring.py +0 -159
- hindsight_api-0.3.0.dist-info/RECORD +0 -82
- {hindsight_api-0.3.0.dist-info → hindsight_api-0.4.1.dist-info}/WHEEL +0 -0
hindsight_api/engine/consolidation/prompts.py

@@ -0,0 +1,77 @@
+"""Prompts for the consolidation engine."""
+
+CONSOLIDATION_SYSTEM_PROMPT = """You are a memory consolidation system. Your job is to convert facts into durable knowledge (observations) and merge with existing knowledge when appropriate.
+
+You must output ONLY valid JSON with no markdown formatting, no code blocks, and no additional text.
+
+## EXTRACT DURABLE KNOWLEDGE, NOT EPHEMERAL STATE
+Facts often describe events or actions. Extract the DURABLE KNOWLEDGE implied by the fact, not the transient state.
+
+Examples of extracting durable knowledge:
+- "User moved to Room 203" -> "Room 203 exists" (location exists, not where user is now)
+- "User visited Acme Corp at Room 105" -> "Acme Corp is located in Room 105"
+- "User took the elevator to floor 3" -> "Floor 3 is accessible by elevator"
+- "User met Sarah at the lobby" -> "Sarah can be found at the lobby"
+
+DO NOT track current user position/state as knowledge - that changes constantly.
+DO track permanent facts learned from the user's actions.
+
+## PRESERVE SPECIFIC DETAILS
+Keep names, locations, numbers, and other specifics. Do NOT:
+- Abstract into general principles
+- Generate business insights
+- Make knowledge generic
+
+GOOD examples:
+- Fact: "John likes pizza" -> "John likes pizza"
+- Fact: "Alice works at Google" -> "Alice works at Google"
+
+BAD examples:
+- "John likes pizza" -> "Understanding dietary preferences helps..." (TOO ABSTRACT)
+- "User is at Room 203" -> "User is currently at Room 203" (EPHEMERAL STATE)
+
+## MERGE RULES (when comparing to existing observations):
+1. REDUNDANT: Same information worded differently → update existing
+2. CONTRADICTION: Opposite information about same topic → update with history (e.g., "used to X, now Y")
+3. UPDATE: New state replacing old state → update with history
+
+## CRITICAL RULES:
+- NEVER merge facts about DIFFERENT people
+- NEVER merge unrelated topics (food preferences vs work vs hobbies)
+- When merging contradictions, capture the CHANGE (before → after)
+- Keep observations focused on ONE specific topic per person
+- The "text" field MUST contain durable knowledge, not ephemeral state
+- Do NOT include "tags" in output - tags are handled automatically"""
+
+CONSOLIDATION_USER_PROMPT = """Analyze this new fact and consolidate into knowledge.
+{mission_section}
+NEW FACT: {fact_text}
+
+EXISTING OBSERVATIONS (JSON array with source memories and dates):
+{observations_text}
+
+Each observation includes:
+- id: unique identifier for updating
+- text: the observation content
+- proof_count: number of supporting memories
+- tags: visibility scope (handled automatically)
+- created_at/updated_at: when observation was created/modified
+- occurred_start/occurred_end: temporal range of source facts
+- source_memories: array of supporting facts with their text and dates
+
+Instructions:
+1. Extract DURABLE KNOWLEDGE from the new fact (not ephemeral state)
+2. Review source_memories in existing observations to understand evidence
+3. Check dates to detect contradictions or updates
+4. Compare with observations:
+   - Same topic → UPDATE with learning_id
+   - New topic → CREATE new observation
+   - Purely ephemeral → return []
+
+Output JSON array of actions:
+[
+  {{"action": "update", "learning_id": "uuid-from-observations", "text": "updated knowledge", "reason": "..."}},
+  {{"action": "create", "text": "new durable knowledge", "reason": "..."}}
+]
+
+Return [] if fact contains no durable knowledge."""
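The two templates above are plain str.format targets. A minimal sketch of how they might be wired up follows; call_llm is a hypothetical stand-in for the package's LLM wrapper, not an actual export.

# Illustrative sketch only (not package code): filling the consolidation prompts
# and parsing the action list returned by the model.
import json

from hindsight_api.engine.consolidation.prompts import (
    CONSOLIDATION_SYSTEM_PROMPT,
    CONSOLIDATION_USER_PROMPT,
)

def consolidate(fact_text: str, observations: list[dict], mission_section: str = "") -> list[dict]:
    user_prompt = CONSOLIDATION_USER_PROMPT.format(
        mission_section=mission_section,
        fact_text=fact_text,
        observations_text=json.dumps(observations, indent=2),
    )
    raw = call_llm(system=CONSOLIDATION_SYSTEM_PROMPT, user=user_prompt)  # hypothetical helper
    # The system prompt demands bare JSON, so the reply should parse directly:
    # [] for purely ephemeral facts, otherwise a list of create/update actions.
    return json.loads(raw)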
hindsight_api/engine/cross_encoder.py

@@ -20,6 +20,7 @@ from ..config import (
     DEFAULT_RERANKER_FLASHRANK_CACHE_DIR,
     DEFAULT_RERANKER_FLASHRANK_MODEL,
     DEFAULT_RERANKER_LITELLM_MODEL,
+    DEFAULT_RERANKER_LOCAL_FORCE_CPU,
     DEFAULT_RERANKER_LOCAL_MAX_CONCURRENT,
     DEFAULT_RERANKER_LOCAL_MODEL,
     DEFAULT_RERANKER_PROVIDER,
@@ -33,6 +34,7 @@ from ..config import (
     ENV_RERANKER_FLASHRANK_CACHE_DIR,
     ENV_RERANKER_FLASHRANK_MODEL,
     ENV_RERANKER_LITELLM_MODEL,
+    ENV_RERANKER_LOCAL_FORCE_CPU,
     ENV_RERANKER_LOCAL_MAX_CONCURRENT,
     ENV_RERANKER_LOCAL_MODEL,
     ENV_RERANKER_PROVIDER,
@@ -99,7 +101,7 @@ class LocalSTCrossEncoder(CrossEncoderModel):
     _executor: ThreadPoolExecutor | None = None
     _max_concurrent: int = 4  # Limit concurrent CPU-bound reranking calls
 
-    def __init__(self, model_name: str | None = None, max_concurrent: int = 4):
+    def __init__(self, model_name: str | None = None, max_concurrent: int = 4, force_cpu: bool = False):
         """
         Initialize local SentenceTransformers cross-encoder.
 
@@ -108,8 +110,11 @@ class LocalSTCrossEncoder(CrossEncoderModel):
                 Default: cross-encoder/ms-marco-MiniLM-L-6-v2
             max_concurrent: Maximum concurrent reranking calls (default: 2).
                 Higher values may cause CPU thrashing under load.
+            force_cpu: Force CPU mode (avoids MPS/XPC issues on macOS in daemon mode).
+                Default: False
         """
         self.model_name = model_name or DEFAULT_RERANKER_LOCAL_MODEL
+        self.force_cpu = force_cpu
         self._model = None
         LocalSTCrossEncoder._max_concurrent = max_concurrent
 
@@ -130,13 +135,38 @@ class LocalSTCrossEncoder(CrossEncoderModel):
                 "Install it with: pip install sentence-transformers"
             )
 
-        # Note: We use CPU even when GPU/MPS is available because:
-        # 1. The reranker model (MiniLM) is tiny (~22M params)
-        # 2. Batch sizes are small (~100-200 pairs)
-        # 3. Data transfer overhead to GPU outweighs compute benefit
-        # 4. CPU inference is actually faster for this workload
         logger.info(f"Reranker: initializing local provider with model {self.model_name}")
-
+
+        # Determine device based on hardware availability.
+        # We always set low_cpu_mem_usage=False to prevent lazy loading (meta tensors)
+        # which can cause issues when accelerate is installed but no GPU is available.
+        # Note: We do NOT use device_map because CrossEncoder internally calls .to(device)
+        # after loading, which conflicts with accelerate's device_map handling.
+        import torch
+
+        # Force CPU mode if configured (used in daemon mode to avoid MPS/XPC issues on macOS)
+        if self.force_cpu:
+            device = "cpu"
+            logger.info("Reranker: forcing CPU mode (HINDSIGHT_API_RERANKER_LOCAL_FORCE_CPU=1)")
+        else:
+            # Check for GPU (CUDA) or Apple Silicon (MPS)
+            # Wrap in try-except to gracefully handle any device detection issues
+            # (e.g., in CI environments or when PyTorch is built without GPU support)
+            device = "cpu"  # Default to CPU
+            try:
+                has_gpu = torch.cuda.is_available() or (
+                    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+                )
+                if has_gpu:
+                    device = None  # Let sentence-transformers auto-detect GPU/MPS
+            except Exception as e:
+                logger.warning(f"Failed to detect GPU/MPS, falling back to CPU: {e}")
+
+        self._model = CrossEncoder(
+            self.model_name,
+            device=device,
+            model_kwargs={"low_cpu_mem_usage": False},
+        )
 
         # Initialize shared executor (limited workers naturally limits concurrency)
         if LocalSTCrossEncoder._executor is None:
@@ -148,11 +178,108 @@ class LocalSTCrossEncoder(CrossEncoderModel):
         else:
             logger.info("Reranker: local provider initialized (using existing executor)")
 
+    def _is_xpc_error(self, error: Exception) -> bool:
+        """
+        Check if an error is an XPC connection error (macOS daemon issue).
+
+        On macOS, long-running daemons can lose XPC connections to system services
+        when the process is idle for extended periods.
+        """
+        error_str = str(error).lower()
+        return "xpc_error_connection_invalid" in error_str or "xpc error" in error_str
+
+    def _reinitialize_model_sync(self) -> None:
+        """
+        Clear and reinitialize the cross-encoder model synchronously.
+
+        This is used to recover from XPC errors on macOS where the
+        PyTorch/MPS backend loses its connection to system services.
+        """
+        logger.warning(f"Reinitializing reranker model {self.model_name} due to backend error")
+
+        # Clear existing model
+        self._model = None
+
+        # Force garbage collection to free resources
+        import gc
+
+        import torch
+
+        gc.collect()
+
+        # If using CUDA/MPS, clear the cache
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            try:
+                torch.mps.empty_cache()
+            except AttributeError:
+                pass  # Method might not exist in all PyTorch versions
+
+        # Reinitialize the model
+        try:
+            from sentence_transformers import CrossEncoder
+        except ImportError:
+            raise ImportError(
+                "sentence-transformers is required for LocalSTCrossEncoder. "
+                "Install it with: pip install sentence-transformers"
+            )
+
+        # Determine device based on hardware availability
+        if self.force_cpu:
+            device = "cpu"
+        else:
+            # Wrap in try-except to gracefully handle any device detection issues
+            device = "cpu"  # Default to CPU
+            try:
+                has_gpu = torch.cuda.is_available() or (
+                    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+                )
+                if has_gpu:
+                    device = None  # Let sentence-transformers auto-detect GPU/MPS
+            except Exception as e:
+                logger.warning(f"Failed to detect GPU/MPS during reinit, falling back to CPU: {e}")
+
+        self._model = CrossEncoder(
+            self.model_name,
+            device=device,
+            model_kwargs={"low_cpu_mem_usage": False},
+        )
+
+        logger.info("Reranker: local provider reinitialized successfully")
+
+    def _predict_with_recovery(self, pairs: list[tuple[str, str]]) -> list[float]:
+        """
+        Predict with automatic recovery from XPC errors.
+
+        This runs synchronously in the thread pool.
+        """
+        max_retries = 1
+        for attempt in range(max_retries + 1):
+            try:
+                scores = self._model.predict(pairs, show_progress_bar=False)
+                return scores.tolist() if hasattr(scores, "tolist") else list(scores)
+            except Exception as e:
+                # Check if this is an XPC error (macOS daemon issue)
+                if self._is_xpc_error(e) and attempt < max_retries:
+                    logger.warning(f"XPC error detected in reranker (attempt {attempt + 1}): {e}")
+                    try:
+                        self._reinitialize_model_sync()
+                        logger.info("Reranker reinitialized successfully, retrying prediction")
+                        continue
+                    except Exception as reinit_error:
+                        logger.error(f"Failed to reinitialize reranker: {reinit_error}")
+                        raise Exception(f"Failed to recover from XPC error: {str(e)}")
+                else:
+                    # Not an XPC error or out of retries
+                    raise
+
     async def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
         """
         Score query-document pairs for relevance.
 
         Uses a dedicated thread pool with limited workers to prevent CPU thrashing.
+        Automatically recovers from XPC errors on macOS by reinitializing the model.
 
         Args:
             pairs: List of (query, document) tuples to score
@@ -165,11 +292,11 @@ class LocalSTCrossEncoder(CrossEncoderModel):
 
         # Use dedicated executor - limited workers naturally limits concurrency
        loop = asyncio.get_event_loop()
-
+        return await loop.run_in_executor(
            LocalSTCrossEncoder._executor,
-
+            self._predict_with_recovery,
+            pairs,
        )
-        return scores.tolist() if hasattr(scores, "tolist") else list(scores)
 
 
 class RemoteTEICrossEncoder(CrossEncoderModel):
@@ -768,29 +895,33 @@ class LiteLLMCrossEncoder(CrossEncoderModel):
 
 def create_cross_encoder_from_env() -> CrossEncoderModel:
     """
-    Create a CrossEncoderModel instance based on
+    Create a CrossEncoderModel instance based on configuration.
 
-
+    Reads configuration via get_config() to ensure consistency across the codebase.
 
     Returns:
         Configured CrossEncoderModel instance
     """
-
+    from ..config import get_config
+
+    config = get_config()
+    provider = config.reranker_provider.lower()
 
     if provider == "tei":
-        url =
+        url = config.reranker_tei_url
         if not url:
             raise ValueError(f"{ENV_RERANKER_TEI_URL} is required when {ENV_RERANKER_PROVIDER} is 'tei'")
-
-
-
+        return RemoteTEICrossEncoder(
+            base_url=url,
+            batch_size=config.reranker_tei_batch_size,
+            max_concurrent=config.reranker_tei_max_concurrent,
+        )
     elif provider == "local":
-
-
-
-
+        return LocalSTCrossEncoder(
+            model_name=config.reranker_local_model,
+            max_concurrent=config.reranker_local_max_concurrent,
+            force_cpu=config.reranker_local_force_cpu,
         )
-        return LocalSTCrossEncoder(model_name=model_name, max_concurrent=max_concurrent)
     elif provider == "cohere":
         api_key = os.environ.get(ENV_COHERE_API_KEY)
         if not api_key:
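Both device-selection branches added above (initial load and XPC-triggered reload) reduce to the same logic, restated here as a standalone sketch; the helper name select_torch_device is illustrative, the actual code inlines it where the model is loaded and reloaded.

# Standalone restatement of the device-selection logic shown in the diff above.
import logging

import torch

logger = logging.getLogger(__name__)

def select_torch_device(force_cpu: bool = False) -> str | None:
    """Return "cpu", or None to let sentence-transformers auto-detect GPU/MPS."""
    if force_cpu:
        return "cpu"
    try:
        has_gpu = torch.cuda.is_available() or (
            hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        )
        return None if has_gpu else "cpu"
    except Exception as e:  # e.g. CI images or CPU-only PyTorch builds
        logger.warning(f"Failed to detect GPU/MPS, falling back to CPU: {e}")
        return "cpu"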
hindsight_api/engine/directives/models.py

@@ -0,0 +1,37 @@
+"""Pydantic models for directives."""
+
+from datetime import datetime, timezone
+from uuid import UUID
+
+from pydantic import BaseModel, Field
+
+
+class Directive(BaseModel):
+    """A directive is a hard rule injected into prompts.
+
+    Directives are user-defined rules that guide agent behavior. Unlike mental models
+    which are automatically consolidated from memories, directives are explicit
+    instructions that are always included in relevant prompts.
+
+    Examples:
+    - "Always respond in formal English"
+    - "Never share personal data with third parties"
+    - "Prefer conservative investment recommendations"
+    """
+
+    id: UUID = Field(description="Unique identifier")
+    bank_id: str = Field(description="Bank this directive belongs to")
+    name: str = Field(description="Human-readable name")
+    content: str = Field(description="The directive text to inject into prompts")
+    priority: int = Field(default=0, description="Higher priority directives are injected first")
+    is_active: bool = Field(default=True, description="Whether this directive is currently active")
+    tags: list[str] = Field(default_factory=list, description="Tags for filtering")
+    created_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc), description="When this directive was created"
+    )
+    updated_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc), description="When this directive was last updated"
+    )
+
+    class Config:
+        from_attributes = True
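A quick construction sketch for the model above; the field values are invented, and model_dump_json assumes Pydantic v2.

# Example values are made up; this only exercises the fields Directive defines.
from uuid import uuid4

from hindsight_api.engine.directives.models import Directive

directive = Directive(
    id=uuid4(),
    bank_id="demo-bank",
    name="formal-tone",
    content="Always respond in formal English",
    priority=10,
    tags=["style"],
)
print(directive.model_dump_json(indent=2))  # created_at/updated_at default to now (UTC)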
hindsight_api/engine/embeddings.py

@@ -18,6 +18,7 @@ import httpx
 from ..config import (
     DEFAULT_EMBEDDINGS_COHERE_MODEL,
     DEFAULT_EMBEDDINGS_LITELLM_MODEL,
+    DEFAULT_EMBEDDINGS_LOCAL_FORCE_CPU,
     DEFAULT_EMBEDDINGS_LOCAL_MODEL,
     DEFAULT_EMBEDDINGS_OPENAI_MODEL,
     DEFAULT_EMBEDDINGS_PROVIDER,
@@ -26,6 +27,7 @@ from ..config import (
     ENV_EMBEDDINGS_COHERE_BASE_URL,
     ENV_EMBEDDINGS_COHERE_MODEL,
     ENV_EMBEDDINGS_LITELLM_MODEL,
+    ENV_EMBEDDINGS_LOCAL_FORCE_CPU,
     ENV_EMBEDDINGS_LOCAL_MODEL,
     ENV_EMBEDDINGS_OPENAI_API_KEY,
     ENV_EMBEDDINGS_OPENAI_BASE_URL,
@@ -92,15 +94,18 @@ class LocalSTEmbeddings(Embeddings):
     The embedding dimension is auto-detected from the model.
     """
 
-    def __init__(self, model_name: str | None = None):
+    def __init__(self, model_name: str | None = None, force_cpu: bool = False):
         """
         Initialize local SentenceTransformers embeddings.
 
         Args:
             model_name: Name of the SentenceTransformer model to use.
                 Default: BAAI/bge-small-en-v1.5
+            force_cpu: Force CPU mode (avoids MPS/XPC issues on macOS in daemon mode).
+                Default: False
         """
         self.model_name = model_name or DEFAULT_EMBEDDINGS_LOCAL_MODEL
+        self.force_cpu = force_cpu
         self._model = None
         self._dimension: int | None = None
 
@@ -128,20 +133,115 @@ class LocalSTEmbeddings(Embeddings):
             )
 
         logger.info(f"Embeddings: initializing local provider with model {self.model_name}")
-
-        #
+
+        # Determine device based on hardware availability.
+        # We always set low_cpu_mem_usage=False to prevent lazy loading (meta tensors)
+        # which can cause issues when accelerate is installed but no GPU is available.
+        import torch
+
+        # Force CPU mode if configured (used in daemon mode to avoid MPS/XPC issues on macOS)
+        if self.force_cpu:
+            device = "cpu"
+            logger.info("Embeddings: forcing CPU mode")
+        else:
+            # Check for GPU (CUDA) or Apple Silicon (MPS)
+            # Wrap in try-except to gracefully handle any device detection issues
+            # (e.g., in CI environments or when PyTorch is built without GPU support)
+            device = "cpu"  # Default to CPU
+            try:
+                has_gpu = torch.cuda.is_available() or (
+                    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+                )
+                if has_gpu:
+                    device = None  # Let sentence-transformers auto-detect GPU/MPS
+            except Exception as e:
+                logger.warning(f"Failed to detect GPU/MPS, falling back to CPU: {e}")
+
         self._model = SentenceTransformer(
             self.model_name,
-
+            device=device,
+            model_kwargs={"low_cpu_mem_usage": False},
         )
 
         self._dimension = self._model.get_sentence_embedding_dimension()
         logger.info(f"Embeddings: local provider initialized (dim: {self._dimension})")
 
+    def _is_xpc_error(self, error: Exception) -> bool:
+        """
+        Check if an error is an XPC connection error (macOS daemon issue).
+
+        On macOS, long-running daemons can lose XPC connections to system services
+        when the process is idle for extended periods.
+        """
+        error_str = str(error).lower()
+        return "xpc_error_connection_invalid" in error_str or "xpc error" in error_str
+
+    def _reinitialize_model_sync(self) -> None:
+        """
+        Clear and reinitialize the embedding model synchronously.
+
+        This is used to recover from XPC errors on macOS where the
+        PyTorch/MPS backend loses its connection to system services.
+        """
+        logger.warning(f"Reinitializing embedding model {self.model_name} due to backend error")
+
+        # Clear existing model
+        self._model = None
+
+        # Force garbage collection to free resources
+        import gc
+
+        import torch
+
+        gc.collect()
+
+        # If using CUDA/MPS, clear the cache
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            try:
+                torch.mps.empty_cache()
+            except AttributeError:
+                pass  # Method might not exist in all PyTorch versions
+
+        # Reinitialize the model (inline version of initialize() but synchronous)
+        try:
+            from sentence_transformers import SentenceTransformer
+        except ImportError:
+            raise ImportError(
+                "sentence-transformers is required for LocalSTEmbeddings. "
+                "Install it with: pip install sentence-transformers"
+            )
+
+        # Determine device based on hardware availability
+        if self.force_cpu:
+            device = "cpu"
+        else:
+            # Wrap in try-except to gracefully handle any device detection issues
+            device = "cpu"  # Default to CPU
+            try:
+                has_gpu = torch.cuda.is_available() or (
+                    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+                )
+                if has_gpu:
+                    device = None  # Let sentence-transformers auto-detect GPU/MPS
+            except Exception as e:
+                logger.warning(f"Failed to detect GPU/MPS during reinit, falling back to CPU: {e}")
+
+        self._model = SentenceTransformer(
+            self.model_name,
+            device=device,
+            model_kwargs={"low_cpu_mem_usage": False},
+        )
+
+        logger.info("Embeddings: local provider reinitialized successfully")
+
     def encode(self, texts: list[str]) -> list[list[float]]:
         """
         Generate embeddings for a list of texts.
 
+        Automatically recovers from XPC errors on macOS by reinitializing the model.
+
         Args:
             texts: List of text strings to encode
 
@@ -150,8 +250,27 @@ class LocalSTEmbeddings(Embeddings):
         """
         if self._model is None:
             raise RuntimeError("Embeddings not initialized. Call initialize() first.")
-
-
+
+        # Try encoding with automatic recovery from XPC errors
+        max_retries = 1
+        for attempt in range(max_retries + 1):
+            try:
+                embeddings = self._model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+                return [emb.tolist() for emb in embeddings]
+            except Exception as e:
+                # Check if this is an XPC error (macOS daemon issue)
+                if self._is_xpc_error(e) and attempt < max_retries:
+                    logger.warning(f"XPC error detected in embedding generation (attempt {attempt + 1}): {e}")
+                    try:
+                        self._reinitialize_model_sync()
+                        logger.info("Model reinitialized successfully, retrying embedding generation")
+                        continue
+                    except Exception as reinit_error:
+                        logger.error(f"Failed to reinitialize model: {reinit_error}")
+                        raise Exception(f"Failed to recover from XPC error: {str(e)}")
+                else:
+                    # Not an XPC error or out of retries
+                    raise
 
 
 class RemoteTEIEmbeddings(Embeddings):
@@ -673,24 +792,28 @@ class LiteLLMEmbeddings(Embeddings):
 
 def create_embeddings_from_env() -> Embeddings:
     """
-    Create an Embeddings instance based on
+    Create an Embeddings instance based on configuration.
 
-
+    Reads configuration via get_config() to ensure consistency across the codebase.
 
     Returns:
         Configured Embeddings instance
     """
-
+    from ..config import get_config
+
+    config = get_config()
+    provider = config.embeddings_provider.lower()
 
     if provider == "tei":
-        url =
+        url = config.embeddings_tei_url
         if not url:
             raise ValueError(f"{ENV_EMBEDDINGS_TEI_URL} is required when {ENV_EMBEDDINGS_PROVIDER} is 'tei'")
         return RemoteTEIEmbeddings(base_url=url)
     elif provider == "local":
-
-
-
+        return LocalSTEmbeddings(
+            model_name=config.embeddings_local_model,
+            force_cpu=config.embeddings_local_force_cpu,
+        )
    elif provider == "openai":
        # Use dedicated embeddings API key, or fall back to LLM API key
        api_key = os.environ.get(ENV_EMBEDDINGS_OPENAI_API_KEY) or os.environ.get(ENV_LLM_API_KEY)