hindsight-api 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. hindsight_api/__init__.py +1 -1
  2. hindsight_api/admin/cli.py +59 -0
  3. hindsight_api/alembic/versions/h3c4d5e6f7g8_mental_models_v4.py +112 -0
  4. hindsight_api/alembic/versions/i4d5e6f7g8h9_delete_opinions.py +41 -0
  5. hindsight_api/alembic/versions/j5e6f7g8h9i0_mental_model_versions.py +95 -0
  6. hindsight_api/alembic/versions/k6f7g8h9i0j1_add_directive_subtype.py +58 -0
  7. hindsight_api/alembic/versions/l7g8h9i0j1k2_add_worker_columns.py +109 -0
  8. hindsight_api/alembic/versions/m8h9i0j1k2l3_mental_model_id_to_text.py +41 -0
  9. hindsight_api/alembic/versions/n9i0j1k2l3m4_learnings_and_pinned_reflections.py +134 -0
  10. hindsight_api/alembic/versions/o0j1k2l3m4n5_migrate_mental_models_data.py +113 -0
  11. hindsight_api/alembic/versions/p1k2l3m4n5o6_new_knowledge_architecture.py +194 -0
  12. hindsight_api/alembic/versions/q2l3m4n5o6p7_fix_mental_model_fact_type.py +50 -0
  13. hindsight_api/alembic/versions/r3m4n5o6p7q8_add_reflect_response_to_reflections.py +47 -0
  14. hindsight_api/alembic/versions/s4n5o6p7q8r9_add_consolidated_at_to_memory_units.py +53 -0
  15. hindsight_api/alembic/versions/t5o6p7q8r9s0_rename_mental_models_to_observations.py +134 -0
  16. hindsight_api/alembic/versions/u6p7q8r9s0t1_mental_models_text_id.py +41 -0
  17. hindsight_api/alembic/versions/v7q8r9s0t1u2_add_max_tokens_to_mental_models.py +50 -0
  18. hindsight_api/api/http.py +1120 -93
  19. hindsight_api/api/mcp.py +11 -191
  20. hindsight_api/config.py +174 -46
  21. hindsight_api/engine/consolidation/__init__.py +5 -0
  22. hindsight_api/engine/consolidation/consolidator.py +926 -0
  23. hindsight_api/engine/consolidation/prompts.py +77 -0
  24. hindsight_api/engine/cross_encoder.py +153 -22
  25. hindsight_api/engine/directives/__init__.py +5 -0
  26. hindsight_api/engine/directives/models.py +37 -0
  27. hindsight_api/engine/embeddings.py +136 -13
  28. hindsight_api/engine/interface.py +32 -13
  29. hindsight_api/engine/llm_wrapper.py +505 -43
  30. hindsight_api/engine/memory_engine.py +2101 -1094
  31. hindsight_api/engine/mental_models/__init__.py +14 -0
  32. hindsight_api/engine/mental_models/models.py +53 -0
  33. hindsight_api/engine/reflect/__init__.py +18 -0
  34. hindsight_api/engine/reflect/agent.py +933 -0
  35. hindsight_api/engine/reflect/models.py +109 -0
  36. hindsight_api/engine/reflect/observations.py +186 -0
  37. hindsight_api/engine/reflect/prompts.py +483 -0
  38. hindsight_api/engine/reflect/tools.py +437 -0
  39. hindsight_api/engine/reflect/tools_schema.py +250 -0
  40. hindsight_api/engine/response_models.py +130 -4
  41. hindsight_api/engine/retain/bank_utils.py +79 -201
  42. hindsight_api/engine/retain/fact_extraction.py +81 -48
  43. hindsight_api/engine/retain/fact_storage.py +5 -8
  44. hindsight_api/engine/retain/link_utils.py +5 -8
  45. hindsight_api/engine/retain/orchestrator.py +1 -55
  46. hindsight_api/engine/retain/types.py +2 -2
  47. hindsight_api/engine/search/graph_retrieval.py +2 -2
  48. hindsight_api/engine/search/link_expansion_retrieval.py +164 -29
  49. hindsight_api/engine/search/mpfp_retrieval.py +1 -1
  50. hindsight_api/engine/search/retrieval.py +14 -14
  51. hindsight_api/engine/search/think_utils.py +41 -140
  52. hindsight_api/engine/search/trace.py +0 -1
  53. hindsight_api/engine/search/tracer.py +2 -5
  54. hindsight_api/engine/search/types.py +0 -3
  55. hindsight_api/engine/task_backend.py +112 -196
  56. hindsight_api/engine/utils.py +0 -151
  57. hindsight_api/extensions/__init__.py +10 -1
  58. hindsight_api/extensions/builtin/tenant.py +11 -4
  59. hindsight_api/extensions/operation_validator.py +81 -4
  60. hindsight_api/extensions/tenant.py +26 -0
  61. hindsight_api/main.py +28 -5
  62. hindsight_api/mcp_local.py +12 -53
  63. hindsight_api/mcp_tools.py +494 -0
  64. hindsight_api/models.py +0 -2
  65. hindsight_api/worker/__init__.py +11 -0
  66. hindsight_api/worker/main.py +296 -0
  67. hindsight_api/worker/poller.py +486 -0
  68. {hindsight_api-0.3.0.dist-info → hindsight_api-0.4.1.dist-info}/METADATA +12 -6
  69. hindsight_api-0.4.1.dist-info/RECORD +112 -0
  70. {hindsight_api-0.3.0.dist-info → hindsight_api-0.4.1.dist-info}/entry_points.txt +1 -0
  71. hindsight_api/engine/retain/observation_regeneration.py +0 -254
  72. hindsight_api/engine/search/observation_utils.py +0 -125
  73. hindsight_api/engine/search/scoring.py +0 -159
  74. hindsight_api-0.3.0.dist-info/RECORD +0 -82
  75. {hindsight_api-0.3.0.dist-info → hindsight_api-0.4.1.dist-info}/WHEEL +0 -0

hindsight_api/engine/consolidation/prompts.py (new file)
@@ -0,0 +1,77 @@
+"""Prompts for the consolidation engine."""
+
+CONSOLIDATION_SYSTEM_PROMPT = """You are a memory consolidation system. Your job is to convert facts into durable knowledge (observations) and merge with existing knowledge when appropriate.
+
+You must output ONLY valid JSON with no markdown formatting, no code blocks, and no additional text.
+
+## EXTRACT DURABLE KNOWLEDGE, NOT EPHEMERAL STATE
+Facts often describe events or actions. Extract the DURABLE KNOWLEDGE implied by the fact, not the transient state.
+
+Examples of extracting durable knowledge:
+- "User moved to Room 203" -> "Room 203 exists" (location exists, not where user is now)
+- "User visited Acme Corp at Room 105" -> "Acme Corp is located in Room 105"
+- "User took the elevator to floor 3" -> "Floor 3 is accessible by elevator"
+- "User met Sarah at the lobby" -> "Sarah can be found at the lobby"
+
+DO NOT track current user position/state as knowledge - that changes constantly.
+DO track permanent facts learned from the user's actions.
+
+## PRESERVE SPECIFIC DETAILS
+Keep names, locations, numbers, and other specifics. Do NOT:
+- Abstract into general principles
+- Generate business insights
+- Make knowledge generic
+
+GOOD examples:
+- Fact: "John likes pizza" -> "John likes pizza"
+- Fact: "Alice works at Google" -> "Alice works at Google"
+
+BAD examples:
+- "John likes pizza" -> "Understanding dietary preferences helps..." (TOO ABSTRACT)
+- "User is at Room 203" -> "User is currently at Room 203" (EPHEMERAL STATE)
+
+## MERGE RULES (when comparing to existing observations):
+1. REDUNDANT: Same information worded differently → update existing
+2. CONTRADICTION: Opposite information about same topic → update with history (e.g., "used to X, now Y")
+3. UPDATE: New state replacing old state → update with history
+
+## CRITICAL RULES:
+- NEVER merge facts about DIFFERENT people
+- NEVER merge unrelated topics (food preferences vs work vs hobbies)
+- When merging contradictions, capture the CHANGE (before → after)
+- Keep observations focused on ONE specific topic per person
+- The "text" field MUST contain durable knowledge, not ephemeral state
+- Do NOT include "tags" in output - tags are handled automatically"""
+
+CONSOLIDATION_USER_PROMPT = """Analyze this new fact and consolidate into knowledge.
+{mission_section}
+NEW FACT: {fact_text}
+
+EXISTING OBSERVATIONS (JSON array with source memories and dates):
+{observations_text}
+
+Each observation includes:
+- id: unique identifier for updating
+- text: the observation content
+- proof_count: number of supporting memories
+- tags: visibility scope (handled automatically)
+- created_at/updated_at: when observation was created/modified
+- occurred_start/occurred_end: temporal range of source facts
+- source_memories: array of supporting facts with their text and dates
+
+Instructions:
+1. Extract DURABLE KNOWLEDGE from the new fact (not ephemeral state)
+2. Review source_memories in existing observations to understand evidence
+3. Check dates to detect contradictions or updates
+4. Compare with observations:
+   - Same topic → UPDATE with learning_id
+   - New topic → CREATE new observation
+   - Purely ephemeral → return []
+
+Output JSON array of actions:
+[
+{{"action": "update", "learning_id": "uuid-from-observations", "text": "updated knowledge", "reason": "..."}},
+{{"action": "create", "text": "new durable knowledge", "reason": "..."}}
+]
+
+Return [] if fact contains no durable knowledge."""
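
Both templates are plain str.format strings: {mission_section}, {fact_text}, and {observations_text} are filled in by the consolidator, and the doubled braces in the output example escape literal JSON braces. A minimal rendering and parsing sketch follows; the consolidator itself is not shown in this diff, so everything outside the two template names is illustrative only:

    import json

    from hindsight_api.engine.consolidation.prompts import (
        CONSOLIDATION_SYSTEM_PROMPT,
        CONSOLIDATION_USER_PROMPT,
    )

    # Hypothetical inputs; only the placeholder names come from the template above.
    user_prompt = CONSOLIDATION_USER_PROMPT.format(
        mission_section="",
        fact_text="Alice works at Google",
        observations_text=json.dumps([]),  # no existing observations yet
    )

    # Typical chat-style pairing of the two prompts (illustrative).
    messages = [
        {"role": "system", "content": CONSOLIDATION_SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]

    # llm_response stands in for whatever the consolidation LLM returns.
    llm_response = '[{"action": "create", "text": "Alice works at Google", "reason": "new durable fact"}]'
    for action in json.loads(llm_response):
        if action["action"] == "create":
            ...  # store a new observation
        elif action["action"] == "update":
            ...  # update the observation identified by action["learning_id"]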

hindsight_api/engine/cross_encoder.py
@@ -20,6 +20,7 @@ from ..config import (
     DEFAULT_RERANKER_FLASHRANK_CACHE_DIR,
     DEFAULT_RERANKER_FLASHRANK_MODEL,
     DEFAULT_RERANKER_LITELLM_MODEL,
+    DEFAULT_RERANKER_LOCAL_FORCE_CPU,
     DEFAULT_RERANKER_LOCAL_MAX_CONCURRENT,
     DEFAULT_RERANKER_LOCAL_MODEL,
     DEFAULT_RERANKER_PROVIDER,
@@ -33,6 +34,7 @@ from ..config import (
     ENV_RERANKER_FLASHRANK_CACHE_DIR,
     ENV_RERANKER_FLASHRANK_MODEL,
     ENV_RERANKER_LITELLM_MODEL,
+    ENV_RERANKER_LOCAL_FORCE_CPU,
     ENV_RERANKER_LOCAL_MAX_CONCURRENT,
     ENV_RERANKER_LOCAL_MODEL,
     ENV_RERANKER_PROVIDER,
@@ -99,7 +101,7 @@ class LocalSTCrossEncoder(CrossEncoderModel):
     _executor: ThreadPoolExecutor | None = None
     _max_concurrent: int = 4  # Limit concurrent CPU-bound reranking calls
 
-    def __init__(self, model_name: str | None = None, max_concurrent: int = 4):
+    def __init__(self, model_name: str | None = None, max_concurrent: int = 4, force_cpu: bool = False):
         """
         Initialize local SentenceTransformers cross-encoder.
 
@@ -108,8 +110,11 @@ class LocalSTCrossEncoder(CrossEncoderModel):
                 Default: cross-encoder/ms-marco-MiniLM-L-6-v2
             max_concurrent: Maximum concurrent reranking calls (default: 2).
                 Higher values may cause CPU thrashing under load.
+            force_cpu: Force CPU mode (avoids MPS/XPC issues on macOS in daemon mode).
+                Default: False
         """
         self.model_name = model_name or DEFAULT_RERANKER_LOCAL_MODEL
+        self.force_cpu = force_cpu
         self._model = None
         LocalSTCrossEncoder._max_concurrent = max_concurrent
 
@@ -130,13 +135,38 @@ class LocalSTCrossEncoder(CrossEncoderModel):
                 "Install it with: pip install sentence-transformers"
             )
 
-        # Note: We use CPU even when GPU/MPS is available because:
-        # 1. The reranker model (MiniLM) is tiny (~22M params)
-        # 2. Batch sizes are small (~100-200 pairs)
-        # 3. Data transfer overhead to GPU outweighs compute benefit
-        # 4. CPU inference is actually faster for this workload
         logger.info(f"Reranker: initializing local provider with model {self.model_name}")
-        self._model = CrossEncoder(self.model_name)
+
+        # Determine device based on hardware availability.
+        # We always set low_cpu_mem_usage=False to prevent lazy loading (meta tensors)
+        # which can cause issues when accelerate is installed but no GPU is available.
+        # Note: We do NOT use device_map because CrossEncoder internally calls .to(device)
+        # after loading, which conflicts with accelerate's device_map handling.
+        import torch
+
+        # Force CPU mode if configured (used in daemon mode to avoid MPS/XPC issues on macOS)
+        if self.force_cpu:
+            device = "cpu"
+            logger.info("Reranker: forcing CPU mode (HINDSIGHT_API_RERANKER_LOCAL_FORCE_CPU=1)")
+        else:
+            # Check for GPU (CUDA) or Apple Silicon (MPS)
+            # Wrap in try-except to gracefully handle any device detection issues
+            # (e.g., in CI environments or when PyTorch is built without GPU support)
+            device = "cpu"  # Default to CPU
+            try:
+                has_gpu = torch.cuda.is_available() or (
+                    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+                )
+                if has_gpu:
+                    device = None  # Let sentence-transformers auto-detect GPU/MPS
+            except Exception as e:
+                logger.warning(f"Failed to detect GPU/MPS, falling back to CPU: {e}")
+
+        self._model = CrossEncoder(
+            self.model_name,
+            device=device,
+            model_kwargs={"low_cpu_mem_usage": False},
+        )
 
         # Initialize shared executor (limited workers naturally limits concurrency)
         if LocalSTCrossEncoder._executor is None:
@@ -148,11 +178,108 @@ class LocalSTCrossEncoder(CrossEncoderModel):
         else:
             logger.info("Reranker: local provider initialized (using existing executor)")
 
+    def _is_xpc_error(self, error: Exception) -> bool:
+        """
+        Check if an error is an XPC connection error (macOS daemon issue).
+
+        On macOS, long-running daemons can lose XPC connections to system services
+        when the process is idle for extended periods.
+        """
+        error_str = str(error).lower()
+        return "xpc_error_connection_invalid" in error_str or "xpc error" in error_str
+
+    def _reinitialize_model_sync(self) -> None:
+        """
+        Clear and reinitialize the cross-encoder model synchronously.
+
+        This is used to recover from XPC errors on macOS where the
+        PyTorch/MPS backend loses its connection to system services.
+        """
+        logger.warning(f"Reinitializing reranker model {self.model_name} due to backend error")
+
+        # Clear existing model
+        self._model = None
+
+        # Force garbage collection to free resources
+        import gc
+
+        import torch
+
+        gc.collect()
+
+        # If using CUDA/MPS, clear the cache
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            try:
+                torch.mps.empty_cache()
+            except AttributeError:
+                pass  # Method might not exist in all PyTorch versions
+
+        # Reinitialize the model
+        try:
+            from sentence_transformers import CrossEncoder
+        except ImportError:
+            raise ImportError(
+                "sentence-transformers is required for LocalSTCrossEncoder. "
+                "Install it with: pip install sentence-transformers"
+            )
+
+        # Determine device based on hardware availability
+        if self.force_cpu:
+            device = "cpu"
+        else:
+            # Wrap in try-except to gracefully handle any device detection issues
+            device = "cpu"  # Default to CPU
+            try:
+                has_gpu = torch.cuda.is_available() or (
+                    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+                )
+                if has_gpu:
+                    device = None  # Let sentence-transformers auto-detect GPU/MPS
+            except Exception as e:
+                logger.warning(f"Failed to detect GPU/MPS during reinit, falling back to CPU: {e}")
+
+        self._model = CrossEncoder(
+            self.model_name,
+            device=device,
+            model_kwargs={"low_cpu_mem_usage": False},
+        )
+
+        logger.info("Reranker: local provider reinitialized successfully")
+
+    def _predict_with_recovery(self, pairs: list[tuple[str, str]]) -> list[float]:
+        """
+        Predict with automatic recovery from XPC errors.
+
+        This runs synchronously in the thread pool.
+        """
+        max_retries = 1
+        for attempt in range(max_retries + 1):
+            try:
+                scores = self._model.predict(pairs, show_progress_bar=False)
+                return scores.tolist() if hasattr(scores, "tolist") else list(scores)
+            except Exception as e:
+                # Check if this is an XPC error (macOS daemon issue)
+                if self._is_xpc_error(e) and attempt < max_retries:
+                    logger.warning(f"XPC error detected in reranker (attempt {attempt + 1}): {e}")
+                    try:
+                        self._reinitialize_model_sync()
+                        logger.info("Reranker reinitialized successfully, retrying prediction")
+                        continue
+                    except Exception as reinit_error:
+                        logger.error(f"Failed to reinitialize reranker: {reinit_error}")
+                        raise Exception(f"Failed to recover from XPC error: {str(e)}")
+                else:
+                    # Not an XPC error or out of retries
+                    raise
+
     async def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
         """
         Score query-document pairs for relevance.
 
         Uses a dedicated thread pool with limited workers to prevent CPU thrashing.
+        Automatically recovers from XPC errors on macOS by reinitializing the model.
 
         Args:
             pairs: List of (query, document) tuples to score
@@ -165,11 +292,11 @@ class LocalSTCrossEncoder(CrossEncoderModel):
 
         # Use dedicated executor - limited workers naturally limits concurrency
         loop = asyncio.get_event_loop()
-        scores = await loop.run_in_executor(
+        return await loop.run_in_executor(
             LocalSTCrossEncoder._executor,
-            lambda: self._model.predict(pairs, show_progress_bar=False),
+            self._predict_with_recovery,
+            pairs,
         )
-        return scores.tolist() if hasattr(scores, "tolist") else list(scores)
 
 
 class RemoteTEICrossEncoder(CrossEncoderModel):
@@ -768,29 +895,33 @@ class LiteLLMCrossEncoder(CrossEncoderModel):
 
 def create_cross_encoder_from_env() -> CrossEncoderModel:
     """
-    Create a CrossEncoderModel instance based on environment variables.
+    Create a CrossEncoderModel instance based on configuration.
 
-    See hindsight_api.config for environment variable names and defaults.
+    Reads configuration via get_config() to ensure consistency across the codebase.
 
     Returns:
         Configured CrossEncoderModel instance
     """
-    provider = os.environ.get(ENV_RERANKER_PROVIDER, DEFAULT_RERANKER_PROVIDER).lower()
+    from ..config import get_config
+
+    config = get_config()
+    provider = config.reranker_provider.lower()
 
     if provider == "tei":
-        url = os.environ.get(ENV_RERANKER_TEI_URL)
+        url = config.reranker_tei_url
         if not url:
             raise ValueError(f"{ENV_RERANKER_TEI_URL} is required when {ENV_RERANKER_PROVIDER} is 'tei'")
-        batch_size = int(os.environ.get(ENV_RERANKER_TEI_BATCH_SIZE, str(DEFAULT_RERANKER_TEI_BATCH_SIZE)))
-        max_concurrent = int(os.environ.get(ENV_RERANKER_TEI_MAX_CONCURRENT, str(DEFAULT_RERANKER_TEI_MAX_CONCURRENT)))
-        return RemoteTEICrossEncoder(base_url=url, batch_size=batch_size, max_concurrent=max_concurrent)
+        return RemoteTEICrossEncoder(
+            base_url=url,
+            batch_size=config.reranker_tei_batch_size,
+            max_concurrent=config.reranker_tei_max_concurrent,
+        )
     elif provider == "local":
-        model = os.environ.get(ENV_RERANKER_LOCAL_MODEL)
-        model_name = model or DEFAULT_RERANKER_LOCAL_MODEL
-        max_concurrent = int(
-            os.environ.get(ENV_RERANKER_LOCAL_MAX_CONCURRENT, str(DEFAULT_RERANKER_LOCAL_MAX_CONCURRENT))
+        return LocalSTCrossEncoder(
+            model_name=config.reranker_local_model,
+            max_concurrent=config.reranker_local_max_concurrent,
+            force_cpu=config.reranker_local_force_cpu,
         )
-        return LocalSTCrossEncoder(model_name=model_name, max_concurrent=max_concurrent)
     elif provider == "cohere":
         api_key = os.environ.get(ENV_COHERE_API_KEY)
        if not api_key:
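
Since the factory now goes through get_config(), the new CPU override can be driven from the environment before the engine starts. A rough sketch; the FORCE_CPU variable name is taken from the log message in the hunk above, while the PROVIDER variable name is an assumption based on the same HINDSIGHT_API_ prefix:

    import os

    # Assumed variable names (see note above); set before the engine initializes.
    os.environ["HINDSIGHT_API_RERANKER_PROVIDER"] = "local"
    os.environ["HINDSIGHT_API_RERANKER_LOCAL_FORCE_CPU"] = "1"

    from hindsight_api.engine.cross_encoder import create_cross_encoder_from_env

    # Per the diff above, this yields LocalSTCrossEncoder(force_cpu=True).
    reranker = create_cross_encoder_from_env()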

hindsight_api/engine/directives/__init__.py (new file)
@@ -0,0 +1,5 @@
+"""Directives module for hard rules injected into prompts."""
+
+from .models import Directive
+
+__all__ = ["Directive"]

hindsight_api/engine/directives/models.py (new file)
@@ -0,0 +1,37 @@
+"""Pydantic models for directives."""
+
+from datetime import datetime, timezone
+from uuid import UUID
+
+from pydantic import BaseModel, Field
+
+
+class Directive(BaseModel):
+    """A directive is a hard rule injected into prompts.
+
+    Directives are user-defined rules that guide agent behavior. Unlike mental models
+    which are automatically consolidated from memories, directives are explicit
+    instructions that are always included in relevant prompts.
+
+    Examples:
+        - "Always respond in formal English"
+        - "Never share personal data with third parties"
+        - "Prefer conservative investment recommendations"
+    """
+
+    id: UUID = Field(description="Unique identifier")
+    bank_id: str = Field(description="Bank this directive belongs to")
+    name: str = Field(description="Human-readable name")
+    content: str = Field(description="The directive text to inject into prompts")
+    priority: int = Field(default=0, description="Higher priority directives are injected first")
+    is_active: bool = Field(default=True, description="Whether this directive is currently active")
+    tags: list[str] = Field(default_factory=list, description="Tags for filtering")
+    created_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc), description="When this directive was created"
+    )
+    updated_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc), description="When this directive was last updated"
+    )
+
+    class Config:
+        from_attributes = True
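
Only id, bank_id, name, and content are required; priority, is_active, tags, and the timestamps default as shown above. A small construction sketch with made-up values:

    from uuid import uuid4

    from hindsight_api.engine.directives import Directive

    directive = Directive(
        id=uuid4(),
        bank_id="demo-bank",  # hypothetical bank identifier
        name="formal-tone",
        content="Always respond in formal English",
        priority=10,  # higher-priority directives are injected first
    )
    print(directive)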

hindsight_api/engine/embeddings.py
@@ -18,6 +18,7 @@ import httpx
 from ..config import (
     DEFAULT_EMBEDDINGS_COHERE_MODEL,
     DEFAULT_EMBEDDINGS_LITELLM_MODEL,
+    DEFAULT_EMBEDDINGS_LOCAL_FORCE_CPU,
     DEFAULT_EMBEDDINGS_LOCAL_MODEL,
     DEFAULT_EMBEDDINGS_OPENAI_MODEL,
     DEFAULT_EMBEDDINGS_PROVIDER,
@@ -26,6 +27,7 @@ from ..config import (
     ENV_EMBEDDINGS_COHERE_BASE_URL,
     ENV_EMBEDDINGS_COHERE_MODEL,
     ENV_EMBEDDINGS_LITELLM_MODEL,
+    ENV_EMBEDDINGS_LOCAL_FORCE_CPU,
     ENV_EMBEDDINGS_LOCAL_MODEL,
     ENV_EMBEDDINGS_OPENAI_API_KEY,
     ENV_EMBEDDINGS_OPENAI_BASE_URL,
@@ -92,15 +94,18 @@ class LocalSTEmbeddings(Embeddings):
     The embedding dimension is auto-detected from the model.
     """
 
-    def __init__(self, model_name: str | None = None):
+    def __init__(self, model_name: str | None = None, force_cpu: bool = False):
         """
         Initialize local SentenceTransformers embeddings.
 
         Args:
             model_name: Name of the SentenceTransformer model to use.
                 Default: BAAI/bge-small-en-v1.5
+            force_cpu: Force CPU mode (avoids MPS/XPC issues on macOS in daemon mode).
+                Default: False
         """
         self.model_name = model_name or DEFAULT_EMBEDDINGS_LOCAL_MODEL
+        self.force_cpu = force_cpu
         self._model = None
         self._dimension: int | None = None
 
@@ -128,20 +133,115 @@ class LocalSTEmbeddings(Embeddings):
             )
 
         logger.info(f"Embeddings: initializing local provider with model {self.model_name}")
-        # Disable lazy loading (meta tensors) which causes issues with newer transformers/accelerate
-        # Setting low_cpu_mem_usage=False and device_map=None ensures tensors are fully materialized
+
+        # Determine device based on hardware availability.
+        # We always set low_cpu_mem_usage=False to prevent lazy loading (meta tensors)
+        # which can cause issues when accelerate is installed but no GPU is available.
+        import torch
+
+        # Force CPU mode if configured (used in daemon mode to avoid MPS/XPC issues on macOS)
+        if self.force_cpu:
+            device = "cpu"
+            logger.info("Embeddings: forcing CPU mode")
+        else:
+            # Check for GPU (CUDA) or Apple Silicon (MPS)
+            # Wrap in try-except to gracefully handle any device detection issues
+            # (e.g., in CI environments or when PyTorch is built without GPU support)
+            device = "cpu"  # Default to CPU
+            try:
+                has_gpu = torch.cuda.is_available() or (
+                    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+                )
+                if has_gpu:
+                    device = None  # Let sentence-transformers auto-detect GPU/MPS
+            except Exception as e:
+                logger.warning(f"Failed to detect GPU/MPS, falling back to CPU: {e}")
+
         self._model = SentenceTransformer(
             self.model_name,
-            model_kwargs={"low_cpu_mem_usage": False, "device_map": None},
+            device=device,
+            model_kwargs={"low_cpu_mem_usage": False},
         )
 
         self._dimension = self._model.get_sentence_embedding_dimension()
         logger.info(f"Embeddings: local provider initialized (dim: {self._dimension})")
 
+    def _is_xpc_error(self, error: Exception) -> bool:
+        """
+        Check if an error is an XPC connection error (macOS daemon issue).
+
+        On macOS, long-running daemons can lose XPC connections to system services
+        when the process is idle for extended periods.
+        """
+        error_str = str(error).lower()
+        return "xpc_error_connection_invalid" in error_str or "xpc error" in error_str
+
+    def _reinitialize_model_sync(self) -> None:
+        """
+        Clear and reinitialize the embedding model synchronously.
+
+        This is used to recover from XPC errors on macOS where the
+        PyTorch/MPS backend loses its connection to system services.
+        """
+        logger.warning(f"Reinitializing embedding model {self.model_name} due to backend error")
+
+        # Clear existing model
+        self._model = None
+
+        # Force garbage collection to free resources
+        import gc
+
+        import torch
+
+        gc.collect()
+
+        # If using CUDA/MPS, clear the cache
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            try:
+                torch.mps.empty_cache()
+            except AttributeError:
+                pass  # Method might not exist in all PyTorch versions
+
+        # Reinitialize the model (inline version of initialize() but synchronous)
+        try:
+            from sentence_transformers import SentenceTransformer
+        except ImportError:
+            raise ImportError(
+                "sentence-transformers is required for LocalSTEmbeddings. "
+                "Install it with: pip install sentence-transformers"
+            )
+
+        # Determine device based on hardware availability
+        if self.force_cpu:
+            device = "cpu"
+        else:
+            # Wrap in try-except to gracefully handle any device detection issues
+            device = "cpu"  # Default to CPU
+            try:
+                has_gpu = torch.cuda.is_available() or (
+                    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+                )
+                if has_gpu:
+                    device = None  # Let sentence-transformers auto-detect GPU/MPS
+            except Exception as e:
+                logger.warning(f"Failed to detect GPU/MPS during reinit, falling back to CPU: {e}")
+
+        self._model = SentenceTransformer(
+            self.model_name,
+            device=device,
+            model_kwargs={"low_cpu_mem_usage": False},
+        )
+
+        logger.info("Embeddings: local provider reinitialized successfully")
+
     def encode(self, texts: list[str]) -> list[list[float]]:
         """
         Generate embeddings for a list of texts.
 
+        Automatically recovers from XPC errors on macOS by reinitializing the model.
+
         Args:
             texts: List of text strings to encode
 
@@ -150,8 +250,27 @@ class LocalSTEmbeddings(Embeddings):
         """
         if self._model is None:
             raise RuntimeError("Embeddings not initialized. Call initialize() first.")
-        embeddings = self._model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
-        return [emb.tolist() for emb in embeddings]
+
+        # Try encoding with automatic recovery from XPC errors
+        max_retries = 1
+        for attempt in range(max_retries + 1):
+            try:
+                embeddings = self._model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+                return [emb.tolist() for emb in embeddings]
+            except Exception as e:
+                # Check if this is an XPC error (macOS daemon issue)
+                if self._is_xpc_error(e) and attempt < max_retries:
+                    logger.warning(f"XPC error detected in embedding generation (attempt {attempt + 1}): {e}")
+                    try:
+                        self._reinitialize_model_sync()
+                        logger.info("Model reinitialized successfully, retrying embedding generation")
+                        continue
+                    except Exception as reinit_error:
+                        logger.error(f"Failed to reinitialize model: {reinit_error}")
+                        raise Exception(f"Failed to recover from XPC error: {str(e)}")
+                else:
+                    # Not an XPC error or out of retries
+                    raise
 
 
 class RemoteTEIEmbeddings(Embeddings):
@@ -673,24 +792,28 @@ class LiteLLMEmbeddings(Embeddings):
 
 def create_embeddings_from_env() -> Embeddings:
     """
-    Create an Embeddings instance based on environment variables.
+    Create an Embeddings instance based on configuration.
 
-    See hindsight_api.config for environment variable names and defaults.
+    Reads configuration via get_config() to ensure consistency across the codebase.
 
     Returns:
         Configured Embeddings instance
     """
-    provider = os.environ.get(ENV_EMBEDDINGS_PROVIDER, DEFAULT_EMBEDDINGS_PROVIDER).lower()
+    from ..config import get_config
+
+    config = get_config()
+    provider = config.embeddings_provider.lower()
 
     if provider == "tei":
-        url = os.environ.get(ENV_EMBEDDINGS_TEI_URL)
+        url = config.embeddings_tei_url
         if not url:
             raise ValueError(f"{ENV_EMBEDDINGS_TEI_URL} is required when {ENV_EMBEDDINGS_PROVIDER} is 'tei'")
         return RemoteTEIEmbeddings(base_url=url)
     elif provider == "local":
-        model = os.environ.get(ENV_EMBEDDINGS_LOCAL_MODEL)
-        model_name = model or DEFAULT_EMBEDDINGS_LOCAL_MODEL
-        return LocalSTEmbeddings(model_name=model_name)
+        return LocalSTEmbeddings(
+            model_name=config.embeddings_local_model,
+            force_cpu=config.embeddings_local_force_cpu,
+        )
     elif provider == "openai":
         # Use dedicated embeddings API key, or fall back to LLM API key
         api_key = os.environ.get(ENV_EMBEDDINGS_OPENAI_API_KEY) or os.environ.get(ENV_LLM_API_KEY)
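
As with the reranker, the local embeddings provider can now be pinned to CPU. The environment variable behind ENV_EMBEDDINGS_LOCAL_FORCE_CPU is not spelled out in this diff, so the sketch below passes force_cpu directly instead of going through create_embeddings_from_env(); the awaited initialize() call is an assumption inferred from the "Call initialize() first" error in encode():

    import asyncio

    from hindsight_api.engine.embeddings import LocalSTEmbeddings

    async def main() -> None:
        emb = LocalSTEmbeddings(model_name="BAAI/bge-small-en-v1.5", force_cpu=True)
        await emb.initialize()  # assumed async, mirroring the RuntimeError message in encode()
        vectors = emb.encode(["hello world"])  # encode() is synchronous per the diff
        print(len(vectors[0]))  # embedding dimension, auto-detected from the model

    asyncio.run(main())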