hindsight-api 0.4.0-py3-none-any.whl → 0.4.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +1 -1
- hindsight_api/api/http.py +3 -2
- hindsight_api/config.py +114 -1
- hindsight_api/daemon.py +4 -1
- hindsight_api/engine/consolidation/consolidator.py +145 -49
- hindsight_api/engine/consolidation/prompts.py +21 -13
- hindsight_api/engine/cross_encoder.py +43 -109
- hindsight_api/engine/embeddings.py +35 -99
- hindsight_api/engine/memory_engine.py +11 -5
- hindsight_api/engine/reflect/tools.py +1 -1
- hindsight_api/engine/retain/fact_extraction.py +16 -0
- hindsight_api/extensions/builtin/tenant.py +8 -5
- hindsight_api/main.py +26 -2
- {hindsight_api-0.4.0.dist-info → hindsight_api-0.4.2.dist-info}/METADATA +1 -1
- {hindsight_api-0.4.0.dist-info → hindsight_api-0.4.2.dist-info}/RECORD +17 -17
- {hindsight_api-0.4.0.dist-info → hindsight_api-0.4.2.dist-info}/WHEEL +0 -0
- {hindsight_api-0.4.0.dist-info → hindsight_api-0.4.2.dist-info}/entry_points.txt +0 -0

hindsight_api/engine/cross_encoder.py
CHANGED

@@ -20,6 +20,7 @@ from ..config import (
     DEFAULT_RERANKER_FLASHRANK_CACHE_DIR,
     DEFAULT_RERANKER_FLASHRANK_MODEL,
     DEFAULT_RERANKER_LITELLM_MODEL,
+    DEFAULT_RERANKER_LOCAL_FORCE_CPU,
     DEFAULT_RERANKER_LOCAL_MAX_CONCURRENT,
     DEFAULT_RERANKER_LOCAL_MODEL,
     DEFAULT_RERANKER_PROVIDER,
@@ -33,6 +34,7 @@ from ..config import (
     ENV_RERANKER_FLASHRANK_CACHE_DIR,
     ENV_RERANKER_FLASHRANK_MODEL,
     ENV_RERANKER_LITELLM_MODEL,
+    ENV_RERANKER_LOCAL_FORCE_CPU,
     ENV_RERANKER_LOCAL_MAX_CONCURRENT,
     ENV_RERANKER_LOCAL_MODEL,
     ENV_RERANKER_PROVIDER,
@@ -99,7 +101,7 @@ class LocalSTCrossEncoder(CrossEncoderModel):
     _executor: ThreadPoolExecutor | None = None
     _max_concurrent: int = 4  # Limit concurrent CPU-bound reranking calls

-    def __init__(self, model_name: str | None = None, max_concurrent: int = 4):
+    def __init__(self, model_name: str | None = None, max_concurrent: int = 4, force_cpu: bool = False):
         """
         Initialize local SentenceTransformers cross-encoder.

@@ -108,8 +110,11 @@ class LocalSTCrossEncoder(CrossEncoderModel):
                 Default: cross-encoder/ms-marco-MiniLM-L-6-v2
             max_concurrent: Maximum concurrent reranking calls (default: 2).
                 Higher values may cause CPU thrashing under load.
+            force_cpu: Force CPU mode (avoids MPS/XPC issues on macOS in daemon mode).
+                Default: False
         """
         self.model_name = model_name or DEFAULT_RERANKER_LOCAL_MODEL
+        self.force_cpu = force_cpu
         self._model = None
         LocalSTCrossEncoder._max_concurrent = max_concurrent

@@ -139,13 +144,23 @@ class LocalSTCrossEncoder(CrossEncoderModel):
         # after loading, which conflicts with accelerate's device_map handling.
         import torch

-        #
-
-
-        if has_gpu:
-            device = None  # Let sentence-transformers auto-detect GPU/MPS
-        else:
+        # Force CPU mode if configured (used in daemon mode to avoid MPS/XPC issues on macOS)
+        if self.force_cpu:
             device = "cpu"
+            logger.info("Reranker: forcing CPU mode (HINDSIGHT_API_RERANKER_LOCAL_FORCE_CPU=1)")
+        else:
+            # Check for GPU (CUDA) or Apple Silicon (MPS)
+            # Wrap in try-except to gracefully handle any device detection issues
+            # (e.g., in CI environments or when PyTorch is built without GPU support)
+            device = "cpu"  # Default to CPU
+            try:
+                has_gpu = torch.cuda.is_available() or (
+                    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+                )
+                if has_gpu:
+                    device = None  # Let sentence-transformers auto-detect GPU/MPS
+            except Exception as e:
+                logger.warning(f"Failed to detect GPU/MPS, falling back to CPU: {e}")

         self._model = CrossEncoder(
             self.model_name,
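
The hunk above replaces the old unconditional GPU/MPS auto-detection with a force-CPU short-circuit plus a guarded detection path. A minimal, self-contained sketch of that selection logic (the helper name is illustrative, not part of the package):

import logging

import torch

logger = logging.getLogger(__name__)


def select_device(force_cpu: bool = False) -> str | None:
    """Return "cpu", or None to let sentence-transformers auto-detect GPU/MPS."""
    if force_cpu:
        logger.info("forcing CPU mode")
        return "cpu"
    device = "cpu"  # default when detection fails or no accelerator is present
    try:
        has_gpu = torch.cuda.is_available() or (
            hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        )
        if has_gpu:
            device = None
    except Exception as e:  # e.g. CI images or CPU-only PyTorch builds
        logger.warning("Failed to detect GPU/MPS, falling back to CPU: %s", e)
    return device
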
@@ -163,101 +178,16 @@ class LocalSTCrossEncoder(CrossEncoderModel):
         else:
             logger.info("Reranker: local provider initialized (using existing executor)")

-    def _is_xpc_error(self, error: Exception) -> bool:
-        """
-        Check if an error is an XPC connection error (macOS daemon issue).
-
-        On macOS, long-running daemons can lose XPC connections to system services
-        when the process is idle for extended periods.
-        """
-        error_str = str(error).lower()
-        return "xpc_error_connection_invalid" in error_str or "xpc error" in error_str
-
-    def _reinitialize_model_sync(self) -> None:
-        """
-        Clear and reinitialize the cross-encoder model synchronously.
-
-        This is used to recover from XPC errors on macOS where the
-        PyTorch/MPS backend loses its connection to system services.
-        """
-        logger.warning(f"Reinitializing reranker model {self.model_name} due to backend error")
-
-        # Clear existing model
-        self._model = None
-
-        # Force garbage collection to free resources
-        import gc
-
-        import torch
-
-        gc.collect()
-
-        # If using CUDA/MPS, clear the cache
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-            try:
-                torch.mps.empty_cache()
-            except AttributeError:
-                pass  # Method might not exist in all PyTorch versions
-
-        # Reinitialize the model
-        try:
-            from sentence_transformers import CrossEncoder
-        except ImportError:
-            raise ImportError(
-                "sentence-transformers is required for LocalSTCrossEncoder. "
-                "Install it with: pip install sentence-transformers"
-            )
-
-        # Determine device based on hardware availability
-        has_gpu = torch.cuda.is_available() or (hasattr(torch.backends, "mps") and torch.backends.mps.is_available())
-
-        if has_gpu:
-            device = None  # Let sentence-transformers auto-detect GPU/MPS
-        else:
-            device = "cpu"
-
-        self._model = CrossEncoder(
-            self.model_name,
-            device=device,
-            model_kwargs={"low_cpu_mem_usage": False},
-        )
-
-        logger.info("Reranker: local provider reinitialized successfully")
-
-    def _predict_with_recovery(self, pairs: list[tuple[str, str]]) -> list[float]:
-        """
-        Predict with automatic recovery from XPC errors.
-
-        This runs synchronously in the thread pool.
-        """
-        max_retries = 1
-        for attempt in range(max_retries + 1):
-            try:
-                scores = self._model.predict(pairs, show_progress_bar=False)
-                return scores.tolist() if hasattr(scores, "tolist") else list(scores)
-            except Exception as e:
-                # Check if this is an XPC error (macOS daemon issue)
-                if self._is_xpc_error(e) and attempt < max_retries:
-                    logger.warning(f"XPC error detected in reranker (attempt {attempt + 1}): {e}")
-                    try:
-                        self._reinitialize_model_sync()
-                        logger.info("Reranker reinitialized successfully, retrying prediction")
-                        continue
-                    except Exception as reinit_error:
-                        logger.error(f"Failed to reinitialize reranker: {reinit_error}")
-                        raise Exception(f"Failed to recover from XPC error: {str(e)}")
-                else:
-                    # Not an XPC error or out of retries
-                    raise
+    def _predict_sync(self, pairs: list[tuple[str, str]]) -> list[float]:
+        """Synchronous prediction wrapper for thread pool execution."""
+        scores = self._model.predict(pairs, show_progress_bar=False)
+        return scores.tolist() if hasattr(scores, "tolist") else list(scores)

     async def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
         """
         Score query-document pairs for relevance.

         Uses a dedicated thread pool with limited workers to prevent CPU thrashing.
-        Automatically recovers from XPC errors on macOS by reinitializing the model.

         Args:
             pairs: List of (query, document) tuples to score
@@ -272,7 +202,7 @@ class LocalSTCrossEncoder(CrossEncoderModel):
         loop = asyncio.get_event_loop()
         return await loop.run_in_executor(
             LocalSTCrossEncoder._executor,
-            self._predict_with_recovery,
+            self._predict_sync,
             pairs,
         )

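
With the recovery wrapper gone, predict() simply hands the CPU-bound scoring to the class-level thread pool. A self-contained sketch of that offloading pattern (pool size and function names here are illustrative):

import asyncio
from concurrent.futures import ThreadPoolExecutor

_executor = ThreadPoolExecutor(max_workers=4)  # bounded, mirrors _max_concurrent


def _predict_sync(model, pairs: list[tuple[str, str]]) -> list[float]:
    scores = model.predict(pairs, show_progress_bar=False)
    return scores.tolist() if hasattr(scores, "tolist") else list(scores)


async def predict(model, pairs: list[tuple[str, str]]) -> list[float]:
    # Run the CPU-bound scoring off the event loop, capped by the pool size.
    loop = asyncio.get_event_loop()
    return await loop.run_in_executor(_executor, _predict_sync, model, pairs)
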
@@ -873,29 +803,33 @@ class LiteLLMCrossEncoder(CrossEncoderModel):

 def create_cross_encoder_from_env() -> CrossEncoderModel:
     """
-    Create a CrossEncoderModel instance based on
+    Create a CrossEncoderModel instance based on configuration.

-
+    Reads configuration via get_config() to ensure consistency across the codebase.

     Returns:
         Configured CrossEncoderModel instance
     """
-
+    from ..config import get_config
+
+    config = get_config()
+    provider = config.reranker_provider.lower()

     if provider == "tei":
-        url =
+        url = config.reranker_tei_url
         if not url:
             raise ValueError(f"{ENV_RERANKER_TEI_URL} is required when {ENV_RERANKER_PROVIDER} is 'tei'")
-
-
-
+        return RemoteTEICrossEncoder(
+            base_url=url,
+            batch_size=config.reranker_tei_batch_size,
+            max_concurrent=config.reranker_tei_max_concurrent,
+        )
     elif provider == "local":
-
-
-
-
+        return LocalSTCrossEncoder(
+            model_name=config.reranker_local_model,
+            max_concurrent=config.reranker_local_max_concurrent,
+            force_cpu=config.reranker_local_force_cpu,
         )
-        return LocalSTCrossEncoder(model_name=model_name, max_concurrent=max_concurrent)
     elif provider == "cohere":
         api_key = os.environ.get(ENV_COHERE_API_KEY)
         if not api_key:
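
A hedged usage sketch of the rewritten factory: with the local provider selected, the reranker picks up its model, concurrency cap, and the new force-CPU switch from HindsightConfig instead of ad-hoc environment reads. The force-CPU variable name comes from the log message above; the provider variable name follows the HINDSIGHT_API_* pattern seen elsewhere in this diff and is an assumption, as is the need to set it before the config is first loaded.

import os

# Assumed env names (HINDSIGHT_API_* pattern); set before the config is first loaded.
os.environ.setdefault("HINDSIGHT_API_RERANKER_PROVIDER", "local")
os.environ.setdefault("HINDSIGHT_API_RERANKER_LOCAL_FORCE_CPU", "1")

from hindsight_api.engine.cross_encoder import create_cross_encoder_from_env

reranker = create_cross_encoder_from_env()  # expected: LocalSTCrossEncoder(..., force_cpu=True)
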

hindsight_api/engine/embeddings.py
CHANGED

@@ -18,6 +18,7 @@ import httpx
 from ..config import (
     DEFAULT_EMBEDDINGS_COHERE_MODEL,
     DEFAULT_EMBEDDINGS_LITELLM_MODEL,
+    DEFAULT_EMBEDDINGS_LOCAL_FORCE_CPU,
     DEFAULT_EMBEDDINGS_LOCAL_MODEL,
     DEFAULT_EMBEDDINGS_OPENAI_MODEL,
     DEFAULT_EMBEDDINGS_PROVIDER,
@@ -26,6 +27,7 @@ from ..config import (
     ENV_EMBEDDINGS_COHERE_BASE_URL,
     ENV_EMBEDDINGS_COHERE_MODEL,
     ENV_EMBEDDINGS_LITELLM_MODEL,
+    ENV_EMBEDDINGS_LOCAL_FORCE_CPU,
     ENV_EMBEDDINGS_LOCAL_MODEL,
     ENV_EMBEDDINGS_OPENAI_API_KEY,
     ENV_EMBEDDINGS_OPENAI_BASE_URL,
@@ -92,15 +94,18 @@ class LocalSTEmbeddings(Embeddings):
     The embedding dimension is auto-detected from the model.
     """

-    def __init__(self, model_name: str | None = None):
+    def __init__(self, model_name: str | None = None, force_cpu: bool = False):
         """
         Initialize local SentenceTransformers embeddings.

         Args:
             model_name: Name of the SentenceTransformer model to use.
                 Default: BAAI/bge-small-en-v1.5
+            force_cpu: Force CPU mode (avoids MPS/XPC issues on macOS in daemon mode).
+                Default: False
         """
         self.model_name = model_name or DEFAULT_EMBEDDINGS_LOCAL_MODEL
+        self.force_cpu = force_cpu
         self._model = None
         self._dimension: int | None = None

@@ -134,13 +139,23 @@ class LocalSTEmbeddings(Embeddings):
         # which can cause issues when accelerate is installed but no GPU is available.
         import torch

-        #
-
-
-        if has_gpu:
-            device = None  # Let sentence-transformers auto-detect GPU/MPS
-        else:
+        # Force CPU mode if configured (used in daemon mode to avoid MPS/XPC issues on macOS)
+        if self.force_cpu:
             device = "cpu"
+            logger.info("Embeddings: forcing CPU mode")
+        else:
+            # Check for GPU (CUDA) or Apple Silicon (MPS)
+            # Wrap in try-except to gracefully handle any device detection issues
+            # (e.g., in CI environments or when PyTorch is built without GPU support)
+            device = "cpu"  # Default to CPU
+            try:
+                has_gpu = torch.cuda.is_available() or (
+                    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+                )
+                if has_gpu:
+                    device = None  # Let sentence-transformers auto-detect GPU/MPS
+            except Exception as e:
+                logger.warning(f"Failed to detect GPU/MPS, falling back to CPU: {e}")

         self._model = SentenceTransformer(
             self.model_name,
@@ -151,75 +166,10 @@ class LocalSTEmbeddings(Embeddings):
         self._dimension = self._model.get_sentence_embedding_dimension()
         logger.info(f"Embeddings: local provider initialized (dim: {self._dimension})")

-    def _is_xpc_error(self, error: Exception) -> bool:
-        """
-        Check if an error is an XPC connection error (macOS daemon issue).
-
-        On macOS, long-running daemons can lose XPC connections to system services
-        when the process is idle for extended periods.
-        """
-        error_str = str(error).lower()
-        return "xpc_error_connection_invalid" in error_str or "xpc error" in error_str
-
-    def _reinitialize_model_sync(self) -> None:
-        """
-        Clear and reinitialize the embedding model synchronously.
-
-        This is used to recover from XPC errors on macOS where the
-        PyTorch/MPS backend loses its connection to system services.
-        """
-        logger.warning(f"Reinitializing embedding model {self.model_name} due to backend error")
-
-        # Clear existing model
-        self._model = None
-
-        # Force garbage collection to free resources
-        import gc
-
-        import torch
-
-        gc.collect()
-
-        # If using CUDA/MPS, clear the cache
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-            try:
-                torch.mps.empty_cache()
-            except AttributeError:
-                pass  # Method might not exist in all PyTorch versions
-
-        # Reinitialize the model (inline version of initialize() but synchronous)
-        try:
-            from sentence_transformers import SentenceTransformer
-        except ImportError:
-            raise ImportError(
-                "sentence-transformers is required for LocalSTEmbeddings. "
-                "Install it with: pip install sentence-transformers"
-            )
-
-        # Determine device based on hardware availability
-        has_gpu = torch.cuda.is_available() or (hasattr(torch.backends, "mps") and torch.backends.mps.is_available())
-
-        if has_gpu:
-            device = None  # Let sentence-transformers auto-detect GPU/MPS
-        else:
-            device = "cpu"
-
-        self._model = SentenceTransformer(
-            self.model_name,
-            device=device,
-            model_kwargs={"low_cpu_mem_usage": False},
-        )
-
-        logger.info("Embeddings: local provider reinitialized successfully")
-
     def encode(self, texts: list[str]) -> list[list[float]]:
         """
         Generate embeddings for a list of texts.

-        Automatically recovers from XPC errors on macOS by reinitializing the model.
-
         Args:
             texts: List of text strings to encode

@@ -229,26 +179,8 @@ class LocalSTEmbeddings(Embeddings):
         if self._model is None:
             raise RuntimeError("Embeddings not initialized. Call initialize() first.")

-
-
-        for attempt in range(max_retries + 1):
-            try:
-                embeddings = self._model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
-                return [emb.tolist() for emb in embeddings]
-            except Exception as e:
-                # Check if this is an XPC error (macOS daemon issue)
-                if self._is_xpc_error(e) and attempt < max_retries:
-                    logger.warning(f"XPC error detected in embedding generation (attempt {attempt + 1}): {e}")
-                    try:
-                        self._reinitialize_model_sync()
-                        logger.info("Model reinitialized successfully, retrying embedding generation")
-                        continue
-                    except Exception as reinit_error:
-                        logger.error(f"Failed to reinitialize model: {reinit_error}")
-                        raise Exception(f"Failed to recover from XPC error: {str(e)}")
-                else:
-                    # Not an XPC error or out of retries
-                    raise
+        embeddings = self._model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+        return [emb.tolist() for emb in embeddings]


 class RemoteTEIEmbeddings(Embeddings):
@@ -770,24 +702,28 @@ class LiteLLMEmbeddings(Embeddings):

 def create_embeddings_from_env() -> Embeddings:
     """
-    Create an Embeddings instance based on
+    Create an Embeddings instance based on configuration.

-
+    Reads configuration via get_config() to ensure consistency across the codebase.

     Returns:
         Configured Embeddings instance
     """
-
+    from ..config import get_config
+
+    config = get_config()
+    provider = config.embeddings_provider.lower()

     if provider == "tei":
-        url =
+        url = config.embeddings_tei_url
         if not url:
             raise ValueError(f"{ENV_EMBEDDINGS_TEI_URL} is required when {ENV_EMBEDDINGS_PROVIDER} is 'tei'")
         return RemoteTEIEmbeddings(base_url=url)
     elif provider == "local":
-
-
-
+        return LocalSTEmbeddings(
+            model_name=config.embeddings_local_model,
+            force_cpu=config.embeddings_local_force_cpu,
+        )
     elif provider == "openai":
         # Use dedicated embeddings API key, or fall back to LLM API key
         api_key = os.environ.get(ENV_EMBEDDINGS_OPENAI_API_KEY) or os.environ.get(ENV_LLM_API_KEY)

hindsight_api/engine/memory_engine.py
CHANGED

@@ -23,12 +23,17 @@ from ..metrics import get_metrics_collector
 from .db_budget import budgeted_operation

 # Context variable for current schema (async-safe, per-task isolation)
-
+# Note: default is None, actual default comes from config via get_current_schema()
+_current_schema: contextvars.ContextVar[str | None] = contextvars.ContextVar("current_schema", default=None)


 def get_current_schema() -> str:
-    """Get the current schema from context (default
-
+    """Get the current schema from context (falls back to config default)."""
+    schema = _current_schema.get()
+    if schema is None:
+        # Fall back to configured default schema
+        return get_config().database_schema
+    return schema


 def fq_table(table_name: str) -> str:
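
The schema context variable now defaults to None, and get_current_schema() falls back to the configured default. A self-contained sketch of that contextvar-with-fallback pattern (get_config() replaced by a stand-in):

import contextvars

_current_schema: contextvars.ContextVar[str | None] = contextvars.ContextVar("current_schema", default=None)


def _configured_default_schema() -> str:
    # Stand-in for get_config().database_schema
    return "public"


def get_current_schema() -> str:
    schema = _current_schema.get()
    return schema if schema is not None else _configured_default_schema()


# Per-task override, e.g. while serving a tenant-scoped request:
token = _current_schema.set("tenant_a")
try:
    assert get_current_schema() == "tenant_a"
finally:
    _current_schema.reset(token)

assert get_current_schema() == "public"
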
@@ -881,11 +886,12 @@ class MemoryEngine(MemoryEngineInterface):
         if not self.db_url:
             raise ValueError("Database URL is required for migrations")
         logger.info("Running database migrations...")
-
+        # Use configured database schema for migrations (defaults to "public")
+        run_migrations(self.db_url, schema=get_config().database_schema)

         # Ensure embedding column dimension matches the model's dimension
         # This is done after migrations and after embeddings.initialize()
-        ensure_embedding_dimension(self.db_url, self.embeddings.dimension)
+        ensure_embedding_dimension(self.db_url, self.embeddings.dimension, schema=get_config().database_schema)

         logger.info(f"Connecting to PostgreSQL at {self.db_url}")


hindsight_api/engine/retain/fact_extraction.py
CHANGED

@@ -782,12 +782,28 @@ Text:
     usage = TokenUsage()  # Track cumulative usage across retries
     for attempt in range(max_retries):
         try:
+            # Use retain-specific overrides if set, otherwise fall back to global LLM config
+            max_retries = (
+                config.retain_llm_max_retries if config.retain_llm_max_retries is not None else config.llm_max_retries
+            )
+            initial_backoff = (
+                config.retain_llm_initial_backoff
+                if config.retain_llm_initial_backoff is not None
+                else config.llm_initial_backoff
+            )
+            max_backoff = (
+                config.retain_llm_max_backoff if config.retain_llm_max_backoff is not None else config.llm_max_backoff
+            )
+
             extraction_response_json, call_usage = await llm_config.call(
                 messages=[{"role": "system", "content": prompt}, {"role": "user", "content": user_message}],
                 response_format=response_schema,
                 scope="memory_extract_facts",
                 temperature=0.1,
                 max_completion_tokens=config.retain_max_completion_tokens,
+                max_retries=max_retries,
+                initial_backoff=initial_backoff,
+                max_backoff=max_backoff,
                 skip_validation=True,  # Get raw JSON, we'll validate leniently
                 return_usage=True,
             )
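
The additions above resolve each retry setting as "retain-specific override if set, else global LLM value". A minimal sketch of that resolution pattern; the dataclass below is illustrative, not the package's HindsightConfig:

from dataclasses import dataclass


@dataclass
class Cfg:
    llm_max_retries: int = 3
    llm_initial_backoff: float = 1.0
    llm_max_backoff: float = 30.0
    retain_llm_max_retries: int | None = None
    retain_llm_initial_backoff: float | None = None
    retain_llm_max_backoff: float | None = None


def resolve_retry_settings(cfg: Cfg) -> tuple[int, float, float]:
    max_retries = cfg.retain_llm_max_retries if cfg.retain_llm_max_retries is not None else cfg.llm_max_retries
    initial_backoff = (
        cfg.retain_llm_initial_backoff if cfg.retain_llm_initial_backoff is not None else cfg.llm_initial_backoff
    )
    max_backoff = cfg.retain_llm_max_backoff if cfg.retain_llm_max_backoff is not None else cfg.llm_max_backoff
    return max_retries, initial_backoff, max_backoff
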

hindsight_api/extensions/builtin/tenant.py
CHANGED

@@ -1,5 +1,6 @@
 """Built-in tenant extension implementations."""

+from hindsight_api.config import get_config
 from hindsight_api.extensions.tenant import AuthenticationError, Tenant, TenantContext, TenantExtension
 from hindsight_api.models import RequestContext

@@ -10,11 +11,13 @@ class ApiKeyTenantExtension(TenantExtension):

     This is a simple implementation that:
     1. Validates the API key matches HINDSIGHT_API_TENANT_API_KEY
-    2. Returns
+    2. Returns the configured schema (HINDSIGHT_API_DATABASE_SCHEMA, default 'public')
+       for all authenticated requests

     Configuration:
         HINDSIGHT_API_TENANT_EXTENSION=hindsight_api.extensions.builtin.tenant:ApiKeyTenantExtension
         HINDSIGHT_API_TENANT_API_KEY=your-secret-key
+        HINDSIGHT_API_DATABASE_SCHEMA=your-schema (optional, defaults to 'public')

     For multi-tenant setups with separate schemas per tenant, implement a custom
     TenantExtension that looks up the schema based on the API key or token claims.
@@ -27,11 +30,11 @@ class ApiKeyTenantExtension(TenantExtension):
             raise ValueError("HINDSIGHT_API_TENANT_API_KEY is required when using ApiKeyTenantExtension")

     async def authenticate(self, context: RequestContext) -> TenantContext:
-        """Validate API key and return
+        """Validate API key and return configured schema context."""
         if context.api_key != self.expected_api_key:
            raise AuthenticationError("Invalid API key")
-        return TenantContext(schema_name=
+        return TenantContext(schema_name=get_config().database_schema)

     async def list_tenants(self) -> list[Tenant]:
-        """Return
-        return [Tenant(schema=
+        """Return configured schema for single-tenant setup."""
+        return [Tenant(schema=get_config().database_schema)]
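
The docstring above points to a custom TenantExtension for real multi-tenancy. A rough sketch of what such an extension could look like, using only names visible in this diff; the constructor argument and key-to-schema mapping are illustrative, and the real extension loader may expect a no-argument constructor:

from hindsight_api.extensions.tenant import AuthenticationError, Tenant, TenantContext, TenantExtension
from hindsight_api.models import RequestContext


class StaticMapTenantExtension(TenantExtension):
    """Maps API keys to per-tenant schemas from a static dict (illustrative)."""

    def __init__(self, key_to_schema: dict[str, str] | None = None):
        self.key_to_schema = key_to_schema or {"key-a": "tenant_a", "key-b": "tenant_b"}

    async def authenticate(self, context: RequestContext) -> TenantContext:
        schema = self.key_to_schema.get(context.api_key)
        if schema is None:
            raise AuthenticationError("Invalid API key")
        return TenantContext(schema_name=schema)

    async def list_tenants(self) -> list[Tenant]:
        return [Tenant(schema=s) for s in self.key_to_schema.values()]
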
hindsight_api/main.py
CHANGED

@@ -170,31 +170,53 @@ def main():
     if args.log_level != config.log_level:
         config = HindsightConfig(
             database_url=config.database_url,
+            database_schema=config.database_schema,
             llm_provider=config.llm_provider,
             llm_api_key=config.llm_api_key,
             llm_model=config.llm_model,
             llm_base_url=config.llm_base_url,
             llm_max_concurrent=config.llm_max_concurrent,
+            llm_max_retries=config.llm_max_retries,
+            llm_initial_backoff=config.llm_initial_backoff,
+            llm_max_backoff=config.llm_max_backoff,
             llm_timeout=config.llm_timeout,
             retain_llm_provider=config.retain_llm_provider,
             retain_llm_api_key=config.retain_llm_api_key,
             retain_llm_model=config.retain_llm_model,
             retain_llm_base_url=config.retain_llm_base_url,
+            retain_llm_max_concurrent=config.retain_llm_max_concurrent,
+            retain_llm_max_retries=config.retain_llm_max_retries,
+            retain_llm_initial_backoff=config.retain_llm_initial_backoff,
+            retain_llm_max_backoff=config.retain_llm_max_backoff,
+            retain_llm_timeout=config.retain_llm_timeout,
             reflect_llm_provider=config.reflect_llm_provider,
             reflect_llm_api_key=config.reflect_llm_api_key,
             reflect_llm_model=config.reflect_llm_model,
             reflect_llm_base_url=config.reflect_llm_base_url,
+            reflect_llm_max_concurrent=config.reflect_llm_max_concurrent,
+            reflect_llm_max_retries=config.reflect_llm_max_retries,
+            reflect_llm_initial_backoff=config.reflect_llm_initial_backoff,
+            reflect_llm_max_backoff=config.reflect_llm_max_backoff,
+            reflect_llm_timeout=config.reflect_llm_timeout,
             consolidation_llm_provider=config.consolidation_llm_provider,
             consolidation_llm_api_key=config.consolidation_llm_api_key,
             consolidation_llm_model=config.consolidation_llm_model,
             consolidation_llm_base_url=config.consolidation_llm_base_url,
+            consolidation_llm_max_concurrent=config.consolidation_llm_max_concurrent,
+            consolidation_llm_max_retries=config.consolidation_llm_max_retries,
+            consolidation_llm_initial_backoff=config.consolidation_llm_initial_backoff,
+            consolidation_llm_max_backoff=config.consolidation_llm_max_backoff,
+            consolidation_llm_timeout=config.consolidation_llm_timeout,
             embeddings_provider=config.embeddings_provider,
             embeddings_local_model=config.embeddings_local_model,
+            embeddings_local_force_cpu=config.embeddings_local_force_cpu,
             embeddings_tei_url=config.embeddings_tei_url,
             embeddings_openai_base_url=config.embeddings_openai_base_url,
             embeddings_cohere_base_url=config.embeddings_cohere_base_url,
             reranker_provider=config.reranker_provider,
             reranker_local_model=config.reranker_local_model,
+            reranker_local_force_cpu=config.reranker_local_force_cpu,
+            reranker_local_max_concurrent=config.reranker_local_max_concurrent,
             reranker_tei_url=config.reranker_tei_url,
             reranker_tei_batch_size=config.reranker_tei_batch_size,
             reranker_tei_max_concurrent=config.reranker_tei_max_concurrent,
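
An aside on the rebuild above: HindsightConfig is copied field by field just to override log_level, which is why every new field has to be threaded through here. If HindsightConfig is a dataclass (an assumption), dataclasses.replace expresses the same rebuild without enumerating fields; toy example:

from dataclasses import dataclass, replace


@dataclass(frozen=True)
class ToyConfig:
    database_schema: str = "public"
    log_level: str = "INFO"


config = ToyConfig()
config = replace(config, log_level="DEBUG")  # all other fields copied unchanged
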
@@ -217,6 +239,7 @@ def main():
             retain_observations_async=config.retain_observations_async,
             enable_observations=config.enable_observations,
             consolidation_batch_size=config.consolidation_batch_size,
+            consolidation_max_tokens=config.consolidation_max_tokens,
             skip_llm_verification=config.skip_llm_verification,
             lazy_reranker=config.lazy_reranker,
             run_migrations_on_startup=config.run_migrations_on_startup,
@@ -341,6 +364,7 @@ def main():
     # Start idle checker in daemon mode
     if idle_middleware is not None:
         # Start the idle checker in a background thread with its own event loop
+        import logging
         import threading

         def run_idle_checker():
@@ -351,8 +375,8 @@ def main():
                 loop = asyncio.new_event_loop()
                 asyncio.set_event_loop(loop)
                 loop.run_until_complete(idle_middleware._check_idle())
-            except Exception:
-
+            except Exception as e:
+                logging.error(f"Idle checker error: {e}", exc_info=True)

         threading.Thread(target=run_idle_checker, daemon=True).start()

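
The last hunk replaces a silent except with a logged one inside the idle-checker thread. For reference, a generic sketch of the "daemon thread with its own event loop" pattern it runs in (the coroutine below is a stand-in for idle_middleware._check_idle()):

import asyncio
import logging
import threading


async def check_idle() -> None:
    # Stand-in for idle_middleware._check_idle()
    await asyncio.sleep(0)


def run_idle_checker() -> None:
    try:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(check_idle())
    except Exception as e:
        logging.error(f"Idle checker error: {e}", exc_info=True)


threading.Thread(target=run_idle_checker, daemon=True).start()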