hindsight-api 0.2.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. hindsight_api/admin/__init__.py +1 -0
  2. hindsight_api/admin/cli.py +311 -0
  3. hindsight_api/alembic/versions/f1a2b3c4d5e6_add_memory_links_composite_index.py +44 -0
  4. hindsight_api/alembic/versions/g2a3b4c5d6e7_add_tags_column.py +48 -0
  5. hindsight_api/alembic/versions/h3c4d5e6f7g8_mental_models_v4.py +112 -0
  6. hindsight_api/alembic/versions/i4d5e6f7g8h9_delete_opinions.py +41 -0
  7. hindsight_api/alembic/versions/j5e6f7g8h9i0_mental_model_versions.py +95 -0
  8. hindsight_api/alembic/versions/k6f7g8h9i0j1_add_directive_subtype.py +58 -0
  9. hindsight_api/alembic/versions/l7g8h9i0j1k2_add_worker_columns.py +109 -0
  10. hindsight_api/alembic/versions/m8h9i0j1k2l3_mental_model_id_to_text.py +41 -0
  11. hindsight_api/alembic/versions/n9i0j1k2l3m4_learnings_and_pinned_reflections.py +134 -0
  12. hindsight_api/alembic/versions/o0j1k2l3m4n5_migrate_mental_models_data.py +113 -0
  13. hindsight_api/alembic/versions/p1k2l3m4n5o6_new_knowledge_architecture.py +194 -0
  14. hindsight_api/alembic/versions/q2l3m4n5o6p7_fix_mental_model_fact_type.py +50 -0
  15. hindsight_api/alembic/versions/r3m4n5o6p7q8_add_reflect_response_to_reflections.py +47 -0
  16. hindsight_api/alembic/versions/s4n5o6p7q8r9_add_consolidated_at_to_memory_units.py +53 -0
  17. hindsight_api/alembic/versions/t5o6p7q8r9s0_rename_mental_models_to_observations.py +134 -0
  18. hindsight_api/alembic/versions/u6p7q8r9s0t1_mental_models_text_id.py +41 -0
  19. hindsight_api/alembic/versions/v7q8r9s0t1u2_add_max_tokens_to_mental_models.py +50 -0
  20. hindsight_api/api/http.py +1406 -118
  21. hindsight_api/api/mcp.py +11 -196
  22. hindsight_api/config.py +359 -27
  23. hindsight_api/engine/consolidation/__init__.py +5 -0
  24. hindsight_api/engine/consolidation/consolidator.py +859 -0
  25. hindsight_api/engine/consolidation/prompts.py +69 -0
  26. hindsight_api/engine/cross_encoder.py +706 -88
  27. hindsight_api/engine/db_budget.py +284 -0
  28. hindsight_api/engine/db_utils.py +11 -0
  29. hindsight_api/engine/directives/__init__.py +5 -0
  30. hindsight_api/engine/directives/models.py +37 -0
  31. hindsight_api/engine/embeddings.py +553 -29
  32. hindsight_api/engine/entity_resolver.py +8 -5
  33. hindsight_api/engine/interface.py +40 -17
  34. hindsight_api/engine/llm_wrapper.py +744 -68
  35. hindsight_api/engine/memory_engine.py +2505 -1017
  36. hindsight_api/engine/mental_models/__init__.py +14 -0
  37. hindsight_api/engine/mental_models/models.py +53 -0
  38. hindsight_api/engine/query_analyzer.py +4 -3
  39. hindsight_api/engine/reflect/__init__.py +18 -0
  40. hindsight_api/engine/reflect/agent.py +933 -0
  41. hindsight_api/engine/reflect/models.py +109 -0
  42. hindsight_api/engine/reflect/observations.py +186 -0
  43. hindsight_api/engine/reflect/prompts.py +483 -0
  44. hindsight_api/engine/reflect/tools.py +437 -0
  45. hindsight_api/engine/reflect/tools_schema.py +250 -0
  46. hindsight_api/engine/response_models.py +168 -4
  47. hindsight_api/engine/retain/bank_utils.py +79 -201
  48. hindsight_api/engine/retain/fact_extraction.py +424 -195
  49. hindsight_api/engine/retain/fact_storage.py +35 -12
  50. hindsight_api/engine/retain/link_utils.py +29 -24
  51. hindsight_api/engine/retain/orchestrator.py +24 -43
  52. hindsight_api/engine/retain/types.py +11 -2
  53. hindsight_api/engine/search/graph_retrieval.py +43 -14
  54. hindsight_api/engine/search/link_expansion_retrieval.py +391 -0
  55. hindsight_api/engine/search/mpfp_retrieval.py +362 -117
  56. hindsight_api/engine/search/reranking.py +2 -2
  57. hindsight_api/engine/search/retrieval.py +848 -201
  58. hindsight_api/engine/search/tags.py +172 -0
  59. hindsight_api/engine/search/think_utils.py +42 -141
  60. hindsight_api/engine/search/trace.py +12 -1
  61. hindsight_api/engine/search/tracer.py +26 -6
  62. hindsight_api/engine/search/types.py +21 -3
  63. hindsight_api/engine/task_backend.py +113 -106
  64. hindsight_api/engine/utils.py +1 -152
  65. hindsight_api/extensions/__init__.py +10 -1
  66. hindsight_api/extensions/builtin/tenant.py +5 -1
  67. hindsight_api/extensions/context.py +10 -1
  68. hindsight_api/extensions/operation_validator.py +81 -4
  69. hindsight_api/extensions/tenant.py +26 -0
  70. hindsight_api/main.py +69 -6
  71. hindsight_api/mcp_local.py +12 -53
  72. hindsight_api/mcp_tools.py +494 -0
  73. hindsight_api/metrics.py +433 -48
  74. hindsight_api/migrations.py +141 -1
  75. hindsight_api/models.py +3 -3
  76. hindsight_api/pg0.py +53 -0
  77. hindsight_api/server.py +39 -2
  78. hindsight_api/worker/__init__.py +11 -0
  79. hindsight_api/worker/main.py +296 -0
  80. hindsight_api/worker/poller.py +486 -0
  81. {hindsight_api-0.2.1.dist-info → hindsight_api-0.4.0.dist-info}/METADATA +16 -6
  82. hindsight_api-0.4.0.dist-info/RECORD +112 -0
  83. {hindsight_api-0.2.1.dist-info → hindsight_api-0.4.0.dist-info}/entry_points.txt +2 -0
  84. hindsight_api/engine/retain/observation_regeneration.py +0 -254
  85. hindsight_api/engine/search/observation_utils.py +0 -125
  86. hindsight_api/engine/search/scoring.py +0 -159
  87. hindsight_api-0.2.1.dist-info/RECORD +0 -75
  88. {hindsight_api-0.2.1.dist-info → hindsight_api-0.4.0.dist-info}/WHEEL +0 -0
@@ -3,8 +3,8 @@ Embeddings abstraction for the memory system.
3
3
 
4
4
  Provides an interface for generating embeddings with different backends.
5
5
 
6
- IMPORTANT: All embeddings must produce 384-dimensional vectors to match
7
- the database schema (pgvector column defined as vector(384)).
6
+ The embedding dimension is auto-detected from the model at initialization.
7
+ The database schema is automatically adjusted to match the model's dimension.
8
8
 
9
9
  Configuration via environment variables - see hindsight_api.config for all env var names.
10
10
  """
@@ -16,12 +16,25 @@ from abc import ABC, abstractmethod
16
16
  import httpx
17
17
 
18
18
  from ..config import (
19
+ DEFAULT_EMBEDDINGS_COHERE_MODEL,
20
+ DEFAULT_EMBEDDINGS_LITELLM_MODEL,
19
21
  DEFAULT_EMBEDDINGS_LOCAL_MODEL,
22
+ DEFAULT_EMBEDDINGS_OPENAI_MODEL,
20
23
  DEFAULT_EMBEDDINGS_PROVIDER,
21
- EMBEDDING_DIMENSION,
24
+ DEFAULT_LITELLM_API_BASE,
25
+ ENV_COHERE_API_KEY,
26
+ ENV_EMBEDDINGS_COHERE_BASE_URL,
27
+ ENV_EMBEDDINGS_COHERE_MODEL,
28
+ ENV_EMBEDDINGS_LITELLM_MODEL,
22
29
  ENV_EMBEDDINGS_LOCAL_MODEL,
30
+ ENV_EMBEDDINGS_OPENAI_API_KEY,
31
+ ENV_EMBEDDINGS_OPENAI_BASE_URL,
32
+ ENV_EMBEDDINGS_OPENAI_MODEL,
23
33
  ENV_EMBEDDINGS_PROVIDER,
24
34
  ENV_EMBEDDINGS_TEI_URL,
35
+ ENV_LITELLM_API_BASE,
36
+ ENV_LITELLM_API_KEY,
37
+ ENV_LLM_API_KEY,
25
38
  )
26
39
 
27
40
  logger = logging.getLogger(__name__)
@@ -31,8 +44,8 @@ class Embeddings(ABC):
31
44
  """
32
45
  Abstract base class for embedding generation.
33
46
 
34
- All implementations MUST generate 384-dimensional embeddings to match
35
- the database schema.
47
+ The embedding dimension is determined by the model and detected at initialization.
48
+ The database schema is automatically adjusted to match the model's dimension.
36
49
  """
37
50
 
38
51
  @property
@@ -41,6 +54,12 @@ class Embeddings(ABC):
41
54
  """Return a human-readable name for this provider (e.g., 'local', 'tei')."""
42
55
  pass
43
56
 
57
+ @property
58
+ @abstractmethod
59
+ def dimension(self) -> int:
60
+ """Return the embedding dimension produced by this model."""
61
+ pass
62
+
44
63
  @abstractmethod
45
64
  async def initialize(self) -> None:
46
65
  """
@@ -54,13 +73,13 @@ class Embeddings(ABC):
54
73
  @abstractmethod
55
74
  def encode(self, texts: list[str]) -> list[list[float]]:
56
75
  """
57
- Generate 384-dimensional embeddings for a list of texts.
76
+ Generate embeddings for a list of texts.
58
77
 
59
78
  Args:
60
79
  texts: List of text strings to encode
61
80
 
62
81
  Returns:
63
- List of 384-dimensional embedding vectors (each is a list of floats)
82
+ List of embedding vectors (each is a list of floats)
64
83
  """
65
84
  pass
66
85
 
@@ -70,9 +89,7 @@ class LocalSTEmbeddings(Embeddings):
70
89
  Local embeddings implementation using SentenceTransformers.
71
90
 
72
91
  Call initialize() during startup to load the model and avoid cold starts.
73
-
74
- Default model is BAAI/bge-small-en-v1.5 which produces 384-dimensional
75
- embeddings matching the database schema.
92
+ The embedding dimension is auto-detected from the model.
76
93
  """
77
94
 
78
95
  def __init__(self, model_name: str | None = None):
@@ -81,16 +98,22 @@ class LocalSTEmbeddings(Embeddings):
81
98
 
82
99
  Args:
83
100
  model_name: Name of the SentenceTransformer model to use.
84
- Must produce 384-dimensional embeddings.
85
101
  Default: BAAI/bge-small-en-v1.5
86
102
  """
87
103
  self.model_name = model_name or DEFAULT_EMBEDDINGS_LOCAL_MODEL
88
104
  self._model = None
105
+ self._dimension: int | None = None
89
106
 
90
107
  @property
91
108
  def provider_name(self) -> str:
92
109
  return "local"
93
110
 
111
+ @property
112
+ def dimension(self) -> int:
113
+ if self._dimension is None:
114
+ raise RuntimeError("Embeddings not initialized. Call initialize() first.")
115
+ return self._dimension
116
+
94
117
  async def initialize(self) -> None:
95
118
  """Load the embedding model."""
96
119
  if self._model is not None:
@@ -105,38 +128,127 @@ class LocalSTEmbeddings(Embeddings):
105
128
  )
106
129
 
107
130
  logger.info(f"Embeddings: initializing local provider with model {self.model_name}")
108
- # Disable lazy loading (meta tensors) which causes issues with newer transformers/accelerate
109
- # Setting low_cpu_mem_usage=False and device_map=None ensures tensors are fully materialized
131
+
132
+ # Determine device based on hardware availability.
133
+ # We always set low_cpu_mem_usage=False to prevent lazy loading (meta tensors)
134
+ # which can cause issues when accelerate is installed but no GPU is available.
135
+ import torch
136
+
137
+ # Check for GPU (CUDA) or Apple Silicon (MPS)
138
+ has_gpu = torch.cuda.is_available() or (hasattr(torch.backends, "mps") and torch.backends.mps.is_available())
139
+
140
+ if has_gpu:
141
+ device = None # Let sentence-transformers auto-detect GPU/MPS
142
+ else:
143
+ device = "cpu"
144
+
110
145
  self._model = SentenceTransformer(
111
146
  self.model_name,
112
- model_kwargs={"low_cpu_mem_usage": False, "device_map": None},
147
+ device=device,
148
+ model_kwargs={"low_cpu_mem_usage": False},
113
149
  )
114
150
 
115
- # Validate dimension matches database schema
116
- model_dim = self._model.get_sentence_embedding_dimension()
117
- if model_dim != EMBEDDING_DIMENSION:
118
- raise ValueError(
119
- f"Model {self.model_name} produces {model_dim}-dimensional embeddings, "
120
- f"but database schema requires {EMBEDDING_DIMENSION} dimensions. "
121
- f"Use a model that produces {EMBEDDING_DIMENSION}-dimensional embeddings."
151
+ self._dimension = self._model.get_sentence_embedding_dimension()
152
+ logger.info(f"Embeddings: local provider initialized (dim: {self._dimension})")
153
+
154
+ def _is_xpc_error(self, error: Exception) -> bool:
155
+ """
156
+ Check if an error is an XPC connection error (macOS daemon issue).
157
+
158
+ On macOS, long-running daemons can lose XPC connections to system services
159
+ when the process is idle for extended periods.
160
+ """
161
+ error_str = str(error).lower()
162
+ return "xpc_error_connection_invalid" in error_str or "xpc error" in error_str
163
+
164
+ def _reinitialize_model_sync(self) -> None:
165
+ """
166
+ Clear and reinitialize the embedding model synchronously.
167
+
168
+ This is used to recover from XPC errors on macOS where the
169
+ PyTorch/MPS backend loses its connection to system services.
170
+ """
171
+ logger.warning(f"Reinitializing embedding model {self.model_name} due to backend error")
172
+
173
+ # Clear existing model
174
+ self._model = None
175
+
176
+ # Force garbage collection to free resources
177
+ import gc
178
+
179
+ import torch
180
+
181
+ gc.collect()
182
+
183
+ # If using CUDA/MPS, clear the cache
184
+ if torch.cuda.is_available():
185
+ torch.cuda.empty_cache()
186
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
187
+ try:
188
+ torch.mps.empty_cache()
189
+ except AttributeError:
190
+ pass # Method might not exist in all PyTorch versions
191
+
192
+ # Reinitialize the model (inline version of initialize() but synchronous)
193
+ try:
194
+ from sentence_transformers import SentenceTransformer
195
+ except ImportError:
196
+ raise ImportError(
197
+ "sentence-transformers is required for LocalSTEmbeddings. "
198
+ "Install it with: pip install sentence-transformers"
122
199
  )
123
200
 
124
- logger.info(f"Embeddings: local provider initialized (dim: {model_dim})")
201
+ # Determine device based on hardware availability
202
+ has_gpu = torch.cuda.is_available() or (hasattr(torch.backends, "mps") and torch.backends.mps.is_available())
203
+
204
+ if has_gpu:
205
+ device = None # Let sentence-transformers auto-detect GPU/MPS
206
+ else:
207
+ device = "cpu"
208
+
209
+ self._model = SentenceTransformer(
210
+ self.model_name,
211
+ device=device,
212
+ model_kwargs={"low_cpu_mem_usage": False},
213
+ )
214
+
215
+ logger.info("Embeddings: local provider reinitialized successfully")
125
216
 
126
217
  def encode(self, texts: list[str]) -> list[list[float]]:
127
218
  """
128
- Generate 384-dimensional embeddings for a list of texts.
219
+ Generate embeddings for a list of texts.
220
+
221
+ Automatically recovers from XPC errors on macOS by reinitializing the model.
129
222
 
130
223
  Args:
131
224
  texts: List of text strings to encode
132
225
 
133
226
  Returns:
134
- List of 384-dimensional embedding vectors
227
+ List of embedding vectors
135
228
  """
136
229
  if self._model is None:
137
230
  raise RuntimeError("Embeddings not initialized. Call initialize() first.")
138
- embeddings = self._model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
139
- return [emb.tolist() for emb in embeddings]
231
+
232
+ # Try encoding with automatic recovery from XPC errors
233
+ max_retries = 1
234
+ for attempt in range(max_retries + 1):
235
+ try:
236
+ embeddings = self._model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
237
+ return [emb.tolist() for emb in embeddings]
238
+ except Exception as e:
239
+ # Check if this is an XPC error (macOS daemon issue)
240
+ if self._is_xpc_error(e) and attempt < max_retries:
241
+ logger.warning(f"XPC error detected in embedding generation (attempt {attempt + 1}): {e}")
242
+ try:
243
+ self._reinitialize_model_sync()
244
+ logger.info("Model reinitialized successfully, retrying embedding generation")
245
+ continue
246
+ except Exception as reinit_error:
247
+ logger.error(f"Failed to reinitialize model: {reinit_error}")
248
+ raise Exception(f"Failed to recover from XPC error: {str(e)}")
249
+ else:
250
+ # Not an XPC error or out of retries
251
+ raise
140
252
 
141
253
 
142
254
  class RemoteTEIEmbeddings(Embeddings):
@@ -146,7 +258,7 @@ class RemoteTEIEmbeddings(Embeddings):
146
258
  TEI provides a high-performance inference server for embedding models.
147
259
  See: https://github.com/huggingface/text-embeddings-inference
148
260
 
149
- The server should be running a model that produces 384-dimensional embeddings.
261
+ The embedding dimension is auto-detected from the server at initialization.
150
262
  """
151
263
 
152
264
  def __init__(
@@ -174,11 +286,18 @@ class RemoteTEIEmbeddings(Embeddings):
174
286
  self.retry_delay = retry_delay
175
287
  self._client: httpx.Client | None = None
176
288
  self._model_id: str | None = None
289
+ self._dimension: int | None = None
177
290
 
178
291
  @property
179
292
  def provider_name(self) -> str:
180
293
  return "tei"
181
294
 
295
+ @property
296
+ def dimension(self) -> int:
297
+ if self._dimension is None:
298
+ raise RuntimeError("Embeddings not initialized. Call initialize() first.")
299
+ return self._dimension
300
+
182
301
  def _request_with_retry(self, method: str, url: str, **kwargs) -> httpx.Response:
183
302
  """Make an HTTP request with automatic retries on transient errors."""
184
303
  import time
@@ -229,7 +348,24 @@ class RemoteTEIEmbeddings(Embeddings):
229
348
  response = self._request_with_retry("GET", f"{self.base_url}/info")
230
349
  info = response.json()
231
350
  self._model_id = info.get("model_id", "unknown")
232
- logger.info(f"Embeddings: TEI provider initialized (model: {self._model_id})")
351
+
352
+ # Get dimension from server info or by doing a test embedding
353
+ if "max_input_length" in info and "model_dtype" in info:
354
+ # Try to get dimension from info endpoint (some TEI versions expose it)
355
+ # If not available, do a test embedding
356
+ pass
357
+
358
+ # Do a test embedding to detect dimension
359
+ test_response = self._request_with_retry(
360
+ "POST",
361
+ f"{self.base_url}/embed",
362
+ json={"inputs": ["test"]},
363
+ )
364
+ test_embeddings = test_response.json()
365
+ if test_embeddings and len(test_embeddings) > 0:
366
+ self._dimension = len(test_embeddings[0])
367
+
368
+ logger.info(f"Embeddings: TEI provider initialized (model: {self._model_id}, dim: {self._dimension})")
233
369
  except httpx.HTTPError as e:
234
370
  raise RuntimeError(f"Failed to connect to TEI server at {self.base_url}: {e}")
235
371
 
@@ -269,6 +405,369 @@ class RemoteTEIEmbeddings(Embeddings):
269
405
  return all_embeddings
270
406
 
271
407
 
408
+ class OpenAIEmbeddings(Embeddings):
409
+ """
410
+ OpenAI embeddings implementation using the OpenAI API.
411
+
412
+ Supports text-embedding-3-small (1536 dims), text-embedding-3-large (3072 dims),
413
+ and text-embedding-ada-002 (1536 dims, legacy).
414
+
415
+ The embedding dimension is auto-detected from the model at initialization.
416
+ """
417
+
418
+ # Known dimensions for OpenAI embedding models
419
+ MODEL_DIMENSIONS = {
420
+ "text-embedding-3-small": 1536,
421
+ "text-embedding-3-large": 3072,
422
+ "text-embedding-ada-002": 1536,
423
+ }
424
+
425
+ def __init__(
426
+ self,
427
+ api_key: str,
428
+ model: str = DEFAULT_EMBEDDINGS_OPENAI_MODEL,
429
+ base_url: str | None = None,
430
+ batch_size: int = 100,
431
+ max_retries: int = 3,
432
+ ):
433
+ """
434
+ Initialize OpenAI embeddings client.
435
+
436
+ Args:
437
+ api_key: OpenAI API key
438
+ model: OpenAI embedding model name (default: text-embedding-3-small)
439
+ base_url: Custom base URL for OpenAI-compatible API (e.g., Azure OpenAI endpoint)
440
+ batch_size: Maximum batch size for embedding requests (default: 100)
441
+ max_retries: Maximum number of retries for failed requests (default: 3)
442
+ """
443
+ self.api_key = api_key
444
+ self.model = model
445
+ self.base_url = base_url
446
+ self.batch_size = batch_size
447
+ self.max_retries = max_retries
448
+ self._client = None
449
+ self._dimension: int | None = None
450
+
451
+ @property
452
+ def provider_name(self) -> str:
453
+ return "openai"
454
+
455
+ @property
456
+ def dimension(self) -> int:
457
+ if self._dimension is None:
458
+ raise RuntimeError("Embeddings not initialized. Call initialize() first.")
459
+ return self._dimension
460
+
461
+ async def initialize(self) -> None:
462
+ """Initialize the OpenAI client and detect dimension."""
463
+ if self._client is not None:
464
+ return
465
+
466
+ try:
467
+ from openai import OpenAI
468
+ except ImportError:
469
+ raise ImportError("openai is required for OpenAIEmbeddings. Install it with: pip install openai")
470
+
471
+ base_url_msg = f" at {self.base_url}" if self.base_url else ""
472
+ logger.info(f"Embeddings: initializing OpenAI provider with model {self.model}{base_url_msg}")
473
+
474
+ # Build client kwargs, only including base_url if set (for Azure or custom endpoints)
475
+ client_kwargs = {"api_key": self.api_key, "max_retries": self.max_retries}
476
+ if self.base_url:
477
+ client_kwargs["base_url"] = self.base_url
478
+ self._client = OpenAI(**client_kwargs)
479
+
480
+ # Try to get dimension from known models, otherwise do a test embedding
481
+ if self.model in self.MODEL_DIMENSIONS:
482
+ self._dimension = self.MODEL_DIMENSIONS[self.model]
483
+ else:
484
+ # Do a test embedding to detect dimension
485
+ response = self._client.embeddings.create(
486
+ model=self.model,
487
+ input=["test"],
488
+ )
489
+ if response.data:
490
+ self._dimension = len(response.data[0].embedding)
491
+
492
+ logger.info(f"Embeddings: OpenAI provider initialized (model: {self.model}, dim: {self._dimension})")
493
+
494
+ def encode(self, texts: list[str]) -> list[list[float]]:
495
+ """
496
+ Generate embeddings using the OpenAI API.
497
+
498
+ Args:
499
+ texts: List of text strings to encode
500
+
501
+ Returns:
502
+ List of embedding vectors
503
+ """
504
+ if self._client is None:
505
+ raise RuntimeError("Embeddings not initialized. Call initialize() first.")
506
+
507
+ if not texts:
508
+ return []
509
+
510
+ all_embeddings = []
511
+
512
+ # Process in batches
513
+ for i in range(0, len(texts), self.batch_size):
514
+ batch = texts[i : i + self.batch_size]
515
+
516
+ response = self._client.embeddings.create(
517
+ model=self.model,
518
+ input=batch,
519
+ )
520
+
521
+ # Sort by index to ensure correct order
522
+ batch_embeddings = sorted(response.data, key=lambda x: x.index)
523
+ all_embeddings.extend([e.embedding for e in batch_embeddings])
524
+
525
+ return all_embeddings
526
+
527
+
528
+ class CohereEmbeddings(Embeddings):
529
+ """
530
+ Cohere embeddings implementation using the Cohere API.
531
+
532
+ Supports embed-english-v3.0 (1024 dims) and embed-multilingual-v3.0 (1024 dims).
533
+
534
+ The embedding dimension is auto-detected from the model at initialization.
535
+ """
536
+
537
+ # Known dimensions for Cohere embedding models
538
+ MODEL_DIMENSIONS = {
539
+ "embed-english-v3.0": 1024,
540
+ "embed-multilingual-v3.0": 1024,
541
+ "embed-english-light-v3.0": 384,
542
+ "embed-multilingual-light-v3.0": 384,
543
+ "embed-english-v2.0": 4096,
544
+ "embed-multilingual-v2.0": 768,
545
+ }
546
+
547
+ def __init__(
548
+ self,
549
+ api_key: str,
550
+ model: str = DEFAULT_EMBEDDINGS_COHERE_MODEL,
551
+ base_url: str | None = None,
552
+ batch_size: int = 96,
553
+ timeout: float = 60.0,
554
+ input_type: str = "search_document",
555
+ ):
556
+ """
557
+ Initialize Cohere embeddings client.
558
+
559
+ Args:
560
+ api_key: Cohere API key
561
+ model: Cohere embedding model name (default: embed-english-v3.0)
562
+ base_url: Custom base URL for Cohere-compatible API (e.g., Azure-hosted endpoint)
563
+ batch_size: Maximum batch size for embedding requests (default: 96, Cohere's limit)
564
+ timeout: Request timeout in seconds (default: 60.0)
565
+ input_type: Input type for embeddings (default: search_document).
566
+ Options: search_document, search_query, classification, clustering
567
+ """
568
+ self.api_key = api_key
569
+ self.model = model
570
+ self.base_url = base_url
571
+ self.batch_size = batch_size
572
+ self.timeout = timeout
573
+ self.input_type = input_type
574
+ self._client = None
575
+ self._dimension: int | None = None
576
+
577
+ @property
578
+ def provider_name(self) -> str:
579
+ return "cohere"
580
+
581
+ @property
582
+ def dimension(self) -> int:
583
+ if self._dimension is None:
584
+ raise RuntimeError("Embeddings not initialized. Call initialize() first.")
585
+ return self._dimension
586
+
587
+ async def initialize(self) -> None:
588
+ """Initialize the Cohere client and detect dimension."""
589
+ if self._client is not None:
590
+ return
591
+
592
+ try:
593
+ import cohere
594
+ except ImportError:
595
+ raise ImportError("cohere is required for CohereEmbeddings. Install it with: pip install cohere")
596
+
597
+ base_url_msg = f" at {self.base_url}" if self.base_url else ""
598
+ logger.info(f"Embeddings: initializing Cohere provider with model {self.model}{base_url_msg}")
599
+
600
+ # Build client kwargs, only including base_url if set (for Azure or custom endpoints)
601
+ client_kwargs = {"api_key": self.api_key, "timeout": self.timeout}
602
+ if self.base_url:
603
+ client_kwargs["base_url"] = self.base_url
604
+ self._client = cohere.Client(**client_kwargs)
605
+
606
+ # Try to get dimension from known models, otherwise do a test embedding
607
+ if self.model in self.MODEL_DIMENSIONS:
608
+ self._dimension = self.MODEL_DIMENSIONS[self.model]
609
+ else:
610
+ # Do a test embedding to detect dimension
611
+ response = self._client.embed(
612
+ texts=["test"],
613
+ model=self.model,
614
+ input_type=self.input_type,
615
+ )
616
+ if response.embeddings:
617
+ self._dimension = len(response.embeddings[0])
618
+
619
+ logger.info(f"Embeddings: Cohere provider initialized (model: {self.model}, dim: {self._dimension})")
620
+
621
+ def encode(self, texts: list[str]) -> list[list[float]]:
622
+ """
623
+ Generate embeddings using the Cohere API.
624
+
625
+ Args:
626
+ texts: List of text strings to encode
627
+
628
+ Returns:
629
+ List of embedding vectors
630
+ """
631
+ if self._client is None:
632
+ raise RuntimeError("Embeddings not initialized. Call initialize() first.")
633
+
634
+ if not texts:
635
+ return []
636
+
637
+ all_embeddings = []
638
+
639
+ # Process in batches
640
+ for i in range(0, len(texts), self.batch_size):
641
+ batch = texts[i : i + self.batch_size]
642
+
643
+ response = self._client.embed(
644
+ texts=batch,
645
+ model=self.model,
646
+ input_type=self.input_type,
647
+ )
648
+
649
+ all_embeddings.extend(response.embeddings)
650
+
651
+ return all_embeddings
652
+
653
+
654
+ class LiteLLMEmbeddings(Embeddings):
655
+ """
656
+ LiteLLM embeddings implementation using LiteLLM proxy's /embeddings endpoint.
657
+
658
+ LiteLLM provides a unified interface for multiple embedding providers.
659
+ The proxy exposes an OpenAI-compatible /embeddings endpoint.
660
+ See: https://docs.litellm.ai/docs/embedding/supported_embedding
661
+
662
+ Supported providers via LiteLLM:
663
+ - OpenAI (text-embedding-3-small, text-embedding-ada-002, etc.)
664
+ - Cohere (embed-english-v3.0, etc.) - prefix with cohere/
665
+ - Vertex AI (textembedding-gecko, etc.) - prefix with vertex_ai/
666
+ - HuggingFace, Mistral, Voyage AI, etc.
667
+
668
+ The embedding dimension is auto-detected from the model at initialization.
669
+ """
670
+
671
+ def __init__(
672
+ self,
673
+ api_base: str = DEFAULT_LITELLM_API_BASE,
674
+ api_key: str | None = None,
675
+ model: str = DEFAULT_EMBEDDINGS_LITELLM_MODEL,
676
+ batch_size: int = 100,
677
+ timeout: float = 60.0,
678
+ ):
679
+ """
680
+ Initialize LiteLLM embeddings client.
681
+
682
+ Args:
683
+ api_base: Base URL of the LiteLLM proxy (default: http://localhost:4000)
684
+ api_key: API key for the LiteLLM proxy (optional, depends on proxy config)
685
+ model: Embedding model name (default: text-embedding-3-small)
686
+ Use provider prefix for non-OpenAI models (e.g., cohere/embed-english-v3.0)
687
+ batch_size: Maximum batch size for embedding requests (default: 100)
688
+ timeout: Request timeout in seconds (default: 60.0)
689
+ """
690
+ self.api_base = api_base.rstrip("/")
691
+ self.api_key = api_key
692
+ self.model = model
693
+ self.batch_size = batch_size
694
+ self.timeout = timeout
695
+ self._client: httpx.Client | None = None
696
+ self._dimension: int | None = None
697
+
698
+ @property
699
+ def provider_name(self) -> str:
700
+ return "litellm"
701
+
702
+ @property
703
+ def dimension(self) -> int:
704
+ if self._dimension is None:
705
+ raise RuntimeError("Embeddings not initialized. Call initialize() first.")
706
+ return self._dimension
707
+
708
+ async def initialize(self) -> None:
709
+ """Initialize the HTTP client and detect embedding dimension."""
710
+ if self._client is not None:
711
+ return
712
+
713
+ logger.info(f"Embeddings: initializing LiteLLM provider at {self.api_base} with model {self.model}")
714
+
715
+ headers = {"Content-Type": "application/json"}
716
+ if self.api_key:
717
+ headers["Authorization"] = f"Bearer {self.api_key}"
718
+
719
+ self._client = httpx.Client(timeout=self.timeout, headers=headers)
720
+
721
+ # Do a test embedding to detect dimension
722
+ try:
723
+ response = self._client.post(
724
+ f"{self.api_base}/embeddings",
725
+ json={"model": self.model, "input": ["test"]},
726
+ )
727
+ response.raise_for_status()
728
+ result = response.json()
729
+ if result.get("data") and len(result["data"]) > 0:
730
+ self._dimension = len(result["data"][0]["embedding"])
731
+ logger.info(f"Embeddings: LiteLLM provider initialized (model: {self.model}, dim: {self._dimension})")
732
+ except httpx.HTTPError as e:
733
+ raise RuntimeError(f"Failed to connect to LiteLLM proxy at {self.api_base}: {e}")
734
+
735
+ def encode(self, texts: list[str]) -> list[list[float]]:
736
+ """
737
+ Generate embeddings using the LiteLLM proxy.
738
+
739
+ Args:
740
+ texts: List of text strings to encode
741
+
742
+ Returns:
743
+ List of embedding vectors
744
+ """
745
+ if self._client is None:
746
+ raise RuntimeError("Embeddings not initialized. Call initialize() first.")
747
+
748
+ if not texts:
749
+ return []
750
+
751
+ all_embeddings = []
752
+
753
+ # Process in batches
754
+ for i in range(0, len(texts), self.batch_size):
755
+ batch = texts[i : i + self.batch_size]
756
+
757
+ response = self._client.post(
758
+ f"{self.api_base}/embeddings",
759
+ json={"model": self.model, "input": batch},
760
+ )
761
+ response.raise_for_status()
762
+ result = response.json()
763
+
764
+ # Sort by index to ensure correct order
765
+ batch_embeddings = sorted(result["data"], key=lambda x: x["index"])
766
+ all_embeddings.extend([e["embedding"] for e in batch_embeddings])
767
+
768
+ return all_embeddings
769
+
770
+
272
771
  def create_embeddings_from_env() -> Embeddings:
273
772
  """
274
773
  Create an Embeddings instance based on environment variables.
@@ -289,5 +788,30 @@ def create_embeddings_from_env() -> Embeddings:
289
788
  model = os.environ.get(ENV_EMBEDDINGS_LOCAL_MODEL)
290
789
  model_name = model or DEFAULT_EMBEDDINGS_LOCAL_MODEL
291
790
  return LocalSTEmbeddings(model_name=model_name)
791
+ elif provider == "openai":
792
+ # Use dedicated embeddings API key, or fall back to LLM API key
793
+ api_key = os.environ.get(ENV_EMBEDDINGS_OPENAI_API_KEY) or os.environ.get(ENV_LLM_API_KEY)
794
+ if not api_key:
795
+ raise ValueError(
796
+ f"{ENV_EMBEDDINGS_OPENAI_API_KEY} or {ENV_LLM_API_KEY} is required "
797
+ f"when {ENV_EMBEDDINGS_PROVIDER} is 'openai'"
798
+ )
799
+ model = os.environ.get(ENV_EMBEDDINGS_OPENAI_MODEL, DEFAULT_EMBEDDINGS_OPENAI_MODEL)
800
+ base_url = os.environ.get(ENV_EMBEDDINGS_OPENAI_BASE_URL) or None
801
+ return OpenAIEmbeddings(api_key=api_key, model=model, base_url=base_url)
802
+ elif provider == "cohere":
803
+ api_key = os.environ.get(ENV_COHERE_API_KEY)
804
+ if not api_key:
805
+ raise ValueError(f"{ENV_COHERE_API_KEY} is required when {ENV_EMBEDDINGS_PROVIDER} is 'cohere'")
806
+ model = os.environ.get(ENV_EMBEDDINGS_COHERE_MODEL, DEFAULT_EMBEDDINGS_COHERE_MODEL)
807
+ base_url = os.environ.get(ENV_EMBEDDINGS_COHERE_BASE_URL) or None
808
+ return CohereEmbeddings(api_key=api_key, model=model, base_url=base_url)
809
+ elif provider == "litellm":
810
+ api_base = os.environ.get(ENV_LITELLM_API_BASE, DEFAULT_LITELLM_API_BASE)
811
+ api_key = os.environ.get(ENV_LITELLM_API_KEY)
812
+ model = os.environ.get(ENV_EMBEDDINGS_LITELLM_MODEL, DEFAULT_EMBEDDINGS_LITELLM_MODEL)
813
+ return LiteLLMEmbeddings(api_base=api_base, api_key=api_key, model=model)
292
814
  else:
293
- raise ValueError(f"Unknown embeddings provider: {provider}. Supported: 'local', 'tei'")
815
+ raise ValueError(
816
+ f"Unknown embeddings provider: {provider}. Supported: 'local', 'tei', 'openai', 'cohere', 'litellm'"
817
+ )