rakam_systems_vectorstore-0.1.1rc7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. rakam_systems_vectorstore/MANIFEST.in +26 -0
  2. rakam_systems_vectorstore/README.md +1071 -0
  3. rakam_systems_vectorstore/__init__.py +93 -0
  4. rakam_systems_vectorstore/components/__init__.py +0 -0
  5. rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
  6. rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
  7. rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
  8. rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
  9. rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
  10. rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
  11. rakam_systems_vectorstore/components/loader/__init__.py +31 -0
  12. rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
  13. rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
  14. rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
  15. rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
  16. rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
  17. rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
  18. rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
  19. rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
  20. rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
  21. rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
  22. rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
  23. rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
  24. rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
  25. rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
  26. rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
  27. rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
  28. rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
  29. rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
  30. rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
  31. rakam_systems_vectorstore/config.py +266 -0
  32. rakam_systems_vectorstore/core.py +8 -0
  33. rakam_systems_vectorstore/pyproject.toml +113 -0
  34. rakam_systems_vectorstore/server/README.md +290 -0
  35. rakam_systems_vectorstore/server/__init__.py +20 -0
  36. rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
  37. rakam_systems_vectorstore/setup.py +103 -0
  38. rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
  39. rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
  40. rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
rakam_systems_vectorstore/components/chunker/text_chunker.py
@@ -0,0 +1,154 @@
+ """
+ Text chunking utilities for splitting text into smaller pieces with overlap.
+ """
+
+ from __future__ import annotations
+
+ from typing import Any, List
+
+ from rakam_systems_core.ai_utils import logging
+ from rakam_systems_core.ai_core.interfaces.chunker import Chunker
+
+ try:
+     from chonkie import SentenceChunker
+     CHONKIE_AVAILABLE = True
+ except ImportError:
+     CHONKIE_AVAILABLE = False
+
+ logger = logging.getLogger(__name__)
+
+
+ class TextChunker(Chunker):
+     """
+     Text chunker that splits text into smaller pieces with overlap.
+
+     This chunker uses Chonkie's SentenceChunker for sentence-based chunking
+     with token-aware splitting and configurable overlap.
+     """
+
+     def __init__(
+         self,
+         chunk_size: int = 512,
+         chunk_overlap: int = 50,
+         min_sentences_per_chunk: int = 1,
+         tokenizer: str = "character",
+         name: str = "text_chunker"
+     ):
+         """
+         Initialize text chunker.
+
+         Args:
+             chunk_size: Size of text chunks in tokens
+             chunk_overlap: Overlap between chunks in tokens
+             min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
+             tokenizer: Tokenizer to use - "character", "gpt2", or any HuggingFace tokenizer (default: "character")
+             name: Component name
+         """
+         super().__init__(name=name)
+         self._chunk_size = chunk_size
+         self._chunk_overlap = chunk_overlap
+         self._min_sentences_per_chunk = min_sentences_per_chunk
+         self._tokenizer = tokenizer
+
+     def run(self, documents: List[str]) -> List[str]:
+         """
+         Chunk a list of documents into smaller text pieces.
+
+         Args:
+             documents: List of text documents to chunk
+
+         Returns:
+             List of text chunks (just the text content)
+         """
+         all_chunks = []
+         for doc_idx, document in enumerate(documents):
+             chunk_results = self.chunk_text(document, context=f"doc_{doc_idx}")
+             # Extract just the text from the chunk dictionaries
+             chunks = [chunk_info["text"] for chunk_info in chunk_results]
+             all_chunks.extend(chunks)
+         return all_chunks
+
+     def chunk_text(self, text: str, context: str = "") -> List[dict[str, Any]]:
+         """
+         Chunk text into smaller pieces with overlap using Chonkie's SentenceChunker.
+
+         This method uses sentence-based chunking with configurable token limits and overlap,
+         providing more intelligent chunking than simple character-based splitting.
+
+         Args:
+             text: Text to chunk
+             context: Context label for logging (optional)
+
+         Returns:
+             List of dictionaries with chunk information:
+             - text: The chunk text
+             - token_count: Number of tokens in the chunk
+             - start_index: Starting character index in original text
+             - end_index: Ending character index in original text
+
+         Raises:
+             ImportError: If chonkie is not installed
+         """
+         if not text or not text.strip():
+             return []
+
+         if not CHONKIE_AVAILABLE:
+             raise ImportError(
+                 "chonkie is not installed. Please install it with: "
+                 "pip install chonkie==1.4.2"
+             )
+
+         # Initialize the Chonkie SentenceChunker
+         chonkie_chunker = SentenceChunker(
+             tokenizer=self._tokenizer,
+             chunk_size=self._chunk_size,
+             chunk_overlap=self._chunk_overlap,
+             min_sentences_per_chunk=self._min_sentences_per_chunk,
+         )
+
+         # Chunk the text
+         chunks = chonkie_chunker(text)
+
+         # Convert Chonkie chunks to our format
+         result = []
+         for chunk in chunks:
+             chunk_info = {
+                 "text": chunk.text,
+                 "token_count": chunk.token_count,
+                 "start_index": chunk.start_index,
+                 "end_index": chunk.end_index,
+             }
+             result.append(chunk_info)
+
+         logger.debug(
+             f"Chunked {context}: {len(text)} chars -> {len(result)} chunks")
+         return result
+
+
+ def create_text_chunker(
+     chunk_size: int = 512,
+     chunk_overlap: int = 50,
+     min_sentences_per_chunk: int = 1,
+     tokenizer: str = "character"
+ ) -> TextChunker:
+     """
+     Factory function to create a text chunker.
+
+     Args:
+         chunk_size: Size of text chunks in tokens
+         chunk_overlap: Overlap between chunks in tokens
+         min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
+         tokenizer: Tokenizer to use - "character", "gpt2", or any HuggingFace tokenizer (default: "character")
+
+     Returns:
+         Configured text chunker
+     """
+     return TextChunker(
+         chunk_size=chunk_size,
+         chunk_overlap=chunk_overlap,
+         min_sentences_per_chunk=min_sentences_per_chunk,
+         tokenizer=tokenizer
+     )
+
+
+ __all__ = ["TextChunker", "create_text_chunker"]
rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py
@@ -0,0 +1,546 @@
+ """
+ Configurable embedding model with support for multiple backends.
+
+ Supports:
+ - Sentence Transformers (local models)
+ - OpenAI API
+ - Cohere API
+ - Custom embedding providers
+ """
+
+ from __future__ import annotations
+
+ import os
+ import time
+ from functools import lru_cache
+ from typing import List, Optional, Union
+
+ import numpy as np
+
+ from rakam_systems_core.ai_utils import logging
+ from rakam_systems_core.ai_core.interfaces.embedding_model import EmbeddingModel
+ from rakam_systems_vectorstore.config import EmbeddingConfig
+
+ logger = logging.getLogger(__name__)
+
+
+ class ConfigurableEmbeddings(EmbeddingModel):
+     """
+     Configurable embedding model that supports multiple backends.
+
+     This component automatically selects the appropriate embedding backend
+     based on configuration and provides a unified interface.
+     """
+
+     def __init__(
+         self,
+         name: str = "configurable_embeddings",
+         config: Optional[Union[EmbeddingConfig, dict]] = None
+     ):
+         """
+         Initialize configurable embeddings.
+
+         Args:
+             name: Component name
+             config: EmbeddingConfig or dict with embedding settings
+         """
+         # Parse config first
+         if isinstance(config, dict):
+             self.embedding_config = EmbeddingConfig(**config)
+             config_dict = config
+         elif isinstance(config, EmbeddingConfig):
+             self.embedding_config = config
+             # Convert EmbeddingConfig to dict for parent class
+             from dataclasses import asdict
+             config_dict = asdict(config)
+         else:
+             self.embedding_config = EmbeddingConfig()
+             config_dict = None
+
+         # Pass dict to parent class
+         super().__init__(name=name, config=config_dict)
+
+         self.model_type = self.embedding_config.model_type
+         self.model_name = self.embedding_config.model_name
+         self.batch_size = self.embedding_config.batch_size
+         self.normalize = self.embedding_config.normalize
+
+         # Backend-specific attributes
+         self._model = None
+         self._client = None
+         self._embedding_dim = None
+
+     def setup(self) -> None:
+         """Initialize the embedding backend."""
+         # Skip if already initialized to avoid reloading the model
+         if self.initialized:
+             logger.debug(
+                 f"Embedding model {self.model_name} already initialized, skipping setup")
+             return
+
+         logger.info(
+             f"Setting up {self.model_type} embedding model: {self.model_name}")
+
+         if self.model_type == "sentence_transformer":
+             self._setup_sentence_transformer()
+         elif self.model_type == "openai":
+             pass  # OpenAI client is created on-demand in _encode_openai
+         elif self.model_type == "cohere":
+             self._setup_cohere()
+         else:
+             raise ValueError(f"Unsupported model type: {self.model_type}")
+
+         # Detect embedding dimension
+         if self.embedding_config.dimensions:
+             self._embedding_dim = self.embedding_config.dimensions
+         else:
+             self._embedding_dim = self._detect_embedding_dimension()
+
+         logger.info(
+             f"Embedding model initialized with dimension: {self._embedding_dim}")
+         super().setup()
+
+     def _setup_sentence_transformer(self) -> None:
+         """Setup Sentence Transformer model."""
+         try:
+             from sentence_transformers import SentenceTransformer
+         except ImportError:
+             raise ImportError(
+                 "sentence-transformers is required for sentence_transformer model type. "
+                 "Install it with: pip install sentence-transformers"
+             )
+
+         # Authenticate with Hugging Face if token is available
+         hf_token = os.getenv("HF_TOKEN")
+         if hf_token:
+             try:
+                 from huggingface_hub import login
+                 login(token=hf_token)
+                 logger.info("Successfully authenticated with Hugging Face")
+             except ImportError:
+                 logger.warning(
+                     "huggingface-hub is not installed. "
+                     "Install it with: pip install huggingface-hub"
+                 )
+             except Exception as e:
+                 logger.warning(
+                     f"Failed to authenticate with Hugging Face: {e}")
+         else:
+             logger.debug(
+                 "No HF_TOKEN found in environment, skipping Hugging Face authentication")
+
+         self._model = SentenceTransformer(
+             self.model_name, trust_remote_code=True)
+         logger.info(f"Loaded SentenceTransformer model: {self.model_name}")
+
+     def _setup_cohere(self) -> None:
+         """Setup Cohere API client."""
+         try:
+             import cohere
+         except ImportError:
+             raise ImportError(
+                 "cohere is required for cohere model type. "
+                 "Install it with: pip install cohere"
+             )
+
+         api_key = self.embedding_config.api_key or os.getenv("COHERE_API_KEY")
+         if not api_key:
+             raise ValueError(
+                 "Cohere API key not found. Set COHERE_API_KEY environment variable "
+                 "or provide it in config."
+             )
+
+         self._client = cohere.Client(api_key)
+         logger.info(f"Initialized Cohere client with model: {self.model_name}")
+
+     def _detect_embedding_dimension(self) -> int:
+         """Detect embedding dimension by encoding a sample text."""
+         sample_embedding = self._encode_batch(
+             ["sample text for dimension detection"])[0]
+         return len(sample_embedding)
+
+     def _encode_batch(self, texts: List[str]) -> List[List[float]]:
+         """
+         Encode a batch of texts using the configured backend.
+
+         Args:
+             texts: List of texts to encode
+
+         Returns:
+             List of embedding vectors
+         """
+         if self.model_type == "sentence_transformer":
+             return self._encode_sentence_transformer(texts)
+         elif self.model_type == "openai":
+             return self._encode_openai(texts)
+         elif self.model_type == "cohere":
+             return self._encode_cohere(texts)
+         else:
+             raise ValueError(f"Unsupported model type: {self.model_type}")
+
+     def _encode_sentence_transformer(self, texts: List[str]) -> List[List[float]]:
+         """Encode texts using Sentence Transformer."""
+         import gc
+
+         # CRITICAL: Disable tokenizer parallelism to prevent deadlocks in Docker/multiprocessing environments
+         # This is a known issue with HuggingFace tokenizers in containerized environments
+         os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+         logger.info(
+             f"_encode_sentence_transformer() called with {len(texts)} texts")
+
+         # Ensure all texts are strings (sentence_transformers v3.x compatibility)
+         sanitized_texts = [str(t) if not isinstance(
+             t, str) else t for t in texts]
+
+         total_texts = len(sanitized_texts)
+         num_batches = (total_texts + self.batch_size - 1) // self.batch_size
+
+         logger.info(
+             f"Total texts: {total_texts}, Batch size: {self.batch_size}, Num batches: {num_batches}")
+
+         if total_texts > 1000:
+             logger.info(
+                 f"Encoding {total_texts} texts with sentence transformer (batch_size={self.batch_size}, {num_batches} batches)...")
+
+         # For large datasets, encode in chunks with progress logging
+         if total_texts > 10000:
+             all_embeddings = []
+             # Log 1000 times during encoding
+             log_interval = max(1, num_batches // 1000)
+             batch_start_time = time.time()
+
+             logger.info(
+                 f"Large dataset: processing {total_texts} texts in {num_batches} batches (logging every {log_interval} batches)")
+
+             for i in range(0, total_texts, self.batch_size):
+                 batch_num = i // self.batch_size + 1
+                 batch_texts = sanitized_texts[i:i + self.batch_size]
+
+                 # Log before encoding starts (for first batch and every log_interval)
+                 if batch_num == 1 or batch_num % log_interval == 0 or batch_num == num_batches:
+                     logger.info(
+                         f"Starting batch {batch_num}/{num_batches}: encoding {len(batch_texts)} texts...")
+                     batch_encode_start = time.time()
+
+                 batch_embeddings = self._model.encode(
+                     batch_texts,
+                     batch_size=self.batch_size,
+                     show_progress_bar=False,  # Disabled to prevent blocking in Docker/non-TTY environments
+                     convert_to_tensor=False,
+                     normalize_embeddings=False
+                 )
+
+                 if batch_num == 1 or batch_num % log_interval == 0 or batch_num == num_batches:
+                     batch_encode_time = time.time() - batch_encode_start
+                     logger.info(
+                         f"Batch {batch_num}/{num_batches} encoding completed in {batch_encode_time:.2f}s")
+
+                 all_embeddings.append(batch_embeddings)
+
+                 # MEMORY OPTIMIZATION: Clear batch_texts reference
+                 del batch_texts
+
+                 if batch_num % log_interval == 0 or batch_num == num_batches:
+                     progress_pct = (batch_num / num_batches) * 100
+                     elapsed = time.time() - batch_start_time
+                     texts_processed = min(i + self.batch_size, total_texts)
+                     rate = texts_processed / elapsed if elapsed > 0 else 0
+                     eta_seconds = (total_texts - texts_processed) / \
+                         rate if rate > 0 else 0
+                     logger.info(
+                         f"Embedding progress: {batch_num}/{num_batches} batches ({progress_pct:.1f}%) - {texts_processed}/{total_texts} texts - {rate:.1f} texts/s - ETA: {eta_seconds:.0f}s")
+
+             embeddings = np.vstack(all_embeddings)
+             # MEMORY OPTIMIZATION: Clear intermediate arrays
+             del all_embeddings
+         elif num_batches > 1:
+             # For medium datasets (multiple batches but <= 10000 texts), also log progress
+             all_embeddings = []
+             batch_start_time = time.time()
+             logger.info(
+                 f"Processing {total_texts} texts in {num_batches} batches...")
+
+             for i in range(0, total_texts, self.batch_size):
+                 batch_num = i // self.batch_size + 1
+                 batch_texts = sanitized_texts[i:i + self.batch_size]
+
+                 # Log before encoding starts
+                 logger.info(
+                     f"Starting batch {batch_num}/{num_batches}: encoding {len(batch_texts)} texts...")
+                 batch_encode_start = time.time()
+
+                 batch_embeddings = self._model.encode(
+                     batch_texts,
+                     batch_size=self.batch_size,
+                     show_progress_bar=False,  # Disabled to prevent blocking in Docker/non-TTY environments
+                     convert_to_tensor=False,
+                     normalize_embeddings=False
+                 )
+
+                 batch_encode_time = time.time() - batch_encode_start
+                 logger.info(
+                     f"Batch {batch_num}/{num_batches} encoding completed in {batch_encode_time:.2f}s")
+
+                 all_embeddings.append(batch_embeddings)
+
+                 # MEMORY OPTIMIZATION: Clear batch_texts reference
+                 del batch_texts
+
+                 # Log progress for each batch
+                 progress_pct = (batch_num / num_batches) * 100
+                 elapsed = time.time() - batch_start_time
+                 texts_processed = min(i + self.batch_size, total_texts)
+                 rate = texts_processed / elapsed if elapsed > 0 else 0
+                 eta_seconds = (total_texts - texts_processed) / \
+                     rate if rate > 0 else 0
+                 logger.info(
+                     f"Embedding progress: batch {batch_num}/{num_batches} ({progress_pct:.1f}%) - {texts_processed}/{total_texts} texts - {rate:.1f} texts/s - ETA: {eta_seconds:.0f}s")
+
+             embeddings = np.vstack(all_embeddings)
+             # MEMORY OPTIMIZATION: Clear intermediate arrays
+             del all_embeddings
+         else:
+             # Single batch - but still process in smaller chunks to show progress
+             logger.info(
+                 f"Processing {total_texts} texts (will process in mini-batches of {self.batch_size})...")
+             all_embeddings = []
+             batch_start_time = time.time()
+
+             # Process in mini-batches even for "single batch" to show progress
+             # Use smaller batches for better progress visibility
+             mini_batch_size = min(self.batch_size, 32)
+             num_mini_batches = (
+                 total_texts + mini_batch_size - 1) // mini_batch_size
+
+             logger.info(
+                 f"Will process {total_texts} texts in {num_mini_batches} mini-batches of size {mini_batch_size}")
+
+             for i in range(0, total_texts, mini_batch_size):
+                 batch_num = i // mini_batch_size + 1
+                 batch_texts = sanitized_texts[i:i + mini_batch_size]
+
+                 # Log before encoding starts
+                 logger.info(
+                     f"Starting mini-batch {batch_num}/{num_mini_batches}: encoding {len(batch_texts)} texts...")
+                 batch_encode_start = time.time()
+
+                 batch_embeddings = self._model.encode(
+                     batch_texts,
+                     batch_size=mini_batch_size,
+                     show_progress_bar=False,  # Disabled to prevent blocking in Docker/non-TTY environments
+                     convert_to_tensor=False,
+                     normalize_embeddings=False
+                 )
+
+                 batch_encode_time = time.time() - batch_encode_start
+                 logger.info(
+                     f"Mini-batch {batch_num}/{num_mini_batches} encoding completed in {batch_encode_time:.2f}s")
+
+                 all_embeddings.append(batch_embeddings)
+
+                 # MEMORY OPTIMIZATION: Clear batch_texts reference
+                 del batch_texts
+
+                 # Log progress for each mini-batch
+                 progress_pct = (batch_num / num_mini_batches) * 100
+                 elapsed = time.time() - batch_start_time
+                 texts_processed = min(i + mini_batch_size, total_texts)
+                 rate = texts_processed / elapsed if elapsed > 0 else 0
+                 eta_seconds = (total_texts - texts_processed) / \
+                     rate if rate > 0 else 0
+                 logger.info(
+                     f"Embedding progress: mini-batch {batch_num}/{num_mini_batches} ({progress_pct:.1f}%) - {texts_processed}/{total_texts} texts - {rate:.1f} texts/s - ETA: {eta_seconds:.0f}s")
+
+             embeddings = np.vstack(all_embeddings)
+             # MEMORY OPTIMIZATION: Clear intermediate arrays
+             del all_embeddings
+
+         # MEMORY OPTIMIZATION: Clear sanitized_texts as no longer needed
+         del sanitized_texts
+
+         # Force garbage collection after encoding to prevent memory buildup
+         # This is especially important for long-running batch processes
+         gc.collect()
+         try:
+             import torch
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+         except ImportError:
+             pass
+
+         # Always log completion for visibility
+         logger.info(f"✓ Encoding completed for {total_texts} texts")
+
+         # Convert to list of lists and release numpy array
+         if isinstance(embeddings, np.ndarray):
+             result = embeddings.tolist()
+             del embeddings  # Release the numpy array
+             gc.collect()
+             return result
+
+         return embeddings
+
+     def _encode_openai(self, texts: List[str]) -> List[List[float]]:
+         """Encode texts using OpenAI API."""
+         from rakam_systems_vectorstore.components.embedding_model.openai_embeddings import OpenAIEmbeddings
+
+         # Use the OpenAIEmbeddings implementation with configured batch_size
+         openai_embeddings = OpenAIEmbeddings(
+             model=self.model_name,
+             api_key=self.embedding_config.api_key,
+             batch_size=self.batch_size  # Pass the batch_size from ConfigurableEmbeddings
+         )
+         logger.info(f"OpenAI embeddings using batch_size={self.batch_size}")
+         return openai_embeddings.run(texts)
+
+     def _encode_cohere(self, texts: List[str]) -> List[List[float]]:
+         """Encode texts using Cohere API."""
+         all_embeddings = []
+
+         # Process in batches
+         for i in range(0, len(texts), self.batch_size):
+             batch = texts[i:i + self.batch_size]
+
+             try:
+                 response = self._client.embed(
+                     texts=batch,
+                     model=self.model_name,
+                     input_type="search_document"
+                 )
+                 all_embeddings.extend(response.embeddings)
+             except Exception as e:
+                 logger.error(f"Error encoding batch with Cohere: {e}")
+                 raise
+
+         return all_embeddings
+
+     def _normalize_embeddings(self, embeddings: List[List[float]]) -> List[List[float]]:
+         """Normalize embeddings to unit length."""
+         embeddings_array = np.array(embeddings, dtype=np.float32)
+         norms = np.linalg.norm(embeddings_array, axis=1, keepdims=True)
+         norms[norms == 0] = 1  # Avoid division by zero
+         normalized = embeddings_array / norms
+         return normalized.tolist()
+
+     def run(self, texts: List[str]) -> List[List[float]]:
+         """
+         Encode texts into embeddings.
+
+         Args:
+             texts: List of texts to encode
+
+         Returns:
+             List of embedding vectors
+         """
+         if not texts:
+             return []
+
+         start_time = time.time()
+         logger.info(
+             f"run() called: Encoding {len(texts)} texts with {self.model_type} model '{self.model_name}'")
+
+         # Encode texts
+         logger.info(f"Calling _encode_batch() for {len(texts)} texts...")
+         embeddings = self._encode_batch(texts)
+         logger.info(f"_encode_batch() returned {len(embeddings)} embeddings")
+
+         # Normalize if configured
+         if self.normalize:
+             logger.info(f"Normalizing {len(embeddings)} embeddings...")
+             embeddings = self._normalize_embeddings(embeddings)
+             logger.info("Normalization complete")
+
+         elapsed = time.time() - start_time
+         logger.info(
+             f"run() completed: Encoded {len(texts)} texts in {elapsed:.2f}s ({len(texts)/elapsed:.1f} texts/s)")
+
+         return embeddings
+
+     def encode_query(self, query: str) -> List[float]:
+         """
+         Encode a single query text.
+
+         Args:
+             query: Query text to encode
+
+         Returns:
+             Embedding vector
+         """
+         embeddings = self.run([query])
+         return embeddings[0] if embeddings else []
+
+     def encode_documents(self, documents: List[str]) -> List[List[float]]:
+         """
+         Encode multiple documents.
+
+         Args:
+             documents: List of documents to encode
+
+         Returns:
+             List of embedding vectors
+         """
+         return self.run(documents)
+
+     @property
+     def embedding_dimension(self) -> int:
+         """Get the embedding dimension."""
+         if not self.initialized:
+             self.setup()
+         return self._embedding_dim
+
+     def shutdown(self) -> None:
+         """Clean up resources."""
+         logger.info(f"Shutting down {self.model_type} embedding model")
+
+         if self.model_type == "sentence_transformer" and self._model:
+             # Clean up CUDA memory if using GPU
+             try:
+                 import torch
+                 if torch.cuda.is_available():
+                     torch.cuda.empty_cache()
+             except ImportError:
+                 pass
+
+         self._model = None
+         self._client = None
+         super().shutdown()
+
+
+ # Convenience factory function
+ def create_embedding_model(
+     model_type: str = "sentence_transformer",
+     model_name: Optional[str] = None,
+     **kwargs
+ ) -> ConfigurableEmbeddings:
+     """
+     Factory function to create an embedding model.
+
+     Args:
+         model_type: Type of model (sentence_transformer, openai, cohere)
+         model_name: Model name/identifier
+         **kwargs: Additional configuration parameters
+
+     Returns:
+         Configured embedding model
+     """
+     config = EmbeddingConfig(
+         model_type=model_type,
+         model_name=model_name or _get_default_model_name(model_type),
+         **kwargs
+     )
+
+     return ConfigurableEmbeddings(config=config)
+
+
+ def _get_default_model_name(model_type: str) -> str:
+     """Get default model name for a given type."""
+     defaults = {
+         "sentence_transformer": "Snowflake/snowflake-arctic-embed-m",
+         "openai": "text-embedding-3-small",
+         "cohere": "embed-english-v3.0"
+     }
+     return defaults.get(model_type, "")
+
+
+ __all__ = ["ConfigurableEmbeddings", "create_embedding_model"]