rakam-systems-vectorstore 0.1.1rc7 (rakam_systems_vectorstore-0.1.1rc7-py3-none-any.whl)
This diff represents the content of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rakam_systems_vectorstore/MANIFEST.in +26 -0
- rakam_systems_vectorstore/README.md +1071 -0
- rakam_systems_vectorstore/__init__.py +93 -0
- rakam_systems_vectorstore/components/__init__.py +0 -0
- rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
- rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
- rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
- rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
- rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
- rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
- rakam_systems_vectorstore/components/loader/__init__.py +31 -0
- rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
- rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
- rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
- rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
- rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
- rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
- rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
- rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
- rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
- rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
- rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
- rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
- rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
- rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
- rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
- rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
- rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
- rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
- rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
- rakam_systems_vectorstore/config.py +266 -0
- rakam_systems_vectorstore/core.py +8 -0
- rakam_systems_vectorstore/pyproject.toml +113 -0
- rakam_systems_vectorstore/server/README.md +290 -0
- rakam_systems_vectorstore/server/__init__.py +20 -0
- rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
- rakam_systems_vectorstore/setup.py +103 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
rakam_systems_vectorstore/components/chunker/text_chunker.py
@@ -0,0 +1,154 @@
"""
Text chunking utilities for splitting text into smaller pieces with overlap.
"""

from __future__ import annotations

from typing import Any, List

from rakam_systems_core.ai_utils import logging
from rakam_systems_core.ai_core.interfaces.chunker import Chunker

try:
    from chonkie import SentenceChunker
    CHONKIE_AVAILABLE = True
except ImportError:
    CHONKIE_AVAILABLE = False

logger = logging.getLogger(__name__)


class TextChunker(Chunker):
    """
    Text chunker that splits text into smaller pieces with overlap.

    This chunker uses Chonkie's SentenceChunker for sentence-based chunking
    with token-aware splitting and configurable overlap.
    """

    def __init__(
        self,
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        min_sentences_per_chunk: int = 1,
        tokenizer: str = "character",
        name: str = "text_chunker"
    ):
        """
        Initialize text chunker.

        Args:
            chunk_size: Size of text chunks in tokens
            chunk_overlap: Overlap between chunks in tokens
            min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
            tokenizer: Tokenizer to use - "character", "gpt2", or any HuggingFace tokenizer (default: "character")
            name: Component name
        """
        super().__init__(name=name)
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap
        self._min_sentences_per_chunk = min_sentences_per_chunk
        self._tokenizer = tokenizer

    def run(self, documents: List[str]) -> List[str]:
        """
        Chunk a list of documents into smaller text pieces.

        Args:
            documents: List of text documents to chunk

        Returns:
            List of text chunks (just the text content)
        """
        all_chunks = []
        for doc_idx, document in enumerate(documents):
            chunk_results = self.chunk_text(document, context=f"doc_{doc_idx}")
            # Extract just the text from the chunk dictionaries
            chunks = [chunk_info["text"] for chunk_info in chunk_results]
            all_chunks.extend(chunks)
        return all_chunks

    def chunk_text(self, text: str, context: str = "") -> List[dict[str, Any]]:
        """
        Chunk text into smaller pieces with overlap using Chonkie's SentenceChunker.

        This method uses sentence-based chunking with configurable token limits and overlap,
        providing more intelligent chunking than simple character-based splitting.

        Args:
            text: Text to chunk
            context: Context label for logging (optional)

        Returns:
            List of dictionaries with chunk information:
            - text: The chunk text
            - token_count: Number of tokens in the chunk
            - start_index: Starting character index in original text
            - end_index: Ending character index in original text

        Raises:
            ImportError: If chonkie is not installed
        """
        if not text or not text.strip():
            return []

        if not CHONKIE_AVAILABLE:
            raise ImportError(
                "chonkie is not installed. Please install it with: "
                "pip install chonkie==1.4.2"
            )

        # Initialize the Chonkie SentenceChunker
        chonkie_chunker = SentenceChunker(
            tokenizer=self._tokenizer,
            chunk_size=self._chunk_size,
            chunk_overlap=self._chunk_overlap,
            min_sentences_per_chunk=self._min_sentences_per_chunk,
        )

        # Chunk the text
        chunks = chonkie_chunker(text)

        # Convert Chonkie chunks to our format
        result = []
        for chunk in chunks:
            chunk_info = {
                "text": chunk.text,
                "token_count": chunk.token_count,
                "start_index": chunk.start_index,
                "end_index": chunk.end_index,
            }
            result.append(chunk_info)

        logger.debug(
            f"Chunked {context}: {len(text)} chars -> {len(result)} chunks")
        return result


def create_text_chunker(
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    min_sentences_per_chunk: int = 1,
    tokenizer: str = "character"
) -> TextChunker:
    """
    Factory function to create a text chunker.

    Args:
        chunk_size: Size of text chunks in tokens
        chunk_overlap: Overlap between chunks in tokens
        min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
        tokenizer: Tokenizer to use - "character", "gpt2", or any HuggingFace tokenizer (default: "character")

    Returns:
        Configured text chunker
    """
    return TextChunker(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        min_sentences_per_chunk=min_sentences_per_chunk,
        tokenizer=tokenizer
    )


__all__ = ["TextChunker", "create_text_chunker"]
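For orientation, a minimal usage sketch of the TextChunker added above. This is not part of the package diff; it assumes the wheel and the optional chonkie dependency are installed, and that the import path follows the file layout listed at the top.

# Illustrative usage sketch only (assumes `pip install chonkie==1.4.2`).
from rakam_systems_vectorstore.components.chunker.text_chunker import create_text_chunker

chunker = create_text_chunker(chunk_size=256, chunk_overlap=25)

# run() flattens a list of documents into plain chunk strings.
chunks = chunker.run(["First document text.", "Second document text."])

# chunk_text() keeps token counts and character offsets per chunk.
for info in chunker.chunk_text("A longer piece of text to split.", context="example"):
    print(info["token_count"], info["start_index"], info["end_index"])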
rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py
@@ -0,0 +1,546 @@
"""
Configurable embedding model with support for multiple backends.

Supports:
- Sentence Transformers (local models)
- OpenAI API
- Cohere API
- Custom embedding providers
"""

from __future__ import annotations

import os
import time
from functools import lru_cache
from typing import List, Optional, Union

import numpy as np

from rakam_systems_core.ai_utils import logging
from rakam_systems_core.ai_core.interfaces.embedding_model import EmbeddingModel
from rakam_systems_vectorstore.config import EmbeddingConfig

logger = logging.getLogger(__name__)


class ConfigurableEmbeddings(EmbeddingModel):
    """
    Configurable embedding model that supports multiple backends.

    This component automatically selects the appropriate embedding backend
    based on configuration and provides a unified interface.
    """

    def __init__(
        self,
        name: str = "configurable_embeddings",
        config: Optional[Union[EmbeddingConfig, dict]] = None
    ):
        """
        Initialize configurable embeddings.

        Args:
            name: Component name
            config: EmbeddingConfig or dict with embedding settings
        """
        # Parse config first
        if isinstance(config, dict):
            self.embedding_config = EmbeddingConfig(**config)
            config_dict = config
        elif isinstance(config, EmbeddingConfig):
            self.embedding_config = config
            # Convert EmbeddingConfig to dict for parent class
            from dataclasses import asdict
            config_dict = asdict(config)
        else:
            self.embedding_config = EmbeddingConfig()
            config_dict = None

        # Pass dict to parent class
        super().__init__(name=name, config=config_dict)

        self.model_type = self.embedding_config.model_type
        self.model_name = self.embedding_config.model_name
        self.batch_size = self.embedding_config.batch_size
        self.normalize = self.embedding_config.normalize

        # Backend-specific attributes
        self._model = None
        self._client = None
        self._embedding_dim = None

    def setup(self) -> None:
        """Initialize the embedding backend."""
        # Skip if already initialized to avoid reloading the model
        if self.initialized:
            logger.debug(
                f"Embedding model {self.model_name} already initialized, skipping setup")
            return

        logger.info(
            f"Setting up {self.model_type} embedding model: {self.model_name}")

        if self.model_type == "sentence_transformer":
            self._setup_sentence_transformer()
        elif self.model_type == "openai":
            pass  # OpenAI client is created on-demand in _encode_openai
        elif self.model_type == "cohere":
            self._setup_cohere()
        else:
            raise ValueError(f"Unsupported model type: {self.model_type}")

        # Detect embedding dimension
        if self.embedding_config.dimensions:
            self._embedding_dim = self.embedding_config.dimensions
        else:
            self._embedding_dim = self._detect_embedding_dimension()

        logger.info(
            f"Embedding model initialized with dimension: {self._embedding_dim}")
        super().setup()

    def _setup_sentence_transformer(self) -> None:
        """Setup Sentence Transformer model."""
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError:
            raise ImportError(
                "sentence-transformers is required for sentence_transformer model type. "
                "Install it with: pip install sentence-transformers"
            )

        # Authenticate with Hugging Face if token is available
        hf_token = os.getenv("HF_TOKEN")
        if hf_token:
            try:
                from huggingface_hub import login
                login(token=hf_token)
                logger.info("Successfully authenticated with Hugging Face")
            except ImportError:
                logger.warning(
                    "huggingface-hub is not installed. "
                    "Install it with: pip install huggingface-hub"
                )
            except Exception as e:
                logger.warning(
                    f"Failed to authenticate with Hugging Face: {e}")
        else:
            logger.debug(
                "No HF_TOKEN found in environment, skipping Hugging Face authentication")

        self._model = SentenceTransformer(
            self.model_name, trust_remote_code=True)
        logger.info(f"Loaded SentenceTransformer model: {self.model_name}")

    def _setup_cohere(self) -> None:
        """Setup Cohere API client."""
        try:
            import cohere
        except ImportError:
            raise ImportError(
                "cohere is required for cohere model type. "
                "Install it with: pip install cohere"
            )

        api_key = self.embedding_config.api_key or os.getenv("COHERE_API_KEY")
        if not api_key:
            raise ValueError(
                "Cohere API key not found. Set COHERE_API_KEY environment variable "
                "or provide it in config."
            )

        self._client = cohere.Client(api_key)
        logger.info(f"Initialized Cohere client with model: {self.model_name}")

    def _detect_embedding_dimension(self) -> int:
        """Detect embedding dimension by encoding a sample text."""
        sample_embedding = self._encode_batch(
            ["sample text for dimension detection"])[0]
        return len(sample_embedding)

    def _encode_batch(self, texts: List[str]) -> List[List[float]]:
        """
        Encode a batch of texts using the configured backend.

        Args:
            texts: List of texts to encode

        Returns:
            List of embedding vectors
        """
        if self.model_type == "sentence_transformer":
            return self._encode_sentence_transformer(texts)
        elif self.model_type == "openai":
            return self._encode_openai(texts)
        elif self.model_type == "cohere":
            return self._encode_cohere(texts)
        else:
            raise ValueError(f"Unsupported model type: {self.model_type}")

    def _encode_sentence_transformer(self, texts: List[str]) -> List[List[float]]:
        """Encode texts using Sentence Transformer."""
        import gc

        # CRITICAL: Disable tokenizer parallelism to prevent deadlocks in Docker/multiprocessing environments
        # This is a known issue with HuggingFace tokenizers in containerized environments
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

        logger.info(
            f"_encode_sentence_transformer() called with {len(texts)} texts")

        # Ensure all texts are strings (sentence_transformers v3.x compatibility)
        sanitized_texts = [str(t) if not isinstance(
            t, str) else t for t in texts]

        total_texts = len(sanitized_texts)
        num_batches = (total_texts + self.batch_size - 1) // self.batch_size

        logger.info(
            f"Total texts: {total_texts}, Batch size: {self.batch_size}, Num batches: {num_batches}")

        if total_texts > 1000:
            logger.info(
                f"Encoding {total_texts} texts with sentence transformer (batch_size={self.batch_size}, {num_batches} batches)...")

        # For large datasets, encode in chunks with progress logging
        if total_texts > 10000:
            all_embeddings = []
            # Log 1000 times during encoding
            log_interval = max(1, num_batches // 1000)
            batch_start_time = time.time()

            logger.info(
                f"Large dataset: processing {total_texts} texts in {num_batches} batches (logging every {log_interval} batches)")

            for i in range(0, total_texts, self.batch_size):
                batch_num = i // self.batch_size + 1
                batch_texts = sanitized_texts[i:i + self.batch_size]

                # Log before encoding starts (for first batch and every log_interval)
                if batch_num == 1 or batch_num % log_interval == 0 or batch_num == num_batches:
                    logger.info(
                        f"Starting batch {batch_num}/{num_batches}: encoding {len(batch_texts)} texts...")
                    batch_encode_start = time.time()

                batch_embeddings = self._model.encode(
                    batch_texts,
                    batch_size=self.batch_size,
                    show_progress_bar=False,  # Disabled to prevent blocking in Docker/non-TTY environments
                    convert_to_tensor=False,
                    normalize_embeddings=False
                )

                if batch_num == 1 or batch_num % log_interval == 0 or batch_num == num_batches:
                    batch_encode_time = time.time() - batch_encode_start
                    logger.info(
                        f"Batch {batch_num}/{num_batches} encoding completed in {batch_encode_time:.2f}s")

                all_embeddings.append(batch_embeddings)

                # MEMORY OPTIMIZATION: Clear batch_texts reference
                del batch_texts

                if batch_num % log_interval == 0 or batch_num == num_batches:
                    progress_pct = (batch_num / num_batches) * 100
                    elapsed = time.time() - batch_start_time
                    texts_processed = min(i + self.batch_size, total_texts)
                    rate = texts_processed / elapsed if elapsed > 0 else 0
                    eta_seconds = (total_texts - texts_processed) / \
                        rate if rate > 0 else 0
                    logger.info(
                        f"Embedding progress: {batch_num}/{num_batches} batches ({progress_pct:.1f}%) - {texts_processed}/{total_texts} texts - {rate:.1f} texts/s - ETA: {eta_seconds:.0f}s")

            embeddings = np.vstack(all_embeddings)
            # MEMORY OPTIMIZATION: Clear intermediate arrays
            del all_embeddings
        elif num_batches > 1:
            # For medium datasets (multiple batches but < 10000 texts), also log progress
            all_embeddings = []
            batch_start_time = time.time()
            logger.info(
                f"Processing {total_texts} texts in {num_batches} batches...")

            for i in range(0, total_texts, self.batch_size):
                batch_num = i // self.batch_size + 1
                batch_texts = sanitized_texts[i:i + self.batch_size]

                # Log before encoding starts
                logger.info(
                    f"Starting batch {batch_num}/{num_batches}: encoding {len(batch_texts)} texts...")
                batch_encode_start = time.time()

                batch_embeddings = self._model.encode(
                    batch_texts,
                    batch_size=self.batch_size,
                    show_progress_bar=False,  # Disabled to prevent blocking in Docker/non-TTY environments
                    convert_to_tensor=False,
                    normalize_embeddings=False
                )

                batch_encode_time = time.time() - batch_encode_start
                logger.info(
                    f"Batch {batch_num}/{num_batches} encoding completed in {batch_encode_time:.2f}s")

                all_embeddings.append(batch_embeddings)

                # MEMORY OPTIMIZATION: Clear batch_texts reference
                del batch_texts

                # Log progress for each batch
                progress_pct = (batch_num / num_batches) * 100
                elapsed = time.time() - batch_start_time
                texts_processed = min(i + self.batch_size, total_texts)
                rate = texts_processed / elapsed if elapsed > 0 else 0
                eta_seconds = (total_texts - texts_processed) / \
                    rate if rate > 0 else 0
                logger.info(
                    f"Embedding progress: batch {batch_num}/{num_batches} ({progress_pct:.1f}%) - {texts_processed}/{total_texts} texts - {rate:.1f} texts/s - ETA: {eta_seconds:.0f}s")

            embeddings = np.vstack(all_embeddings)
            # MEMORY OPTIMIZATION: Clear intermediate arrays
            del all_embeddings
        else:
            # Single batch - but still process in smaller chunks to show progress
            logger.info(
                f"Processing {total_texts} texts (will process in mini-batches of {self.batch_size})...")
            all_embeddings = []
            batch_start_time = time.time()

            # Process in mini-batches even for "single batch" to show progress
            # Use smaller batches for better progress visibility
            mini_batch_size = min(self.batch_size, 32)
            num_mini_batches = (
                total_texts + mini_batch_size - 1) // mini_batch_size

            logger.info(
                f"Will process {total_texts} texts in {num_mini_batches} mini-batches of size {mini_batch_size}")

            for i in range(0, total_texts, mini_batch_size):
                batch_num = i // mini_batch_size + 1
                batch_texts = sanitized_texts[i:i + mini_batch_size]

                # Log before encoding starts
                logger.info(
                    f"Starting mini-batch {batch_num}/{num_mini_batches}: encoding {len(batch_texts)} texts...")
                batch_encode_start = time.time()

                batch_embeddings = self._model.encode(
                    batch_texts,
                    batch_size=mini_batch_size,
                    show_progress_bar=False,  # Disabled to prevent blocking in Docker/non-TTY environments
                    convert_to_tensor=False,
                    normalize_embeddings=False
                )

                batch_encode_time = time.time() - batch_encode_start
                logger.info(
                    f"Mini-batch {batch_num}/{num_mini_batches} encoding completed in {batch_encode_time:.2f}s")

                all_embeddings.append(batch_embeddings)

                # MEMORY OPTIMIZATION: Clear batch_texts reference
                del batch_texts

                # Log progress for each mini-batch
                progress_pct = (batch_num / num_mini_batches) * 100
                elapsed = time.time() - batch_start_time
                texts_processed = min(i + mini_batch_size, total_texts)
                rate = texts_processed / elapsed if elapsed > 0 else 0
                eta_seconds = (total_texts - texts_processed) / \
                    rate if rate > 0 else 0
                logger.info(
                    f"Embedding progress: mini-batch {batch_num}/{num_mini_batches} ({progress_pct:.1f}%) - {texts_processed}/{total_texts} texts - {rate:.1f} texts/s - ETA: {eta_seconds:.0f}s")

            embeddings = np.vstack(all_embeddings)
            # MEMORY OPTIMIZATION: Clear intermediate arrays
            del all_embeddings

        # MEMORY OPTIMIZATION: Clear sanitized_texts as no longer needed
        del sanitized_texts

        # Force garbage collection after encoding to prevent memory buildup
        # This is especially important for long-running batch processes
        gc.collect()
        try:
            import torch
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except ImportError:
            pass

        # Always log completion for visibility
        logger.info(f"✓ Encoding completed for {total_texts} texts")

        # Convert to list of lists and release numpy array
        if isinstance(embeddings, np.ndarray):
            result = embeddings.tolist()
            del embeddings  # Release the numpy array
            gc.collect()
            return result

        return embeddings

    def _encode_openai(self, texts: List[str]) -> List[List[float]]:
        """Encode texts using OpenAI API."""
        from rakam_systems_vectorstore.components.embedding_model.openai_embeddings import OpenAIEmbeddings

        # Use the OpenAIEmbeddings implementation with configured batch_size
        openai_embeddings = OpenAIEmbeddings(
            model=self.model_name,
            api_key=self.embedding_config.api_key,
            batch_size=self.batch_size  # Pass the batch_size from ConfigurableEmbeddings
        )
        logger.info(f"OpenAI embeddings using batch_size={self.batch_size}")
        return openai_embeddings.run(texts)

    def _encode_cohere(self, texts: List[str]) -> List[List[float]]:
        """Encode texts using Cohere API."""
        all_embeddings = []

        # Process in batches
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i:i + self.batch_size]

            try:
                response = self._client.embed(
                    texts=batch,
                    model=self.model_name,
                    input_type="search_document"
                )
                all_embeddings.extend(response.embeddings)
            except Exception as e:
                logger.error(f"Error encoding batch with Cohere: {e}")
                raise

        return all_embeddings

    def _normalize_embeddings(self, embeddings: List[List[float]]) -> List[List[float]]:
        """Normalize embeddings to unit length."""
        embeddings_array = np.array(embeddings, dtype=np.float32)
        norms = np.linalg.norm(embeddings_array, axis=1, keepdims=True)
        norms[norms == 0] = 1  # Avoid division by zero
        normalized = embeddings_array / norms
        return normalized.tolist()

    def run(self, texts: List[str]) -> List[List[float]]:
        """
        Encode texts into embeddings.

        Args:
            texts: List of texts to encode

        Returns:
            List of embedding vectors
        """
        if not texts:
            return []

        start_time = time.time()
        logger.info(
            f"run() called: Encoding {len(texts)} texts with {self.model_type} model '{self.model_name}'")

        # Encode texts
        logger.info(f"Calling _encode_batch() for {len(texts)} texts...")
        embeddings = self._encode_batch(texts)
        logger.info(f"_encode_batch() returned {len(embeddings)} embeddings")

        # Normalize if configured
        if self.normalize:
            logger.info(f"Normalizing {len(embeddings)} embeddings...")
            embeddings = self._normalize_embeddings(embeddings)
            logger.info(f"Normalization complete")

        elapsed = time.time() - start_time
        logger.info(
            f"run() completed: Encoded {len(texts)} texts in {elapsed:.2f}s ({len(texts)/elapsed:.1f} texts/s)")

        return embeddings

    def encode_query(self, query: str) -> List[float]:
        """
        Encode a single query text.

        Args:
            query: Query text to encode

        Returns:
            Embedding vector
        """
        embeddings = self.run([query])
        return embeddings[0] if embeddings else []

    def encode_documents(self, documents: List[str]) -> List[List[float]]:
        """
        Encode multiple documents.

        Args:
            documents: List of documents to encode

        Returns:
            List of embedding vectors
        """
        return self.run(documents)

    @property
    def embedding_dimension(self) -> int:
        """Get the embedding dimension."""
        if not self.initialized:
            self.setup()
        return self._embedding_dim

    def shutdown(self) -> None:
        """Clean up resources."""
        logger.info(f"Shutting down {self.model_type} embedding model")

        if self.model_type == "sentence_transformer" and self._model:
            # Clean up CUDA memory if using GPU
            try:
                import torch
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except ImportError:
                pass

        self._model = None
        self._client = None
        super().shutdown()


# Convenience factory function
def create_embedding_model(
    model_type: str = "sentence_transformer",
    model_name: Optional[str] = None,
    **kwargs
) -> ConfigurableEmbeddings:
    """
    Factory function to create an embedding model.

    Args:
        model_type: Type of model (sentence_transformer, openai, cohere)
        model_name: Model name/identifier
        **kwargs: Additional configuration parameters

    Returns:
        Configured embedding model
    """
    config = EmbeddingConfig(
        model_type=model_type,
        model_name=model_name or _get_default_model_name(model_type),
        **kwargs
    )

    return ConfigurableEmbeddings(config=config)


def _get_default_model_name(model_type: str) -> str:
    """Get default model name for a given type."""
    defaults = {
        "sentence_transformer": "Snowflake/snowflake-arctic-embed-m",
        "openai": "text-embedding-3-small",
        "cohere": "embed-english-v3.0"
    }
    return defaults.get(model_type, "")


__all__ = ["ConfigurableEmbeddings", "create_embedding_model"]
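And a corresponding usage sketch for the embedding component above. Again, this is illustrative only and not part of the diff; it assumes the sentence-transformers backend is installed and relies on the defaults from EmbeddingConfig (rakam_systems_vectorstore/config.py) and _get_default_model_name().

# Illustrative usage sketch only (assumes `pip install sentence-transformers`).
from rakam_systems_vectorstore.components.embedding_model.configurable_embeddings import (
    create_embedding_model,
)

# Local sentence-transformer backend; model_name falls back to
# "Snowflake/snowflake-arctic-embed-m" when omitted.
embedder = create_embedding_model(model_type="sentence_transformer")
embedder.setup()  # loads the model and detects the embedding dimension

doc_vectors = embedder.encode_documents(["hello world", "vector stores"])
query_vector = embedder.encode_query("greeting")
print(len(doc_vectors), embedder.embedding_dimension)

embedder.shutdown()  # releases the model and any CUDA memory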