rakam-systems-vectorstore 0.1.1rc7__py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. The information is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rakam_systems_vectorstore/MANIFEST.in +26 -0
- rakam_systems_vectorstore/README.md +1071 -0
- rakam_systems_vectorstore/__init__.py +93 -0
- rakam_systems_vectorstore/components/__init__.py +0 -0
- rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
- rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
- rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
- rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
- rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
- rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
- rakam_systems_vectorstore/components/loader/__init__.py +31 -0
- rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
- rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
- rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
- rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
- rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
- rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
- rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
- rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
- rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
- rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
- rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
- rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
- rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
- rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
- rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
- rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
- rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
- rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
- rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
- rakam_systems_vectorstore/config.py +266 -0
- rakam_systems_vectorstore/core.py +8 -0
- rakam_systems_vectorstore/pyproject.toml +113 -0
- rakam_systems_vectorstore/server/README.md +290 -0
- rakam_systems_vectorstore/server/__init__.py +20 -0
- rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
- rakam_systems_vectorstore/setup.py +103 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
@@ -0,0 +1,259 @@
+from __future__ import annotations
+from typing import List
+from rakam_systems_core.ai_core.interfaces.embedding_model import EmbeddingModel
+from openai import OpenAI
+from rakam_systems_core.ai_utils import logging
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIEmbeddings(EmbeddingModel):
+    """OpenAI embeddings implementation using the OpenAI API.
+
+    This module is integrated into ConfigurableEmbeddings and can be used via
+    the "openai" provider configuration.
+
+    Args:
+        model: The OpenAI embedding model to use (default: "text-embedding-3-small")
+        api_key: Optional API key. If not provided, will use OPENAI_API_KEY environment variable
+        max_tokens: Maximum tokens allowed per text (default: 8191 for text-embedding-3-small)
+        batch_size: Batch size for API calls (default: 100, recommended by OpenAI)
+    """
+
+    # Model-specific token limits (leaving some margin for safety)
+    MODEL_TOKEN_LIMITS = {
+        "text-embedding-3-small": 8191,
+        "text-embedding-3-large": 8191,
+        "text-embedding-ada-002": 8191,
+    }
+
+    def __init__(self, model: str = "text-embedding-3-small", api_key: str = None, max_tokens: int = None, batch_size: int = 100):
+        self.model = model
+        self.client = OpenAI(api_key=api_key) if api_key else OpenAI()
+        self.max_tokens = max_tokens or self.MODEL_TOKEN_LIMITS.get(
+            model, 8191)
+        self.batch_size = batch_size
+
+        # Initialize tiktoken for token counting
+        try:
+            import tiktoken
+            self.encoding = tiktoken.encoding_for_model(model)
+        except Exception as e:
+            logger.warning(
+                f"Failed to initialize tiktoken for model {model}: {e}. Using character-based estimation.")
+            self.encoding = None
+
+    def _truncate_batch_with_encode_batch(self, texts: List[str]) -> List[str]:
+        """Truncate texts using encode_batch to find maximum embeddable length.
+
+        Uses encode_batch to determine the actual token count for each text,
+        then truncates only those that exceed the limit.
+        """
+        if not self.encoding:
+            # Fallback: character-based truncation
+            max_chars = self.max_tokens * 4
+            return [text[:max_chars] if len(text) > max_chars else text for text in texts]
+
+        # Clean texts
+        cleaned_texts = [text.replace("\n", " ") for text in texts]
+
+        # Use encode_batch to get token counts for all texts at once
+        try:
+            encoded_batch = self.encoding.encode_batch(cleaned_texts)
+        except Exception as e:
+            logger.warning(
+                f"encode_batch failed: {e}, falling back to individual encoding")
+            encoded_batch = [self.encoding.encode(
+                text) for text in cleaned_texts]
+
+        # Process each text based on its actual token count
+        processed_texts = []
+        for i, (text, tokens) in enumerate(zip(cleaned_texts, encoded_batch)):
+            if len(tokens) <= self.max_tokens:
+                # Text is within limit, use as-is
+                processed_texts.append(text)
+            else:
+                # Text exceeds limit, truncate to max_tokens
+                truncated_tokens = tokens[:self.max_tokens]
+                truncated_text = self.encoding.decode(truncated_tokens)
+                logger.warning(
+                    f"Text {i} truncated from {len(tokens)} to {self.max_tokens} tokens")
+                processed_texts.append(truncated_text)
+
+        return processed_texts
+
+    def get_embedding(self, text: str) -> List[float]:
+        """Get embedding for a single text."""
+        # Use batch truncation for consistency
+        processed_texts = self._truncate_batch_with_encode_batch([text])
+        return self.client.embeddings.create(input=processed_texts, model=self.model).data[0].embedding
+
+    def get_embeddings_batch(self, texts: List[str], batch_size: int = 100) -> List[List[float]]:
+        """
+        Get embeddings for multiple texts using batch processing.
+
+        Uses encode_batch to determine the maximum embeddable length for each text,
+        then truncates if necessary before sending to the API.
+        Also respects OpenAI's 300K tokens per request limit.
+
+        Args:
+            texts: List of texts to embed
+            batch_size: Maximum number of texts to send in a single API call (default: 100)
+                OpenAI recommends batches of 100 or fewer for optimal performance
+
+        Returns:
+            List of embedding vectors, one for each input text
+        """
+        if not texts:
+            return []
+
+        import time
+
+        # OpenAI API limits
+        MAX_TOKENS_PER_REQUEST = 300000  # Total tokens per request limit
+        MAX_TOKENS_PER_TEXT = self.max_tokens  # Individual text limit (8191)
+
+        all_embeddings = []
+        total_texts = len(texts)
+
+        # Log initial info
+        logger.info(
+            f"Starting OpenAI embedding generation for {total_texts} texts")
+        logger.info(f"Initial batch size: {batch_size}")
+        logger.info(f"Model: {self.model}")
+
+        start_time = time.time()
+
+        # Process texts with dynamic batching based on token count
+        i = 0
+        batch_num = 0
+        while i < total_texts:
+            batch_num += 1
+            batch_start_time = time.time()
+
+            # Collect texts for this batch, respecting token limits
+            batch = []
+            batch_indices = []
+            current_batch_tokens = 0
+
+            # Try to fill batch up to batch_size or token limit
+            while i < total_texts and len(batch) < batch_size:
+                # Peek at next text and process it
+                next_batch = [texts[i]]
+                processed_next = self._truncate_batch_with_encode_batch(
+                    next_batch)
+
+                # Count tokens in processed text
+                if self.encoding:
+                    try:
+                        text_tokens = len(
+                            self.encoding.encode(processed_next[0]))
+                    except:
+                        # Fallback estimation
+                        text_tokens = len(processed_next[0]) // 4
+                else:
+                    text_tokens = len(processed_next[0]) // 4
+
+                # Check if adding this text would exceed the request limit
+                if batch and (current_batch_tokens + text_tokens > MAX_TOKENS_PER_REQUEST):
+                    # Batch is full, stop here
+                    logger.info(
+                        f"[OpenAI Batch {batch_num}] Batch token limit reached: {current_batch_tokens} tokens, stopping before adding text with {text_tokens} tokens")
+                    break
+
+                # Add text to batch
+                batch.append(texts[i])
+                batch_indices.append(i)
+                current_batch_tokens += text_tokens
+                i += 1
+
+            if not batch:
+                # Edge case: single text exceeds request limit (shouldn't happen with 8191 limit)
+                logger.error(
+                    f"Single text at index {i} exceeds request token limit, skipping")
+                all_embeddings.append([0.0] * 1536)
+                i += 1
+                continue
+
+            # Log batch start
+            progress_pct = i / total_texts * 100
+            logger.info(f"[OpenAI Batch {batch_num}] Processing texts {batch_indices[0]+1}-{batch_indices[-1]+1} "
+                        f"({len(batch)} texts, ~{current_batch_tokens} tokens, {progress_pct:.1f}% complete)")
+
+            # Process the batch
+            processed_batch = self._truncate_batch_with_encode_batch(batch)
+
+            # Send batch to OpenAI API
+            try:
+                response = self.client.embeddings.create(
+                    input=processed_batch,
+                    model=self.model
+                )
+
+                # Extract embeddings in order
+                batch_embeddings = [item.embedding for item in response.data]
+                all_embeddings.extend(batch_embeddings)
+
+                batch_elapsed = time.time() - batch_start_time
+
+                # Calculate statistics
+                texts_processed = len(all_embeddings)
+                overall_elapsed = time.time() - start_time
+                overall_rate = texts_processed / overall_elapsed if overall_elapsed > 0 else 0
+                eta_seconds = (total_texts - texts_processed) / \
+                    overall_rate if overall_rate > 0 else 0
+
+                # Log batch completion with detailed stats
+                logger.info(f"[OpenAI Batch {batch_num}] ✓ Completed in {batch_elapsed:.2f}s | "
+                            f"Progress: {texts_processed}/{total_texts} ({texts_processed/total_texts*100:.1f}%) | "
+                            f"Rate: {overall_rate:.1f} texts/s | ETA: {eta_seconds:.0f}s")
+
+            except Exception as e:
+                logger.error(
+                    f"[OpenAI Batch {batch_num}] Batch processing failed: {e}")
+                logger.info(
+                    f"[OpenAI Batch {batch_num}] Falling back to individual processing...")
+
+                # Fallback: process texts individually
+                for idx, text in enumerate(batch):
+                    try:
+                        processed_text = self._truncate_batch_with_encode_batch([text])[
+                            0]
+                        embedding = self.client.embeddings.create(
+                            input=[processed_text],
+                            model=self.model
+                        ).data[0].embedding
+                        all_embeddings.append(embedding)
+                    except Exception as inner_e:
+                        logger.error(
+                            f"[OpenAI Batch {batch_num}] Error processing individual text {batch_indices[idx] + 1}: {inner_e}")
+                        # Return zero vector as fallback
+                        # Default dimension for text-embedding-3-small
+                        all_embeddings.append([0.0] * 1536)
+
+        # Log final summary
+        total_elapsed = time.time() - start_time
+        overall_rate = total_texts / total_elapsed if total_elapsed > 0 else 0
+        logger.info(f"✓ OpenAI embedding generation completed!")
+        logger.info(f"  Total texts: {total_texts}")
+        logger.info(f"  Total time: {total_elapsed:.2f}s")
+        logger.info(f"  Average rate: {overall_rate:.1f} texts/s")
+        logger.info(f"  Batches processed: {batch_num}")
+
+        return all_embeddings
+
+    def run(self, texts: List[str]) -> List[List[float]]:
+        """
+        Get embeddings for a list of texts.
+
+        Uses batch processing for efficiency when multiple texts are provided.
+        """
+        if not texts:
+            return []
+
+        # Use batch processing for multiple texts
+        if len(texts) > 1:
+            return self.get_embeddings_batch(texts, batch_size=self.batch_size)
+        else:
+            # Single text - use direct method
+            return [self.get_embedding(texts[0])]
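For context, here is a minimal usage sketch of the `OpenAIEmbeddings` class added above. It is not part of the package diff; it assumes `OPENAI_API_KEY` is set in the environment, that the package installs under the module path shown in the file listing, and that the `rakam_systems_core` dependency is available.

```python
# Hedged sketch, not part of the published wheel. Assumes OPENAI_API_KEY is
# exported and rakam-systems-vectorstore (plus rakam_systems_core) is installed.
from rakam_systems_vectorstore.components.embedding_model.openai_embeddings import (
    OpenAIEmbeddings,
)

# Defaults: text-embedding-3-small, 8191-token truncation, batch_size=100.
embedder = OpenAIEmbeddings()

# Single text: run() delegates to get_embedding().
vector = embedder.run(["What is a vector store?"])[0]
print(len(vector))  # 1536 dimensions for text-embedding-3-small

# Multiple texts: run() delegates to get_embeddings_batch(), which truncates
# over-long inputs and packs requests under the 300K tokens-per-request limit.
vectors = embedder.run(["first document", "second document", "third document"])
print(len(vectors))  # one embedding per input text
```

Note that over-long texts are truncated rather than rejected, and a failed batch falls back to per-text requests (with a zero vector for any text that still fails), so `run()` always returns one vector per input.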
@@ -0,0 +1,31 @@
+from .adaptive_loader import AdaptiveLoader, create_adaptive_loader
+from .code_loader import CodeLoader, create_code_loader
+from .doc_loader import DocLoader, create_doc_loader
+from .eml_loader import EmlLoader, create_eml_loader
+from .html_loader import HtmlLoader, create_html_loader
+from .md_loader import MdLoader, create_md_loader
+from .odt_loader import OdtLoader, create_odt_loader
+from .pdf_loader_light import PdfLoaderLight, create_pdf_loader_light
+from .tabular_loader import TabularLoader, create_tabular_loader
+
+__all__ = [
+    "AdaptiveLoader",
+    "create_adaptive_loader",
+    "CodeLoader",
+    "create_code_loader",
+    "DocLoader",
+    "create_doc_loader",
+    "EmlLoader",
+    "create_eml_loader",
+    "HtmlLoader",
+    "create_html_loader",
+    "MdLoader",
+    "create_md_loader",
+    "OdtLoader",
+    "create_odt_loader",
+    "PdfLoaderLight",
+    "create_pdf_loader_light",
+    "TabularLoader",
+    "create_tabular_loader",
+]
+
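The loader `__init__.py` above only re-exports each loader class together with its factory function, so callers can import them from the subpackage root rather than from the individual modules. A hedged illustration follows; the loaders' constructor and call signatures are not shown in this diff, so nothing beyond the imports is assumed.

```python
# Illustrative only: constructor/method signatures are not part of this diff.
from rakam_systems_vectorstore.components.loader import (
    AdaptiveLoader,
    PdfLoaderLight,
    create_adaptive_loader,
)

# These names are the same objects as those in the concrete modules, e.g.
# rakam_systems_vectorstore.components.loader.adaptive_loader.AdaptiveLoader.
print(AdaptiveLoader, PdfLoaderLight, create_adaptive_loader)
```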