rakam-systems-vectorstore 0.1.1rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. rakam_systems_vectorstore/MANIFEST.in +26 -0
  2. rakam_systems_vectorstore/README.md +1071 -0
  3. rakam_systems_vectorstore/__init__.py +93 -0
  4. rakam_systems_vectorstore/components/__init__.py +0 -0
  5. rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
  6. rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
  7. rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
  8. rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
  9. rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
  10. rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
  11. rakam_systems_vectorstore/components/loader/__init__.py +31 -0
  12. rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
  13. rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
  14. rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
  15. rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
  16. rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
  17. rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
  18. rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
  19. rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
  20. rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
  21. rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
  22. rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
  23. rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
  24. rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
  25. rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
  26. rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
  27. rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
  28. rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
  29. rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
  30. rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
  31. rakam_systems_vectorstore/config.py +266 -0
  32. rakam_systems_vectorstore/core.py +8 -0
  33. rakam_systems_vectorstore/pyproject.toml +113 -0
  34. rakam_systems_vectorstore/server/README.md +290 -0
  35. rakam_systems_vectorstore/server/__init__.py +20 -0
  36. rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
  37. rakam_systems_vectorstore/setup.py +103 -0
  38. rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
  39. rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
  40. rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py
@@ -0,0 +1,259 @@
+from __future__ import annotations
+from typing import List
+from rakam_systems_core.ai_core.interfaces.embedding_model import EmbeddingModel
+from openai import OpenAI
+from rakam_systems_core.ai_utils import logging
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIEmbeddings(EmbeddingModel):
+    """OpenAI embeddings implementation using the OpenAI API.
+
+    This module is integrated into ConfigurableEmbeddings and can be used via
+    the "openai" provider configuration.
+
+    Args:
+        model: The OpenAI embedding model to use (default: "text-embedding-3-small")
+        api_key: Optional API key. If not provided, the OPENAI_API_KEY environment variable is used.
+        max_tokens: Maximum tokens allowed per text (default: 8191 for text-embedding-3-small)
+        batch_size: Batch size for API calls (default: 100, as recommended by OpenAI)
+    """
+
+    # Model-specific token limits (leaving some margin for safety)
+    MODEL_TOKEN_LIMITS = {
+        "text-embedding-3-small": 8191,
+        "text-embedding-3-large": 8191,
+        "text-embedding-ada-002": 8191,
+    }
+
+    def __init__(self, model: str = "text-embedding-3-small", api_key: str | None = None, max_tokens: int | None = None, batch_size: int = 100):
+        self.model = model
+        self.client = OpenAI(api_key=api_key) if api_key else OpenAI()
+        self.max_tokens = max_tokens or self.MODEL_TOKEN_LIMITS.get(
+            model, 8191)
+        self.batch_size = batch_size
+
+        # Initialize tiktoken for token counting
+        try:
+            import tiktoken
+            self.encoding = tiktoken.encoding_for_model(model)
+        except Exception as e:
+            logger.warning(
+                f"Failed to initialize tiktoken for model {model}: {e}. Using character-based estimation.")
+            self.encoding = None
+
+    def _truncate_batch_with_encode_batch(self, texts: List[str]) -> List[str]:
+        """Truncate texts using encode_batch to find the maximum embeddable length.
+
+        Uses encode_batch to determine the actual token count for each text,
+        then truncates only those that exceed the limit.
+        """
+        if not self.encoding:
+            # Fallback: character-based truncation (~4 characters per token)
+            max_chars = self.max_tokens * 4
+            return [text[:max_chars] if len(text) > max_chars else text for text in texts]
+
+        # Clean texts
+        cleaned_texts = [text.replace("\n", " ") for text in texts]
+
+        # Use encode_batch to get token counts for all texts at once
+        try:
+            encoded_batch = self.encoding.encode_batch(cleaned_texts)
+        except Exception as e:
+            logger.warning(
+                f"encode_batch failed: {e}, falling back to individual encoding")
+            encoded_batch = [self.encoding.encode(
+                text) for text in cleaned_texts]
+
+        # Process each text based on its actual token count
+        processed_texts = []
+        for i, (text, tokens) in enumerate(zip(cleaned_texts, encoded_batch)):
+            if len(tokens) <= self.max_tokens:
+                # Text is within the limit, use as-is
+                processed_texts.append(text)
+            else:
+                # Text exceeds the limit, truncate to max_tokens
+                truncated_tokens = tokens[:self.max_tokens]
+                truncated_text = self.encoding.decode(truncated_tokens)
+                logger.warning(
+                    f"Text {i} truncated from {len(tokens)} to {self.max_tokens} tokens")
+                processed_texts.append(truncated_text)
+
+        return processed_texts
+
+    def get_embedding(self, text: str) -> List[float]:
+        """Get embedding for a single text."""
+        # Use batch truncation for consistency
+        processed_texts = self._truncate_batch_with_encode_batch([text])
+        return self.client.embeddings.create(input=processed_texts, model=self.model).data[0].embedding
+
+    def get_embeddings_batch(self, texts: List[str], batch_size: int = 100) -> List[List[float]]:
+        """
+        Get embeddings for multiple texts using batch processing.
+
+        Uses encode_batch to determine the maximum embeddable length for each text,
+        then truncates if necessary before sending to the API.
+        Also respects OpenAI's 300K tokens-per-request limit.
+
+        Args:
+            texts: List of texts to embed
+            batch_size: Maximum number of texts to send in a single API call (default: 100).
+                OpenAI recommends batches of 100 or fewer for optimal performance.
+
+        Returns:
+            List of embedding vectors, one for each input text
+        """
+        if not texts:
+            return []
+
+        import time
+
+        # OpenAI API limits
+        MAX_TOKENS_PER_REQUEST = 300000  # Total tokens per request limit
+        MAX_TOKENS_PER_TEXT = self.max_tokens  # Individual text limit (8191)
+
+        all_embeddings = []
+        total_texts = len(texts)
+
+        # Log initial info
+        logger.info(
+            f"Starting OpenAI embedding generation for {total_texts} texts")
+        logger.info(f"Initial batch size: {batch_size}")
+        logger.info(f"Model: {self.model}")
+
+        start_time = time.time()
+
+        # Process texts with dynamic batching based on token count
+        i = 0
+        batch_num = 0
+        while i < total_texts:
+            batch_num += 1
+            batch_start_time = time.time()
+
+            # Collect texts for this batch, respecting token limits
+            batch = []
+            batch_indices = []
+            current_batch_tokens = 0
+
+            # Try to fill the batch up to batch_size or the token limit
+            while i < total_texts and len(batch) < batch_size:
+                # Peek at the next text and process it
+                next_batch = [texts[i]]
+                processed_next = self._truncate_batch_with_encode_batch(
+                    next_batch)
+
+                # Count tokens in the processed text
+                if self.encoding:
+                    try:
+                        text_tokens = len(
+                            self.encoding.encode(processed_next[0]))
+                    except Exception:
+                        # Fallback estimation
+                        text_tokens = len(processed_next[0]) // 4
+                else:
+                    text_tokens = len(processed_next[0]) // 4
+
+                # Check if adding this text would exceed the request limit
+                if batch and (current_batch_tokens + text_tokens > MAX_TOKENS_PER_REQUEST):
+                    # Batch is full, stop here
+                    logger.info(
+                        f"[OpenAI Batch {batch_num}] Batch token limit reached: {current_batch_tokens} tokens, stopping before adding text with {text_tokens} tokens")
+                    break
+
+                # Add the text to the batch
+                batch.append(texts[i])
+                batch_indices.append(i)
+                current_batch_tokens += text_tokens
+                i += 1
+
+            if not batch:
+                # Edge case: a single text exceeds the request limit (shouldn't happen with the 8191-token limit)
+                logger.error(
+                    f"Single text at index {i} exceeds request token limit, skipping")
+                all_embeddings.append([0.0] * 1536)
+                i += 1
+                continue
+
+            # Log batch start
+            progress_pct = i / total_texts * 100
+            logger.info(f"[OpenAI Batch {batch_num}] Processing texts {batch_indices[0]+1}-{batch_indices[-1]+1} "
+                        f"({len(batch)} texts, ~{current_batch_tokens} tokens, {progress_pct:.1f}% complete)")
+
+            # Process the batch
+            processed_batch = self._truncate_batch_with_encode_batch(batch)
+
+            # Send the batch to the OpenAI API
+            try:
+                response = self.client.embeddings.create(
+                    input=processed_batch,
+                    model=self.model
+                )
+
+                # Extract embeddings in order
+                batch_embeddings = [item.embedding for item in response.data]
+                all_embeddings.extend(batch_embeddings)
+
+                batch_elapsed = time.time() - batch_start_time
+
+                # Calculate statistics
+                texts_processed = len(all_embeddings)
+                overall_elapsed = time.time() - start_time
+                overall_rate = texts_processed / overall_elapsed if overall_elapsed > 0 else 0
+                eta_seconds = (total_texts - texts_processed) / \
+                    overall_rate if overall_rate > 0 else 0
+
+                # Log batch completion with detailed stats
+                logger.info(f"[OpenAI Batch {batch_num}] ✓ Completed in {batch_elapsed:.2f}s | "
+                            f"Progress: {texts_processed}/{total_texts} ({texts_processed/total_texts*100:.1f}%) | "
+                            f"Rate: {overall_rate:.1f} texts/s | ETA: {eta_seconds:.0f}s")
+
+            except Exception as e:
+                logger.error(
+                    f"[OpenAI Batch {batch_num}] Batch processing failed: {e}")
+                logger.info(
+                    f"[OpenAI Batch {batch_num}] Falling back to individual processing...")
+
+                # Fallback: process texts individually
+                for idx, text in enumerate(batch):
+                    try:
+                        processed_text = self._truncate_batch_with_encode_batch([text])[
+                            0]
+                        embedding = self.client.embeddings.create(
+                            input=[processed_text],
+                            model=self.model
+                        ).data[0].embedding
+                        all_embeddings.append(embedding)
+                    except Exception as inner_e:
+                        logger.error(
+                            f"[OpenAI Batch {batch_num}] Error processing individual text {batch_indices[idx] + 1}: {inner_e}")
+                        # Return a zero vector as a fallback
+                        # (1536 is the default dimension for text-embedding-3-small)
+                        all_embeddings.append([0.0] * 1536)
+
+        # Log final summary
+        total_elapsed = time.time() - start_time
+        overall_rate = total_texts / total_elapsed if total_elapsed > 0 else 0
+        logger.info("✓ OpenAI embedding generation completed!")
+        logger.info(f"  Total texts: {total_texts}")
+        logger.info(f"  Total time: {total_elapsed:.2f}s")
+        logger.info(f"  Average rate: {overall_rate:.1f} texts/s")
+        logger.info(f"  Batches processed: {batch_num}")
+
+        return all_embeddings
+
+    def run(self, texts: List[str]) -> List[List[float]]:
+        """
+        Get embeddings for a list of texts.
+
+        Uses batch processing for efficiency when multiple texts are provided.
+        """
+        if not texts:
+            return []
+
+        # Use batch processing for multiple texts
+        if len(texts) > 1:
+            return self.get_embeddings_batch(texts, batch_size=self.batch_size)
+        else:
+            # Single text - use the direct method
+            return [self.get_embedding(texts[0])]
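For orientation, here is a minimal usage sketch of the class added above (not part of the diff). The import path is inferred from the file list at the top; it assumes OPENAI_API_KEY is set in the environment and uses only the methods shown in this hunk:

# Minimal usage sketch (assumes OPENAI_API_KEY is set; import path inferred
# from the file list above, not confirmed by the diff itself).
from rakam_systems_vectorstore.components.embedding_model.openai_embeddings import (
    OpenAIEmbeddings,
)

embedder = OpenAIEmbeddings(model="text-embedding-3-small", batch_size=100)

# A single text: run() dispatches to get_embedding()
vectors = embedder.run(["hello world"])

# Multiple texts: run() dispatches to get_embeddings_batch(), which batches
# by count (<= batch_size) and by total tokens (<= 300K per request)
vectors = embedder.run(["first document", "second document", "third document"])
print(len(vectors), len(vectors[0]))  # e.g. 3, 1536 for text-embedding-3-small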
rakam_systems_vectorstore/components/loader/__init__.py
@@ -0,0 +1,31 @@
+from .adaptive_loader import AdaptiveLoader, create_adaptive_loader
+from .code_loader import CodeLoader, create_code_loader
+from .doc_loader import DocLoader, create_doc_loader
+from .eml_loader import EmlLoader, create_eml_loader
+from .html_loader import HtmlLoader, create_html_loader
+from .md_loader import MdLoader, create_md_loader
+from .odt_loader import OdtLoader, create_odt_loader
+from .pdf_loader_light import PdfLoaderLight, create_pdf_loader_light
+from .tabular_loader import TabularLoader, create_tabular_loader
+
+__all__ = [
+    "AdaptiveLoader",
+    "create_adaptive_loader",
+    "CodeLoader",
+    "create_code_loader",
+    "DocLoader",
+    "create_doc_loader",
+    "EmlLoader",
+    "create_eml_loader",
+    "HtmlLoader",
+    "create_html_loader",
+    "MdLoader",
+    "create_md_loader",
+    "OdtLoader",
+    "create_odt_loader",
+    "PdfLoaderLight",
+    "create_pdf_loader_light",
+    "TabularLoader",
+    "create_tabular_loader",
+]
+
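A brief consumption sketch for these exports (not part of the diff). The package import path is inferred from the file list, and the loader signatures live in the individual loader modules, which are not shown in this hunk:

# Hypothetical consumer code: only names re-exported by this __init__.py are
# used; the import path is inferred from the file list above.
from rakam_systems_vectorstore.components.loader import (
    AdaptiveLoader,
    create_adaptive_loader,
)

# Each loader class is paired with a create_* factory. Arguments are omitted
# here because the factory signatures are not visible in this diff.
loader = create_adaptive_loader()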