rakam-systems-vectorstore 0.1.1rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. rakam_systems_vectorstore/MANIFEST.in +26 -0
  2. rakam_systems_vectorstore/README.md +1071 -0
  3. rakam_systems_vectorstore/__init__.py +93 -0
  4. rakam_systems_vectorstore/components/__init__.py +0 -0
  5. rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
  6. rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
  7. rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
  8. rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
  9. rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
  10. rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
  11. rakam_systems_vectorstore/components/loader/__init__.py +31 -0
  12. rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
  13. rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
  14. rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
  15. rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
  16. rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
  17. rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
  18. rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
  19. rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
  20. rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
  21. rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
  22. rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
  23. rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
  24. rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
  25. rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
  26. rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
  27. rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
  28. rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
  29. rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
  30. rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
  31. rakam_systems_vectorstore/config.py +266 -0
  32. rakam_systems_vectorstore/core.py +8 -0
  33. rakam_systems_vectorstore/pyproject.toml +113 -0
  34. rakam_systems_vectorstore/server/README.md +290 -0
  35. rakam_systems_vectorstore/server/__init__.py +20 -0
  36. rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
  37. rakam_systems_vectorstore/setup.py +103 -0
  38. rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
  39. rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
  40. rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
rakam_systems_vectorstore/components/loader/adaptive_loader.py
@@ -0,0 +1,512 @@
"""
Adaptive data loader that automatically detects and processes different input types.

Supports:
- Plain text files (.txt)
- PDF documents (.pdf)
- Word documents (.docx, .doc)
- ODT documents (.odt)
- Email files (.eml)
- Markdown files (.md)
- JSON data (.json)
- CSV/TSV/XLSX data (.csv, .tsv, .xlsx, .xls)
- HTML files (.html)
- Code files (.py, .js, .java, etc.)
- Raw text strings

This loader delegates to specialized loaders in the same folder for each file type.
"""

from __future__ import annotations

import json
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from rakam_systems_core.ai_utils import logging
from rakam_systems_core.ai_core.interfaces.loader import Loader
from rakam_systems_vectorstore.components.chunker import TextChunker
from rakam_systems_vectorstore.core import Node, NodeMetadata, VSFile

logger = logging.getLogger(__name__)


class AdaptiveLoader(Loader):
    """
    Adaptive data loader that automatically detects the input type and applies
    the appropriate preprocessing strategy by delegating to specialized loaders.

    This loader handles various input formats:
    - File paths (detects type by extension)
    - Raw text strings
    - Structured data (JSON, CSV, XLSX)
    - Binary documents (PDF, DOCX, ODT)
    - Email files (EML)
    - Code files
    - HTML files
    - Markdown files

    All four loader interface methods (load_as_text, load_as_chunks, load_as_nodes,
    load_as_vsfile) are supported and delegate to the appropriate specialized loader.
    """

    # Supported file extensions and their loader types
    FILE_TYPE_MAP = {
        # Text files
        '.txt': 'text',
        '.text': 'text',

        # Markdown
        '.md': 'md',
        '.markdown': 'md',

        # Documents
        '.pdf': 'pdf',
        '.docx': 'doc',
        '.doc': 'doc',
        '.odt': 'odt',

        # Email files
        '.eml': 'eml',
        '.msg': 'eml',

        # Structured/Tabular data
        '.json': 'json',
        '.csv': 'tabular',
        '.tsv': 'tabular',
        '.xlsx': 'tabular',
        '.xls': 'tabular',

        # HTML
        '.html': 'html',
        '.htm': 'html',
        '.xhtml': 'html',

        # Code files - comprehensive list from CodeLoader
        '.py': 'code',
        '.pyw': 'code',
        '.pyi': 'code',
        '.js': 'code',
        '.jsx': 'code',
        '.ts': 'code',
        '.tsx': 'code',
        '.mjs': 'code',
        '.cjs': 'code',
        '.java': 'code',
        '.kt': 'code',
        '.kts': 'code',
        '.c': 'code',
        '.h': 'code',
        '.cpp': 'code',
        '.cc': 'code',
        '.cxx': 'code',
        '.hpp': 'code',
        '.hxx': 'code',
        '.cs': 'code',
        '.go': 'code',
        '.rs': 'code',
        '.rb': 'code',
        '.rake': 'code',
        '.php': 'code',
        '.swift': 'code',
        '.scala': 'code',
        '.sh': 'code',
        '.bash': 'code',
        '.zsh': 'code',
        '.sql': 'code',
        '.yaml': 'code',
        '.yml': 'code',
        '.r': 'code',
        '.R': 'code',
        '.lua': 'code',
        '.pl': 'code',
        '.pm': 'code',
    }
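    # Example resolutions: "Report.PDF" -> suffix ".pdf" -> 'pdf'; an unmapped
    # extension such as ".log" falls back to 'text'. Suffixes are lowercased
    # before lookup (see _detect_file_type), so the '.R' entry above is in
    # practice matched via '.r'.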

    def __init__(
        self,
        name: str = "adaptive_loader",
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize adaptive loader.

        Args:
            name: Component name
            config: Optional configuration with keys:
                - encoding: Text encoding (default: "utf-8")
                - chunk_size: Maximum tokens per chunk (default: 512)
                - chunk_overlap: Overlap between chunks in tokens (default: 50)

        Additional config options are passed through to specialized loaders.
        """
        super().__init__(name=name, config=config)
        self._config = config or {}
        self._encoding = self._config.get('encoding', 'utf-8')
        chunk_size = self._config.get('chunk_size', 512)
        chunk_overlap = self._config.get('chunk_overlap', 50)
        self._chunker = TextChunker(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap)

        # Cache for lazy-loaded specialized loaders
        self._loaders: Dict[str, Loader] = {}

    def _detect_file_type(self, path: Union[str, Path]) -> str:
        """Detect file type based on extension."""
        if isinstance(path, str):
            path = Path(path)
        suffix = path.suffix.lower()
        file_type = self.FILE_TYPE_MAP.get(suffix, 'text')

        # Log detection for debugging
        logger.debug(
            f"File type detection: {path.name} -> suffix='{suffix}' -> type='{file_type}'")

        # Safety check: ensure PDFs are never routed to doc loader
        if suffix == '.pdf' and file_type != 'pdf':
            logger.error(
                f"PDF file '{path.name}' was incorrectly mapped to type '{file_type}'. Forcing to 'pdf'.")
            file_type = 'pdf'

        return file_type

    def _get_loader(self, loader_type: str) -> Loader:
        """Get or create a cached loader instance."""
        if loader_type not in self._loaders:
            loader_config = {
                'encoding': self._encoding,
                'chunk_size': self._chunker._chunk_size,
                'chunk_overlap': self._chunker._chunk_overlap,
                **{k: v for k, v in self._config.items()
                   if k not in ('encoding', 'chunk_size', 'chunk_overlap')}
            }

            if loader_type == 'md':
                from .md_loader import MdLoader
                self._loaders[loader_type] = MdLoader(config=loader_config)
            elif loader_type == 'pdf':
                from .pdf_loader_light import PdfLoaderLight
                self._loaders[loader_type] = PdfLoaderLight(
                    config=loader_config)
            elif loader_type == 'doc':
                from .doc_loader import DocLoader
                self._loaders[loader_type] = DocLoader(config=loader_config)
            elif loader_type == 'odt':
                from .odt_loader import OdtLoader
                self._loaders[loader_type] = OdtLoader(config=loader_config)
            elif loader_type == 'eml':
                from .eml_loader import EmlLoader
                self._loaders[loader_type] = EmlLoader(config=loader_config)
            elif loader_type == 'html':
                from .html_loader import HtmlLoader
                self._loaders[loader_type] = HtmlLoader(config=loader_config)
            elif loader_type == 'code':
                from .code_loader import CodeLoader
                self._loaders[loader_type] = CodeLoader(config=loader_config)
            elif loader_type == 'tabular':
                from .tabular_loader import TabularLoader
                self._loaders[loader_type] = TabularLoader(
                    config=loader_config)
            else:
                raise ValueError(f"Unknown loader type: {loader_type}")

        return self._loaders[loader_type]
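    # Note: loader_config above forwards the shared chunking settings plus any
    # remaining config keys, so every specialized loader is constructed with
    # the same effective configuration as this adaptive loader.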

    def _get_loader_for_file(self, source: Union[str, Path]) -> Optional[Loader]:
        """Get the appropriate loader for a file based on its type."""
        file_type = self._detect_file_type(source)

        # These types don't have specialized loaders
        if file_type in ('text', 'json'):
            return None

        return self._get_loader(file_type)

    # =========================================================================
    # Loader Interface Implementation
    # =========================================================================

    def run(self, source: str) -> List[str]:
        """
        Load data from source.

        Args:
            source: File path or raw text

        Returns:
            List of text chunks
        """
        # Check if source is a file path
        if os.path.isfile(source):
            return self.load_as_chunks(source)
        else:
            # Treat as raw text
            return self._process_text(source)

    def load_as_text(self, source: Union[str, Path]) -> str:
        """
        Load document and return as a single text string.

        Detects the file type and delegates to the appropriate specialized loader.

        Args:
            source: Path to document file or raw text

        Returns:
            Full text content as a single string
        """
        if isinstance(source, Path):
            source = str(source)

        # Check if source is a file path
        if not os.path.isfile(source):
            # Treat as raw text
            return source

        file_type = self._detect_file_type(source)
        logger.info(f"Loading file as text: {source} (type: {file_type})")

        # Handle text files directly
        if file_type == 'text':
            return self._load_text_file_as_text(source)

        # Handle JSON files directly
        if file_type == 'json':
            return self._load_json_file_as_text(source)

        # Delegate to specialized loader
        loader = self._get_loader(file_type)
        return loader.load_as_text(source)

    def load_as_chunks(self, source: Union[str, Path]) -> List[str]:
        """
        Load document and return as a list of text chunks.

        Detects the file type and delegates to the appropriate specialized loader.

        Args:
            source: Path to document file or raw text

        Returns:
            List of text chunks
        """
        if isinstance(source, Path):
            source = str(source)

        # Check if source is a file path
        if not os.path.isfile(source):
            # Treat as raw text
            return self._process_text(source)

        file_type = self._detect_file_type(source)
        logger.info(f"Loading file as chunks: {source} (type: {file_type})")

        # Handle text files directly
        if file_type == 'text':
            return self._load_text_file_as_chunks(source)

        # Handle JSON files directly
        if file_type == 'json':
            return self._load_json_file_as_chunks(source)

        # Delegate to specialized loader
        loader = self._get_loader(file_type)
        return loader.load_as_chunks(source)

    def load_as_nodes(
        self,
        source: Union[str, Path],
        source_id: Optional[str] = None,
        custom_metadata: Optional[Dict[str, Any]] = None
    ) -> List[Node]:
        """
        Load document and return as Node objects with metadata.

        Detects the file type and delegates to the appropriate specialized loader.

        Args:
            source: Path to document file or raw text
            source_id: Optional source identifier (defaults to file path)
            custom_metadata: Optional custom metadata to attach to nodes

        Returns:
            List of Node objects with text chunks and metadata
        """
        if isinstance(source, Path):
            source = str(source)

        # Check if source is a file path
        if not os.path.isfile(source):
            # Treat as raw text - create nodes manually
            chunks = self._process_text(source)
            return self._chunks_to_nodes(chunks, source_id or "text_input", custom_metadata)

        file_type = self._detect_file_type(source)
        source_path = Path(source)
        logger.info(
            f"Loading file as nodes: {source_path.name} (detected type: {file_type}, extension: {source_path.suffix})")

        # Handle text files directly
        if file_type == 'text':
            chunks = self._load_text_file_as_chunks(source)
            return self._chunks_to_nodes(chunks, source_id or source, custom_metadata)

        # Handle JSON files directly
        if file_type == 'json':
            chunks = self._load_json_file_as_chunks(source)
            return self._chunks_to_nodes(chunks, source_id or source, custom_metadata)

        # Delegate to specialized loader
        loader = self._get_loader(file_type)
        return loader.load_as_nodes(source, source_id, custom_metadata)

    def load_as_vsfile(
        self,
        file_path: Union[str, Path],
        custom_metadata: Optional[Dict[str, Any]] = None
    ) -> VSFile:
        """
        Load document and return as VSFile object.

        Detects the file type and delegates to the appropriate specialized loader.

        Args:
            file_path: Path to document file
            custom_metadata: Optional custom metadata

        Returns:
            VSFile object with nodes
        """
        if isinstance(file_path, Path):
            file_path = str(file_path)

        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_type = self._detect_file_type(file_path)
        logger.info(f"Loading file as VSFile: {file_path} (type: {file_type})")

        # Handle text files directly
        if file_type == 'text':
            vsfile = VSFile(file_path)
            chunks = self._load_text_file_as_chunks(file_path)
            vsfile.nodes = self._chunks_to_nodes(
                chunks, str(vsfile.uuid), custom_metadata)
            vsfile.processed = True
            return vsfile

        # Handle JSON files directly
        if file_type == 'json':
            vsfile = VSFile(file_path)
            chunks = self._load_json_file_as_chunks(file_path)
            vsfile.nodes = self._chunks_to_nodes(
                chunks, str(vsfile.uuid), custom_metadata)
            vsfile.processed = True
            return vsfile

        # Delegate to specialized loader
        loader = self._get_loader(file_type)
        return loader.load_as_vsfile(file_path, custom_metadata)

    # =========================================================================
    # Helper Methods
    # =========================================================================

    def _chunks_to_nodes(
        self,
        chunks: List[str],
        source_id: str,
        custom_metadata: Optional[Dict[str, Any]] = None
    ) -> List[Node]:
        """Convert text chunks to Node objects."""
        nodes = []
        for idx, chunk in enumerate(chunks):
            metadata = NodeMetadata(
                source_file_uuid=source_id,
                position=idx,
                custom=custom_metadata or {}
            )
            node = Node(content=chunk, metadata=metadata)
            nodes.append(node)
        return nodes

    def _process_text(self, text: str) -> List[str]:
        """Process plain text into chunks."""
        if not text or not text.strip():
            return []
        # chunk_text returns list of dicts with 'text' key
        chunk_dicts = self._chunker.chunk_text(text, "text")
        return [chunk['text'] for chunk in chunk_dicts]

    def _load_text_file_as_text(self, file_path: str) -> str:
        """Load plain text file and return as single string."""
        try:
            with open(file_path, 'r', encoding=self._encoding) as f:
                return f.read()
        except Exception as e:
            logger.error(f"Error loading text file {file_path}: {e}")
            raise

    def _load_text_file_as_chunks(self, file_path: str) -> List[str]:
        """Load plain text file and return as chunks."""
        content = self._load_text_file_as_text(file_path)
        return self._process_text(content)

    def _load_json_file_as_text(self, file_path: str) -> str:
        """Load JSON file and return as formatted string."""
        try:
            with open(file_path, 'r', encoding=self._encoding) as f:
                data = json.load(f)
                return json.dumps(data, indent=2)
        except Exception as e:
            logger.error(f"Error loading JSON file {file_path}: {e}")
            raise

    def _load_json_file_as_chunks(self, file_path: str) -> List[str]:
        """Load JSON file and return as chunks."""
        try:
            with open(file_path, 'r', encoding=self._encoding) as f:
                data = json.load(f)

            if isinstance(data, dict):
                # Return entire dict as single chunk
                return [json.dumps(data, indent=2)]
            elif isinstance(data, list):
                # Each item becomes a chunk
                return [json.dumps(item, indent=2) for item in data if item]
            else:
                return [str(data)]
        except Exception as e:
            logger.error(f"Error loading JSON file {file_path}: {e}")
            raise


def create_adaptive_loader(
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    encoding: str = 'utf-8',
    **kwargs
) -> AdaptiveLoader:
    """
    Factory function to create an adaptive loader.

    Args:
        chunk_size: Size of text chunks
        chunk_overlap: Overlap between chunks
        encoding: Text encoding
        **kwargs: Additional configuration options passed to specialized loaders

    Returns:
        Configured adaptive loader
    """
    config = {
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
        'encoding': encoding,
        **kwargs
    }

    return AdaptiveLoader(config=config)


__all__ = ["AdaptiveLoader", "create_adaptive_loader"]
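For orientation, a minimal usage sketch of the module above. It assumes the package is installed together with rakam_systems_core (which provides the Loader base class) and that the sample file names (report.pdf, notes.md) are placeholders; the method names, arguments, and defaults are taken directly from the listing.

import logging

from rakam_systems_vectorstore.components.loader.adaptive_loader import (
    create_adaptive_loader,
)

logging.basicConfig(level=logging.INFO)

# Factory wraps AdaptiveLoader(config=...): 512-token chunks, 50-token overlap.
loader = create_adaptive_loader(chunk_size=512, chunk_overlap=50)

# An existing file path is routed by extension: report.pdf -> PdfLoaderLight.
pdf_chunks = loader.load_as_chunks("report.pdf")  # hypothetical sample file

# A string that is not a file path is treated as raw text and chunked directly.
text_chunks = loader.load_as_chunks("Raw text passed straight to the chunker.")

# Nodes carry positional metadata plus any custom fields supplied here.
nodes = loader.load_as_nodes("notes.md", custom_metadata={"project": "demo"})
for node in nodes:
    print(node.metadata.position, node.content[:60])

Note the asymmetry the listing encodes: run, load_as_text, load_as_chunks, and load_as_nodes all fall back to treating a non-path string as raw text, while load_as_vsfile requires a real file and raises FileNotFoundError otherwise.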