rakam-systems-vectorstore 0.1.1rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40):
  1. rakam_systems_vectorstore/MANIFEST.in +26 -0
  2. rakam_systems_vectorstore/README.md +1071 -0
  3. rakam_systems_vectorstore/__init__.py +93 -0
  4. rakam_systems_vectorstore/components/__init__.py +0 -0
  5. rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
  6. rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
  7. rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
  8. rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
  9. rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
  10. rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
  11. rakam_systems_vectorstore/components/loader/__init__.py +31 -0
  12. rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
  13. rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
  14. rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
  15. rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
  16. rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
  17. rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
  18. rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
  19. rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
  20. rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
  21. rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
  22. rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
  23. rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
  24. rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
  25. rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
  26. rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
  27. rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
  28. rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
  29. rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
  30. rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
  31. rakam_systems_vectorstore/config.py +266 -0
  32. rakam_systems_vectorstore/core.py +8 -0
  33. rakam_systems_vectorstore/pyproject.toml +113 -0
  34. rakam_systems_vectorstore/server/README.md +290 -0
  35. rakam_systems_vectorstore/server/__init__.py +20 -0
  36. rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
  37. rakam_systems_vectorstore/setup.py +103 -0
  38. rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
  39. rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
  40. rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
@@ -0,0 +1,597 @@
1
+ """
2
+ Tabular Data Loader for loading spreadsheet and tabular files without chunking.
3
+
4
+ This loader extracts data from tabular files (XLSX, CSV, TSV, etc.) where each row
5
+ becomes a single node. Each node's content is formatted with each field on its own
6
+ line, separated by blank lines:
7
+
8
+ Column_name_1: value_1
9
+
10
+ Column_name_2: value_2
11
+
12
+ ...
13
+
14
+ Empty values are explicitly shown in the node content.
15
+
16
+ Features:
17
+ - No chunking - each row is one node
18
+ - All columns included with their names
19
+ - Readable format with blank line separation between fields
20
+ - Empty values explicitly shown
21
+ - Supports Excel files (.xlsx, .xls)
22
+ - Supports CSV files (.csv)
23
+ - Supports TSV files (.tsv)
24
+ - Supports other delimiter-separated files
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import csv
30
+ import os
31
+ import time
32
+ from pathlib import Path
33
+ from typing import Any, Dict, List, Optional, Union
34
+
35
+ from rakam_systems_core.ai_utils import logging
36
+ from rakam_systems_core.ai_core.interfaces.loader import Loader
37
+ from rakam_systems_vectorstore.core import Node, NodeMetadata, VSFile
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
# File-extension groups recognized by the loader, keyed by backend:
# openpyxl handles the Excel group; the csv module handles the rest.
XLSX_EXTENSIONS = ['.xlsx', '.xls']
CSV_EXTENSIONS = ['.csv']
TSV_EXTENSIONS = ['.tsv']
# Union of every supported extension, in backend order.
TABULAR_EXTENSIONS = [*XLSX_EXTENSIONS, *CSV_EXTENSIONS, *TSV_EXTENSIONS]
46
+
47
+
48
class TabularLoader(Loader):
    """
    Tabular data loader that converts each row into a node without chunking.

    This loader provides simple tabular file processing where:
    - Each row becomes one node (chunk)
    - First row is treated as headers (column names)
    - Node content format: each field on its own line, separated by blank lines
    - Empty values are explicitly shown as empty
    - No chunking is performed

    Supported file types:
    - Excel files (.xlsx, .xls)
    - CSV files (.csv)
    - TSV files (.tsv)
    - Other delimiter-separated files (configurable delimiter)
    """

    def __init__(
        self,
        name: str = "tabular_loader",
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize tabular data loader.

        Args:
            name: Component name
            config: Optional configuration with keys:
                - sheet_name: Name or index of sheet to load for Excel files (default: 0 for first sheet)
                - skip_empty_rows: Whether to skip completely empty rows (default: True)
                - empty_value_text: Text to show for empty values (default: "")
                - delimiter: Delimiter for CSV/text files (default: auto-detect based on extension)
                - encoding: File encoding for CSV/text files (default: 'utf-8')
                - has_header: Whether first row contains headers (default: True)
        """
        super().__init__(name=name, config=config)

        # Extract configuration with defaults; `None` delimiter means
        # "pick by extension" later in _extract_rows_csv.
        config = config or {}
        self._sheet_name = config.get('sheet_name', 0)  # 0 = first sheet
        self._skip_empty_rows = config.get('skip_empty_rows', True)
        self._empty_value_text = config.get('empty_value_text', "")
        self._delimiter = config.get('delimiter', None)  # None = auto-detect
        self._encoding = config.get('encoding', 'utf-8')
        self._has_header = config.get('has_header', True)

        logger.info(
            f"Initialized TabularLoader with sheet_name={self._sheet_name}, skip_empty_rows={self._skip_empty_rows}")

    def run(self, source: str) -> List[str]:
        """
        Execute the primary operation for the component.

        This method satisfies the BaseComponent abstract method requirement
        and delegates to load_as_chunks.

        Args:
            source: Path to tabular file (XLSX, CSV, TSV, etc.)

        Returns:
            List of formatted strings, one per row (excluding header)
        """
        return self.load_as_chunks(source)

    def load_as_text(
        self,
        source: Union[str, Path],
    ) -> str:
        """
        Load tabular file and return as a single text string.

        This method extracts all rows from the file and returns them as a single
        string without chunking. Each row is on its own line.

        Args:
            source: Path to tabular file (XLSX, CSV, TSV, etc.)

        Returns:
            Full text content as a single string (all rows joined by newlines)

        Raises:
            FileNotFoundError: If source file doesn't exist
            ValueError: If source is not a supported tabular file
            Exception: If file processing fails
        """
        if isinstance(source, Path):
            source = str(source)

        if not os.path.isfile(source):
            raise FileNotFoundError(f"File not found: {source}")

        if not self._is_supported_file(source):
            raise ValueError(
                f"File is not a supported tabular format: {source}")

        logger.info(f"Loading tabular file as text: {source}")
        start_time = time.time()

        try:
            row_strings = self._extract_rows(source)

            # NOTE: individual rows already use blank-line field separation,
            # so a single newline here keeps rows distinguishable.
            full_text = "\n".join(row_strings)

            elapsed = time.time() - start_time
            logger.info(
                f"Tabular file loaded as text in {elapsed:.2f}s: {len(full_text)} characters, {len(row_strings)} rows")

            return full_text

        except Exception as e:
            logger.error(f"Error loading tabular file as text {source}: {e}")
            raise

    def load_as_chunks(
        self,
        source: Union[str, Path],
    ) -> List[str]:
        """
        Load tabular file and return as a list of text chunks.

        Each row becomes one chunk (no text-based chunking is performed).
        Each chunk is formatted with each field on its own line, separated by blank lines.

        Args:
            source: Path to tabular file (XLSX, CSV, TSV, etc.)

        Returns:
            List of formatted strings, one per row (excluding header)

        Raises:
            FileNotFoundError: If source file doesn't exist
            ValueError: If source is not a supported tabular file
            Exception: If file processing fails
        """
        if isinstance(source, Path):
            source = str(source)

        if not os.path.isfile(source):
            raise FileNotFoundError(f"File not found: {source}")

        if not self._is_supported_file(source):
            raise ValueError(
                f"File is not a supported tabular format: {source}")

        logger.info(f"Loading tabular file: {source}")
        start_time = time.time()

        try:
            row_strings = self._extract_rows(source)

            elapsed = time.time() - start_time
            logger.info(
                f"Tabular file processed in {elapsed:.2f}s: {len(row_strings)} rows")

            return row_strings

        except Exception as e:
            logger.error(f"Error processing tabular file {source}: {e}")
            raise

    def _get_file_type(self, source: str) -> str:
        """
        Determine the file type based on extension.

        Args:
            source: Path to file

        Returns:
            File type: 'xlsx', 'csv', 'tsv', or 'unknown'
        """
        ext = Path(source).suffix.lower()
        if ext in XLSX_EXTENSIONS:
            return 'xlsx'
        elif ext in CSV_EXTENSIONS:
            return 'csv'
        elif ext in TSV_EXTENSIONS:
            return 'tsv'
        return 'unknown'

    def _extract_rows(self, source: str) -> List[str]:
        """
        Extract rows from tabular file and format them as strings.

        Dispatches to the XLSX or CSV backend based on file extension;
        unknown extensions fall back to the CSV path.

        Args:
            source: Path to tabular file

        Returns:
            List of formatted row strings
        """
        file_type = self._get_file_type(source)

        if file_type == 'xlsx':
            return self._extract_rows_xlsx(source)
        elif file_type in ('csv', 'tsv'):
            return self._extract_rows_csv(source, file_type)
        else:
            # Try to load as CSV with auto-detected or configured delimiter
            return self._extract_rows_csv(source, 'csv')

    def _extract_rows_xlsx(self, source: str) -> List[str]:
        """
        Extract rows from XLSX file and format them as strings.

        Args:
            source: Path to XLSX file

        Returns:
            List of formatted row strings

        Raises:
            ImportError: If openpyxl is not installed
        """
        try:
            import openpyxl
        except ImportError:
            raise ImportError(
                "openpyxl is required for XLSX loading. Install it with: pip install openpyxl"
            )

        # data_only=True resolves formulas to their cached values.
        workbook = openpyxl.load_workbook(source, data_only=True)

        # Sheet may be selected by index (int) or by name (str).
        if isinstance(self._sheet_name, int):
            sheet = workbook.worksheets[self._sheet_name]
        else:
            sheet = workbook[self._sheet_name]

        logger.info(f"Processing sheet: {sheet.title}")

        rows = list(sheet.iter_rows(values_only=True))

        if not rows:
            logger.warning(f"No rows found in sheet {sheet.title}")
            return []

        return self._process_raw_rows(rows)

    def _extract_rows_csv(self, source: str, file_type: str) -> List[str]:
        """
        Extract rows from CSV/TSV file and format them as strings.

        Args:
            source: Path to CSV/TSV file
            file_type: 'csv' or 'tsv'

        Returns:
            List of formatted row strings
        """
        # Explicit delimiter wins; otherwise pick by file type.
        if self._delimiter is not None:
            delimiter = self._delimiter
        elif file_type == 'tsv':
            delimiter = '\t'
        else:
            delimiter = ','

        logger.info(
            f"Processing {file_type.upper()} file with delimiter: {repr(delimiter)}")

        rows = []
        # newline='' is required by the csv module for correct quoting.
        with open(source, 'r', encoding=self._encoding, newline='') as f:
            reader = csv.reader(f, delimiter=delimiter)
            for row in reader:
                rows.append(tuple(row))

        if not rows:
            logger.warning(f"No rows found in file: {source}")
            return []

        return self._process_raw_rows(rows)

    def _process_raw_rows(self, rows: List[tuple]) -> List[str]:
        """
        Process raw rows (from any source) into formatted strings.

        Args:
            rows: List of row tuples

        Returns:
            List of formatted row strings
        """
        if not rows:
            return []

        # First row is headers (if configured)
        if self._has_header:
            headers = rows[0]
            data_rows = rows[1:]
            start_row_idx = 2  # For logging purposes (1-based, row 1 = header)
        else:
            # Generate column names if no header
            headers = tuple(f"Column_{i+1}" for i in range(len(rows[0])))
            data_rows = rows
            start_row_idx = 1

        # Convert None/empty headers to synthetic column names
        headers = [str(h) if h is not None and str(h).strip() != "" else f"Column_{i+1}"
                   for i, h in enumerate(headers)]

        logger.info(f"Found {len(headers)} columns: {headers}")

        row_strings = []
        for row_idx, row in enumerate(data_rows, start=start_row_idx):
            # Skip rows whose every cell is None/blank if configured
            if self._skip_empty_rows and all(
                cell is None or str(cell).strip() == "" for cell in row
            ):
                logger.debug(f"Skipping empty row {row_idx}")
                continue

            row_string = self._format_row(headers, row)
            row_strings.append(row_string)

        return row_strings

    def load_as_nodes(
        self,
        source: Union[str, Path],
        source_id: Optional[str] = None,
        custom_metadata: Optional[Dict[str, Any]] = None
    ) -> List[Node]:
        """
        Load tabular file and return as Node objects with metadata.

        Args:
            source: Path to tabular file (XLSX, CSV, TSV, etc.)
            source_id: Optional source identifier (defaults to file path)
            custom_metadata: Optional custom metadata to attach to nodes

        Returns:
            List of Node objects, one per row with metadata
        """
        if isinstance(source, Path):
            source = str(source)

        row_strings = self.load_as_chunks(source)

        if source_id is None:
            source_id = source

        # One node per formatted row; position records the row order.
        nodes = []
        for idx, row_string in enumerate(row_strings):
            metadata = NodeMetadata(
                source_file_uuid=source_id,
                position=idx,
                custom=custom_metadata or {}
            )
            node = Node(content=row_string, metadata=metadata)
            nodes.append(node)

        logger.info(f"Created {len(nodes)} nodes from tabular file: {source}")
        return nodes

    def load_as_vsfile(
        self,
        file_path: Union[str, Path],
        custom_metadata: Optional[Dict[str, Any]] = None
    ) -> VSFile:
        """
        Load tabular file and return as VSFile object.

        Args:
            file_path: Path to tabular file (XLSX, CSV, TSV, etc.)
            custom_metadata: Optional custom metadata

        Returns:
            VSFile object with nodes

        Raises:
            FileNotFoundError: If file doesn't exist
            ValueError: If file is not a supported tabular format
        """
        if isinstance(file_path, Path):
            file_path = str(file_path)

        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        if not self._is_supported_file(file_path):
            raise ValueError(
                f"File is not a supported tabular format: {file_path}")

        vsfile = VSFile(file_path)

        # Nodes are tagged with the VSFile's uuid as their source id.
        nodes = self.load_as_nodes(
            file_path, str(vsfile.uuid), custom_metadata)
        vsfile.nodes = nodes
        vsfile.processed = True

        logger.info(
            f"Created VSFile with {len(nodes)} nodes from: {file_path}")
        return vsfile

    def _is_supported_file(self, file_path: str) -> bool:
        """
        Check if file is a supported tabular format based on extension.

        Args:
            file_path: Path to file

        Returns:
            True if file is a supported tabular format, False otherwise
        """
        path = Path(file_path)
        return path.suffix.lower() in TABULAR_EXTENSIONS

    def _is_xlsx_file(self, file_path: str) -> bool:
        """
        Check if file is an XLSX based on extension.

        Args:
            file_path: Path to file

        Returns:
            True if file is an XLSX, False otherwise
        """
        path = Path(file_path)
        return path.suffix.lower() in XLSX_EXTENSIONS

    def _format_row(self, headers: List[str], row: tuple) -> str:
        """
        Format a row with each field on its own line, separated by blank lines.

        Format:
            Column_name_1: value_1

            Column_name_2: value_2

            ...

        Args:
            headers: List of column names
            row: Tuple of cell values

        Returns:
            Formatted string representation of the row
        """
        # FIX: pad short rows with None so every header is represented.
        # Previously zip() silently dropped trailing columns when a row had
        # fewer cells than headers (common with ragged CSV rows), breaking
        # the documented "all columns included / empty values shown" contract.
        if len(row) < len(headers):
            row = tuple(row) + (None,) * (len(headers) - len(row))

        parts = []
        for header, value in zip(headers, row):
            # Convert value to string, handling None and empty values
            if value is None or (isinstance(value, str) and value.strip() == ""):
                value_str = self._empty_value_text
            else:
                value_str = str(value)

            parts.append(f"{header}: {value_str}")

        # Join with double newline for better readability
        return "\n\n".join(parts)
515
+
516
+
517
def create_tabular_loader(
    sheet_name: Union[int, str] = 0,
    skip_empty_rows: bool = True,
    empty_value_text: str = "",
    delimiter: Optional[str] = None,
    encoding: str = 'utf-8',
    has_header: bool = True
) -> TabularLoader:
    """
    Factory function to create a tabular data loader.

    Args:
        sheet_name: Name or index of sheet to load for Excel files (default: 0 for first sheet)
        skip_empty_rows: Whether to skip completely empty rows (default: True)
        empty_value_text: Text to show for empty values (default: "")
        delimiter: Delimiter for CSV/text files (default: auto-detect based on extension)
        encoding: File encoding for CSV/text files (default: 'utf-8')
        has_header: Whether first row contains headers (default: True)

    Returns:
        Configured tabular loader

    Example:
        >>> # Load Excel file
        >>> loader = create_tabular_loader()
        >>> rows = loader.run("data/spreadsheet.xlsx")
        >>> print(f"Extracted {len(rows)} rows")

        >>> # Load file with custom delimiter (e.g., pipe-separated)
        >>> loader = create_tabular_loader(delimiter='|')
        >>> rows = loader.run("data/data.txt")

        >>> # Load specific sheet from Excel
        >>> loader = create_tabular_loader(sheet_name="Sheet2")
        >>> rows = loader.run("data/spreadsheet.xlsx")

        >>> # Load file without header row
        >>> loader = create_tabular_loader(has_header=False)
        >>> rows = loader.run("data/data.csv")

        >>> # Show empty values explicitly
        >>> loader = create_tabular_loader(empty_value_text="<empty>")
        >>> rows = loader.run("data/spreadsheet.xlsx")
    """
    # Forward every keyword straight into the loader's config dict;
    # the component name keeps its default ("tabular_loader").
    return TabularLoader(config={
        'sheet_name': sheet_name,
        'skip_empty_rows': skip_empty_rows,
        'empty_value_text': empty_value_text,
        'delimiter': delimiter,
        'encoding': encoding,
        'has_header': has_header,
    })
579
+
580
+
581
# Backward compatibility aliases: this module was originally XLSX-only,
# so older call sites import the Xlsx* names. Both resolve to the
# generalized tabular implementations.
XlsxLoader = TabularLoader
create_xlsx_loader = create_tabular_loader


# Public API of this module.
__all__ = [
    "TabularLoader",
    "create_tabular_loader",
    # Backward compatibility
    "XlsxLoader",
    "create_xlsx_loader",
    # Constants
    "TABULAR_EXTENSIONS",
    "XLSX_EXTENSIONS",
    "CSV_EXTENSIONS",
    "TSV_EXTENSIONS",
]
@@ -0,0 +1,10 @@
1
+ """
2
+ Django App Configuration for Vector Store
3
+ """
4
+ from django.apps import AppConfig
5
+
6
+
7
class VectorStoreConfig(AppConfig):
    """Django application config for the vector store component."""

    # Use 64-bit auto-incrementing primary keys by default.
    default_auto_field = "django.db.models.BigAutoField"
    # Dotted path of the app package within the distribution.
    name = "rakam_systems_vectorstore.components.vectorstore"
    label = "application"  # Match the app_label in models