rakam-systems-vectorstore 0.1.1rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. rakam_systems_vectorstore/MANIFEST.in +26 -0
  2. rakam_systems_vectorstore/README.md +1071 -0
  3. rakam_systems_vectorstore/__init__.py +93 -0
  4. rakam_systems_vectorstore/components/__init__.py +0 -0
  5. rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
  6. rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
  7. rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
  8. rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
  9. rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
  10. rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
  11. rakam_systems_vectorstore/components/loader/__init__.py +31 -0
  12. rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
  13. rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
  14. rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
  15. rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
  16. rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
  17. rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
  18. rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
  19. rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
  20. rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
  21. rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
  22. rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
  23. rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
  24. rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
  25. rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
  26. rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
  27. rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
  28. rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
  29. rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
  30. rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
  31. rakam_systems_vectorstore/config.py +266 -0
  32. rakam_systems_vectorstore/core.py +8 -0
  33. rakam_systems_vectorstore/pyproject.toml +113 -0
  34. rakam_systems_vectorstore/server/README.md +290 -0
  35. rakam_systems_vectorstore/server/__init__.py +20 -0
  36. rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
  37. rakam_systems_vectorstore/setup.py +103 -0
  38. rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
  39. rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
  40. rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
@@ -0,0 +1,556 @@
1
+ """
2
+ EML Loader for processing email files (.eml format).
3
+
4
+ This loader uses Python's email library to extract text content from EML files.
5
+ It supports:
6
+ - Email header extraction (From, To, Subject, Date)
7
+ - Plain text email body extraction
8
+ - HTML email body extraction with text conversion
9
+ - Multipart email parsing
10
+ - Text-based chunking using TextChunker
11
+
12
+ The extracted content is chunked and returned as text or Node objects.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import email
18
+ import os
19
+ import time
20
+ from email import policy
21
+ from email.parser import BytesParser
22
+ from pathlib import Path
23
+ from typing import Any, Dict, List, Optional, Union
24
+
25
+ from rakam_systems_core.ai_utils import logging
26
+ from rakam_systems_core.ai_core.interfaces.loader import Loader
27
+ from rakam_systems_vectorstore.components.chunker import TextChunker
28
+ from rakam_systems_vectorstore.core import Node, NodeMetadata, VSFile
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
class EmlLoader(Loader):
    """
    EML loader for processing email files.

    This loader provides EML file processing with support for:
    - Email header extraction (From, To, Subject, Date)
    - Plain text and HTML email body extraction
    - Multipart email parsing
    - Text-based chunking with configurable parameters

    The extracted content is chunked using TextChunker and returned as text or Node objects.
    """

    # Default configuration
    DEFAULT_CHUNK_SIZE = 3000
    DEFAULT_CHUNK_OVERLAP = 200
    DEFAULT_MIN_SENTENCES_PER_CHUNK = 5
    DEFAULT_TOKENIZER = "character"

    def __init__(
        self,
        name: str = "eml_loader",
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize EML loader.

        Args:
            name: Component name
            config: Optional configuration with keys:
                - chunk_size: Maximum tokens per chunk (default: 3000)
                - chunk_overlap: Overlap between chunks in tokens (default: 200)
                - min_sentences_per_chunk: Minimum sentences per chunk (default: 5)
                - tokenizer: Tokenizer for chunking (default: "character")
                - include_headers: Whether to include email headers in output (default: True)
                - extract_html: Whether to extract text from HTML parts (default: True)
        """
        super().__init__(name=name, config=config)

        # Extract configuration, falling back to class-level defaults.
        config = config or {}
        self._chunk_size = config.get('chunk_size', self.DEFAULT_CHUNK_SIZE)
        self._chunk_overlap = config.get(
            'chunk_overlap', self.DEFAULT_CHUNK_OVERLAP)
        self._min_sentences_per_chunk = config.get(
            'min_sentences_per_chunk', self.DEFAULT_MIN_SENTENCES_PER_CHUNK)
        self._tokenizer = config.get('tokenizer', self.DEFAULT_TOKENIZER)
        self._include_headers = config.get('include_headers', True)
        self._extract_html = config.get('extract_html', True)

        # One chunker instance is shared by all load_as_chunks calls.
        self._chunker = TextChunker(
            chunk_size=self._chunk_size,
            chunk_overlap=self._chunk_overlap,
            min_sentences_per_chunk=self._min_sentences_per_chunk,
            tokenizer=self._tokenizer
        )

        logger.info(
            f"Initialized EmlLoader with chunk_size={self._chunk_size}, chunk_overlap={self._chunk_overlap}")

    def run(self, source: str) -> List[str]:
        """
        Execute the primary operation for the component.

        This method satisfies the BaseComponent abstract method requirement
        and delegates to load_as_chunks.

        Args:
            source: Path to EML file

        Returns:
            List of text chunks extracted from the EML file
        """
        return self.load_as_chunks(source)

    def load_as_text(
        self,
        source: Union[str, Path],
    ) -> str:
        """
        Load EML and return as a single text string.

        This method extracts all text from the EML file and returns it as a single
        string without chunking. Useful when you need the full email content.

        Args:
            source: Path to EML file

        Returns:
            Full text content of the EML as a single string

        Raises:
            FileNotFoundError: If source file doesn't exist
            ValueError: If source is not an EML file
            Exception: If EML processing fails
        """
        source = self._validate_source(source)

        logger.info(f"Loading EML as text: {source}")
        start_time = time.time()

        try:
            full_text = self._extract_text_from_eml(source)

            elapsed = time.time() - start_time
            logger.info(
                f"EML loaded as text in {elapsed:.2f}s: {len(full_text)} characters")

            return full_text

        except Exception as e:
            logger.error(f"Error loading EML as text {source}: {e}")
            raise

    def load_as_chunks(
        self,
        source: Union[str, Path],
    ) -> List[str]:
        """
        Load EML and return as a list of text chunks.

        This method extracts text from the EML file, processes it with the configured
        chunker, and returns a list of text chunks.

        Args:
            source: Path to EML file

        Returns:
            List of text chunks extracted from the EML file

        Raises:
            FileNotFoundError: If source file doesn't exist
            ValueError: If source is not an EML file
            Exception: If EML processing fails
        """
        source = self._validate_source(source)

        logger.info(f"Loading EML file: {source}")
        start_time = time.time()

        try:
            full_text = self._extract_text_from_eml(source)

            # Chunk the text using TextChunker
            text_chunks = self._chunk_text(full_text)

            elapsed = time.time() - start_time
            logger.info(
                f"EML processed in {elapsed:.2f}s: {len(text_chunks)} chunks")

            return text_chunks

        except Exception as e:
            logger.error(f"Error processing EML {source}: {e}")
            raise

    def load_as_nodes(
        self,
        source: Union[str, Path],
        source_id: Optional[str] = None,
        custom_metadata: Optional[Dict[str, Any]] = None
    ) -> List[Node]:
        """
        Load EML and return as Node objects with metadata.

        Each EML file is loaded as a single node (one email = one node).

        Args:
            source: Path to EML file
            source_id: Optional source identifier (defaults to file path)
            custom_metadata: Optional custom metadata to attach to nodes

        Returns:
            List of Node objects (single node containing the full email)
        """
        if isinstance(source, Path):
            source = str(source)

        # Load full email text (no chunking); validation happens inside.
        full_text = self.load_as_text(source)

        if source_id is None:
            source_id = source

        # Create single node with metadata
        metadata = NodeMetadata(
            source_file_uuid=source_id,
            position=0,
            custom=custom_metadata or {}
        )
        node = Node(content=full_text, metadata=metadata)

        logger.info(f"Created 1 node from EML: {source}")
        return [node]

    def load_as_vsfile(
        self,
        file_path: Union[str, Path],
        custom_metadata: Optional[Dict[str, Any]] = None
    ) -> VSFile:
        """
        Load EML and return as VSFile object.

        Args:
            file_path: Path to EML file
            custom_metadata: Optional custom metadata

        Returns:
            VSFile object with nodes

        Raises:
            FileNotFoundError: If file doesn't exist
            ValueError: If file is not an EML
        """
        if isinstance(file_path, Path):
            file_path = str(file_path)

        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        if not self._is_eml_file(file_path):
            raise ValueError(f"File is not an EML: {file_path}")

        # Create VSFile
        vsfile = VSFile(file_path)

        # Load and create nodes keyed by the VSFile's uuid.
        nodes = self.load_as_nodes(
            file_path, str(vsfile.uuid), custom_metadata)
        vsfile.nodes = nodes
        vsfile.processed = True

        logger.info(
            f"Created VSFile with {len(nodes)} nodes from: {file_path}")
        return vsfile

    def _validate_source(self, source: Union[str, Path]) -> str:
        """
        Normalize and validate an EML source path.

        Shared by load_as_text and load_as_chunks so the validation logic
        (and its error messages) live in exactly one place.

        Args:
            source: Path to EML file (str or Path)

        Returns:
            The source path as a string

        Raises:
            FileNotFoundError: If source file doesn't exist
            ValueError: If source is not an EML file
        """
        if isinstance(source, Path):
            source = str(source)

        if not os.path.isfile(source):
            raise FileNotFoundError(f"File not found: {source}")

        if not self._is_eml_file(source):
            raise ValueError(
                f"File is not an EML: {source}. Extension: {Path(source).suffix}")

        return source

    def _is_eml_file(self, file_path: str) -> bool:
        """
        Check if file is an EML based on extension.

        Args:
            file_path: Path to file

        Returns:
            True if file is an EML, False otherwise
        """
        # Extension check only; the file content is not sniffed.
        return Path(file_path).suffix.lower() == '.eml'

    def _extract_text_from_eml(self, eml_path: str) -> str:
        """
        Extract text from EML file including headers and body.

        Args:
            eml_path: Path to EML file

        Returns:
            Extracted text content (headers block, blank line, body)
        """
        try:
            # Parse the EML file with the modern default policy so parts
            # expose get_content().
            with open(eml_path, 'rb') as f:
                msg = BytesParser(policy=policy.default).parse(f)

            text_parts = []

            # Headers are optional via the include_headers config flag.
            if self._include_headers:
                headers_text = self._extract_headers(msg)
                if headers_text:
                    text_parts.append(headers_text)

            body_text = self._extract_body(msg)
            if body_text:
                text_parts.append(body_text)

            full_text = "\n\n".join(text_parts)

            logger.debug(f"Extracted {len(full_text)} characters from EML")
            return full_text

        except Exception as e:
            logger.error(f"Failed to extract text from EML: {e}")
            raise

    def _extract_headers(self, msg: email.message.EmailMessage) -> str:
        """
        Extract relevant email headers.

        Args:
            msg: Email message object

        Returns:
            Formatted header text, one "Name: value" line per present header
        """
        # Order matters: it determines the order of lines in the output.
        header_names = ("Subject", "From", "To", "Date", "Cc")
        return "\n".join(
            f"{name}: {msg[name]}" for name in header_names if msg[name]
        )

    def _extract_part_text(
        self,
        part: email.message.EmailMessage,
        label: str,
    ) -> Optional[str]:
        """
        Extract stripped text from a single message part.

        Handles text/plain directly and text/html via _html_to_text when
        HTML extraction is enabled; other content types yield None.

        Args:
            part: Message (or message part) to read
            label: Suffix for log messages (" part" for multipart parts,
                "" for a single-part message) — keeps log text identical
                to the original per-branch messages

        Returns:
            Stripped text content, or None if nothing usable was extracted
        """
        content_type = part.get_content_type()

        if content_type == "text/plain":
            try:
                text = part.get_content()
                if text and text.strip():
                    return text.strip()
            except Exception as e:
                logger.warning(
                    f"Failed to extract plain text{label}: {e}")

        elif content_type == "text/html" and self._extract_html:
            try:
                text = self._html_to_text(part.get_content())
                if text and text.strip():
                    return text.strip()
            except Exception as e:
                logger.warning(f"Failed to extract HTML{label}: {e}")

        return None

    def _extract_body(self, msg: email.message.EmailMessage) -> str:
        """
        Extract email body content from plain text and/or HTML parts.

        Args:
            msg: Email message object

        Returns:
            Extracted body text (parts joined by blank lines)
        """
        body_parts = []

        if msg.is_multipart():
            for part in msg.walk():
                # Skip attachments — only inline content is wanted.
                content_disposition = str(part.get("Content-Disposition", ""))
                if "attachment" in content_disposition:
                    continue

                text = self._extract_part_text(part, " part")
                if text:
                    body_parts.append(text)
        else:
            # Single part message
            text = self._extract_part_text(msg, "")
            if text:
                body_parts.append(text)

        return "\n\n".join(body_parts)

    def _html_to_text(self, html: str) -> str:
        """
        Convert HTML to plain text.

        Args:
            html: HTML content

        Returns:
            Plain text extracted from HTML; the raw HTML is returned
            unchanged if beautifulsoup4 is unavailable or conversion fails
        """
        import re  # hoisted to function top; stdlib, always available

        try:
            from bs4 import BeautifulSoup

            # Use 'lxml' parser for better performance (falls back to html.parser if not available)
            try:
                soup = BeautifulSoup(html, 'lxml')
            except Exception:
                soup = BeautifulSoup(html, 'html.parser')

            # Remove script and style elements — they carry no readable text.
            for element in soup(["script", "style"]):
                element.decompose()

            # Get text - use separator for better text extraction
            text = soup.get_text(separator=' ', strip=True)

            # Clean up excessive whitespace
            text = re.sub(r'\s+', ' ', text)
            text = re.sub(r'\n\s*\n', '\n', text)

            return text.strip()

        except ImportError:
            logger.warning(
                "beautifulsoup4 not installed, returning HTML as-is")
            return html
        except Exception as e:
            logger.warning(f"Failed to convert HTML to text: {e}")
            return html

    def _chunk_text(self, text: str) -> List[str]:
        """
        Chunk text using TextChunker.

        Args:
            text: Full text to chunk

        Returns:
            List of text chunks; the whole text as a single chunk if the
            chunker fails, or [] for empty/whitespace-only input
        """
        if not text or not text.strip():
            return []

        try:
            # TextChunker returns dicts; only the 'text' field is needed here.
            chunk_dicts = self._chunker.chunk_text(text, context="eml")
            text_chunks = [chunk_dict['text'] for chunk_dict in chunk_dicts]

            logger.info(f"Chunked EML text into {len(text_chunks)} chunks")
            return text_chunks

        except Exception as e:
            logger.warning(f"Failed to chunk text with TextChunker: {e}")
            # Fall back to returning the whole text as a single chunk
            logger.info("Falling back to single chunk")
            return [text]
511
+
512
+
513
def create_eml_loader(
    chunk_size: int = 3000,
    chunk_overlap: int = 200,
    min_sentences_per_chunk: int = 5,
    tokenizer: str = "character",
    include_headers: bool = True,
    extract_html: bool = True
) -> EmlLoader:
    """
    Factory function to create an EML loader.

    Args:
        chunk_size: Maximum tokens per chunk (default: 3000)
        chunk_overlap: Overlap between chunks in tokens (default: 200)
        min_sentences_per_chunk: Minimum sentences per chunk (default: 5)
        tokenizer: Tokenizer for chunking - "character", "gpt2", or HuggingFace model (default: "character")
        include_headers: Whether to include email headers in output (default: True)
        extract_html: Whether to extract text from HTML parts (default: True)

    Returns:
        Configured EML loader

    Example:
        >>> loader = create_eml_loader(chunk_size=1024, chunk_overlap=64)
        >>> chunks = loader.run("data/email.eml")
        >>> print(f"Extracted {len(chunks)} chunks")

        >>> # Create loader without headers
        >>> loader = create_eml_loader(include_headers=False)
        >>> chunks = loader.run("data/email.eml")
    """
    # Keyword form produces the same string-keyed config dict the
    # EmlLoader constructor reads with .get().
    return EmlLoader(config=dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        min_sentences_per_chunk=min_sentences_per_chunk,
        tokenizer=tokenizer,
        include_headers=include_headers,
        extract_html=extract_html,
    ))
554
+
555
+
556
# Public API of this module: the loader class and its factory helper.
__all__ = ["EmlLoader", "create_eml_loader"]