rakam-systems-vectorstore 0.1.1rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rakam_systems_vectorstore/MANIFEST.in +26 -0
- rakam_systems_vectorstore/README.md +1071 -0
- rakam_systems_vectorstore/__init__.py +93 -0
- rakam_systems_vectorstore/components/__init__.py +0 -0
- rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
- rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
- rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
- rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
- rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
- rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
- rakam_systems_vectorstore/components/loader/__init__.py +31 -0
- rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
- rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
- rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
- rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
- rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
- rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
- rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
- rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
- rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
- rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
- rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
- rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
- rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
- rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
- rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
- rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
- rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
- rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
- rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
- rakam_systems_vectorstore/config.py +266 -0
- rakam_systems_vectorstore/core.py +8 -0
- rakam_systems_vectorstore/pyproject.toml +113 -0
- rakam_systems_vectorstore/server/README.md +290 -0
- rakam_systems_vectorstore/server/__init__.py +20 -0
- rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
- rakam_systems_vectorstore/setup.py +103 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,556 @@
|
|
|
1
|
+
"""
|
|
2
|
+
EML Loader for processing email files (.eml format).
|
|
3
|
+
|
|
4
|
+
This loader uses Python's email library to extract text content from EML files.
|
|
5
|
+
It supports:
|
|
6
|
+
- Email header extraction (From, To, Subject, Date)
|
|
7
|
+
- Plain text email body extraction
|
|
8
|
+
- HTML email body extraction with text conversion
|
|
9
|
+
- Multipart email parsing
|
|
10
|
+
- Text-based chunking using TextChunker
|
|
11
|
+
|
|
12
|
+
The extracted content is chunked and returned as text or Node objects.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import email
|
|
18
|
+
import os
|
|
19
|
+
import time
|
|
20
|
+
from email import policy
|
|
21
|
+
from email.parser import BytesParser
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Any, Dict, List, Optional, Union
|
|
24
|
+
|
|
25
|
+
from rakam_systems_core.ai_utils import logging
|
|
26
|
+
from rakam_systems_core.ai_core.interfaces.loader import Loader
|
|
27
|
+
from rakam_systems_vectorstore.components.chunker import TextChunker
|
|
28
|
+
from rakam_systems_vectorstore.core import Node, NodeMetadata, VSFile
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class EmlLoader(Loader):
    """
    EML loader for processing email files.

    This loader provides EML file processing with support for:
    - Email header extraction (Subject, From, To, Date, Cc)
    - Plain text and HTML email body extraction
    - Multipart email parsing (attachments are skipped)
    - Text-based chunking with configurable parameters

    The extracted content is chunked using TextChunker and returned as
    text, chunks, Node objects, or a VSFile.
    """

    # Default chunking configuration
    DEFAULT_CHUNK_SIZE = 3000
    DEFAULT_CHUNK_OVERLAP = 200
    DEFAULT_MIN_SENTENCES_PER_CHUNK = 5
    DEFAULT_TOKENIZER = "character"

    # Headers emitted, in this order, when include_headers is enabled
    _HEADER_FIELDS = ("Subject", "From", "To", "Date", "Cc")

    def __init__(
        self,
        name: str = "eml_loader",
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize EML loader.

        Args:
            name: Component name
            config: Optional configuration with keys:
                - chunk_size: Maximum tokens per chunk (default: 3000)
                - chunk_overlap: Overlap between chunks in tokens (default: 200)
                - min_sentences_per_chunk: Minimum sentences per chunk (default: 5)
                - tokenizer: Tokenizer for chunking (default: "character")
                - include_headers: Whether to include email headers in output (default: True)
                - extract_html: Whether to extract text from HTML parts (default: True)
        """
        super().__init__(name=name, config=config)

        # Extract configuration with class-level defaults
        config = config or {}
        self._chunk_size = config.get('chunk_size', self.DEFAULT_CHUNK_SIZE)
        self._chunk_overlap = config.get(
            'chunk_overlap', self.DEFAULT_CHUNK_OVERLAP)
        self._min_sentences_per_chunk = config.get(
            'min_sentences_per_chunk', self.DEFAULT_MIN_SENTENCES_PER_CHUNK)
        self._tokenizer = config.get('tokenizer', self.DEFAULT_TOKENIZER)
        self._include_headers = config.get('include_headers', True)
        self._extract_html = config.get('extract_html', True)

        # One chunker instance, reused for every load
        self._chunker = TextChunker(
            chunk_size=self._chunk_size,
            chunk_overlap=self._chunk_overlap,
            min_sentences_per_chunk=self._min_sentences_per_chunk,
            tokenizer=self._tokenizer
        )

        logger.info(
            f"Initialized EmlLoader with chunk_size={self._chunk_size}, chunk_overlap={self._chunk_overlap}")

    def run(self, source: str) -> List[str]:
        """
        Execute the primary operation for the component.

        This method satisfies the BaseComponent abstract method requirement
        and delegates to load_as_chunks.

        Args:
            source: Path to EML file

        Returns:
            List of text chunks extracted from the EML file
        """
        return self.load_as_chunks(source)

    def load_as_text(
        self,
        source: Union[str, Path],
    ) -> str:
        """
        Load EML and return as a single text string.

        This method extracts all text from the EML file and returns it as a single
        string without chunking. Useful when you need the full email content.

        Args:
            source: Path to EML file

        Returns:
            Full text content of the EML as a single string

        Raises:
            FileNotFoundError: If source file doesn't exist
            ValueError: If source is not an EML file
            Exception: If EML processing fails
        """
        source = self._validate_source(source)

        logger.info(f"Loading EML as text: {source}")
        start_time = time.time()

        try:
            full_text = self._extract_text_from_eml(source)

            elapsed = time.time() - start_time
            logger.info(
                f"EML loaded as text in {elapsed:.2f}s: {len(full_text)} characters")

            return full_text

        except Exception as e:
            logger.error(f"Error loading EML as text {source}: {e}")
            raise

    def load_as_chunks(
        self,
        source: Union[str, Path],
    ) -> List[str]:
        """
        Load EML and return as a list of text chunks.

        This method extracts text from the EML file, processes it with the
        configured chunker, and returns a list of text chunks.

        Args:
            source: Path to EML file

        Returns:
            List of text chunks extracted from the EML file

        Raises:
            FileNotFoundError: If source file doesn't exist
            ValueError: If source is not an EML file
            Exception: If EML processing fails
        """
        source = self._validate_source(source)

        logger.info(f"Loading EML file: {source}")
        start_time = time.time()

        try:
            full_text = self._extract_text_from_eml(source)

            # Chunk the text using TextChunker
            text_chunks = self._chunk_text(full_text)

            elapsed = time.time() - start_time
            logger.info(
                f"EML processed in {elapsed:.2f}s: {len(text_chunks)} chunks")

            return text_chunks

        except Exception as e:
            logger.error(f"Error processing EML {source}: {e}")
            raise

    def load_as_nodes(
        self,
        source: Union[str, Path],
        source_id: Optional[str] = None,
        custom_metadata: Optional[Dict[str, Any]] = None
    ) -> List[Node]:
        """
        Load EML and return as Node objects with metadata.

        Each EML file is loaded as a single node (one email = one node).

        Args:
            source: Path to EML file
            source_id: Optional source identifier (defaults to file path)
            custom_metadata: Optional custom metadata to attach to nodes

        Returns:
            List of Node objects (single node containing the full email)
        """
        if isinstance(source, Path):
            source = str(source)

        # Full email text, no chunking; load_as_text performs validation
        full_text = self.load_as_text(source)

        if source_id is None:
            source_id = source

        # One node holding the whole email
        metadata = NodeMetadata(
            source_file_uuid=source_id,
            position=0,
            custom=custom_metadata or {}
        )
        node = Node(content=full_text, metadata=metadata)

        logger.info(f"Created 1 node from EML: {source}")
        return [node]

    def load_as_vsfile(
        self,
        file_path: Union[str, Path],
        custom_metadata: Optional[Dict[str, Any]] = None
    ) -> VSFile:
        """
        Load EML and return as VSFile object.

        Args:
            file_path: Path to EML file
            custom_metadata: Optional custom metadata

        Returns:
            VSFile object with nodes

        Raises:
            FileNotFoundError: If file doesn't exist
            ValueError: If file is not an EML
        """
        file_path = self._validate_source(file_path)

        # Create VSFile, then attach the single email node
        vsfile = VSFile(file_path)
        nodes = self.load_as_nodes(
            file_path, str(vsfile.uuid), custom_metadata)
        vsfile.nodes = nodes
        vsfile.processed = True

        logger.info(
            f"Created VSFile with {len(nodes)} nodes from: {file_path}")
        return vsfile

    def _validate_source(self, source: Union[str, Path]) -> str:
        """
        Normalize a source path to str and validate it.

        Shared by all public load_* entry points (the original code
        duplicated these checks in three places with drifting messages).

        Args:
            source: Path to EML file

        Returns:
            The source path as a string

        Raises:
            FileNotFoundError: If the file doesn't exist
            ValueError: If the file does not have a .eml extension
        """
        if isinstance(source, Path):
            source = str(source)

        if not os.path.isfile(source):
            raise FileNotFoundError(f"File not found: {source}")

        if not self._is_eml_file(source):
            raise ValueError(
                f"File is not an EML: {source}. Extension: {Path(source).suffix}")

        return source

    def _is_eml_file(self, file_path: str) -> bool:
        """
        Check if file is an EML based on extension.

        Args:
            file_path: Path to file

        Returns:
            True if file has a .eml extension (case-insensitive), False otherwise
        """
        return Path(file_path).suffix.lower() == '.eml'

    def _extract_text_from_eml(self, eml_path: str) -> str:
        """
        Extract text from EML file including headers and body.

        Args:
            eml_path: Path to EML file

        Returns:
            Extracted text content (headers block and body joined by a blank line)
        """
        try:
            # Parse the EML file; policy.default gives EmailMessage objects
            with open(eml_path, 'rb') as f:
                msg = BytesParser(policy=policy.default).parse(f)

            text_parts = []

            if self._include_headers:
                headers_text = self._extract_headers(msg)
                if headers_text:
                    text_parts.append(headers_text)

            body_text = self._extract_body(msg)
            if body_text:
                text_parts.append(body_text)

            full_text = "\n\n".join(text_parts)

            logger.debug(f"Extracted {len(full_text)} characters from EML")
            return full_text

        except Exception as e:
            logger.error(f"Failed to extract text from EML: {e}")
            raise

    def _extract_headers(self, msg: email.message.EmailMessage) -> str:
        """
        Extract relevant email headers.

        Args:
            msg: Email message object

        Returns:
            Formatted header text, one "Name: value" line per present header
        """
        return "\n".join(
            f"{field}: {msg[field]}"
            for field in self._HEADER_FIELDS
            if msg[field]
        )

    def _part_text(
        self,
        part: email.message.EmailMessage,
        content_type: str
    ) -> Optional[str]:
        """
        Extract stripped text from a single message part.

        Plain text parts are returned as-is; HTML parts are converted to
        text when extract_html is enabled. Any extraction failure is
        logged and swallowed (best-effort, matching the original code).

        Args:
            part: A message or message part
            content_type: The part's MIME content type

        Returns:
            Stripped text, or None if the part yields no usable text
        """
        if content_type == "text/plain":
            try:
                text = part.get_content()
            except Exception as e:
                logger.warning(f"Failed to extract plain text part: {e}")
                return None
        elif content_type == "text/html" and self._extract_html:
            try:
                text = self._html_to_text(part.get_content())
            except Exception as e:
                logger.warning(f"Failed to extract HTML part: {e}")
                return None
        else:
            # Unsupported content type (or HTML extraction disabled)
            return None

        if text and text.strip():
            return text.strip()
        return None

    def _extract_body(self, msg: email.message.EmailMessage) -> str:
        """
        Extract email body content from plain text and/or HTML parts.

        Args:
            msg: Email message object

        Returns:
            Extracted body text (parts joined by blank lines)
        """
        body_parts = []

        if msg.is_multipart():
            for part in msg.walk():
                # Skip attachments; only inline text parts are indexed
                content_disposition = str(part.get("Content-Disposition", ""))
                if "attachment" in content_disposition:
                    continue

                text = self._part_text(part, part.get_content_type())
                if text:
                    body_parts.append(text)
        else:
            # Single-part message
            text = self._part_text(msg, msg.get_content_type())
            if text:
                body_parts.append(text)

        return "\n\n".join(body_parts)

    def _html_to_text(self, html: str) -> str:
        """
        Convert HTML to plain text.

        Args:
            html: HTML content

        Returns:
            Plain text extracted from HTML; the raw HTML is returned
            unchanged if beautifulsoup4 is unavailable or conversion fails
        """
        try:
            from bs4 import BeautifulSoup

            # Prefer 'lxml' for performance; fall back to stdlib parser
            try:
                soup = BeautifulSoup(html, 'lxml')
            except Exception:
                soup = BeautifulSoup(html, 'html.parser')

            # Remove script and style elements
            for tag in soup(["script", "style"]):
                tag.decompose()

            text = soup.get_text(separator=' ', strip=True)

            # Collapse every whitespace run (including newlines) to one space.
            # NOTE: a second pass matching r'\n\s*\n' existed here before but
            # was dead code -- no newline can survive the substitution above.
            import re
            text = re.sub(r'\s+', ' ', text)

            return text.strip()

        except ImportError:
            logger.warning(
                "beautifulsoup4 not installed, returning HTML as-is")
            return html
        except Exception as e:
            logger.warning(f"Failed to convert HTML to text: {e}")
            return html

    def _chunk_text(self, text: str) -> List[str]:
        """
        Chunk text using TextChunker.

        Args:
            text: Full text to chunk

        Returns:
            List of text chunks; empty list for empty/whitespace-only input,
            and the whole text as a single chunk if the chunker fails
        """
        if not text or not text.strip():
            return []

        try:
            chunk_dicts = self._chunker.chunk_text(text, context="eml")

            # Keep only the text field of each chunk dict
            text_chunks = [chunk_dict['text'] for chunk_dict in chunk_dicts]

            logger.info(f"Chunked EML text into {len(text_chunks)} chunks")
            return text_chunks

        except Exception as e:
            logger.warning(f"Failed to chunk text with TextChunker: {e}")
            # Best-effort fallback: the whole text as one chunk
            logger.info("Falling back to single chunk")
            return [text]
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def create_eml_loader(
    chunk_size: int = 3000,
    chunk_overlap: int = 200,
    min_sentences_per_chunk: int = 5,
    tokenizer: str = "character",
    include_headers: bool = True,
    extract_html: bool = True
) -> EmlLoader:
    """
    Factory function to create an EML loader.

    Args:
        chunk_size: Maximum tokens per chunk (default: 3000)
        chunk_overlap: Overlap between chunks in tokens (default: 200)
        min_sentences_per_chunk: Minimum sentences per chunk (default: 5)
        tokenizer: Tokenizer for chunking - "character", "gpt2", or HuggingFace model (default: "character")
        include_headers: Whether to include email headers in output (default: True)
        extract_html: Whether to extract text from HTML parts (default: True)

    Returns:
        Configured EML loader

    Example:
        >>> loader = create_eml_loader(chunk_size=1024, chunk_overlap=64)
        >>> chunks = loader.run("data/email.eml")
        >>> print(f"Extracted {len(chunks)} chunks")

        >>> # Create loader without headers
        >>> loader = create_eml_loader(include_headers=False)
        >>> chunks = loader.run("data/email.eml")
    """
    # Pack every keyword straight into the loader's config dict
    return EmlLoader(config={
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
        'min_sentences_per_chunk': min_sentences_per_chunk,
        'tokenizer': tokenizer,
        'include_headers': include_headers,
        'extract_html': extract_html,
    })
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
# Public API of this module
__all__ = ["EmlLoader", "create_eml_loader"]
|