rakam-systems-vectorstore 0.1.1rc7 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rakam_systems_vectorstore/MANIFEST.in +26 -0
- rakam_systems_vectorstore/README.md +1071 -0
- rakam_systems_vectorstore/__init__.py +93 -0
- rakam_systems_vectorstore/components/__init__.py +0 -0
- rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
- rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
- rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
- rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
- rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
- rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
- rakam_systems_vectorstore/components/loader/__init__.py +31 -0
- rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
- rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
- rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
- rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
- rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
- rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
- rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
- rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
- rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
- rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
- rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
- rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
- rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
- rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
- rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
- rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
- rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
- rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
- rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
- rakam_systems_vectorstore/config.py +266 -0
- rakam_systems_vectorstore/core.py +8 -0
- rakam_systems_vectorstore/pyproject.toml +113 -0
- rakam_systems_vectorstore/server/README.md +290 -0
- rakam_systems_vectorstore/server/__init__.py +20 -0
- rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
- rakam_systems_vectorstore/setup.py +103 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
rakam_systems_vectorstore/components/loader/doc_loader.py
@@ -0,0 +1,812 @@
+"""
+DOC/DOCX Loader for Microsoft Word document processing.
+
+This loader extracts text and images from Word documents (.doc, .docx).
+It supports:
+- Text extraction with paragraph and table preservation
+- Image extraction from the document
+- Configurable chunking of plain text
+- Both legacy .doc and modern .docx formats
+
+The loader stores extracted images in a scratch folder within the data directory.
+"""
+
+from __future__ import annotations
+
+import mimetypes
+import os
+import re
+import subprocess
+import tempfile
+import time
+import zipfile
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+from rakam_systems_core.ai_utils import logging
+from rakam_systems_core.ai_core.interfaces.loader import Loader
+from rakam_systems_vectorstore.components.chunker import AdvancedChunker
+from rakam_systems_vectorstore.core import Node, NodeMetadata, VSFile
+
+logger = logging.getLogger(__name__)
+
+
+class DocLoader(Loader):
+    """
+    Word document loader for .doc and .docx files.
+
+    This loader provides Word document processing with support for:
+    - Text extraction with paragraph and table preservation
+    - Image extraction from document archive (DOCX only)
+    - Advanced text chunking
+    - Both legacy .doc and modern .docx formats
+
+    For .docx files, images are extracted and saved to a scratch directory.
+    For legacy .doc files, text extraction is attempted via python-docx or
+    falls back to antiword/textutil if available.
+    """
+
+    # Supported file extensions
+    SUPPORTED_EXTENSIONS = {'.doc', '.docx', '.DOC', '.DOCX'}
+
+    # MIME types for Word documents
+    MIME_TYPES = {
+        'application/msword',  # .doc
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',  # .docx
+    }
+
+    # Default configuration
+    DEFAULT_EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
+    DEFAULT_CHUNK_SIZE = 2048
+    DEFAULT_CHUNK_OVERLAP = 128
+    DEFAULT_IMAGE_PATH = "data/ingestion_image/"  # Default path for extracted images
+
+    def __init__(
+        self,
+        name: str = "doc_loader",
+        config: Optional[Dict[str, Any]] = None
+    ):
+        """
+        Initialize DOC/DOCX loader.
+
+        Args:
+            name: Component name
+            config: Optional configuration with keys:
+                - embed_model_id: HuggingFace model ID for tokenization (default: "sentence-transformers/all-MiniLM-L6-v2")
+                - chunk_size: Maximum tokens per chunk (default: 2048)
+                - chunk_overlap: Overlap between chunks in tokens (default: 128)
+                - min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
+                - tokenizer: Tokenizer for chunking (default: "character")
+                - save_images: Whether to save images to disk (default: True)
+                - image_path: Path to save images (default: None, uses INGESTION_IMAGE_PATH env var or "data/ingestion_image/")
+                - scratch_folder_name: Name of scratch folder (default: "scratch")
+                - include_images_in_text: Whether to add image references to text (default: True)
+                - extract_tables: Whether to extract table content (default: True)
+                - preserve_formatting: Whether to preserve basic formatting markers (default: False)
+        """
+        super().__init__(name=name, config=config)
+
+        # Extract configuration
+        config = config or {}
+        self._save_images = config.get('save_images', True)
+        self._image_path = config.get('image_path') or os.getenv(
+            'INGESTION_IMAGE_PATH', self.DEFAULT_IMAGE_PATH)
+        self._scratch_folder_name = config.get(
+            'scratch_folder_name', 'scratch')
+        self._include_images_in_text = config.get(
+            'include_images_in_text', True)
+        self._extract_tables = config.get('extract_tables', True)
+        self._preserve_formatting = config.get('preserve_formatting', False)
+
+        # Chunking configuration
+        self._chunk_size = config.get('chunk_size', self.DEFAULT_CHUNK_SIZE)
+        self._chunk_overlap = config.get(
+            'chunk_overlap', self.DEFAULT_CHUNK_OVERLAP)
+        self._min_sentences_per_chunk = config.get(
+            'min_sentences_per_chunk', 1)
+        self._tokenizer = config.get('tokenizer', 'character')
+
+        # Initialize advanced chunker
+        embed_model_id = config.get(
+            'embed_model_id', self.DEFAULT_EMBED_MODEL_ID)
+        self._chunker = AdvancedChunker(
+            embed_model_id=embed_model_id,
+            strategy="default"
+        )
+
+        # Store last extraction info for image tracking
+        self._last_scratch_dir = None
+        self._last_image_files = []
+        self._image_path_mapping: Dict[str, str] = {}
+
+        logger.info(
+            f"Initialized DocLoader with chunk_size={self._chunk_size}, chunk_overlap={self._chunk_overlap}, image_path={self._image_path}")
+
+    def run(self, source: str) -> List[str]:
+        """
+        Execute the primary operation for the component.
+
+        This method satisfies the BaseComponent abstract method requirement
+        and delegates to load_as_chunks.
+
+        Args:
+            source: Path to DOC/DOCX file
+
+        Returns:
+            List of text chunks extracted from the document
+        """
+        return self.load_as_chunks(source)
+
+    def load_as_text(
+        self,
+        source: Union[str, Path],
+    ) -> str:
+        """
+        Load Word document and return as a single text string.
+
+        This method extracts all text from the document and returns it as a single
+        string without chunking. Useful when you need the full document text.
+
+        Args:
+            source: Path to DOC/DOCX file
+
+        Returns:
+            Full text content of the document as a single string
+
+        Raises:
+            FileNotFoundError: If source file doesn't exist
+            ValueError: If source is not a Word document
+            Exception: If document processing fails
+        """
+        # Convert Path to string
+        if isinstance(source, Path):
+            source = str(source)
+
+        # Validate file exists
+        if not os.path.isfile(source):
+            raise FileNotFoundError(f"File not found: {source}")
+
+        # Validate file is a Word document
+        source_path = Path(source)
+        if not self._is_doc_file(source):
+            raise ValueError(
+                f"File is not a Word document: {source} (extension: {source_path.suffix})")
+
+        logger.info(
+            f"Loading Word document as text: {source_path.name} (extension: {source_path.suffix})")
+        start_time = time.time()
+
+        try:
+            # Create scratch directory in data folder
+            scratch_dir = self._get_scratch_dir(source)
+            self._last_scratch_dir = scratch_dir
+
+            # Extract images if enabled (DOCX only)
+            image_files = []
+            if self._save_images and self._is_docx_file(source):
+                image_dir = self._get_image_path(source)
+                image_files = self._extract_images(source, Path(image_dir))
+                self._last_image_files = image_files
+                logger.info(
+                    f"Extracted {len(image_files)} images from document")
+
+            # Extract text from document
+            if self._is_docx_file(source):
+                full_text = self._extract_text_docx(source)
+            else:
+                full_text = self._extract_text_doc(source)
+
+            # Add image references if enabled
+            if self._include_images_in_text and image_files:
+                full_text = self._add_image_references_to_text(
+                    full_text, image_files)
+
+            elapsed = time.time() - start_time
+            logger.info(
+                f"Document loaded as text in {elapsed:.2f}s: {len(full_text)} characters")
+
+            return full_text
+
+        except Exception as e:
+            logger.error(f"Error loading document as text {source}: {e}")
+            raise
+
+    def load_as_chunks(
+        self,
+        source: Union[str, Path],
+    ) -> List[str]:
+        """
+        Load Word document and return as a list of text chunks.
+
+        This method extracts text from the document, processes it with the configured
+        chunker, and returns a list of text chunks. Each chunk optionally includes
+        image references.
+
+        Args:
+            source: Path to DOC/DOCX file
+
+        Returns:
+            List of text chunks extracted from the document
+
+        Raises:
+            FileNotFoundError: If source file doesn't exist
+            ValueError: If source is not a Word document
+            Exception: If document processing fails
+        """
+        # Convert Path to string
+        if isinstance(source, Path):
+            source = str(source)
+
+        # Validate file exists
+        if not os.path.isfile(source):
+            raise FileNotFoundError(f"File not found: {source}")
+
+        # Validate file is a Word document
+        source_path = Path(source)
+        if not self._is_doc_file(source):
+            raise ValueError(
+                f"File is not a Word document: {source} (extension: {source_path.suffix})")
+
+        logger.info(
+            f"Loading Word document file: {source_path.name} (extension: {source_path.suffix})")
+        start_time = time.time()
+
+        try:
+            # Get full text
+            full_text = self.load_as_text(source)
+
+            # Chunk the text using AdvancedChunker's chunk_text method
+            text_chunks = self._chunk_text(full_text)
+
+            elapsed = time.time() - start_time
+            logger.info(
+                f"Document processed in {elapsed:.2f}s: {len(text_chunks)} chunks")
+
+            return text_chunks
+
+        except Exception as e:
+            logger.error(f"Error processing document {source}: {e}")
+            raise
+
+    def load_as_nodes(
+        self,
+        source: Union[str, Path],
+        source_id: Optional[str] = None,
+        custom_metadata: Optional[Dict[str, Any]] = None
+    ) -> List[Node]:
+        """
+        Load Word document and return as Node objects with metadata.
+
+        Args:
+            source: Path to DOC/DOCX file
+            source_id: Optional source identifier (defaults to file path)
+            custom_metadata: Optional custom metadata to attach to nodes
+
+        Returns:
+            List of Node objects with text chunks and metadata
+        """
+        # Convert Path to string
+        if isinstance(source, Path):
+            source = str(source)
+
+        # Load text chunks
+        chunks = self.load_as_chunks(source)
+
+        # Determine source ID
+        if source_id is None:
+            source_id = source
+
+        # Create nodes with metadata
+        nodes = []
+        for idx, chunk in enumerate(chunks):
+            metadata = NodeMetadata(
+                source_file_uuid=source_id,
+                position=idx,
+                custom=custom_metadata or {}
+            )
+            node = Node(content=chunk, metadata=metadata)
+            nodes.append(node)
+
+        logger.info(f"Created {len(nodes)} nodes from document: {source}")
+        return nodes
+
+    def load_as_vsfile(
+        self,
+        file_path: Union[str, Path],
+        custom_metadata: Optional[Dict[str, Any]] = None
+    ) -> VSFile:
+        """
+        Load Word document and return as VSFile object.
+
+        Args:
+            file_path: Path to DOC/DOCX file
+            custom_metadata: Optional custom metadata
+
+        Returns:
+            VSFile object with nodes
+
+        Raises:
+            FileNotFoundError: If file doesn't exist
+            ValueError: If file is not a Word document
+        """
+        if isinstance(file_path, Path):
+            file_path = str(file_path)
+
+        if not os.path.isfile(file_path):
+            raise FileNotFoundError(f"File not found: {file_path}")
+
+        if not self._is_doc_file(file_path):
+            raise ValueError(f"File is not a Word document: {file_path}")
+
+        # Create VSFile
+        vsfile = VSFile(file_path)
+
+        # Load and create nodes
+        nodes = self.load_as_nodes(
+            file_path, str(vsfile.uuid), custom_metadata)
+        vsfile.nodes = nodes
+        vsfile.processed = True
+
+        logger.info(
+            f"Created VSFile with {len(nodes)} nodes from: {file_path}")
+        return vsfile
+
+    def _is_doc_file(self, file_path: str) -> bool:
+        """
+        Check if file is a Word document based on extension and magic bytes.
+
+        Args:
+            file_path: Path to file
+
+        Returns:
+            True if file is a Word document, False otherwise
+        """
+        path = Path(file_path)
+        suffix = path.suffix.lower()
+
+        # First check extension
+        if suffix not in {'.doc', '.docx'}:
+            logger.debug(
+                f"File {path.name} rejected: extension '{suffix}' is not .doc or .docx")
+            return False
+
+        # Additional safety check: verify it's not a PDF by checking magic bytes
+        try:
+            with open(file_path, 'rb') as f:
+                magic_bytes = f.read(4)
+                # PDF files start with %PDF (0x25504446)
+                if magic_bytes.startswith(b'%PDF'):
+                    logger.error(
+                        f"File {path.name} has .doc/.docx extension but is actually a PDF!")
+                    return False
+        except Exception as e:
+            logger.warning(f"Could not read magic bytes from {path.name}: {e}")
+
+        return True
+
+    def _is_docx_file(self, file_path: str) -> bool:
+        """
+        Check if file is specifically a .docx file.
+
+        Args:
+            file_path: Path to file
+
+        Returns:
+            True if file is a .docx, False otherwise
+        """
+        path = Path(file_path)
+        return path.suffix.lower() == '.docx'
+
+    def _get_scratch_dir(self, source_path: str) -> Path:
+        """
+        Get scratch directory for storing extracted files.
+
+        The scratch directory is created inside the data folder relative to the source file.
+
+        Args:
+            source_path: Path to source document file
+
+        Returns:
+            Path to scratch directory
+        """
+        source = Path(source_path)
+
+        # Find data folder - assume it's a parent of the source or sibling
+        if 'data' in source.parts:
+            # Navigate to data folder
+            data_folder = source
+            while data_folder.name != 'data' and data_folder.parent != data_folder:
+                data_folder = data_folder.parent
+        else:
+            # Use parent directory and create/use data folder
+            data_folder = source.parent / 'data'
+
+        # Create scratch directory inside data folder
+        scratch_dir = data_folder / self._scratch_folder_name
+        scratch_dir.mkdir(parents=True, exist_ok=True)
+
+        logger.debug(f"Using scratch directory: {scratch_dir}")
+        return scratch_dir
+
+    def _get_image_path(self, source_path: str) -> str:
+        """
+        Get the path where images should be extracted.
+
+        Uses the configured image path (from config, env var, or default).
+        Creates a subdirectory based on the source document filename.
+
+        Args:
+            source_path: Path to source document file
+
+        Returns:
+            Absolute path to image extraction directory
+        """
+        source = Path(source_path)
+        doc_filename = source.stem
+
+        # Create base image path
+        base_path = Path(self._image_path)
+
+        # Create subdirectory for this document
+        image_dir = base_path / doc_filename
+        image_dir.mkdir(parents=True, exist_ok=True)
+
+        logger.debug(f"Using image path: {image_dir}")
+        return str(image_dir)
+
+    def get_image_path_mapping(self) -> Dict[str, str]:
+        """
+        Get the mapping of image paths.
+
+        Returns:
+            Dictionary mapping image filenames to absolute paths on disk
+        """
+        return self._image_path_mapping.copy()
+
+    def get_image_absolute_path(self, image_filename: str) -> Optional[str]:
+        """
+        Get the absolute file path for an image.
+
+        Args:
+            image_filename: The image filename
+
+        Returns:
+            Absolute path to the image file, or None if not found
+        """
+        return self._image_path_mapping.get(image_filename)
+
+    def _extract_text_docx(self, docx_path: str) -> str:
+        """
+        Extract text from DOCX file using python-docx.
+
+        Args:
+            docx_path: Path to DOCX file
+
+        Returns:
+            Extracted text content
+        """
+        # Additional safety check: ensure this is actually a DOCX/DOC file
+        file_path = Path(docx_path)
+        if file_path.suffix.lower() not in ['.doc', '.docx']:
+            raise ValueError(
+                f"File is not a Word document: {docx_path} (extension: {file_path.suffix})")
+
+        try:
+            from docx import Document
+        except ImportError:
+            logger.error(
+                "python-docx is required for DOCX support. Install with: pip install python-docx")
+            raise ImportError("python-docx is required for DOCX support")
+
+        try:
+            doc = Document(docx_path)
+
+            # Check if document was successfully loaded
+            if doc is None:
+                raise ValueError(f"Failed to load document: {docx_path}")
+
+            text_parts = []
+
+            # Extract paragraphs
+            for paragraph in doc.paragraphs:
+                para_text = paragraph.text
+                if para_text.strip():
+                    # Optionally preserve formatting markers
+                    if self._preserve_formatting:
+                        # Add heading markers based on style
+                        style_name = paragraph.style.name if paragraph.style else ""
+                        if style_name.startswith("Heading"):
+                            level = style_name[-1] if style_name[-1].isdigit() else "1"
+                            para_text = f"{'#' * int(level)} {para_text}"
+                    text_parts.append(para_text)
+
+            # Extract tables if enabled
+            if self._extract_tables and hasattr(doc, 'tables') and doc.tables is not None:
+                for table in doc.tables:
+                    table_text = self._extract_table_text(table)
+                    if table_text.strip():
+                        text_parts.append(table_text)
+
+            full_text = '\n\n'.join(text_parts)
+            logger.debug(f"Extracted {len(full_text)} characters from DOCX")
+            return full_text
+
+        except Exception as e:
+            # Check if this might be a PDF file mistakenly routed here
+            if file_path.suffix.lower() == '.pdf' or 'pdf' in str(e).lower():
+                raise ValueError(
+                    f"File appears to be a PDF, not a Word document: {docx_path}. Error: {e}")
+            logger.error(f"Failed to extract text from DOCX: {e}")
+            raise
+
+    def _extract_table_text(self, table) -> str:
+        """
+        Extract text from a Word table.
+
+        Args:
+            table: python-docx Table object
+
+        Returns:
+            Table content as formatted text
+        """
+        rows = []
+        for row in table.rows:
+            cells = [cell.text.strip() for cell in row.cells]
+            rows.append(' | '.join(cells))
+        return '\n'.join(rows)
+
+    def _extract_text_doc(self, doc_path: str) -> str:
+        """
+        Extract text from legacy .doc file.
+
+        This method tries multiple approaches:
+        1. Use python-docx (may work for some .doc files)
+        2. Use antiword if available (Linux/macOS)
+        3. Use textutil if available (macOS)
+
+        Args:
+            doc_path: Path to .doc file
+
+        Returns:
+            Extracted text content
+        """
+        # Safety check: verify this is not a PDF file
+        try:
+            with open(doc_path, 'rb') as f:
+                magic_bytes = f.read(4)
+                if magic_bytes.startswith(b'%PDF'):
+                    raise ValueError(
+                        f"File {doc_path} is a PDF, not a Word document. It should not be processed by DocLoader.")
+        except IOError:
+            pass  # If we can't read the file, let the extraction methods handle it
+
+        # First, try python-docx (works for some .doc files that are actually .docx in disguise)
+        try:
+            return self._extract_text_docx(doc_path)
+        except Exception as e:
+            logger.debug(f"python-docx failed for .doc file: {e}")
+
+        # Try antiword (available on Linux and some macOS systems)
+        try:
+            result = subprocess.run(
+                ['antiword', doc_path],
+                capture_output=True,
+                text=True,
+                timeout=30
+            )
+            if result.returncode == 0:
+                logger.debug("Successfully extracted text using antiword")
+                return result.stdout
+        except FileNotFoundError:
+            logger.debug("antiword not available")
+        except subprocess.TimeoutExpired:
+            logger.warning("antiword timed out")
+        except Exception as e:
+            logger.debug(f"antiword failed: {e}")
+
+        # Try textutil (macOS)
+        try:
+            with tempfile.NamedTemporaryFile(suffix='.txt', delete=False) as tmp:
+                tmp_path = tmp.name
+
+            result = subprocess.run(
+                ['textutil', '-convert', 'txt', '-output', tmp_path, doc_path],
+                capture_output=True,
+                text=True,
+                timeout=30
+            )
+            if result.returncode == 0:
+                with open(tmp_path, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                os.unlink(tmp_path)
+                logger.debug("Successfully extracted text using textutil")
+                return content
+            if os.path.exists(tmp_path):
+                os.unlink(tmp_path)
+        except FileNotFoundError:
+            logger.debug("textutil not available")
+        except subprocess.TimeoutExpired:
+            logger.warning("textutil timed out")
+        except Exception as e:
+            logger.debug(f"textutil failed: {e}")
+
+        # If all methods fail, raise an error
+        raise RuntimeError(
+            f"Could not extract text from .doc file: {doc_path}. "
+            "Install 'antiword' (Linux/macOS) or use macOS with 'textutil', "
+            "or convert the file to .docx format."
+        )
+
+    def _extract_images(self, docx_path: str, output_dir: Path) -> List[str]:
+        """
+        Extract all images from a DOCX file.
+
+        DOCX files are ZIP archives with images stored in the word/media/ directory.
+
+        Args:
+            docx_path: Path to the DOCX file
+            output_dir: Directory to save extracted images
+
+        Returns:
+            List of paths to extracted image files
+        """
+        docx_path = Path(docx_path)
+        extracted_files = []
+
+        # Clear previous mapping
+        self._image_path_mapping.clear()
+
+        try:
+            # DOCX files are ZIP archives
+            with zipfile.ZipFile(docx_path, 'r') as zip_ref:
+                # Images are stored in word/media/ directory
+                for file_info in zip_ref.filelist:
+                    # Extract only image files from media folder
+                    if file_info.filename.startswith('word/media/') and not file_info.is_dir():
+                        # Get the filename
+                        filename = Path(file_info.filename).name
+
+                        # Check if it's an image based on extension
+                        img_extensions = {'.png', '.jpg', '.jpeg',
+                                          '.gif', '.bmp', '.tiff', '.emf', '.wmf'}
+                        if Path(filename).suffix.lower() in img_extensions:
+                            # Extract the file
+                            extracted_path = output_dir / filename
+                            with zip_ref.open(file_info) as source, open(extracted_path, 'wb') as target:
+                                target.write(source.read())
+                            extracted_files.append(str(extracted_path))
+
+                            # Build image path mapping
+                            self._image_path_mapping[filename] = str(
+                                extracted_path)
+                            logger.debug(f"Extracted image: {extracted_path}")
+
+            logger.info(f"Extracted {len(extracted_files)} images from DOCX")
+            logger.info(
+                f"Built image path mapping with {len(self._image_path_mapping)} images")
+
+        except Exception as e:
+            logger.warning(f"Failed to extract images from DOCX: {e}")
+
+        return extracted_files
+
+    def _add_image_references_to_text(self, text: str, image_files: List[str]) -> str:
+        """
+        Add image references to the extracted text.
+
+        Args:
+            text: Extracted text content
+            image_files: List of extracted image file paths
+
+        Returns:
+            Text with appended image references
+        """
+        if not image_files:
+            return text
+
+        # Add image references at the end of the text
+        image_refs = "\n\n--- Embedded Images ---\n"
+        for img_path in image_files:
+            img_name = Path(img_path).name
+            image_refs += f"[Image: {img_name}]\n"
+
+        return text + image_refs
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """
+        Chunk text using AdvancedChunker's chunk_text method.
+
+        This method uses chunk_text() which is specifically designed for plain text strings.
+
+        Args:
+            text: Full text to chunk
+
+        Returns:
+            List of text chunks
+        """
+        if not text or not text.strip():
+            return []
+
+        try:
+            # Use AdvancedChunker's chunk_text method for plain text
+            chunk_dicts = self._chunker.chunk_text(
+                text=text,
+                chunk_size=self._chunk_size,
+                chunk_overlap=self._chunk_overlap,
+                min_sentences_per_chunk=self._min_sentences_per_chunk,
+                tokenizer=self._tokenizer
+            )
+
+            # Extract just the text from the chunk dictionaries
+            text_chunks = [chunk_dict['text'] for chunk_dict in chunk_dicts]
+
+            logger.info(f"Chunked text into {len(text_chunks)} chunks")
+            return text_chunks
+
+        except Exception as e:
+            logger.warning(f"Failed to chunk text with AdvancedChunker: {e}")
+            # Fall back to returning the whole text as a single chunk
+            logger.info("Falling back to single chunk")
+            return [text]
+
+
+def create_doc_loader(
+    chunk_size: int = 2048,
+    chunk_overlap: int = 128,
+    min_sentences_per_chunk: int = 1,
+    tokenizer: str = "character",
+    embed_model_id: str = "sentence-transformers/all-MiniLM-L6-v2",
+    save_images: bool = True,
+    scratch_folder_name: str = 'scratch',
+    include_images_in_text: bool = True,
+    extract_tables: bool = True,
+    preserve_formatting: bool = False
+) -> DocLoader:
+    """
+    Factory function to create a Word document loader.
+
+    Args:
+        chunk_size: Maximum tokens per chunk (default: 2048)
+        chunk_overlap: Overlap between chunks in tokens (default: 128)
+        min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
+        tokenizer: Tokenizer for chunking - "character", "gpt2", or HuggingFace model (default: "character")
+        embed_model_id: HuggingFace model ID for tokenization (default: "sentence-transformers/all-MiniLM-L6-v2")
+        save_images: Whether to save extracted images (default: True)
+        scratch_folder_name: Name of scratch folder in data directory (default: "scratch")
+        include_images_in_text: Whether to include image references in text (default: True)
+        extract_tables: Whether to extract table content (default: True)
+        preserve_formatting: Whether to preserve basic formatting markers (default: False)
+
+    Returns:
+        Configured DOC/DOCX loader
+
+    Example:
+        >>> loader = create_doc_loader(chunk_size=1024, chunk_overlap=64)
+        >>> chunks = loader.run("data/document.docx")
+        >>> print(f"Extracted {len(chunks)} chunks")
+
+        >>> # Create loader without image references
+        >>> loader = create_doc_loader(include_images_in_text=False)
+        >>> chunks = loader.run("data/document.docx")
+
+        >>> # Load as nodes for vector store
+        >>> loader = create_doc_loader()
+        >>> nodes = loader.load_as_nodes("data/report.docx", custom_metadata={"category": "reports"})
+    """
+    config = {
+        'chunk_size': chunk_size,
+        'chunk_overlap': chunk_overlap,
+        'min_sentences_per_chunk': min_sentences_per_chunk,
+        'tokenizer': tokenizer,
+        'embed_model_id': embed_model_id,
+        'save_images': save_images,
+        'scratch_folder_name': scratch_folder_name,
+        'include_images_in_text': include_images_in_text,
+        'extract_tables': extract_tables,
+        'preserve_formatting': preserve_formatting
+    }
+
+    return DocLoader(config=config)
+
+
+__all__ = ["DocLoader", "create_doc_loader"]
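
The factory docstring above already demonstrates chunk and node loading; the sketch below complements it by driving the VSFile and image-mapping APIs that appear in this diff. It is a minimal sketch, not shipped package code: it assumes the wheel and python-docx are installed (e.g. "pip install rakam-systems-vectorstore python-docx"), and the path "data/report.docx" and the category metadata are illustrative placeholders.

from rakam_systems_vectorstore.components.loader.doc_loader import create_doc_loader

# Configure a loader; arguments mirror the factory signature shown above.
loader = create_doc_loader(chunk_size=1024, chunk_overlap=64)

# load_as_vsfile wraps the document's chunks in Node objects attached to a
# VSFile, keyed by the VSFile's uuid ("data/report.docx" is a placeholder).
vsfile = loader.load_as_vsfile(
    "data/report.docx",
    custom_metadata={"category": "reports"},
)
print(f"{len(vsfile.nodes)} nodes from VSFile {vsfile.uuid}")

# After a load, images extracted from the DOCX archive (word/media/)
# can be resolved back to absolute paths on disk.
for name, path in loader.get_image_path_mapping().items():
    print(f"{name} -> {path}")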