rakam-systems-vectorstore 0.1.1rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. rakam_systems_vectorstore/MANIFEST.in +26 -0
  2. rakam_systems_vectorstore/README.md +1071 -0
  3. rakam_systems_vectorstore/__init__.py +93 -0
  4. rakam_systems_vectorstore/components/__init__.py +0 -0
  5. rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
  6. rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
  7. rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
  8. rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
  9. rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
  10. rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
  11. rakam_systems_vectorstore/components/loader/__init__.py +31 -0
  12. rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
  13. rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
  14. rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
  15. rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
  16. rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
  17. rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
  18. rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
  19. rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
  20. rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
  21. rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
  22. rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
  23. rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
  24. rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
  25. rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
  26. rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
  27. rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
  28. rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
  29. rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
  30. rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
  31. rakam_systems_vectorstore/config.py +266 -0
  32. rakam_systems_vectorstore/core.py +8 -0
  33. rakam_systems_vectorstore/pyproject.toml +113 -0
  34. rakam_systems_vectorstore/server/README.md +290 -0
  35. rakam_systems_vectorstore/server/__init__.py +20 -0
  36. rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
  37. rakam_systems_vectorstore/setup.py +103 -0
  38. rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
  39. rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
  40. rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
rakam_systems_vectorstore/components/loader/doc_loader.py
@@ -0,0 +1,812 @@
+ """
+ DOC/DOCX Loader for Microsoft Word document processing.
+
+ This loader extracts text and images from Word documents (.doc, .docx).
+ It supports:
+ - Text extraction with paragraph and table preservation
+ - Image extraction from the document
+ - Configurable chunking of plain text
+ - Both legacy .doc and modern .docx formats
+
+ The loader stores extracted images in a scratch folder within the data directory.
+ """
+
+ from __future__ import annotations
+
+ import mimetypes
+ import os
+ import re
+ import subprocess
+ import tempfile
+ import time
+ import zipfile
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Union
+
+ from rakam_systems_core.ai_utils import logging
+ from rakam_systems_core.ai_core.interfaces.loader import Loader
+ from rakam_systems_vectorstore.components.chunker import AdvancedChunker
+ from rakam_systems_vectorstore.core import Node, NodeMetadata, VSFile
+
+ logger = logging.getLogger(__name__)
+
+
+ class DocLoader(Loader):
+     """
+     Word document loader for .doc and .docx files.
+
+     This loader provides Word document processing with support for:
+     - Text extraction with paragraph and table preservation
+     - Image extraction from document archive (DOCX only)
+     - Advanced text chunking
+     - Both legacy .doc and modern .docx formats
+
+     For .docx files, images are extracted and saved to a scratch directory.
+     For legacy .doc files, text extraction is attempted via python-docx or
+     falls back to antiword/textutil if available.
+     """
+
+     # Supported file extensions
+     SUPPORTED_EXTENSIONS = {'.doc', '.docx', '.DOC', '.DOCX'}
+
+     # MIME types for Word documents
+     MIME_TYPES = {
+         'application/msword',  # .doc
+         'application/vnd.openxmlformats-officedocument.wordprocessingml.document',  # .docx
+     }
+
+     # Default configuration
+     DEFAULT_EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
+     DEFAULT_CHUNK_SIZE = 2048
+     DEFAULT_CHUNK_OVERLAP = 128
+     DEFAULT_IMAGE_PATH = "data/ingestion_image/"  # Default path for extracted images
+
+     def __init__(
+         self,
+         name: str = "doc_loader",
+         config: Optional[Dict[str, Any]] = None
+     ):
+         """
+         Initialize DOC/DOCX loader.
+
+         Args:
+             name: Component name
+             config: Optional configuration with keys:
+                 - embed_model_id: HuggingFace model ID for tokenization (default: "sentence-transformers/all-MiniLM-L6-v2")
+                 - chunk_size: Maximum tokens per chunk (default: 2048)
+                 - chunk_overlap: Overlap between chunks in tokens (default: 128)
+                 - min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
+                 - tokenizer: Tokenizer for chunking (default: "character")
+                 - save_images: Whether to save images to disk (default: True)
+                 - image_path: Path to save images (default: None, uses INGESTION_IMAGE_PATH env var or "data/ingestion_image/")
+                 - scratch_folder_name: Name of scratch folder (default: "scratch")
+                 - include_images_in_text: Whether to add image references to text (default: True)
+                 - extract_tables: Whether to extract table content (default: True)
+                 - preserve_formatting: Whether to preserve basic formatting markers (default: False)
+         """
+         super().__init__(name=name, config=config)
+
+         # Extract configuration
+         config = config or {}
+         self._save_images = config.get('save_images', True)
+         self._image_path = config.get('image_path') or os.getenv(
+             'INGESTION_IMAGE_PATH', self.DEFAULT_IMAGE_PATH)
+         self._scratch_folder_name = config.get(
+             'scratch_folder_name', 'scratch')
+         self._include_images_in_text = config.get(
+             'include_images_in_text', True)
+         self._extract_tables = config.get('extract_tables', True)
+         self._preserve_formatting = config.get('preserve_formatting', False)
+
+         # Chunking configuration
+         self._chunk_size = config.get('chunk_size', self.DEFAULT_CHUNK_SIZE)
+         self._chunk_overlap = config.get(
+             'chunk_overlap', self.DEFAULT_CHUNK_OVERLAP)
+         self._min_sentences_per_chunk = config.get(
+             'min_sentences_per_chunk', 1)
+         self._tokenizer = config.get('tokenizer', 'character')
+
+         # Initialize advanced chunker
+         embed_model_id = config.get(
+             'embed_model_id', self.DEFAULT_EMBED_MODEL_ID)
+         self._chunker = AdvancedChunker(
+             embed_model_id=embed_model_id,
+             strategy="default"
+         )
+
+         # Store last extraction info for image tracking
+         self._last_scratch_dir = None
+         self._last_image_files = []
+         self._image_path_mapping: Dict[str, str] = {}
+
+         logger.info(
+             f"Initialized DocLoader with chunk_size={self._chunk_size}, chunk_overlap={self._chunk_overlap}, image_path={self._image_path}")
+
+     def run(self, source: str) -> List[str]:
+         """
+         Execute the primary operation for the component.
+
+         This method satisfies the BaseComponent abstract method requirement
+         and delegates to load_as_chunks.
+
+         Args:
+             source: Path to DOC/DOCX file
+
+         Returns:
+             List of text chunks extracted from the document
+         """
+         return self.load_as_chunks(source)
+
+     def load_as_text(
+         self,
+         source: Union[str, Path],
+     ) -> str:
+         """
+         Load Word document and return as a single text string.
+
+         This method extracts all text from the document and returns it as a single
+         string without chunking. Useful when you need the full document text.
+
+         Args:
+             source: Path to DOC/DOCX file
+
+         Returns:
+             Full text content of the document as a single string
+
+         Raises:
+             FileNotFoundError: If source file doesn't exist
+             ValueError: If source is not a Word document
+             Exception: If document processing fails
+         """
+         # Convert Path to string
+         if isinstance(source, Path):
+             source = str(source)
+
+         # Validate file exists
+         if not os.path.isfile(source):
+             raise FileNotFoundError(f"File not found: {source}")
+
+         # Validate file is a Word document
+         source_path = Path(source)
+         if not self._is_doc_file(source):
+             raise ValueError(
+                 f"File is not a Word document: {source} (extension: {source_path.suffix})")
+
+         logger.info(
+             f"Loading Word document as text: {source_path.name} (extension: {source_path.suffix})")
+         start_time = time.time()
+
+         try:
+             # Create scratch directory in data folder
+             scratch_dir = self._get_scratch_dir(source)
+             self._last_scratch_dir = scratch_dir
+
+             # Extract images if enabled (DOCX only)
+             image_files = []
+             if self._save_images and self._is_docx_file(source):
+                 image_dir = self._get_image_path(source)
+                 image_files = self._extract_images(source, Path(image_dir))
+                 self._last_image_files = image_files
+                 logger.info(
+                     f"Extracted {len(image_files)} images from document")
+
+             # Extract text from document
+             if self._is_docx_file(source):
+                 full_text = self._extract_text_docx(source)
+             else:
+                 full_text = self._extract_text_doc(source)
+
+             # Add image references if enabled
+             if self._include_images_in_text and image_files:
+                 full_text = self._add_image_references_to_text(
+                     full_text, image_files)
+
+             elapsed = time.time() - start_time
+             logger.info(
+                 f"Document loaded as text in {elapsed:.2f}s: {len(full_text)} characters")
+
+             return full_text
+
+         except Exception as e:
+             logger.error(f"Error loading document as text {source}: {e}")
+             raise
+
+     def load_as_chunks(
+         self,
+         source: Union[str, Path],
+     ) -> List[str]:
+         """
+         Load Word document and return as a list of text chunks.
+
+         This method extracts text from the document, processes it with the configured
+         chunker, and returns a list of text chunks. Each chunk optionally includes
+         image references.
+
+         Args:
+             source: Path to DOC/DOCX file
+
+         Returns:
+             List of text chunks extracted from the document
+
+         Raises:
+             FileNotFoundError: If source file doesn't exist
+             ValueError: If source is not a Word document
+             Exception: If document processing fails
+         """
+         # Convert Path to string
+         if isinstance(source, Path):
+             source = str(source)
+
+         # Validate file exists
+         if not os.path.isfile(source):
+             raise FileNotFoundError(f"File not found: {source}")
+
+         # Validate file is a Word document
+         source_path = Path(source)
+         if not self._is_doc_file(source):
+             raise ValueError(
+                 f"File is not a Word document: {source} (extension: {source_path.suffix})")
+
+         logger.info(
+             f"Loading Word document file: {source_path.name} (extension: {source_path.suffix})")
+         start_time = time.time()
+
+         try:
+             # Get full text
+             full_text = self.load_as_text(source)
+
+             # Chunk the text using AdvancedChunker's chunk_text method
+             text_chunks = self._chunk_text(full_text)
+
+             elapsed = time.time() - start_time
+             logger.info(
+                 f"Document processed in {elapsed:.2f}s: {len(text_chunks)} chunks")
+
+             return text_chunks
+
+         except Exception as e:
+             logger.error(f"Error processing document {source}: {e}")
+             raise
+
+     def load_as_nodes(
+         self,
+         source: Union[str, Path],
+         source_id: Optional[str] = None,
+         custom_metadata: Optional[Dict[str, Any]] = None
+     ) -> List[Node]:
+         """
+         Load Word document and return as Node objects with metadata.
+
+         Args:
+             source: Path to DOC/DOCX file
+             source_id: Optional source identifier (defaults to file path)
+             custom_metadata: Optional custom metadata to attach to nodes
+
+         Returns:
+             List of Node objects with text chunks and metadata
+         """
+         # Convert Path to string
+         if isinstance(source, Path):
+             source = str(source)
+
+         # Load text chunks
+         chunks = self.load_as_chunks(source)
+
+         # Determine source ID
+         if source_id is None:
+             source_id = source
+
+         # Create nodes with metadata
+         nodes = []
+         for idx, chunk in enumerate(chunks):
+             metadata = NodeMetadata(
+                 source_file_uuid=source_id,
+                 position=idx,
+                 custom=custom_metadata or {}
+             )
+             node = Node(content=chunk, metadata=metadata)
+             nodes.append(node)
+
+         logger.info(f"Created {len(nodes)} nodes from document: {source}")
+         return nodes
+
+     def load_as_vsfile(
+         self,
+         file_path: Union[str, Path],
+         custom_metadata: Optional[Dict[str, Any]] = None
+     ) -> VSFile:
+         """
+         Load Word document and return as VSFile object.
+
+         Args:
+             file_path: Path to DOC/DOCX file
+             custom_metadata: Optional custom metadata
+
+         Returns:
+             VSFile object with nodes
+
+         Raises:
+             FileNotFoundError: If file doesn't exist
+             ValueError: If file is not a Word document
+         """
+         if isinstance(file_path, Path):
+             file_path = str(file_path)
+
+         if not os.path.isfile(file_path):
+             raise FileNotFoundError(f"File not found: {file_path}")
+
+         if not self._is_doc_file(file_path):
+             raise ValueError(f"File is not a Word document: {file_path}")
+
+         # Create VSFile
+         vsfile = VSFile(file_path)
+
+         # Load and create nodes
+         nodes = self.load_as_nodes(
+             file_path, str(vsfile.uuid), custom_metadata)
+         vsfile.nodes = nodes
+         vsfile.processed = True
+
+         logger.info(
+             f"Created VSFile with {len(nodes)} nodes from: {file_path}")
+         return vsfile
+
+     def _is_doc_file(self, file_path: str) -> bool:
+         """
+         Check if file is a Word document based on extension and magic bytes.
+
+         Args:
+             file_path: Path to file
+
+         Returns:
+             True if file is a Word document, False otherwise
+         """
+         path = Path(file_path)
+         suffix = path.suffix.lower()
+
+         # First check extension
+         if suffix not in {'.doc', '.docx'}:
+             logger.debug(
+                 f"File {path.name} rejected: extension '{suffix}' is not .doc or .docx")
+             return False
+
+         # Additional safety check: verify it's not a PDF by checking magic bytes
+         try:
+             with open(file_path, 'rb') as f:
+                 magic_bytes = f.read(4)
+                 # PDF files start with %PDF (0x25504446)
+                 if magic_bytes.startswith(b'%PDF'):
+                     logger.error(
+                         f"File {path.name} has .doc/.docx extension but is actually a PDF!")
+                     return False
+         except Exception as e:
+             logger.warning(f"Could not read magic bytes from {path.name}: {e}")
+
+         return True
+
+     def _is_docx_file(self, file_path: str) -> bool:
+         """
+         Check if file is specifically a .docx file.
+
+         Args:
+             file_path: Path to file
+
+         Returns:
+             True if file is a .docx, False otherwise
+         """
+         path = Path(file_path)
+         return path.suffix.lower() == '.docx'
+
+     def _get_scratch_dir(self, source_path: str) -> Path:
+         """
+         Get scratch directory for storing extracted files.
+
+         The scratch directory is created inside the data folder relative to the source file.
+
+         Args:
+             source_path: Path to source document file
+
+         Returns:
+             Path to scratch directory
+         """
+         source = Path(source_path)
+
+         # Find data folder - assume it's a parent of the source or sibling
+         if 'data' in source.parts:
+             # Navigate to data folder
+             data_folder = source
+             while data_folder.name != 'data' and data_folder.parent != data_folder:
+                 data_folder = data_folder.parent
+         else:
+             # Use parent directory and create/use data folder
+             data_folder = source.parent / 'data'
+
+         # Create scratch directory inside data folder
+         scratch_dir = data_folder / self._scratch_folder_name
+         scratch_dir.mkdir(parents=True, exist_ok=True)
+
+         logger.debug(f"Using scratch directory: {scratch_dir}")
+         return scratch_dir
+
+     def _get_image_path(self, source_path: str) -> str:
+         """
+         Get the path where images should be extracted.
+
+         Uses the configured image path (from config, env var, or default).
+         Creates a subdirectory based on the source document filename.
+
+         Args:
+             source_path: Path to source document file
+
+         Returns:
+             Absolute path to image extraction directory
+         """
+         source = Path(source_path)
+         doc_filename = source.stem
+
+         # Create base image path
+         base_path = Path(self._image_path)
+
+         # Create subdirectory for this document
+         image_dir = base_path / doc_filename
+         image_dir.mkdir(parents=True, exist_ok=True)
+
+         logger.debug(f"Using image path: {image_dir}")
+         return str(image_dir)
+
+     def get_image_path_mapping(self) -> Dict[str, str]:
+         """
+         Get the mapping of image paths.
+
+         Returns:
+             Dictionary mapping image filenames to absolute paths on disk
+         """
+         return self._image_path_mapping.copy()
+
+     def get_image_absolute_path(self, image_filename: str) -> Optional[str]:
+         """
+         Get the absolute file path for an image.
+
+         Args:
+             image_filename: The image filename
+
+         Returns:
+             Absolute path to the image file, or None if not found
+         """
+         return self._image_path_mapping.get(image_filename)
+
+     def _extract_text_docx(self, docx_path: str) -> str:
+         """
+         Extract text from DOCX file using python-docx.
+
+         Args:
+             docx_path: Path to DOCX file
+
+         Returns:
+             Extracted text content
+         """
+         # Additional safety check: ensure this is actually a DOCX/DOC file
+         file_path = Path(docx_path)
+         if file_path.suffix.lower() not in ['.doc', '.docx']:
+             raise ValueError(
+                 f"File is not a Word document: {docx_path} (extension: {file_path.suffix})")
+
+         try:
+             from docx import Document
+         except ImportError:
+             logger.error(
+                 "python-docx is required for DOCX support. Install with: pip install python-docx")
+             raise ImportError("python-docx is required for DOCX support")
+
+         try:
+             doc = Document(docx_path)
+
+             # Check if document was successfully loaded
+             if doc is None:
+                 raise ValueError(f"Failed to load document: {docx_path}")
+
+             text_parts = []
+
+             # Extract paragraphs
+             for paragraph in doc.paragraphs:
+                 para_text = paragraph.text
+                 if para_text.strip():
+                     # Optionally preserve formatting markers
+                     if self._preserve_formatting:
+                         # Add heading markers based on style
+                         style_name = paragraph.style.name if paragraph.style else ""
+                         if style_name.startswith("Heading"):
+                             level = style_name[-1] if style_name[-1].isdigit() else "1"
+                             para_text = f"{'#' * int(level)} {para_text}"
+                     text_parts.append(para_text)
+
+             # Extract tables if enabled
+             if self._extract_tables and hasattr(doc, 'tables') and doc.tables is not None:
+                 for table in doc.tables:
+                     table_text = self._extract_table_text(table)
+                     if table_text.strip():
+                         text_parts.append(table_text)
+
+             full_text = '\n\n'.join(text_parts)
+             logger.debug(f"Extracted {len(full_text)} characters from DOCX")
+             return full_text
+
+         except Exception as e:
+             # Check if this might be a PDF file mistakenly routed here
+             if file_path.suffix.lower() == '.pdf' or 'pdf' in str(e).lower():
+                 raise ValueError(
+                     f"File appears to be a PDF, not a Word document: {docx_path}. Error: {e}")
+             logger.error(f"Failed to extract text from DOCX: {e}")
+             raise
+
+     def _extract_table_text(self, table) -> str:
+         """
+         Extract text from a Word table.
+
+         Args:
+             table: python-docx Table object
+
+         Returns:
+             Table content as formatted text
+         """
+         rows = []
+         for row in table.rows:
+             cells = [cell.text.strip() for cell in row.cells]
+             rows.append(' | '.join(cells))
+         return '\n'.join(rows)
+
+     def _extract_text_doc(self, doc_path: str) -> str:
+         """
+         Extract text from legacy .doc file.
+
+         This method tries multiple approaches:
+         1. Use python-docx (may work for some .doc files)
+         2. Use antiword if available (Linux/macOS)
+         3. Use textutil if available (macOS)
+
+         Args:
+             doc_path: Path to .doc file
+
+         Returns:
+             Extracted text content
+         """
+         # Safety check: verify this is not a PDF file
+         try:
+             with open(doc_path, 'rb') as f:
+                 magic_bytes = f.read(4)
+                 if magic_bytes.startswith(b'%PDF'):
+                     raise ValueError(
+                         f"File {doc_path} is a PDF, not a Word document. It should not be processed by DocLoader.")
+         except IOError:
+             pass  # If we can't read the file, let the extraction methods handle it
+
+         # First, try python-docx (works for some .doc files that are actually .docx in disguise)
+         try:
+             return self._extract_text_docx(doc_path)
+         except Exception as e:
+             logger.debug(f"python-docx failed for .doc file: {e}")
+
+         # Try antiword (available on Linux and some macOS systems)
+         try:
+             result = subprocess.run(
+                 ['antiword', doc_path],
+                 capture_output=True,
+                 text=True,
+                 timeout=30
+             )
+             if result.returncode == 0:
+                 logger.debug("Successfully extracted text using antiword")
+                 return result.stdout
+         except FileNotFoundError:
+             logger.debug("antiword not available")
+         except subprocess.TimeoutExpired:
+             logger.warning("antiword timed out")
+         except Exception as e:
+             logger.debug(f"antiword failed: {e}")
+
+         # Try textutil (macOS)
+         try:
+             with tempfile.NamedTemporaryFile(suffix='.txt', delete=False) as tmp:
+                 tmp_path = tmp.name
+
+             result = subprocess.run(
+                 ['textutil', '-convert', 'txt', '-output', tmp_path, doc_path],
+                 capture_output=True,
+                 text=True,
+                 timeout=30
+             )
+             if result.returncode == 0:
+                 with open(tmp_path, 'r', encoding='utf-8') as f:
+                     content = f.read()
+                 os.unlink(tmp_path)
+                 logger.debug("Successfully extracted text using textutil")
+                 return content
+             if os.path.exists(tmp_path):
+                 os.unlink(tmp_path)
+         except FileNotFoundError:
+             logger.debug("textutil not available")
+         except subprocess.TimeoutExpired:
+             logger.warning("textutil timed out")
+         except Exception as e:
+             logger.debug(f"textutil failed: {e}")
+
+         # If all methods fail, raise an error
+         raise RuntimeError(
+             f"Could not extract text from .doc file: {doc_path}. "
+             "Install 'antiword' (Linux/macOS) or use macOS with 'textutil', "
+             "or convert the file to .docx format."
+         )
+
+     def _extract_images(self, docx_path: str, output_dir: Path) -> List[str]:
+         """
+         Extract all images from a DOCX file.
+
+         DOCX files are ZIP archives with images stored in the word/media/ directory.
+
+         Args:
+             docx_path: Path to the DOCX file
+             output_dir: Directory to save extracted images
+
+         Returns:
+             List of paths to extracted image files
+         """
+         docx_path = Path(docx_path)
+         extracted_files = []
+
+         # Clear previous mapping
+         self._image_path_mapping.clear()
+
+         try:
+             # DOCX files are ZIP archives
+             with zipfile.ZipFile(docx_path, 'r') as zip_ref:
+                 # Images are stored in word/media/ directory
+                 for file_info in zip_ref.filelist:
+                     # Extract only image files from media folder
+                     if file_info.filename.startswith('word/media/') and not file_info.is_dir():
+                         # Get the filename
+                         filename = Path(file_info.filename).name
+
+                         # Check if it's an image based on extension
+                         img_extensions = {'.png', '.jpg', '.jpeg',
+                                           '.gif', '.bmp', '.tiff', '.emf', '.wmf'}
+                         if Path(filename).suffix.lower() in img_extensions:
+                             # Extract the file
+                             extracted_path = output_dir / filename
+                             with zip_ref.open(file_info) as source, open(extracted_path, 'wb') as target:
+                                 target.write(source.read())
+                             extracted_files.append(str(extracted_path))
+
+                             # Build image path mapping
+                             self._image_path_mapping[filename] = str(
+                                 extracted_path)
+                             logger.debug(f"Extracted image: {extracted_path}")
+
+             logger.info(f"Extracted {len(extracted_files)} images from DOCX")
+             logger.info(
+                 f"Built image path mapping with {len(self._image_path_mapping)} images")
+
+         except Exception as e:
+             logger.warning(f"Failed to extract images from DOCX: {e}")
+
+         return extracted_files
+
+     def _add_image_references_to_text(self, text: str, image_files: List[str]) -> str:
+         """
+         Add image references to the extracted text.
+
+         Args:
+             text: Extracted text content
+             image_files: List of extracted image file paths
+
+         Returns:
+             Text with appended image references
+         """
+         if not image_files:
+             return text
+
+         # Add image references at the end of the text
+         image_refs = "\n\n--- Embedded Images ---\n"
+         for img_path in image_files:
+             img_name = Path(img_path).name
+             image_refs += f"\n![{img_name}]({img_path})"
+
+         return text + image_refs
+
+     def _chunk_text(self, text: str) -> List[str]:
+         """
+         Chunk text using AdvancedChunker's chunk_text method.
+
+         This method uses chunk_text() which is specifically designed for plain text strings.
+
+         Args:
+             text: Full text to chunk
+
+         Returns:
+             List of text chunks
+         """
+         if not text or not text.strip():
+             return []
+
+         try:
+             # Use AdvancedChunker's chunk_text method for plain text
+             chunk_dicts = self._chunker.chunk_text(
+                 text=text,
+                 chunk_size=self._chunk_size,
+                 chunk_overlap=self._chunk_overlap,
+                 min_sentences_per_chunk=self._min_sentences_per_chunk,
+                 tokenizer=self._tokenizer
+             )
+
+             # Extract just the text from the chunk dictionaries
+             text_chunks = [chunk_dict['text'] for chunk_dict in chunk_dicts]
+
+             logger.info(f"Chunked text into {len(text_chunks)} chunks")
+             return text_chunks
+
+         except Exception as e:
+             logger.warning(f"Failed to chunk text with AdvancedChunker: {e}")
+             # Fall back to returning the whole text as a single chunk
+             logger.info("Falling back to single chunk")
+             return [text]
+
+
+ def create_doc_loader(
+     chunk_size: int = 2048,
+     chunk_overlap: int = 128,
+     min_sentences_per_chunk: int = 1,
+     tokenizer: str = "character",
+     embed_model_id: str = "sentence-transformers/all-MiniLM-L6-v2",
+     save_images: bool = True,
+     scratch_folder_name: str = 'scratch',
+     include_images_in_text: bool = True,
+     extract_tables: bool = True,
+     preserve_formatting: bool = False
+ ) -> DocLoader:
+     """
+     Factory function to create a Word document loader.
+
+     Args:
+         chunk_size: Maximum tokens per chunk (default: 2048)
+         chunk_overlap: Overlap between chunks in tokens (default: 128)
+         min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
+         tokenizer: Tokenizer for chunking - "character", "gpt2", or HuggingFace model (default: "character")
+         embed_model_id: HuggingFace model ID for tokenization (default: "sentence-transformers/all-MiniLM-L6-v2")
+         save_images: Whether to save extracted images (default: True)
+         scratch_folder_name: Name of scratch folder in data directory (default: "scratch")
+         include_images_in_text: Whether to include image references in text (default: True)
+         extract_tables: Whether to extract table content (default: True)
+         preserve_formatting: Whether to preserve basic formatting markers (default: False)
+
+     Returns:
+         Configured DOC/DOCX loader
+
+     Example:
+         >>> loader = create_doc_loader(chunk_size=1024, chunk_overlap=64)
+         >>> chunks = loader.run("data/document.docx")
+         >>> print(f"Extracted {len(chunks)} chunks")
+
+         >>> # Create loader without image references
+         >>> loader = create_doc_loader(include_images_in_text=False)
+         >>> chunks = loader.run("data/document.docx")
+
+         >>> # Load as nodes for vector store
+         >>> loader = create_doc_loader()
+         >>> nodes = loader.load_as_nodes("data/report.docx", custom_metadata={"category": "reports"})
+     """
+     config = {
+         'chunk_size': chunk_size,
+         'chunk_overlap': chunk_overlap,
+         'min_sentences_per_chunk': min_sentences_per_chunk,
+         'tokenizer': tokenizer,
+         'embed_model_id': embed_model_id,
+         'save_images': save_images,
+         'scratch_folder_name': scratch_folder_name,
+         'include_images_in_text': include_images_in_text,
+         'extract_tables': extract_tables,
+         'preserve_formatting': preserve_formatting
+     }
+
+     return DocLoader(config=config)
+
+
+ __all__ = ["DocLoader", "create_doc_loader"]