rakam-systems-vectorstore 0.1.1rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. rakam_systems_vectorstore/MANIFEST.in +26 -0
  2. rakam_systems_vectorstore/README.md +1071 -0
  3. rakam_systems_vectorstore/__init__.py +93 -0
  4. rakam_systems_vectorstore/components/__init__.py +0 -0
  5. rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
  6. rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
  7. rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
  8. rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
  9. rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
  10. rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
  11. rakam_systems_vectorstore/components/loader/__init__.py +31 -0
  12. rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
  13. rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
  14. rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
  15. rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
  16. rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
  17. rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
  18. rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
  19. rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
  20. rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
  21. rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
  22. rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
  23. rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
  24. rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
  25. rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
  26. rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
  27. rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
  28. rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
  29. rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
  30. rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
  31. rakam_systems_vectorstore/config.py +266 -0
  32. rakam_systems_vectorstore/core.py +8 -0
  33. rakam_systems_vectorstore/pyproject.toml +113 -0
  34. rakam_systems_vectorstore/server/README.md +290 -0
  35. rakam_systems_vectorstore/server/__init__.py +20 -0
  36. rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
  37. rakam_systems_vectorstore/setup.py +103 -0
  38. rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
  39. rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
  40. rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
@@ -0,0 +1,750 @@
1
+ """
2
+ ODT Loader using odfpy library for ODT document processing.
3
+
4
+ This loader uses the odfpy library to extract text and images from ODT documents.
5
+ It supports:
6
+ - Text extraction with paragraph preservation
7
+ - Image extraction from the ODT archive
8
+ - Configurable chunking of plain text
9
+
10
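+ The loader stores extracted images in a per-document subdirectory of the configured image path (``image_path`` config key, ``INGESTION_IMAGE_PATH`` environment variable, or ``data/ingestion_image/``); a scratch folder is also created next to the data directory.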
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import mimetypes
16
+ import os
17
+ import re
18
+ import time
19
+ import zipfile
20
+ from pathlib import Path
21
+ from typing import Any, Dict, List, Optional, Union
22
+
23
+ from odf import text, teletype, draw
24
+ from odf.opendocument import load as odf_load
25
+ from odf.element import Element
26
+
27
+ from rakam_systems_core.ai_utils import logging
28
+ from rakam_systems_core.ai_core.interfaces.loader import Loader
29
+ from rakam_systems_vectorstore.components.chunker import AdvancedChunker
30
+ from rakam_systems_vectorstore.core import Node, NodeMetadata, VSFile
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
class OdtLoader(Loader):
    """
    ODT loader using odfpy for document processing.

    This loader provides ODT processing with support for:
    - Text extraction with paragraph preservation
    - Image extraction from the ODT (ZIP) archive
    - Advanced text chunking
    - Configurable processing options

    The extracted content is chunked and returned as text or Node objects.
    Images are written to ``<image_path>/<document stem>/``; a scratch
    directory is additionally created (see ``_get_scratch_dir``).

    NOTE(review): only ``text:p`` paragraphs are extracted; heading elements
    (``text:h``) do not appear to be captured by either extraction path --
    confirm whether that is intended.
    """

    # Default configuration
    DEFAULT_EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
    DEFAULT_CHUNK_SIZE = 2048
    DEFAULT_CHUNK_OVERLAP = 128
    DEFAULT_IMAGE_PATH = "data/ingestion_image/"  # Default path for extracted images

    # MIME type accepted by _is_odt_file for ".odt" files
    _ODT_MIME_TYPE = 'application/vnd.oasis.opendocument.text'

    def __init__(
        self,
        name: str = "odt_loader",
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize ODT loader with odfpy.

        Args:
            name: Component name
            config: Optional configuration with keys:
                - embed_model_id: HuggingFace model ID passed to AdvancedChunker
                  (default: "sentence-transformers/all-MiniLM-L6-v2")
                - chunk_size: Maximum chunk size, forwarded to chunk_text (default: 2048)
                - chunk_overlap: Overlap between chunks, forwarded to chunk_text (default: 128)
                - min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
                - tokenizer: Tokenizer for chunking (default: "character")
                - save_images: Whether to save images to disk (default: True)
                - image_path: Path to save images (default: None, uses
                  INGESTION_IMAGE_PATH env var or "data/ingestion_image/")
                - scratch_folder_name: Name of scratch folder (default: "scratch")
                - include_images_in_text: Whether to add image references to text (default: True)
        """
        super().__init__(name=name, config=config)

        # Extract configuration
        config = config or {}
        self._save_images = config.get('save_images', True)
        # A falsy configured value (None or "") falls through to the env var / default.
        self._image_path = config.get('image_path') or os.getenv(
            'INGESTION_IMAGE_PATH', self.DEFAULT_IMAGE_PATH)
        self._scratch_folder_name = config.get(
            'scratch_folder_name', 'scratch')
        self._include_images_in_text = config.get(
            'include_images_in_text', True)

        # Chunking configuration
        self._chunk_size = config.get('chunk_size', self.DEFAULT_CHUNK_SIZE)
        self._chunk_overlap = config.get(
            'chunk_overlap', self.DEFAULT_CHUNK_OVERLAP)
        self._min_sentences_per_chunk = config.get(
            'min_sentences_per_chunk', 1)
        self._tokenizer = config.get('tokenizer', 'character')

        # Initialize advanced chunker
        embed_model_id = config.get(
            'embed_model_id', self.DEFAULT_EMBED_MODEL_ID)
        self._chunker = AdvancedChunker(
            embed_model_id=embed_model_id,
            strategy="default"  # We'll use chunk_text() which doesn't need a specific strategy
        )

        # Store last extraction info for image tracking
        self._last_scratch_dir: Optional[Path] = None
        self._last_image_files: List[str] = []
        self._image_path_mapping: Dict[str, str] = {}

        logger.info(
            f"Initialized OdtLoader with chunk_size={self._chunk_size}, chunk_overlap={self._chunk_overlap}, image_path={self._image_path}")

    def run(self, source: str) -> List[str]:
        """
        Execute the primary operation for the component.

        Delegates to load_as_chunks.

        Args:
            source: Path to ODT file

        Returns:
            List of text chunks extracted from the ODT
        """
        return self.load_as_chunks(source)

    def load_as_text(
        self,
        source: Union[str, Path],
    ) -> str:
        """
        Load ODT and return as a single text string (no chunking).

        Args:
            source: Path to ODT file

        Returns:
            Full text content of the ODT as a single string

        Raises:
            FileNotFoundError: If source file doesn't exist
            ValueError: If source is not an ODT file
            Exception: If ODT processing fails
        """
        source = self._validate_source(source)

        logger.info(f"Loading ODT as text: {source}")
        start_time = time.time()

        try:
            full_text = self._load_full_text(source)

            elapsed = time.time() - start_time
            logger.info(
                f"ODT loaded as text in {elapsed:.2f}s: {len(full_text)} characters")

            return full_text

        except Exception as e:
            logger.error(f"Error loading ODT as text {source}: {e}")
            raise

    def load_as_chunks(
        self,
        source: Union[str, Path],
    ) -> List[str]:
        """
        Load ODT and return as a list of text chunks.

        The text (optionally with inline image references) is extracted and
        then split with the configured chunker.

        Args:
            source: Path to ODT file

        Returns:
            List of text chunks extracted from the ODT

        Raises:
            FileNotFoundError: If source file doesn't exist
            ValueError: If source is not an ODT file
            Exception: If ODT processing fails
        """
        source = self._validate_source(source)

        logger.info(f"Loading ODT file: {source}")
        start_time = time.time()

        try:
            full_text = self._load_full_text(source)

            # Chunk the text using AdvancedChunker's chunk_text method
            text_chunks = self._chunk_text(full_text)

            elapsed = time.time() - start_time
            logger.info(
                f"ODT processed in {elapsed:.2f}s: {len(text_chunks)} chunks")

            return text_chunks

        except Exception as e:
            logger.error(f"Error processing ODT {source}: {e}")
            raise

    def load_as_nodes(
        self,
        source: Union[str, Path],
        source_id: Optional[str] = None,
        custom_metadata: Optional[Dict[str, Any]] = None
    ) -> List[Node]:
        """
        Load ODT and return as Node objects with metadata.

        Args:
            source: Path to ODT file
            source_id: Optional source identifier (defaults to file path)
            custom_metadata: Optional custom metadata to attach to nodes

        Returns:
            List of Node objects with text chunks and metadata
        """
        source = os.fspath(source)

        # Load text chunks
        chunks = self.load_as_chunks(source)

        # Determine source ID
        if source_id is None:
            source_id = source

        nodes = [
            Node(
                content=chunk,
                metadata=NodeMetadata(
                    source_file_uuid=source_id,
                    position=idx,
                    # Shallow copy per node so nodes do not share (and cannot
                    # mutate) the caller's dict or each other's metadata.
                    custom=dict(custom_metadata) if custom_metadata else {}
                ),
            )
            for idx, chunk in enumerate(chunks)
        ]

        logger.info(f"Created {len(nodes)} nodes from ODT: {source}")
        return nodes

    def load_as_vsfile(
        self,
        file_path: Union[str, Path],
        custom_metadata: Optional[Dict[str, Any]] = None
    ) -> VSFile:
        """
        Load ODT and return as VSFile object.

        Args:
            file_path: Path to ODT file
            custom_metadata: Optional custom metadata

        Returns:
            VSFile object with nodes

        Raises:
            FileNotFoundError: If file doesn't exist
            ValueError: If file is not an ODT
        """
        file_path = self._validate_source(file_path, with_mime=False)

        # Create VSFile
        vsfile = VSFile(file_path)

        # Load and create nodes, keyed by the VSFile's uuid
        nodes = self.load_as_nodes(
            file_path, str(vsfile.uuid), custom_metadata)
        vsfile.nodes = nodes
        vsfile.processed = True

        logger.info(
            f"Created VSFile with {len(nodes)} nodes from: {file_path}")
        return vsfile

    def _validate_source(self, source: Union[str, Path], *, with_mime: bool = True) -> str:
        """
        Normalize ``source`` to a string path and check it is an existing ODT file.

        Args:
            source: Path to the candidate ODT file
            with_mime: Whether to append the guessed MIME type to the
                ValueError message

        Returns:
            The path as a string

        Raises:
            FileNotFoundError: If the path is not an existing file
            ValueError: If the file is not recognized as an ODT
        """
        path = os.fspath(source)

        if not os.path.isfile(path):
            raise FileNotFoundError(f"File not found: {path}")

        if not self._is_odt_file(path):
            message = f"File is not an ODT: {path}"
            if with_mime:
                message += f". MIME type: {mimetypes.guess_type(path)[0]}"
            raise ValueError(message)

        return path

    def _load_full_text(self, source: str) -> str:
        """
        Run the shared extraction pipeline for an already-validated ODT path.

        Creates the scratch directory, extracts images when enabled, records
        both on the instance, and returns the document text (with inline
        image references when enabled and at least one image was extracted).

        Args:
            source: Validated path to the ODT file

        Returns:
            Extracted text content
        """
        # Create scratch directory in data folder
        self._last_scratch_dir = self._get_scratch_dir(source)

        # Extract images if enabled (_extract_images logs the count itself)
        image_files: List[str] = []
        if self._save_images:
            image_dir = self._get_image_path(source)
            image_files = self._extract_images(source, Path(image_dir))
        # Always refresh so the attribute never reports a previous document.
        self._last_image_files = image_files

        # Extract text from ODT with image positions if enabled
        if self._include_images_in_text and image_files:
            return self._extract_text_with_image_positions(
                source, image_files)
        return self._extract_text(source)

    def _is_odt_file(self, file_path: str) -> bool:
        """
        Check if file is an ODT based on extension and MIME type.

        Args:
            file_path: Path to file

        Returns:
            True if the suffix is ".odt" (case-insensitive) and the guessed
            MIME type is either unknown or the OpenDocument text type
        """
        if Path(file_path).suffix.lower() != '.odt':
            return False

        mime_type, _ = mimetypes.guess_type(file_path)
        # An unknown MIME type is accepted: the extension check already passed.
        return not mime_type or mime_type == self._ODT_MIME_TYPE

    def _get_scratch_dir(self, source_path: str) -> Path:
        """
        Get (and create) the scratch directory for a source file.

        If a path component named "data" exists, the nearest such ancestor
        is used; otherwise ``<source parent>/data`` is used.

        Args:
            source_path: Path to source ODT file

        Returns:
            Path to scratch directory
        """
        source = Path(source_path)

        if 'data' in source.parts:
            # Walk upwards until the component named "data" is reached
            data_folder = source
            while data_folder.name != 'data' and data_folder.parent != data_folder:
                data_folder = data_folder.parent
        else:
            # Use parent directory and create/use data folder
            data_folder = source.parent / 'data'

        # Create scratch directory inside data folder
        scratch_dir = data_folder / self._scratch_folder_name
        scratch_dir.mkdir(parents=True, exist_ok=True)

        logger.debug(f"Using scratch directory: {scratch_dir}")
        return scratch_dir

    def _get_image_path(self, source_path: str) -> str:
        """
        Get (and create) the directory where images should be extracted.

        Uses the configured image path (from config, env var, or default)
        plus a subdirectory named after the source document's stem.

        Args:
            source_path: Path to source ODT file

        Returns:
            Path to the image extraction directory. It is not resolved, so
            it is relative whenever the configured image path is relative.
        """
        image_dir = Path(self._image_path) / Path(source_path).stem
        image_dir.mkdir(parents=True, exist_ok=True)

        logger.debug(f"Using image path: {image_dir}")
        return str(image_dir)

    def get_image_path_mapping(self) -> Dict[str, str]:
        """
        Get the mapping of image paths from the most recent image extraction.

        Returns:
            Copy of the dictionary mapping image filenames to the paths they
            were written to on disk
        """
        return self._image_path_mapping.copy()

    def get_image_absolute_path(self, image_filename: str) -> Optional[str]:
        """
        Get the on-disk path recorded for an extracted image.

        Args:
            image_filename: The image filename

        Returns:
            Path to the image file as recorded during extraction, or None if
            not found. The path is not resolved, so despite the method name
            it may be relative.
        """
        return self._image_path_mapping.get(image_filename)

    def _extract_text(self, odt_path: str) -> str:
        """
        Extract text from ODT file using odfpy.

        Args:
            odt_path: Path to ODT file

        Returns:
            Text of all ``text:p`` elements joined with newlines
        """
        try:
            textdoc = odf_load(odt_path)
            allparas = textdoc.getElementsByType(text.P)
            extracted_text = "\n".join(
                teletype.extractText(para) for para in allparas)

            logger.debug(
                f"Extracted {len(extracted_text)} characters from ODT")
            return extracted_text

        except Exception as e:
            logger.error(f"Failed to extract text from ODT: {e}")
            raise

    def _extract_text_with_image_positions(self, odt_path: str, image_files: List[str]) -> str:
        """
        Extract text from ODT file and insert image references in document order.

        The document body is walked depth-first. Each non-blank paragraph
        contributes its text; each ``draw:frame`` that resolves to an
        extracted image contributes a Markdown image reference. Any failure
        falls back to plain text extraction.

        Args:
            odt_path: Path to ODT file
            image_files: List of extracted image file paths

        Returns:
            Extracted text with image references
        """
        try:
            textdoc = odf_load(odt_path)

            # Map archive image filenames to their extracted file paths
            image_name_to_path = {
                Path(img_path).name: img_path for img_path in image_files}

            paragraph_qname = (text.TEXTNS, 'p')
            frame_qname = (draw.DRAWNS, 'frame')

            text_parts: List[str] = []
            # Frames are tracked by object identity; the document tree stays
            # alive for the whole traversal, so ids are stable.
            processed_frames = set()

            def emit_frame(frame) -> None:
                """Append the frame's image reference once, and mark the frame as handled."""
                frame_id = id(frame)
                if frame_id in processed_frames:
                    return
                # Marked even when no image is found: a paragraph's text
                # already includes any text nested in its frames, so
                # descending into them again would duplicate it.
                processed_frames.add(frame_id)
                image_ref = self._extract_image_reference_from_frame(
                    frame, image_name_to_path)
                if image_ref:
                    text_parts.append(image_ref)

            def traverse_element(element) -> None:
                """Recursively traverse ODT elements and extract text with image positions."""
                qname = getattr(element, 'qname', None)

                if qname == paragraph_qname:
                    para_text = teletype.extractText(element)
                    if para_text.strip():
                        text_parts.append(para_text)
                    # Images anchored in this paragraph follow its text
                    for child in element.childNodes:
                        if getattr(child, 'qname', None) == frame_qname:
                            emit_frame(child)
                elif qname == frame_qname:
                    # Frames can also appear outside paragraphs
                    emit_frame(element)

                for child in getattr(element, 'childNodes', ()):
                    if not isinstance(child, Element):
                        continue
                    # Don't recurse into frames we've already processed
                    if child.qname == frame_qname and id(child) in processed_frames:
                        continue
                    traverse_element(child)

            # Start traversal from body
            traverse_element(textdoc.body)

            extracted_text = "\n".join(text_parts)

            logger.debug(
                f"Extracted {len(extracted_text)} characters from ODT with image positions")
            return extracted_text

        except Exception as e:
            logger.error(
                f"Failed to extract text with image positions from ODT: {e}")
            # Fall back to regular text extraction
            logger.warning(
                "Falling back to regular text extraction without image positioning")
            return self._extract_text(odt_path)

    def _extract_image_reference_from_frame(self, frame_element, image_name_to_path: Dict[str, str]) -> Optional[str]:
        """
        Extract image reference from a draw:frame element.

        Args:
            frame_element: ODT frame element that may contain an image
            image_name_to_path: Mapping of image names to their extracted file paths

        Returns:
            Markdown image reference (prefixed with a newline) for the first
            ``draw:image`` child whose file was extracted, or None
        """
        image_qname = (draw.DRAWNS, 'image')
        try:
            for child in frame_element.childNodes:
                if getattr(child, 'qname', None) != image_qname:
                    continue

                href = child.getAttribute('href')
                if not href:
                    continue

                # e.g. "Pictures/image1.png" -> "image1.png"
                img_filename = Path(href).name
                img_path = image_name_to_path.get(img_filename)
                if img_path is not None:
                    return f"\n![Image]({img_path})"
                logger.debug(
                    f"Image {img_filename} referenced but not found in extracted files")

        except Exception as e:
            # Best effort: a malformed frame must not abort text extraction
            logger.debug(f"Error extracting image reference from frame: {e}")

        return None

    def _extract_images(self, odt_path: str, output_dir: Path) -> List[str]:
        """
        Extract all files under ``Pictures/`` from an ODT (ZIP) archive.

        Files are written flat into ``output_dir`` under their base name, and
        the filename -> path mapping on the instance is rebuilt. Failures are
        logged as warnings and whatever was extracted so far is returned.

        NOTE(review): entries in different ``Pictures/`` subfolders that share
        a base name overwrite each other -- confirm whether that can occur.

        Args:
            odt_path: Path to the ODT file
            output_dir: Directory to save extracted images

        Returns:
            List of paths to extracted image files
        """
        import shutil  # local import: only needed for streaming copies here

        extracted_files: List[str] = []

        # Clear previous mapping
        self._image_path_mapping.clear()

        try:
            # Do not rely on the caller having created the directory
            output_dir.mkdir(parents=True, exist_ok=True)

            with zipfile.ZipFile(Path(odt_path), 'r') as zip_ref:
                for file_info in zip_ref.infolist():
                    if file_info.is_dir() or not file_info.filename.startswith('Pictures/'):
                        continue

                    # Using only the base name keeps writes inside output_dir
                    filename = Path(file_info.filename).name
                    extracted_path = output_dir / filename
                    with zip_ref.open(file_info) as src, open(extracted_path, 'wb') as target:
                        # Stream instead of loading the whole image into memory
                        shutil.copyfileobj(src, target)

                    extracted_files.append(str(extracted_path))
                    self._image_path_mapping[filename] = str(extracted_path)
                    logger.debug(f"Extracted image: {extracted_path}")

            logger.info(f"Extracted {len(extracted_files)} images from ODT")
            logger.info(
                f"Built image path mapping with {len(self._image_path_mapping)} images")

        except Exception as e:
            # Deliberately best-effort: text can still be loaded without images
            logger.warning(f"Failed to extract images from ODT: {e}")

        return extracted_files

    def _add_image_references_to_text(self, text: str, image_files: List[str]) -> str:
        """
        Append image references to the end of the extracted text.

        Args:
            text: Extracted text content
            image_files: List of extracted image file paths

        Returns:
            ``text`` unchanged when there are no images; otherwise ``text``
            followed by an "Embedded Images" section with one Markdown
            reference per image
        """
        if not image_files:
            return text

        image_refs = "".join(
            f"\n![Image]({img_path})" for img_path in image_files)
        return f"{text}\n\n--- Embedded Images ---\n{image_refs}"

    def _chunk_text(self, text: str) -> List[str]:
        """
        Chunk plain text using AdvancedChunker's chunk_text method.

        Args:
            text: Full text to chunk

        Returns:
            List of text chunks; empty for empty/blank input. If the chunker
            raises, the whole text is returned as a single chunk.
        """
        if not text or not text.strip():
            return []

        try:
            chunk_dicts = self._chunker.chunk_text(
                text=text,
                chunk_size=self._chunk_size,
                chunk_overlap=self._chunk_overlap,
                min_sentences_per_chunk=self._min_sentences_per_chunk,
                tokenizer=self._tokenizer
            )

            # Extract just the text from the chunk dictionaries
            text_chunks = [chunk_dict['text'] for chunk_dict in chunk_dicts]

            logger.info(f"Chunked text into {len(text_chunks)} chunks")
            return text_chunks

        except Exception as e:
            logger.warning(f"Failed to chunk text with AdvancedChunker: {e}")
            # Fall back to returning the whole text as a single chunk
            logger.info("Falling back to single chunk")
            return [text]
699
+
700
+
701
def create_odt_loader(
    chunk_size: int = 2048,
    chunk_overlap: int = 128,
    min_sentences_per_chunk: int = 1,
    tokenizer: str = "character",
    embed_model_id: str = "sentence-transformers/all-MiniLM-L6-v2",
    save_images: bool = True,
    scratch_folder_name: str = 'scratch',
    include_images_in_text: bool = True,
    image_path: Optional[str] = None
) -> OdtLoader:
    """
    Factory function to create an ODT loader.

    Args:
        chunk_size: Maximum chunk size forwarded to the chunker (default: 2048)
        chunk_overlap: Overlap between chunks forwarded to the chunker (default: 128)
        min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
        tokenizer: Tokenizer for chunking - "character", "gpt2", or HuggingFace model (default: "character")
        embed_model_id: HuggingFace model ID for tokenization (default: "sentence-transformers/all-MiniLM-L6-v2")
        save_images: Whether to save extracted images (default: True)
        scratch_folder_name: Name of scratch folder in data directory (default: "scratch")
        include_images_in_text: Whether to include image references in text (default: True)
        image_path: Directory to save extracted images. When None (default),
            OdtLoader falls back to the INGESTION_IMAGE_PATH environment
            variable and then to its built-in default.

    Returns:
        Configured ODT loader

    Example:
        >>> loader = create_odt_loader(chunk_size=1024, chunk_overlap=64)
        >>> chunks = loader.run("data/document.odt")
        >>> print(f"Extracted {len(chunks)} chunks")

        >>> # Create loader without image references
        >>> loader = create_odt_loader(include_images_in_text=False)
        >>> chunks = loader.run("data/document.odt")
    """
    config = {
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
        'min_sentences_per_chunk': min_sentences_per_chunk,
        'tokenizer': tokenizer,
        'embed_model_id': embed_model_id,
        'save_images': save_images,
        'scratch_folder_name': scratch_folder_name,
        'include_images_in_text': include_images_in_text,
        # A None value is treated by OdtLoader exactly like a missing key.
        'image_path': image_path,
    }

    return OdtLoader(config=config)


__all__ = ["OdtLoader", "create_odt_loader"]