xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,321 @@
1
+ # xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py
2
+ """
3
+ PDF Image Processor
4
+
5
+ Provides PDF-specific image processing that inherits from ImageProcessor.
6
+ Handles XRef images, inline images, and page rendering for complex regions.
7
+
8
+ This class consolidates all PDF image extraction logic including:
9
+ - XRef-based image extraction
10
+ - Page region rendering
11
+ - Image filtering by size/position
12
+ """
13
+ import logging
14
+ from typing import Any, Dict, List, Optional, Set, Tuple, TYPE_CHECKING
15
+
16
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
17
+ from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
18
+
19
+ if TYPE_CHECKING:
20
+ import fitz
21
+
22
+ logger = logging.getLogger("xgen_doc2chunk.image_processor.pdf")
23
+
24
+
25
class PDFImageProcessor(ImageProcessor):
    """
    PDF-specific image processor.

    Extends ImageProcessor with helpers that pull image bytes out of
    PyMuPDF objects and hand them to the inherited ``save_image``.

    Handles:
    - XRef images (embedded images addressed by XRef number)
    - Caller-supplied embedded image bytes
    - Page region rendering for complex areas
    - Full page rendering
    - Bulk extraction of the images referenced by a page

    Every method is best-effort: failures are logged and reported as
    ``None`` (or skipped, for bulk extraction) instead of being raised.

    Example:
        processor = PDFImageProcessor()

        # Process XRef image
        tag = processor.process_image(image_data, xref=123)

        # Process page region
        tag = processor.process_page_region(page, rect)
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
        dpi: int = 150,
    ):
        """
        Initialize PDFImageProcessor.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
            dpi: DPI used when rendering pages or page regions
        """
        super().__init__(
            directory_path=directory_path,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix,
            storage_backend=storage_backend,
        )
        self._dpi = dpi

    @property
    def dpi(self) -> int:
        """DPI for page rendering."""
        return self._dpi

    @dpi.setter
    def dpi(self, value: int) -> None:
        """Set DPI for page rendering (the value is stored without validation)."""
        self._dpi = value

    def process_image(
        self,
        image_data: bytes,
        xref: Optional[int] = None,
        page_num: Optional[int] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process and save PDF image data.

        The saved name is derived from ``xref`` when given, otherwise from
        ``page_num``; when both are missing no custom name is passed on.

        Args:
            image_data: Raw image binary data
            xref: Image XRef number (for naming; takes precedence)
            page_num: Page number (for naming; used only without xref)
            **kwargs: Additional options (accepted but not used here)

        Returns:
            Image tag string, or None on failure
        """
        if xref is not None:
            custom_name = f"pdf_xref_{xref}"
        elif page_num is not None:
            custom_name = f"pdf_page_{page_num}"
        else:
            custom_name = None

        return self.save_image(image_data, custom_name=custom_name)

    def process_xref_image(
        self,
        doc: "fitz.Document",
        xref: int,
    ) -> Optional[str]:
        """
        Extract and save image by XRef number.

        Args:
            doc: PyMuPDF document object
            xref: Image XRef number

        Returns:
            Image tag string, or None when the XRef yields no image data
            or extraction fails
        """
        try:
            image_dict = doc.extract_image(xref)
            if not image_dict:
                return None

            image_data = image_dict.get("image")
            if not image_data:
                return None

            return self.process_image(image_data, xref=xref)

        except Exception as e:
            # Best-effort: one unreadable image must not abort the caller.
            # NOTE(review): self._logger is assumed to come from ImageProcessor.
            self._logger.warning(f"Failed to extract XRef image {xref}: {e}")
            return None

    def process_page_region(
        self,
        page: "fitz.Page",
        rect: "fitz.Rect",
        region_name: Optional[str] = None,
    ) -> Optional[str]:
        """
        Render and save a page region as image.

        Used for complex regions that can't be represented as text.

        Args:
            page: PyMuPDF page object
            rect: Region rectangle to render
            region_name: Optional name for the region; defaults to a name
                built from ``page.number``

        Returns:
            Image tag string, or None on failure
        """
        try:
            # Imported lazily so the module loads without PyMuPDF installed.
            import fitz

            # PDF user space is 72 units per inch, so this scales to self._dpi.
            zoom = self._dpi / 72.0
            mat = fitz.Matrix(zoom, zoom)

            # Render only the requested rectangle, without an alpha channel.
            pix = page.get_pixmap(matrix=mat, clip=rect, alpha=False)
            image_data = pix.tobytes("png")

            # NOTE(review): the default name uses page.number as-is, whereas
            # render_page() uses page.number + 1, and every unnamed region of
            # a page gets the same default name. Left unchanged because saved
            # names may be relied upon; confirm how save_image handles repeats.
            custom_name = region_name or f"pdf_page{page.number}_region"
            return self.save_image(image_data, custom_name=custom_name)

        except Exception as e:
            self._logger.warning(f"Failed to render page region: {e}")
            return None

    def process_embedded_image(
        self,
        image_data: bytes,
        image_name: Optional[str] = None,
        xref: Optional[int] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process embedded PDF image.

        Args:
            image_data: Image binary data
            image_name: Original image name; used as the saved name when given
            xref: Image XRef number; used for naming only when image_name
                is None
            **kwargs: Additional options (accepted but not used here)

        Returns:
            Image tag string, or None on failure
        """
        custom_name = image_name
        if custom_name is None and xref is not None:
            custom_name = f"pdf_embedded_{xref}"

        return self.save_image(image_data, custom_name=custom_name)

    def render_page(
        self,
        page: "fitz.Page",
        alpha: bool = False,
    ) -> Optional[str]:
        """
        Render entire page as image.

        Args:
            page: PyMuPDF page object
            alpha: Include alpha channel

        Returns:
            Image tag string, or None on failure
        """
        try:
            # Imported lazily so the module loads without PyMuPDF installed.
            import fitz

            # PDF user space is 72 units per inch, so this scales to self._dpi.
            zoom = self._dpi / 72.0
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat, alpha=alpha)
            image_data = pix.tobytes("png")

            # The saved name uses page.number + 1.
            custom_name = f"pdf_page_{page.number + 1}_full"
            return self.save_image(image_data, custom_name=custom_name)

        except Exception as e:
            self._logger.warning(f"Failed to render page: {e}")
            return None

    def extract_images_from_page(
        self,
        page: "fitz.Page",
        page_num: int,
        doc: "fitz.Document",
        processed_images: Set[int],
        table_bboxes: List[Tuple[float, float, float, float]],
        min_image_size: int = 50,
        min_image_area: int = 2500
    ) -> List[Dict[str, Any]]:
        """
        Extract images from PDF page.

        This consolidates the logic from pdf_image.py extract_images_from_page().

        Images are skipped when they were already processed, carry no image
        data, are smaller than the size/area limits, cannot be located on
        the page, or fall inside one of the table bounding boxes.

        Args:
            page: PyMuPDF page object
            page_num: Page number (0-indexed)
            doc: PyMuPDF document object
            processed_images: Set of already processed image xrefs; updated
                in place with every xref saved by this call
            table_bboxes: List of table bounding boxes to exclude
            min_image_size: Minimum image dimension (applied to both the
                width and the height reported by extract_image)
            min_image_area: Minimum image area (width * height)

        Returns:
            List of dicts with 'content', 'bbox', 'page_num' keys
        """
        from xgen_doc2chunk.core.processor.pdf_helpers.pdf_utils import (
            find_image_position,
            is_inside_any_bbox,
        )

        elements = []

        try:
            image_list = page.get_images()

            for img_info in image_list:
                # The first item of each get_images() entry is the xref.
                xref = img_info[0]

                if xref in processed_images:
                    continue

                try:
                    base_image = doc.extract_image(xref)
                    if not base_image:
                        continue

                    image_bytes = base_image.get("image")
                    # Same guard as process_xref_image(): without image data
                    # there is nothing to save, so skip before doing the
                    # position lookup.
                    if not image_bytes:
                        continue

                    width = base_image.get("width", 0)
                    height = base_image.get("height", 0)

                    if width < min_image_size or height < min_image_size:
                        continue
                    if width * height < min_image_area:
                        continue

                    img_bbox = find_image_position(page, xref)
                    if img_bbox is None:
                        continue

                    # NOTE(review): threshold is presumably the overlap ratio
                    # above which the image counts as part of a table; confirm
                    # against pdf_utils.is_inside_any_bbox.
                    if is_inside_any_bbox(img_bbox, table_bboxes, threshold=0.7):
                        continue

                    # Use format-specific process_image method
                    image_tag = self.process_image(image_bytes, xref=xref, page_num=page_num)

                    if image_tag:
                        # Record only successful saves so a failed image can
                        # be retried when the same xref appears again.
                        processed_images.add(xref)
                        elements.append({
                            'content': f'\n{image_tag}\n',
                            'bbox': img_bbox,
                            'page_num': page_num
                        })

                except Exception as e:
                    # A single bad image must not stop the rest of the page.
                    logger.debug(f"[PDF] Error extracting image xref={xref}: {e}")
                    continue

        except Exception as e:
            logger.warning(f"[PDF] Error extracting images: {e}")

        return elements
319
+
320
+
321
+ __all__ = ["PDFImageProcessor"]