xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162):
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,316 @@
1
+ # xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py
2
+ """
3
+ Excel Image Processor
4
+
5
+ Provides Excel-specific image processing that inherits from ImageProcessor.
6
+ Handles embedded images, chart images, and drawing images for XLSX/XLS files.
7
+
8
+ This class consolidates all Excel image extraction logic including:
9
+ - XLSX ZIP-based image extraction
10
+ - openpyxl Image object processing
11
+ - Sheet image extraction
12
+ """
13
+ import os
14
+ import logging
15
+ import zipfile
16
+ from typing import Any, Dict, List, Optional, Set, Tuple, TYPE_CHECKING
17
+
18
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
19
+ from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
20
+
21
+ if TYPE_CHECKING:
22
+ from openpyxl.workbook import Workbook
23
+ from openpyxl.worksheet.worksheet import Worksheet
24
+ from openpyxl.drawing.image import Image
25
+
26
# Module-level logger for Excel image extraction.
logger = logging.getLogger("xgen_doc2chunk.image_processor.excel")

# Image formats supported by PIL.
# Both TIFF spellings are listed: callers test membership against the exact
# lower-cased file extension, so '.tif' would otherwise be silently skipped.
SUPPORTED_IMAGE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.tif']

# Unsupported formats (EMF, WMF, etc.)
UNSUPPORTED_IMAGE_EXTENSIONS = ['.emf', '.wmf']
33
+
34
+
35
class ExcelImageProcessor(ImageProcessor):
    """
    Excel-specific image processor.

    Inherits from ImageProcessor and provides Excel-specific processing.
    Every ``process_*`` method only derives a file name and then delegates
    to the inherited ``save_image``.

    Handles:
    - Embedded worksheet images
    - Drawing images
    - Chart images
    - Shape images

    Example:
        processor = ExcelImageProcessor()

        # Process worksheet image
        tag = processor.process_image(image_data, sheet_name="Sheet1")

        # Process from openpyxl Image object
        tag = processor.process_openpyxl_image(image_obj)
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
    ):
        """
        Initialize ExcelImageProcessor.

        All arguments are forwarded unchanged to ``ImageProcessor.__init__``.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
        """
        super().__init__(
            directory_path=directory_path,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix,
            storage_backend=storage_backend,
        )

    @staticmethod
    def _sanitize_sheet_name(sheet_name: str) -> str:
        """Return ``sheet_name`` with spaces and '/' replaced by underscores."""
        return sheet_name.replace(' ', '_').replace('/', '_')

    def process_image(
        self,
        image_data: bytes,
        sheet_name: Optional[str] = None,
        image_index: Optional[int] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process and save Excel image data.

        The saved name is ``excel_<sheet>_<index>`` or ``excel_<sheet>``.
        When no sheet name is given, ``custom_name`` is None and naming is
        left to ``save_image``.

        Args:
            image_data: Raw image binary data
            sheet_name: Source sheet name (for naming)
            image_index: Image index in sheet (for naming); only used when
                ``sheet_name`` is given
            **kwargs: Additional options (currently ignored)

        Returns:
            Whatever ``save_image`` returns: the image tag string, or None
            on failure
        """
        custom_name = None
        if sheet_name is not None:
            safe_sheet = self._sanitize_sheet_name(sheet_name)
            if image_index is not None:
                custom_name = f"excel_{safe_sheet}_{image_index}"
            else:
                custom_name = f"excel_{safe_sheet}"

        return self.save_image(image_data, custom_name=custom_name)

    def process_openpyxl_image(
        self,
        image: "Image",
        sheet_name: Optional[str] = None,
        image_index: Optional[int] = None,
    ) -> Optional[str]:
        """
        Process openpyxl Image object.

        The bytes are read from the image's ``_data()`` accessor when it is
        callable, otherwise from ``image.ref.blob``. Any exception raised
        while reading or saving is logged and turned into a None result.

        Args:
            image: openpyxl Image object
            sheet_name: Source sheet name
            image_index: Image index

        Returns:
            Image tag string, or None on failure
        """
        # NOTE(review): self._logger is presumably provided by ImageProcessor;
        # it is not defined in this class - confirm against the base class.
        try:
            # Same callable check as get_sheet_images, so a non-callable
            # ``_data`` attribute falls through to the ``ref`` path instead
            # of raising.
            data_getter = getattr(image, '_data', None)
            if callable(data_getter):
                image_data = data_getter()
            elif hasattr(image, 'ref'):
                # For embedded images with reference
                image_data = image.ref.blob
            else:
                self._logger.warning("Cannot extract data from openpyxl Image")
                return None

            return self.process_image(
                image_data,
                sheet_name=sheet_name,
                image_index=image_index,
            )

        except Exception as e:
            # Best effort: one unreadable image must not abort the caller.
            self._logger.warning(f"Failed to process openpyxl image: {e}")
            return None

    def process_embedded_image(
        self,
        image_data: bytes,
        image_name: Optional[str] = None,
        sheet_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process embedded Excel image.

        ``image_name`` is used as the saved name when given. Otherwise the
        name is ``excel_embed_<sheet>``, or None when no sheet name is given
        either.

        Args:
            image_data: Image binary data
            image_name: Original image filename
            sheet_name: Source sheet name
            **kwargs: Additional options (currently ignored)

        Returns:
            Whatever ``save_image`` returns: the image tag string, or None
            on failure
        """
        custom_name = image_name
        if custom_name is None and sheet_name is not None:
            custom_name = f"excel_embed_{self._sanitize_sheet_name(sheet_name)}"

        return self.save_image(image_data, custom_name=custom_name)

    def process_chart_image(
        self,
        chart_data: bytes,
        chart_name: Optional[str] = None,
        sheet_name: Optional[str] = None,
        chart_index: Optional[int] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process Excel chart as image.

        Name resolution order: ``chart_name``, then
        ``excel_chart_<sheet>_<index>``, ``excel_chart_<sheet>``,
        ``excel_chart_<index>``, and finally None.

        Args:
            chart_data: Chart image binary data
            chart_name: Chart title/name
            sheet_name: Source sheet name
            chart_index: Chart index in sheet
            **kwargs: Additional options (currently ignored)

        Returns:
            Whatever ``save_image`` returns: the image tag string, or None
            on failure
        """
        custom_name = chart_name
        if custom_name is None:
            if sheet_name is not None:
                safe_sheet = self._sanitize_sheet_name(sheet_name)
                if chart_index is not None:
                    custom_name = f"excel_chart_{safe_sheet}_{chart_index}"
                else:
                    custom_name = f"excel_chart_{safe_sheet}"
            elif chart_index is not None:
                custom_name = f"excel_chart_{chart_index}"

        return self.save_image(chart_data, custom_name=custom_name)

    def extract_images_from_xlsx(
        self,
        file_path: str,
    ) -> Dict[str, bytes]:
        """
        Extract images from XLSX file (direct ZIP access).

        Only archive members under ``xl/media/`` whose lower-cased extension
        is listed in SUPPORTED_IMAGE_EXTENSIONS are read. Members listed in
        UNSUPPORTED_IMAGE_EXTENSIONS (EMF, WMF, etc.) are skipped with a
        debug log; any other extension is skipped silently.

        Args:
            file_path: Path to XLSX file

        Returns:
            {image_path: image_bytes} dictionary; empty if the archive
            cannot be read (partial results are discarded on error)
        """
        images = {}

        try:
            with zipfile.ZipFile(file_path, 'r') as zf:
                for name in zf.namelist():
                    if not name.startswith('xl/media/'):
                        continue
                    ext = os.path.splitext(name)[1].lower()
                    if ext in SUPPORTED_IMAGE_EXTENSIONS:
                        images[name] = zf.read(name)
                    elif ext in UNSUPPORTED_IMAGE_EXTENSIONS:
                        logger.debug(f"Skipping unsupported image format: {name}")

            return images

        except Exception as e:
            logger.warning(f"Error extracting images from XLSX: {e}")
            return {}

    def get_sheet_images(
        self,
        ws: "Worksheet",
        images_data: Dict[str, bytes],
        file_path: str,
    ) -> List[Tuple[bytes, str]]:
        """
        Get images contained in a sheet.

        Images are taken from the worksheet's ``_images`` attribute first.
        If that yields nothing, every entry of ``images_data`` is returned
        instead, with the archive member name in place of the anchor.

        NOTE(review): ``images_data`` is keyed by archive path, not by
        sheet, so the fallback appears to return the same images for every
        sheet that has no ``_images`` - confirm this is intended.

        Args:
            ws: openpyxl Worksheet object
            images_data: Image dictionary from extract_images_from_xlsx
            file_path: Path to XLSX file (currently unused)

        Returns:
            [(image_bytes, anchor_info)] list; empty on unexpected errors
        """
        result = []

        try:
            # Use openpyxl's _images attribute
            for img in getattr(ws, '_images', None) or []:
                try:
                    if hasattr(img, '_data') and callable(img._data):
                        img_data = img._data()
                        anchor = str(img.anchor) if hasattr(img, 'anchor') else ""
                        result.append((img_data, anchor))
                except Exception as e:
                    # Skip the unreadable image and keep the rest.
                    logger.debug(f"Error accessing image data: {e}")

            # Use directly extracted images (if not obtained above)
            if not result and images_data:
                result.extend((data, name) for name, data in images_data.items())

            return result

        except Exception as e:
            logger.warning(f"Error getting sheet images: {e}")
            return []

    def process_sheet_images(
        self,
        ws: "Worksheet",
        sheet_name: str,
        images_data: Optional[Dict[str, bytes]] = None,
        file_path: Optional[str] = None,
    ) -> str:
        """
        Process all images in a sheet.

        When ``images_data`` is None and ``file_path`` is given, the images
        are first extracted from the XLSX archive. Each image returned by
        ``get_sheet_images`` is saved through ``process_image`` using its
        position in that list as the image index.

        Args:
            ws: openpyxl Worksheet object
            sheet_name: Sheet name
            images_data: Pre-extracted image dictionary
            file_path: Path to XLSX file

        Returns:
            Image tags joined by blank lines; images whose processing
            returned a falsy tag are left out (empty string if none)
        """
        results = []

        if images_data is None and file_path:
            images_data = self.extract_images_from_xlsx(file_path)

        images_data = images_data or {}
        sheet_images = self.get_sheet_images(ws, images_data, file_path or "")

        # The anchor half of each pair is not needed for saving.
        for idx, (img_data, _anchor) in enumerate(sheet_images):
            tag = self.process_image(img_data, sheet_name=sheet_name, image_index=idx)
            if tag:
                results.append(tag)

        return "\n\n".join(results)
314
+
315
+
316
+ __all__ = ["ExcelImageProcessor"]