xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py
|
|
2
|
+
"""
|
|
3
|
+
Excel Image Processor
|
|
4
|
+
|
|
5
|
+
Provides Excel-specific image processing that inherits from ImageProcessor.
|
|
6
|
+
Handles embedded images, chart images, and drawing images for XLSX/XLS files.
|
|
7
|
+
|
|
8
|
+
This class consolidates all Excel image extraction logic including:
|
|
9
|
+
- XLSX ZIP-based image extraction
|
|
10
|
+
- openpyxl Image object processing
|
|
11
|
+
- Sheet image extraction
|
|
12
|
+
"""
|
|
13
|
+
import os
|
|
14
|
+
import logging
|
|
15
|
+
import zipfile
|
|
16
|
+
from typing import Any, Dict, List, Optional, Set, Tuple, TYPE_CHECKING
|
|
17
|
+
|
|
18
|
+
from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
|
|
19
|
+
from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from openpyxl.workbook import Workbook
|
|
23
|
+
from openpyxl.worksheet.worksheet import Worksheet
|
|
24
|
+
from openpyxl.drawing.image import Image
|
|
25
|
+
|
|
26
|
+
# Module-level logger used by the ZIP extraction and sheet-image helpers below.
logger = logging.getLogger("xgen_doc2chunk.image_processor.excel")

# Image formats supported by PIL. Entries are lowercase and include the
# leading dot, matching the lowercased os.path.splitext() suffix they are
# compared against.
SUPPORTED_IMAGE_EXTENSIONS = [
    '.png',
    '.jpg',
    '.jpeg',
    '.gif',
    '.bmp',
    '.tiff',
]

# Unsupported formats (EMF, WMF, etc.). Media files with these suffixes are
# skipped (with a debug log entry) instead of being extracted.
UNSUPPORTED_IMAGE_EXTENSIONS = [
    '.emf',
    '.wmf',
]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ExcelImageProcessor(ImageProcessor):
    """
    Excel-specific image processor.

    Inherits from ImageProcessor and adds Excel-oriented entry points. Every
    ``process_*`` method builds a descriptive ``custom_name`` (when it has
    enough information to do so) and delegates the actual persistence and
    tag generation to the inherited ``save_image``.

    Handles:
    - Embedded worksheet images
    - Drawing images
    - Chart images
    - Shape images

    Example:
        processor = ExcelImageProcessor()

        # Process worksheet image
        tag = processor.process_image(image_data, sheet_name="Sheet1")

        # Process from openpyxl Image object
        tag = processor.process_openpyxl_image(image_obj)
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
    ):
        """
        Initialize ExcelImageProcessor.

        All arguments are forwarded unchanged to ``ImageProcessor.__init__``.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
        """
        super().__init__(
            directory_path=directory_path,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix,
            storage_backend=storage_backend,
        )

    @staticmethod
    def _safe_sheet_name(sheet_name: str) -> str:
        """Return *sheet_name* with spaces and forward slashes replaced by underscores."""
        return sheet_name.replace(' ', '_').replace('/', '_')

    def process_image(
        self,
        image_data: bytes,
        sheet_name: Optional[str] = None,
        image_index: Optional[int] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process and save Excel image data.

        The saved image is named ``excel_<sheet>_<index>`` when both a sheet
        name and an index are given, ``excel_<sheet>`` when only the sheet
        name is given, and is left unnamed (``custom_name=None``) otherwise.

        Args:
            image_data: Raw image binary data
            sheet_name: Source sheet name (for naming)
            image_index: Image index in sheet (for naming)
            **kwargs: Additional options (accepted but not used here)

        Returns:
            Image tag string, or None on failure (as returned by ``save_image``)
        """
        if sheet_name is None:
            # Without a sheet name no custom name is built, even if an index
            # was supplied.
            return self.save_image(image_data, custom_name=None)

        custom_name = f"excel_{self._safe_sheet_name(sheet_name)}"
        if image_index is not None:
            custom_name = f"{custom_name}_{image_index}"

        return self.save_image(image_data, custom_name=custom_name)

    def process_openpyxl_image(
        self,
        image: "Image",
        sheet_name: Optional[str] = None,
        image_index: Optional[int] = None,
    ) -> Optional[str]:
        """
        Process openpyxl Image object.

        The raw bytes are taken from the image's ``_data()`` method when it
        exists and is callable; otherwise from ``image.ref.blob`` when the
        image has a ``ref`` attribute.

        Args:
            image: openpyxl Image object
            sheet_name: Source sheet name
            image_index: Image index

        Returns:
            Image tag string, or None on failure
        """
        # NOTE(review): `self._logger` is not defined in this class; it is
        # presumably provided by ImageProcessor — confirm.
        try:
            # Require `_data` to be callable (same check as get_sheet_images)
            # so that a non-callable attribute falls through to the `ref`
            # fallback instead of raising and aborting the extraction.
            data_getter = getattr(image, '_data', None)
            if callable(data_getter):
                image_data = data_getter()
            elif hasattr(image, 'ref'):
                # For embedded images with reference
                image_data = image.ref.blob
            else:
                self._logger.warning("Cannot extract data from openpyxl Image")
                return None

            return self.process_image(
                image_data,
                sheet_name=sheet_name,
                image_index=image_index,
            )

        except Exception as e:
            # Best-effort: a single unreadable image must not abort the caller.
            self._logger.warning("Failed to process openpyxl image: %s", e)
            return None

    def process_embedded_image(
        self,
        image_data: bytes,
        image_name: Optional[str] = None,
        sheet_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process embedded Excel image.

        ``image_name`` is used as the saved name when given; otherwise the
        name falls back to ``excel_embed_<sheet>`` if a sheet name is known.

        Args:
            image_data: Image binary data
            image_name: Original image filename
            sheet_name: Source sheet name
            **kwargs: Additional options (accepted but not used here)

        Returns:
            Image tag string, or None on failure (as returned by ``save_image``)
        """
        custom_name = image_name
        if custom_name is None and sheet_name is not None:
            custom_name = f"excel_embed_{self._safe_sheet_name(sheet_name)}"

        return self.save_image(image_data, custom_name=custom_name)

    def process_chart_image(
        self,
        chart_data: bytes,
        chart_name: Optional[str] = None,
        sheet_name: Optional[str] = None,
        chart_index: Optional[int] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process Excel chart as image.

        Naming priority: ``chart_name`` if given; else
        ``excel_chart_<sheet>_<index>`` / ``excel_chart_<sheet>`` when a sheet
        name is known; else ``excel_chart_<index>`` when only an index is
        known; else no custom name.

        Args:
            chart_data: Chart image binary data
            chart_name: Chart title/name
            sheet_name: Source sheet name
            chart_index: Chart index in sheet
            **kwargs: Additional options (accepted but not used here)

        Returns:
            Image tag string, or None on failure (as returned by ``save_image``)
        """
        custom_name = chart_name
        if custom_name is None:
            if sheet_name is not None:
                custom_name = f"excel_chart_{self._safe_sheet_name(sheet_name)}"
                if chart_index is not None:
                    custom_name = f"{custom_name}_{chart_index}"
            elif chart_index is not None:
                custom_name = f"excel_chart_{chart_index}"

        return self.save_image(chart_data, custom_name=custom_name)

    def extract_images_from_xlsx(
        self,
        file_path: str,
    ) -> Dict[str, bytes]:
        """
        Extract images from XLSX file (direct ZIP access).

        Only archive members under ``xl/media/`` whose lowercased extension is
        listed in SUPPORTED_IMAGE_EXTENSIONS are read. Members with an
        extension in UNSUPPORTED_IMAGE_EXTENSIONS (EMF, WMF) are skipped with
        a debug log entry; any other extension is skipped silently.

        Args:
            file_path: Path to XLSX file

        Returns:
            {image_path: image_bytes} dictionary; empty if the archive cannot
            be read (partial results are discarded in that case)
        """
        images: Dict[str, bytes] = {}

        try:
            with zipfile.ZipFile(file_path, 'r') as zf:
                for name in zf.namelist():
                    if not name.startswith('xl/media/'):
                        continue

                    ext = os.path.splitext(name)[1].lower()
                    if ext in SUPPORTED_IMAGE_EXTENSIONS:
                        images[name] = zf.read(name)
                    elif ext in UNSUPPORTED_IMAGE_EXTENSIONS:
                        logger.debug("Skipping unsupported image format: %s", name)

            return images

        except Exception as e:
            # Best-effort: an unreadable or corrupt archive yields no images
            # rather than an exception for the caller.
            logger.warning("Error extracting images from XLSX: %s", e)
            return {}

    def get_sheet_images(
        self,
        ws: "Worksheet",
        images_data: Dict[str, bytes],
        file_path: str,
    ) -> List[Tuple[bytes, str]]:
        """
        Get images contained in a sheet.

        Images attached to the worksheet's ``_images`` attribute are preferred.
        If none could be read that way, every entry of ``images_data`` is
        returned instead, with the archive member name in place of the anchor.

        Args:
            ws: openpyxl Worksheet object
            images_data: Image dictionary from extract_images_from_xlsx
            file_path: Path to XLSX file (currently unused by this method)

        Returns:
            [(image_bytes, anchor_info)] list; empty on unexpected errors
        """
        result: List[Tuple[bytes, str]] = []

        try:
            # Use openpyxl's _images attribute
            for img in getattr(ws, '_images', None) or []:
                try:
                    if hasattr(img, '_data') and callable(img._data):
                        img_data = img._data()
                        anchor = str(img.anchor) if hasattr(img, 'anchor') else ""
                        result.append((img_data, anchor))
                except Exception as e:
                    # Skip just this image; the rest of the sheet is still usable.
                    logger.debug("Error accessing image data: %s", e)

            # Use directly extracted images (if not obtained above)
            # NOTE(review): this fallback is not filtered by worksheet, so every
            # entry in images_data is attributed to `ws`. If images_data holds
            # the whole workbook's media, sheets without readable `_images`
            # will each report all of it — confirm this is intended.
            if not result and images_data:
                result.extend((data, name) for name, data in images_data.items())

            return result

        except Exception as e:
            logger.warning("Error getting sheet images: %s", e)
            return []

    def process_sheet_images(
        self,
        ws: "Worksheet",
        sheet_name: str,
        images_data: Optional[Dict[str, bytes]] = None,
        file_path: Optional[str] = None,
    ) -> str:
        """
        Process all images in a sheet.

        When ``images_data`` is not supplied and ``file_path`` is, the images
        are first extracted from the XLSX archive. Each sheet image is then
        saved through ``process_image`` using its position in the sheet's
        image list as ``image_index``.

        Args:
            ws: openpyxl Worksheet object
            sheet_name: Sheet name
            images_data: Pre-extracted image dictionary
            file_path: Path to XLSX file

        Returns:
            Joined image tag strings (separated by a blank line); an empty
            string when no image produced a tag
        """
        if images_data is None and file_path:
            images_data = self.extract_images_from_xlsx(file_path)

        sheet_images = self.get_sheet_images(ws, images_data or {}, file_path or "")

        tags = []
        # The anchor is not needed for naming; the index counts every image,
        # including those whose processing fails.
        for idx, (img_data, _anchor) in enumerate(sheet_images):
            tag = self.process_image(img_data, sheet_name=sheet_name, image_index=idx)
            if tag:
                tags.append(tag)

        return "\n\n".join(tags)
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
# Public API of this module: only the processor class is exported.
__all__ = [
    "ExcelImageProcessor",
]
|