xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py
|
|
2
|
+
"""
|
|
3
|
+
PDF Image Processor
|
|
4
|
+
|
|
5
|
+
Provides PDF-specific image processing that inherits from ImageProcessor.
|
|
6
|
+
Handles XRef images, inline images, and page rendering for complex regions.
|
|
7
|
+
|
|
8
|
+
This class consolidates all PDF image extraction logic including:
|
|
9
|
+
- XRef-based image extraction
|
|
10
|
+
- Page region rendering
|
|
11
|
+
- Image filtering by size/position
|
|
12
|
+
"""
|
|
13
|
+
import logging
|
|
14
|
+
from typing import Any, Dict, List, Optional, Set, Tuple, TYPE_CHECKING
|
|
15
|
+
|
|
16
|
+
from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
|
|
17
|
+
from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
import fitz
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger("xgen_doc2chunk.image_processor.pdf")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PDFImageProcessor(ImageProcessor):
    """
    PDF-specific image processor.

    Extends ImageProcessor with helpers that work on PyMuPDF (``fitz``)
    objects and delegates persistence/tag creation to ``self.save_image``.

    Handles:
    - XRef images (embedded images addressed by XRef number)
    - Already-extracted embedded image bytes
    - Rendering of a page region or a whole page to PNG
    - Bulk extraction of a page's images with size/table filtering

    NOTE(review): ``self.save_image`` and ``self._logger`` are not defined in
    this class; they are presumably provided by ImageProcessor - confirm.

    Example:
        processor = PDFImageProcessor()

        # Process XRef image
        tag = processor.process_image(image_data, xref=123)

        # Process page region
        tag = processor.process_page_region(page, rect)
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
        dpi: int = 150,
    ):
        """
        Initialize PDFImageProcessor.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
            dpi: DPI used when rendering pages/regions to images
        """
        super().__init__(
            directory_path=directory_path,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix,
            storage_backend=storage_backend,
        )
        self._dpi = dpi

    @property
    def dpi(self) -> int:
        """DPI for page rendering."""
        return self._dpi

    @dpi.setter
    def dpi(self, value: int) -> None:
        """Set DPI for page rendering."""
        self._dpi = value

    def process_image(
        self,
        image_data: bytes,
        xref: Optional[int] = None,
        page_num: Optional[int] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process and save PDF image data.

        The saved image is named after the XRef when one is given, otherwise
        after the page number; with neither, no custom name is supplied.

        Args:
            image_data: Raw image binary data
            xref: Image XRef number (for naming; takes precedence over page_num)
            page_num: Page number (for naming, used only when xref is None)
            **kwargs: Accepted for signature compatibility; not used here

        Returns:
            Whatever ``self.save_image`` returns (annotated as the image tag
            string, or None on failure)
        """
        custom_name = None
        if xref is not None:
            custom_name = f"pdf_xref_{xref}"
        elif page_num is not None:
            custom_name = f"pdf_page_{page_num}"

        return self.save_image(image_data, custom_name=custom_name)

    def process_xref_image(
        self,
        doc: "fitz.Document",
        xref: int,
    ) -> Optional[str]:
        """
        Extract and save image by XRef number.

        Args:
            doc: PyMuPDF document object
            xref: Image XRef number

        Returns:
            Image tag string, or None when the XRef yields no image data or
            any exception is raised (the exception is logged as a warning)
        """
        try:
            image_dict = doc.extract_image(xref)
            if not image_dict:
                return None

            image_data = image_dict.get("image")
            if not image_data:
                return None

            return self.process_image(image_data, xref=xref)

        except Exception as e:
            # Best-effort: a single unreadable image must not abort the caller.
            self._logger.warning(f"Failed to extract XRef image {xref}: {e}")
            return None

    def _render_pixmap_png(
        self,
        page: "fitz.Page",
        clip: Optional["fitz.Rect"] = None,
        alpha: bool = False,
    ) -> bytes:
        """Render ``page`` (optionally clipped) at the configured DPI as PNG bytes."""
        # Imported lazily so this module can be imported without PyMuPDF.
        import fitz

        # The rendering matrix is scaled by dpi / 72 in both directions.
        zoom = self._dpi / 72.0
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat, clip=clip, alpha=alpha)
        return pix.tobytes("png")

    def process_page_region(
        self,
        page: "fitz.Page",
        rect: "fitz.Rect",
        region_name: Optional[str] = None,
    ) -> Optional[str]:
        """
        Render and save a page region as image.

        Used for complex regions that can't be represented as text.

        Args:
            page: PyMuPDF page object
            rect: Region rectangle to render (used as the clip rectangle)
            region_name: Optional name for the saved image; when falsy the
                name defaults to ``pdf_page{page.number}_region``

        Returns:
            Image tag string, or None on failure (failure is logged as a
            warning)
        """
        try:
            image_data = self._render_pixmap_png(page, clip=rect, alpha=False)

            # NOTE(review): this default uses page.number as-is, while
            # render_page() uses page.number + 1 - confirm which numbering
            # is intended before unifying.
            custom_name = region_name or f"pdf_page{page.number}_region"
            return self.save_image(image_data, custom_name=custom_name)

        except Exception as e:
            self._logger.warning(f"Failed to render page region: {e}")
            return None

    def process_embedded_image(
        self,
        image_data: bytes,
        image_name: Optional[str] = None,
        xref: Optional[int] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process embedded PDF image.

        Args:
            image_data: Image binary data
            image_name: Original image name; used as the saved name when given
            xref: Image XRef number; used for naming only when image_name
                is None
            **kwargs: Accepted for signature compatibility; not used here

        Returns:
            Whatever ``self.save_image`` returns (annotated as the image tag
            string, or None on failure)
        """
        custom_name = image_name
        if custom_name is None and xref is not None:
            custom_name = f"pdf_embedded_{xref}"

        return self.save_image(image_data, custom_name=custom_name)

    def render_page(
        self,
        page: "fitz.Page",
        alpha: bool = False,
    ) -> Optional[str]:
        """
        Render entire page as image.

        Args:
            page: PyMuPDF page object
            alpha: Include alpha channel

        Returns:
            Image tag string, or None on failure (failure is logged as a
            warning)
        """
        try:
            image_data = self._render_pixmap_png(page, alpha=alpha)

            custom_name = f"pdf_page_{page.number + 1}_full"
            return self.save_image(image_data, custom_name=custom_name)

        except Exception as e:
            self._logger.warning(f"Failed to render page: {e}")
            return None

    def extract_images_from_page(
        self,
        page: "fitz.Page",
        page_num: int,
        doc: "fitz.Document",
        processed_images: Set[int],
        table_bboxes: List[Tuple[float, float, float, float]],
        min_image_size: int = 50,
        min_image_area: int = 2500
    ) -> List[Dict[str, Any]]:
        """
        Extract images from PDF page.

        This consolidates the logic from pdf_image.py extract_images_from_page().

        An image is skipped when its XRef is already in ``processed_images``,
        it has no extractable data, it is smaller than the size/area limits,
        its position on the page cannot be found, or it lies inside a table
        bounding box. Errors on a single image are logged at debug level and
        do not stop the loop; an error outside the per-image handling is
        logged as a warning and the elements collected so far are returned.

        Args:
            page: PyMuPDF page object
            page_num: Page number (0-indexed)
            doc: PyMuPDF document object
            processed_images: Set of already processed image xrefs; mutated
                in place - each successfully saved xref is added
            table_bboxes: List of table bounding boxes to exclude
            min_image_size: Minimum image width and height
            min_image_area: Minimum image area (width * height)

        Returns:
            List of dicts with 'content', 'bbox', 'page_num' keys
        """
        # Function-local import, kept as in the original module layout.
        from xgen_doc2chunk.core.processor.pdf_helpers.pdf_utils import (
            find_image_position,
            is_inside_any_bbox,
        )

        elements = []

        try:
            image_list = page.get_images()

            for img_info in image_list:
                # The first entry of each image record is the XRef number.
                xref = img_info[0]

                if xref in processed_images:
                    continue

                try:
                    base_image = doc.extract_image(xref)
                    if not base_image:
                        continue

                    image_bytes = base_image.get("image")
                    # Same guard as process_xref_image(): never hand missing
                    # or empty data to save_image().
                    if not image_bytes:
                        continue

                    width = base_image.get("width", 0)
                    height = base_image.get("height", 0)

                    if width < min_image_size or height < min_image_size:
                        continue
                    if width * height < min_image_area:
                        continue

                    img_bbox = find_image_position(page, xref)
                    if img_bbox is None:
                        continue

                    # presumably 0.7 is the overlap ratio above which the
                    # image counts as part of a table - verify in pdf_utils
                    if is_inside_any_bbox(img_bbox, table_bboxes, threshold=0.7):
                        continue

                    # Use format-specific process_image method
                    image_tag = self.process_image(image_bytes, xref=xref, page_num=page_num)

                    if image_tag:
                        processed_images.add(xref)
                        elements.append({
                            'content': f'\n{image_tag}\n',
                            'bbox': img_bbox,
                            'page_num': page_num
                        })

                except Exception as e:
                    logger.debug(f"[PDF] Error extracting image xref={xref}: {e}")
                    continue

        except Exception as e:
            logger.warning(f"[PDF] Error extracting images: {e}")

        return elements
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
__all__ = ["PDFImageProcessor"]
|