xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py
|
|
2
|
+
"""
|
|
3
|
+
HWPX Image Processor
|
|
4
|
+
|
|
5
|
+
Provides HWPX-specific image processing that inherits from ImageProcessor.
|
|
6
|
+
Handles images in HWPX (ZIP/XML based) Korean document format.
|
|
7
|
+
|
|
8
|
+
This class consolidates all HWPX image extraction logic including:
|
|
9
|
+
- BinData images extraction from ZIP
|
|
10
|
+
- Remaining images processing
|
|
11
|
+
- Image filtering by extension
|
|
12
|
+
"""
|
|
13
|
+
import logging
|
|
14
|
+
import os
|
|
15
|
+
from typing import Any, Dict, List, Optional, Set, TYPE_CHECKING
|
|
16
|
+
import zipfile
|
|
17
|
+
|
|
18
|
+
from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
|
|
19
|
+
from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger("xgen_doc2chunk.image_processor.hwpx")
|
|
22
|
+
|
|
23
|
+
# Supported image extensions
|
|
24
|
+
SUPPORTED_IMAGE_EXTENSIONS = frozenset(['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'])
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class HWPXImageProcessor(ImageProcessor):
    """
    Image processor specialised for HWPX (ZIP/XML) documents.

    Extends ImageProcessor with helpers that read image bytes out of an open
    HWPX archive and hand them to ``self.save_image``.

    Covers:
    - Images referenced through a bin_item_map (BinItem ID -> archive path)
    - Embedded image bytes supplied directly by the caller
    - Leftover ``BinData/`` entries that were not processed elsewhere

    Example:
        processor = HWPXImageProcessor()

        # Process image from ZIP
        with zipfile.ZipFile(file_stream, 'r') as zf:
            tag = processor.process_from_zip(zf, "BinData/image1.png")
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
    ):
        """
        Create the processor; every argument is forwarded to ImageProcessor.

        Args:
            directory_path: Directory where images are saved
            tag_prefix: Prefix of the generated image tag
            tag_suffix: Suffix of the generated image tag
            storage_backend: Backend used to persist images (optional)
        """
        super().__init__(
            directory_path=directory_path,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix,
            storage_backend=storage_backend,
        )

    def process_image(
        self,
        image_data: bytes,
        bin_item_id: Optional[str] = None,
        image_path: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Save HWPX image bytes under a name derived from its ID or path.

        The name is ``hwpx_<bin_item_id>`` when an ID is given, otherwise
        ``hwpx_<file stem>`` taken from ``image_path``; with neither, no
        custom name is passed on.

        Args:
            image_data: Raw image bytes
            bin_item_id: BinItem ID from HWPX (takes precedence for naming)
            image_path: Path of the image inside the ZIP (fallback for naming)
            **kwargs: Accepted for signature compatibility; not used here

        Returns:
            Whatever ``self.save_image`` returns (annotated as an image tag
            string, or None on failure)
        """
        if bin_item_id is not None:
            custom_name = f"hwpx_{bin_item_id}"
        elif image_path is not None:
            # split/rsplit return the whole string when the separator is
            # absent, so no explicit "contains" check is needed.
            filename = image_path.split('/')[-1]
            stem = filename.rsplit('.', 1)[0]
            custom_name = f"hwpx_{stem}"
        else:
            custom_name = None

        return self.save_image(image_data, custom_name=custom_name)

    def process_from_zip(
        self,
        zf: zipfile.ZipFile,
        image_path: str,
        bin_item_id: Optional[str] = None,
    ) -> Optional[str]:
        """
        Read one entry from the HWPX archive and process it as an image.

        Args:
            zf: Open ZipFile object
            image_path: Path of the image inside the ZIP
            bin_item_id: BinItem ID, if known

        Returns:
            Result of ``process_image``; None when reading or processing
            raised (the error is logged as a warning, not propagated)
        """
        try:
            with zf.open(image_path) as entry:
                payload = entry.read()

            return self.process_image(
                payload,
                bin_item_id=bin_item_id,
                image_path=image_path,
            )
        except Exception as e:
            # NOTE(review): self._logger is not defined in this class;
            # presumably provided by ImageProcessor - confirm.
            self._logger.warning(f"Failed to process image from ZIP {image_path}: {e}")
            return None

    def process_embedded_image(
        self,
        image_data: bytes,
        image_name: Optional[str] = None,
        bin_item_id: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Save embedded HWPX image bytes.

        ``image_name`` is used as the custom name as-is; only when it is None
        and an ID is available does the name become ``hwpx_embed_<bin_item_id>``.

        Args:
            image_data: Raw image bytes
            image_name: Original image filename
            bin_item_id: BinItem ID
            **kwargs: Accepted for signature compatibility; not used here

        Returns:
            Whatever ``self.save_image`` returns (annotated as an image tag
            string, or None on failure)
        """
        if image_name is None and bin_item_id is not None:
            custom_name = f"hwpx_embed_{bin_item_id}"
        else:
            custom_name = image_name

        return self.save_image(image_data, custom_name=custom_name)

    def process_bindata_images(
        self,
        zf: zipfile.ZipFile,
        bin_item_map: Dict[str, str],
        exclude_processed: Optional[Set[str]] = None,
    ) -> Dict[str, str]:
        """
        Process every entry of a bin_item_map through ``process_from_zip``.

        Args:
            zf: Open ZipFile object
            bin_item_map: Mapping of bin_item_id to path inside the ZIP
            exclude_processed: IDs to skip (treated as empty when None)

        Returns:
            Mapping of bin_item_id to image tag; entries whose processing
            yielded a falsy result are left out
        """
        skipped = exclude_processed or set()
        tags = {}

        for bin_id, image_path in bin_item_map.items():
            if bin_id not in skipped:
                tag = self.process_from_zip(zf, image_path, bin_item_id=bin_id)
                if tag:
                    tags[bin_id] = tag

        return tags

    def process_images(
        self,
        zf: zipfile.ZipFile,
        image_files: List[str],
    ) -> str:
        """
        Process the given archive paths that carry a supported image extension.

        Args:
            zf: Open ZipFile object
            image_files: Archive paths to consider

        Returns:
            The resulting image tags joined by a blank line ("\\n\\n");
            an empty string when nothing was produced
        """
        tags = []

        for img_path in image_files:
            # Extension check is case-insensitive.
            if os.path.splitext(img_path)[1].lower() not in SUPPORTED_IMAGE_EXTENSIONS:
                continue
            tag = self.process_from_zip(zf, img_path)
            if tag:
                tags.append(tag)

        return "\n\n".join(tags)

    def get_remaining_images(
        self,
        zf: zipfile.ZipFile,
        processed_images: Set[str],
    ) -> List[str]:
        """
        List ``BinData/`` file entries that are not in ``processed_images``.

        Directory entries (names ending in "/") are ignored. No extension
        filtering happens here.

        Args:
            zf: Open ZipFile object
            processed_images: Archive paths that were already processed

        Returns:
            Unprocessed archive paths, in archive order
        """
        return [
            name for name in zf.namelist()
            if name.startswith("BinData/")
            and not name.endswith("/")
            and name not in processed_images
        ]

    def process_remaining_images(
        self,
        zf: zipfile.ZipFile,
        processed_images: Set[str],
    ) -> str:
        """
        Process every ``BinData/`` entry that has not been processed yet.

        Args:
            zf: Open ZipFile object
            processed_images: Archive paths that were already processed

        Returns:
            Image tags joined by a blank line, as produced by ``process_images``
        """
        return self.process_images(
            zf,
            self.get_remaining_images(zf, processed_images),
        )
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
__all__ = ["HWPXImageProcessor"]
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py
|
|
2
|
+
"""
|
|
3
|
+
HWPX Metadata Extraction Module
|
|
4
|
+
|
|
5
|
+
Provides HWPXMetadataExtractor class for extracting metadata from HWPX files.
|
|
6
|
+
Implements BaseMetadataExtractor interface.
|
|
7
|
+
|
|
8
|
+
Metadata locations in HWPX:
|
|
9
|
+
- version.xml: Document version information
|
|
10
|
+
- META-INF/manifest.xml: Package manifest (media type of the root entry)
|
|
11
|
+
- Contents/header.xml: Document properties (author, date, etc.)
|
|
12
|
+
|
|
13
|
+
Note: HWPX is a Korean-native document format, so Korean metadata labels
|
|
14
|
+
are preserved in output for proper display.
|
|
15
|
+
"""
|
|
16
|
+
import logging
|
|
17
|
+
import xml.etree.ElementTree as ET
|
|
18
|
+
import zipfile
|
|
19
|
+
from typing import Any, Dict
|
|
20
|
+
|
|
21
|
+
from xgen_doc2chunk.core.functions.metadata_extractor import (
|
|
22
|
+
BaseMetadataExtractor,
|
|
23
|
+
DocumentMetadata,
|
|
24
|
+
)
|
|
25
|
+
from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_constants import HWPX_NAMESPACES, HEADER_FILE_PATHS
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger("document-processor")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class HWPXMetadataExtractor(BaseMetadataExtractor):
    """
    HWPX Metadata Extractor.

    Extracts HWPX metadata from zipfile.ZipFile objects.

    Three archive parts are consulted, each on a best-effort basis so that a
    problem in one part does not discard what the others provide:
    - the first existing path from HEADER_FILE_PATHS (children of <hh:docInfo>)
    - version.xml (root text and attributes)
    - META-INF/manifest.xml (media type of the "/" entry)

    Supported fields:
    - Standard fields: title, subject, author, keywords, comments, etc.
    - HWPX-specific: version, media_type, etc. (stored in custom fields)

    Usage:
        extractor = HWPXMetadataExtractor()
        metadata = extractor.extract(zip_file)
        text = extractor.format(metadata)
    """

    # Clark-notation namespace prefix used as a fallback for manifest attributes.
    _MANIFEST_NS = '{urn:oasis:names:tc:opendocument:xmlns:manifest:1.0}'

    def extract(self, source: zipfile.ZipFile) -> DocumentMetadata:
        """
        Extract metadata from HWPX file.

        Failures are logged as warnings and never raised; whatever could be
        read up to that point is still returned.

        Args:
            source: Open zipfile.ZipFile object

        Returns:
            DocumentMetadata instance containing extracted metadata. Keys that
            are not standard fields are placed in ``custom``.
        """
        raw_metadata: Dict[str, Any] = {}

        try:
            # namelist() builds a new list on every call; snapshot it once.
            entry_names = set(source.namelist())
        except Exception as e:
            self.logger.warning(f"Failed to extract HWPX metadata: {e}")
            entry_names = set()

        readers = (
            self._read_hwpx_header,
            self._read_hwpx_version,
            self._read_hwpx_manifest,
        )
        for reader in readers:
            # Each part is isolated: a malformed header.xml must not prevent
            # version.xml or the manifest from being read.
            try:
                reader(source, entry_names, raw_metadata)
            except Exception as e:
                self.logger.warning(f"Failed to extract HWPX metadata: {e}")

        self.logger.debug(f"Extracted HWPX metadata: {list(raw_metadata.keys())}")

        # Separate standard fields from custom fields
        # NOTE(review): header keys are lower-cased element names, so the
        # underscore-style names below only match if the XML uses those exact
        # local names - confirm against real HWPX headers.
        standard_fields = {'title', 'subject', 'author', 'keywords', 'comments',
                           'last_saved_by', 'create_time', 'last_saved_time'}
        custom_fields = {k: v for k, v in raw_metadata.items() if k not in standard_fields}

        return DocumentMetadata(
            title=raw_metadata.get('title'),
            subject=raw_metadata.get('subject'),
            author=raw_metadata.get('author'),
            keywords=raw_metadata.get('keywords'),
            comments=raw_metadata.get('comments'),
            last_saved_by=raw_metadata.get('last_saved_by'),
            create_time=raw_metadata.get('create_time'),
            last_saved_time=raw_metadata.get('last_saved_time'),
            custom=custom_fields,
        )

    @staticmethod
    def _local_name(tag: str) -> str:
        """Return an element tag without its ``{namespace}`` prefix."""
        return tag.split('}')[-1]

    @staticmethod
    def _parse_entry(source: zipfile.ZipFile, path: str) -> ET.Element:
        """Read one archive entry and parse it as XML, returning the root element."""
        # NOTE: xml.etree.ElementTree is not hardened against maliciously
        # constructed XML; the archive is assumed to be trusted here.
        with source.open(path) as f:
            return ET.fromstring(f.read())

    def _read_hwpx_header(self, source, entry_names, raw_metadata) -> None:
        """Copy text-bearing children of <hh:docInfo> from the first header file present."""
        header_path = next((p for p in HEADER_FILE_PATHS if p in entry_names), None)
        if header_path is None:
            return

        header_root = self._parse_entry(source, header_path)
        doc_info = header_root.find('.//hh:docInfo', HWPX_NAMESPACES)
        if doc_info is None:
            return

        for prop in doc_info:
            # Skip indentation-only text of container elements.
            if prop.text and prop.text.strip():
                raw_metadata[self._local_name(prop.tag).lower()] = prop.text

    def _read_hwpx_version(self, source, entry_names, raw_metadata) -> None:
        """Record the root text and every root attribute of version.xml."""
        if 'version.xml' not in entry_names:
            return

        version_root = self._parse_entry(source, 'version.xml')
        if version_root.text and version_root.text.strip():
            raw_metadata['version'] = version_root.text
        for attr, value in version_root.attrib.items():
            raw_metadata[f'version_{attr}'] = value

    def _read_hwpx_manifest(self, source, entry_names, raw_metadata) -> None:
        """Record the media type declared for the "/" entry of META-INF/manifest.xml."""
        if 'META-INF/manifest.xml' not in entry_names:
            return

        manifest_root = self._parse_entry(source, 'META-INF/manifest.xml')
        for child in manifest_root:
            if self._local_name(child.tag) != 'file-entry':
                continue
            # Attributes may be plain or namespace-qualified; try both.
            full_path = child.get('full-path', child.get(self._MANIFEST_NS + 'full-path', ''))
            if full_path != '/':
                continue
            media_type = child.get('media-type', child.get(self._MANIFEST_NS + 'media-type', ''))
            if media_type:
                raw_metadata['media_type'] = media_type
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def parse_bin_item_map(zf: zipfile.ZipFile) -> Dict[str, str]:
    """
    Build an item-ID to href mapping from the archive's content.hpf.

    Every ``opf:item`` element that has both a non-empty ``id`` and a
    non-empty ``href`` attribute contributes one entry.

    Args:
        zf: Open ZipFile object

    Returns:
        Dictionary mapping item ID to href. Empty when the HPF file is not in
        the archive; on a read/parse error a warning is logged and whatever
        was collected so far is returned.
    """
    from .hwpx_constants import HPF_PATH, OPF_NAMESPACES

    bin_item_map = {}

    try:
        if HPF_PATH not in zf.namelist():
            return bin_item_map

        with zf.open(HPF_PATH) as hpf_file:
            hpf_root = ET.fromstring(hpf_file.read())

        for item in hpf_root.findall('.//opf:item', OPF_NAMESPACES):
            item_id, href = item.get('id'), item.get('href')
            if item_id and href:
                bin_item_map[item_id] = href
    except Exception as e:
        logger.warning(f"Failed to parse content.hpf: {e}")

    return bin_item_map
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
__all__ = [
|
|
161
|
+
'HWPXMetadataExtractor',
|
|
162
|
+
'parse_bin_item_map',
|
|
163
|
+
]
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py
|
|
2
|
+
"""
|
|
3
|
+
HWPX Preprocessor - Process HWPX ZIP document after conversion.
|
|
4
|
+
|
|
5
|
+
Processing Pipeline Position:
|
|
6
|
+
1. HWPXFileConverter.convert() → zipfile.ZipFile
|
|
7
|
+
2. HWPXPreprocessor.preprocess() → PreprocessedData (THIS STEP)
|
|
8
|
+
3. HWPXMetadataExtractor.extract() → DocumentMetadata
|
|
9
|
+
4. Content extraction (sections, tables, images)
|
|
10
|
+
|
|
11
|
+
Current Implementation:
|
|
12
|
+
- Pass-through (HWPX uses zipfile object directly)
|
|
13
|
+
"""
|
|
14
|
+
import logging
|
|
15
|
+
from typing import Any, Dict
|
|
16
|
+
|
|
17
|
+
from xgen_doc2chunk.core.functions.preprocessor import (
|
|
18
|
+
BasePreprocessor,
|
|
19
|
+
PreprocessedData,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger("xgen_doc2chunk.hwpx.preprocessor")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class HWPXPreprocessor(BasePreprocessor):
    """
    HWPX ZIP Document Preprocessor.

    Currently a pass-through implementation as HWPX processing
    is handled during the content extraction phase. The only work done
    here is collecting two informational counters from the archive.
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Preprocess the converted HWPX ZIP document.

        Args:
            converted_data: zipfile.ZipFile object from HWPXFileConverter
                (any object exposing ``namelist()`` gets statistics collected)
            **kwargs: Additional options (not used here)

        Returns:
            PreprocessedData whose raw_content and clean_content are both the
            unchanged ``converted_data``. ``metadata`` holds ``file_count`` and
            ``section_count`` when they could be computed.
        """
        metadata: Dict[str, Any] = {}

        if hasattr(converted_data, 'namelist'):
            try:
                files = converted_data.namelist()
                metadata['file_count'] = len(files)
                # Check for section files
                metadata['section_count'] = sum(
                    1 for f in files
                    if 'section' in f.lower() and f.endswith('.xml')
                )
            except Exception:  # noqa: BLE001
                # The counters are informational only, so the pass-through
                # must survive - but leave a trace instead of hiding the error.
                logger.debug(
                    "HWPX preprocessor: failed to collect archive statistics",
                    exc_info=True,
                )

        logger.debug("HWPX preprocessor: pass-through, metadata=%s", metadata)

        # clean_content is the TRUE SOURCE - contains the ZipFile
        return PreprocessedData(
            raw_content=converted_data,
            clean_content=converted_data,  # TRUE SOURCE - zipfile.ZipFile
            encoding="utf-8",
            extracted_resources={},
            metadata=metadata,
        )

    def get_format_name(self) -> str:
        """Return format name."""
        return "HWPX Preprocessor"

    def validate(self, data: Any) -> bool:
        """Return True when data exposes ``namelist`` and ``open`` (ZipFile-like)."""
        return hasattr(data, 'namelist') and hasattr(data, 'open')
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
__all__ = ['HWPXPreprocessor']
|