xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,258 @@
1
+ # xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py
2
+ """
3
+ HWPX Image Processor
4
+
5
+ Provides HWPX-specific image processing that inherits from ImageProcessor.
6
+ Handles images in HWPX (ZIP/XML based) Korean document format.
7
+
8
+ This class consolidates all HWPX image extraction logic including:
9
+ - BinData images extraction from ZIP
10
+ - Remaining images processing
11
+ - Image filtering by extension
12
+ """
13
+ import logging
14
+ import os
15
+ from typing import Any, Dict, List, Optional, Set, TYPE_CHECKING
16
+ import zipfile
17
+
18
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
19
+ from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
20
+
21
+ logger = logging.getLogger("xgen_doc2chunk.image_processor.hwpx")
22
+
23
+ # Supported image extensions
24
+ SUPPORTED_IMAGE_EXTENSIONS = frozenset(['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'])
25
+
26
+
27
class HWPXImageProcessor(ImageProcessor):
    """
    Image processor for HWPX documents.

    An HWPX document is handled here as a ZIP archive. Image payloads are
    read from archive members and passed to ``save_image``; that method and
    ``self._logger`` are not defined in this class and are expected to come
    from ``ImageProcessor``.

    Covers:
    - images addressed through a BinItem id -> path mapping
    - images supplied directly as bytes ("embedded")
    - leftover members under ``BinData/`` that were not processed yet

    Example:
        processor = HWPXImageProcessor()

        with zipfile.ZipFile(file_stream, 'r') as zf:
            tag = processor.process_from_zip(zf, "BinData/image1.png")
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
    ):
        """
        Create the processor; every argument is forwarded to the parent.

        Args:
            directory_path: Directory where images are saved
            tag_prefix: Text placed before an image reference
            tag_suffix: Text placed after an image reference
            storage_backend: Backend used to persist images
        """
        base_options = {
            "directory_path": directory_path,
            "tag_prefix": tag_prefix,
            "tag_suffix": tag_suffix,
            "storage_backend": storage_backend,
        }
        super().__init__(**base_options)

    def process_image(
        self,
        image_data: bytes,
        bin_item_id: Optional[str] = None,
        image_path: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Save one HWPX image and return its tag.

        The saved name is ``hwpx_<bin_item_id>`` when an id is given;
        otherwise ``hwpx_<stem>`` where the stem is the last ``/`` component
        of ``image_path`` with its final extension removed. With neither
        argument, no custom name is passed on.

        Args:
            image_data: Raw image bytes
            bin_item_id: BinItem ID from HWPX (takes precedence)
            image_path: Member path inside the ZIP, used only for naming
            **kwargs: Accepted but not used by this method

        Returns:
            Whatever ``save_image`` returns (a tag string, or None)
        """
        if bin_item_id is not None:
            label = f"hwpx_{bin_item_id}"
        elif image_path is not None:
            # Last path component; the whole string when there is no '/'.
            leaf = image_path.rpartition('/')[2]
            # Drop only the final extension; keep the name when it has no '.'.
            stem, dot, _ = leaf.rpartition('.')
            if not dot:
                stem = leaf
            label = f"hwpx_{stem}"
        else:
            label = None

        return self.save_image(image_data, custom_name=label)

    def process_from_zip(
        self,
        zf: zipfile.ZipFile,
        image_path: str,
        bin_item_id: Optional[str] = None,
    ) -> Optional[str]:
        """
        Read one archive member and process it as an image.

        Any exception raised while reading or saving is logged as a warning
        and turned into a ``None`` result.

        Args:
            zf: Open ZipFile object
            image_path: Member path of the image inside the ZIP
            bin_item_id: BinItem ID, if known

        Returns:
            Image tag string, or None on failure
        """
        try:
            with zf.open(image_path) as member:
                payload = member.read()

            return self.process_image(
                payload,
                bin_item_id=bin_item_id,
                image_path=image_path,
            )
        except Exception as e:
            self._logger.warning(f"Failed to process image from ZIP {image_path}: {e}")
            return None

    def process_embedded_image(
        self,
        image_data: bytes,
        image_name: Optional[str] = None,
        bin_item_id: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Save an image that was supplied directly as bytes.

        ``image_name`` is used as the saved name when given; otherwise the
        name falls back to ``hwpx_embed_<bin_item_id>`` if an id is present.

        Args:
            image_data: Image bytes
            image_name: Original image filename
            bin_item_id: BinItem ID
            **kwargs: Accepted but not used by this method

        Returns:
            Whatever ``save_image`` returns (a tag string, or None)
        """
        if image_name is None and bin_item_id is not None:
            label = f"hwpx_embed_{bin_item_id}"
        else:
            label = image_name

        return self.save_image(image_data, custom_name=label)

    def process_bindata_images(
        self,
        zf: zipfile.ZipFile,
        bin_item_map: Dict[str, str],
        exclude_processed: Optional[Set[str]] = None,
    ) -> Dict[str, str]:
        """
        Process every entry of a BinItem mapping.

        Entries whose processing yields no tag are left out of the result.

        Args:
            zf: Open ZipFile object
            bin_item_map: Mapping of bin_item_id to member path
            exclude_processed: IDs to skip because they were handled already

        Returns:
            Dictionary mapping bin_item_id to image tag
        """
        skipped = exclude_processed or set()
        tags_by_id = {}

        for bin_id, member_path in bin_item_map.items():
            if bin_id not in skipped:
                tag = self.process_from_zip(zf, member_path, bin_item_id=bin_id)
                if tag:
                    tags_by_id[bin_id] = tag

        return tags_by_id

    def process_images(
        self,
        zf: zipfile.ZipFile,
        image_files: List[str],
    ) -> str:
        """
        Process the listed archive members that have a supported extension.

        The extension check is case-insensitive and uses
        ``SUPPORTED_IMAGE_EXTENSIONS``; other members are ignored.

        Args:
            zf: Open ZipFile object
            image_files: Member paths to consider

        Returns:
            The resulting image tags, separated by a blank line
        """
        tags = []

        for member_path in image_files:
            suffix = os.path.splitext(member_path)[1].lower()
            if suffix not in SUPPORTED_IMAGE_EXTENSIONS:
                continue
            tag = self.process_from_zip(zf, member_path)
            if tag:
                tags.append(tag)

        return "\n\n".join(tags)

    def get_remaining_images(
        self,
        zf: zipfile.ZipFile,
        processed_images: Set[str],
    ) -> List[str]:
        """
        List ``BinData/`` members that have not been processed.

        Directory entries (names ending in ``/``) are excluded. No extension
        filtering happens here.

        Args:
            zf: Open ZipFile object
            processed_images: Member paths that were already processed

        Returns:
            Unprocessed member paths, in archive order
        """
        return [
            name
            for name in zf.namelist()
            if name.startswith("BinData/")
            and not name.endswith("/")
            and name not in processed_images
        ]

    def process_remaining_images(
        self,
        zf: zipfile.ZipFile,
        processed_images: Set[str],
    ) -> str:
        """
        Process every ``BinData/`` image that has not been processed yet.

        Args:
            zf: Open ZipFile object
            processed_images: Member paths that were already processed

        Returns:
            The resulting image tags, separated by a blank line
        """
        return self.process_images(
            zf,
            self.get_remaining_images(zf, processed_images),
        )
256
+
257
+
258
+ __all__ = ["HWPXImageProcessor"]
@@ -0,0 +1,163 @@
1
+ # xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py
2
+ """
3
+ HWPX Metadata Extraction Module
4
+
5
+ Provides HWPXMetadataExtractor class for extracting metadata from HWPX files.
6
+ Implements BaseMetadataExtractor interface.
7
+
8
+ Metadata locations in HWPX:
9
+ - version.xml: Document version information
10
+ - META-INF/manifest.xml: Package manifest (media type of the root entry)
11
+ - Contents/header.xml: Document properties (author, date, etc.)
12
+
13
+ Note: HWPX is a Korean-native document format, so Korean metadata labels
14
+ are preserved in output for proper display.
15
+ """
16
+ import logging
17
+ import xml.etree.ElementTree as ET
18
+ import zipfile
19
+ from typing import Any, Dict
20
+
21
+ from xgen_doc2chunk.core.functions.metadata_extractor import (
22
+ BaseMetadataExtractor,
23
+ DocumentMetadata,
24
+ )
25
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_constants import HWPX_NAMESPACES, HEADER_FILE_PATHS
26
+
27
+ logger = logging.getLogger("document-processor")
28
+
29
+
30
class HWPXMetadataExtractor(BaseMetadataExtractor):
    """
    HWPX Metadata Extractor.

    Reads metadata from an open ``zipfile.ZipFile`` using three members:

    - the first path in ``HEADER_FILE_PATHS`` present in the archive:
      text-bearing children of ``<hh:docInfo>``
    - ``version.xml``: root element text and attributes
    - ``META-INF/manifest.xml``: media type of the ``/`` file entry

    Keys named title, subject, author, keywords, comments, last_saved_by,
    create_time and last_saved_time fill the matching ``DocumentMetadata``
    fields; every other key is passed through ``custom``.

    Usage:
        extractor = HWPXMetadataExtractor()
        metadata = extractor.extract(zip_file)
        text = extractor.format(metadata)
    """

    def extract(self, source: zipfile.ZipFile) -> DocumentMetadata:
        """
        Extract metadata from HWPX file.

        Extraction is best-effort: each of the three sources is read
        independently, so a missing or malformed member only costs the
        values it would have contributed. Failures are logged as warnings
        and never raised.

        Args:
            source: Open zipfile.ZipFile object

        Returns:
            DocumentMetadata instance containing extracted metadata.
        """
        raw_metadata: Dict[str, Any] = {}

        try:
            # Build the member lookup once instead of once per membership test.
            member_names = frozenset(source.namelist())
        except Exception as e:
            self.logger.warning(f"Failed to extract HWPX metadata: {e}")
            member_names = frozenset()

        readers = (
            self._read_header_properties,
            self._read_version_info,
            self._read_manifest_info,
        )
        for reader in readers:
            # One try per source: a broken header.xml must not discard the
            # version.xml / manifest values.
            try:
                reader(source, member_names, raw_metadata)
            except Exception as e:
                self.logger.warning(f"Failed to extract HWPX metadata: {e}")

        self.logger.debug(f"Extracted HWPX metadata: {list(raw_metadata.keys())}")

        # Separate standard fields from custom fields
        standard_fields = {'title', 'subject', 'author', 'keywords', 'comments',
                           'last_saved_by', 'create_time', 'last_saved_time'}
        custom_fields = {k: v for k, v in raw_metadata.items() if k not in standard_fields}

        return DocumentMetadata(
            title=raw_metadata.get('title'),
            subject=raw_metadata.get('subject'),
            author=raw_metadata.get('author'),
            keywords=raw_metadata.get('keywords'),
            comments=raw_metadata.get('comments'),
            last_saved_by=raw_metadata.get('last_saved_by'),
            create_time=raw_metadata.get('create_time'),
            last_saved_time=raw_metadata.get('last_saved_time'),
            custom=custom_fields,
        )

    @staticmethod
    def _local_name(tag: str) -> str:
        """Return an element tag without its ``{namespace}`` prefix."""
        return tag.split('}')[-1] if '}' in tag else tag

    def _read_header_properties(
        self,
        source: zipfile.ZipFile,
        member_names: frozenset,
        raw_metadata: Dict[str, Any],
    ) -> None:
        """Copy text-bearing children of <hh:docInfo> into raw_metadata."""
        for header_path in HEADER_FILE_PATHS:
            if header_path not in member_names:
                continue

            with source.open(header_path) as f:
                header_root = ET.fromstring(f.read())

            # <hh:docInfo> contains metadata
            doc_info = header_root.find('.//hh:docInfo', HWPX_NAMESPACES)
            if doc_info is not None:
                for prop in doc_info:
                    tag = self._local_name(prop.tag)
                    if prop.text:
                        raw_metadata[tag.lower()] = prop.text

            # NOTE(review): only the first header file found is consulted.
            # The flattened original does not show which block its `break`
            # belonged to; confirm this matches the intended behaviour.
            return

    def _read_version_info(
        self,
        source: zipfile.ZipFile,
        member_names: frozenset,
        raw_metadata: Dict[str, Any],
    ) -> None:
        """Record version.xml root text and attributes as version* keys."""
        if 'version.xml' not in member_names:
            return

        with source.open('version.xml') as f:
            version_root = ET.fromstring(f.read())

        if version_root.text:
            raw_metadata['version'] = version_root.text
        for attr, value in version_root.attrib.items():
            raw_metadata[f'version_{attr}'] = value

    def _read_manifest_info(
        self,
        source: zipfile.ZipFile,
        member_names: frozenset,
        raw_metadata: Dict[str, Any],
    ) -> None:
        """Record the media type declared for the '/' entry of the manifest."""
        if 'META-INF/manifest.xml' not in member_names:
            return

        with source.open('META-INF/manifest.xml') as f:
            manifest_root = ET.fromstring(f.read())

        for child in manifest_root:
            if self._local_name(child.tag) != 'file-entry':
                continue
            # Attributes may be plain or qualified with the manifest namespace.
            full_path = child.get('full-path', child.get('{urn:oasis:names:tc:opendocument:xmlns:manifest:1.0}full-path', ''))
            if full_path == '/':
                media_type = child.get('media-type', child.get('{urn:oasis:names:tc:opendocument:xmlns:manifest:1.0}media-type', ''))
                if media_type:
                    raw_metadata['media_type'] = media_type
126
+
127
+
128
def parse_bin_item_map(zf: zipfile.ZipFile) -> Dict[str, str]:
    """
    Build an item ``id`` -> ``href`` lookup from the package's content.hpf.

    Every ``opf:item`` element that has both an ``id`` and an ``href`` is
    included. If the member at ``HPF_PATH`` is absent the mapping is empty;
    if reading or parsing fails, a warning is logged and whatever was
    collected so far is returned.

    Args:
        zf: Open ZipFile object

    Returns:
        Dictionary mapping item ID to the referenced file path.
    """
    from .hwpx_constants import HPF_PATH, OPF_NAMESPACES

    id_to_href = {}

    try:
        if HPF_PATH not in zf.namelist():
            return id_to_href

        with zf.open(HPF_PATH) as hpf_file:
            hpf_root = ET.fromstring(hpf_file.read())

        for entry in hpf_root.findall('.//opf:item', OPF_NAMESPACES):
            entry_id, entry_href = entry.get('id'), entry.get('href')
            if not (entry_id and entry_href):
                continue
            id_to_href[entry_id] = entry_href

    except Exception as e:
        logger.warning(f"Failed to parse content.hpf: {e}")

    return id_to_href
158
+
159
+
160
+ __all__ = [
161
+ 'HWPXMetadataExtractor',
162
+ 'parse_bin_item_map',
163
+ ]
@@ -0,0 +1,80 @@
1
+ # xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py
2
+ """
3
+ HWPX Preprocessor - Process HWPX ZIP document after conversion.
4
+
5
+ Processing Pipeline Position:
6
+ 1. HWPXFileConverter.convert() -> zipfile.ZipFile
7
+ 2. HWPXPreprocessor.preprocess() -> PreprocessedData (THIS STEP)
8
+ 3. HWPXMetadataExtractor.extract() -> DocumentMetadata
9
+ 4. Content extraction (sections, tables, images)
10
+
11
+ Current Implementation:
12
+ - Pass-through (HWPX uses zipfile object directly)
13
+ """
14
+ import logging
15
+ from typing import Any, Dict
16
+
17
+ from xgen_doc2chunk.core.functions.preprocessor import (
18
+ BasePreprocessor,
19
+ PreprocessedData,
20
+ )
21
+
22
+ logger = logging.getLogger("xgen_doc2chunk.hwpx.preprocessor")
23
+
24
+
25
class HWPXPreprocessor(BasePreprocessor):
    """
    HWPX ZIP Document Preprocessor.

    Pass-through step: the converted object is returned unchanged as both
    ``raw_content`` and ``clean_content``. The only work done here is
    counting archive members for the ``metadata`` dictionary.
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Preprocess the converted HWPX ZIP document.

        Args:
            converted_data: zipfile.ZipFile object from HWPXFileConverter
            **kwargs: Additional options (not used by this method)

        Returns:
            PreprocessedData carrying the unchanged input, an empty
            ``extracted_resources`` mapping, and ``metadata`` holding
            ``file_count`` / ``section_count`` when the archive listing
            could be read.
        """
        metadata: Dict[str, Any] = {}

        if hasattr(converted_data, 'namelist'):
            try:
                files = converted_data.namelist()
                metadata['file_count'] = len(files)
                # A member counts as a section when its name contains
                # "section" (case-insensitive) and ends with ".xml".
                sections = [f for f in files if 'section' in f.lower() and f.endswith('.xml')]
                metadata['section_count'] = len(sections)
            except Exception:  # noqa: BLE001
                # The counts are informational only, so a failure must not
                # abort preprocessing; record it instead of hiding it.
                logger.debug(
                    "HWPX preprocessor: could not inspect archive members",
                    exc_info=True,
                )

        logger.debug("HWPX preprocessor: pass-through, metadata=%s", metadata)

        # clean_content is the TRUE SOURCE - contains the ZipFile
        return PreprocessedData(
            raw_content=converted_data,
            clean_content=converted_data,  # TRUE SOURCE - zipfile.ZipFile
            encoding="utf-8",
            extracted_resources={},
            metadata=metadata,
        )

    def get_format_name(self) -> str:
        """Return format name."""
        return "HWPX Preprocessor"

    def validate(self, data: Any) -> bool:
        """Return True when data offers the ``namelist`` and ``open`` attributes."""
        return hasattr(data, 'namelist') and hasattr(data, 'open')
78
+
79
+
80
+ __all__ = ['HWPXPreprocessor']