xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,205 @@
1
+ # xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py
2
+ """
3
+ HWP 테이블 파싱 유틸리티
4
+
5
+ HWP 5.0 OLE 파일에서 테이블을 파싱하여 HTML로 변환합니다.
6
+ - parse_table: 테이블 컨트롤을 파싱하여 HTML 또는 리스트로 변환
7
+ - build_table_grid: 셀 정보를 그리드로 구성
8
+ - render_table_html: 그리드를 HTML 테이블로 렌더링
9
+ """
10
+ import struct
11
+ import logging
12
+ from typing import Dict, Callable, Optional, Any, Set
13
+
14
+ import olefile
15
+
16
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_constants import (
17
+ HWPTAG_TABLE,
18
+ HWPTAG_LIST_HEADER,
19
+ )
20
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_record import HwpRecord
21
+
22
+ logger = logging.getLogger("document-processor")
23
+
24
+
25
def parse_table(
    ctrl_header: HwpRecord,
    traverse_callback: Callable[[HwpRecord, Any, Dict, Set], str],
    ole: olefile.OleFileIO = None,
    bin_data_map: Dict = None,
    processed_images: Optional[Set[str]] = None
) -> str:
    """
    Parse an HWP table control into text or HTML.

    The output shape depends on the row/column counts read from the
    HWPTAG_TABLE child record:
    - 1x1 table: only the cell text is returned (container table)
    - single-column table (1 column, several rows): the non-empty cell
      texts joined by a blank line
    - multi-column table (2+ columns): an HTML table

    Args:
        ctrl_header: CTRL_HEADER record that contains the table
        traverse_callback: tree-traversal callback used to extract cell content
        ole: OLE file object, forwarded to the callback
        bin_data_map: BinData mapping, forwarded to the callback
        processed_images: set of already processed image paths, forwarded
            to the callback

    Returns:
        HTML table, blank-line separated text, or plain text. An empty
        string is returned when there is no HWPTAG_TABLE child or its
        payload is shorter than 8 bytes; the literal
        "[Table Extraction Failed]" is returned when any exception occurs.
    """
    try:
        # Locate the first child record that carries the table definition.
        table_rec = None
        for child in ctrl_header.children:
            if child.tag_id == HWPTAG_TABLE:
                table_rec = child
                break

        # Row and column counts are read from payload bytes 4..8.
        if not table_rec or len(table_rec.payload) < 8:
            return ""

        row_cnt, col_cnt = struct.unpack('<HH', table_rec.payload[4:8])

        grid = build_table_grid(
            ctrl_header,
            traverse_callback,
            ole,
            bin_data_map,
            processed_images
        )

        if col_cnt == 1:
            if row_cnt == 1:
                # Container table: hand back the single cell's text as-is.
                return grid[(0, 0)]['text'] if (0, 0) in grid else ""

            # Single column: flatten the rows into blank-line separated text.
            column_texts = [
                grid[(r, 0)]['text'] for r in range(row_cnt) if (r, 0) in grid
            ]
            return "\n\n".join(t for t in column_texts if t)

        # Two or more columns: render as an HTML table.
        return render_table_html(grid, row_cnt, col_cnt)

    except Exception as e:
        logger.warning(f"Failed to parse HWP table: {e}")
        return "[Table Extraction Failed]"
94
+
95
+
96
def build_table_grid(
    ctrl_header: HwpRecord,
    traverse_callback: Callable[[HwpRecord, Any, Dict, Set], str],
    ole: olefile.OleFileIO = None,
    bin_data_map: Dict = None,
    processed_images: Optional[Set[str]] = None
) -> Dict:
    """
    Collect the table's cells into a position-keyed grid.

    Every HWPTAG_LIST_HEADER child of ``ctrl_header`` is treated as one
    cell. Cells whose payload is shorter than 16 bytes are ignored. If two
    cells report the same position, the later one wins.

    Args:
        ctrl_header: CTRL_HEADER record that contains the table
        traverse_callback: tree-traversal callback used to extract cell content
        ole: OLE file object, forwarded to the callback
        bin_data_map: BinData mapping, forwarded to the callback
        processed_images: set of already processed image paths, forwarded
            to the callback

    Returns:
        Dict mapping (row_idx, col_idx) -> {'text', 'rowspan', 'colspan'}
    """
    grid = {}

    for cell in ctrl_header.children:
        if cell.tag_id != HWPTAG_LIST_HEADER or len(cell.payload) < 16:
            continue

        payload = cell.payload
        para_count = struct.unpack('<H', payload[0:2])[0]
        # Bytes 8..16 hold four little-endian uint16 values in this order.
        col_idx, row_idx, col_span, row_span = struct.unpack_from('<4H', payload, 8)

        # Cell content comes from the cell's own children when it has any,
        # otherwise from the next `para_count` sibling records.
        if cell.children:
            content_records = cell.children
        else:
            content_records = list(cell.get_next_siblings(para_count))

        fragments = [
            traverse_callback(record, ole, bin_data_map, processed_images)
            for record in content_records
        ]

        grid[(row_idx, col_idx)] = {
            'text': "".join(fragments).strip(),
            'rowspan': row_span,
            'colspan': col_span
        }

    return grid
151
+
152
+
153
def render_table_html(grid: Dict, row_cnt: int, col_cnt: int) -> str:
    """
    Render a cell grid as an HTML table.

    Positions missing from ``grid`` become empty ``<td></td>`` cells, and
    positions covered by another cell's rowspan/colspan are not emitted.
    Cell text is inserted verbatim (it is not HTML-escaped).

    Args:
        grid: Dict mapping (row_idx, col_idx) -> {'text', 'rowspan', 'colspan'}
        row_cnt: number of table rows
        col_cnt: number of table columns

    Returns:
        HTML table string, one tag per line
    """
    lines = ["<table border='1'>"]
    covered = set()  # positions swallowed by a spanning cell

    for row in range(row_cnt):
        lines.append("<tr>")
        for col in range(col_cnt):
            pos = (row, col)
            if pos in covered:
                continue

            if pos not in grid:
                lines.append("<td></td>")
                continue

            entry = grid[pos]
            rowspan = entry['rowspan']
            colspan = entry['colspan']
            text = entry['text']

            # Only spans larger than one need an explicit attribute.
            attrs = "".join(
                f" {attr_name}='{span}'"
                for attr_name, span in (("rowspan", rowspan), ("colspan", colspan))
                if span > 1
            )
            lines.append(f"<td{attrs}>{text}</td>")

            # Mark every other position inside the span so it is skipped later.
            covered.update(
                (row + d_row, col + d_col)
                for d_row in range(rowspan)
                for d_col in range(colspan)
                if d_row or d_col
            )
        lines.append("</tr>")

    lines.append("</table>")
    return "\n".join(lines)
199
+
200
+
201
+ __all__ = [
202
+ 'parse_table',
203
+ 'build_table_grid',
204
+ 'render_table_html',
205
+ ]
@@ -0,0 +1,191 @@
1
+ # xgen_doc2chunk/core/processor/hwpx_handler.py
2
+ """
3
+ HWPX Handler - HWPX (ZIP/XML based) Document Processor
4
+
5
+ Class-based handler for HWPX files inheriting from BaseHandler.
6
+ """
7
+ import io
8
+ import logging
9
+ from typing import Dict, Any, Set, TYPE_CHECKING
10
+
11
+ from xgen_doc2chunk.core.processor.base_handler import BaseHandler
12
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor
13
+ from xgen_doc2chunk.core.processor.hwpx_helper import (
14
+ parse_bin_item_map,
15
+ parse_hwpx_section,
16
+ )
17
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_chart_extractor import HWPXChartExtractor
18
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_metadata import HWPXMetadataExtractor
19
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_image_processor import HWPXImageProcessor
20
+
21
+ if TYPE_CHECKING:
22
+ from xgen_doc2chunk.core.document_processor import CurrentFile
23
+ from xgen_doc2chunk.core.functions.chart_extractor import ChartData
24
+
25
+ logger = logging.getLogger("document-processor")
26
+
27
+
28
class HWPXHandler(BaseHandler):
    """HWPX (ZIP/XML based Korean document) Processing Handler Class.

    Supplies the HWPX-specific file converter, preprocessor, chart
    extractor, metadata extractor and image processor, and implements
    :meth:`extract_text` on top of them.
    """

    # Body sections are stored in the archive as "Contents/section<N>.xml".
    _SECTION_PREFIX = "Contents/section"
    _SECTION_SUFFIX = ".xml"

    def _create_file_converter(self):
        """Create HWPX-specific file converter."""
        from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_file_converter import HWPXFileConverter
        return HWPXFileConverter()

    def _create_preprocessor(self):
        """Create HWPX-specific preprocessor."""
        from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_preprocessor import HWPXPreprocessor
        return HWPXPreprocessor()

    def _create_chart_extractor(self) -> BaseChartExtractor:
        """Create HWPX-specific chart extractor."""
        return HWPXChartExtractor(self._chart_processor)

    def _create_metadata_extractor(self):
        """Create HWPX-specific metadata extractor."""
        return HWPXMetadataExtractor()

    def _create_format_image_processor(self):
        """Create HWPX-specific image processor.

        The new processor reuses the directory, tag prefix/suffix and
        storage backend of the handler's existing image processor.
        """
        return HWPXImageProcessor(
            directory_path=self._image_processor.config.directory_path,
            tag_prefix=self._image_processor.config.tag_prefix,
            tag_suffix=self._image_processor.config.tag_suffix,
            storage_backend=self._image_processor.storage_backend,
        )

    @classmethod
    def _section_index(cls, name: str) -> "int | None":
        """Return the numeric index of a section entry name, or None.

        ``name`` is expected to start with ``Contents/section`` and end
        with ``.xml``; None is returned when the part in between is not an
        integer (for example ``Contents/sections/extra.xml``).
        """
        stem = name[len(cls._SECTION_PREFIX):-len(cls._SECTION_SUFFIX)]
        try:
            return int(stem)
        except ValueError:
            return None

    def extract_text(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True,
        **kwargs
    ) -> str:
        """
        Extract text from HWPX file.

        Args:
            current_file: CurrentFile dict containing file info and binary data
            extract_metadata: Whether to extract metadata
            **kwargs: Additional options

        Returns:
            Extracted text. An empty string when the data fails the file
            converter's validation, or an "Error processing HWPX file: ..."
            message when processing raises.
        """
        file_path = current_file.get("file_path", "unknown")
        file_data = current_file.get("file_data", b"")
        text_content = []

        # Check if it's a valid ZIP file using file_converter.validate()
        if not self.file_converter.validate(file_data):
            self.logger.error("Not a valid Zip file: %s", file_path)
            return ""

        try:
            # Get file stream
            file_stream = self.get_file_stream(current_file)

            # Pre-extract all charts using ChartExtractor with refs
            # This creates a mapping from chartIDRef -> ChartData
            chart_map = self.chart_extractor.extract_all_with_refs(file_stream)
            processed_chart_refs = set()

            def chart_callback(chart_id_ref: str) -> str:
                """Callback to get chart content by chartIDRef."""
                # chartIDRef is like "Chart/chart1.xml"
                if chart_id_ref in processed_chart_refs:
                    return ""  # Already processed

                chart_data = chart_map.get(chart_id_ref)
                if chart_data:
                    processed_chart_refs.add(chart_id_ref)
                    return self._format_chart_data(chart_data)
                return ""

            # Step 1: Convert binary to ZipFile using file_converter
            zf = self.file_converter.convert(file_data, file_stream)

            # Step 2: Preprocess - clean_content is the TRUE SOURCE
            preprocessed = self.preprocess(zf)
            zf = preprocessed.clean_content  # TRUE SOURCE

            try:
                if extract_metadata:
                    metadata_text = self.extract_and_format_metadata(zf)
                    if metadata_text:
                        text_content.append(metadata_text)
                        text_content.append("")

                bin_item_map = parse_bin_item_map(zf)

                # Collect section entries with their numeric index. Entries
                # that match the prefix/suffix but carry no integer index
                # are skipped instead of aborting the whole extraction.
                indexed_sections = []
                for name in zf.namelist():
                    if not (
                        name.startswith(self._SECTION_PREFIX)
                        and name.endswith(self._SECTION_SUFFIX)
                    ):
                        continue
                    index = self._section_index(name)
                    if index is None:
                        self.logger.warning(
                            "Skipping HWPX entry with non-numeric section index: %s",
                            name,
                        )
                        continue
                    indexed_sections.append((index, name))

                # Sort on the index only so equal indices keep archive order.
                indexed_sections.sort(key=lambda item: item[0])
                section_files = [name for _, name in indexed_sections]

                processed_images: Set[str] = set()

                for sec_file in section_files:
                    with zf.open(sec_file) as f:
                        xml_content = f.read()
                    section_text = parse_hwpx_section(
                        xml_content,
                        zf,
                        bin_item_map,
                        processed_images,
                        image_processor=self.format_image_processor,
                        chart_callback=chart_callback
                    )
                    text_content.append(section_text)

                # Use format_image_processor directly to pick up images
                # that were not referenced inline by any section.
                image_processor = self.format_image_processor
                if hasattr(image_processor, 'get_remaining_images'):
                    remaining_images = image_processor.get_remaining_images(zf, processed_images)
                    if remaining_images and hasattr(image_processor, 'process_images'):
                        image_text = image_processor.process_images(zf, remaining_images)
                        if image_text:
                            text_content.append("\n\n=== Extracted Images (Not Inline) ===\n")
                            text_content.append(image_text)

            finally:
                # Close ZipFile using file_converter
                self.file_converter.close(zf)

        except Exception as e:  # noqa: BLE001
            self.logger.error("Error processing HWPX file: %s", e)
            return f"Error processing HWPX file: {str(e)}"

        return "\n".join(text_content)

    def _is_valid_zip(self, file_stream: io.BytesIO) -> bool:
        """Check if stream is a valid ZIP file.

        Compares the first four bytes with the ZIP local-file-header
        signature and rewinds the stream to position 0 afterwards. Any
        exception while reading yields False.
        """
        try:
            file_stream.seek(0)
            header = file_stream.read(4)
            file_stream.seek(0)
            return header == b'PK\x03\x04'
        except Exception:  # noqa: BLE001
            return False

    def _format_chart_data(self, chart_data: "ChartData") -> str:
        """Format ChartData using ChartProcessor.

        Returns an empty string when ``chart_data`` is not a ChartData
        instance. Otherwise the full chart formatting is used when the
        chart reports data, and the type/title-only fallback when it does not.
        """
        from xgen_doc2chunk.core.functions.chart_extractor import ChartData

        if not isinstance(chart_data, ChartData):
            return ""

        if chart_data.has_data():
            return self.chart_processor.format_chart_data(
                chart_type=chart_data.chart_type,
                title=chart_data.title,
                categories=chart_data.categories,
                series=chart_data.series
            )
        else:
            return self.chart_processor.format_chart_fallback(
                chart_type=chart_data.chart_type,
                title=chart_data.title
            )
191
+
@@ -0,0 +1,85 @@
1
+ # hwpx_helper/__init__.py
2
+ """
3
+ HWPX Helper 모듈
4
+
5
+ hwpx_handler.py에서 사용하는 기능적 구성요소들을 모듈화하여 제공합니다.
6
+
7
+ 모듈 구성:
8
+ - hwpx_constants: 상수 및 네임스페이스 정의
9
+ - hwpx_metadata: 메타데이터 추출 및 BinItem 매핑
10
+ - hwpx_table_extractor: 테이블 추출 (HWPXTableExtractor) - BaseTableExtractor 구현
11
+ - hwpx_table_processor: 테이블 포맷팅 (HWPXTableProcessor) - TableProcessor 확장
12
+ - hwpx_section: 섹션 XML 파싱
13
+ - hwpx_image_processor: 이미지 처리 및 업로드
14
+ - hwpx_chart_extractor: 차트 추출 (ChartExtractor)
15
+ """
16
+
17
+ # Constants
18
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_constants import (
19
+ HWPX_NAMESPACES,
20
+ OPF_NAMESPACES,
21
+ SUPPORTED_IMAGE_EXTENSIONS,
22
+ SKIP_IMAGE_EXTENSIONS,
23
+ HEADER_FILE_PATHS,
24
+ HPF_PATH,
25
+ )
26
+
27
+ # Metadata
28
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_metadata import (
29
+ HWPXMetadataExtractor,
30
+ parse_bin_item_map,
31
+ )
32
+
33
+ # Table Extractor (NEW - BaseTableExtractor implementation)
34
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_table_extractor import (
35
+ HWPXTableExtractor,
36
+ create_hwpx_table_extractor,
37
+ )
38
+
39
+ # Table Processor (NEW - TableProcessor extension)
40
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_table_processor import (
41
+ HWPXTableProcessor,
42
+ HWPXTableProcessorConfig,
43
+ create_hwpx_table_processor,
44
+ )
45
+
46
+ # Section
47
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_section import (
48
+ parse_hwpx_section,
49
+ )
50
+
51
+ # Image Processor (replaces hwpx_image.py utility functions)
52
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_image_processor import (
53
+ HWPXImageProcessor,
54
+ )
55
+
56
+ # Chart Extractor
57
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_chart_extractor import (
58
+ HWPXChartExtractor,
59
+ )
60
+
61
+ __all__ = [
62
+ # Constants
63
+ "HWPX_NAMESPACES",
64
+ "OPF_NAMESPACES",
65
+ "SUPPORTED_IMAGE_EXTENSIONS",
66
+ "SKIP_IMAGE_EXTENSIONS",
67
+ "HEADER_FILE_PATHS",
68
+ "HPF_PATH",
69
+ # Metadata
70
+ "HWPXMetadataExtractor",
71
+ "parse_bin_item_map",
72
+ # Table Extractor (NEW)
73
+ "HWPXTableExtractor",
74
+ "create_hwpx_table_extractor",
75
+ # Table Processor (NEW)
76
+ "HWPXTableProcessor",
77
+ "HWPXTableProcessorConfig",
78
+ "create_hwpx_table_processor",
79
+ # Section
80
+ "parse_hwpx_section",
81
+ # Image Processor
82
+ "HWPXImageProcessor",
83
+ # Chart Extractor
84
+ "HWPXChartExtractor",
85
+ ]