xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,242 @@
1
+ # hwpx_helper/hwpx_section.py
2
+ """
3
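+ HWPX section parsing
4
+
5
+ Parses the section XML of an HWPX document and extracts text, tables, images, and charts.
6
+
7
+ Table handling:
8
+ - HWPXTableExtractor: converts an hp:tbl element into TableData
9
+ - HWPXTableProcessor: renders TableData as HTML/Markdown/Text
10
+
11
+ Chart handling:
12
+ - chart_callback is invoked whenever an hp:chart element is found
13
+ - Charts are inserted in original document order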
14
+ """
15
+ import logging
16
+ import xml.etree.ElementTree as ET
17
+ import zipfile
18
+ from typing import Dict, Set, Optional, Callable
19
+
20
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_constants import HWPX_NAMESPACES
21
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_table_extractor import (
22
+ HWPXTableExtractor,
23
+ )
24
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_table_processor import (
25
+ HWPXTableProcessor,
26
+ )
27
+
28
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
29
+
30
+ logger = logging.getLogger("document-processor")
31
+
32
+ # Module-level instances (lazy initialized)
33
+ _table_extractor: Optional[HWPXTableExtractor] = None
34
+ _table_processor: Optional[HWPXTableProcessor] = None
35
+
36
+
37
def _get_table_extractor() -> HWPXTableExtractor:
    """Return the shared module-level table extractor, building it on first use."""
    global _table_extractor
    if _table_extractor is not None:
        return _table_extractor
    # First call: create the instance and cache it for subsequent callers.
    _table_extractor = HWPXTableExtractor()
    return _table_extractor
43
+
44
+
45
def _get_table_processor() -> HWPXTableProcessor:
    """Return the shared module-level table processor, building it on first use."""
    global _table_processor
    if _table_processor is not None:
        return _table_processor
    # First call: create the instance and cache it for subsequent callers.
    _table_processor = HWPXTableProcessor()
    return _table_processor
51
+
52
+
53
def _process_table(table_element: ET.Element, ns: Dict[str, str]) -> str:
    """Render one table element as a formatted string.

    The element is first converted to table data by the shared
    HWPXTableExtractor and then formatted by the shared HWPXTableProcessor.

    Args:
        table_element: The hp:tbl XML element to render.
        ns: Namespace prefix mapping used for the XML lookups.

    Returns:
        The formatted table, or an empty string when extraction yields
        nothing.
    """
    table_extractor = _get_table_extractor()
    table_formatter = _get_table_processor()

    extracted = table_extractor.extract_table(table_element, ns)
    if not extracted:
        return ""
    return table_formatter.format_table(extracted)
73
+
74
+
75
def _collect_run_parts(
    run: ET.Element,
    ns: Dict[str, str],
    zf: Optional[zipfile.ZipFile],
    bin_item_map: Optional[Dict[str, str]],
    processed_images: Optional[Set[str]],
    image_processor: Optional[ImageProcessor],
    chart_callback: Optional[Callable[[str], str]],
) -> list:
    """Return the output fragments of one hp:run, in document order."""
    hp = f"{{{ns['hp']}}}"
    # The hc prefix is only needed for images nested inside hp:ctrl.
    hc_pic_tag = f"{{{ns['hc']}}}pic" if 'hc' in ns else None
    images_enabled = bool(zf and bin_item_map)
    parts = []

    def add_table(table_elem: ET.Element) -> None:
        table_html = _process_table(table_elem, ns)
        if table_html:
            parts.append(f"\n{table_html}\n")

    def add_image(pic_elem: ET.Element) -> None:
        if not images_enabled:
            return
        image_text = _process_inline_image(
            pic_elem, zf, bin_item_map, processed_images, image_processor
        )
        if image_text:
            parts.append(image_text)

    # Walk every child instead of find()-ing one element per kind, so a run
    # holding several text/table/image nodes is fully captured and keeps
    # the order the nodes have in the source XML.
    for child in run:
        tag = child.tag
        if tag == hp + 't':
            # itertext() also yields text that follows inline child
            # elements of hp:t (their tails), which .text alone would drop.
            text = "".join(child.itertext())
            if text:
                parts.append(text)
        elif tag == hp + 'tbl':
            add_table(child)
        elif tag == hp + 'switch':
            # Chart reference: hp:switch > hp:case > hp:chart
            if not chart_callback:
                continue
            chart = child.find('hp:case/hp:chart', ns)
            if chart is None:
                continue
            chart_id_ref = chart.get('chartIDRef')
            if chart_id_ref:
                chart_text = chart_callback(chart_id_ref)
                if chart_text:
                    parts.append(f"\n{chart_text}\n")
        elif tag == hp + 'pic':
            # Image placed directly inside the run.
            add_image(child)
        elif tag == hp + 'ctrl':
            # A control may wrap a table or an hc:pic image.
            for inner in child:
                if inner.tag == hp + 'tbl':
                    add_table(inner)
                elif hc_pic_tag is not None and inner.tag == hc_pic_tag:
                    add_image(inner)

    return parts


def parse_hwpx_section(
    xml_content: bytes,
    zf: Optional[zipfile.ZipFile] = None,
    bin_item_map: Optional[Dict[str, str]] = None,
    processed_images: Optional[Set[str]] = None,
    image_processor: Optional[ImageProcessor] = None,
    chart_callback: Optional[Callable[[str], str]] = None
) -> str:
    """
    Parse one HWPX section XML document into text.

    Paragraphs, tables, inline images, and charts are emitted in the order
    they appear in the source document.

    HWPX structure handled here:
    - <hs:sec> -> <hp:p> (top-level paragraphs)
    - <hp:p> -> <hp:run> -> <hp:t> (Text)
    - <hp:p> -> <hp:run> -> <hp:tbl> (Table)
    - <hp:p> -> <hp:run> -> <hp:ctrl> -> <hc:pic> (Image)
    - <hp:p> -> <hp:run> -> <hp:switch> -> <hp:case> -> <hp:chart> (Chart)
    - <hp:p> -> <hp:run> -> <hp:pic> (Direct Image)

    Args:
        xml_content: Raw bytes of the section XML.
        zf: ZipFile object used to read image data; images are skipped
            when it is missing.
        bin_item_map: Mapping of BinItem ID -> file path; images are
            skipped when it is missing or empty.
        processed_images: Set that receives the archive path of every
            image emitted here.
        image_processor: Image processor instance used to save images.
        chart_callback: Callback invoked for each chart reference. It
            receives the chartIDRef (e.g. "Chart/chart1.xml") and returns
            the formatted chart text.

    Returns:
        The extracted text, one line per non-empty paragraph, or an empty
        string if parsing fails for any reason (the error is logged).
    """
    try:
        root = ET.fromstring(xml_content)
        ns = HWPX_NAMESPACES

        text_parts = []

        # Only direct hp:p children are handled here; paragraphs nested in
        # tables are handled by the table extractor.
        for p in root.findall('hp:p', ns):
            p_text = []
            for run in p.findall('hp:run', ns):
                p_text.extend(
                    _collect_run_parts(
                        run, ns, zf, bin_item_map, processed_images,
                        image_processor, chart_callback,
                    )
                )

            if p_text:
                text_parts.append("".join(p_text))

        return "\n".join(text_parts)

    except Exception as e:
        # Best-effort contract: a broken section yields no text rather
        # than aborting the whole document.
        logger.error(f"Error parsing HWPX XML: {e}")
        return ""
179
+
180
+
181
def _process_inline_image(
    pic: ET.Element,
    zf: zipfile.ZipFile,
    bin_item_map: Dict[str, str],
    processed_images: Optional[Set[str]],
    image_processor: ImageProcessor
) -> str:
    """
    Resolve an inline picture element to image bytes and save it.

    HWPX image structure:
    - <hp:pic> or <hc:pic>
    - <hc:img binaryItemIDRef="image3">

    Args:
        pic: The hp:pic or hc:pic element.
        zf: ZipFile object the image bytes are read from.
        bin_item_map: Mapping of BinItem ID -> file path.
        processed_images: Set that receives the archive path of the saved
            image; may be None.
        image_processor: Image processor instance; its save_image() result
            is used as the image tag.

    Returns:
        The image tag wrapped in newlines, or an empty string when the
        image cannot be resolved, read, or saved.
    """
    ns = HWPX_NAMESPACES

    try:
        # Preferred source: binaryItemIDRef on the nested hc:img element.
        img_elem = pic.find('hc:img', ns)
        bin_item_id = (
            img_elem.get('binaryItemIDRef') if img_elem is not None else None
        )
        if not bin_item_id:
            # Fallback: BinItem attribute on the picture itself. Also used
            # when hc:img exists but carries no reference.
            bin_item_id = pic.get('BinItem')

        if not bin_item_id or bin_item_id not in bin_item_map:
            return ""

        img_path = bin_item_map[bin_item_id]

        # The mapped path may be relative to "Contents/". getinfo() is a
        # direct lookup, unlike rebuilding and scanning namelist().
        full_path = None
        for candidate in (img_path, f"Contents/{img_path}"):
            try:
                zf.getinfo(candidate)
            except KeyError:
                continue
            full_path = candidate
            break

        if full_path is None:
            return ""

        image_data = zf.read(full_path)

        image_tag = image_processor.save_image(image_data)
        if image_tag:
            if processed_images is not None:
                processed_images.add(full_path)
            return f"\n{image_tag}\n"

    except Exception as e:
        # Best effort: a single unreadable image must not abort parsing.
        logger.warning(f"Failed to process inline image: {e}")

    return ""
@@ -0,0 +1,462 @@
1
+ # xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py
2
+ """
3
+ HWPX Table Extractor
4
+
5
+ Extracts tables from HWPX documents using the BaseTableExtractor interface.
6
+ Converts HWPX table elements to TableData objects for further processing.
7
+
8
+ ================================================================================
9
+ EXTRACTION APPROACH: Streaming Processing (요소 단위 실시간 처리) - APPROACH 2
10
+ ================================================================================
11
+
12
+ HWPX uses the Streaming Processing approach because:
13
+ - Tables are explicit <hp:tbl> XML elements
14
+ - Tables can be processed one-by-one during document traversal
15
+ - Preserves natural document order
16
+ - Memory efficient for large documents
17
+
18
+ External Interface: extract_table(element, context) -> Optional[TableData]
19
+ - Called from hwpx_section.py during section element traversal
20
+ - Each <hp:tbl> element is passed to extract_table()
21
+ - Returns TableData or None
22
+ - ALL internal processing is encapsulated within this single method
23
+
24
+ ================================================================================
25
+ APPROACH 2 Pure Implementation:
26
+ ================================================================================
27
+ Per table_extractor.py structure, APPROACH 2 exposes ONLY extract_table().
28
+ All sub-functions are private and called only from within extract_table().
29
+
30
+ External (Public):
31
+ extract_table(element, context) → Optional[TableData]
32
+
33
+ Internal (Private) - All called from extract_table():
34
+ _parse_cell_position() - Extract cellAddr (row/col position)
35
+ _parse_cell_span() - Extract cellSpan (rowspan/colspan)
36
+ _extract_cell_content() - Cell content extraction (including nested tables)
37
+ _build_cell_grid() - Build grid from cells
38
+
39
+ ================================================================================
40
+ Key Features:
41
+ - Full support for rowspan/colspan (hp:cellSpan)
42
+ - Grid-based cell positioning (hp:cellAddr)
43
+ - Nested table support (recursive processing)
44
+ - Container table detection (1x1 tables)
45
+
46
+ HWPX Table XML Structure:
47
+ - hp:tbl rowCnt="N" colCnt="M": Table element with row/col count attributes
48
+ - hp:tr: Table row
49
+ - hp:tc: Table cell
50
+ - hp:cellAddr colAddr="X" rowAddr="Y": Cell position in grid
51
+ - hp:cellSpan colSpan="N" rowSpan="M": Merge information
52
+ - hp:subList/hp:p/hp:run/hp:t: Cell text content
53
+ """
54
+ import logging
55
+ import traceback
56
+ import xml.etree.ElementTree as ET
57
+ from typing import Any, Dict, List, Optional, Tuple
58
+
59
+ from xgen_doc2chunk.core.functions.table_extractor import (
60
+ BaseTableExtractor,
61
+ TableCell,
62
+ TableData,
63
+ TableExtractorConfig,
64
+ )
65
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_constants import HWPX_NAMESPACES
66
+
67
+ logger = logging.getLogger("document-processor")
68
+
69
+
70
+ class HWPXTableExtractor(BaseTableExtractor):
71
+ """
72
+ HWPX-specific table extractor implementation.
73
+
74
+ Uses STREAMING PROCESSING approach (APPROACH 2 - 요소 단위 실시간 처리).
75
+
76
+ Extracts tables from HWPX documents and converts them to TableData objects.
77
+ Supports complex table structures including merged cells (rowspan/colspan).
78
+
79
+ ============================================================================
80
+ External Interface (Public):
81
+ ============================================================================
82
+ extract_table(element, context) -> Optional[TableData]
83
+
84
+ This is the ONLY public method for table extraction.
85
+ All other methods are private and called internally from extract_table().
86
+
87
+ ============================================================================
88
+ Usage:
89
+ ============================================================================
90
+ extractor = HWPXTableExtractor()
91
+
92
+ # Streaming approach (APPROACH 2):
93
+ for p in root.findall('hp:p', ns):
94
+ for run in p.findall('hp:run', ns):
95
+ table = run.find('hp:tbl', ns)
96
+ if table is not None:
97
+ table_data = extractor.extract_table(table, ns)
98
+ if table_data:
99
+ process(table_data)
100
+ """
101
+
102
def __init__(self, config: Optional[TableExtractorConfig] = None):
    """Initialize the HWPX table extractor.

    Args:
        config: Table extraction configuration; forwarded unchanged to the
            BaseTableExtractor initializer.
    """
    super().__init__(config)
109
+
110
def supports_format(self, format_type: str) -> bool:
    """Tell whether this extractor handles the given format.

    Args:
        format_type: Format identifier; compared case-insensitively.

    Returns:
        True only when the identifier is 'hwpx'.
    """
    normalized = format_type.lower()
    return normalized == 'hwpx'
120
+
121
+ # ==========================================================================
122
+ # STREAMING PROCESSING - APPROACH 2 (요소 단위 실시간 처리)
123
+ # ==========================================================================
124
+ #
125
+ # HWPX uses APPROACH 2, so extract_table() is the only method exposed externally.
126
+ # All helper functions are called only from within extract_table().
127
+ #
128
+ # ==========================================================================
129
+
130
def extract_table(
    self,
    element: Any,
    context: Any = None
) -> Optional[TableData]:
    """Extract a single table from a <hp:tbl> XML element.

    This is the single public entry point for HWPX table extraction.
    Grid building, merged-cell handling, and cell content extraction all
    happen inside this call.

    Args:
        element: <hp:tbl> XML element (ElementTree Element).
        context: Namespace dictionary for XML lookups. Anything that is
            not a dict falls back to HWPX_NAMESPACES.

    Returns:
        A TableData object, or None when the table has no cells or
        extraction fails (the error is logged).
    """
    try:
        # Use provided namespace mapping or the default.
        ns = context if isinstance(context, dict) else HWPX_NAMESPACES

        def declared_count(attr: str) -> int:
            # A missing, malformed, or negative count is treated as
            # "not declared" (0) so the size is derived from the cells
            # instead of discarding the whole table.
            try:
                return max(int(element.get(attr, 0)), 0)
            except (ValueError, TypeError):
                return 0

        # ------------------------------------------------------------
        # Step 1: Declared table dimensions
        # ------------------------------------------------------------
        total_rows = declared_count('rowCnt')
        total_cols = declared_count('colCnt')

        # ------------------------------------------------------------
        # Step 2: Cell grid keyed by (row, col)
        # ------------------------------------------------------------
        grid, max_row, max_col = self._build_cell_grid(element, ns)
        if not grid:
            return None

        # Derive dimensions from the grid when they were not declared.
        if total_rows == 0:
            total_rows = max_row + 1 if max_row >= 0 else 0
        if total_cols == 0:
            total_cols = max_col + 1 if max_col >= 0 else 0

        # ------------------------------------------------------------
        # Step 3: Positions covered by a merge (origin cell excluded)
        # ------------------------------------------------------------
        skip_map = set()
        for (row_addr, col_addr), cell_info in grid.items():
            for rs in range(cell_info['rowspan']):
                for cs in range(cell_info['colspan']):
                    if rs == 0 and cs == 0:
                        continue
                    skip_map.add((row_addr + rs, col_addr + cs))

        # ------------------------------------------------------------
        # Step 4: TableCell rows
        # ------------------------------------------------------------
        # Positions that are neither in the grid nor covered by a merge
        # become empty 1x1 cells.
        empty_cell = {'text': "", 'rowspan': 1, 'colspan': 1}
        table_rows: List[List[TableCell]] = []

        for r in range(total_rows):
            row_cells: List[TableCell] = []

            for c in range(total_cols):
                if (r, c) in skip_map:
                    continue

                cell_info = grid.get((r, c), empty_cell)
                row_cells.append(
                    TableCell(
                        content=cell_info['text'],
                        row_span=cell_info['rowspan'],
                        col_span=cell_info['colspan'],
                        is_header=(r == 0 and self.config.include_header_row),
                        row_index=r,
                        col_index=c,
                        nested_table=None  # Nested tables are embedded in content
                    )
                )

            # Rows fully covered by merges produce no cells and are omitted.
            if row_cells:
                table_rows.append(row_cells)

        # ------------------------------------------------------------
        # Step 5: Assemble TableData
        # ------------------------------------------------------------
        actual_rows = len(table_rows)

        return TableData(
            rows=table_rows,
            num_rows=actual_rows,
            num_cols=total_cols,
            has_header=self.config.include_header_row and actual_rows > 0,
            start_offset=0,
            end_offset=0,
            source_format='hwpx',
            metadata={
                'original_row_cnt': total_rows,
                'original_col_cnt': total_cols,
            },
            col_widths_percent=[]  # No column widths are extracted here
        )

    except Exception as e:
        self.logger.error(f"Error extracting table from HWPX element: {e}")
        self.logger.debug(traceback.format_exc())
        return None
271
+
272
+ # ==========================================================================
273
+ # Private Helper Methods (Called internally from extract_table)
274
+ # ==========================================================================
275
+
276
+ def _build_cell_grid(
277
+ self,
278
+ table_elem: ET.Element,
279
+ ns: Dict[str, str]
280
+ ) -> Tuple[Dict[Tuple[int, int], Dict], int, int]:
281
+ """Build a grid of cells from the table element.
282
+
283
+ Parses all cells and builds a dictionary mapping (row, col) positions
284
+ to cell information including text, rowspan, and colspan.
285
+
286
+ Args:
287
+ table_elem: <hp:tbl> XML element
288
+ ns: Namespace dictionary
289
+
290
+ Returns:
291
+ Tuple of (grid, max_row, max_col) where:
292
+ - grid: Dict mapping (row_addr, col_addr) -> {text, rowspan, colspan}
293
+ - max_row: Maximum row index found
294
+ - max_col: Maximum column index found
295
+ """
296
+ grid = {}
297
+ max_row = -1
298
+ max_col = -1
299
+
300
+ for tr in table_elem.findall('hp:tr', ns):
301
+ for tc in tr.findall('hp:tc', ns):
302
+ # Parse cell position
303
+ row_addr, col_addr = self._parse_cell_position(tc, ns)
304
+
305
+ # Parse cell span
306
+ rowspan, colspan = self._parse_cell_span(tc, ns)
307
+
308
+ # Extract cell content (including nested tables)
309
+ cell_text = self._extract_cell_content(tc, ns)
310
+
311
+ # Store in grid
312
+ grid[(row_addr, col_addr)] = {
313
+ 'text': cell_text,
314
+ 'rowspan': rowspan,
315
+ 'colspan': colspan,
316
+ }
317
+
318
+ max_row = max(max_row, row_addr)
319
+ max_col = max(max_col, col_addr)
320
+
321
+ return grid, max_row, max_col
322
+
323
+ def _parse_cell_position(
324
+ self,
325
+ tc: ET.Element,
326
+ ns: Dict[str, str]
327
+ ) -> Tuple[int, int]:
328
+ """Parse cell position from hp:cellAddr element.
329
+
330
+ Args:
331
+ tc: <hp:tc> cell element
332
+ ns: Namespace dictionary
333
+
334
+ Returns:
335
+ Tuple of (row_addr, col_addr)
336
+ """
337
+ row_addr = 0
338
+ col_addr = 0
339
+
340
+ cell_addr = tc.find('hp:cellAddr', ns)
341
+ if cell_addr is not None:
342
+ try:
343
+ col_addr = int(cell_addr.get('colAddr', 0))
344
+ except (ValueError, TypeError):
345
+ col_addr = 0
346
+ try:
347
+ row_addr = int(cell_addr.get('rowAddr', 0))
348
+ except (ValueError, TypeError):
349
+ row_addr = 0
350
+
351
+ return row_addr, col_addr
352
+
353
+ def _parse_cell_span(
354
+ self,
355
+ tc: ET.Element,
356
+ ns: Dict[str, str]
357
+ ) -> Tuple[int, int]:
358
+ """Parse cell span from hp:cellSpan element.
359
+
360
+ Args:
361
+ tc: <hp:tc> cell element
362
+ ns: Namespace dictionary
363
+
364
+ Returns:
365
+ Tuple of (rowspan, colspan)
366
+ """
367
+ rowspan = 1
368
+ colspan = 1
369
+
370
+ cell_span = tc.find('hp:cellSpan', ns)
371
+ if cell_span is not None:
372
+ try:
373
+ colspan = int(cell_span.get('colSpan', 1))
374
+ except (ValueError, TypeError):
375
+ colspan = 1
376
+ try:
377
+ rowspan = int(cell_span.get('rowSpan', 1))
378
+ except (ValueError, TypeError):
379
+ rowspan = 1
380
+
381
+ return rowspan, colspan
382
+
383
+ def _extract_cell_content(
384
+ self,
385
+ tc: ET.Element,
386
+ ns: Dict[str, str]
387
+ ) -> str:
388
+ """Extract cell content including text and nested tables.
389
+
390
+ Recursively processes nested tables and returns them as embedded content.
391
+
392
+ Args:
393
+ tc: <hp:tc> cell element
394
+ ns: Namespace dictionary
395
+
396
+ Returns:
397
+ Cell content as string (nested tables converted to text)
398
+ """
399
+ content_parts = []
400
+
401
+ sublist = tc.find('hp:subList', ns)
402
+ if sublist is not None:
403
+ for p in sublist.findall('hp:p', ns):
404
+ para_parts = []
405
+
406
+ for run in p.findall('hp:run', ns):
407
+ # Extract text content
408
+ t = run.find('hp:t', ns)
409
+ if t is not None and t.text:
410
+ para_parts.append(t.text)
411
+
412
+ # Handle nested table (recursive call)
413
+ nested_table = run.find('hp:tbl', ns)
414
+ if nested_table is not None:
415
+ nested_data = self.extract_table(nested_table, ns)
416
+ if nested_data:
417
+ # Convert nested table to simple text representation
418
+ nested_text = self._nested_table_to_text(nested_data)
419
+ if nested_text:
420
+ para_parts.append(nested_text)
421
+
422
+ if para_parts:
423
+ content_parts.append("".join(para_parts))
424
+
425
+ return " ".join(content_parts).strip()
426
+
427
+ def _nested_table_to_text(self, table_data: TableData) -> str:
428
+ """Convert a nested TableData to simple text representation.
429
+
430
+ For nested tables, we convert to a simple text format to avoid
431
+ deeply nested HTML structures.
432
+
433
+ Args:
434
+ table_data: TableData of the nested table
435
+
436
+ Returns:
437
+ Simple text representation of the table
438
+ """
439
+ if not table_data or not table_data.rows:
440
+ return ""
441
+
442
+ lines = []
443
+ for row in table_data.rows:
444
+ row_texts = [cell.content for cell in row if cell.content]
445
+ if row_texts:
446
+ lines.append(" | ".join(row_texts))
447
+
448
+ return "\n".join(lines) if lines else ""
449
+
450
+
451
def create_hwpx_table_extractor(
    config: Optional[TableExtractorConfig] = None
) -> HWPXTableExtractor:
    """Build a new HWPXTableExtractor.

    Args:
        config: Table extraction configuration passed to the constructor.

    Returns:
        A freshly constructed HWPXTableExtractor.
    """
    extractor = HWPXTableExtractor(config)
    return extractor