xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py
|
|
2
|
+
"""
|
|
3
|
+
HWP 테이블 파싱 유틸리티
|
|
4
|
+
|
|
5
|
+
HWP 5.0 OLE 파일에서 테이블을 파싱하여 HTML로 변환합니다.
|
|
6
|
+
- parse_table: 테이블 컨트롤을 파싱하여 HTML 또는 리스트로 변환
|
|
7
|
+
- build_table_grid: 셀 정보를 그리드로 구성
|
|
8
|
+
- render_table_html: 그리드를 HTML 테이블로 렌더링
|
|
9
|
+
"""
|
|
10
|
+
import struct
|
|
11
|
+
import logging
|
|
12
|
+
from typing import Dict, Callable, Optional, Any, Set
|
|
13
|
+
|
|
14
|
+
import olefile
|
|
15
|
+
|
|
16
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_constants import (
|
|
17
|
+
HWPTAG_TABLE,
|
|
18
|
+
HWPTAG_LIST_HEADER,
|
|
19
|
+
)
|
|
20
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_record import HwpRecord
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger("document-processor")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def parse_table(
    ctrl_header: HwpRecord,
    traverse_callback: Callable[[HwpRecord, Any, Dict, Set], str],
    ole: olefile.OleFileIO = None,
    bin_data_map: Dict = None,
    processed_images: Optional[Set[str]] = None
) -> str:
    """
    Parse an HWP table control into text or HTML.

    The output shape depends on the row/column counts read from the table
    record:
    - 1x1 table: only the cell content is returned (container table).
    - Single-column table (1 column, several rows): the non-empty cell
      contents joined by blank lines.
    - Multi-column table (2+ columns): an HTML table.

    Args:
        ctrl_header: CTRL_HEADER record whose children contain the table.
        traverse_callback: Tree-traversal callback used to extract cell content.
        ole: OLE file object, passed through to build_table_grid.
        bin_data_map: BinData mapping, passed through to build_table_grid.
        processed_images: Set of processed image paths, passed through to
            build_table_grid.

    Returns:
        HTML table, blank-line separated text, or plain cell text. An empty
        string when no usable table record exists, and the literal
        "[Table Extraction Failed]" when any exception is raised.
    """
    try:
        # Locate the first child record tagged as a table.
        table_rec = None
        for child in ctrl_header.children:
            if child.tag_id == HWPTAG_TABLE:
                table_rec = child
                break

        # The row/column counts live at payload offsets 4..8.
        if not table_rec or len(table_rec.payload) < 8:
            return ""

        row_cnt, col_cnt = struct.unpack('<HH', table_rec.payload[4:8])

        # Build the cell grid
        grid = build_table_grid(
            ctrl_header,
            traverse_callback,
            ole,
            bin_data_map,
            processed_images
        )

        # 1x1 table -> return only the cell content (container table)
        if row_cnt == 1 and col_cnt == 1:
            return grid[(0, 0)]['text'] if (0, 0) in grid else ""

        # Single-column table -> cell contents separated by blank lines
        if col_cnt == 1:
            text_items = [
                grid[(r, 0)]['text']
                for r in range(row_cnt)
                if (r, 0) in grid and grid[(r, 0)]['text']
            ]
            # Joining an empty list yields "", matching the "no content" case.
            return "\n\n".join(text_items)

        # 2+ columns -> HTML table
        return render_table_html(grid, row_cnt, col_cnt)

    except Exception as e:
        logger.warning(f"Failed to parse HWP table: {e}")
        return "[Table Extraction Failed]"
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def build_table_grid(
    ctrl_header: HwpRecord,
    traverse_callback: Callable[[HwpRecord, Any, Dict, Set], str],
    ole: olefile.OleFileIO = None,
    bin_data_map: Dict = None,
    processed_images: Optional[Set[str]] = None
) -> Dict:
    """
    Collect the table's cells into a grid keyed by position.

    Args:
        ctrl_header: CTRL_HEADER record whose children contain the table.
        traverse_callback: Tree-traversal callback used to extract cell content.
        ole: OLE file object, forwarded to the callback.
        bin_data_map: BinData mapping, forwarded to the callback.
        processed_images: Set of processed image paths, forwarded to the callback.

    Returns:
        Dict mapping (row_idx, col_idx) -> {'text', 'rowspan', 'colspan'}.
        Cells whose payload is shorter than 16 bytes are skipped; a later cell
        with the same position overwrites an earlier one.
    """
    grid = {}

    # Every LIST_HEADER child of the control header is treated as one cell.
    list_headers = [
        rec for rec in ctrl_header.children
        if rec.tag_id == HWPTAG_LIST_HEADER
    ]

    for cell in list_headers:
        payload = cell.payload
        if len(payload) < 16:
            continue

        # Offset 0: paragraph count; offsets 8..16: column, row, col span, row span.
        (para_count,) = struct.unpack('<H', payload[0:2])
        col_idx, row_idx, col_span, row_span = struct.unpack('<HHHH', payload[8:16])

        # Prefer the cell's own children; otherwise fall back to the next
        # `para_count` sibling records.
        if cell.children:
            nodes = cell.children
        else:
            nodes = list(cell.get_next_siblings(para_count))

        pieces = [
            traverse_callback(node, ole, bin_data_map, processed_images)
            for node in nodes
        ]

        grid[(row_idx, col_idx)] = {
            'text': "".join(pieces).strip(),
            'rowspan': row_span,
            'colspan': col_span
        }

    return grid
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def render_table_html(grid: Dict, row_cnt: int, col_cnt: int) -> str:
    """
    Render a cell grid as an HTML table.

    Args:
        grid: Dict mapping (row_idx, col_idx) -> {'text', 'rowspan', 'colspan'}.
        row_cnt: Number of table rows.
        col_cnt: Number of table columns.

    Returns:
        HTML table string, one tag per line. Positions covered by another
        cell's span emit nothing; positions missing from the grid emit an
        empty <td>. Cell text is inserted as-is (not escaped).
    """
    lines = ["<table border='1'>"]
    covered = set()  # positions swallowed by an earlier cell's row/col span

    for r in range(row_cnt):
        lines.append("<tr>")
        for c in range(col_cnt):
            position = (r, c)
            if position in covered:
                continue

            if position not in grid:
                lines.append("<td></td>")
                continue

            cell = grid[position]
            rowspan = cell['rowspan']
            colspan = cell['colspan']
            text = cell['text']

            # Span attributes are only written when they differ from the default of 1.
            attr_parts = []
            if rowspan > 1:
                attr_parts.append(f" rowspan='{rowspan}'")
            if colspan > 1:
                attr_parts.append(f" colspan='{colspan}'")
            attr = "".join(attr_parts)

            lines.append(f"<td{attr}>{text}</td>")

            # Mark every position under this cell's span except its own origin.
            covered.update(
                (r + dr, c + dc)
                for dr in range(rowspan)
                for dc in range(colspan)
                if dr or dc
            )
        lines.append("</tr>")

    lines.append("</table>")
    return "\n".join(lines)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
__all__ = [
|
|
202
|
+
'parse_table',
|
|
203
|
+
'build_table_grid',
|
|
204
|
+
'render_table_html',
|
|
205
|
+
]
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/hwpx_handler.py
|
|
2
|
+
"""
|
|
3
|
+
HWPX Handler - HWPX (ZIP/XML based) Document Processor
|
|
4
|
+
|
|
5
|
+
Class-based handler for HWPX files inheriting from BaseHandler.
|
|
6
|
+
"""
|
|
7
|
+
import io
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Dict, Any, Set, TYPE_CHECKING
|
|
10
|
+
|
|
11
|
+
from xgen_doc2chunk.core.processor.base_handler import BaseHandler
|
|
12
|
+
from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor
|
|
13
|
+
from xgen_doc2chunk.core.processor.hwpx_helper import (
|
|
14
|
+
parse_bin_item_map,
|
|
15
|
+
parse_hwpx_section,
|
|
16
|
+
)
|
|
17
|
+
from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_chart_extractor import HWPXChartExtractor
|
|
18
|
+
from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_metadata import HWPXMetadataExtractor
|
|
19
|
+
from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_image_processor import HWPXImageProcessor
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from xgen_doc2chunk.core.document_processor import CurrentFile
|
|
23
|
+
from xgen_doc2chunk.core.functions.chart_extractor import ChartData
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger("document-processor")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class HWPXHandler(BaseHandler):
    """
    Handler for HWPX (ZIP/XML based Korean document) files.

    The ``_create_*`` factory methods build the HWPX-specific converter,
    preprocessor, chart extractor, metadata extractor and image processor;
    ``extract_text`` walks the ``Contents/section<N>.xml`` parts of the
    archive and returns their text.
    """

    def _create_file_converter(self):
        """Create HWPX-specific file converter."""
        from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_file_converter import HWPXFileConverter
        return HWPXFileConverter()

    def _create_preprocessor(self):
        """Create HWPX-specific preprocessor."""
        from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_preprocessor import HWPXPreprocessor
        return HWPXPreprocessor()

    def _create_chart_extractor(self) -> BaseChartExtractor:
        """Create HWPX-specific chart extractor."""
        return HWPXChartExtractor(self._chart_processor)

    def _create_metadata_extractor(self):
        """Create HWPX-specific metadata extractor."""
        return HWPXMetadataExtractor()

    def _create_format_image_processor(self):
        """Create HWPX-specific image processor, reusing the base image processor's settings."""
        return HWPXImageProcessor(
            directory_path=self._image_processor.config.directory_path,
            tag_prefix=self._image_processor.config.tag_prefix,
            tag_suffix=self._image_processor.config.tag_suffix,
            storage_backend=self._image_processor.storage_backend,
        )

    @staticmethod
    def _ordered_section_files(names) -> list:
        """
        Return the ``Contents/section<N>.xml`` entries ordered by N.

        Entries whose index part is not a decimal number (for example
        ``Contents/section.xml`` or ``Contents/section0_bak.xml``) are left
        out instead of raising, so one stray archive member cannot abort the
        whole extraction.
        """
        prefix, suffix = "Contents/section", ".xml"
        indexed = []
        for name in names:
            if not (name.startswith(prefix) and name.endswith(suffix)):
                continue
            index_text = name[len(prefix):len(name) - len(suffix)]
            # isdecimal() accepts exactly the strings int() can parse here.
            if index_text.isdecimal():
                indexed.append((int(index_text), name))
        # Sort on the index only so equal indices keep archive order.
        indexed.sort(key=lambda pair: pair[0])
        return [name for _, name in indexed]

    def extract_text(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True,
        **kwargs
    ) -> str:
        """
        Extract text from HWPX file.

        Args:
            current_file: CurrentFile dict containing file info and binary data
            extract_metadata: Whether to extract metadata
            **kwargs: Additional options

        Returns:
            Extracted text. An empty string when the data fails the file
            converter's validation, and an "Error processing HWPX file: ..."
            message when any exception is raised during processing.
        """
        file_path = current_file.get("file_path", "unknown")
        file_data = current_file.get("file_data", b"")
        text_content = []

        # Check if it's a valid ZIP file using file_converter.validate()
        if not self.file_converter.validate(file_data):
            self.logger.error("Not a valid Zip file: %s", file_path)
            return ""

        try:
            # Get file stream
            file_stream = self.get_file_stream(current_file)

            # Pre-extract all charts using ChartExtractor with refs
            # This creates a mapping from chartIDRef -> ChartData
            chart_map = self.chart_extractor.extract_all_with_refs(file_stream)
            processed_chart_refs = set()

            def chart_callback(chart_id_ref: str) -> str:
                """Callback to get chart content by chartIDRef."""
                # chartIDRef is like "Chart/chart1.xml"
                if chart_id_ref in processed_chart_refs:
                    return ""  # Already processed

                chart_data = chart_map.get(chart_id_ref)
                if chart_data:
                    processed_chart_refs.add(chart_id_ref)
                    return self._format_chart_data(chart_data)
                return ""

            # Step 1: Convert binary to ZipFile using file_converter
            zf = self.file_converter.convert(file_data, file_stream)

            try:
                # Step 2: Preprocess - clean_content is the TRUE SOURCE.
                # Done inside the try so the archive is closed even when
                # preprocessing itself fails.
                preprocessed = self.preprocess(zf)
                zf = preprocessed.clean_content  # TRUE SOURCE

                if extract_metadata:
                    metadata_text = self.extract_and_format_metadata(zf)
                    if metadata_text:
                        text_content.append(metadata_text)
                        text_content.append("")

                bin_item_map = parse_bin_item_map(zf)

                section_files = self._ordered_section_files(zf.namelist())

                processed_images: Set[str] = set()

                for sec_file in section_files:
                    with zf.open(sec_file) as f:
                        xml_content = f.read()
                        section_text = parse_hwpx_section(
                            xml_content,
                            zf,
                            bin_item_map,
                            processed_images,
                            image_processor=self.format_image_processor,
                            chart_callback=chart_callback
                        )
                        text_content.append(section_text)

                # Use format_image_processor directly
                image_processor = self.format_image_processor
                if hasattr(image_processor, 'get_remaining_images'):
                    remaining_images = image_processor.get_remaining_images(zf, processed_images)
                    if remaining_images and hasattr(image_processor, 'process_images'):
                        image_text = image_processor.process_images(zf, remaining_images)
                        if image_text:
                            text_content.append("\n\n=== Extracted Images (Not Inline) ===\n")
                            text_content.append(image_text)

            finally:
                # Close ZipFile using file_converter
                self.file_converter.close(zf)

        except Exception as e:  # noqa: BLE001
            self.logger.error("Error processing HWPX file: %s", e)
            return f"Error processing HWPX file: {str(e)}"

        return "\n".join(text_content)

    def _is_valid_zip(self, file_stream: io.BytesIO) -> bool:
        """
        Check if stream is a valid ZIP file.

        Compares the first four bytes against the ZIP local-file-header
        signature and rewinds the stream afterwards. Returns False if the
        stream cannot be read. Not called from within this class.
        """
        try:
            file_stream.seek(0)
            header = file_stream.read(4)
            file_stream.seek(0)
            return header == b'PK\x03\x04'
        except Exception:  # noqa: BLE001
            return False

    def _format_chart_data(self, chart_data: "ChartData") -> str:
        """
        Format ChartData using ChartProcessor.

        Returns an empty string for anything that is not a ChartData
        instance; otherwise the full chart rendering when the chart has
        data, or the fallback rendering (type and title only) when it does not.
        """
        from xgen_doc2chunk.core.functions.chart_extractor import ChartData

        if not isinstance(chart_data, ChartData):
            return ""

        if chart_data.has_data():
            return self.chart_processor.format_chart_data(
                chart_type=chart_data.chart_type,
                title=chart_data.title,
                categories=chart_data.categories,
                series=chart_data.series
            )
        else:
            return self.chart_processor.format_chart_fallback(
                chart_type=chart_data.chart_type,
                title=chart_data.title
            )
|
|
191
|
+
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# hwpx_helper/__init__.py
|
|
2
|
+
"""
|
|
3
|
+
HWPX Helper 모듈
|
|
4
|
+
|
|
5
|
+
Provides, as separate modules, the functional components used by hwpx_handler.py.
|
|
6
|
+
|
|
7
|
+
모듈 구성:
|
|
8
|
+
- hwpx_constants: 상수 및 네임스페이스 정의
|
|
9
|
+
- hwpx_metadata: 메타데이터 추출 및 BinItem 매핑
|
|
10
|
+
- hwpx_table_extractor: 테이블 추출 (HWPXTableExtractor) - BaseTableExtractor 구현
|
|
11
|
+
- hwpx_table_processor: 테이블 포맷팅 (HWPXTableProcessor) - TableProcessor 확장
|
|
12
|
+
- hwpx_section: 섹션 XML 파싱
|
|
13
|
+
- hwpx_image_processor: 이미지 처리 및 업로드
|
|
14
|
+
- hwpx_chart_extractor: 차트 추출 (ChartExtractor)
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
# Constants
|
|
18
|
+
from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_constants import (
|
|
19
|
+
HWPX_NAMESPACES,
|
|
20
|
+
OPF_NAMESPACES,
|
|
21
|
+
SUPPORTED_IMAGE_EXTENSIONS,
|
|
22
|
+
SKIP_IMAGE_EXTENSIONS,
|
|
23
|
+
HEADER_FILE_PATHS,
|
|
24
|
+
HPF_PATH,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# Metadata
|
|
28
|
+
from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_metadata import (
|
|
29
|
+
HWPXMetadataExtractor,
|
|
30
|
+
parse_bin_item_map,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Table Extractor (NEW - BaseTableExtractor implementation)
|
|
34
|
+
from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_table_extractor import (
|
|
35
|
+
HWPXTableExtractor,
|
|
36
|
+
create_hwpx_table_extractor,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# Table Processor (NEW - TableProcessor extension)
|
|
40
|
+
from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_table_processor import (
|
|
41
|
+
HWPXTableProcessor,
|
|
42
|
+
HWPXTableProcessorConfig,
|
|
43
|
+
create_hwpx_table_processor,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
# Section
|
|
47
|
+
from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_section import (
|
|
48
|
+
parse_hwpx_section,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
# Image Processor (replaces hwpx_image.py utility functions)
|
|
52
|
+
from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_image_processor import (
|
|
53
|
+
HWPXImageProcessor,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# Chart Extractor
|
|
57
|
+
from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_chart_extractor import (
|
|
58
|
+
HWPXChartExtractor,
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
__all__ = [
|
|
62
|
+
# Constants
|
|
63
|
+
"HWPX_NAMESPACES",
|
|
64
|
+
"OPF_NAMESPACES",
|
|
65
|
+
"SUPPORTED_IMAGE_EXTENSIONS",
|
|
66
|
+
"SKIP_IMAGE_EXTENSIONS",
|
|
67
|
+
"HEADER_FILE_PATHS",
|
|
68
|
+
"HPF_PATH",
|
|
69
|
+
# Metadata
|
|
70
|
+
"HWPXMetadataExtractor",
|
|
71
|
+
"parse_bin_item_map",
|
|
72
|
+
# Table Extractor (NEW)
|
|
73
|
+
"HWPXTableExtractor",
|
|
74
|
+
"create_hwpx_table_extractor",
|
|
75
|
+
# Table Processor (NEW)
|
|
76
|
+
"HWPXTableProcessor",
|
|
77
|
+
"HWPXTableProcessorConfig",
|
|
78
|
+
"create_hwpx_table_processor",
|
|
79
|
+
# Section
|
|
80
|
+
"parse_hwpx_section",
|
|
81
|
+
# Image Processor
|
|
82
|
+
"HWPXImageProcessor",
|
|
83
|
+
# Chart Extractor
|
|
84
|
+
"HWPXChartExtractor",
|
|
85
|
+
]
|