xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# csv_helper/csv_parser.py
|
|
2
|
+
"""
|
|
3
|
+
CSV 파싱 및 분석
|
|
4
|
+
|
|
5
|
+
CSV 파일의 구분자 감지, 파싱, 헤더 감지 기능을 제공합니다.
|
|
6
|
+
"""
|
|
7
|
+
import csv
|
|
8
|
+
import io
|
|
9
|
+
import logging
|
|
10
|
+
import re
|
|
11
|
+
from typing import List
|
|
12
|
+
|
|
13
|
+
from xgen_doc2chunk.core.processor.csv_helper.csv_constants import DELIMITER_CANDIDATES, MAX_ROWS, MAX_COLS
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger("document-processor")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def detect_delimiter(content: str) -> str:
    """Auto-detect the delimiter of a CSV document.

    Detection strategy:
    1. Try ``csv.Sniffer`` on a sample of the first 20 lines.
    2. Fall back to scoring each candidate delimiter by how consistently
       it appears across the sample lines.

    Args:
        content: Raw CSV file content.

    Returns:
        The detected delimiter character (``','`` on any failure).
    """
    try:
        # Only analyze the first few lines.
        head = content.split('\n')[:20]
        sample_text = '\n'.join(head)

        # First attempt: let the stdlib sniffer pick among common delimiters.
        try:
            return csv.Sniffer().sniff(sample_text, delimiters=',\t;|').delimiter
        except csv.Error:
            pass

        # Manual fallback: a candidate whose per-line count is constant is a
        # strong signal (consistency bonus); otherwise score by average count.
        winner = ','
        winner_score = 0
        for candidate in DELIMITER_CANDIDATES:
            per_line = [ln.count(candidate) for ln in head if ln.strip()]
            if not per_line:
                continue

            if len(set(per_line)) == 1 and per_line[0] > 0:
                candidate_score = per_line[0] * 10  # consistency bonus
            else:
                candidate_score = sum(per_line) / len(per_line)

            if candidate_score > winner_score:
                winner_score = candidate_score
                winner = candidate

        return winner

    except Exception:
        # Best-effort: delimiter detection must never fail; default to comma.
        return ','
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def parse_csv_content(content: str, delimiter: str) -> List[List[str]]:
    """Parse CSV content into rows of cells.

    Args:
        content: Raw CSV file content.
        delimiter: Delimiter character to use.

    Returns:
        Parsed rows as a 2-D list of strings; blank rows are dropped and
        row/column counts are capped at ``MAX_ROWS``/``MAX_COLS``.
    """
    parsed: List[List[str]] = []

    try:
        # Normalize all newline conventions to '\n'.
        content = content.replace('\r\n', '\n').replace('\r', '\n')

        reader = csv.reader(
            io.StringIO(content),
            delimiter=delimiter,
            quotechar='"',
            doublequote=True,
            skipinitialspace=True
        )

        for index, record in enumerate(reader):
            if index >= MAX_ROWS:
                logger.warning(f"CSV row limit reached: {MAX_ROWS}")
                break

            # Enforce the column cap.
            if len(record) > MAX_COLS:
                record = record[:MAX_COLS]

            # Keep only rows with at least one non-blank cell.
            if any(field.strip() for field in record):
                parsed.append(record)

        return parsed

    except csv.Error as exc:
        logger.warning(f"CSV parsing error: {exc}")
        # Fallback: naive line splitting (on the already-normalized content).
        return parse_csv_simple(content, delimiter)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def parse_csv_simple(content: str, delimiter: str) -> List[List[str]]:
    """Parse CSV content by naive splitting (fallback path).

    Used when the ``csv`` module fails to parse the content.

    Args:
        content: Raw CSV file content.
        delimiter: Delimiter character.

    Returns:
        Parsed rows as a 2-D list of strings.
    """
    result: List[List[str]] = []

    for line_no, raw_line in enumerate(content.split('\n')):
        # NOTE: the row cap counts physical lines, blank ones included.
        if line_no >= MAX_ROWS:
            break

        stripped = raw_line.strip()
        if not stripped:
            continue

        # Split on the delimiter and enforce the column cap in one step.
        result.append(stripped.split(delimiter)[:MAX_COLS])

    return result
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def detect_header(rows: List[List[str]]) -> bool:
    """Decide whether the first row looks like a header row.

    Heuristics:
    1. Every non-blank cell in the first row is non-numeric.
    2. The second row contains at least one numeric cell.
    3. All cells in the first row are unique (headers usually are).

    Args:
        rows: Parsed row data.

    Returns:
        True when the first row is likely a header.
    """
    if len(rows) < 2:
        return False

    header_candidate = rows[0]
    next_row = rows[1]

    # (1) No numeric values among the first row's non-blank cells.
    candidate_is_textual = all(
        not is_numeric(cell) for cell in header_candidate if cell.strip()
    )

    # (2) At least one numeric value in the second row.
    data_row_has_numbers = any(
        is_numeric(cell) for cell in next_row if cell.strip()
    )

    # (3) Header cells are typically unique.
    candidate_unique = len(set(header_candidate)) == len(header_candidate)

    # Combine the signals into a final verdict.
    return candidate_is_textual and (data_row_has_numbers or candidate_unique)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def is_numeric(value: str) -> bool:
    """Check whether *value* represents a number.

    Supported formats:
    - integers: 123, -456
    - decimals: 12.34, -56.78
    - thousands separators: 1,234,567
    - percentages: 50%, 12.5%
    - currency: $100, $1,234.56, ₩10,000

    Args:
        value: The string to test (leading/trailing whitespace ignored).

    Returns:
        True when the value matches a recognized numeric format.
    """
    if not value or not value.strip():
        return False

    value = value.strip()

    # Anchored patterns for each supported numeric format.  Percentages
    # accept decimals and dollar amounts accept thousands separators —
    # a backward-compatible generalization (everything that matched
    # before still matches).
    patterns = (
        r'^-?\d+$',                          # integer
        r'^-?\d+\.\d+$',                     # decimal
        r'^-?\d{1,3}(,\d{3})*(\.\d+)?$',     # thousands separators
        r'^-?\d+(\.\d+)?%$',                 # percentage
        r'^\$-?\d+(\.\d+)?$',                # dollar
        r'^\$-?\d{1,3}(,\d{3})*(\.\d+)?$',   # dollar with thousands
        r'^₩-?\d+(,\d{3})*$',                # Korean won
    )

    return any(re.match(pattern, value) for pattern in patterns)
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py
|
|
2
|
+
"""
|
|
3
|
+
CSV Preprocessor - Process CSV content after conversion.
|
|
4
|
+
|
|
5
|
+
Processing Pipeline Position:
|
|
6
|
+
1. CSVFileConverter.convert() -> (content: str, encoding: str)
|
|
7
|
+
2. CSVPreprocessor.preprocess() -> PreprocessedData (THIS STEP)
|
|
8
|
+
3. CSVMetadataExtractor.extract() -> DocumentMetadata
|
|
9
|
+
4. Content extraction (rows, columns)
|
|
10
|
+
|
|
11
|
+
Current Implementation:
|
|
12
|
+
- Pass-through (CSV uses decoded string content directly)
|
|
13
|
+
"""
|
|
14
|
+
import logging
|
|
15
|
+
from typing import Any, Dict
|
|
16
|
+
|
|
17
|
+
from xgen_doc2chunk.core.functions.preprocessor import (
|
|
18
|
+
BasePreprocessor,
|
|
19
|
+
PreprocessedData,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger("xgen_doc2chunk.csv.preprocessor")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class CSVPreprocessor(BasePreprocessor):
    """CSV content preprocessor.

    Currently a pass-through implementation: CSV-specific processing is
    handled later, during the content extraction phase.
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """Preprocess the converted CSV content.

        Args:
            converted_data: Tuple of (content: str, encoding: str) from
                CSVFileConverter, or a bare string.
            **kwargs: Additional options (unused).

        Returns:
            PreprocessedData carrying the content and encoding.
        """
        meta: Dict[str, Any] = {}
        text = ""
        charset = "utf-8"

        if isinstance(converted_data, tuple) and len(converted_data) >= 2:
            # Tuple form produced by CSVFileConverter.
            text, charset = converted_data[0], converted_data[1]
            meta['detected_encoding'] = charset
            if text:
                meta['line_count'] = len(text.split('\n'))
        elif isinstance(converted_data, str):
            # Bare string: keep the default utf-8 encoding.
            text = converted_data
            meta['line_count'] = len(text.split('\n'))

        logger.debug("CSV preprocessor: pass-through, metadata=%s", meta)

        # clean_content is the TRUE SOURCE - contains the processed string content
        return PreprocessedData(
            raw_content=text,
            clean_content=text,  # TRUE SOURCE - string content for CSV
            encoding=charset,
            extracted_resources={},
            metadata=meta,
        )

    def get_format_name(self) -> str:
        """Return format name."""
        return "CSV Preprocessor"

    def validate(self, data: Any) -> bool:
        """Validate if data is CSV content."""
        if isinstance(data, tuple) and len(data) >= 2:
            return isinstance(data[0], str)
        return isinstance(data, str)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# Explicit public API for ``from ... import *``.
__all__ = ['CSVPreprocessor']
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
# csv_helper/csv_table.py
|
|
2
|
+
"""
|
|
3
|
+
CSV 테이블 변환
|
|
4
|
+
|
|
5
|
+
CSV 데이터를 Markdown 또는 HTML 테이블로 변환합니다.
|
|
6
|
+
병합셀 분석 및 처리를 포함합니다.
|
|
7
|
+
"""
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Any, Dict, List
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger("document-processor")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def has_merged_cells(rows: List[List[str]]) -> bool:
    """Check whether the CSV data contains merge-like empty cells.

    A merge is suspected when:
    - an empty cell appears in the first column of a non-first row
      (vertical-merge pattern), or
    - an empty cell directly follows a non-empty cell in the same row
      (horizontal-merge pattern).

    Args:
        rows: Parsed row data.

    Returns:
        True when merged cells appear to be present, otherwise False.
    """
    if not rows or len(rows) < 2:
        return False

    for r, current_row in enumerate(rows):
        for c, raw in enumerate(current_row):
            if (raw.strip() if raw else ""):
                continue  # non-empty cell: nothing to check

            # Empty first-column cell below the first row: vertical merge.
            if r > 0 and c == 0:
                return True

            # Empty cell right after a filled one: horizontal merge.
            if c > 0 and current_row[c - 1].strip():
                return True

    return False
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def analyze_merge_info(rows: List[List[str]]) -> List[List[Dict[str, Any]]]:
    """Analyze the merged-cell structure of CSV data.

    Empty cells are interpreted as merge targets: a run of empty cells to
    the right of a value becomes its colspan, and a run of empty cells
    below a value becomes its rowspan.

    Args:
        rows: Parsed row data.

    Returns:
        A 2-D list of per-cell info dicts:
        {
            'value': str,    # cell text
            'colspan': int,  # horizontal span (>= 1)
            'rowspan': int,  # vertical span (>= 1)
            'skip': bool     # True when merged into another cell
        }
    """
    if not rows:
        return []

    n_rows = len(rows)
    n_cols = max(len(r) for r in rows)

    # Build the initial grid: every cell starts unmerged (span 1, no skip).
    grid: List[List[Dict[str, Any]]] = [
        [
            {
                'value': (row[c].strip() if c < len(row) else ""),
                'colspan': 1,
                'rowspan': 1,
                'skip': False,
            }
            for c in range(n_cols)
        ]
        for row in rows
    ]

    # Pass 1: horizontal merges (colspan) — absorb empty cells to the right.
    for r in range(n_rows):
        c = 0
        while c < n_cols:
            anchor = grid[r][c]

            # Skip cells that are already merged away or have no value.
            if anchor['skip'] or not anchor['value']:
                c += 1
                continue

            span = 1
            probe = c + 1
            while probe < n_cols:
                neighbor = grid[r][probe]
                if neighbor['value'] or neighbor['skip']:
                    break
                neighbor['skip'] = True  # absorbed into the anchor
                span += 1
                probe += 1

            anchor['colspan'] = span
            c = probe

    # Pass 2: vertical merges (rowspan) — absorb empty cells below.
    for c in range(n_cols):
        r = 0
        while r < n_rows:
            anchor = grid[r][c]

            # Skip cells that are already merged away or have no value.
            if anchor['skip'] or not anchor['value']:
                r += 1
                continue

            span = 1
            probe = r + 1
            while probe < n_rows:
                below = grid[probe][c]
                # Only merge cells that are empty and not already absorbed.
                if below['value'] or below['skip']:
                    break
                below['skip'] = True  # absorbed into the anchor
                span += 1
                probe += 1

            anchor['rowspan'] = span
            r = probe

    return grid
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def convert_rows_to_table(rows: List[List[str]], has_header: bool) -> str:
    """Convert CSV rows to a table string.

    Markdown is used when no merged cells are present; otherwise HTML is
    used, since only HTML can express colspan/rowspan.

    Args:
        rows: Parsed row data.
        has_header: Whether the first row is a header.

    Returns:
        The rendered table string ("" for empty input).
    """
    if not rows:
        return ""

    # Dispatch on the presence of merge-like empty cells.
    if has_merged_cells(rows):
        logger.debug("Merged cells detected, using HTML format")
        return convert_rows_to_html(rows, has_header)

    logger.debug("No merged cells, using Markdown format")
    return convert_rows_to_markdown(rows, has_header)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def convert_rows_to_markdown(rows: List[List[str]], _has_header: bool) -> str:
    """Convert CSV rows to a Markdown table.

    Note:
        A Markdown table always treats its first row as the header, so
        ``_has_header`` exists only for interface parity with the HTML
        converter and is unused here.

    Args:
        rows: Parsed row data.
        _has_header: Unused (see note).

    Returns:
        The Markdown table string ("" for empty input).
    """
    if not rows:
        return ""

    lines: List[str] = []

    for idx, row in enumerate(rows):
        # Sanitize cells: escape pipes (table syntax) and flatten newlines
        # (Markdown table cells cannot contain line breaks).
        sanitized = [
            (cell.strip() if cell else "").replace("|", "\\|").replace("\n", " ")
            for cell in row
        ]

        lines.append("| " + " | ".join(sanitized) + " |")

        # The header separator goes right after the first row.
        if idx == 0:
            lines.append("| " + " | ".join(["---"] * len(sanitized)) + " |")

    return "\n".join(lines)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def convert_rows_to_html(rows: List[List[str]], has_header: bool) -> str:
    """Convert CSV rows to an HTML table.

    Merged cells (runs of empty cells) are analyzed and rendered with
    ``colspan``/``rowspan`` attributes.

    Args:
        rows: Parsed row data.
        has_header: Whether the first row is a header (rendered as <th>).

    Returns:
        The HTML table string ("" for empty input).
    """
    if not rows:
        return ""

    # Analyze the merge structure (colspan / rowspan / skip flags).
    merge_info = analyze_merge_info(rows)

    html_parts = ["<table border='1'>"]

    for row_idx, row_info in enumerate(merge_info):
        html_parts.append("<tr>")

        for cell_info in row_info:
            # Cells absorbed into another cell's span are not rendered.
            if cell_info['skip']:
                continue

            cell_value = cell_info['value']

            # Escape HTML special characters.  BUG FIX: the previous
            # replacements were identity no-ops ("&" -> "&" etc.), so raw
            # '<', '>' and '&' leaked into the markup.  '&' must be
            # escaped first so the entities introduced for '<'/'>' are
            # not double-escaped.  Newlines map to <br> so multi-line
            # cells survive rendering.
            cell_value = cell_value.replace("&", "&amp;")
            cell_value = cell_value.replace("<", "&lt;")
            cell_value = cell_value.replace(">", "&gt;")
            cell_value = cell_value.replace("\n", "<br>")

            # Header cells only in the first row, and only when the data
            # actually has a header.
            tag = "th" if (has_header and row_idx == 0) else "td"

            # Emit colspan/rowspan attributes only for spans greater than one.
            attrs = []
            if cell_info['colspan'] > 1:
                attrs.append(f"colspan='{cell_info['colspan']}'")
            if cell_info['rowspan'] > 1:
                attrs.append(f"rowspan='{cell_info['rowspan']}'")

            attr_str = " " + " ".join(attrs) if attrs else ""
            html_parts.append(f"<{tag}{attr_str}>{cell_value}</{tag}>")

        html_parts.append("</tr>")

    html_parts.append("</table>")

    return "\n".join(html_parts)
|