xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,225 @@
1
+ # csv_helper/csv_parser.py
2
+ """
3
+ CSV 파싱 및 분석
4
+
5
+ CSV 파일의 구분자 감지, 파싱, 헤더 감지 기능을 제공합니다.
6
+ """
7
+ import csv
8
+ import io
9
+ import logging
10
+ import re
11
+ from typing import List
12
+
13
+ from xgen_doc2chunk.core.processor.csv_helper.csv_constants import DELIMITER_CANDIDATES, MAX_ROWS, MAX_COLS
14
+
15
+ logger = logging.getLogger("document-processor")
16
+
17
+
18
def detect_delimiter(content: str) -> str:
    """Auto-detect the delimiter character used in CSV *content*.

    Strategy:
        1. Try ``csv.Sniffer`` on a sample of the first 20 lines.
        2. Fall back to scoring each candidate delimiter by how
           consistently it occurs across the sampled non-blank lines.

    Args:
        content: Raw CSV file content.

    Returns:
        The detected delimiter character; ``','`` on any failure.
    """
    try:
        sample_lines = content.split('\n')[:20]
        sample = '\n'.join(sample_lines)

        # First attempt: let the stdlib sniffer decide.
        try:
            return csv.Sniffer().sniff(sample, delimiters=',\t;|').delimiter
        except csv.Error:
            pass

        # Manual fallback: reward candidates that appear a constant,
        # positive number of times on every sampled line.
        winner = ','
        winner_score = 0
        for candidate in DELIMITER_CANDIDATES:
            occurrences = [ln.count(candidate) for ln in sample_lines if ln.strip()]
            if not occurrences:
                continue
            if len(set(occurrences)) == 1 and occurrences[0] > 0:
                # Identical positive count on every line: strong signal.
                candidate_score = occurrences[0] * 10
            else:
                candidate_score = sum(occurrences) / len(occurrences)
            if candidate_score > winner_score:
                winner_score = candidate_score
                winner = candidate
        return winner

    except Exception:
        # Detection must never fail hard; comma is the safest default.
        return ','
68
+
69
+
70
def parse_csv_content(content: str, delimiter: str) -> List[List[str]]:
    """Parse CSV *content* into a 2-D list of cell strings.

    Line endings are normalized to ``\\n`` before parsing.  Rows past
    ``MAX_ROWS`` are dropped (with a warning), columns past ``MAX_COLS``
    are truncated, and rows whose cells are all blank are skipped.

    Args:
        content: CSV file content.
        delimiter: Delimiter character to use.

    Returns:
        Parsed rows as a list of lists of strings.  On a ``csv.Error``
        the naive splitter :func:`parse_csv_simple` is used as fallback.
    """
    parsed: List[List[str]] = []

    try:
        normalized = content.replace('\r\n', '\n').replace('\r', '\n')

        reader = csv.reader(
            io.StringIO(normalized),
            delimiter=delimiter,
            quotechar='"',
            doublequote=True,
            skipinitialspace=True,
        )

        for index, record in enumerate(reader):
            if index >= MAX_ROWS:
                logger.warning(f"CSV row limit reached: {MAX_ROWS}")
                break

            # Truncate overly wide rows; slicing is a no-op otherwise.
            record = record[:MAX_COLS]

            # Keep only rows that contain at least one non-blank cell.
            if any(cell.strip() for cell in record):
                parsed.append(record)

        return parsed

    except csv.Error as e:
        logger.warning(f"CSV parsing error: {e}")
        # Fallback: naive splitting of the normalized content.
        return parse_csv_simple(normalized, delimiter)
114
+
115
+
116
def parse_csv_simple(content: str, delimiter: str) -> List[List[str]]:
    """Parse CSV *content* by naive splitting (fallback path).

    Used when ``csv``-module parsing fails.  No quoting rules are
    honored: each non-blank line is split on *delimiter* and truncated
    to ``MAX_COLS`` columns; at most ``MAX_ROWS`` lines are consumed.

    Args:
        content: CSV file content.
        delimiter: Delimiter character.

    Returns:
        Parsed rows as a list of lists of strings.
    """
    result: List[List[str]] = []

    for line_no, raw_line in enumerate(content.split('\n')):
        if line_no >= MAX_ROWS:
            break

        stripped = raw_line.strip()
        if not stripped:
            continue

        # Truncation via slice is a no-op for rows within the limit.
        result.append(stripped.split(delimiter)[:MAX_COLS])

    return result
146
+
147
+
148
def detect_header(rows: List[List[str]]) -> bool:
    """Heuristically decide whether the first row is a header row.

    Signals considered:
        1. No non-blank cell of the first row parses as a number.
        2. The second row contains at least one numeric cell.
        3. All cells of the first row are unique (headers usually are).

    Args:
        rows: Parsed row data.

    Returns:
        True when the first row looks like a header, else False.
        Always False with fewer than two rows.
    """
    if len(rows) < 2:
        return False

    first, second = rows[0], rows[1]

    # Signal 1: every non-blank cell in the first row is non-numeric.
    first_row_textual = all(
        not is_numeric(cell) for cell in first if cell.strip()
    )

    # Signal 2: the second row carries at least one numeric value.
    second_row_numeric = any(
        is_numeric(cell) for cell in second if cell.strip()
    )

    # Signal 3: header cells are typically all distinct.
    first_row_unique = len(set(first)) == len(first)

    return first_row_textual and (second_row_numeric or first_row_unique)
187
+
188
+
189
# Precompiled once at import time: is_numeric() is called per cell when
# scanning whole tables, so recompiling six patterns per call is wasted work.
_NUMERIC_PATTERNS = tuple(re.compile(p) for p in (
    r'^-?\d+$',                        # integer: 123, -456
    r'^-?\d+\.\d+$',                   # decimal: 12.34, -56.78
    r'^-?\d{1,3}(,\d{3})*(\.\d+)?$',   # thousands-separated: 1,234,567
    r'^-?\d+%$',                       # percentage: 50%
    r'^\$-?\d+(\.\d+)?$',              # dollar: $100
    r'^₩-?\d+(,\d{3})*$',              # won: ₩10,000
))


def is_numeric(value: str) -> bool:
    """Return True if *value* looks like a number.

    Supported formats:
        - integers: 123, -456
        - decimals: 12.34, -56.78
        - thousands-separated: 1,234,567
        - percentages: 50%
        - currency: $100, ₩10,000

    Args:
        value: The cell value to test.

    Returns:
        True when the stripped value matches any numeric pattern.
        Empty or whitespace-only input yields False.
    """
    if not value or not value.strip():
        return False

    candidate = value.strip()
    return any(pattern.match(candidate) for pattern in _NUMERIC_PATTERNS)
@@ -0,0 +1,86 @@
1
+ # xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py
2
+ """
3
+ CSV Preprocessor - Process CSV content after conversion.
4
+
5
+ Processing Pipeline Position:
6
+ 1. CSVFileConverter.convert() → (content: str, encoding: str)
7
+ 2. CSVPreprocessor.preprocess() → PreprocessedData (THIS STEP)
8
+ 3. CSVMetadataExtractor.extract() → DocumentMetadata
9
+ 4. Content extraction (rows, columns)
10
+
11
+ Current Implementation:
12
+ - Pass-through (CSV uses decoded string content directly)
13
+ """
14
+ import logging
15
+ from typing import Any, Dict
16
+
17
+ from xgen_doc2chunk.core.functions.preprocessor import (
18
+ BasePreprocessor,
19
+ PreprocessedData,
20
+ )
21
+
22
+ logger = logging.getLogger("xgen_doc2chunk.csv.preprocessor")
23
+
24
+
25
+ class CSVPreprocessor(BasePreprocessor):
26
+ """
27
+ CSV Content Preprocessor.
28
+
29
+ Currently a pass-through implementation as CSV processing
30
+ is handled during the content extraction phase.
31
+ """
32
+
33
+ def preprocess(
34
+ self,
35
+ converted_data: Any,
36
+ **kwargs
37
+ ) -> PreprocessedData:
38
+ """
39
+ Preprocess the converted CSV content.
40
+
41
+ Args:
42
+ converted_data: Tuple of (content: str, encoding: str) from CSVFileConverter
43
+ **kwargs: Additional options
44
+
45
+ Returns:
46
+ PreprocessedData with the content and encoding
47
+ """
48
+ metadata: Dict[str, Any] = {}
49
+
50
+ content = ""
51
+ encoding = "utf-8"
52
+
53
+ # Handle tuple return from CSVFileConverter
54
+ if isinstance(converted_data, tuple) and len(converted_data) >= 2:
55
+ content, encoding = converted_data[0], converted_data[1]
56
+ metadata['detected_encoding'] = encoding
57
+ if content:
58
+ lines = content.split('\n')
59
+ metadata['line_count'] = len(lines)
60
+ elif isinstance(converted_data, str):
61
+ content = converted_data
62
+ metadata['line_count'] = len(content.split('\n'))
63
+
64
+ logger.debug("CSV preprocessor: pass-through, metadata=%s", metadata)
65
+
66
+ # clean_content is the TRUE SOURCE - contains the processed string content
67
+ return PreprocessedData(
68
+ raw_content=content,
69
+ clean_content=content, # TRUE SOURCE - string content for CSV
70
+ encoding=encoding,
71
+ extracted_resources={},
72
+ metadata=metadata,
73
+ )
74
+
75
+ def get_format_name(self) -> str:
76
+ """Return format name."""
77
+ return "CSV Preprocessor"
78
+
79
+ def validate(self, data: Any) -> bool:
80
+ """Validate if data is CSV content."""
81
+ if isinstance(data, tuple) and len(data) >= 2:
82
+ return isinstance(data[0], str)
83
+ return isinstance(data, str)
84
+
85
+
86
+ __all__ = ['CSVPreprocessor']
@@ -0,0 +1,266 @@
1
+ # csv_helper/csv_table.py
2
+ """
3
+ CSV 테이블 변환
4
+
5
+ CSV 데이터를 Markdown 또는 HTML 테이블로 변환합니다.
6
+ 병합셀 분석 및 처리를 포함합니다.
7
+ """
8
+ import logging
9
+ from typing import Any, Dict, List
10
+
11
+ logger = logging.getLogger("document-processor")
12
+
13
+
14
def has_merged_cells(rows: List[List[str]]) -> bool:
    """Heuristically detect merged cells (empty cells) in CSV data.

    A merge is suspected when:
        - an empty cell appears in the first column of a non-first row
          (vertical-merge pattern), or
        - an empty cell directly follows a non-empty one on the same row
          (horizontal-merge pattern).

    Args:
        rows: Parsed row data.

    Returns:
        True if merged cells appear to be present, else False.
        Always False with fewer than two rows.
    """
    if not rows or len(rows) < 2:
        return False

    for r, row in enumerate(rows):
        for c, raw in enumerate(row):
            if (raw.strip() if raw else ""):
                continue  # non-empty cell: nothing to check

            # Empty first-column cell below the first row: vertical merge.
            if r > 0 and c == 0:
                return True

            # Empty cell preceded by a non-empty one: horizontal merge.
            if c > 0 and row[c - 1].strip():
                return True

    return False
48
+
49
+
50
def analyze_merge_info(rows: List[List[str]]) -> List[List[Dict[str, Any]]]:
    """Compute per-cell merge information from empty-cell patterns.

    Empty cells to the right of a value extend its colspan; empty cells
    below a value (in the same column) extend its rowspan.  Horizontal
    merging is resolved first, so a cell already claimed by a colspan is
    never claimed again vertically.

    Args:
        rows: Parsed row data (rows may be ragged).

    Returns:
        A 2-D list of cell dicts, each with keys:
            'value'   (str)  - stripped cell text
            'colspan' (int)  - horizontal span, >= 1
            'rowspan' (int)  - vertical span, >= 1
            'skip'    (bool) - True when absorbed by another cell
    """
    if not rows:
        return []

    n_rows = len(rows)
    n_cols = max(len(r) for r in rows) if rows else 0

    # Build the padded grid of cell records (ragged rows filled with "").
    grid: List[List[Dict[str, Any]]] = [
        [
            {
                'value': row[c].strip() if c < len(row) else "",
                'colspan': 1,
                'rowspan': 1,
                'skip': False,
            }
            for c in range(n_cols)
        ]
        for row in rows
    ]

    # Pass 1: horizontal merges - consume empty cells to the right.
    for r in range(n_rows):
        c = 0
        while c < n_cols:
            cell = grid[r][c]
            if cell['skip'] or not cell['value']:
                c += 1
                continue

            span = 1
            probe = c + 1
            while (probe < n_cols
                   and not grid[r][probe]['value']
                   and not grid[r][probe]['skip']):
                grid[r][probe]['skip'] = True  # absorbed; never rendered
                span += 1
                probe += 1

            cell['colspan'] = span
            c = probe

    # Pass 2: vertical merges - consume empty cells below.
    for c in range(n_cols):
        r = 0
        while r < n_rows:
            cell = grid[r][c]
            if cell['skip'] or not cell['value']:
                r += 1
                continue

            span = 1
            probe = r + 1
            while (probe < n_rows
                   and not grid[probe][c]['value']
                   and not grid[probe][c]['skip']):
                grid[probe][c]['skip'] = True  # absorbed; never rendered
                span += 1
                probe += 1

            cell['rowspan'] = span
            r = probe

    return grid
142
+
143
+
144
def convert_rows_to_table(rows: List[List[str]], has_header: bool) -> str:
    """Convert CSV rows to a table string.

    Markdown is used when no merged cells are detected; HTML (which can
    express colspan/rowspan) is used otherwise.

    Args:
        rows: Parsed row data.
        has_header: Whether the first row is a header.

    Returns:
        The rendered table string ("" for empty input).
    """
    if not rows:
        return ""

    if has_merged_cells(rows):
        logger.debug("Merged cells detected, using HTML format")
        return convert_rows_to_html(rows, has_header)

    logger.debug("No merged cells, using Markdown format")
    return convert_rows_to_markdown(rows, has_header)
168
+
169
+
170
def convert_rows_to_markdown(rows: List[List[str]], _has_header: bool) -> str:
    """Convert CSV rows to a Markdown table.

    The first row always becomes the header row, because Markdown tables
    require one; ``_has_header`` exists only for interface symmetry with
    the HTML converter.  Pipe characters are escaped and embedded
    newlines collapsed to spaces (Markdown cells cannot contain them).

    Args:
        rows: Parsed row data.
        _has_header: Header flag (unused in Markdown rendering).

    Returns:
        The Markdown table string ("" for empty input).
    """
    if not rows:
        return ""

    lines = []
    for idx, row in enumerate(rows):
        sanitized = [
            (cell.strip() if cell else "").replace("|", "\\|").replace("\n", " ")
            for cell in row
        ]
        lines.append("| " + " | ".join(sanitized) + " |")

        # Markdown requires a separator right after the header row.
        if idx == 0:
            lines.append("| " + " | ".join(["---"] * len(sanitized)) + " |")

    return "\n".join(lines)
211
+
212
+
213
def convert_rows_to_html(rows: List[List[str]], has_header: bool) -> str:
    """Convert CSV rows to an HTML table with merged-cell support.

    Empty-cell runs are analyzed via :func:`analyze_merge_info` and
    expressed as ``colspan``/``rowspan`` attributes; absorbed cells are
    not rendered.  Cell text is HTML-escaped and newlines become <br>.

    Args:
        rows: Parsed row data.
        has_header: When True, the first row is rendered with <th> tags.

    Returns:
        The HTML table string ("" for empty input).
    """
    if not rows:
        return ""

    grid = analyze_merge_info(rows)

    out = ["<table border='1'>"]
    for r, row_cells in enumerate(grid):
        out.append("<tr>")

        # Header cells only on the first row, and only when flagged.
        tag = "th" if (has_header and r == 0) else "td"

        for cell in row_cells:
            if cell['skip']:
                continue  # absorbed into a neighboring merged cell

            # Escape order matters: '&' first so it is not double-escaped.
            text = (cell['value']
                    .replace("&", "&amp;")
                    .replace("<", "&lt;")
                    .replace(">", "&gt;")
                    .replace("\n", "<br>"))

            span_attrs = []
            if cell['colspan'] > 1:
                span_attrs.append(f"colspan='{cell['colspan']}'")
            if cell['rowspan'] > 1:
                span_attrs.append(f"rowspan='{cell['rowspan']}'")
            suffix = " " + " ".join(span_attrs) if span_attrs else ""

            out.append(f"<{tag}{suffix}>{text}</{tag}>")

        out.append("</tr>")

    out.append("</table>")
    return "\n".join(out)