xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,386 @@
1
+ """
2
+ PPT 테이블 처리 모듈
3
+
4
+ 포함 함수:
5
+ - is_simple_table(): 단순 표인지 확인
6
+ - extract_simple_table_as_text(): 단순 표를 텍스트로 추출
7
+ - convert_table_to_html(): 테이블을 HTML로 변환 (병합 지원)
8
+ - extract_table_as_text(): 테이블을 평문으로 추출
9
+
10
+ 병합 셀(rowspan, colspan) 정확히 처리
11
+ """
12
+ import logging
13
+ from typing import Dict
14
+
15
+ logger = logging.getLogger("document-processor")
16
+
17
+
18
def is_simple_table(table) -> bool:
    """
    Report whether a table is "simple": exactly one row or exactly one column.

    Such 1xN / Nx1 tables are frequently used like text boxes, so callers
    render them as plain text rather than as an HTML table.

    Args:
        table: python-pptx Table object (any object exposing sized ``rows``
            and ``columns`` attributes is accepted).

    Returns:
        True when the table has exactly one row or exactly one column
        (treat as text); False otherwise, including when the dimensions
        cannot be read.
    """
    try:
        # Read both dimensions before deciding, so a failure on either one
        # results in False.
        dimensions = (len(table.rows), len(table.columns))
    except Exception:
        return False

    return 1 in dimensions
44
+
45
+
46
def extract_simple_table_as_text(table) -> str:
    """
    Flatten a simple table into plain text.

    Each cell's text is stripped; cells that end up empty are dropped, and
    rows left without any text are dropped as well. Cells of one row are
    joined with a single space, rows are joined with a newline.

    Args:
        table: python-pptx Table object.

    Returns:
        Newline-separated text, or "" when the table holds no text or any
        error occurs while reading it.
    """
    try:
        lines = []
        for row in table.rows:
            pieces = [
                cell.text.strip()
                for cell in row.cells
                if cell.text and cell.text.strip()
            ]
            if pieces:
                # Cells of a single row are separated by a space.
                lines.append(" ".join(pieces))

        # Rows are separated by a newline; no rows yields "".
        return "\n".join(lines)
    except Exception:
        return ""
72
+
73
+
74
def convert_table_to_html(table) -> str:
    """
    Render a table as an HTML ``<table>`` string, honouring merged cells.

    The conversion runs in two passes over a rows x columns grid:
      1. every cell is asked for its merge span; the top-left cell of a span
         stores the span and its text, the remaining cells of the span are
         marked so they are not emitted;
      2. the grid is walked again to emit ``<tr>`` rows, using ``<th>`` for
         the first row and ``<td>`` for all others.

    Args:
        table: python-pptx Table object.

    Returns:
        The HTML table, "" when the table has no rows or no columns, or the
        plain-text rendering from extract_table_as_text() if anything fails.
    """
    try:
        row_count = len(table.rows)
        col_count = len(table.columns)
        if not row_count or not col_count:
            return ""

        def stripped_text(cell):
            # Missing/empty cell text is normalised to "".
            return cell.text.strip() if cell.text else ""

        # Grid entries: None = not decided yet, 'skip' = covered by a merge,
        # dict = cell to emit (span sizes plus text).
        grid = [[None] * col_count for _ in range(row_count)]

        # Pass 1: collect merge information.
        for top in range(row_count):
            for left in range(col_count):
                if grid[top][left] == 'skip':
                    continue

                origin = table.cell(top, left)
                span = _get_cell_merge_info(origin, table, top, left, row_count, col_count)
                rowspan = span['rowspan']
                colspan = span['colspan']

                # Clamp the covered area to the table bounds. A zero span
                # produces an empty area and leaves the entry as None.
                bottom = min(top + rowspan, row_count)
                right = min(left + colspan, col_count)
                for r in range(top, bottom):
                    for c in range(left, right):
                        if (r, c) == (top, left):
                            grid[r][c] = {
                                'rowspan': rowspan,
                                'colspan': colspan,
                                'text': stripped_text(origin),
                            }
                        else:
                            grid[r][c] = 'skip'

        # Pass 2: emit the HTML.
        lines = ["<table border='1'>"]
        for r in range(row_count):
            lines.append("<tr>")
            # Header cells for the first row, data cells for the rest.
            tag = "th" if r == 0 else "td"

            for c in range(col_count):
                entry = grid[r][c]
                if entry == 'skip':
                    continue

                if entry is None:
                    # Nothing was recorded for this cell: emit it unmerged.
                    entry = {
                        'rowspan': 1,
                        'colspan': 1,
                        'text': stripped_text(table.cell(r, c)),
                    }

                attrs = []
                if entry['rowspan'] > 1:
                    attrs.append(f"rowspan='{entry['rowspan']}'")
                if entry['colspan'] > 1:
                    attrs.append(f"colspan='{entry['colspan']}'")
                attr_str = " " + " ".join(attrs) if attrs else ""

                text = _escape_html(entry['text'])
                lines.append(f"<{tag}{attr_str}>{text}</{tag}>")

            lines.append("</tr>")

        lines.append("</table>")
        return "\n".join(lines)

    except Exception as e:
        logger.warning(f"Error converting table to HTML: {e}")
        return extract_table_as_text(table)
171
+
172
+
173
def extract_table_as_text(table) -> str:
    """
    Render a table as plain text, one line per row.

    Every cell contributes its stripped text (empty cells contribute ""), and
    the cells of a row are joined with " | ". Rows in which every cell is
    empty are omitted.

    Args:
        table: python-pptx Table object.

    Returns:
        Pipe-delimited text with rows separated by newlines, or "" when no
        row has text or any error occurs while reading the table.
    """
    try:
        lines = []
        for row in table.rows:
            cells = [cell.text.strip() if cell.text else "" for cell in row.cells]
            # Keep the row only if at least one cell has text.
            if any(cells):
                lines.append(" | ".join(cells))

        return "\n".join(lines)
    except Exception:
        return ""
197
+
198
+
199
+ def _get_cell_merge_info(cell, table, row_idx: int, col_idx: int,
200
+ num_rows: int, num_cols: int) -> Dict[str, int]:
201
+ """
202
+ 셀의 병합 정보를 추출합니다.
203
+
204
+ python-pptx에서 병합 셀을 감지하는 방법:
205
+ 1. cell.is_merge_origin: 병합의 시작점인지
206
+ 2. cell.is_spanned: 다른 셀에 의해 병합된 셀인지
207
+ 3. cell.span_height: 세로 병합 크기
208
+ 4. cell.span_width: 가로 병합 크기
209
+
210
+ Args:
211
+ cell: 테이블 셀 객체
212
+ table: 테이블 객체
213
+ row_idx: 현재 행 인덱스
214
+ col_idx: 현재 열 인덱스
215
+ num_rows: 총 행 수
216
+ num_cols: 총 열 수
217
+
218
+ Returns:
219
+ {'rowspan': int, 'colspan': int}
220
+ """
221
+ rowspan = 1
222
+ colspan = 1
223
+
224
+ try:
225
+ # 방법 1: python-pptx의 내장 속성 사용 (권장)
226
+ if hasattr(cell, 'is_merge_origin') and cell.is_merge_origin:
227
+ # 병합의 시작 셀
228
+ if hasattr(cell, 'span_height'):
229
+ rowspan = cell.span_height
230
+ if hasattr(cell, 'span_width'):
231
+ colspan = cell.span_width
232
+ return {'rowspan': rowspan, 'colspan': colspan}
233
+
234
+ # 이미 병합된 셀 (다른 셀에 의해 덮어진 경우) - 스킵 대상
235
+ if hasattr(cell, 'is_spanned') and cell.is_spanned:
236
+ return {'rowspan': 0, 'colspan': 0} # 스킵 표시
237
+
238
+ # 방법 2: XML 직접 파싱 (폴백)
239
+ tc = cell._tc
240
+
241
+ # gridSpan 속성 (가로 병합)
242
+ grid_span = tc.get('gridSpan')
243
+ if grid_span:
244
+ colspan = int(grid_span)
245
+
246
+ # rowSpan 속성 (세로 병합)
247
+ row_span_attr = tc.get('rowSpan')
248
+ if row_span_attr:
249
+ rowspan = int(row_span_attr)
250
+
251
+ # 방법 3: 동일 셀 참조 비교 (추가 폴백)
252
+ if colspan == 1:
253
+ colspan = _detect_colspan_by_reference(table, row_idx, col_idx, num_cols)
254
+
255
+ if rowspan == 1:
256
+ rowspan = _detect_rowspan_by_reference(table, row_idx, col_idx, num_rows)
257
+
258
+ except Exception as e:
259
+ logger.debug(f"Error getting merge info: {e}")
260
+
261
+ return {'rowspan': rowspan, 'colspan': colspan}
262
+
263
+
264
+ def _detect_colspan_by_reference(table, row_idx: int, col_idx: int, num_cols: int) -> int:
265
+ """
266
+ 셀 참조 비교로 colspan을 감지합니다.
267
+
268
+ Args:
269
+ table: 테이블 객체
270
+ row_idx: 현재 행 인덱스
271
+ col_idx: 현재 열 인덱스
272
+ num_cols: 총 열 수
273
+
274
+ Returns:
275
+ colspan 값
276
+ """
277
+ colspan = 1
278
+ try:
279
+ current_cell = table.cell(row_idx, col_idx)
280
+
281
+ for c in range(col_idx + 1, num_cols):
282
+ next_cell = table.cell(row_idx, c)
283
+
284
+ # _tc 참조가 같으면 병합된 셀
285
+ if next_cell._tc is current_cell._tc:
286
+ colspan += 1
287
+ else:
288
+ break
289
+ except Exception:
290
+ pass
291
+
292
+ return colspan
293
+
294
+
295
+ def _detect_rowspan_by_reference(table, row_idx: int, col_idx: int, num_rows: int) -> int:
296
+ """
297
+ 셀 참조 비교로 rowspan을 감지합니다.
298
+
299
+ Args:
300
+ table: 테이블 객체
301
+ row_idx: 현재 행 인덱스
302
+ col_idx: 현재 열 인덱스
303
+ num_rows: 총 행 수
304
+
305
+ Returns:
306
+ rowspan 값
307
+ """
308
+ rowspan = 1
309
+ try:
310
+ current_cell = table.cell(row_idx, col_idx)
311
+
312
+ for r in range(row_idx + 1, num_rows):
313
+ next_cell = table.cell(r, col_idx)
314
+
315
+ if next_cell._tc is current_cell._tc:
316
+ rowspan += 1
317
+ else:
318
+ break
319
+ except Exception:
320
+ pass
321
+
322
+ return rowspan
323
+
324
+
325
+ def _escape_html(text: str) -> str:
326
+ """
327
+ HTML 특수 문자를 이스케이프합니다.
328
+
329
+ Args:
330
+ text: 원본 텍스트
331
+
332
+ Returns:
333
+ 이스케이프된 텍스트
334
+ """
335
+ if not text:
336
+ return ""
337
+
338
+ text = text.replace("&", "&amp;")
339
+ text = text.replace("<", "&lt;")
340
+ text = text.replace(">", "&gt;")
341
+ text = text.replace("\n", "<br>")
342
+
343
+ return text
344
+
345
+
346
def debug_table_structure(table):
    """
    Write the merge-related structure of a table to the debug log.

    For every (row, column) position one debug line is logged containing a
    20-character text preview, the raw ``gridSpan`` / ``rowSpan`` attributes
    of ``cell._tc`` (shown as '1' when absent) and the cell's
    ``is_merge_origin``, ``is_spanned``, ``span_width`` and ``span_height``
    attributes (shown as None when absent). A failure on one cell is logged
    and does not stop the dump.

    Args:
        table: python-pptx Table object.
    """
    logger.debug("=== Table Structure Debug ===")
    logger.debug(f"Rows: {len(table.rows)}, Cols: {len(table.columns)}")

    row_total = len(table.rows)
    for row_idx in range(row_total):
        col_total = len(table.columns)
        for col_idx in range(col_total):
            try:
                cell = table.cell(row_idx, col_idx)
                xml_cell = cell._tc

                # Raw XML attributes.
                grid_span = xml_cell.get('gridSpan', '1')
                row_span = xml_cell.get('rowSpan', '1')

                # Attributes offered by the cell object, if present.
                is_merge_origin = getattr(cell, 'is_merge_origin', None)
                is_spanned = getattr(cell, 'is_spanned', None)
                span_width = getattr(cell, 'span_width', None)
                span_height = getattr(cell, 'span_height', None)

                text_preview = cell.text[:20] if cell.text else ""

                logger.debug(
                    f"[{row_idx},{col_idx}] "
                    f"text='{text_preview}' "
                    f"gridSpan={grid_span} rowSpan={row_span} "
                    f"is_merge_origin={is_merge_origin} "
                    f"is_spanned={is_spanned} "
                    f"span_width={span_width} span_height={span_height}"
                )
            except Exception as e:
                logger.debug(f"[{row_idx},{col_idx}] Error: {e}")

    logger.debug("=== End Debug ===")
@@ -0,0 +1,290 @@
1
+ # xgen_doc2chunk/core/processor/rtf_handler.py
2
+ """
3
+ RTF Handler
4
+
5
+ Class-based handler for RTF files.
6
+ Follows the correct architecture:
7
+ 1. Converter: Pass through (RTF uses raw binary)
8
+ 2. Preprocessor: Binary preprocessing (image extraction, \\bin removal)
9
+ 3. Handler: Sequential processing (metadata -> tables -> content -> result)
10
+ """
11
+ import logging
12
+ import re
13
+ from pathlib import Path
14
+ from typing import Any, Dict, Optional, TYPE_CHECKING
15
+
16
+ from striprtf.striprtf import rtf_to_text
17
+
18
+ from xgen_doc2chunk.core.processor.base_handler import BaseHandler
19
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
20
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, NullChartExtractor
21
+
22
+ # Import from rtf_helper
23
+ from xgen_doc2chunk.core.processor.rtf_helper import (
24
+ RTFFileConverter,
25
+ RTFConvertedData,
26
+ RTFMetadataExtractor,
27
+ RTFSourceInfo,
28
+ RTFPreprocessor,
29
+ extract_tables_with_positions,
30
+ extract_inline_content,
31
+ extract_text_only,
32
+ decode_content,
33
+ detect_encoding,
34
+ )
35
+
36
+ if TYPE_CHECKING:
37
+ from xgen_doc2chunk.core.document_processor import CurrentFile
38
+
39
+ logger = logging.getLogger("xgen_doc2chunk.rtf.handler")
40
+
41
+
42
class RTFHandler(BaseHandler):
    """
    RTF Document Processing Handler.

    Processing flow:
    1. file_converter.convert() -> bytes (pass through)
    2. preprocessor.preprocess() -> PreprocessedData (image extraction, binary cleanup)
    3. decode content -> string
    4. metadata_extractor.extract() -> DocumentMetadata
    5. extract_tables_with_positions() -> List[RTFTable]
    6. extract_inline_content() -> str
    7. Build result string

    If the data does not look like RTF, or any step above raises, the handler
    falls back to a striprtf-based extraction (see _extract_fallback).
    """

    def _create_file_converter(self) -> RTFFileConverter:
        """Create RTF-specific file converter."""
        return RTFFileConverter()

    def _create_preprocessor(self) -> RTFPreprocessor:
        """Create RTF-specific preprocessor."""
        return RTFPreprocessor()

    def _create_chart_extractor(self) -> BaseChartExtractor:
        """RTF files do not contain charts. Return NullChartExtractor."""
        return NullChartExtractor(self._chart_processor)

    def _create_metadata_extractor(self) -> RTFMetadataExtractor:
        """Create RTF-specific metadata extractor."""
        return RTFMetadataExtractor()

    def _create_format_image_processor(self) -> ImageProcessor:
        """Create RTF-specific image processor (use base for now)."""
        return self._image_processor

    def extract_text(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True,
        **kwargs
    ) -> str:
        """
        Extract text from RTF file.

        Args:
            current_file: CurrentFile dict containing file info and binary
                data (read via the "file_path" and "file_data" keys)
            extract_metadata: Whether to extract metadata
            **kwargs: Additional options (not used by this method)

        Returns:
            Extracted text. For empty input a placeholder message naming the
            file is returned; for non-RTF data or on any processing error the
            result of the fallback extraction is returned.
        """
        file_path = current_file.get("file_path", "unknown")
        file_data = current_file.get("file_data", b"")

        self.logger.info(f"RTF processing: {file_path}")

        if not file_data:
            self.logger.error(f"Empty file data: {file_path}")
            return f"[RTF file is empty: {file_path}]"

        # Validate RTF format: the data must start with the "{\rtf" header
        # once surrounding whitespace is ignored.
        if not file_data.strip().startswith(b'{\\rtf'):
            self.logger.warning(f"Invalid RTF format: {file_path}")
            return self._extract_fallback(file_data, extract_metadata)

        try:
            # Step 1: Converter - pass through (RTF uses raw binary)
            raw_data: bytes = self.file_converter.convert(file_data)

            # Step 2: Preprocessor - extract images, remove binary data
            output_dir = self._get_output_dir(file_path)
            doc_name = Path(file_path).stem if file_path != "unknown" else "document"

            preprocessed = self.preprocessor.preprocess(
                raw_data,
                output_dir=output_dir,
                doc_name=doc_name,
            )

            clean_content = preprocessed.clean_content
            image_tags = preprocessed.extracted_resources.get("image_tags", [])
            # Default encoding when the preprocessor did not report one.
            encoding = preprocessed.encoding or "cp949"

            # Step 3: Decode to string if still bytes
            if isinstance(clean_content, bytes):
                encoding = detect_encoding(clean_content) or encoding
                content = decode_content(clean_content, encoding)
            else:
                content = clean_content

            # Build RTFConvertedData for downstream processing
            converted = RTFConvertedData(
                content=content,
                encoding=encoding,
                image_tags=image_tags,
                original_size=len(file_data),
            )

            self.logger.debug(
                f"RTF preprocessed: encoding={encoding}, "
                f"images={len(image_tags)}, size={len(file_data)}"
            )

            # Step 4: Extract content
            return self._extract_from_converted(
                converted,
                current_file,
                extract_metadata,
            )

        except Exception as e:
            # Any failure in the structured pipeline degrades to the
            # striprtf-based extraction instead of failing the document.
            self.logger.error(f"Error in RTF processing: {e}", exc_info=True)
            return self._extract_fallback(file_data, extract_metadata)

    def _extract_from_converted(
        self,
        converted: RTFConvertedData,
        current_file: "CurrentFile",
        extract_metadata: bool,
    ) -> str:
        """
        Internal method to extract content from RTFConvertedData.

        The result is assembled from: formatted metadata (optional), a page
        tag for page 1, the document body, and the image tags collected by
        the preprocessor.

        Args:
            converted: RTFConvertedData object
            current_file: CurrentFile dict (not used by this method)
            extract_metadata: Whether to extract metadata

        Returns:
            Extracted text
        """
        content = converted.content
        encoding = converted.encoding

        result_parts = []

        # Extract metadata
        if extract_metadata:
            source = RTFSourceInfo(content=content, encoding=encoding)
            metadata = self.metadata_extractor.extract(source)
            metadata_str = self.metadata_extractor.format(metadata)
            if metadata_str:
                result_parts.append(metadata_str + "\n\n")

        # Add page tag
        page_tag = self.create_page_tag(1)
        result_parts.append(f"{page_tag}\n")

        # Extract tables with positions
        tables, table_regions = extract_tables_with_positions(content, encoding)

        # Extract inline content (preserves table positions)
        inline_content = extract_inline_content(content, table_regions, encoding)

        if inline_content:
            result_parts.append(inline_content)
        else:
            # Fallback: separate text and tables. Tables are appended only in
            # this branch; the inline content above is produced from the
            # table regions.
            text_only = extract_text_only(content, encoding)
            if text_only:
                result_parts.append(text_only)

            for table in tables:
                if not table.rows:
                    continue
                if table.is_real_table():
                    result_parts.append("\n" + table.to_html() + "\n")
                else:
                    result_parts.append("\n" + table.to_text_list() + "\n")

        # Add image tags
        if converted.image_tags:
            result_parts.append("\n")
            for tag in converted.image_tags:
                result_parts.append(tag + "\n")

        result = "\n".join(result_parts)

        # Clean up invalid image tags (an "uploads/" path segment that is
        # directly followed by a dot).
        result = re.sub(r'\[image:[^\]]*uploads/\.[^\]]*\]', '', result)

        return result

    def _extract_fallback(
        self,
        file_data: bytes,
        extract_metadata: bool,
    ) -> str:
        """
        Fallback extraction using striprtf library.

        Used when the data does not start with an RTF header or when the
        structured pipeline raised. No tables or images are extracted here.

        Args:
            file_data: Raw binary data
            extract_metadata: Whether to extract metadata

        Returns:
            Extracted text
        """
        # Try different encodings
        content = None
        for encoding in ['utf-8', 'cp949', 'euc-kr', 'cp1252', 'latin-1']:
            try:
                content = file_data.decode(encoding)
                break
            except (UnicodeDecodeError, UnicodeError):
                continue

        # Defensive only: 'latin-1' maps every byte, so the loop above is
        # expected to always produce a string.
        if content is None:
            content = file_data.decode('cp1252', errors='replace')

        result_parts = []

        # Extract metadata from raw content
        if extract_metadata:
            source = RTFSourceInfo(content=content, encoding='cp1252')
            metadata = self.metadata_extractor.extract(source)
            # Format the metadata object that was just extracted, exactly as
            # _extract_from_converted() does. The previous code handed it to
            # extract_and_format_metadata(), which by its name performs the
            # extraction itself.
            # NOTE(review): confirm against BaseHandler.
            metadata_str = self.metadata_extractor.format(metadata)
            if metadata_str:
                result_parts.append(metadata_str + "\n\n")

        # Add page tag
        page_tag = self.create_page_tag(1)
        result_parts.append(f"{page_tag}\n")

        # Extract text using striprtf
        try:
            text = rtf_to_text(content)
        except Exception:
            # Manual cleanup: drop control words, hex escapes and braces.
            text = re.sub(r'\\[a-z]+\d*\s?', '', content)
            text = re.sub(r"\\'[0-9a-fA-F]{2}", '', text)
            text = re.sub(r'[{}]', '', text)

        if text:
            # Collapse runs of three or more newlines into one blank line.
            text = re.sub(r'\n{3,}', '\n\n', text)
            result_parts.append(text.strip())

        return "\n".join(result_parts)

    def _get_output_dir(self, file_path: str) -> Optional[Path]:
        """
        Get output directory for images.

        Args:
            file_path: Path of the file being processed (not used; the
                directory comes from the image processor configuration)

        Returns:
            The image processor's configured directory as a Path, or None
            when the image processor has no config or no directory is set.
        """
        if hasattr(self._image_processor, 'config'):
            dir_path = self._image_processor.config.directory_path
            if dir_path:
                return Path(dir_path)
        return None
288
+
289
+
290
+ __all__ = ['RTFHandler']