xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,145 @@
1
+ # xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py
2
+ """
3
+ Excel Metadata Extraction Module
4
+
5
+ Provides ExcelMetadataExtractor classes for extracting metadata from Excel documents.
6
+ Supports both XLSX (openpyxl) and XLS (xlrd) formats.
7
+ Implements BaseMetadataExtractor interface.
8
+ """
9
+ import logging
10
+ from typing import Any, Optional
11
+
12
+ from xgen_doc2chunk.core.functions.metadata_extractor import (
13
+ BaseMetadataExtractor,
14
+ DocumentMetadata,
15
+ )
16
+
17
+ logger = logging.getLogger("document-processor")
18
+
19
+
20
class XLSXMetadataExtractor(BaseMetadataExtractor):
    """
    XLSX Metadata Extractor.

    Reads the ``properties`` object of an openpyxl Workbook and maps its
    fields onto a DocumentMetadata instance.

    Supported fields:
        - title, subject, author (creator), keywords
        - comments (description), last_saved_by
        - create_time, last_saved_time

    Usage:
        extractor = XLSXMetadataExtractor()
        metadata = extractor.extract(workbook)
        text = extractor.format(metadata)
    """

    def extract(self, source: Any) -> DocumentMetadata:
        """
        Extract metadata from XLSX document.

        Args:
            source: openpyxl Workbook object

        Returns:
            DocumentMetadata instance containing extracted metadata.
            If anything goes wrong while reading the properties, a warning
            is logged and an empty DocumentMetadata is returned.
        """
        try:
            props = source.properties

            return DocumentMetadata(
                title=self._get_stripped(props.title),
                subject=self._get_stripped(props.subject),
                author=self._get_stripped(props.creator),
                keywords=self._get_stripped(props.keywords),
                comments=self._get_stripped(props.description),
                last_saved_by=self._get_stripped(props.lastModifiedBy),
                create_time=props.created,
                last_saved_time=props.modified,
            )
        except Exception as e:
            # Metadata extraction is best-effort: never fail the document
            # because its properties could not be read.
            self.logger.warning(f"Failed to extract XLSX metadata: {e}")
            return DocumentMetadata()

    def _get_stripped(self, value: Optional[str]) -> Optional[str]:
        """
        Return the value with surrounding whitespace removed.

        Args:
            value: Raw property value (normally a string or None).

        Returns:
            The stripped text, or None when the value is missing, empty,
            or consists only of whitespace.
        """
        if not value:
            return None
        # str() keeps one unexpected non-string property from raising and
        # thereby discarding every other metadata field in extract().
        stripped = str(value).strip()
        # A whitespace-only property carries no information; report it as
        # missing instead of returning an empty string.
        return stripped or None
67
+
68
+
69
class XLSMetadataExtractor(BaseMetadataExtractor):
    """
    XLS Metadata Extractor.

    Works on xlrd Workbook objects. xlrd exposes very little metadata,
    so only a single field is populated.

    Supported fields:
        - author (user_name)

    Usage:
        extractor = XLSMetadataExtractor()
        metadata = extractor.extract(workbook)
        text = extractor.format(metadata)
    """

    def extract(self, source: Any) -> DocumentMetadata:
        """
        Extract metadata from XLS document.

        Args:
            source: xlrd Workbook object

        Returns:
            DocumentMetadata whose author is the workbook's ``user_name``
            when that attribute exists and is non-empty; an empty
            DocumentMetadata if extraction raises.
        """
        try:
            user_name = getattr(source, 'user_name', None)
            # Missing or empty user names are both reported as "no author".
            return DocumentMetadata(author=user_name if user_name else None)
        except Exception as e:
            self.logger.warning(f"Failed to extract XLS metadata: {e}")
            return DocumentMetadata()
104
+
105
+
106
class ExcelMetadataExtractor(BaseMetadataExtractor):
    """
    Unified Excel Metadata Extractor.

    Holds one XLSX extractor and one XLS extractor and forwards each
    request to the one matching the given file type.

    Usage:
        extractor = ExcelMetadataExtractor()
        # For XLSX
        metadata = extractor.extract(xlsx_workbook, file_type='xlsx')
        # For XLS
        metadata = extractor.extract(xls_workbook, file_type='xls')
    """

    def __init__(self, **kwargs):
        """Create both format-specific extractors with the same keyword arguments."""
        super().__init__(**kwargs)
        self._xlsx_extractor = XLSXMetadataExtractor(**kwargs)
        self._xls_extractor = XLSMetadataExtractor(**kwargs)

    def extract(self, source: Any, file_type: str = 'xlsx') -> DocumentMetadata:
        """
        Extract metadata from Excel document.

        Args:
            source: openpyxl Workbook or xlrd Workbook object
            file_type: File format ('xlsx' or 'xls'); compared
                case-insensitively. Anything other than 'xls' is handled
                by the XLSX extractor.

        Returns:
            DocumentMetadata instance containing extracted metadata.
        """
        is_legacy_xls = file_type.lower() == 'xls'
        delegate = self._xls_extractor if is_legacy_xls else self._xlsx_extractor
        return delegate.extract(source)
139
+
140
+
141
+ __all__ = [
142
+ 'ExcelMetadataExtractor',
143
+ 'XLSXMetadataExtractor',
144
+ 'XLSMetadataExtractor',
145
+ ]
@@ -0,0 +1,83 @@
1
+ # xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py
2
+ """
3
+ Excel Preprocessor - Process Excel workbook after conversion.
4
+
5
+ Processing Pipeline Position:
6
+ 1. ExcelFileConverter.convert() → openpyxl.Workbook or xlrd.Book
7
+ 2. ExcelPreprocessor.preprocess() → PreprocessedData (THIS STEP)
8
+ 3. ExcelMetadataExtractor.extract() → DocumentMetadata
9
+ 4. Content extraction (sheets, cells, images, charts)
10
+
11
+ Current Implementation:
12
+ - Pass-through (Excel uses openpyxl/xlrd objects directly)
13
+ """
14
+ import logging
15
+ from typing import Any, Dict
16
+
17
+ from xgen_doc2chunk.core.functions.preprocessor import (
18
+ BasePreprocessor,
19
+ PreprocessedData,
20
+ )
21
+
22
+ logger = logging.getLogger("xgen_doc2chunk.excel.preprocessor")
23
+
24
+
25
class ExcelPreprocessor(BasePreprocessor):
    """
    Excel Workbook Preprocessor.

    A pass-through step: the workbook object is handed on unchanged and
    only a small summary (format, sheet count, sheet names) is recorded.
    The actual Excel processing happens later, during content extraction
    with openpyxl/xlrd.
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Preprocess the converted Excel workbook.

        Args:
            converted_data: openpyxl.Workbook or xlrd.Book from ExcelFileConverter
            **kwargs: Additional options (not used by this implementation)

        Returns:
            PreprocessedData carrying the untouched workbook as both
            raw_content and clean_content, plus the collected summary
            in ``metadata``.
        """
        summary: Dict[str, Any] = {}

        # The workbook flavour is recognised by attribute: openpyxl has a
        # ``sheetnames`` property, xlrd has a ``sheet_names()`` method.
        if hasattr(converted_data, 'sheetnames'):
            summary = {
                'format': 'xlsx',
                'sheet_count': len(converted_data.sheetnames),
                'sheet_names': converted_data.sheetnames,
            }
        elif hasattr(converted_data, 'sheet_names'):
            summary = {
                'format': 'xls',
                'sheet_count': converted_data.nsheets,
                'sheet_names': converted_data.sheet_names(),
            }

        logger.debug("Excel preprocessor: pass-through, metadata=%s", summary)

        # clean_content is the authoritative payload: the Workbook itself.
        return PreprocessedData(
            raw_content=converted_data,
            clean_content=converted_data,
            encoding="utf-8",
            extracted_resources={},
            metadata=summary,
        )

    def get_format_name(self) -> str:
        """Return the display name of this preprocessor."""
        return "Excel Preprocessor"

    def validate(self, data: Any) -> bool:
        """Return True when data looks like an openpyxl or xlrd workbook."""
        workbook_markers = ('sheetnames', 'sheet_names')  # openpyxl, xlrd
        return any(hasattr(data, marker) for marker in workbook_markers)
81
+
82
+
83
+ __all__ = ['ExcelPreprocessor']
@@ -0,0 +1,357 @@
1
+ """
2
+ Excel XLS 테이블 변환 모듈
3
+
4
+ XLS 시트를 Markdown 또는 HTML 테이블로 변환합니다.
5
+ 병합셀이 있으면 HTML, 없으면 Markdown을 사용합니다.
6
+ layout_detect_range를 통해 실제 데이터 영역만 추출합니다.
7
+ object_detect를 통해 개별 객체(테이블)별로 청킹할 수 있습니다.
8
+ """
9
+
10
+ import logging
11
+ from typing import Optional, List
12
+ import xlrd
13
+
14
+ from xgen_doc2chunk.core.processor.excel_helper.excel_layout_detector import layout_detect_range_xls, object_detect_xls, LayoutRange
15
+
16
+ logger = logging.getLogger("document-processor")
17
+
18
+
19
def has_merged_cells_xls(sheet, layout: Optional[LayoutRange] = None) -> bool:
    """
    Report whether an XLS sheet contains merged cells.

    When a layout is given, only merged ranges that overlap that layout
    are taken into account.

    Args:
        sheet: xlrd Sheet object
        layout: Layout range to inspect (None means the whole sheet)

    Returns:
        True if a (relevant) merged range exists; False otherwise, and
        also False when inspecting the sheet raises.
    """
    try:
        merged_ranges = sheet.merged_cells
        if len(merged_ranges) == 0:
            return False

        # Without a layout, the mere presence of a merged range is enough.
        if layout is None:
            return True

        # xlrd reports (rlo, rhi, clo, chi): 0-based with exclusive upper
        # bounds. In the layout's 1-based inclusive coordinates the lower
        # bound becomes lo + 1, while the exclusive upper bound already
        # equals the inclusive 1-based upper bound.
        return any(
            row_lo + 1 <= layout.max_row
            and row_hi >= layout.min_row
            and col_lo + 1 <= layout.max_col
            and col_hi >= layout.min_col
            for (row_lo, row_hi, col_lo, col_hi) in merged_ranges
        )
    except Exception:
        return False
57
+
58
+
59
def convert_xls_sheet_to_table(sheet, wb, layout: Optional[LayoutRange] = None) -> str:
    """
    Convert an XLS sheet to a table string.

    Markdown is produced when the region has no merged cells, HTML when
    it does. If no layout is supplied, the data region is detected
    automatically.

    Args:
        sheet: xlrd Sheet object
        wb: xlrd Workbook object
        layout: Layout range to convert (None triggers auto-detection)

    Returns:
        The converted table string, or "" when auto-detection finds no data.
    """
    region = layout_detect_range_xls(sheet) if layout is None else layout
    if region is None:
        logger.debug("No data found in XLS sheet")
        return ""

    if not has_merged_cells_xls(sheet, region):
        logger.debug("No merged cells in XLS, using Markdown format")
        return convert_xls_sheet_to_markdown(sheet, wb, region)

    logger.debug("Merged cells detected in XLS, using HTML format")
    return convert_xls_sheet_to_html(sheet, wb, region)
86
+
87
+
88
def convert_xls_sheet_to_markdown(sheet, wb, layout: Optional[LayoutRange] = None) -> str:
    """
    Convert an XLS sheet to a Markdown table.

    Only the cells inside the layout range are emitted. Rows in which no
    cell produces text are skipped, and a ``---`` separator row is
    inserted after the first emitted row.

    Args:
        sheet: xlrd Sheet object
        wb: xlrd Workbook object
        layout: Layout range to convert (None triggers auto-detection)

    Returns:
        Markdown table string; "" when there is no data or conversion fails.
    """
    try:
        # Auto-detect the data region when none was supplied.
        if layout is None:
            layout = layout_detect_range_xls(sheet)
            if layout is None:
                return ""

        md_parts = []

        # The layout is 1-based and inclusive; xlrd indices are 0-based.
        for row_idx in range(layout.min_row - 1, layout.max_row):
            cells = []
            row_has_content = False

            for col_idx in range(layout.min_col - 1, layout.max_col):
                cell_value = ""
                try:
                    value = sheet.cell_value(row_idx, col_idx)
                    # Only None / "" mean "no value". A plain truthiness
                    # test would also discard numeric 0, silently removing
                    # zeros (and all-zero rows) from the output.
                    if value is not None and value != "":
                        cell_type = sheet.cell_type(row_idx, col_idx)
                        cell_value = _format_xls_cell_value(value, cell_type, wb)

                        if cell_value:
                            row_has_content = True
                except Exception:
                    # Best-effort: a cell that cannot be read is rendered
                    # as empty rather than aborting the whole table.
                    pass

                # Pipes would break the Markdown row and must be escaped;
                # newlines are flattened so the row stays on one line.
                cell_value = cell_value.replace("|", "\\|")
                cell_value = cell_value.replace("\n", " ")
                cells.append(cell_value)

            if not row_has_content:
                continue

            md_parts.append("| " + " | ".join(cells) + " |")

            # The first emitted row acts as the header: follow it with
            # the Markdown separator row.
            if len(md_parts) == 1:
                md_parts.append("| " + " | ".join(["---"] * len(cells)) + " |")

        return "\n".join(md_parts) if md_parts else ""

    except Exception as e:
        logger.warning(f"Error converting XLS sheet to Markdown: {e}")
        return ""
151
+
152
+
153
def convert_xls_sheet_to_html(sheet, wb, layout: Optional[LayoutRange] = None) -> str:
    """
    Convert an XLS sheet to an HTML table.

    Merged cells are expressed through rowspan/colspan attributes. Only
    the cells inside the layout range are emitted. Unlike the Markdown
    conversion, empty rows are kept because they can be part of the
    structure of a merged table.

    Args:
        sheet: xlrd Sheet object
        wb: xlrd Workbook object
        layout: Layout range to convert (None triggers auto-detection)

    Returns:
        HTML table string; "" when no cell produced text or conversion fails.
    """
    try:
        # Auto-detect the data region when none was supplied.
        if layout is None:
            layout = layout_detect_range_xls(sheet)
            if layout is None:
                return ""

        # xlrd reports merged ranges as (rlo, rhi, clo, chi): 0-based,
        # with exclusive upper bounds.
        merged_cells_info = {}  # (row, col) of anchor -> (rowspan, colspan), 0-based
        skip_cells = set()      # covered, non-anchor cells of merged ranges, 0-based

        for (rlo, rhi, clo, chi) in sheet.merged_cells:
            # Compare in the layout's 1-based inclusive coordinates; the
            # exclusive 0-based upper bound equals the inclusive 1-based one.
            mr_min_row = rlo + 1
            mr_max_row = rhi
            mr_min_col = clo + 1
            mr_max_col = chi

            overlaps_layout = (
                mr_min_row <= layout.max_row and
                mr_max_row >= layout.min_row and
                mr_min_col <= layout.max_col and
                mr_max_col >= layout.min_col
            )
            if not overlaps_layout:
                continue

            merged_cells_info[(rlo, clo)] = (rhi - rlo, chi - clo)

            # Every cell of the range except its top-left anchor is
            # covered by the span and must not be emitted on its own.
            for r in range(rlo, rhi):
                for c in range(clo, chi):
                    if r != rlo or c != clo:
                        skip_cells.add((r, c))

        html_parts = ["<table border='1'>"]
        has_data = False
        header_row_idx = layout.min_row - 1

        # The layout is 1-based and inclusive; xlrd indices are 0-based.
        for row_idx in range(layout.min_row - 1, layout.max_row):
            row_parts = ["<tr>"]

            for col_idx in range(layout.min_col - 1, layout.max_col):
                if (row_idx, col_idx) in skip_cells:
                    continue

                cell_value = ""
                try:
                    value = sheet.cell_value(row_idx, col_idx)
                    # Only None / "" mean "no value". A plain truthiness
                    # test would also discard numeric 0, silently removing
                    # zeros from the output.
                    if value is not None and value != "":
                        cell_type = sheet.cell_type(row_idx, col_idx)
                        cell_value = _format_xls_cell_value(value, cell_type, wb)

                        if cell_value:
                            has_data = True
                except Exception:
                    # Best-effort: a cell that cannot be read is rendered
                    # as empty rather than aborting the whole table.
                    pass

                cell_value = _escape_html(cell_value)

                # The first row of the layout is rendered as header cells.
                tag = "th" if row_idx == header_row_idx else "td"

                # Span attributes are only written when they exceed 1.
                attrs = []
                if (row_idx, col_idx) in merged_cells_info:
                    rowspan, colspan = merged_cells_info[(row_idx, col_idx)]
                    if rowspan > 1:
                        attrs.append(f"rowspan='{rowspan}'")
                    if colspan > 1:
                        attrs.append(f"colspan='{colspan}'")

                attr_str = " " + " ".join(attrs) if attrs else ""
                row_parts.append(f"<{tag}{attr_str}>{cell_value}</{tag}>")

            row_parts.append("</tr>")

            # Every row is kept, including empty ones (table structure).
            html_parts.append("".join(row_parts))

        html_parts.append("</table>")

        if has_data:
            return "\n".join(html_parts)
        return ""

    except Exception as e:
        logger.warning(f"Error converting XLS sheet to HTML: {e}")
        return ""
262
+
263
+
264
def _format_xls_cell_value(value, cell_type, wb) -> str:
    """
    Format an XLS cell value as a string.

    Formatting rules:
        - Number cells: integral values are written without a decimal
          part ("3" rather than "3.0"); other numbers use str().
        - Date cells: "YYYY-MM-DD" for pure dates, "HH:MM:SS" for
          time-only values, and "YYYY-MM-DD HH:MM:SS" when the value
          carries both a date and a non-zero time. If the serial number
          cannot be decoded, str(value) is returned.
        - Any other cell type: str(value) with surrounding whitespace
          stripped.

    Args:
        value: Cell value
        cell_type: xlrd cell type constant
        wb: xlrd Workbook object (its ``datemode`` is needed to decode dates)

    Returns:
        Formatted string
    """
    try:
        if cell_type == xlrd.XL_CELL_NUMBER:
            if value == int(value):
                return str(int(value))
            return str(value)

        if cell_type == xlrd.XL_CELL_DATE:
            try:
                year, month, day, hour, minute, second = xlrd.xldate_as_tuple(
                    value, wb.datemode
                )
            except Exception:
                return str(value)

            time_part = f"{hour:02d}:{minute:02d}:{second:02d}"
            # xldate_as_tuple returns (0, 0, 0, h, m, s) for time-only
            # values; formatting that as a date would yield "0000-00-00".
            if (year, month, day) == (0, 0, 0):
                return time_part

            date_part = f"{year:04d}-{month:02d}-{day:02d}"
            # Keep the time of day when present instead of truncating it.
            if hour or minute or second:
                return f"{date_part} {time_part}"
            return date_part

        return str(value).strip()
    except Exception:
        # Fallback for values that cannot be formatted by type
        # (e.g. int() failing on inf/nan).
        return str(value).strip() if value else ""
292
+
293
+
294
+ def _escape_html(text: str) -> str:
295
+ """
296
+ HTML 특수 문자를 이스케이프합니다.
297
+
298
+ Args:
299
+ text: 원본 텍스트
300
+
301
+ Returns:
302
+ 이스케이프된 텍스트
303
+ """
304
+ if not text:
305
+ return ""
306
+
307
+ text = text.replace("&", "&amp;")
308
+ text = text.replace("<", "&lt;")
309
+ text = text.replace(">", "&gt;")
310
+ text = text.replace("\n", "<br>")
311
+
312
+ return text
313
+
314
+
315
def convert_xls_objects_to_tables(sheet, wb, layout: Optional[LayoutRange] = None) -> List[str]:
    """
    Detect the individual objects (tables) of an XLS sheet and convert each one.

    Detection is delegated to object_detect_xls. According to the original
    documentation of this function, it works as follows (not visible here,
    verify against the detector):
        1. Bordered regions are recognised as individual objects first
        2. Value regions without borders are detected
        3. Directly adjacent objects are merged
        4. Each object is converted to a table

    Tables that contain nothing but whitespace, line breaks or table
    syntax are filtered out.

    Args:
        sheet: xlrd Sheet object
        wb: xlrd Workbook object
        layout: Layout range to search (None triggers auto-detection)

    Returns:
        List of table strings, one per object, in the order the objects
        were reported by the detector (documented as top to bottom,
        left to right).
    """
    detected_layouts = object_detect_xls(sheet, wb, layout)
    if not detected_layouts:
        return []

    def _carries_data(rendered: str) -> bool:
        # A table counts as non-empty when some line without "---" still
        # has text left after splitting on the pipe character.
        for raw_line in rendered.strip().split('\n'):
            line = raw_line.strip()
            if not line or '---' in line:
                continue
            if any(piece.strip() for piece in line.split('|')):
                return True
        return False

    tables = []
    for obj_layout in detected_layouts:
        rendered = convert_xls_sheet_to_table(sheet, wb, obj_layout)
        if rendered and rendered.strip() and _carries_data(rendered):
            tables.append(rendered)

    logger.debug(f"Converted {len(tables)} objects to tables (XLS)")
    return tables