xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py
|
|
2
|
+
"""
|
|
3
|
+
Excel Metadata Extraction Module
|
|
4
|
+
|
|
5
|
+
Provides ExcelMetadataExtractor classes for extracting metadata from Excel documents.
|
|
6
|
+
Supports both XLSX (openpyxl) and XLS (xlrd) formats.
|
|
7
|
+
Implements BaseMetadataExtractor interface.
|
|
8
|
+
"""
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Any, Optional
|
|
11
|
+
|
|
12
|
+
from xgen_doc2chunk.core.functions.metadata_extractor import (
|
|
13
|
+
BaseMetadataExtractor,
|
|
14
|
+
DocumentMetadata,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger("document-processor")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class XLSXMetadataExtractor(BaseMetadataExtractor):
    """
    XLSX Metadata Extractor.

    Reads the ``properties`` object of an openpyxl Workbook and maps its
    fields onto a DocumentMetadata instance.

    Supported fields:
    - title, subject, author (creator), keywords
    - comments (description), last_saved_by
    - create_time, last_saved_time

    Usage:
        extractor = XLSXMetadataExtractor()
        metadata = extractor.extract(workbook)
        text = extractor.format(metadata)
    """

    def extract(self, source: Any) -> DocumentMetadata:
        """
        Extract metadata from XLSX document.

        Args:
            source: openpyxl Workbook object

        Returns:
            DocumentMetadata instance containing extracted metadata.
            An empty DocumentMetadata is returned when extraction fails.
        """
        try:
            props = source.properties

            return DocumentMetadata(
                title=self._get_stripped(props.title),
                subject=self._get_stripped(props.subject),
                author=self._get_stripped(props.creator),
                keywords=self._get_stripped(props.keywords),
                comments=self._get_stripped(props.description),
                last_saved_by=self._get_stripped(props.lastModifiedBy),
                create_time=props.created,
                last_saved_time=props.modified,
            )
        except Exception as e:
            # Best effort: a metadata failure is logged and must not abort
            # processing of the document itself.
            # NOTE(review): self.logger is presumably provided by
            # BaseMetadataExtractor - confirm.
            self.logger.warning(f"Failed to extract XLSX metadata: {e}")
            return DocumentMetadata()

    def _get_stripped(self, value: Optional[str]) -> Optional[str]:
        """Return stripped string value, or None if empty or whitespace-only."""
        if not value:
            return None
        # A whitespace-only value strips down to "", which also counts as
        # "empty" and is reported as None.
        return value.strip() or None
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class XLSMetadataExtractor(BaseMetadataExtractor):
    """
    XLS Metadata Extractor.

    Builds a DocumentMetadata from an xlrd Workbook object.
    Note: xlrd has limited metadata support.

    Supported fields:
    - author (user_name)

    Usage:
        extractor = XLSMetadataExtractor()
        metadata = extractor.extract(workbook)
        text = extractor.format(metadata)
    """

    def extract(self, source: Any) -> DocumentMetadata:
        """
        Extract metadata from XLS document.

        Args:
            source: xlrd Workbook object

        Returns:
            DocumentMetadata instance containing extracted metadata.
            An empty DocumentMetadata is returned when extraction fails.
        """
        try:
            # A missing or empty user_name is reported as "no author".
            user_name = getattr(source, 'user_name', None)
            return DocumentMetadata(author=user_name if user_name else None)
        except Exception as e:
            self.logger.warning(f"Failed to extract XLS metadata: {e}")
            return DocumentMetadata()
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class ExcelMetadataExtractor(BaseMetadataExtractor):
    """
    Unified Excel Metadata Extractor.

    Holds one XLSX extractor and one XLS extractor and forwards each
    request to the one matching the given file format.

    Usage:
        extractor = ExcelMetadataExtractor()
        # For XLSX
        metadata = extractor.extract(xlsx_workbook, file_type='xlsx')
        # For XLS
        metadata = extractor.extract(xls_workbook, file_type='xls')
    """

    def __init__(self, **kwargs):
        """Create both format-specific extractors with the same keyword arguments."""
        super().__init__(**kwargs)
        self._xlsx_extractor = XLSXMetadataExtractor(**kwargs)
        self._xls_extractor = XLSMetadataExtractor(**kwargs)

    def extract(self, source: Any, file_type: str = 'xlsx') -> DocumentMetadata:
        """
        Extract metadata from Excel document.

        Args:
            source: openpyxl Workbook or xlrd Workbook object
            file_type: File format ('xlsx' or 'xls'); compared
                case-insensitively, anything other than 'xls' is handled
                as XLSX.

        Returns:
            DocumentMetadata instance containing extracted metadata.
        """
        is_legacy_format = file_type.lower() == 'xls'
        delegate = self._xls_extractor if is_legacy_format else self._xlsx_extractor
        return delegate.extract(source)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
__all__ = [
|
|
142
|
+
'ExcelMetadataExtractor',
|
|
143
|
+
'XLSXMetadataExtractor',
|
|
144
|
+
'XLSMetadataExtractor',
|
|
145
|
+
]
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py
|
|
2
|
+
"""
|
|
3
|
+
Excel Preprocessor - Process Excel workbook after conversion.
|
|
4
|
+
|
|
5
|
+
Processing Pipeline Position:
|
|
6
|
+
1. ExcelFileConverter.convert() → openpyxl.Workbook or xlrd.Book
|
|
7
|
+
2. ExcelPreprocessor.preprocess() → PreprocessedData (THIS STEP)
|
|
8
|
+
3. ExcelMetadataExtractor.extract() → DocumentMetadata
|
|
9
|
+
4. Content extraction (sheets, cells, images, charts)
|
|
10
|
+
|
|
11
|
+
Current Implementation:
|
|
12
|
+
- Pass-through (Excel uses openpyxl/xlrd objects directly)
|
|
13
|
+
"""
|
|
14
|
+
import logging
|
|
15
|
+
from typing import Any, Dict
|
|
16
|
+
|
|
17
|
+
from xgen_doc2chunk.core.functions.preprocessor import (
|
|
18
|
+
BasePreprocessor,
|
|
19
|
+
PreprocessedData,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger("xgen_doc2chunk.excel.preprocessor")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ExcelPreprocessor(BasePreprocessor):
    """
    Excel Workbook Preprocessor.

    Pass-through step: the workbook object is handed on unchanged, with a
    small summary (format, sheet count, sheet names) attached as metadata.
    The actual Excel processing happens during the content extraction
    phase using openpyxl/xlrd.
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Preprocess the converted Excel workbook.

        Args:
            converted_data: openpyxl.Workbook or xlrd.Book from ExcelFileConverter
            **kwargs: Additional options (not used by this implementation)

        Returns:
            PreprocessedData carrying the workbook as both raw and clean
            content, plus the detected format summary as metadata.
        """
        # The workbook flavour is recognised by duck typing: openpyxl
        # exposes a `sheetnames` attribute, xlrd a `sheet_names()` method.
        if hasattr(converted_data, 'sheetnames'):
            summary: Dict[str, Any] = {
                'format': 'xlsx',
                'sheet_count': len(converted_data.sheetnames),
                'sheet_names': converted_data.sheetnames,
            }
        elif hasattr(converted_data, 'sheet_names'):
            summary = {
                'format': 'xls',
                'sheet_count': converted_data.nsheets,
                'sheet_names': converted_data.sheet_names(),
            }
        else:
            # Unknown object: passed through with empty metadata.
            summary = {}

        logger.debug("Excel preprocessor: pass-through, metadata=%s", summary)

        # clean_content is the TRUE SOURCE - it holds the workbook itself.
        return PreprocessedData(
            raw_content=converted_data,
            clean_content=converted_data,
            encoding="utf-8",
            extracted_resources={},
            metadata=summary,
        )

    def get_format_name(self) -> str:
        """Return format name."""
        return "Excel Preprocessor"

    def validate(self, data: Any) -> bool:
        """Validate if data is an Excel Workbook object (openpyxl or xlrd)."""
        return any(hasattr(data, attr) for attr in ('sheetnames', 'sheet_names'))
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
__all__ = ['ExcelPreprocessor']
|
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Excel XLS 테이블 변환 모듈
|
|
3
|
+
|
|
4
|
+
XLS 시트를 Markdown 또는 HTML 테이블로 변환합니다.
|
|
5
|
+
병합셀이 있으면 HTML, 없으면 Markdown을 사용합니다.
|
|
6
|
+
layout_detect_range를 통해 실제 데이터 영역만 추출합니다.
|
|
7
|
+
object_detect를 통해 개별 객체(테이블)별로 청킹할 수 있습니다.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from typing import Optional, List
|
|
12
|
+
import xlrd
|
|
13
|
+
|
|
14
|
+
from xgen_doc2chunk.core.processor.excel_helper.excel_layout_detector import layout_detect_range_xls, object_detect_xls, LayoutRange
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger("document-processor")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def has_merged_cells_xls(sheet, layout: Optional[LayoutRange] = None) -> bool:
    """
    Check whether an XLS sheet contains merged cells.

    When a layout is given, only merged ranges that overlap that layout
    are taken into account.

    Args:
        sheet: xlrd Sheet object
        layout: Layout range to inspect (None means the whole sheet)

    Returns:
        True if a relevant merged range exists; False otherwise, including
        when the merged-cell information cannot be read.
    """
    try:
        merged_ranges = sheet.merged_cells
        if len(merged_ranges) == 0:
            return False

        # Without a layout, any merged range anywhere on the sheet counts.
        if layout is None:
            return True

        # xlrd reports (rlo, rhi, clo, chi): 0-based, upper bounds exclusive.
        # In the layout's 1-based inclusive terms the range therefore spans
        # rows rlo + 1 .. rhi and columns clo + 1 .. chi.
        return any(
            row_lo + 1 <= layout.max_row
            and row_hi >= layout.min_row
            and col_lo + 1 <= layout.max_col
            and col_hi >= layout.min_col
            for (row_lo, row_hi, col_lo, col_hi) in merged_ranges
        )
    except Exception:
        return False
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def convert_xls_sheet_to_table(sheet, wb, layout: Optional[LayoutRange] = None) -> str:
    """
    Convert an XLS sheet to a table string.

    Markdown is used when the region has no merged cells, HTML when it
    does. If no layout is supplied it is detected automatically.

    Args:
        sheet: xlrd Sheet object
        wb: xlrd Workbook object
        layout: Layout range to convert (None means auto-detect)

    Returns:
        The converted table string, or "" when no layout could be detected.
    """
    region = layout if layout is not None else layout_detect_range_xls(sheet)
    if region is None:
        logger.debug("No data found in XLS sheet")
        return ""

    if not has_merged_cells_xls(sheet, region):
        logger.debug("No merged cells in XLS, using Markdown format")
        return convert_xls_sheet_to_markdown(sheet, wb, region)

    logger.debug("Merged cells detected in XLS, using HTML format")
    return convert_xls_sheet_to_html(sheet, wb, region)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def convert_xls_sheet_to_markdown(sheet, wb, layout: Optional[LayoutRange] = None) -> str:
    """
    Convert an XLS sheet to a Markdown table.

    Only the cells inside the layout range are rendered. Rows without any
    content are skipped, and a separator line is inserted after the first
    emitted row so that it acts as the table header.

    Args:
        sheet: xlrd Sheet object
        wb: xlrd Workbook object
        layout: Layout range to convert (None means auto-detect)

    Returns:
        Markdown table string, or "" if there is no data or conversion fails.
    """
    try:
        # Auto-detect the data region when none is given.
        if layout is None:
            layout = layout_detect_range_xls(sheet)
            if layout is None:
                return ""

        md_parts = []

        # The layout is 1-based and inclusive; xlrd indexes are 0-based.
        for row_idx in range(layout.min_row - 1, layout.max_row):
            cells = []

            for col_idx in range(layout.min_col - 1, layout.max_col):
                cell_value = ""
                try:
                    value = sheet.cell_value(row_idx, col_idx)
                    # Only None and "" mean "no value". A plain truthiness
                    # test would also discard numeric 0 and boolean False,
                    # silently losing real data.
                    if value is not None and value != "":
                        cell_type = sheet.cell_type(row_idx, col_idx)
                        cell_value = _format_xls_cell_value(value, cell_type, wb)
                except Exception as e:
                    # Best effort: an unreadable cell is rendered empty.
                    logger.debug(
                        "Skipping unreadable XLS cell (%d, %d): %s",
                        row_idx, col_idx, e,
                    )

                # Pipes would break the Markdown row and newlines would
                # split it across lines.
                cell_value = cell_value.replace("|", "\\|").replace("\n", " ")
                cells.append(cell_value)

            # Drop rows in which every cell is empty.
            if not any(cells):
                continue

            md_parts.append("| " + " | ".join(cells) + " |")

            # The first emitted row is the header: follow it with a separator.
            if len(md_parts) == 1:
                md_parts.append("| " + " | ".join(["---"] * len(cells)) + " |")

        return "\n".join(md_parts)

    except Exception as e:
        logger.warning(f"Error converting XLS sheet to Markdown: {e}")
        return ""
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def convert_xls_sheet_to_html(sheet, wb, layout: Optional[LayoutRange] = None) -> str:
    """
    Convert an XLS sheet to an HTML table.

    Merged cells are rendered with rowspan/colspan attributes. Only the
    cells inside the layout range are rendered. Unlike the Markdown
    conversion, empty rows are kept because they are part of the table
    structure when merged cells are present.

    Args:
        sheet: xlrd Sheet object
        wb: xlrd Workbook object
        layout: Layout range to convert (None means auto-detect)

    Returns:
        HTML table string, or "" if the region holds no data or conversion fails.
    """
    try:
        # Auto-detect the data region when none is given.
        if layout is None:
            layout = layout_detect_range_xls(sheet)
            if layout is None:
                return ""

        # xlrd reports merged ranges as (rlo, rhi, clo, chi): 0-based with
        # exclusive upper bounds.
        merged_cells_info = {}  # (row, col) of the top-left cell -> (rowspan, colspan), 0-based
        skip_cells = set()      # cells covered by a merged range (not its top-left), 0-based

        for (rlo, rhi, clo, chi) in sheet.merged_cells:
            # Compare in the layout's 1-based inclusive coordinates: the
            # range covers rows rlo + 1 .. rhi and columns clo + 1 .. chi.
            overlaps_layout = (
                rlo + 1 <= layout.max_row
                and rhi >= layout.min_row
                and clo + 1 <= layout.max_col
                and chi >= layout.min_col
            )
            if not overlaps_layout:
                continue

            # NOTE(review): a range that only partly overlaps the layout is
            # used unclipped, so its span can reach past the rendered
            # region - confirm this is acceptable.
            merged_cells_info[(rlo, clo)] = (rhi - rlo, chi - clo)

            # Every other cell of the range is swallowed by the top-left one.
            skip_cells.update(
                (r, c)
                for r in range(rlo, rhi)
                for c in range(clo, chi)
                if (r, c) != (rlo, clo)
            )

        html_parts = ["<table border='1'>"]
        has_data = False
        header_row_idx = layout.min_row - 1  # 0-based index of the first row

        # The layout is 1-based and inclusive; xlrd indexes are 0-based.
        for row_idx in range(header_row_idx, layout.max_row):
            row_parts = ["<tr>"]

            for col_idx in range(layout.min_col - 1, layout.max_col):
                if (row_idx, col_idx) in skip_cells:
                    continue

                cell_value = ""
                try:
                    value = sheet.cell_value(row_idx, col_idx)
                    # Only None and "" mean "no value". A plain truthiness
                    # test would also discard numeric 0 and boolean False,
                    # silently losing real data.
                    if value is not None and value != "":
                        cell_type = sheet.cell_type(row_idx, col_idx)
                        cell_value = _format_xls_cell_value(value, cell_type, wb)

                        if cell_value:
                            has_data = True
                except Exception as e:
                    # Best effort: an unreadable cell is rendered empty.
                    logger.debug(
                        "Skipping unreadable XLS cell (%d, %d): %s",
                        row_idx, col_idx, e,
                    )

                cell_value = _escape_html(cell_value)

                # The first row of the layout is rendered as the header.
                tag = "th" if row_idx == header_row_idx else "td"

                attrs = []
                rowspan, colspan = merged_cells_info.get((row_idx, col_idx), (1, 1))
                if rowspan > 1:
                    attrs.append(f"rowspan='{rowspan}'")
                if colspan > 1:
                    attrs.append(f"colspan='{colspan}'")

                attr_str = " " + " ".join(attrs) if attrs else ""
                row_parts.append(f"<{tag}{attr_str}>{cell_value}</{tag}>")

            row_parts.append("</tr>")

            # Every row is emitted, empty ones included.
            html_parts.append("".join(row_parts))

        html_parts.append("</table>")

        # A table without a single non-empty cell is reported as no table.
        if has_data:
            return "\n".join(html_parts)
        return ""

    except Exception as e:
        logger.warning(f"Error converting XLS sheet to HTML: {e}")
        return ""
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _format_xls_cell_value(value, cell_type, wb) -> str:
    """
    Format an XLS cell value as a string.

    - Numbers: whole values are written without a decimal part ("3", not "3.0").
    - Dates: "YYYY-MM-DD", followed by " HH:MM:SS" when the cell carries a
      non-zero time; cells holding only a time are written as "HH:MM:SS".
    - Anything else: str(value) with surrounding whitespace removed.

    Args:
        value: Cell value
        cell_type: xlrd cell type
        wb: xlrd Workbook object (its datemode is needed to decode dates)

    Returns:
        Formatted string
    """
    try:
        if cell_type == xlrd.XL_CELL_NUMBER:
            if value == int(value):
                return str(int(value))
            return str(value)

        if cell_type == xlrd.XL_CELL_DATE:
            try:
                year, month, day, hour, minute, second = xlrd.xldate_as_tuple(
                    value, wb.datemode
                )
            except Exception:
                # Undecodable date serial: fall back to the raw number.
                return str(value)

            time_part = f"{hour:02d}:{minute:02d}:{second:02d}"
            # xlrd returns (0, 0, 0, h, m, s) for time-only values; printing
            # the date part would yield the meaningless "0000-00-00".
            if (year, month, day) == (0, 0, 0):
                return time_part

            date_part = f"{year:04d}-{month:02d}-{day:02d}"
            # Keep the time of day instead of silently dropping it.
            if hour or minute or second:
                return f"{date_part} {time_part}"
            return date_part

        return str(value).strip()
    except Exception:
        return str(value).strip() if value else ""
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _escape_html(text: str) -> str:
|
|
295
|
+
"""
|
|
296
|
+
HTML 특수 문자를 이스케이프합니다.
|
|
297
|
+
|
|
298
|
+
Args:
|
|
299
|
+
text: 원본 텍스트
|
|
300
|
+
|
|
301
|
+
Returns:
|
|
302
|
+
이스케이프된 텍스트
|
|
303
|
+
"""
|
|
304
|
+
if not text:
|
|
305
|
+
return ""
|
|
306
|
+
|
|
307
|
+
text = text.replace("&", "&")
|
|
308
|
+
text = text.replace("<", "<")
|
|
309
|
+
text = text.replace(">", ">")
|
|
310
|
+
text = text.replace("\n", "<br>")
|
|
311
|
+
|
|
312
|
+
return text
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def convert_xls_objects_to_tables(sheet, wb, layout: Optional[LayoutRange] = None) -> List[str]:
    """
    Detect the individual objects (tables) of an XLS sheet and convert each
    one to a table string.

    Algorithm (as described for the detection step; it is implemented in
    object_detect_xls, not here):
    1. Bordered regions are recognised as individual objects first
    2. Value regions without borders are detected
    3. Directly adjacent objects are merged
    4. Each object is converted to a table

    Args:
        sheet: xlrd Sheet object
        wb: xlrd Workbook object
        layout: Layout range to search (None means auto-detect)

    Returns:
        List of table strings, one per object, in the order returned by
        object_detect_xls (documented as top-to-bottom, left-to-right).
    """
    detected = object_detect_xls(sheet, wb, layout)
    if not detected:
        return []

    tables = []
    for region in detected:
        rendered = convert_xls_sheet_to_table(sheet, wb, region)
        if not rendered or not rendered.strip():
            continue

        # Keep the table only if at least one line that is not a Markdown
        # separator ('---') has some text between its pipe characters.
        carries_text = any(
            segment.strip()
            for raw_line in rendered.strip().split('\n')
            if raw_line.strip() and '---' not in raw_line
            for segment in raw_line.strip().split('|')
        )
        if carries_text:
            tables.append(rendered)

    logger.debug(f"Converted {len(tables)} objects to tables (XLS)")
    return tables
|