xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,220 @@
1
+ # xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py
2
+ """
3
+ DOCX Table Processor
4
+
5
+ Formats TableData into HTML/Markdown/Text output for DOCX documents.
6
+ Extends the base TableProcessor with DOCX-specific formatting options.
7
+
8
+ Key Features:
9
+ - HTML output with border attributes for backward compatibility
10
+ - Special handling for 1x1 container tables
11
+ - Special handling for single column tables
12
+ - Post-processing for DOCX-specific requirements
13
+
14
+ Usage:
15
+ from xgen_doc2chunk.core.processor.docx_helper.docx_table_processor import (
16
+ DOCXTableProcessor,
17
+ create_docx_table_processor,
18
+ )
19
+
20
+ processor = DOCXTableProcessor()
21
+ html = processor.format_table(table_data)
22
+ """
23
+ import logging
24
+ from dataclasses import dataclass
25
+ from typing import Optional
26
+
27
+ from xgen_doc2chunk.core.functions.table_extractor import TableData
28
+ from xgen_doc2chunk.core.functions.table_processor import (
29
+ TableProcessor,
30
+ TableProcessorConfig,
31
+ TableOutputFormat,
32
+ )
33
+
34
+ logger = logging.getLogger("document-processor")
35
+
36
+
37
@dataclass
class DOCXTableProcessorConfig(TableProcessorConfig):
    """Options controlling how DOCX tables are rendered.

    Inherits every setting of TableProcessorConfig and adds three
    DOCX-only switches, all of which are enabled by default.

    Attributes:
        add_border: When True, DOCXTableProcessor rewrites ``<table>`` to
            ``<table border='1'>`` in the HTML it produces.
        collapse_single_cell: When True, a 1x1 table is emitted as the
            content of its only cell instead of as a table.
        collapse_single_column: When True, a one-column table is emitted as
            its non-empty cell contents separated by blank lines.
    """

    # Field order is part of the generated __init__ signature; keep it stable.
    add_border: bool = True
    collapse_single_cell: bool = True
    collapse_single_column: bool = True
51
+
52
+
53
class DOCXTableProcessor(TableProcessor):
    """DOCX-specific table processor.

    Extends TableProcessor with DOCX-specific formatting:
    - Adds border='1' to HTML tables for backward compatibility
    - Collapses 1x1 container tables to plain text
    - Collapses single-column tables to blank-line-separated text

    Usage:
        processor = DOCXTableProcessor()
        html = processor.format_table(table_data)
    """

    def __init__(self, config: Optional[DOCXTableProcessorConfig] = None):
        """Initialize the DOCX table processor.

        Args:
            config: DOCX table processing configuration. A default
                DOCXTableProcessorConfig is created when omitted.
        """
        if config is None:
            config = DOCXTableProcessorConfig()
        super().__init__(config)
        # Kept separately so the DOCX-only switches are reachable regardless
        # of how the base class stores its configuration.
        self.docx_config = config

    def _collapse_docx_table(self, table: TableData) -> Optional[str]:
        """Resolve the cases that are rendered as plain text, not a table.

        Shared by format_table() and format_table_as_html() so both entry
        points apply exactly the same rules.

        Args:
            table: TableData to inspect.

        Returns:
            The final output string when the table is empty, is a collapsed
            1x1 table, or is a collapsed single-column table; None when the
            table must go through normal table formatting.
        """
        if not table or not table.rows:
            return ""

        options = self.docx_config

        # 1x1 table (container table): emit the lone cell's content.
        if (options.collapse_single_cell
                and table.num_rows == 1 and table.num_cols == 1):
            first_row = table.rows[0]
            if first_row:
                # Guard against a None content so the result is always a str.
                return first_row[0].content or ""
            return ""

        # Single-column table: non-empty cells separated by blank lines.
        if options.collapse_single_column and table.num_cols == 1:
            return "\n\n".join(
                row[0].content
                for row in table.rows
                if row and row[0].content
            )

        return None

    def format_table(self, table: TableData) -> str:
        """Format a table with DOCX-specific handling.

        Handles special cases before delegating to the base class:
        - Empty tables: return an empty string
        - 1x1 tables: return cell content only (container tables)
        - Single column tables: return blank-line-separated text

        Args:
            table: TableData to format

        Returns:
            Formatted table string
        """
        collapsed = self._collapse_docx_table(table)
        if collapsed is not None:
            return collapsed

        # Normal table processing
        return super().format_table(table)

    def format_table_as_html(self, table: TableData) -> str:
        """Format table as HTML with DOCX-specific attributes.

        Applies the same special cases as format_table(), then adds the
        border='1' attribute for backward compatibility when enabled.

        Args:
            table: TableData to format

        Returns:
            HTML table string, or plain text for collapsed tables
        """
        collapsed = self._collapse_docx_table(table)
        if collapsed is not None:
            return collapsed

        # Generate HTML using base class
        html = super().format_table_as_html(table)

        # Post-process: only attribute-less "<table>" tags are rewritten.
        if self.docx_config.add_border:
            html = html.replace("<table>", "<table border='1'>")

        return html
155
+
156
+
157
# Configuration used by the module-level default processor.
DEFAULT_DOCX_PROCESSOR_CONFIG = DOCXTableProcessorConfig(
    # Options passed through to the TableProcessorConfig base.
    output_format=TableOutputFormat.HTML,
    clean_whitespace=True,
    preserve_merged_cells=True,
    # DOCX-only switches declared on DOCXTableProcessorConfig.
    add_border=True,
    collapse_single_cell=True,
    collapse_single_column=True,
)


# Shared processor instance; stays None until get_default_processor()
# creates it on first use.
_default_processor: Optional[DOCXTableProcessor] = None
170
+
171
+
172
def get_default_processor() -> DOCXTableProcessor:
    """Return the shared DOCX table processor, building it on first call.

    The instance is created from DEFAULT_DOCX_PROCESSOR_CONFIG and stored in
    the module-level ``_default_processor`` so later calls reuse it.

    Returns:
        The module-wide DOCXTableProcessor instance
    """
    global _default_processor
    if _default_processor is not None:
        return _default_processor

    _default_processor = DOCXTableProcessor(DEFAULT_DOCX_PROCESSOR_CONFIG)
    return _default_processor
182
+
183
+
184
def create_docx_table_processor(
    config: Optional[DOCXTableProcessorConfig] = None
) -> DOCXTableProcessor:
    """Build a new, independent DOCX table processor.

    Args:
        config: DOCX table processing configuration; passed straight to
            DOCXTableProcessor, which supplies its own default for None

    Returns:
        A freshly constructed DOCXTableProcessor
    """
    processor = DOCXTableProcessor(config)
    return processor
196
+
197
+
198
def format_table_as_html(table: TableData) -> str:
    """Render a table as HTML with the shared default DOCX processor.

    Thin wrapper around ``get_default_processor().format_table_as_html``.

    Args:
        table: TableData to format

    Returns:
        HTML table string
    """
    return get_default_processor().format_table_as_html(table)
211
+
212
+
213
+ __all__ = [
214
+ 'DOCXTableProcessor',
215
+ 'DOCXTableProcessorConfig',
216
+ 'DEFAULT_DOCX_PROCESSOR_CONFIG',
217
+ 'create_docx_table_processor',
218
+ 'get_default_processor',
219
+ 'format_table_as_html',
220
+ ]
@@ -0,0 +1,353 @@
1
+ # xgen_doc2chunk/core/processor/excel_handler.py
2
+ """
3
+ Excel Handler - Excel Document Processor (XLSX/XLS)
4
+
5
+ Main Features:
6
+ - Metadata extraction (title, author, subject, keywords, creation date, modification date, etc.)
7
+ - Text extraction (direct parsing via openpyxl/xlrd)
8
+ - Table extraction (Markdown or HTML conversion based on merged cells)
9
+ - Inline image extraction and local storage
10
+ - Chart processing (convert to table)
11
+ - Multi-sheet support
12
+
13
+ Class-based Handler:
14
+ - ExcelHandler class inherits from BaseHandler to manage config/image_processor
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import logging
19
+ import os
20
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set
21
+
22
+ from xgen_doc2chunk.core.processor.base_handler import BaseHandler
23
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
24
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor
25
+ from xgen_doc2chunk.core.processor.excel_helper.excel_chart_extractor import ExcelChartExtractor
26
+
27
+ if TYPE_CHECKING:
28
+ from openpyxl.workbook import Workbook
29
+ from openpyxl.worksheet.worksheet import Worksheet
30
+ from xgen_doc2chunk.core.document_processor import CurrentFile
31
+ from xgen_doc2chunk.core.processor.excel_helper import (
32
+ # Textbox
33
+ extract_textboxes_from_xlsx,
34
+ # Table
35
+ convert_xlsx_sheet_to_table,
36
+ convert_xls_sheet_to_table,
37
+ # Object Detection
38
+ convert_xlsx_objects_to_tables,
39
+ convert_xls_objects_to_tables,
40
+ )
41
+ from xgen_doc2chunk.core.processor.excel_helper.excel_metadata import (
42
+ XLSXMetadataExtractor,
43
+ XLSMetadataExtractor,
44
+ )
45
+ from xgen_doc2chunk.core.processor.excel_helper.excel_image_processor import (
46
+ ExcelImageProcessor,
47
+ )
48
+
49
+ logger = logging.getLogger("document-processor")
50
+
51
+
52
+ # ============================================================================
53
+ # ExcelHandler Class
54
+ # ============================================================================
55
+
56
class ExcelHandler(BaseHandler):
    """
    Excel Document Handler (XLSX/XLS)

    Inherits from BaseHandler to manage config and image_processor at instance level.
    Dispatches on the file extension: XLSX files go through _extract_xlsx
    (tables, charts, images, textboxes), XLS files through _extract_xls
    (tables only).

    Usage:
        handler = ExcelHandler(config=config, image_processor=image_processor)
        text = handler.extract_text(current_file)
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to BaseHandler and reset extractor caches."""
        super().__init__(*args, **kwargs)
        self._xlsx_metadata_extractor = None
        # Created on demand by _get_xls_metadata_extractor().
        self._xls_metadata_extractor = None

    def _create_file_converter(self):
        """Create Excel-specific file converter."""
        from xgen_doc2chunk.core.processor.excel_helper.excel_file_converter import ExcelFileConverter
        return ExcelFileConverter()

    def _create_preprocessor(self):
        """Create Excel-specific preprocessor."""
        from xgen_doc2chunk.core.processor.excel_helper.excel_preprocessor import ExcelPreprocessor
        return ExcelPreprocessor()

    def _create_chart_extractor(self) -> BaseChartExtractor:
        """Create Excel-specific chart extractor."""
        return ExcelChartExtractor(self._chart_processor)

    def _create_metadata_extractor(self):
        """Create XLSX-specific metadata extractor (default)."""
        return XLSXMetadataExtractor()

    def _create_format_image_processor(self):
        """Create Excel-specific image processor.

        Reuses the directory, tag prefix/suffix and storage backend of the
        handler's generic image processor.
        """
        base_processor = self._image_processor
        return ExcelImageProcessor(
            directory_path=base_processor.config.directory_path,
            tag_prefix=base_processor.config.tag_prefix,
            tag_suffix=base_processor.config.tag_suffix,
            storage_backend=base_processor.storage_backend,
        )

    def _get_xls_metadata_extractor(self):
        """Get XLS-specific metadata extractor, creating it on first use."""
        if self._xls_metadata_extractor is None:
            self._xls_metadata_extractor = XLSMetadataExtractor()
        return self._xls_metadata_extractor

    @staticmethod
    def _format_table_blocks(table_contents) -> List[str]:
        """Render detected tables as output fragments.

        A single table is emitted bare; several tables each get a
        "[Table N]" heading (numbered from 1).

        Args:
            table_contents: Sequence of already-formatted table strings;
                may be empty or None.

        Returns:
            List of text fragments, empty when there are no tables
        """
        if not table_contents:
            return []
        numbered = len(table_contents) > 1
        return [
            f"\n[Table {i}]\n{table_content}\n" if numbered
            else f"\n{table_content}\n"
            for i, table_content in enumerate(table_contents, 1)
        ]

    def extract_text(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True,
        **kwargs
    ) -> str:
        """
        Extract text from Excel file.

        Args:
            current_file: CurrentFile dict containing file info and binary data
            extract_metadata: Whether to extract metadata
            **kwargs: Additional options (unused here)

        Returns:
            Extracted text

        Raises:
            ValueError: If the extension is neither 'xlsx' nor 'xls'
        """
        file_path = current_file.get("file_path", "unknown")
        # Fall back to the path's extension when "file_extension" is missing,
        # None or empty; calling .lower() on None would raise otherwise.
        ext = current_file.get("file_extension") or os.path.splitext(file_path)[1]
        # Normalize: lower-case, no leading dot.
        ext = ext.lower().lstrip('.')
        self.logger.info(f"Excel processing: {file_path}, ext: {ext}")

        if ext == 'xlsx':
            return self._extract_xlsx(current_file, extract_metadata)
        if ext == 'xls':
            return self._extract_xls(current_file, extract_metadata)
        raise ValueError(f"Unsupported Excel format: {ext}")

    def _extract_xlsx(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True
    ) -> str:
        """XLSX file processing.

        Converts the binary data to a workbook, preprocesses it, then
        concatenates metadata, per-sheet output and any charts that were not
        consumed while walking the sheets. Errors are logged and re-raised.
        """
        file_path = current_file.get("file_path", "unknown")
        self.logger.info(f"XLSX processing: {file_path}")

        try:
            # Step 1: Convert to Workbook using file_converter
            file_data = current_file.get("file_data", b"")
            wb = self.file_converter.convert(file_data, extension='xlsx')

            # Step 2: Preprocess - may transform wb in the future
            preprocessed = self.preprocess(wb)
            wb = preprocessed.clean_content  # TRUE SOURCE

            preload = self._preload_xlsx_data(current_file, wb, extract_metadata)

            result_parts = [preload["metadata_str"]] if preload["metadata_str"] else []
            processed_images: Set[str] = set()
            stats = {"charts": 0, "images": 0, "textboxes": 0}

            for sheet_name in wb.sheetnames:
                result_parts.append(
                    self._process_xlsx_sheet(
                        wb[sheet_name], sheet_name, preload, processed_images, stats
                    )
                )

            # preload["chart_idx"] was advanced by the per-sheet processing.
            remaining = self._process_remaining_charts(
                preload["chart_data_list"], preload["chart_idx"], processed_images, stats
            )
            if remaining:
                result_parts.append(remaining)

            result = "".join(result_parts)
            self.logger.info(
                f"XLSX processing completed: {len(wb.sheetnames)} sheets, "
                f"{stats['charts']} charts, {stats['images']} images"
            )
            return result

        except Exception as e:
            self.logger.error(f"Error in XLSX processing: {e}")
            import traceback
            self.logger.debug(traceback.format_exc())
            raise

    def _extract_xls(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True
    ) -> str:
        """XLS file processing.

        Converts the binary data to a workbook, preprocesses it, then emits
        optional metadata followed by a sheet tag and the detected tables of
        every sheet. Errors are logged and re-raised.
        """
        file_path = current_file.get("file_path", "unknown")
        self.logger.info(f"XLS processing: {file_path}")

        try:
            # Step 1: Convert to Workbook using file_converter
            file_data = current_file.get("file_data", b"")
            wb = self.file_converter.convert(file_data, extension='xls')

            # Step 2: Preprocess - may transform wb in the future
            preprocessed = self.preprocess(wb)
            wb = preprocessed.clean_content  # TRUE SOURCE

            result_parts = []

            if extract_metadata:
                xls_extractor = self._get_xls_metadata_extractor()
                metadata_str = xls_extractor.extract_and_format(wb)
                if metadata_str:
                    result_parts.append(metadata_str + "\n\n")

            for sheet_idx in range(wb.nsheets):
                ws = wb.sheet_by_index(sheet_idx)
                sheet_tag = self.create_sheet_tag(ws.name)
                result_parts.append(f"\n{sheet_tag}\n")
                result_parts.extend(
                    self._format_table_blocks(convert_xls_objects_to_tables(ws, wb))
                )

            result = "".join(result_parts)
            self.logger.info(f"XLS processing completed: {wb.nsheets} sheets")
            return result

        except Exception as e:
            self.logger.error(f"Error in XLS processing: {e}")
            import traceback
            self.logger.debug(traceback.format_exc())
            raise

    def _preload_xlsx_data(
        self, current_file: "CurrentFile", wb, extract_metadata: bool
    ) -> Dict[str, Any]:
        """Collect workbook-wide data needed while processing XLSX sheets.

        Args:
            current_file: CurrentFile dict for the workbook
            wb: Preprocessed workbook object
            extract_metadata: Whether to extract and format metadata

        Returns:
            Dict with keys "metadata_str", "chart_data_list", "images_data",
            "textboxes_by_sheet" and "chart_idx" (cursor into
            "chart_data_list", starting at 0).
        """
        file_path = current_file.get("file_path", "unknown")
        file_stream = self.get_file_stream(current_file)

        result = {
            "metadata_str": "",
            "chart_data_list": [],  # ChartData instances from extractor
            "images_data": {},
            "textboxes_by_sheet": {},
            "chart_idx": 0,
        }

        if extract_metadata:
            result["metadata_str"] = self.extract_and_format_metadata(wb)
            if result["metadata_str"]:
                result["metadata_str"] += "\n\n"

        # Use ChartExtractor for chart extraction
        result["chart_data_list"] = self.chart_extractor.extract_all_from_file(file_stream)

        # Use format_image_processor directly for image extraction; processors
        # without extract_images_from_xlsx leave "images_data" empty.
        image_processor = self.format_image_processor
        if hasattr(image_processor, 'extract_images_from_xlsx'):
            result["images_data"] = image_processor.extract_images_from_xlsx(file_path)

        result["textboxes_by_sheet"] = extract_textboxes_from_xlsx(file_path)

        return result

    def _process_xlsx_sheet(
        self, ws, sheet_name: str, preload: Dict[str, Any],
        processed_images: Set[str], stats: Dict[str, int]
    ) -> str:
        """Process a single XLSX sheet.

        Emits, in order: the sheet tag, detected tables, charts, images and
        textboxes. Advances preload["chart_idx"] once per chart on the sheet
        and increments the matching counters in ``stats``.

        Args:
            ws: Worksheet object
            sheet_name: Name of the sheet; used for the tag and textbox lookup
            preload: Dict produced by _preload_xlsx_data (mutated: "chart_idx")
            processed_images: Accepted for interface compatibility; not read here
            stats: Counters for "charts", "images" and "textboxes" (mutated)

        Returns:
            Text output for the sheet
        """
        sheet_tag = self.create_sheet_tag(sheet_name)
        parts = [f"\n{sheet_tag}\n"]

        parts.extend(self._format_table_blocks(convert_xlsx_objects_to_tables(ws)))

        # Chart processing using ChartExtractor: the sheet's charts are paired
        # with the preloaded ChartData entries purely by position.
        chart_data_list = preload["chart_data_list"]
        for _ in getattr(ws, '_charts', None) or ():
            if preload["chart_idx"] < len(chart_data_list):
                chart_data = chart_data_list[preload["chart_idx"]]
                # chart_data is already ChartData instance, format it
                chart_output = self._format_chart_data(chart_data)
                if chart_output:
                    parts.append(f"\n{chart_output}\n")
                    stats["charts"] += 1
            preload["chart_idx"] += 1

        # Image processing - use format_image_processor directly
        image_processor = self.format_image_processor
        if hasattr(image_processor, 'get_sheet_images'):
            sheet_images = image_processor.get_sheet_images(ws, preload["images_data"], "")
        else:
            sheet_images = []
        for image_data, _anchor in sheet_images:
            if not image_data:
                continue
            image_tag = image_processor.save_image(image_data)
            if image_tag:
                parts.append(f"\n{image_tag}\n")
                stats["images"] += 1

        # Textbox processing
        for tb in preload["textboxes_by_sheet"].get(sheet_name, []):
            if tb:
                parts.append(f"\n[Textbox] {tb}\n")
                stats["textboxes"] += 1

        return "".join(parts)

    def _format_chart_data(self, chart_data) -> str:
        """Format ChartData using ChartProcessor.

        Returns an empty string for anything that is not a ChartData
        instance; uses the fallback format when the chart has no data.
        """
        from xgen_doc2chunk.core.functions.chart_extractor import ChartData

        if not isinstance(chart_data, ChartData):
            return ""

        if not chart_data.has_data():
            return self.chart_processor.format_chart_fallback(
                chart_type=chart_data.chart_type,
                title=chart_data.title
            )

        return self.chart_processor.format_chart_data(
            chart_type=chart_data.chart_type,
            title=chart_data.title,
            categories=chart_data.categories,
            series=chart_data.series
        )

    def _process_remaining_charts(
        self, chart_data_list: List, chart_idx: int,
        processed_images: Set[str], stats: Dict[str, int]
    ) -> str:
        """Process remaining charts not associated with sheets.

        Args:
            chart_data_list: All ChartData entries extracted from the file
            chart_idx: Index of the first entry not yet emitted
            processed_images: Accepted for interface compatibility; not read here
            stats: Counters dict; "charts" is incremented per emitted chart

        Returns:
            Concatenated chart output, or an empty string when nothing remains
        """
        parts = []
        while chart_idx < len(chart_data_list):
            chart_output = self._format_chart_data(chart_data_list[chart_idx])
            if chart_output:
                parts.append(f"\n{chart_output}\n")
                stats["charts"] += 1
            chart_idx += 1
        return "".join(parts)
351
+
352
+
353
+ __all__ = ["ExcelHandler"]
@@ -0,0 +1,97 @@
1
+ """
2
+ Excel Helper Module
3
+
4
+ Handles extraction of elements (textboxes, charts, images, tables, etc.) from XLSX/XLS files.
5
+
6
+ Module Structure:
7
+ - excel_chart_extractor: Chart type mapping constant (CHART_TYPE_MAP)
8
+ - excel_chart_extractor: Chart extraction (ExcelChartExtractor)
9
+ - excel_table_xlsx: XLSX table conversion
10
+ - excel_table_xls: XLS table conversion
11
+ - excel_textbox: Textbox extraction
12
+ - excel_metadata: Metadata extraction
13
+ - excel_image_processor: Image extraction (ExcelImageProcessor)
14
+ - excel_layout_detector: Layout detection
15
+ """
16
+
17
+ # === Textbox ===
18
+ from xgen_doc2chunk.core.processor.excel_helper.excel_textbox import extract_textboxes_from_xlsx
19
+
20
+ # === Metadata ===
21
+ from xgen_doc2chunk.core.processor.excel_helper.excel_metadata import (
22
+ ExcelMetadataExtractor,
23
+ XLSXMetadataExtractor,
24
+ XLSMetadataExtractor,
25
+ )
26
+
27
+ # === Chart Extractor ===
28
+ from xgen_doc2chunk.core.processor.excel_helper.excel_chart_extractor import (
29
+ ExcelChartExtractor,
30
+ CHART_TYPE_MAP,
31
+ )
32
+
33
+ # === Image Processor (replaces excel_image.py utility functions) ===
34
+ from xgen_doc2chunk.core.processor.excel_helper.excel_image_processor import (
35
+ ExcelImageProcessor,
36
+ )
37
+
38
+ # === Table XLSX ===
39
+ from xgen_doc2chunk.core.processor.excel_helper.excel_table_xlsx import (
40
+ has_merged_cells_xlsx,
41
+ convert_xlsx_sheet_to_table,
42
+ convert_xlsx_sheet_to_markdown,
43
+ convert_xlsx_sheet_to_html,
44
+ convert_xlsx_objects_to_tables,
45
+ )
46
+
47
+ # === Table XLS ===
48
+ from xgen_doc2chunk.core.processor.excel_helper.excel_table_xls import (
49
+ has_merged_cells_xls,
50
+ convert_xls_sheet_to_table,
51
+ convert_xls_sheet_to_markdown,
52
+ convert_xls_sheet_to_html,
53
+ convert_xls_objects_to_tables,
54
+ )
55
+
56
+ # === Layout Detector ===
57
+ from xgen_doc2chunk.core.processor.excel_helper.excel_layout_detector import (
58
+ layout_detect_range_xlsx,
59
+ layout_detect_range_xls,
60
+ object_detect_xlsx,
61
+ object_detect_xls,
62
+ LayoutRange,
63
+ )
64
+
65
+
66
+ __all__ = [
67
+ # Textbox
68
+ 'extract_textboxes_from_xlsx',
69
+ # Metadata
70
+ 'ExcelMetadataExtractor',
71
+ 'XLSXMetadataExtractor',
72
+ 'XLSMetadataExtractor',
73
+ # Chart Constants
74
+ 'CHART_TYPE_MAP',
75
+ # Chart Extractor
76
+ 'ExcelChartExtractor',
77
+ # Image Processor
78
+ 'ExcelImageProcessor',
79
+ # Table XLSX
80
+ 'has_merged_cells_xlsx',
81
+ 'convert_xlsx_sheet_to_table',
82
+ 'convert_xlsx_sheet_to_markdown',
83
+ 'convert_xlsx_sheet_to_html',
84
+ 'convert_xlsx_objects_to_tables',
85
+ # Table XLS
86
+ 'has_merged_cells_xls',
87
+ 'convert_xls_sheet_to_table',
88
+ 'convert_xls_sheet_to_markdown',
89
+ 'convert_xls_sheet_to_html',
90
+ 'convert_xls_objects_to_tables',
91
+ # Layout Detector
92
+ 'layout_detect_range_xlsx',
93
+ 'layout_detect_range_xls',
94
+ 'object_detect_xlsx',
95
+ 'object_detect_xls',
96
+ 'LayoutRange',
97
+ ]