xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/functions/table_processor.py
|
|
2
|
+
"""
|
|
3
|
+
Table Processor - Common Table Processing Module
|
|
4
|
+
|
|
5
|
+
Provides common table processing utilities for formatting tables.
|
|
6
|
+
This module handles HTML, Markdown, and Text conversion of TableData.
|
|
7
|
+
|
|
8
|
+
================================================================================
|
|
9
|
+
TABLE PROCESSOR ARCHITECTURE
|
|
10
|
+
================================================================================
|
|
11
|
+
|
|
12
|
+
Main Entry Point:
|
|
13
|
+
format_table(table: TableData) -> str
|
|
14
|
+
|
|
15
|
+
Internal Processing Functions (called from format_table):
|
|
16
|
+
- format_table_as_html() : HTML conversion (rowspan/colspan support)
|
|
17
|
+
- format_table_as_markdown() : Markdown conversion (simple table)
|
|
18
|
+
- format_table_as_text() : Text conversion (plain text)
|
|
19
|
+
|
|
20
|
+
Common Utility:
|
|
21
|
+
- _clean_cell_content() : Cell content cleaning (whitespace handling)
|
|
22
|
+
|
|
23
|
+
================================================================================
|
|
24
|
+
PROCESSING FLOW
|
|
25
|
+
================================================================================
|
|
26
|
+
|
|
27
|
+
format_table(table) -> Main Entry Point
|
|
28
|
+
|
|
|
29
|
+
+-- config.output_format == HTML?
|
|
30
|
+
| YES -> format_table_as_html(table)
|
|
31
|
+
| +-- colgroup generation (column width)
|
|
32
|
+
| +-- row/cell iteration
|
|
33
|
+
| | +-- rowspan/colspan handling
|
|
34
|
+
| | +-- nested_table recursive handling
|
|
35
|
+
| | +-- _clean_cell_content()
|
|
36
|
+
| +-- HTML string return
|
|
37
|
+
|
|
|
38
|
+
+-- config.output_format == MARKDOWN?
|
|
39
|
+
| YES -> format_table_as_markdown(table)
|
|
40
|
+
| +-- row/cell iteration
|
|
41
|
+
| | +-- _clean_cell_content()
|
|
42
|
+
| +-- header separator addition
|
|
43
|
+
| +-- Markdown string return
|
|
44
|
+
|
|
|
45
|
+
+-- config.output_format == TEXT?
|
|
46
|
+
YES -> format_table_as_text(table)
|
|
47
|
+
+-- row/cell iteration
|
|
48
|
+
| +-- _clean_cell_content()
|
|
49
|
+
+-- Text string return
|
|
50
|
+
|
|
51
|
+
================================================================================
|
|
52
|
+
OUTPUT FORMAT COMPARISON
|
|
53
|
+
================================================================================
|
|
54
|
+
|
|
55
|
+
| Format | Use Case | Merge Support | Structure |
|
|
56
|
+
|----------|-----------------------------|--------------:|:-----------|
|
|
57
|
+
| HTML | Web rendering, full convert | Full support | Complex |
|
|
58
|
+
| Markdown | GitHub, docs, simple render | Not supported | Simplified |
|
|
59
|
+
| Text | Search index, logs, debug | Not supported | Minimal |
|
|
60
|
+
|
|
61
|
+
================================================================================
|
|
62
|
+
"""
|
|
63
|
+
import logging
|
|
64
|
+
import re
|
|
65
|
+
from enum import Enum
|
|
66
|
+
from dataclasses import dataclass
|
|
67
|
+
from typing import Optional
|
|
68
|
+
|
|
69
|
+
from xgen_doc2chunk.core.functions.table_extractor import TableData, TableCell
|
|
70
|
+
|
|
71
|
+
logger = logging.getLogger("document-processor")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class TableOutputFormat(Enum):
    """Closed set of formats a table can be rendered to."""

    # Member values are the lowercase format names.
    HTML = "html"
    MARKDOWN = "markdown"
    TEXT = "text"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass
class TableProcessorConfig:
    """Settings consumed by TableProcessor when rendering a table."""

    # Format that TableProcessor.format_table() routes to (HTML by default).
    output_format: TableOutputFormat = TableOutputFormat.HTML
    # When True, runs of whitespace in a cell are collapsed to one space and
    # the cell text is stripped.
    clean_whitespace: bool = True
    # When True, HTML output carries rowspan/colspan attributes for cells
    # whose span is greater than 1.
    preserve_merged_cells: bool = True
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class TableProcessor:
    """
    Render a TableData object as HTML, Markdown, or plain text.

    ============================================================================
    CLASS STRUCTURE
    ============================================================================

    Public Methods:
        format_table()             -> Main entry point (routes by config.output_format)
        format_table_as_html()     -> HTML conversion (rowspan/colspan, nested tables)
        format_table_as_markdown() -> Markdown conversion (no merged-cell support)
        format_table_as_text()     -> Text conversion (tab-separated cells)

    Private Methods:
        _clean_cell_content()      -> Whitespace normalization shared by all formats
        _escape_markdown_cell()    -> Escapes "|" so cell text cannot split a column

    ============================================================================
    """

    def __init__(self, config: Optional[TableProcessorConfig] = None):
        """
        Args:
            config: Processing options; a default TableProcessorConfig is
                created when omitted.
        """
        self.config = config or TableProcessorConfig()
        self.logger = logging.getLogger("document-processor")

    # ==========================================================================
    # format_table() - Main Entry Point
    # ==========================================================================
    #
    #   format_table(table)
    #     |
    #     +-- HTML -------> format_table_as_html()
    #     |                   +-- colgroup generation (col width)
    #     |                   +-- row/cell iteration
    #     |                   |     +-- rowspan/colspan
    #     |                   |     +-- nested_table (recursive)
    #     |                   |     +-- _clean_cell_content()
    #     |                   +-- HTML return
    #     |
    #     +-- MARKDOWN ---> format_table_as_markdown()
    #     |                   +-- row/cell iteration
    #     |                   |     +-- _clean_cell_content()
    #     |                   +-- header separator
    #     |                   +-- Markdown return
    #     |
    #     +-- TEXT -------> format_table_as_text()
    #                         +-- row/cell iteration
    #                         |     +-- _clean_cell_content()
    #                         +-- Text return
    #
    # ==========================================================================

    def format_table(self, table: TableData) -> str:
        """
        Main entry point for table formatting.

        Routes to the format handler selected by config.output_format. Any
        value other than HTML or MARKDOWN is rendered as plain text.

        Args:
            table: TableData from table extractor

        Returns:
            Formatted string (HTML/Markdown/Text)
        """
        if self.config.output_format == TableOutputFormat.HTML:
            return self.format_table_as_html(table)
        elif self.config.output_format == TableOutputFormat.MARKDOWN:
            return self.format_table_as_markdown(table)
        else:
            return self.format_table_as_text(table)

    # ==========================================================================
    # format_table_as_html() - HTML conversion (called from format_table)
    # ==========================================================================

    def format_table_as_html(self, table: TableData) -> str:
        """
        Convert TableData to an HTML string.

        Features:
        - colgroup for column widths (when table.col_widths_percent is set)
        - rowspan/colspan for merged cells (when config.preserve_merged_cells)
        - nested_table support (recursive)

        Args:
            table: Table to render.

        Returns:
            "<table>...</table>" markup, or "" when the table has no rows.
        """
        if not table.rows:
            return ""

        html_parts = ["<table>"]

        if table.col_widths_percent:
            html_parts.append(" <colgroup>")
            for width_pct in table.col_widths_percent:
                html_parts.append(f' <col style="width: {width_pct:.1f}%">')
            html_parts.append(" </colgroup>")

        for row in table.rows:
            html_parts.append(" <tr>")

            for cell in row:
                tag = "th" if cell.is_header else "td"
                attrs = []
                if self.config.preserve_merged_cells:
                    if cell.row_span > 1:
                        attrs.append(f'rowspan="{cell.row_span}"')
                    if cell.col_span > 1:
                        attrs.append(f'colspan="{cell.col_span}"')

                attr_str = " " + " ".join(attrs) if attrs else ""

                if cell.nested_table:
                    nested_html = self.format_table_as_html(cell.nested_table)
                    html_parts.append(f" <{tag}{attr_str}>{nested_html}</{tag}>")
                else:
                    # NOTE(review): cell text is inserted without HTML
                    # escaping. Left as-is because callers may rely on markup
                    # embedded in the content - confirm before escaping.
                    content = self._clean_cell_content(cell.content)
                    html_parts.append(f" <{tag}{attr_str}>{content}</{tag}>")

            html_parts.append(" </tr>")

        html_parts.append("</table>")
        return "\n".join(html_parts)

    # ==========================================================================
    # format_table_as_markdown() - Markdown conversion (called from format_table)
    # ==========================================================================

    def format_table_as_markdown(self, table: TableData) -> str:
        """
        Convert TableData to a Markdown string.

        Note: Markdown does NOT support rowspan/colspan, so span information
        is dropped. A "---" separator row is emitted after the first row only
        when table.has_header is true.

        Args:
            table: Table to render.

        Returns:
            Pipe-delimited rows joined by newlines, or "" when the table has
            no rows.
        """
        if not table.rows:
            return ""

        lines = []
        for row_idx, row in enumerate(table.rows):
            # Escape "|" so a pipe inside a cell is not read as a column
            # boundary.
            cells = [
                self._escape_markdown_cell(self._clean_cell_content(cell.content))
                for cell in row
            ]
            lines.append("| " + " | ".join(cells) + " |")

            if row_idx == 0 and table.has_header:
                lines.append("| " + " | ".join(["---"] * len(row)) + " |")

        return "\n".join(lines)

    # ==========================================================================
    # format_table_as_text() - Text conversion (called from format_table)
    # ==========================================================================

    def format_table_as_text(self, table: TableData) -> str:
        """
        Convert TableData to a plain text string.

        Note: No table structure preserved. Cells are joined with tabs and
        rows with newlines.

        Args:
            table: Table to render.

        Returns:
            Tab/newline separated text, or "" when the table has no rows.
        """
        if not table.rows:
            return ""

        lines = []
        for row in table.rows:
            cells = [self._clean_cell_content(cell.content) for cell in row]
            lines.append("\t".join(cells))

        return "\n".join(lines)

    # ==========================================================================
    # _clean_cell_content() - Common utility (called from all format functions)
    # ==========================================================================

    def _clean_cell_content(self, content: str) -> str:
        """
        Clean cell content (whitespace normalization).

        Args:
            content: Raw cell text; falsy values (None, "") yield "".

        Returns:
            The text with whitespace runs collapsed to a single space and
            stripped when config.clean_whitespace is set, otherwise unchanged.
        """
        if not content:
            return ""

        if self.config.clean_whitespace:
            content = re.sub(r'\s+', ' ', content)
            content = content.strip()

        return content

    @staticmethod
    def _escape_markdown_cell(content: str) -> str:
        """Backslash-escape "|" characters that are not already escaped."""
        return re.sub(r'(?<!\\)\|', r'\\|', content)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def create_table_processor(config: Optional[TableProcessorConfig] = None) -> TableProcessor:
    """
    Build a TableProcessor.

    Args:
        config: Table processing configuration; passed straight through to
            the TableProcessor constructor (None is allowed).

    Returns:
        A new TableProcessor instance built from ``config``.
    """
    processor = TableProcessor(config)
    return processor
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
# Default configuration
|
|
298
|
+
DEFAULT_PROCESSOR_CONFIG = TableProcessorConfig()
|
|
299
|
+
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/functions/utils.py
|
|
2
|
+
"""
|
|
3
|
+
Common utility module for document processing
|
|
4
|
+
"""
|
|
5
|
+
import io
|
|
6
|
+
import os
|
|
7
|
+
import hashlib
|
|
8
|
+
import tempfile
|
|
9
|
+
import logging
|
|
10
|
+
import re
|
|
11
|
+
import bisect
|
|
12
|
+
from typing import Any, Dict, List, Optional, Set
|
|
13
|
+
|
|
14
|
+
from PIL import Image
|
|
15
|
+
|
|
16
|
+
def sanitize_text_for_json(text: Optional[str]) -> str:
    """
    Sanitizes text to be safely encodable in a UTF-8 JSON response.

    Removes or repairs the following characters:
    - Surrogate code points (U+D800-U+DFFF): a high surrogate immediately
      followed by a low surrogate is merged into the supplementary character
      it encodes; isolated high/low surrogates are removed
    - Private Use Area characters (U+E000-U+F8FF, U+F0000 and above): removed,
      whether they arrive as a single code point or as a surrogate pair
    - Non-character code points (U+FFFE, U+FFFF): removed
    - Problematic control characters (below U+0020 except tab, newline,
      carriage return): removed

    Args:
        text: Input text that may contain invalid characters

    Returns:
        Sanitized text safe for JSON encoding ("" when text is None)
    """
    if not text:
        return text if text is not None else ""

    result = []
    i = 0
    text_len = len(text)

    while i < text_len:
        char = text[i]
        code = ord(char)

        # High surrogate followed by a low surrogate: decode the pair.
        if 0xD800 <= code <= 0xDBFF and i + 1 < text_len:
            next_code = ord(text[i + 1])
            if 0xDC00 <= next_code <= 0xDFFF:
                full_code = 0x10000 + ((code - 0xD800) << 10) + (next_code - 0xDC00)
                # Supplementary Private Use Area-A/B (U+F0000 ~ U+10FFFF) is
                # dropped. Anything else is emitted as ONE real code point,
                # because two separate surrogates cannot be UTF-8 encoded.
                if full_code < 0xF0000:
                    result.append(chr(full_code))
                i += 2
                continue

        i += 1

        # Isolated surrogate (high without low, or low without high).
        if 0xD800 <= code <= 0xDFFF:
            continue

        # Basic Private Use Area (U+E000 ~ U+F8FF).
        if 0xE000 <= code <= 0xF8FF:
            continue

        # Supplementary Private Use Areas stored as a single code point.
        if code >= 0xF0000:
            continue

        # Control characters; keep \t (9), \n (10), \r (13).
        if code < 32 and code not in (9, 10, 13):
            continue

        # Non-characters (U+FFFE, U+FFFF).
        if code in (0xFFFE, 0xFFFF):
            continue

        result.append(char)

    return ''.join(result)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def clean_text(text: Optional[str]) -> str:
    """
    Collapse runs of blank lines and trim the result.

    Three or more newlines (optionally separated by whitespace) are replaced
    by exactly two, then leading/trailing whitespace is stripped.

    Args:
        text: Text to clean; falsy values yield "".

    Returns:
        The cleaned text.
    """
    if text:
        collapsed = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
        return collapsed.strip()
    return ""
|
|
104
|
+
|
|
105
|
+
def clean_code_text(text: str) -> str:
    """
    Strip trailing whitespace and replace tab characters.

    Leading whitespace is kept.

    Args:
        text: Code text; falsy values yield "".

    Returns:
        The text with trailing whitespace removed and each tab replaced.
    """
    if not text:
        return ""
    trimmed = text.rstrip()
    return trimmed.replace('\t', ' ')
|
|
110
|
+
|
|
111
|
+
def is_text_quality_sufficient(text: Optional[str], min_chars: int = 500, min_word_ratio: float = 0.6) -> bool:
    """
    Decide whether text is long enough and made mostly of word characters.

    Args:
        text: Text to evaluate.
        min_chars: Minimum length the text must have.
        min_word_ratio: Minimum fraction of characters that must match the
            word-character pattern (``\\w`` plus explicit Hangul ranges).

    Returns:
        True when both thresholds are met; False otherwise, including when
        text is empty/None or any error occurs during evaluation.
    """
    try:
        if text and len(text) >= min_chars:
            matched = re.findall(r"[\w\u1100-\u11FF\u3130-\u318F\uAC00-\uD7AF]", text)
            word_ratio = len(matched) / max(1, len(text))
            return word_ratio >= min_word_ratio
        return False
    except Exception:
        # Best-effort check: any failure counts as insufficient quality.
        return False
|
|
120
|
+
|
|
121
|
+
def find_chunk_position(chunk: str, full_text: str, start_pos: int = 0) -> int:
    """
    Locate a chunk inside full_text, falling back to partial matches.

    Search order, each starting at start_pos:
      1. The whole chunk.
      2. The first line of the stripped chunk (only if that line has at least
         10 characters); when found, the first 50 characters of the chunk are
         searched from that point to refine the position.
      3. The first 50 characters of the stripped chunk (only if the stripped
         chunk has at least 10 characters).

    Args:
        chunk: Text to look for.
        full_text: Text to search in.
        start_pos: Index at which searching begins.

    Returns:
        The index of the match, or -1 when nothing matches or an error occurs.
    """
    try:
        exact = full_text.find(chunk, start_pos)
        if exact >= 0:
            return exact

        stripped = chunk.strip()
        head_line = stripped.split('\n')[0]
        if len(head_line) >= 10:
            anchor = full_text.find(head_line.strip(), start_pos)
            if anchor >= 0:
                # Slicing already handles chunks shorter than 50 characters.
                refined = full_text.find(chunk[:50], anchor)
                return anchor if refined == -1 else refined

        if len(stripped) >= 10:
            return full_text.find(stripped[:50], start_pos)

        return -1
    except Exception:
        # Best-effort lookup: report "not found" on any failure.
        return -1
|
|
141
|
+
|
|
142
|
+
def build_line_starts(text: str) -> List[int]:
    """
    Compute the character offset at which each line of text begins.

    Offset 0 is always included. A newline that is the final character does
    not open a new line.

    Args:
        text: Text to index.

    Returns:
        Ascending list of line-start offsets; [0] if an error occurs.
    """
    try:
        last_index = len(text) - 1
        following = [
            idx + 1
            for idx, ch in enumerate(text)
            if ch == '\n' and idx < last_index
        ]
        return [0] + following
    except Exception:
        return [0]
|
|
151
|
+
|
|
152
|
+
def pos_to_line(pos: int, line_starts: List[int]) -> int:
    """
    Map a character offset to a 1-based line number.

    Args:
        pos: Character offset; negative offsets map to line 1.
        line_starts: Ascending line-start offsets, searched with
            bisect_right.

    Returns:
        The 1-based line number (never below 1); 1 if an error occurs.
    """
    try:
        if pos < 0:
            return 1
        # bisect_right counts the line starts that are <= pos, which is
        # already the 1-based number of the line containing pos.
        line_no = bisect.bisect_right(line_starts, pos)
        return line_no if line_no > 1 else 1
    except Exception:
        return 1
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/__init__.py
|
|
2
|
+
"""
|
|
3
|
+
Processor - Document Type-specific Handler Module
|
|
4
|
+
|
|
5
|
+
Provides handlers for processing individual document formats.
|
|
6
|
+
|
|
7
|
+
Handler List:
|
|
8
|
+
- pdf_handler: PDF document processing (adaptive complexity-based)
|
|
9
|
+
- docx_handler: DOCX document processing
|
|
10
|
+
- doc_handler: DOC document processing (OLE, HTML, misnamed DOCX)
|
|
11
|
+
- rtf_handler: RTF document processing
|
|
12
|
+
- ppt_handler: PPT/PPTX document processing
|
|
13
|
+
- excel_handler: Excel (XLSX/XLS) document processing
|
|
14
|
+
- hwp_handler: HWP document processing
|
|
15
|
+
- hwpx_handler: HWPX document processing
|
|
16
|
+
- csv_handler: CSV file processing
|
|
17
|
+
- text_handler: Text file processing
|
|
18
|
+
- html_reprocessor: HTML reprocessing
|
|
19
|
+
|
|
20
|
+
Helper Modules (subdirectories):
|
|
21
|
+
- csv_helper/: CSV processing helper
|
|
22
|
+
- docx_helper/: DOCX processing helper
|
|
23
|
+
- doc_helpers/: DOC processing helper
|
|
24
|
+
- rtf_helper/: RTF processing helper
|
|
25
|
+
- excel_helper/: Excel processing helper
|
|
26
|
+
- hwp_helper/: HWP processing helper
|
|
27
|
+
- hwpx_helper/: HWPX processing helper
|
|
28
|
+
- pdf_helpers/: PDF processing helper
|
|
29
|
+
- ppt_helper/: PPT processing helper
|
|
30
|
+
|
|
31
|
+
Usage Example:
|
|
32
|
+
from xgen_doc2chunk.core.processor import PDFHandler
|
|
33
|
+
from xgen_doc2chunk.core.processor import DOCXHandler
|
|
34
|
+
from xgen_doc2chunk.core.processor import RTFHandler
|
|
35
|
+
from xgen_doc2chunk.core.processor.pdf_helpers import extract_pdf_metadata
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
# === PDF Handler ===
|
|
39
|
+
from xgen_doc2chunk.core.processor.pdf_handler import PDFHandler
|
|
40
|
+
|
|
41
|
+
# === Document Handlers ===
|
|
42
|
+
from xgen_doc2chunk.core.processor.docx_handler import DOCXHandler
|
|
43
|
+
from xgen_doc2chunk.core.processor.doc_handler import DOCHandler
|
|
44
|
+
from xgen_doc2chunk.core.processor.rtf_handler import RTFHandler
|
|
45
|
+
from xgen_doc2chunk.core.processor.ppt_handler import PPTHandler
|
|
46
|
+
|
|
47
|
+
# === Data Handlers ===
|
|
48
|
+
from xgen_doc2chunk.core.processor.excel_handler import ExcelHandler
|
|
49
|
+
from xgen_doc2chunk.core.processor.csv_handler import CSVHandler
|
|
50
|
+
from xgen_doc2chunk.core.processor.text_handler import TextHandler
|
|
51
|
+
|
|
52
|
+
# === HWP Handlers ===
|
|
53
|
+
from xgen_doc2chunk.core.processor.hwp_handler import HWPHandler
|
|
54
|
+
from xgen_doc2chunk.core.processor.hwpx_handler import HWPXHandler
|
|
55
|
+
|
|
56
|
+
# === Other Processors ===
|
|
57
|
+
# from xgen_doc2chunk.core.processor.html_reprocessor import ... # HTML reprocessing
|
|
58
|
+
|
|
59
|
+
# === Helper Modules (subpackages) ===
|
|
60
|
+
from xgen_doc2chunk.core.processor import csv_helper
|
|
61
|
+
from xgen_doc2chunk.core.processor import doc_helpers
|
|
62
|
+
from xgen_doc2chunk.core.processor import docx_helper
|
|
63
|
+
from xgen_doc2chunk.core.processor import excel_helper
|
|
64
|
+
from xgen_doc2chunk.core.processor import hwp_helper
|
|
65
|
+
from xgen_doc2chunk.core.processor import hwpx_helper
|
|
66
|
+
from xgen_doc2chunk.core.processor import pdf_helpers
|
|
67
|
+
from xgen_doc2chunk.core.processor import ppt_helper
|
|
68
|
+
from xgen_doc2chunk.core.processor import rtf_helper
|
|
69
|
+
|
|
70
|
+
__all__ = [
|
|
71
|
+
# PDF Handler
|
|
72
|
+
"PDFHandler",
|
|
73
|
+
# Document Handlers
|
|
74
|
+
"DOCXHandler",
|
|
75
|
+
"DOCHandler",
|
|
76
|
+
"RTFHandler",
|
|
77
|
+
"PPTHandler",
|
|
78
|
+
# Data Handlers
|
|
79
|
+
"ExcelHandler",
|
|
80
|
+
"CSVHandler",
|
|
81
|
+
"TextHandler",
|
|
82
|
+
# HWP Handlers
|
|
83
|
+
"HWPHandler",
|
|
84
|
+
"HWPXHandler",
|
|
85
|
+
# Helper subpackages
|
|
86
|
+
"csv_helper",
|
|
87
|
+
"doc_helpers",
|
|
88
|
+
"docx_helper",
|
|
89
|
+
"excel_helper",
|
|
90
|
+
"hwp_helper",
|
|
91
|
+
"hwpx_helper",
|
|
92
|
+
"pdf_helpers",
|
|
93
|
+
"ppt_helper",
|
|
94
|
+
"rtf_helper",
|
|
95
|
+
]
|
|
96
|
+
|