xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py
|
|
2
|
+
"""
|
|
3
|
+
HWPX Table Processor
|
|
4
|
+
|
|
5
|
+
Formats TableData into HTML/Markdown/Text output for HWPX documents.
|
|
6
|
+
Extends the base TableProcessor with HWPX-specific formatting options.
|
|
7
|
+
|
|
8
|
+
Key Features:
|
|
9
|
+
- HTML output with border attributes for backward compatibility
|
|
10
|
+
- Special handling for 1x1 container tables
|
|
11
|
+
- Special handling for single column tables
|
|
12
|
+
- Post-processing for HWPX-specific requirements
|
|
13
|
+
|
|
14
|
+
Usage:
|
|
15
|
+
from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_table_processor import (
|
|
16
|
+
HWPXTableProcessor,
|
|
17
|
+
create_hwpx_table_processor,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
processor = HWPXTableProcessor()
|
|
21
|
+
html = processor.format_table(table_data)
|
|
22
|
+
"""
|
|
23
|
+
import logging
|
|
24
|
+
from dataclasses import dataclass
|
|
25
|
+
from typing import Optional
|
|
26
|
+
|
|
27
|
+
from xgen_doc2chunk.core.functions.table_extractor import TableData
|
|
28
|
+
from xgen_doc2chunk.core.functions.table_processor import (
|
|
29
|
+
TableProcessor,
|
|
30
|
+
TableProcessorConfig,
|
|
31
|
+
TableOutputFormat,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
logger = logging.getLogger("document-processor")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class HWPXTableProcessorConfig(TableProcessorConfig):
    """Options controlling how HWPX tables are rendered.

    Adds three HWPX-only switches on top of the fields inherited from
    TableProcessorConfig. All three are enabled by default.

    Attributes:
        add_border: When true, HTML tables receive a border='1' attribute.
        collapse_single_cell: When true, a 1x1 table is emitted as the plain
            content of its only cell.
        collapse_single_column: When true, a single-column table is emitted
            as line-separated text instead of a table.
    """

    # Add border='1' to generated HTML tables.
    add_border: bool = True
    # Replace 1x1 (container) tables by their cell content.
    collapse_single_cell: bool = True
    # Replace single-column tables by line-separated text.
    collapse_single_column: bool = True
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class HWPXTableProcessor(TableProcessor):
    """HWPX-specific table processor.

    Extends TableProcessor with HWPX-specific formatting:
    - Adds border='1' to HTML tables for backward compatibility
    - Collapses 1x1 container tables to plain text
    - Collapses single-column tables to line-separated text

    Usage:
        processor = HWPXTableProcessor()
        html = processor.format_table(table_data)
    """

    def __init__(self, config: Optional[HWPXTableProcessorConfig] = None):
        """Initialize the HWPX table processor.

        Args:
            config: HWPX table processing configuration. A default
                HWPXTableProcessorConfig is created when omitted.
        """
        if config is None:
            config = HWPXTableProcessorConfig()
        super().__init__(config)
        # Kept separately so the HWPX-only flags are read from the HWPX config.
        self.hwpx_config = config

    def _collapse_degenerate_table(self, table: TableData) -> Optional[str]:
        """Return the plain-text form of a degenerate table, if it is one.

        Shared by format_table and format_table_as_html so both entry points
        apply exactly the same rules.

        Args:
            table: TableData to inspect (may be None or have no rows).

        Returns:
            - "" for a missing table or a table without rows
            - the single cell's content for a 1x1 table when
              collapse_single_cell is enabled ("" if that content is empty
              or None)
            - the non-empty first-cell contents joined by blank lines for a
              single-column table when collapse_single_column is enabled
            - None when the table needs regular table formatting
        """
        if not table or not table.rows:
            return ""

        options = self.hwpx_config

        # 1x1 table: treated as a container, only its content is returned.
        if (options.collapse_single_cell
                and table.num_rows == 1 and table.num_cols == 1):
            first_row = table.rows[0]
            if not first_row:
                return ""
            # Coerce a None/empty content to "" so the return type stays str.
            return first_row[0].content or ""

        # Single-column table: one paragraph per non-empty cell.
        if options.collapse_single_column and table.num_cols == 1:
            return "\n\n".join(
                row[0].content
                for row in table.rows
                if row and row[0].content
            )

        return None

    def format_table(self, table: TableData) -> str:
        """Format a table with HWPX-specific handling.

        Degenerate tables (empty, 1x1, single column) are collapsed to plain
        text first; everything else is delegated to the base class.

        Args:
            table: TableData to format

        Returns:
            Formatted table string
        """
        collapsed = self._collapse_degenerate_table(table)
        if collapsed is not None:
            return collapsed

        # Normal table processing
        return super().format_table(table)

    def format_table_as_html(self, table: TableData) -> str:
        """Format table as HTML with HWPX-specific attributes.

        Applies the same degenerate-table collapsing as format_table, then
        adds a border='1' attribute to the base-class HTML when add_border
        is enabled.

        Args:
            table: TableData to format

        Returns:
            HTML table string, or plain text for collapsed tables
        """
        collapsed = self._collapse_degenerate_table(table)
        if collapsed is not None:
            return collapsed

        # Generate HTML using base class
        html = super().format_table_as_html(table)

        # Post-process: only a bare "<table>" opening tag is rewritten.
        if self.hwpx_config.add_border:
            html = html.replace("<table>", "<table border='1'>")

        return html
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# Default configuration: HTML output with every HWPX-specific option enabled.
DEFAULT_HWPX_PROCESSOR_CONFIG = HWPXTableProcessorConfig(
    output_format=TableOutputFormat.HTML,
    clean_whitespace=True,
    preserve_merged_cells=True,
    add_border=True,
    collapse_single_cell=True,
    collapse_single_column=True,
)


# Shared processor instance; stays None until get_default_processor() builds it.
_default_processor: Optional[HWPXTableProcessor] = None
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def get_default_processor() -> HWPXTableProcessor:
    """Return the shared module-level HWPX table processor.

    The instance is built from DEFAULT_HWPX_PROCESSOR_CONFIG on the first
    call; the same object is returned on every later call.

    Returns:
        Configured HWPXTableProcessor instance
    """
    global _default_processor
    if _default_processor is not None:
        return _default_processor

    _default_processor = HWPXTableProcessor(DEFAULT_HWPX_PROCESSOR_CONFIG)
    return _default_processor
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def create_hwpx_table_processor(
    config: Optional[HWPXTableProcessorConfig] = None,
) -> HWPXTableProcessor:
    """Build a new HWPX table processor.

    Args:
        config: HWPX table processing configuration; passed to
            HWPXTableProcessor unchanged (None included).

    Returns:
        A new HWPXTableProcessor instance
    """
    processor = HWPXTableProcessor(config)
    return processor
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def format_table_as_html(table: TableData) -> str:
    """Format a table as HTML with the default HWPX table processor.

    Convenience wrapper around get_default_processor().

    Args:
        table: TableData to format

    Returns:
        HTML table string
    """
    return get_default_processor().format_table_as_html(table)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
__all__ = [
|
|
214
|
+
'HWPXTableProcessor',
|
|
215
|
+
'HWPXTableProcessorConfig',
|
|
216
|
+
'DEFAULT_HWPX_PROCESSOR_CONFIG',
|
|
217
|
+
'create_hwpx_table_processor',
|
|
218
|
+
'get_default_processor',
|
|
219
|
+
'format_table_as_html',
|
|
220
|
+
]
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/image_file_handler.py
|
|
2
|
+
"""
|
|
3
|
+
Image File Handler - Image File Processor
|
|
4
|
+
|
|
5
|
+
Class-based handler for image files (jpg, jpeg, png, gif, bmp, webp).
|
|
6
|
+
Converts images to text using OCR engine when available.
|
|
7
|
+
If no OCR engine is provided, returns a placeholder or empty string.
|
|
8
|
+
"""
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
from typing import Any, Optional, TYPE_CHECKING
|
|
12
|
+
|
|
13
|
+
from xgen_doc2chunk.core.processor.base_handler import BaseHandler
|
|
14
|
+
from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, NullChartExtractor
|
|
15
|
+
from xgen_doc2chunk.core.processor.image_file_helper.image_file_image_processor import ImageFileImageProcessor
|
|
16
|
+
from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from xgen_doc2chunk.core.document_processor import CurrentFile
|
|
20
|
+
from xgen_doc2chunk.ocr.base import BaseOCR
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger("document-processor")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Supported image extensions
|
|
26
|
+
SUPPORTED_IMAGE_EXTENSIONS = frozenset({'jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp'})
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ImageFileHandler(BaseHandler):
    """
    Image File Processing Handler Class.

    Processes standalone image files by converting them to text using OCR.
    When no OCR engine is configured, an image tag is returned instead of
    extracted text.

    Args:
        config: Configuration dictionary (passed from DocumentProcessor)
        image_processor: ImageProcessor instance (passed from DocumentProcessor)
        page_tag_processor: PageTagProcessor instance (passed from DocumentProcessor)
        chart_processor: ChartProcessor instance (passed from DocumentProcessor)
        ocr_engine: OCR engine instance (BaseOCR subclass) for image-to-text conversion

    Example:
        >>> from xgen_doc2chunk.ocr.ocr_engine import OpenAIOCR
        >>> ocr = OpenAIOCR(api_key="sk-...", model="gpt-4o")
        >>> handler = ImageFileHandler(ocr_engine=ocr)
        >>> text = handler.extract_text(current_file)
    """

    def _create_file_converter(self):
        """Create image-file-specific file converter."""
        from xgen_doc2chunk.core.processor.image_file_helper.image_file_converter import ImageFileConverter
        return ImageFileConverter()

    def _create_preprocessor(self):
        """Create image-file-specific preprocessor."""
        from xgen_doc2chunk.core.processor.image_file_helper.image_file_preprocessor import ImageFilePreprocessor
        return ImageFilePreprocessor()

    def _create_chart_extractor(self) -> BaseChartExtractor:
        """Image files do not contain charts. Return NullChartExtractor."""
        return NullChartExtractor(self._chart_processor)

    def _create_metadata_extractor(self):
        """Image files do not have document metadata. Return None (uses NullMetadataExtractor)."""
        return None

    def _create_format_image_processor(self) -> ImageProcessor:
        """Create image-file-specific image processor."""
        return ImageFileImageProcessor()

    def __init__(
        self,
        config: Optional[dict] = None,
        image_processor: Optional[Any] = None,
        page_tag_processor: Optional[Any] = None,
        chart_processor: Optional[Any] = None,
        ocr_engine: Optional["BaseOCR"] = None
    ):
        """
        Initialize ImageFileHandler.

        Args:
            config: Configuration dictionary (passed from DocumentProcessor)
            image_processor: ImageProcessor instance (passed from DocumentProcessor)
            page_tag_processor: PageTagProcessor instance (passed from DocumentProcessor)
            chart_processor: ChartProcessor instance (passed from DocumentProcessor)
            ocr_engine: OCR engine instance (BaseOCR subclass) for image-to-text conversion.
                       If None, extract_text returns an image tag instead of text.
        """
        super().__init__(
            config=config,
            image_processor=image_processor,
            page_tag_processor=page_tag_processor,
            chart_processor=chart_processor
        )
        self._ocr_engine = ocr_engine

    @property
    def ocr_engine(self) -> Optional["BaseOCR"]:
        """Current OCR engine instance."""
        return self._ocr_engine

    @ocr_engine.setter
    def ocr_engine(self, engine: Optional["BaseOCR"]) -> None:
        """Set OCR engine instance."""
        self._ocr_engine = engine

    def extract_text(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True,
        **kwargs
    ) -> str:
        """
        Extract text from image file using OCR.

        Never raises for OCR problems: every failure mode is reported as a
        bracketed placeholder string.

        Args:
            current_file: CurrentFile dict containing file info and binary data
            extract_metadata: Whether to extract metadata (not used for images)
            **kwargs: Additional options (not used)

        Returns:
            - The OCR result on success (including results that start with
              "[Image conversion error:", which are logged and passed through)
            - "[Unsupported image format: <ext>]" for unsupported extensions
            - An image tag built by _build_image_tag when no OCR engine is set
            - "[Image OCR failed: <name>]" when the engine returns None
            - "[Image processing error: <error>]" when the OCR step raises
        """
        file_path = current_file.get("file_path", "unknown")
        file_name = current_file.get("file_name", "unknown")
        # Tolerate an explicit None value for the extension.
        file_extension = (current_file.get("file_extension", "") or "").lower()
        file_data = current_file.get("file_data", b"")

        self.logger.info(f"Processing image file: {file_name}")

        # Step 1: No file_converter for image files (direct processing)
        # Step 2: Preprocess. The OCR call below is given file_path, so the
        # preprocessed bytes are not consumed in this method.
        self.preprocess(file_data)

        # Validate via is_supported() so that "png" and ".png" are treated
        # the same way by both public entry points.
        if not self.is_supported(file_extension):
            self.logger.warning(f"Unsupported image extension: {file_extension}")
            return f"[Unsupported image format: {file_extension}]"

        # If OCR engine is not available, return image tag format
        # This allows the image to be processed later when OCR is available
        if self._ocr_engine is None:
            self.logger.debug(f"OCR engine not available, returning image tag: {file_name}")
            # Use ImageProcessor's tag format (e.g., [Image:path] or custom format)
            return self._build_image_tag(file_path)

        # Use OCR engine to convert image to text
        try:
            # Use the file path directly for OCR conversion
            result = self._ocr_engine.convert_image_to_text(file_path)

            if result is None:
                self.logger.error(f"OCR returned None for image: {file_name}")
                return f"[Image OCR failed: {file_name}]"

            if result.startswith("[Image conversion error:"):
                self.logger.error(f"OCR error for image {file_name}: {result}")
                return result

            self.logger.info(f"Successfully extracted text from image: {file_name}")
            return result

        except Exception as e:
            # Deliberate best-effort boundary: report the failure as text.
            self.logger.error(f"Error processing image {file_name}: {e}")
            return f"[Image processing error: {str(e)}]"

    def is_supported(self, file_extension: str) -> bool:
        """
        Check if file extension is supported.

        Args:
            file_extension: File extension (with or without dot)

        Returns:
            True if extension is supported, False otherwise
        """
        ext = file_extension.lower().lstrip('.')
        return ext in SUPPORTED_IMAGE_EXTENSIONS

    def _build_image_tag(self, file_path: str) -> str:
        """
        Build image tag using ImageProcessor's tag format.

        Uses the tag_prefix and tag_suffix read from
        self.image_processor.config to wrap the path.

        Args:
            file_path: Path to the image file

        Returns:
            Image tag string (e.g., "[Image:path]" or custom format)
        """
        # Normalize path separators (Windows -> Unix style); str() also
        # accepts path-like objects.
        path_str = str(file_path).replace("\\", "/")

        # Use ImageProcessor's tag format
        prefix = self.image_processor.config.tag_prefix
        suffix = self.image_processor.config.tag_suffix

        return f"{prefix}{path_str}{suffix}"
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
__all__ = ["ImageFileHandler", "SUPPORTED_IMAGE_EXTENSIONS"]
|
|
212
|
+
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/image_file_helper/__init__.py
|
|
2
|
+
"""
|
|
3
|
+
Image File Helper 모듈
|
|
4
|
+
|
|
5
|
+
이미지 파일 처리에 필요한 유틸리티를 제공합니다.
|
|
6
|
+
|
|
7
|
+
모듈 구성:
|
|
8
|
+
- image_file_image_processor: 이미지 파일용 이미지 프로세서
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from xgen_doc2chunk.core.processor.image_file_helper.image_file_image_processor import (
|
|
12
|
+
ImageFileImageProcessor,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"ImageFileImageProcessor",
|
|
17
|
+
]
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py
|
|
2
|
+
"""
|
|
3
|
+
ImageFileConverter - Image file format converter
|
|
4
|
+
|
|
5
|
+
Pass-through converter for image files.
|
|
6
|
+
Images are kept as binary data.
|
|
7
|
+
"""
|
|
8
|
+
from typing import Any, Optional, BinaryIO
|
|
9
|
+
|
|
10
|
+
from xgen_doc2chunk.core.functions.file_converter import NullFileConverter
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ImageFileConverter(NullFileConverter):
    """
    Image file converter.

    Images don't need conversion - returns raw bytes.
    This is a pass-through converter that can additionally recognise the
    supported image formats from their leading signature bytes.
    """

    # Common image magic numbers
    MAGIC_JPEG = b'\xff\xd8\xff'
    MAGIC_PNG = b'\x89PNG\r\n\x1a\n'
    MAGIC_GIF = b'GIF8'
    MAGIC_BMP = b'BM'
    # RIFF container prefix; on its own it also matches WAV/AVI files.
    MAGIC_WEBP = b'RIFF'
    # FourCC at byte offset 8 that identifies a RIFF container as WebP.
    MAGIC_WEBP_FOURCC = b'WEBP'

    def get_format_name(self) -> str:
        """Return format name."""
        return "Image File"

    def _sniff_image_type(self, file_data: bytes) -> Optional[str]:
        """Match the leading bytes of file_data against the known signatures.

        No minimum length is enforced here; slices of short data simply fail
        to match.

        Args:
            file_data: Raw binary data

        Returns:
            Image type string (jpeg, png, gif, bmp, webp) or None
        """
        prefix_signatures = (
            (self.MAGIC_JPEG, "jpeg"),
            (self.MAGIC_PNG, "png"),
            (self.MAGIC_GIF, "gif"),
            (self.MAGIC_BMP, "bmp"),
        )
        for magic, image_type in prefix_signatures:
            if file_data[:len(magic)] == magic:
                return image_type

        # WebP is "RIFF" + 4-byte size + "WEBP"; checking only "RIFF" would
        # accept every other RIFF-based format as an image.
        if (file_data[:4] == self.MAGIC_WEBP
                and file_data[8:12] == self.MAGIC_WEBP_FOURCC):
            return "webp"

        return None

    def validate(self, file_data: bytes) -> bool:
        """Validate if data is an image.

        Args:
            file_data: Raw binary data

        Returns:
            True when the data is at least 4 bytes long and starts with a
            supported image signature, False otherwise
        """
        if not file_data or len(file_data) < 4:
            return False

        return self._sniff_image_type(file_data) is not None

    def detect_image_type(self, file_data: bytes) -> Optional[str]:
        """
        Detect image type from binary data.

        Args:
            file_data: Raw binary image data

        Returns:
            Image type string (jpeg, png, gif, bmp, webp), or None when the
            data is shorter than 8 bytes or matches no known signature
        """
        if not file_data or len(file_data) < 8:
            return None

        return self._sniff_image_type(file_data)
|
|
69
|
+
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py
|
|
2
|
+
"""
|
|
3
|
+
Image File Image Processor
|
|
4
|
+
|
|
5
|
+
Provides image-file-specific processing that inherits from ImageProcessor.
|
|
6
|
+
Handles standalone image files (jpg, png, gif, bmp, webp, etc.).
|
|
7
|
+
"""
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Any, Optional
|
|
10
|
+
|
|
11
|
+
from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
|
|
12
|
+
from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger("xgen_doc2chunk.image_processor.image_file")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ImageFileImageProcessor(ImageProcessor):
    """
    Image processor for standalone image files.

    Subclass of ImageProcessor used when the image file itself is the
    document (jpg, jpeg, png, gif, bmp, webp). The saved image's name is
    derived from the original filename or the source path.

    Example:
        processor = ImageFileImageProcessor()

        # Process standalone image
        tag = processor.process_image(image_data, source_path="/path/to/image.png")

        # Process with original filename
        tag = processor.process_standalone_image(image_data, original_name="photo.jpg")
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
        preserve_original_name: bool = False,
    ):
        """
        Set up the processor.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
            preserve_original_name: Whether to name saved images after the
                original filename when one is supplied
        """
        super().__init__(
            directory_path=directory_path,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix,
            storage_backend=storage_backend,
        )
        self._preserve_original_name = preserve_original_name

    @property
    def preserve_original_name(self) -> bool:
        """Whether to preserve original filename."""
        return self._preserve_original_name

    def process_image(
        self,
        image_data: bytes,
        source_path: Optional[str] = None,
        original_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Save image data under a name derived from its origin.

        The name (without extension) comes from original_name when
        preserve_original_name is set and a name was given, otherwise from
        the basename of source_path; with neither, no custom name is passed.

        Args:
            image_data: Raw image binary data
            source_path: Original file path
            original_name: Original filename
            **kwargs: Additional options (ignored)

        Returns:
            The result of save_image: image tag string or None if
            processing failed
        """
        import os

        if self._preserve_original_name and original_name:
            name_source = original_name
        elif source_path:
            name_source = os.path.basename(source_path)
        else:
            name_source = None

        # An empty basename still yields "" (not None), as before.
        stem = None if name_source is None else os.path.splitext(name_source)[0]

        return self.save_image(image_data, custom_name=stem)

    def process_standalone_image(
        self,
        image_data: bytes,
        original_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process an image file that is itself the document.

        Thin wrapper that forwards to process_image.

        Args:
            image_data: Raw image binary data
            original_name: Original filename
            **kwargs: Additional options forwarded to process_image

        Returns:
            Image tag string or None if processing failed
        """
        forwarded = dict(kwargs, original_name=original_name)
        return self.process_image(image_data, **forwarded)
|