xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,220 @@
1
+ # xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py
2
+ """
3
+ HWPX Table Processor
4
+
5
+ Formats TableData into HTML/Markdown/Text output for HWPX documents.
6
+ Extends the base TableProcessor with HWPX-specific formatting options.
7
+
8
+ Key Features:
9
+ - HTML output with border attributes for backward compatibility
10
+ - Special handling for 1x1 container tables
11
+ - Special handling for single column tables
12
+ - Post-processing for HWPX-specific requirements
13
+
14
+ Usage:
15
+ from xgen_doc2chunk.core.processor.hwpx_helper.hwpx_table_processor import (
16
+ HWPXTableProcessor,
17
+ create_hwpx_table_processor,
18
+ )
19
+
20
+ processor = HWPXTableProcessor()
21
+ html = processor.format_table(table_data)
22
+ """
23
+ import logging
24
+ from dataclasses import dataclass
25
+ from typing import Optional
26
+
27
+ from xgen_doc2chunk.core.functions.table_extractor import TableData
28
+ from xgen_doc2chunk.core.functions.table_processor import (
29
+ TableProcessor,
30
+ TableProcessorConfig,
31
+ TableOutputFormat,
32
+ )
33
+
34
+ logger = logging.getLogger("document-processor")
35
+
36
+
37
@dataclass
class HWPXTableProcessorConfig(TableProcessorConfig):
    """Rendering options for HWPX tables.

    Inherits every option of ``TableProcessorConfig`` and adds three
    HWPX-only switches, each enabled by default.
    """

    # Add the border='1' attribute to generated HTML <table> tags.
    add_border: bool = True
    # Render a 1x1 table as the bare content of its only cell.
    collapse_single_cell: bool = True
    # Render a one-column table as blank-line-separated text.
    collapse_single_column: bool = True
51
+
52
+
53
class HWPXTableProcessor(TableProcessor):
    """HWPX-specific table processor.

    Extends TableProcessor with HWPX-specific formatting:
    - Adds border='1' to HTML tables for backward compatibility
    - Collapses 1x1 container tables to plain text
    - Collapses single-column tables to line-separated text

    Usage:
        processor = HWPXTableProcessor()
        html = processor.format_table(table_data)
    """

    def __init__(self, config: Optional[HWPXTableProcessorConfig] = None):
        """Initialize the HWPX table processor.

        Args:
            config: HWPX table processing configuration. A default
                HWPXTableProcessorConfig is created when omitted.
        """
        if config is None:
            config = HWPXTableProcessorConfig()
        super().__init__(config)
        # Kept separately so the HWPX-only options are always reachable.
        self.hwpx_config = config

    def _collapse_to_text(self, table: TableData) -> Optional[str]:
        """Return the plain-text form of a collapsible table.

        Shared by format_table() and format_table_as_html() so both entry
        points apply exactly the same collapsing rules.

        Args:
            table: Non-empty TableData (callers check ``table.rows`` first)

        Returns:
            The collapsed text for 1x1 / single-column tables when the
            matching option is enabled, or None when the table must be
            rendered as a regular table.
        """
        options = self.hwpx_config

        # 1x1 table (container table): only the cell content is kept.
        if (options.collapse_single_cell
                and table.num_rows == 1 and table.num_cols == 1):
            first_row = table.rows[0]
            if not first_row:
                return ""
            # Coerce falsy content (e.g. None) to "" to honor the str contract.
            return first_row[0].content or ""

        # Single-column table: non-empty cells separated by blank lines.
        if options.collapse_single_column and table.num_cols == 1:
            return "\n\n".join(
                row[0].content
                for row in table.rows
                if row and row[0].content
            )

        return None

    def format_table(self, table: TableData) -> str:
        """Format a table with HWPX-specific handling.

        Handles special cases before delegating to base class:
        - 1x1 tables: Return cell content only (container tables)
        - Single column tables: Return as line-separated text

        Args:
            table: TableData to format

        Returns:
            Formatted table string ("" for a missing or empty table)
        """
        if not table or not table.rows:
            return ""

        collapsed = self._collapse_to_text(table)
        if collapsed is not None:
            return collapsed

        # Normal table processing
        return super().format_table(table)

    def format_table_as_html(self, table: TableData) -> str:
        """Format table as HTML with HWPX-specific attributes.

        Applies the same collapsing rules as format_table(), then adds the
        border='1' attribute for backward compatibility.

        Args:
            table: TableData to format

        Returns:
            HTML table string, or plain text for collapsed tables
        """
        if not table or not table.rows:
            return ""

        collapsed = self._collapse_to_text(table)
        if collapsed is not None:
            return collapsed

        # Generate HTML using base class
        html = super().format_table_as_html(table)

        # Post-process: only attribute-less <table> tags are rewritten.
        if self.hwpx_config.add_border:
            html = html.replace("<table>", "<table border='1'>")

        return html
155
+
156
+
157
# Default configuration: HTML output with every HWPX-specific option enabled.
DEFAULT_HWPX_PROCESSOR_CONFIG = HWPXTableProcessorConfig(
    add_border=True,
    collapse_single_cell=True,
    collapse_single_column=True,
    output_format=TableOutputFormat.HTML,
    clean_whitespace=True,
    preserve_merged_cells=True,
)
166
+
167
+
168
# Shared processor instance; stays None until get_default_processor() runs.
_default_processor: Optional[HWPXTableProcessor] = None


def get_default_processor() -> HWPXTableProcessor:
    """Return the shared HWPX table processor, building it on first use.

    Returns:
        HWPXTableProcessor configured with DEFAULT_HWPX_PROCESSOR_CONFIG
    """
    global _default_processor
    shared = _default_processor
    if shared is None:
        shared = HWPXTableProcessor(DEFAULT_HWPX_PROCESSOR_CONFIG)
        _default_processor = shared
    return shared
182
+
183
+
184
def create_hwpx_table_processor(
    config: Optional[HWPXTableProcessorConfig] = None
) -> HWPXTableProcessor:
    """Build a new HWPX table processor.

    Args:
        config: HWPX table processing configuration, passed to
            HWPXTableProcessor unchanged (None included)

    Returns:
        A new HWPXTableProcessor instance
    """
    new_processor = HWPXTableProcessor(config)
    return new_processor
196
+
197
+
198
def format_table_as_html(table: TableData) -> str:
    """Format a table as HTML with the shared default processor.

    Args:
        table: TableData to format

    Returns:
        Whatever the default processor's format_table_as_html() returns
    """
    return get_default_processor().format_table_as_html(table)
211
+
212
+
213
# Public API of this module.
__all__ = [
    "HWPXTableProcessor",
    "HWPXTableProcessorConfig",
    "DEFAULT_HWPX_PROCESSOR_CONFIG",
    "create_hwpx_table_processor",
    "get_default_processor",
    "format_table_as_html",
]
@@ -0,0 +1,212 @@
1
+ # xgen_doc2chunk/core/processor/image_file_handler.py
2
+ """
3
+ Image File Handler - Image File Processor
4
+
5
+ Class-based handler for image files (jpg, jpeg, png, gif, bmp, webp).
6
+ Converts images to text using OCR engine when available.
7
+ If no OCR engine is provided, returns a placeholder or empty string.
8
+ """
9
+ import logging
10
+ import os
11
+ from typing import Any, Optional, TYPE_CHECKING
12
+
13
+ from xgen_doc2chunk.core.processor.base_handler import BaseHandler
14
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, NullChartExtractor
15
+ from xgen_doc2chunk.core.processor.image_file_helper.image_file_image_processor import ImageFileImageProcessor
16
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
17
+
18
+ if TYPE_CHECKING:
19
+ from xgen_doc2chunk.core.document_processor import CurrentFile
20
+ from xgen_doc2chunk.ocr.base import BaseOCR
21
+
22
+ logger = logging.getLogger("document-processor")
23
+
24
+
25
# Image extensions this handler accepts (lowercase, no leading dot).
SUPPORTED_IMAGE_EXTENSIONS = frozenset(
    ("jpg", "jpeg", "png", "gif", "bmp", "webp")
)
27
+
28
+
29
class ImageFileHandler(BaseHandler):
    """
    Image File Processing Handler Class.

    Processes standalone image files by converting them to text using OCR.
    Without an OCR engine, extract_text() returns an image tag instead of
    extracted text.

    Args:
        config: Configuration dictionary (passed from DocumentProcessor)
        image_processor: ImageProcessor instance (passed from DocumentProcessor)
        page_tag_processor: PageTagProcessor instance (passed from DocumentProcessor)
        chart_processor: ChartProcessor instance (passed from DocumentProcessor)
        ocr_engine: OCR engine instance (BaseOCR subclass) for image-to-text conversion

    Example:
        >>> from xgen_doc2chunk.ocr.ocr_engine import OpenAIOCR
        >>> ocr = OpenAIOCR(api_key="sk-...", model="gpt-4o")
        >>> handler = ImageFileHandler(ocr_engine=ocr)
        >>> text = handler.extract_text(current_file)
    """

    def __init__(
        self,
        config: Optional[dict] = None,
        image_processor: Optional[Any] = None,
        page_tag_processor: Optional[Any] = None,
        chart_processor: Optional[Any] = None,
        ocr_engine: Optional["BaseOCR"] = None
    ):
        """
        Initialize ImageFileHandler.

        Args:
            config: Configuration dictionary (passed from DocumentProcessor)
            image_processor: ImageProcessor instance (passed from DocumentProcessor)
            page_tag_processor: PageTagProcessor instance (passed from DocumentProcessor)
            chart_processor: ChartProcessor instance (passed from DocumentProcessor)
            ocr_engine: OCR engine instance (BaseOCR subclass) for image-to-text conversion.
                If None, extract_text() returns an image tag instead of text.
        """
        super().__init__(
            config=config,
            image_processor=image_processor,
            page_tag_processor=page_tag_processor,
            chart_processor=chart_processor
        )
        self._ocr_engine = ocr_engine

    def _create_file_converter(self):
        """Create image-file-specific file converter."""
        from xgen_doc2chunk.core.processor.image_file_helper.image_file_converter import ImageFileConverter
        return ImageFileConverter()

    def _create_preprocessor(self):
        """Create image-file-specific preprocessor."""
        from xgen_doc2chunk.core.processor.image_file_helper.image_file_preprocessor import ImageFilePreprocessor
        return ImageFilePreprocessor()

    def _create_chart_extractor(self) -> BaseChartExtractor:
        """Image files do not contain charts. Return NullChartExtractor."""
        return NullChartExtractor(self._chart_processor)

    def _create_metadata_extractor(self):
        """Image files do not have document metadata. Return None (uses NullMetadataExtractor)."""
        return None

    def _create_format_image_processor(self) -> ImageProcessor:
        """Create image-file-specific image processor."""
        return ImageFileImageProcessor()

    @property
    def ocr_engine(self) -> Optional["BaseOCR"]:
        """Current OCR engine instance."""
        return self._ocr_engine

    @ocr_engine.setter
    def ocr_engine(self, engine: Optional["BaseOCR"]) -> None:
        """Set OCR engine instance."""
        self._ocr_engine = engine

    def extract_text(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True,
        **kwargs
    ) -> str:
        """
        Extract text from image file using OCR.

        Never raises for OCR problems: every failure is reported as a
        bracketed placeholder string.

        Args:
            current_file: CurrentFile dict containing file info and binary data
            extract_metadata: Whether to extract metadata (not used for images)
            **kwargs: Additional options (not used)

        Returns:
            One of:
            - the OCR result text;
            - "[Unsupported image format: <ext>]" for an unsupported extension;
            - an image tag built by _build_image_tag() when no OCR engine
              is configured;
            - "[Image OCR failed: <name>]" when the engine returns None;
            - the engine's own "[Image conversion error: ...]" string;
            - "[Image processing error: <error>]" when the engine raises.
        """
        file_path = current_file.get("file_path", "unknown")
        file_name = current_file.get("file_name", "unknown")
        # "or" also covers an explicit None value, which has no .lower().
        file_extension = (current_file.get("file_extension") or "").lower()
        file_data = current_file.get("file_data", b"")

        self.logger.info(f"Processing image file: {file_name}")

        # Step 1: No file_converter for image files (direct processing)
        # Step 2: Preprocess - clean_content is the TRUE SOURCE
        # NOTE: the OCR call below reads from file_path, so the cleaned
        # bytes are not consumed by this method.
        preprocessed = self.preprocess(file_data)
        file_data = preprocessed.clean_content  # TRUE SOURCE

        # Validate through is_supported() so ".png" and "png" are treated
        # the same way here as in the public support check.
        if not self.is_supported(file_extension):
            self.logger.warning(f"Unsupported image extension: {file_extension}")
            return f"[Unsupported image format: {file_extension}]"

        # If OCR engine is not available, return image tag format
        # This allows the image to be processed later when OCR is available
        if self._ocr_engine is None:
            self.logger.debug(f"OCR engine not available, returning image tag: {file_name}")
            # Use ImageProcessor's tag format (e.g., [Image:path] or custom format)
            return self._build_image_tag(file_path)

        # Use OCR engine to convert image to text
        try:
            # Use the file path directly for OCR conversion
            result = self._ocr_engine.convert_image_to_text(file_path)

            if result is None:
                self.logger.error(f"OCR returned None for image: {file_name}")
                return f"[Image OCR failed: {file_name}]"

            # The engine reports its own failures with this prefix; pass through.
            if result.startswith("[Image conversion error:"):
                self.logger.error(f"OCR error for image {file_name}: {result}")
                return result

            self.logger.info(f"Successfully extracted text from image: {file_name}")
            return result

        except Exception as e:
            # Best-effort boundary: one bad image must not abort the caller.
            self.logger.error(f"Error processing image {file_name}: {e}")
            return f"[Image processing error: {str(e)}]"

    def is_supported(self, file_extension: str) -> bool:
        """
        Check if file extension is supported.

        Args:
            file_extension: File extension (with or without dot)

        Returns:
            True if extension is supported, False otherwise
        """
        ext = file_extension.lower().lstrip('.')
        return ext in SUPPORTED_IMAGE_EXTENSIONS

    def _build_image_tag(self, file_path: str) -> str:
        """
        Build image tag using ImageProcessor's tag format.

        Uses the configured tag_prefix and tag_suffix from ImageProcessor
        to create a consistent image tag format.

        Args:
            file_path: Path to the image file

        Returns:
            Image tag string (e.g., "[Image:path]" or custom format)
        """
        # Normalize path separators (Windows -> Unix style)
        path_str = file_path.replace("\\", "/")

        # Use ImageProcessor's tag format
        prefix = self.image_processor.config.tag_prefix
        suffix = self.image_processor.config.tag_suffix

        return f"{prefix}{path_str}{suffix}"
209
+
210
+
211
# Public API of this module.
__all__ = [
    "ImageFileHandler",
    "SUPPORTED_IMAGE_EXTENSIONS",
]
212
+
@@ -0,0 +1,17 @@
1
# xgen_doc2chunk/core/processor/image_file_helper/__init__.py
"""
Image File Helper module

Provides the utilities needed for image file processing.

Module layout:
- image_file_image_processor: image processor for image files
"""

from xgen_doc2chunk.core.processor.image_file_helper.image_file_image_processor import (
    ImageFileImageProcessor,
)

__all__ = [
    "ImageFileImageProcessor",
]
@@ -0,0 +1,69 @@
1
+ # xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py
2
+ """
3
+ ImageFileConverter - Image file format converter
4
+
5
+ Pass-through converter for image files.
6
+ Images are kept as binary data.
7
+ """
8
+ from typing import Any, Optional, BinaryIO
9
+
10
+ from xgen_doc2chunk.core.functions.file_converter import NullFileConverter
11
+
12
+
13
class ImageFileConverter(NullFileConverter):
    """
    Image file converter.

    Images don't need conversion - returns raw bytes.
    This is a pass-through converter that can additionally recognize the
    image type from the leading bytes of the data.
    """

    # Common image magic numbers
    MAGIC_JPEG = b'\xff\xd8\xff'
    MAGIC_PNG = b'\x89PNG\r\n\x1a\n'
    MAGIC_GIF = b'GIF8'
    MAGIC_BMP = b'BM'
    # RIFF is a generic container tag (also used by WAV and AVI), so WebP
    # detection additionally requires the FourCC below at bytes 8-11.
    MAGIC_WEBP = b'RIFF'
    MAGIC_WEBP_FOURCC = b'WEBP'

    def get_format_name(self) -> str:
        """Return format name."""
        return "Image File"

    def _match_signature(self, file_data: bytes) -> Optional[str]:
        """Return the image type whose signature starts file_data, or None.

        Single source of truth for validate() and detect_image_type().
        Slicing is used so data shorter than a signature simply fails to
        match instead of raising.
        """
        if file_data[:3] == self.MAGIC_JPEG:
            return "jpeg"
        if file_data[:8] == self.MAGIC_PNG:
            return "png"
        if file_data[:4] == self.MAGIC_GIF:
            return "gif"
        if file_data[:2] == self.MAGIC_BMP:
            return "bmp"
        # WebP layout: "RIFF" + 4-byte size + "WEBP".
        if (file_data[:4] == self.MAGIC_WEBP
                and file_data[8:12] == self.MAGIC_WEBP_FOURCC):
            return "webp"
        return None

    def validate(self, file_data: bytes) -> bool:
        """Validate if data is an image.

        Args:
            file_data: Raw binary data

        Returns:
            True when the data is at least 4 bytes long and starts with a
            known JPEG, PNG, GIF, BMP or WebP signature.
        """
        if not file_data or len(file_data) < 4:
            return False

        return self._match_signature(file_data) is not None

    def detect_image_type(self, file_data: bytes) -> Optional[str]:
        """
        Detect image type from binary data.

        Args:
            file_data: Raw binary image data

        Returns:
            Image type string (jpeg, png, gif, bmp, webp) or None when the
            data is shorter than 8 bytes or matches no known signature
        """
        if not file_data or len(file_data) < 8:
            return None

        return self._match_signature(file_data)
69
+
@@ -0,0 +1,123 @@
1
+ # xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py
2
+ """
3
+ Image File Image Processor
4
+
5
+ Provides image-file-specific processing that inherits from ImageProcessor.
6
+ Handles standalone image files (jpg, png, gif, bmp, webp, etc.).
7
+ """
8
+ import logging
9
+ from typing import Any, Optional
10
+
11
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
12
+ from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
13
+
14
+ logger = logging.getLogger("xgen_doc2chunk.image_processor.image_file")
15
+
16
+
17
class ImageFileImageProcessor(ImageProcessor):
    """
    ImageProcessor subclass for standalone image files.

    Used when the image file itself is the document (jpg, jpeg, png, gif,
    bmp, webp). Derives a file name for the saved image from the original
    name or the source path and delegates saving to save_image().

    Example:
        processor = ImageFileImageProcessor()

        # Process standalone image
        tag = processor.process_image(image_data, source_path="/path/to/image.png")

        # Process with original filename
        tag = processor.process_standalone_image(image_data, original_name="photo.jpg")
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
        preserve_original_name: bool = False,
    ):
        """
        Set up the processor.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
            preserve_original_name: Whether to preserve original filename
        """
        base_options = {
            "directory_path": directory_path,
            "tag_prefix": tag_prefix,
            "tag_suffix": tag_suffix,
            "storage_backend": storage_backend,
        }
        super().__init__(**base_options)
        self._preserve_original_name = preserve_original_name

    @property
    def preserve_original_name(self) -> bool:
        """Whether the original filename is preferred when naming saved images."""
        return self._preserve_original_name

    def process_image(
        self,
        image_data: bytes,
        source_path: Optional[str] = None,
        original_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Save image data under a name derived from its origin.

        Name selection:
        - original_name without its extension, when preserve_original_name
          is enabled and original_name is given;
        - otherwise the base name of source_path without its extension,
          when source_path is given;
        - otherwise no custom name.

        Args:
            image_data: Raw image binary data
            source_path: Original file path
            original_name: Original filename
            **kwargs: Additional options (not used)

        Returns:
            Result of save_image(): image tag string or None if processing failed
        """
        import os

        # Pick the name the stem is taken from; None means "no custom name".
        if self._preserve_original_name and original_name:
            name_source = original_name
        elif source_path:
            name_source = os.path.basename(source_path)
        else:
            name_source = None

        if name_source is None:
            stem = None
        else:
            stem, _extension = os.path.splitext(name_source)

        return self.save_image(image_data, custom_name=stem)

    def process_standalone_image(
        self,
        image_data: bytes,
        original_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process an image file that is itself the document.

        Thin wrapper that forwards to process_image().

        Args:
            image_data: Raw image binary data
            original_name: Original filename
            **kwargs: Additional options forwarded to process_image()

        Returns:
            Image tag string or None if processing failed
        """
        forwarded = dict(kwargs, original_name=original_name)
        return self.process_image(image_data, **forwarded)