xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,544 @@
1
+ # xgen_doc2chunk/core/processor/base_handler.py
2
+ """
3
+ BaseHandler - Abstract base class for document processing handlers
4
+
5
+ Defines the base interface for all document handlers.
6
+ Manages config, ImageProcessor, PageTagProcessor, ChartProcessor, MetadataExtractor,
7
+ Preprocessor, and format-specific ImageProcessor passed from DocumentProcessor at
8
+ instance level for reuse by internal methods.
9
+
10
+ Each handler should override:
11
+ - _create_file_converter(): Provide format-specific file converter
12
+ - _create_preprocessor(): Provide format-specific preprocessor
13
+ - _create_chart_extractor(): Provide format-specific chart extractor
14
+ - _create_metadata_extractor(): Provide format-specific metadata extractor
15
+ - _create_format_image_processor(): Provide format-specific image processor
16
+
17
+ Processing Pipeline:
18
+ 1. file_converter.convert() - Binary -> Format-specific object (e.g., bytes -> fitz.Document)
19
+ 2. preprocessor.preprocess() - Process/clean the converted data
20
+ 3. metadata_extractor.extract() - Extract document metadata
21
+ 4. Format-specific content extraction (text, images, charts, tables)
22
+
23
+ Usage Example:
24
+ class PDFHandler(BaseHandler):
25
+ def _create_file_converter(self):
26
+ return PDFFileConverter()
27
+
28
+ def _create_preprocessor(self):
29
+ return PDFPreprocessor() # Or NullPreprocessor() if no preprocessing needed
30
+
31
+ def _create_metadata_extractor(self):
32
+ return PDFMetadataExtractor()
33
+
34
+ def _create_format_image_processor(self):
35
+ return PDFImageProcessor(image_processor=self._image_processor)
36
+
37
+ def extract_text(self, current_file: CurrentFile, extract_metadata: bool = True) -> str:
38
+ # Step 1: Convert binary to format-specific object
39
+ doc = self.convert_file(current_file)
40
+ # Step 2: Preprocess the converted object
41
+ preprocessed = self.preprocess(doc)
42
+ # Step 3: Extract metadata
43
+ metadata = self.extract_metadata(doc)
44
+ # Step 4: Process content
45
+ ...
46
+ """
47
+ import io
48
+ import logging
49
+ from abc import ABC, abstractmethod
50
+ from typing import Any, Dict, Optional, TYPE_CHECKING
51
+
52
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
53
+ from xgen_doc2chunk.core.functions.page_tag_processor import PageTagProcessor
54
+ from xgen_doc2chunk.core.functions.chart_processor import ChartProcessor
55
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, NullChartExtractor
56
+ from xgen_doc2chunk.core.functions.metadata_extractor import (
57
+ BaseMetadataExtractor,
58
+ DocumentMetadata,
59
+ )
60
+ from xgen_doc2chunk.core.functions.file_converter import (
61
+ BaseFileConverter,
62
+ NullFileConverter,
63
+ )
64
+ from xgen_doc2chunk.core.functions.preprocessor import (
65
+ BasePreprocessor,
66
+ NullPreprocessor,
67
+ PreprocessedData,
68
+ )
69
+
70
+ if TYPE_CHECKING:
71
+ from xgen_doc2chunk.core.document_processor import CurrentFile
72
+
73
+ logger = logging.getLogger("document-processor")
74
+
75
+
76
class NullMetadataExtractor(BaseMetadataExtractor):
    """
    No-op metadata extractor.

    Serves as the fallback extractor for handlers that do not supply a
    format-specific one. Whatever source it is given, the result is a
    default-constructed DocumentMetadata.
    """

    def extract(self, source: Any) -> DocumentMetadata:
        """Ignore ``source`` and return a default-constructed DocumentMetadata."""
        return DocumentMetadata()
87
+
88
+
89
class BaseHandler(ABC):
    """
    Abstract base class for document handlers.

    All handlers inherit from this class. ``config``, ``image_processor``,
    ``page_tag_processor`` and ``chart_processor`` are supplied at creation
    and stored on the instance; the format-specific collaborators are built
    on demand through the ``_create_*`` factory methods.

    Each handler should override:
    - _create_file_converter(): Provide format-specific file converter
    - _create_preprocessor(): Provide format-specific preprocessor
    - _create_chart_extractor(): Provide format-specific chart extractor
    - _create_metadata_extractor(): Provide format-specific metadata extractor
    - _create_format_image_processor(): Provide format-specific image processor

    All of these are lazy-initialized on first access of the matching
    property, and every property falls back to a null/default
    implementation when the factory returns None.

    Processing Pipeline:
    1. file_converter.convert() - Binary -> Format-specific object
    2. preprocessor.preprocess() - Process/clean the converted data
    3. metadata_extractor.extract() - Extract document metadata
    4. Format-specific content extraction

    Attributes:
        config: Configuration dictionary passed from DocumentProcessor
        image_processor: Core ImageProcessor instance passed from DocumentProcessor
        format_image_processor: Format-specific image processor (lazy-initialized)
        page_tag_processor: PageTagProcessor instance passed from DocumentProcessor
        chart_processor: ChartProcessor instance passed from DocumentProcessor
        chart_extractor: Format-specific chart extractor instance (lazy-initialized)
        preprocessor: Format-specific preprocessor instance (lazy-initialized)
        metadata_extractor: Format-specific metadata extractor instance (lazy-initialized)
        file_converter: Format-specific file converter instance (lazy-initialized)
        logger: Logging instance named after the concrete handler class
    """

    def __init__(
        self,
        config: Optional[Dict[str, Any]] = None,
        image_processor: Optional[ImageProcessor] = None,
        page_tag_processor: Optional[PageTagProcessor] = None,
        chart_processor: Optional[ChartProcessor] = None
    ):
        """
        Initialize BaseHandler.

        Args:
            config: Configuration dictionary (passed from DocumentProcessor)
            image_processor: ImageProcessor instance (passed from DocumentProcessor)
            page_tag_processor: PageTagProcessor instance (passed from DocumentProcessor)
            chart_processor: ChartProcessor instance (passed from DocumentProcessor)
        """
        # _config must be set first: the two *_from_config helpers read it.
        self._config = config or {}
        self._image_processor = image_processor or ImageProcessor()
        self._page_tag_processor = page_tag_processor or self._get_page_tag_processor_from_config()
        self._chart_processor = chart_processor or self._get_chart_processor_from_config()
        # Format-specific collaborators start unset and are built lazily
        # by the corresponding properties.
        self._chart_extractor: Optional[BaseChartExtractor] = None
        self._metadata_extractor: Optional[BaseMetadataExtractor] = None
        self._file_converter: Optional[BaseFileConverter] = None
        self._preprocessor: Optional[BasePreprocessor] = None
        self._format_image_processor: Optional[ImageProcessor] = None
        self._logger = logging.getLogger(f"document-processor.{self.__class__.__name__}")

    def _get_page_tag_processor_from_config(self) -> PageTagProcessor:
        """Return config["page_tag_processor"] if present, else a new PageTagProcessor."""
        if self._config and "page_tag_processor" in self._config:
            return self._config["page_tag_processor"]
        return PageTagProcessor()

    def _get_chart_processor_from_config(self) -> ChartProcessor:
        """Return config["chart_processor"] if present, else a new ChartProcessor."""
        if self._config and "chart_processor" in self._config:
            return self._config["chart_processor"]
        return ChartProcessor()

    def _create_chart_extractor(self) -> BaseChartExtractor:
        """
        Create format-specific chart extractor.

        Override this method in subclasses to provide the appropriate
        chart extractor for the file format.

        Returns:
            BaseChartExtractor subclass instance
            (the base implementation returns a NullChartExtractor)
        """
        return NullChartExtractor(self._chart_processor)

    def _create_metadata_extractor(self) -> BaseMetadataExtractor:
        """
        Create format-specific metadata extractor.

        Override this method in subclasses to provide the appropriate
        metadata extractor for the file format.

        Returns:
            BaseMetadataExtractor subclass instance
            (the base implementation returns a NullMetadataExtractor)
        """
        return NullMetadataExtractor()

    def _create_format_image_processor(self) -> ImageProcessor:
        """
        Create format-specific image processor.

        Override this method in subclasses to provide the appropriate
        image processor for the file format.

        Returns:
            ImageProcessor subclass instance
            (the base implementation reuses the core image processor)
        """
        return self._image_processor

    def _create_file_converter(self) -> BaseFileConverter:
        """
        Create format-specific file converter.

        Override this method in subclasses to provide the appropriate
        file converter for the file format.

        The file converter transforms raw binary data into a workable
        format-specific object (e.g., Document, Workbook, OLE file).

        Returns:
            BaseFileConverter subclass instance
            (the base implementation returns a NullFileConverter)
        """
        return NullFileConverter()

    def _create_preprocessor(self) -> BasePreprocessor:
        """
        Create format-specific preprocessor.

        Override this method in subclasses to provide the appropriate
        preprocessor for the file format.

        The preprocessor processes/cleans the converted data before
        further extraction. This is the SECOND step in the pipeline,
        after file_converter.convert().

        Pipeline:
        1. file_converter.convert() -> Format-specific object
        2. preprocessor.preprocess() -> Cleaned/processed data
        3. metadata_extractor.extract() -> Metadata
        4. Content extraction

        Returns:
            BasePreprocessor subclass instance (NullPreprocessor if no preprocessing needed)
        """
        return NullPreprocessor()

    @property
    def config(self) -> Dict[str, Any]:
        """Configuration dictionary."""
        return self._config

    @property
    def image_processor(self) -> ImageProcessor:
        """ImageProcessor instance."""
        return self._image_processor

    @property
    def page_tag_processor(self) -> PageTagProcessor:
        """PageTagProcessor instance."""
        return self._page_tag_processor

    @property
    def chart_processor(self) -> ChartProcessor:
        """ChartProcessor instance."""
        return self._chart_processor

    @property
    def chart_extractor(self) -> BaseChartExtractor:
        """
        Format-specific chart extractor (lazy-initialized).

        Returns the chart extractor for this handler's file format.
        """
        if self._chart_extractor is None:
            extractor = self._create_chart_extractor()
            # Same None-tolerance as the other lazy properties, so that
            # process_chart() never dereferences None.
            self._chart_extractor = (
                extractor if extractor is not None
                else NullChartExtractor(self._chart_processor)
            )
        return self._chart_extractor

    @property
    def metadata_extractor(self) -> BaseMetadataExtractor:
        """
        Format-specific metadata extractor (lazy-initialized).

        Returns the metadata extractor for this handler's file format.
        """
        if self._metadata_extractor is None:
            extractor = self._create_metadata_extractor()
            # If subclass returns None, use NullMetadataExtractor
            self._metadata_extractor = extractor if extractor is not None else NullMetadataExtractor()
        return self._metadata_extractor

    @property
    def format_image_processor(self) -> ImageProcessor:
        """
        Format-specific image processor (lazy-initialized).

        Returns the image processor for this handler's file format.
        Each handler should override _create_format_image_processor() to provide
        format-specific image handling capabilities.
        """
        if self._format_image_processor is None:
            processor = self._create_format_image_processor()
            # If subclass returns None, use default image_processor
            self._format_image_processor = processor if processor is not None else self._image_processor
        return self._format_image_processor

    @property
    def file_converter(self) -> BaseFileConverter:
        """
        Format-specific file converter (lazy-initialized).

        Returns the file converter for this handler's file format.
        Each handler should override _create_file_converter() to provide
        format-specific binary-to-object conversion.
        """
        if self._file_converter is None:
            converter = self._create_file_converter()
            # If subclass returns None, use NullFileConverter
            self._file_converter = converter if converter is not None else NullFileConverter()
        return self._file_converter

    @property
    def preprocessor(self) -> BasePreprocessor:
        """
        Format-specific preprocessor (lazy-initialized).

        Returns the preprocessor for this handler's file format.
        Each handler should override _create_preprocessor() to provide
        format-specific data preprocessing after conversion.

        This is called AFTER file_converter.convert() to process/clean
        the converted data before content extraction.
        """
        if self._preprocessor is None:
            preprocessor = self._create_preprocessor()
            # If subclass returns None, use NullPreprocessor
            self._preprocessor = preprocessor if preprocessor is not None else NullPreprocessor()
        return self._preprocessor

    @property
    def logger(self) -> logging.Logger:
        """Logger instance."""
        return self._logger

    @abstractmethod
    def extract_text(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True,
        **kwargs
    ) -> str:
        """
        Extract text from file.

        Args:
            current_file: CurrentFile dict containing file info and binary data
            extract_metadata: Whether to extract metadata
            **kwargs: Additional options

        Returns:
            Extracted text
        """
        pass

    def extract_metadata(self, source: Any) -> DocumentMetadata:
        """
        Extract metadata from source using format-specific extractor.

        Convenience method that wraps self.metadata_extractor.extract().

        Args:
            source: Format-specific source object

        Returns:
            DocumentMetadata instance
        """
        return self.metadata_extractor.extract(source)

    def format_metadata(self, metadata: DocumentMetadata) -> str:
        """
        Format metadata as string.

        Convenience method that wraps self.metadata_extractor.format().

        Args:
            metadata: DocumentMetadata instance

        Returns:
            Formatted metadata string
        """
        return self.metadata_extractor.format(metadata)

    def extract_and_format_metadata(self, source: Any) -> str:
        """
        Extract and format metadata in one step.

        Convenience method that wraps
        self.metadata_extractor.extract_and_format().

        Args:
            source: Format-specific source object

        Returns:
            Formatted metadata string
        """
        return self.metadata_extractor.extract_and_format(source)

    def convert_file(self, current_file: "CurrentFile", **kwargs) -> Any:
        """
        Convert binary file data to workable format.

        Convenience method that wraps self.file_converter.convert().

        This is the first step in the processing pipeline:
        Binary Data -> FileConverter -> Workable Object

        Args:
            current_file: CurrentFile dict containing file info and binary data
            **kwargs: Additional format-specific options

        Returns:
            Format-specific workable object (Document, Workbook, OLE file, etc.)
        """
        file_data = current_file.get("file_data", b"")
        file_stream = self.get_file_stream(current_file)
        return self.file_converter.convert(file_data, file_stream, **kwargs)

    def preprocess(self, converted_data: Any, **kwargs) -> PreprocessedData:
        """
        Preprocess the converted data.

        Convenience method that wraps self.preprocessor.preprocess().

        This is the SECOND step in the processing pipeline:
        1. file_converter.convert() -> Format-specific object
        2. preprocessor.preprocess() -> Cleaned/processed data (THIS STEP)
        3. metadata_extractor.extract() -> Metadata
        4. Content extraction

        Args:
            converted_data: The data returned from file_converter.convert()
                (raw bytes or a format-specific object)
            **kwargs: Additional format-specific options

        Returns:
            Whatever self.preprocessor.preprocess() returns (PreprocessedData)
        """
        # Bytes and format-specific objects are forwarded the same way;
        # interpreting the input is the preprocessor's responsibility.
        return self.preprocessor.preprocess(converted_data, **kwargs)

    def get_file_stream(self, current_file: "CurrentFile") -> io.BytesIO:
        """
        Get a stream for current_file positioned at the beginning.

        If current_file carries a "file_stream", that same stream object is
        rewound with seek(0) and returned (it is not copied). Otherwise a new
        BytesIO is created from "file_data" (empty bytes when missing).

        Args:
            current_file: CurrentFile dict

        Returns:
            BytesIO stream ready for reading
        """
        stream = current_file.get("file_stream")
        if stream is not None:
            stream.seek(0)
            return stream
        # Fallback: create new stream from file_data
        return io.BytesIO(current_file.get("file_data", b""))

    def save_image(self, image_data: bytes, processed_images: Optional[set] = None) -> Optional[str]:
        """
        Save image and return tag.

        Convenience method that wraps self.image_processor.save_image().

        Args:
            image_data: Image binary data
            processed_images: Set of processed image hashes (for deduplication)

        Returns:
            Image tag string or None
        """
        return self._image_processor.save_image(image_data, processed_images=processed_images)

    def create_page_tag(self, page_number: int) -> str:
        """
        Create a page number tag.

        Convenience method that wraps self.page_tag_processor.create_page_tag().

        Args:
            page_number: Page number

        Returns:
            Page tag string (e.g., "[Page Number: 1]")
        """
        return self._page_tag_processor.create_page_tag(page_number)

    def create_slide_tag(self, slide_number: int) -> str:
        """
        Create a slide number tag.

        Convenience method that wraps self.page_tag_processor.create_slide_tag().

        Args:
            slide_number: Slide number

        Returns:
            Slide tag string (e.g., "[Slide Number: 1]")
        """
        return self._page_tag_processor.create_slide_tag(slide_number)

    def create_sheet_tag(self, sheet_name: str) -> str:
        """
        Create a sheet name tag.

        Convenience method that wraps self.page_tag_processor.create_sheet_tag().

        Args:
            sheet_name: Sheet name

        Returns:
            Sheet tag string (e.g., "[Sheet: Sheet1]")
        """
        return self._page_tag_processor.create_sheet_tag(sheet_name)

    def process_chart(self, chart_element: Any) -> str:
        """
        Process chart element using the format-specific chart extractor.

        Delegates to self.chart_extractor.process(), which is expected to
        extract data from the format-specific chart element and format it.

        Args:
            chart_element: Format-specific chart object/element

        Returns:
            Formatted chart text with tags
        """
        return self.chart_extractor.process(chart_element)
535
+
536
+
537
+ __all__ = [
538
+ "BaseHandler",
539
+ "NullMetadataExtractor",
540
+ "BasePreprocessor",
541
+ "NullPreprocessor",
542
+ "PreprocessedData",
543
+ ]
544
+
@@ -0,0 +1,135 @@
1
+ # xgen_doc2chunk/core/processor/csv_handler.py
2
+ """
3
+ CSV Handler - CSV/TSV File Processor
4
+
5
+ Class-based handler for CSV/TSV files inheriting from BaseHandler.
6
+ """
7
+ import logging
8
+ import os
9
+ from typing import Any, Dict, Optional, Tuple, TYPE_CHECKING
10
+
11
+ from xgen_doc2chunk.core.processor.base_handler import BaseHandler
12
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, NullChartExtractor
13
+ from xgen_doc2chunk.core.processor.csv_helper import (
14
+ detect_bom,
15
+ detect_delimiter,
16
+ parse_csv_content,
17
+ detect_header,
18
+ convert_rows_to_table,
19
+ )
20
+ from xgen_doc2chunk.core.processor.csv_helper.csv_metadata import CSVMetadataExtractor, CSVSourceInfo
21
+ from xgen_doc2chunk.core.processor.csv_helper.csv_image_processor import CSVImageProcessor
22
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
23
+
24
+ if TYPE_CHECKING:
25
+ from xgen_doc2chunk.core.document_processor import CurrentFile
26
+
27
+ logger = logging.getLogger("document-processor")
28
+
29
+ # Encoding candidates for fallback
30
+ ENCODING_CANDIDATES = ['utf-8', 'utf-8-sig', 'cp949', 'euc-kr', 'iso-8859-1', 'latin-1']
31
+
32
+
33
class CSVHandler(BaseHandler):
    """
    CSV/TSV File Processing Handler Class.

    Decodes the raw file bytes through the CSV file converter, cleans the
    text with the CSV preprocessor, parses it into rows and renders the rows
    as a table, optionally preceded by a formatted metadata block.
    """

    def _create_file_converter(self):
        """Create CSV-specific file converter."""
        from xgen_doc2chunk.core.processor.csv_helper.csv_file_converter import CSVFileConverter
        return CSVFileConverter()

    def _create_preprocessor(self):
        """Create CSV-specific preprocessor."""
        from xgen_doc2chunk.core.processor.csv_helper.csv_preprocessor import CSVPreprocessor
        return CSVPreprocessor()

    def _create_chart_extractor(self) -> BaseChartExtractor:
        """CSV files do not contain charts. Return NullChartExtractor."""
        return NullChartExtractor(self._chart_processor)

    def _create_metadata_extractor(self):
        """Create CSV-specific metadata extractor."""
        return CSVMetadataExtractor()

    def _create_format_image_processor(self) -> ImageProcessor:
        """Create CSV-specific image processor."""
        return CSVImageProcessor()

    def extract_text(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True,
        encoding: Optional[str] = None,
        delimiter: Optional[str] = None,
        **kwargs
    ) -> str:
        """
        Extract text from CSV/TSV file.

        Args:
            current_file: CurrentFile dict containing file info and binary data
            extract_metadata: Whether to prepend the formatted metadata block
            encoding: Encoding (None for auto-detect)
            delimiter: Delimiter (None for auto-detect; TSV files default to tab)
            **kwargs: Additional options (currently unused here)

        Returns:
            The metadata block (when requested and non-empty) followed by a
            blank line and the rendered table, or "" when no rows were parsed.

        Raises:
            Exception: Any error raised during decoding, preprocessing or
                parsing is logged and re-raised unchanged.
        """
        file_path = current_file.get("file_path") or "unknown"
        # Tolerate a "file_extension" key that is present but None/empty, and
        # only derive the extension from the path when it is actually needed.
        ext = (current_file.get("file_extension") or os.path.splitext(file_path)[1]).lower()
        self.logger.info(f"CSV processing: {file_path}, ext: {ext}")

        # Accept both ".tsv" and "tsv": which form callers store is not
        # visible from this module, so the leading dot is treated as optional.
        if ext.lstrip('.') == 'tsv' and delimiter is None:
            delimiter = '\t'

        try:
            result_parts = []

            # Step 1: Decode file_data using file_converter
            file_data = current_file.get("file_data", b"")
            content, detected_encoding = self.file_converter.convert(file_data, encoding=encoding)

            # Step 2: Preprocess - clean_content is the TRUE SOURCE
            preprocessed = self.preprocess(content)
            content = preprocessed.clean_content  # TRUE SOURCE

            if delimiter is None:
                delimiter = detect_delimiter(content)

            self.logger.info(f"CSV: encoding={detected_encoding}, delimiter={repr(delimiter)}")

            rows = parse_csv_content(content, delimiter)

            if not rows:
                return ""

            has_header = detect_header(rows)

            if extract_metadata:
                source_info = CSVSourceInfo(
                    file_path=file_path,
                    encoding=detected_encoding,
                    delimiter=delimiter,
                    rows=rows,
                    has_header=has_header
                )
                metadata_str = self.extract_and_format_metadata(source_info)
                if metadata_str:
                    result_parts.append(metadata_str + "\n\n")

            table = convert_rows_to_table(rows, has_header)
            if table:
                result_parts.append(table)

            result = "".join(result_parts)
            self.logger.info(f"CSV processing completed: {len(rows)} rows")

            return result

        except Exception as e:
            # Log a short error plus the full traceback at debug level, then
            # let the caller decide how to handle the failure.
            self.logger.error(f"Error extracting text from CSV {file_path}: {e}")
            import traceback
            self.logger.debug(traceback.format_exc())
            raise
135
+