xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,376 @@
1
+ # xgen_doc2chunk/core/processor/docx_handler.py
2
+ """
3
+ DOCX Handler - DOCX Document Processor
4
+
5
+ Key Features:
6
+ - Metadata extraction (title, author, subject, keywords, created/modified dates, etc.)
7
+ - Text extraction (direct parsing via python-docx)
8
+ - Table extraction (HTML format preservation, rowspan/colspan support)
9
+ - Inline image extraction and local saving
10
+ - Chart data extraction (OOXML DrawingML Chart parsing)
11
+ - Diagram processing
12
+
13
+ All processing is done via direct binary parsing through python-docx.
14
+ Image OCR is performed in a separate post-processing step.
15
+
16
+ Fallback Chain:
17
+ 1. Enhanced DOCX processing (python-docx with BytesIO stream)
18
+ 2. DOCHandler fallback (for non-ZIP files: RTF, OLE, HTML, etc.)
19
+ 3. Simple text extraction
20
+ 4. Error message
21
+
22
+ Class-based Handler:
23
+ - DOCXHandler class inherits from BaseHandler to manage config/image_processor
24
+ - Internal methods are accessed via self
25
+ """
26
+ import io
27
+ import logging
28
+ import traceback
29
+ import zipfile
30
+ from typing import Any, Dict, Optional, Set, TYPE_CHECKING
31
+
32
+ from docx import Document
33
+ from lxml import etree
34
+
35
+ # Base handler
36
+ from xgen_doc2chunk.core.processor.base_handler import BaseHandler
37
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
38
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor
39
+ from xgen_doc2chunk.core.processor.docx_helper.docx_chart_extractor import DOCXChartExtractor
40
+
41
+ if TYPE_CHECKING:
42
+ from xgen_doc2chunk.core.document_processor import CurrentFile
43
+
44
+ # docx_helper
45
+ from xgen_doc2chunk.core.processor.docx_helper import (
46
+ # Constants
47
+ ElementType,
48
+ # Paragraph
49
+ process_paragraph_element,
50
+ )
51
+ # Table Extractor & Processor (new interface)
52
+ from xgen_doc2chunk.core.processor.docx_helper.docx_table_extractor import DOCXTableExtractor
53
+ from xgen_doc2chunk.core.processor.docx_helper.docx_table_processor import DOCXTableProcessor
54
+
55
+ from xgen_doc2chunk.core.processor.docx_helper.docx_metadata import DOCXMetadataExtractor
56
+ from xgen_doc2chunk.core.processor.docx_helper.docx_image_processor import DOCXImageProcessor
57
+
58
+ logger = logging.getLogger("document-processor")
59
+
60
+
61
+ # ============================================================================
62
+ # DOCXHandler Class
63
+ # ============================================================================
64
+
65
class DOCXHandler(BaseHandler):
    """
    DOCX document processing handler.

    Inherits from BaseHandler; this class supplies the DOCX-specific
    factories (file converter, preprocessor, chart/metadata/table helpers,
    image processor) and the text-extraction pipeline.

    Fallback chain used by ``extract_text``:
        1. Enhanced DOCX processing (body traversal in document order).
        2. If the enhanced pass raises: plain paragraph/table text, or an
           ``[DOCX file processing failed: ...]`` marker string.
        3. If the data is not a valid DOCX: DOCHandler (RTF, OLE, HTML, ...).
        4. If DOCHandler fails: raw decoding of the bytes as text.
        5. If nothing works for a non-DOCX input: RuntimeError.

    Usage:
        handler = DOCXHandler(config=config, image_processor=image_processor)
        text = handler.extract_text(current_file)
    """

    # ------------------------------------------------------------------
    # Component factories
    # ------------------------------------------------------------------

    def _create_file_converter(self):
        """Create the DOCX-specific file converter."""
        # Imported lazily, as in the rest of this module's helper factories.
        from xgen_doc2chunk.core.processor.docx_helper.docx_file_converter import DOCXFileConverter
        return DOCXFileConverter()

    def _create_preprocessor(self):
        """Create the DOCX-specific preprocessor."""
        from xgen_doc2chunk.core.processor.docx_helper.docx_preprocessor import DOCXPreprocessor
        return DOCXPreprocessor()

    def _create_chart_extractor(self) -> BaseChartExtractor:
        """Create the DOCX-specific chart extractor bound to the chart processor."""
        return DOCXChartExtractor(self._chart_processor)

    def _create_metadata_extractor(self):
        """Create the DOCX-specific metadata extractor."""
        return DOCXMetadataExtractor()

    def _create_format_image_processor(self):
        """
        Create the DOCX-specific image processor.

        The directory, tag prefix/suffix and storage backend are copied from
        the generic image processor held by the handler.
        """
        base = self._image_processor
        return DOCXImageProcessor(
            directory_path=base.config.directory_path,
            tag_prefix=base.config.tag_prefix,
            tag_suffix=base.config.tag_suffix,
            storage_backend=base.storage_backend,
        )

    def _create_table_extractor(self) -> DOCXTableExtractor:
        """Create the DOCX-specific table extractor."""
        return DOCXTableExtractor()

    def _create_table_processor(self) -> DOCXTableProcessor:
        """Create the DOCX-specific table processor."""
        return DOCXTableProcessor()

    @property
    def table_extractor(self) -> DOCXTableExtractor:
        """Return the table extractor, creating it on first access."""
        if getattr(self, '_table_extractor', None) is None:
            self._table_extractor = self._create_table_extractor()
        return self._table_extractor

    @property
    def table_processor(self) -> DOCXTableProcessor:
        """Return the table processor, creating it on first access."""
        if getattr(self, '_table_processor', None) is None:
            self._table_processor = self._create_table_processor()
        return self._table_processor

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------

    def extract_text(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True,
        **kwargs
    ) -> str:
        """
        Extract text from a DOCX file.

        Args:
            current_file: CurrentFile dict; the "file_path" and "file_data"
                keys are read here.
            extract_metadata: Whether to prepend formatted metadata.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            Extracted text (with inline image tags and table HTML).

        Raises:
            RuntimeError: If the data is not a valid DOCX and every
                fallback extraction fails.
        """
        file_path = current_file.get("file_path", "unknown")
        file_data = current_file.get("file_data", b"")
        self.logger.info(f"DOCX processing: {file_path}")

        # The file converter decides whether the bytes are a real DOCX.
        if self.file_converter.validate(file_data):
            return self._extract_docx_enhanced(current_file, extract_metadata)

        # Not a valid DOCX: it may be another format saved with a .docx name.
        self.logger.warning(f"File is not a valid DOCX, trying DOCHandler fallback: {file_path}")
        return self._extract_with_doc_handler_fallback(current_file, extract_metadata)

    # ------------------------------------------------------------------
    # Fallbacks for inputs that are not valid DOCX
    # ------------------------------------------------------------------

    def _extract_with_doc_handler_fallback(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True
    ) -> str:
        """
        Fall back to DOCHandler for non-ZIP files.

        Handles RTF, OLE, HTML, and other formats that might be incorrectly
        named as .docx files. If DOCHandler raises, returns nothing, or
        returns text starting with "[DOC" (treated here as a failure
        marker), the simple text fallback is tried exactly once.

        Raises:
            RuntimeError: Propagated from the simple text fallback.
        """
        file_path = current_file.get("file_path", "unknown")

        result = ""
        succeeded = False
        try:
            from xgen_doc2chunk.core.processor.doc_handler import DOCHandler

            doc_handler = DOCHandler(
                config=self.config,
                image_processor=self.format_image_processor
            )

            # The complete CurrentFile dict is handed to DOCHandler.
            result = doc_handler.extract_text(current_file, extract_metadata=extract_metadata)
            succeeded = bool(result) and not result.startswith("[DOC")
        except Exception as e:
            # Only DOCHandler problems are reported here. The simple fallback
            # runs outside this try so its failure is neither mislabelled as
            # a DOCHandler failure nor executed a second time.
            self.logger.error(f"DOCHandler fallback failed: {e}")

        if succeeded:
            self.logger.info(f"DOCHandler fallback successful for: {file_path}")
            return result

        return self._extract_simple_text_fallback(current_file)

    def _extract_simple_text_fallback(self, current_file: "CurrentFile") -> str:
        """
        Last resort: try to decode the raw bytes as readable text.

        Each candidate encoding is tried in order; control characters are
        removed and the first result longer than 50 characters is returned.

        Raises:
            RuntimeError: If no encoding yields more than 50 characters of
                text, or if "file_data" is not a bytes-like object.
        """
        import re  # local import: `re` is not imported at module level

        file_path = current_file.get("file_path", "unknown")
        file_data = current_file.get("file_data", b"")

        # Binary garbage and control characters (tab, LF and CR are kept).
        control_chars = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]')

        if isinstance(file_data, (bytes, bytearray)):
            # NOTE: latin-1 maps every byte value, so it never fails to
            # decode; it acts as the catch-all at the end of the list.
            for encoding in ('utf-8', 'cp949', 'euc-kr', 'latin-1'):
                try:
                    decoded = file_data.decode(encoding)
                except UnicodeDecodeError:
                    continue

                text = control_chars.sub('', decoded).strip()
                if len(text) > 50:  # Must have meaningful content
                    self.logger.info(f"Simple text extraction successful with {encoding}: {file_path}")
                    return text

        self.logger.error(f"All extraction methods failed for: {file_path}")
        raise RuntimeError(f"DOCX file processing failed: {file_path}. "
                           f"File is not a valid DOCX, DOC, RTF, or text file.")

    # ------------------------------------------------------------------
    # Main DOCX pipeline
    # ------------------------------------------------------------------

    def _extract_docx_enhanced(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True
    ) -> str:
        """
        Enhanced DOCX processing.

        - Document order preservation (body element traversal)
        - Metadata extraction
        - Inline image extraction via the format image processor
        - Table HTML formatting
        - Chart data extraction (charts are pre-extracted, then consumed in
          order through a callback while paragraphs are processed)
        - Page break handling (a new page tag is emitted per break)

        Any exception raised during this pass is logged and the method falls
        back to ``_extract_docx_simple_text``.
        """
        file_path = current_file.get("file_path", "unknown")
        file_data = current_file.get("file_data", b"")
        self.logger.info(f"Enhanced DOCX processing: {file_path}")

        try:
            # Step 1: convert the binary payload into a document object.
            doc = self.file_converter.convert(file_data)

            # Step 2: preprocess; the preprocessed content is the document
            # used from here on.
            preprocessed = self.preprocess(doc)
            doc = preprocessed.clean_content

            result_parts = []
            processed_images: Set[str] = set()
            current_page = 1
            total_tables = 0
            total_images = 0
            total_charts = 0

            # Pre-extract all charts; they are handed out one by one.
            file_stream = self.get_file_stream(current_file)
            chart_data_list = self.chart_extractor.extract_all_from_file(file_stream)
            pending_charts = iter(chart_data_list or [])

            def get_next_chart() -> str:
                """Callback returning the next pre-extracted chart, or ""."""
                chart_data = next(pending_charts, None)
                if chart_data is None:
                    return ""
                return self._format_chart_data(chart_data)

            if extract_metadata:
                metadata_str = self.extract_and_format_metadata(doc)
                if metadata_str:
                    result_parts.append(metadata_str + "\n\n")
                    self.logger.info("DOCX metadata extracted")

            # Start page 1
            page_tag = self.create_page_tag(current_page)
            result_parts.append(f"{page_tag}\n")

            # Traverse body elements in document order
            for body_elem in doc.element.body:
                # Comments and processing instructions have a non-string tag;
                # etree.QName() raises ValueError for them, which would abort
                # the whole enhanced pass. They carry no content, so skip.
                if not isinstance(body_elem.tag, str):
                    continue

                local_tag = etree.QName(body_elem).localname

                if local_tag == 'p':
                    content, has_page_break, img_count, chart_count = process_paragraph_element(
                        body_elem, doc, processed_images, file_path,
                        image_processor=self.format_image_processor,
                        chart_callback=get_next_chart
                    )

                    if has_page_break:
                        current_page += 1
                        page_tag = self.create_page_tag(current_page)
                        result_parts.append(f"\n{page_tag}\n")

                    if content.strip():
                        result_parts.append(content + "\n")

                    total_images += img_count
                    total_charts += chart_count

                elif local_tag == 'tbl':
                    table_data = self.table_extractor.extract_table(body_elem, doc)
                    if table_data:
                        table_html = self.table_processor.format_table_as_html(table_data)
                        if table_html:
                            total_tables += 1
                            result_parts.append("\n" + table_html + "\n\n")

                # Anything else (e.g. 'sectPr') produces no output.

            result = "".join(result_parts)
            self.logger.info(f"Enhanced DOCX processing completed: {current_page} pages, "
                             f"{total_tables} tables, {total_images} images, {total_charts} charts")

            return result

        except Exception as e:
            self.logger.error(f"Error in enhanced DOCX processing: {e}")
            self.logger.debug(traceback.format_exc())
            return self._extract_docx_simple_text(current_file)

    def _format_chart_data(self, chart_data) -> str:
        """
        Format a ChartData object using the chart processor.

        Returns "" for anything that is not a ChartData instance. Charts
        without data are rendered through the processor's fallback formatter
        (chart type and title only).
        """
        from xgen_doc2chunk.core.functions.chart_extractor import ChartData

        if not isinstance(chart_data, ChartData):
            return ""

        if not chart_data.has_data():
            return self.chart_processor.format_chart_fallback(
                chart_type=chart_data.chart_type,
                title=chart_data.title
            )

        return self.chart_processor.format_chart_data(
            chart_type=chart_data.chart_type,
            title=chart_data.title,
            categories=chart_data.categories,
            series=chart_data.series
        )

    def _extract_docx_simple_text(self, current_file: "CurrentFile") -> str:
        """
        Simple text extraction (fallback).

        Emits every non-blank paragraph, then every table row that has at
        least one non-empty cell (cells joined with " | "). This is a
        best-effort path: on failure it returns an
        "[DOCX file processing failed: ...]" marker instead of raising.
        """
        try:
            file_data = current_file.get("file_data", b"")
            doc = self.file_converter.convert(file_data)

            result_parts = [para.text for para in doc.paragraphs if para.text.strip()]

            for table in doc.tables:
                for row in table.rows:
                    row_texts = [cell.text.strip() for cell in row.cells]
                    if any(row_texts):
                        result_parts.append(" | ".join(row_texts))

            return "\n".join(result_parts)

        except Exception as e:
            self.logger.error(f"Error in simple DOCX text extraction: {e}")
            return f"[DOCX file processing failed: {str(e)}]"
373
+
374
+
375
+ __all__ = ["DOCXHandler"]
376
+
@@ -0,0 +1,84 @@
1
+ # xgen_doc2chunk/core/processor/docx_helper/__init__.py
2
+ """
3
+ DOCX Helper Module
4
+
5
+ Utility modules for DOCX document processing.
6
+
7
+ Module structure:
8
+ - docx_constants: Constants, Enum, dataclasses (ElementType, NAMESPACES, etc.)
9
+ - docx_metadata: Metadata extraction (DOCXMetadataExtractor)
10
+ - docx_chart_extractor: Chart extraction (DOCXChartExtractor)
11
+ - docx_image_processor: Image/drawing processing (DOCXImageProcessor)
12
+ - docx_table_extractor: Table extraction (DOCXTableExtractor) - BaseTableExtractor interface
13
+ - docx_table_processor: Table formatting (DOCXTableProcessor) - TableProcessor interface
14
+ - docx_paragraph: Paragraph processing and page breaks
15
+ """
16
+
17
+ # Constants
18
+ from xgen_doc2chunk.core.processor.docx_helper.docx_constants import (
19
+ ElementType,
20
+ DocxElement,
21
+ NAMESPACES,
22
+ CHART_TYPE_MAP,
23
+ )
24
+
25
+ # Metadata
26
+ from xgen_doc2chunk.core.processor.docx_helper.docx_metadata import (
27
+ DOCXMetadataExtractor,
28
+ )
29
+
30
+ # Chart Extractor
31
+ from xgen_doc2chunk.core.processor.docx_helper.docx_chart_extractor import (
32
+ DOCXChartExtractor,
33
+ )
34
+
35
+ # Image Processor (replaces docx_image.py utility functions)
36
+ from xgen_doc2chunk.core.processor.docx_helper.docx_image_processor import (
37
+ DOCXImageProcessor,
38
+ )
39
+
40
+ # Table Extractor (BaseTableExtractor interface)
41
+ from xgen_doc2chunk.core.processor.docx_helper.docx_table_extractor import (
42
+ DOCXTableExtractor,
43
+ create_docx_table_extractor,
44
+ )
45
+
46
+ # Table Processor (TableProcessor interface)
47
+ from xgen_doc2chunk.core.processor.docx_helper.docx_table_processor import (
48
+ DOCXTableProcessor,
49
+ DOCXTableProcessorConfig,
50
+ create_docx_table_processor,
51
+ format_table_as_html,
52
+ )
53
+
54
+ # Paragraph
55
+ from xgen_doc2chunk.core.processor.docx_helper.docx_paragraph import (
56
+ process_paragraph_element,
57
+ has_page_break_element,
58
+ )
59
+
60
+
61
+ __all__ = [
62
+ # Constants
63
+ 'ElementType',
64
+ 'DocxElement',
65
+ 'NAMESPACES',
66
+ 'CHART_TYPE_MAP',
67
+ # Metadata
68
+ 'DOCXMetadataExtractor',
69
+ # Chart Extractor
70
+ 'DOCXChartExtractor',
71
+ # Image Processor
72
+ 'DOCXImageProcessor',
73
+ # Table Extractor (BaseTableExtractor interface)
74
+ 'DOCXTableExtractor',
75
+ 'create_docx_table_extractor',
76
+ # Table Processor (TableProcessor interface)
77
+ 'DOCXTableProcessor',
78
+ 'DOCXTableProcessorConfig',
79
+ 'create_docx_table_processor',
80
+ 'format_table_as_html',
81
+ # Paragraph
82
+ 'process_paragraph_element',
83
+ 'has_page_break_element',
84
+ ]