xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,84 @@
1
+ # xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py
2
+ """
3
+ Image File Preprocessor - Process image file after conversion.
4
+
5
+ Processing Pipeline Position:
6
+ 1. ImageFileConverter.convert() -> bytes (raw image data)
7
+ 2. ImageFilePreprocessor.preprocess() -> PreprocessedData (THIS STEP)
8
+ 3. ImageFileMetadataExtractor.extract() -> DocumentMetadata
9
+ 4. OCR processing (if OCR engine available)
10
+
11
+ Current Implementation:
12
+ - Pass-through (Image uses raw bytes directly for OCR)
13
+ """
14
+ import logging
15
+ from typing import Any, Dict
16
+
17
+ from xgen_doc2chunk.core.functions.preprocessor import (
18
+ BasePreprocessor,
19
+ PreprocessedData,
20
+ )
21
+
22
+ logger = logging.getLogger("xgen_doc2chunk.image_file.preprocessor")
23
+
24
+
25
class ImageFilePreprocessor(BasePreprocessor):
    """
    Preprocessing stage for standalone image files.

    The converted data is forwarded unchanged; the only work done here is
    collecting a couple of descriptive metadata entries (byte size and,
    when the header is recognised, the image format).
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Wrap the converted image data in a PreprocessedData container.

        Args:
            converted_data: Output of the converter stage. Metadata is only
                derived when this is a ``bytes`` object.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            PreprocessedData whose ``raw_content`` and ``clean_content`` are
            both ``converted_data``, with encoding "binary", no extracted
            resources, and the collected metadata.
        """
        # Header prefixes that identify a format on their own.
        prefix_signatures = (
            (b'\xff\xd8\xff', 'jpeg'),
            (b'\x89PNG', 'png'),
            (b'GIF', 'gif'),
            (b'BM', 'bmp'),
        )
        metadata: Dict[str, Any] = {}

        if isinstance(converted_data, bytes):
            metadata['size_bytes'] = len(converted_data)

            detected = next(
                (
                    label
                    for prefix, label in prefix_signatures
                    if converted_data.startswith(prefix)
                ),
                None,
            )
            # WebP needs two markers: the RIFF prefix plus "WEBP" somewhere
            # in the first 12 bytes.
            if (
                detected is None
                and converted_data.startswith(b'RIFF')
                and b'WEBP' in converted_data[:12]
            ):
                detected = 'webp'

            if detected is not None:
                metadata['format'] = detected

        logger.debug("Image file preprocessor: pass-through, metadata=%s", metadata)

        # The same object is used for both raw and clean content: the image
        # bytes themselves are what later stages consume.
        result = PreprocessedData(
            raw_content=converted_data,
            clean_content=converted_data,
            encoding="binary",
            extracted_resources={},
            metadata=metadata,
        )
        return result

    def get_format_name(self) -> str:
        """Return the human-readable name of this preprocessor."""
        return "Image File Preprocessor"

    def validate(self, data: Any) -> bool:
        """Return True only for a non-empty ``bytes`` object."""
        if not isinstance(data, bytes):
            return False
        return len(data) > 0
82
+
83
+
84
+ __all__ = ['ImageFilePreprocessor']
@@ -0,0 +1,597 @@
1
+ # xgen_doc2chunk/core/processor/pdf_handler.py
2
+ """
3
+ PDF Handler - Adaptive Complexity-based PDF Processor
4
+
5
+ =============================================================================
6
+ Core Features:
7
+ =============================================================================
8
+ 1. Complexity Analysis - Calculate complexity scores per page/region
9
+ 2. Adaptive Processing Strategy - Select optimal strategy based on complexity
10
+ 3. Block Imaging - Render complex regions as images
11
+ 4. Local Storage - Save imaged blocks locally and generate [image:{path}] tags
12
+ 5. Multi-column Layout - Handle newspaper/magazine style multi-column layouts
13
+ 6. Text Quality Analysis - Automatic vector text quality evaluation
14
+
15
+ =============================================================================
16
+ Class-based Handler:
17
+ =============================================================================
18
+ PDFHandler class inherits from BaseHandler and manages config/image_processor
19
+ at instance level. All internal methods can access these via self.
20
+
21
+ =============================================================================
22
+ Core Algorithms:
23
+ =============================================================================
24
+ 1. Line Analysis:
25
+ - Extract all lines from drawings/rects
26
+ - Classify by line thickness (thin < 0.5pt, normal 0.5-2pt, thick > 2pt)
27
+ - Merge adjacent double lines (gap < 5pt)
28
+ - Recover incomplete borders (complete 4 sides when 3+ exist)
29
+
30
+ 2. Table Detection:
31
+ - Strategy 1: PyMuPDF find_tables() - Calculate confidence score
32
+ - Strategy 2: pdfplumber - Calculate confidence score
33
+ - Strategy 3: Line analysis based grid construction - Calculate confidence score
34
+ - Select highest confidence strategy or merge results
35
+
36
+ 3. Cell Analysis:
37
+ - Extract physical cell bbox
38
+ - Grid line mapping (tolerance based)
39
+ - Precise rowspan/colspan calculation
40
+ - Merge validation based on text position
41
+
42
+ 4. Annotation Integration:
43
+ - Detect annotation rows immediately after tables (e.g., "Note: ...")
44
+ - Collect footnote/endnote text
45
+ - Integrate appropriately into table data
46
+ """
47
+ import logging
48
+ import traceback
49
+ from typing import Any, Dict, List, Optional, Tuple, Set, TYPE_CHECKING
50
+
51
+ # Base handler
52
+ from xgen_doc2chunk.core.processor.base_handler import BaseHandler
53
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
54
+
55
+ if TYPE_CHECKING:
56
+ from xgen_doc2chunk.core.document_processor import CurrentFile
57
+
58
+ # Import from new modular helpers
59
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_metadata import (
60
+ PDFMetadataExtractor,
61
+ )
62
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_image_processor import (
63
+ PDFImageProcessor,
64
+ )
65
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_utils import (
66
+ bbox_overlaps,
67
+ )
68
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_extractor import (
69
+ extract_text_blocks,
70
+ )
71
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_page_analyzer import (
72
+ detect_page_border,
73
+ is_table_likely_border,
74
+ )
75
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_element_merger import (
76
+ merge_page_elements,
77
+ )
78
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_processor import (
79
+ extract_all_tables,
80
+ )
81
+
82
+ # Modularized component imports
83
+ from xgen_doc2chunk.core.processor.pdf_helpers.types import (
84
+ TableDetectionStrategy as TableDetectionStrategyType,
85
+ ElementType,
86
+ PageElement,
87
+ PageBorderInfo,
88
+ )
89
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_vector_text_ocr import (
90
+ VectorTextOCREngine,
91
+ )
92
+
93
+ # Complexity analysis module
94
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_complexity_analyzer import (
95
+ ComplexityAnalyzer,
96
+ ProcessingStrategy,
97
+ PageComplexity,
98
+ )
99
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_block_image_engine import (
100
+ BlockImageEngine,
101
+ MultiBlockResult,
102
+ )
103
+
104
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_quality_analyzer import (
105
+ TableQualityAnalyzer,
106
+ TableQuality,
107
+ )
108
+
109
+ logger = logging.getLogger("document-processor")
110
+
111
+ # PyMuPDF import
112
+ import fitz
113
+
114
+
115
+ # Enum aliases for backward compatibility
116
+ TableDetectionStrategy = TableDetectionStrategyType
117
+
118
+
119
+ # ============================================================================
120
+ # PDFHandler Class
121
+ # ============================================================================
122
+
123
+ class PDFHandler(BaseHandler):
124
+ """
125
+ PDF Document Handler
126
+
127
+ Inherits from BaseHandler to manage config and image_processor at instance level.
128
+ All internal methods access these via self.config, self.image_processor.
129
+
130
+ Usage:
131
+ handler = PDFHandler(config=config, image_processor=image_processor)
132
+ text = handler.extract_text(current_file)
133
+ """
134
+
135
def _create_file_converter(self):
    """Return a new PDFFileConverter (imported lazily inside the method)."""
    from xgen_doc2chunk.core.processor.pdf_helpers.pdf_file_converter import (
        PDFFileConverter as converter_cls,
    )
    converter = converter_cls()
    return converter
139
+
140
def _create_preprocessor(self):
    """Return a new PDFPreprocessor (imported lazily inside the method)."""
    from xgen_doc2chunk.core.processor.pdf_helpers.pdf_preprocessor import (
        PDFPreprocessor as preprocessor_cls,
    )
    preprocessor = preprocessor_cls()
    return preprocessor
144
+
145
def _create_chart_extractor(self):
    """
    Return a NullChartExtractor bound to this handler's chart processor.

    PDF chart extraction is not implemented yet, so the null extractor is
    used as a placeholder.
    """
    from xgen_doc2chunk.core.functions.chart_extractor import (
        NullChartExtractor as extractor_cls,
    )
    placeholder = extractor_cls(self._chart_processor)
    return placeholder
149
+
150
def _create_metadata_extractor(self):
    """Return a new PDFMetadataExtractor instance."""
    extractor = PDFMetadataExtractor()
    return extractor
153
+
154
def _create_format_image_processor(self):
    """
    Return a PDFImageProcessor configured like the handler's image processor.

    The directory path, tag prefix and tag suffix are copied from the base
    image processor's config, and its storage backend is reused.
    """
    base_processor = self._image_processor
    settings = base_processor.config
    return PDFImageProcessor(
        directory_path=settings.directory_path,
        tag_prefix=settings.tag_prefix,
        tag_suffix=settings.tag_suffix,
        storage_backend=base_processor.storage_backend,
    )
162
+
163
def extract_text(
    self,
    current_file: "CurrentFile",
    extract_metadata: bool = True,
    **kwargs
) -> str:
    """
    Public entry point: extract the text of a PDF file.

    Logs the file being processed and delegates the actual work to
    ``_extract_pdf``.

    Args:
        current_file: Mapping describing the file; ``"file_path"`` is read
            here for logging (defaults to "unknown" when absent).
        extract_metadata: Forwarded to ``_extract_pdf``.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The text returned by ``_extract_pdf``.
    """
    source_path = current_file.get("file_path", "unknown")
    self.logger.info(f"[PDF] Processing: {source_path}")
    extracted = self._extract_pdf(current_file, extract_metadata)
    return extracted
183
+
184
def _extract_pdf(
    self,
    current_file: "CurrentFile",
    extract_metadata: bool = True
) -> str:
    """
    Run the adaptive, complexity-based extraction over a whole PDF.

    Steps:
        1. Convert the binary data to a document via ``self.file_converter``.
        2. Preprocess it and continue with ``clean_content``.
        3. Optionally prepend formatted metadata.
        4. Extract all tables for the document once.
        5. For each page, analyse complexity and dispatch to the page
           processor matching the recommended strategy.

    The document is always closed before returning or propagating an
    error, so a failure on one page does not leak the open document.

    Args:
        current_file: Mapping with ``"file_path"`` (used for logging and
            table extraction) and ``"file_data"`` (the PDF bytes).
        extract_metadata: Whether to prepend the formatted metadata block.

    Returns:
        Metadata text (if any) followed by one section per non-empty page,
        each prefixed with its page tag, joined by blank lines.

    Raises:
        Exception: Any error raised during processing is logged and
            re-raised unchanged.
    """
    file_path = current_file.get("file_path", "unknown")
    file_data = current_file.get("file_data", b"")

    doc = None
    try:
        # Step 1: Use FileConverter to convert binary to a document object
        doc = self.file_converter.convert(file_data)

        # Step 2: Preprocess - may transform doc in the future
        preprocessed = self.preprocess(doc)
        doc = preprocessed.clean_content  # TRUE SOURCE

        all_pages_text = []
        # Shared across pages and handed to every page processor.
        processed_images: Set[int] = set()

        # Extract metadata
        if extract_metadata:
            metadata_text = self.extract_and_format_metadata(doc)
            if metadata_text:
                all_pages_text.append(metadata_text)

        # Extract all document tables
        # NOTE: file_path is passed for pdfplumber compatibility
        all_tables = self._extract_all_tables(doc, file_path)

        # Process each page
        for page_num in range(len(doc)):
            page = doc[page_num]

            self.logger.debug(f"[PDF] Processing page {page_num + 1}")

            # Complexity analysis
            complexity_analyzer = ComplexityAnalyzer(page, page_num)
            page_complexity = complexity_analyzer.analyze()

            self.logger.info(f"[PDF] Page {page_num + 1}: "
                             f"complexity={page_complexity.overall_complexity.name}, "
                             f"score={page_complexity.overall_score:.2f}, "
                             f"strategy={page_complexity.recommended_strategy.name}")

            # Branch by processing strategy
            strategy = page_complexity.recommended_strategy

            if strategy == ProcessingStrategy.FULL_PAGE_OCR:
                page_text = self._process_page_full_ocr(
                    page, page_num, doc, processed_images, all_tables
                )
            elif strategy == ProcessingStrategy.BLOCK_IMAGE_OCR:
                page_text = self._process_page_block_ocr(
                    page, page_num, doc, processed_images, all_tables,
                    page_complexity.complex_regions
                )
            elif strategy == ProcessingStrategy.HYBRID:
                page_text = self._process_page_hybrid(
                    page, page_num, doc, processed_images, all_tables,
                    page_complexity
                )
            else:
                page_text = self._process_page_text_extraction(
                    page, page_num, doc, processed_images, all_tables
                )

            # Pages that yield only whitespace are left out entirely.
            if page_text.strip():
                page_tag = self.create_page_tag(page_num + 1)
                all_pages_text.append(f"{page_tag}\n{page_text}")

        final_text = "\n\n".join(all_pages_text)
        self.logger.info(f"[PDF] Extracted {len(final_text)} chars from {file_path}")

        return final_text

    except Exception as e:
        self.logger.error(f"[PDF] Error processing {file_path}: {e}")
        self.logger.debug(traceback.format_exc())
        raise

    finally:
        # Close on every exit path. A failure while closing is logged rather
        # than raised so it can never mask the original processing error.
        if doc is not None:
            try:
                doc.close()
            except Exception as close_error:
                self.logger.warning(
                    f"[PDF] Failed to close document {file_path}: {close_error}"
                )
275
+
276
def _process_page_text_extraction(
    self, page, page_num: int, doc, processed_images: Set[int],
    all_tables: Dict[int, List[PageElement]]
) -> str:
    """
    TEXT_EXTRACTION strategy - standard text extraction.

    Collects, in this order: vector-text OCR regions (kept only when they
    have text and a confidence above 0.3), the tables already extracted
    for this page, the text blocks of the page, and the page's images.
    The collected elements are then merged into the page text.
    """
    border_info = detect_page_border(page)

    # Vector text OCR
    ocr_regions = VectorTextOCREngine(page, page_num).detect_and_extract()
    collected: List[PageElement] = [
        PageElement(
            element_type=ElementType.TEXT,
            content=region.ocr_text,
            bbox=region.bbox,
            page_num=page_num
        )
        for region in ocr_regions
        if region.ocr_text and region.confidence > 0.3
    ]

    tables_on_page = all_tables.get(page_num, [])
    collected.extend(tables_on_page)

    # Table bboxes are passed to the text-block and image extractors.
    table_bboxes = [table.bbox for table in tables_on_page]

    collected.extend(
        extract_text_blocks(page, page_num, table_bboxes, border_info)
    )
    collected.extend(
        self._extract_images_from_page(
            page, page_num, doc, processed_images, table_bboxes
        )
    )

    return merge_page_elements(collected)
313
+
314
def _process_page_hybrid(
    self, page, page_num: int, doc, processed_images: Set[int],
    all_tables: Dict[int, List[PageElement]],
    page_complexity: PageComplexity
) -> str:
    """
    HYBRID strategy - text extraction + complex region imaging.

    Text blocks that overlap any of ``page_complexity.complex_regions``
    are dropped; each complex region is instead rendered through the
    block image engine and, when that succeeds, added as an image
    element. Vector-text OCR regions, page tables and page images are
    collected as well before everything is merged.
    """
    border_info = detect_page_border(page)

    ocr_regions = VectorTextOCREngine(page, page_num).detect_and_extract()
    collected: List[PageElement] = [
        PageElement(
            element_type=ElementType.TEXT,
            content=region.ocr_text,
            bbox=region.bbox,
            page_num=page_num
        )
        for region in ocr_regions
        if region.ocr_text and region.confidence > 0.3
    ]

    tables_on_page = all_tables.get(page_num, [])
    collected.extend(tables_on_page)

    table_bboxes = [table.bbox for table in tables_on_page]
    complex_bboxes = page_complexity.complex_regions

    # Keep only the text blocks that do not touch a complex region.
    for block in extract_text_blocks(page, page_num, table_bboxes, border_info):
        if not any(bbox_overlaps(block.bbox, region) for region in complex_bboxes):
            collected.append(block)

    if complex_bboxes:
        block_engine = BlockImageEngine(page, page_num, image_processor=self.format_image_processor)

        for region in complex_bboxes:
            rendered = block_engine.process_region(region, region_type="complex_region")
            if not (rendered.success and rendered.image_tag):
                continue
            collected.append(PageElement(
                element_type=ElementType.IMAGE,
                content=rendered.image_tag,
                bbox=region,
                page_num=page_num
            ))

    collected.extend(
        self._extract_images_from_page(
            page, page_num, doc, processed_images, table_bboxes
        )
    )

    return merge_page_elements(collected)
374
+
375
def _process_page_block_ocr(
    self, page, page_num: int, doc, processed_images: Set[int],
    all_tables: Dict[int, List[PageElement]],
    complex_regions: List[Tuple[float, float, float, float]]
) -> str:
    """
    BLOCK_IMAGE_OCR strategy - render complex regions as images.

    Complex regions that overlap a table of this page are skipped; the
    remaining ones are rendered via the block image engine. Text blocks
    overlapping any complex region are left out. Page tables and page
    images are included, and all elements are merged into the page text.
    """
    tables_on_page = all_tables.get(page_num, [])
    collected: List[PageElement] = list(tables_on_page)
    table_bboxes = [table.bbox for table in tables_on_page]

    if complex_regions:
        block_engine = BlockImageEngine(page, page_num, image_processor=self.format_image_processor)

        for region in complex_regions:
            # Tables are already represented by their own elements.
            overlaps_table = False
            for table_bbox in table_bboxes:
                if bbox_overlaps(region, table_bbox):
                    overlaps_table = True
                    break
            if overlaps_table:
                continue

            rendered = block_engine.process_region(region, region_type="complex_region")
            if rendered.success and rendered.image_tag:
                collected.append(PageElement(
                    element_type=ElementType.IMAGE,
                    content=rendered.image_tag,
                    bbox=region,
                    page_num=page_num
                ))

    border_info = detect_page_border(page)

    for block in extract_text_blocks(page, page_num, table_bboxes, border_info):
        inside_complex = False
        for region in complex_regions:
            if bbox_overlaps(block.bbox, region):
                inside_complex = True
                break
        if not inside_complex:
            collected.append(block)

    collected.extend(
        self._extract_images_from_page(
            page, page_num, doc, processed_images, table_bboxes
        )
    )

    return merge_page_elements(collected)
422
+
423
def _process_page_full_ocr(
    self, page, page_num: int, doc, processed_images: Set[int],
    all_tables: Dict[int, List[PageElement]]
) -> str:
    """
    FULL_PAGE_OCR strategy - advanced smart block processing.

    Processing falls through three tiers:
        1. If the page has extracted tables, or the table quality analysis
           reports a candidate graded EXCELLENT, GOOD or MODERATE, the page
           is handled with regular extraction (tables + text blocks +
           images) and returned.
        2. Otherwise the block image engine's smart page processing is
           used; every successful block becomes an image element.
        3. If smart processing fails, the full page is rendered as a single
           image; if that fails too, regular extraction is used as a last
           resort.

    Args:
        page: Page object of the document being processed.
        page_num: Zero-based page index (logged as ``page_num + 1``).
        doc: The open document, forwarded to image extraction.
        processed_images: Shared set forwarded to image extraction.
        all_tables: Tables per page index, as returned by
            ``_extract_all_tables``.

    Returns:
        The merged text of all collected page elements.
    """
    # Quality grades for which a table candidate counts as processable.
    processable_grades = (
        TableQuality.EXCELLENT, TableQuality.GOOD, TableQuality.MODERATE
    )
    page_elements: List[PageElement] = []

    # Table quality analysis
    table_quality_analyzer = TableQualityAnalyzer(page)
    table_quality_result = table_quality_analyzer.analyze_page_tables()

    page_tables = all_tables.get(page_num, [])
    has_processable_tables = len(page_tables) > 0 or (
        table_quality_result and
        any(candidate.get('quality') in processable_grades
            # "or []" tolerates an explicit None stored under the key.
            for candidate in (table_quality_result.get('table_candidates') or []))
    )

    if has_processable_tables:
        self.logger.info(f"[PDF] Page {page_num + 1}: Found processable tables, "
                         f"using hybrid extraction")

        table_bboxes = [elem.bbox for elem in page_tables]
        page_elements.extend(page_tables)

        border_info = detect_page_border(page)
        text_elements = extract_text_blocks(page, page_num, table_bboxes, border_info)
        page_elements.extend(text_elements)

        image_elements = self._extract_images_from_page(
            page, page_num, doc, processed_images, table_bboxes
        )
        page_elements.extend(image_elements)

        return merge_page_elements(page_elements)

    # Smart block processing
    block_engine = BlockImageEngine(page, page_num, image_processor=self.format_image_processor)
    multi_result: MultiBlockResult = block_engine.process_page_smart()

    if multi_result.success and multi_result.block_results:
        for block_result in multi_result.block_results:
            if block_result.success and block_result.image_tag:
                page_elements.append(PageElement(
                    element_type=ElementType.IMAGE,
                    content=block_result.image_tag,
                    bbox=block_result.bbox,
                    page_num=page_num
                ))

        self.logger.info(f"[PDF] Page {page_num + 1}: Smart block processing - "
                         f"strategy={multi_result.strategy_used.name}, "
                         f"blocks={multi_result.successful_blocks}/{multi_result.total_blocks}")
    else:
        self.logger.warning(f"[PDF] Page {page_num + 1}: Smart processing failed, "
                            f"falling back to full page image")

        result = block_engine.process_full_page(region_type="full_page")

        if result.success and result.image_tag:
            page_elements.append(PageElement(
                element_type=ElementType.IMAGE,
                content=result.image_tag,
                bbox=(0, 0, page.rect.width, page.rect.height),
                page_num=page_num
            ))
        else:
            # Last resort: regular extraction. page_tables is reused from
            # the lookup above instead of querying all_tables again.
            self.logger.warning(f"[PDF] Page {page_num + 1}: Full page image failed")
            border_info = detect_page_border(page)
            table_bboxes = [elem.bbox for elem in page_tables]

            page_elements.extend(page_tables)

            text_elements = extract_text_blocks(page, page_num, table_bboxes, border_info)
            page_elements.extend(text_elements)

            image_elements = self._extract_images_from_page(
                page, page_num, doc, processed_images, table_bboxes
            )
            page_elements.extend(image_elements)

    return merge_page_elements(page_elements)
519
+
520
def _extract_all_tables(self, doc, file_path: str) -> Dict[int, List[PageElement]]:
    """
    Extract tables from the entire document.

    Thin wrapper that forwards to the module-level ``extract_all_tables``
    helper, handing it ``detect_page_border`` and ``is_table_likely_border``
    as callbacks.

    Args:
        doc: Document object, passed through unchanged.
        file_path: PDF file path, passed through unchanged.

    Returns:
        The helper's result unchanged; annotated as a dict keyed by int
        whose values are lists of PageElement.
    """
    tables_by_page = extract_all_tables(
        doc,
        file_path,
        detect_page_border,
        is_table_likely_border,
    )
    return tables_by_page
523
+
524
+ def _extract_images_from_page(
525
+ self, page, page_num: int, doc, processed_images: Set[int],
526
+ table_bboxes: List[Tuple[float, float, float, float]],
527
+ min_image_size: int = 50,
528
+ min_image_area: int = 2500
529
+ ) -> List[PageElement]:
530
+ """Extract images from page using instance's format_image_processor."""
531
+ # Use PDFImageProcessor's integrated method
532
+ image_processor = self.format_image_processor
533
+ if hasattr(image_processor, 'extract_images_from_page'):
534
+ elements_dicts = image_processor.extract_images_from_page(
535
+ page, page_num, doc, processed_images, table_bboxes,
536
+ min_image_size=min_image_size, min_image_area=min_image_area
537
+ )
538
+ # Convert dicts to PageElement
539
+ return [
540
+ PageElement(
541
+ element_type=ElementType.IMAGE,
542
+ content=e['content'],
543
+ bbox=e['bbox'],
544
+ page_num=e['page_num']
545
+ )
546
+ for e in elements_dicts
547
+ ]
548
+ return []
549
+
550
+
551
+ # ============================================================================
552
+ # Legacy Function Interface (for backward compatibility)
553
+ # ============================================================================
554
+
555
def extract_text_from_pdf(
    file_path: str,
    current_config: Dict[str, Any] = None,
    extract_default_metadata: bool = True
) -> str:
    """
    PDF text extraction (legacy function interface).

    Builds a PDFHandler from the given configuration and returns the
    result of its ``extract_text`` call. New code should prefer using
    the PDFHandler class directly.

    Args:
        file_path: PDF file path
        current_config: Configuration dictionary; ``None`` is treated as
            an empty dict. An ``"image_processor"`` entry, when present,
            is passed to the handler.
        extract_default_metadata: Whether to extract metadata (default: True)

    Returns:
        Extracted text (including inline image tags, table HTML)
    """
    config = {} if current_config is None else current_config

    # The image processor is optional; .get() yields None when it is absent.
    handler = PDFHandler(
        config=config,
        image_processor=config.get("image_processor"),
    )
    return handler.extract_text(file_path, extract_metadata=extract_default_metadata)
584
+
585
+
586
+ # ============================================================================
587
+ # Deprecated Legacy Functions (kept for reference, not used)
588
+ # ============================================================================
589
+
590
def _extract_pdf(
    file_path: str,
    current_config: Dict[str, Any],
    extract_default_metadata: bool = True
) -> str:
    """
    Deprecated: Use PDFHandler.extract_text() instead.

    Forwards all arguments to ``extract_text_from_pdf`` and returns its
    result unchanged.

    Args:
        file_path: PDF file path
        current_config: Configuration dictionary
        extract_default_metadata: Whether to extract metadata (default: True)

    Returns:
        The text returned by ``extract_text_from_pdf``.
    """
    extracted_text = extract_text_from_pdf(
        file_path,
        current_config,
        extract_default_metadata,
    )
    return extracted_text
597
+