xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,72 @@
1
+ # xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py
2
+ """
3
+ PDFFileConverter - PDF file format converter
4
+
5
+ Converts binary PDF data to fitz.Document object using PyMuPDF.
6
+ """
7
+ from typing import Any, Optional, BinaryIO
8
+
9
+ from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
10
+
11
+
12
class PDFFileConverter(BaseFileConverter):
    """
    PDF file converter using PyMuPDF (fitz).

    Converts binary PDF data to a fitz.Document object, offers a cheap
    magic-number check for PDF-looking data, and closes converted documents.
    """

    # Leading bytes that validate() expects at the very start of the data.
    PDF_MAGIC = b'%PDF'

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> Any:
        """
        Convert binary PDF data to fitz.Document.

        Args:
            file_data: Raw binary PDF data (must be non-empty)
            file_stream: Optional file stream (not used, fitz prefers bytes)
            **kwargs: Additional options (not used by this method)

        Returns:
            fitz.Document object

        Raises:
            RuntimeError: If file_data is empty/None or the PDF cannot be
                opened. (Presumably PyMuPDF's own open errors derive from
                RuntimeError - confirm against the installed version.)
        """
        # fitz (PyMuPDF) is imported at call time, not at module import.
        import fitz

        if not file_data:
            # NOTE(review): per the PyMuPDF Document docs, opening without a
            # stream creates a NEW empty PDF instead of failing (older
            # releases reportedly treat empty bytes the same way). Fail
            # explicitly so a missing payload is never mistaken for a valid
            # zero-page document. Prefer PyMuPDF's EmptyFileError when the
            # installed version provides it, falling back to RuntimeError.
            error_cls = getattr(fitz, "EmptyFileError", RuntimeError)
            raise error_cls("Cannot open PDF: file_data is empty")

        return fitz.open(stream=file_data, filetype="pdf")

    def get_format_name(self) -> str:
        """Return format name."""
        return "PDF Document"

    def validate(self, file_data: bytes) -> bool:
        """
        Validate if data is a valid PDF.

        Only the leading magic bytes are compared; the rest of the data is
        not inspected.

        Args:
            file_data: Raw binary file data

        Returns:
            True if file appears to be a PDF (starts with PDF_MAGIC)
        """
        # Derive the prefix length from the constant so the check stays
        # correct if PDF_MAGIC is ever changed or overridden.
        magic = self.PDF_MAGIC
        if not file_data or len(file_data) < len(magic):
            return False
        return file_data[:len(magic)] == magic

    def close(self, converted_object: Any) -> None:
        """
        Close the fitz.Document.

        Objects that are None or have no ``close`` attribute are ignored.

        Args:
            converted_object: fitz.Document to close
        """
        if converted_object is not None and hasattr(converted_object, 'close'):
            converted_object.close()
72
+
@@ -0,0 +1,332 @@
1
+ """
2
+ Graphic Region Detector for PDF Handler
3
+
4
+ Detects graphic regions (charts, diagrams, icons, etc.) in PDF pages.
5
+ These regions are filtered to avoid being misidentified as tables.
6
+ """
7
+
8
+ import logging
9
+ from typing import List, Dict, Tuple, Optional
10
+
11
+ import fitz
12
+
13
+ from xgen_doc2chunk.core.processor.pdf_helpers.types import GraphicRegionInfo, PDFConfig
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ # ============================================================================
19
+ # Graphic Region Detector
20
+ # ============================================================================
21
+
22
class GraphicRegionDetector:
    """
    Graphic Region Detector

    Finds graphic regions (charts, diagrams, icons) on a PDF page so that
    they can be excluded from table detection.

    The page's vector drawings are grouped into clusters of nearby drawings
    and every cluster is scored with these heuristics:
    1. High ratio of curves (Bezier curves) - tables are mostly straight lines
    2. Many filled shapes - areas filled with colors
    3. Use of various colors - tables are usually monochromatic
    4. Curves combined with fills (chart pattern)
    """

    def __init__(self, page, page_num: int):
        """
        Args:
            page: PyMuPDF page object
            page_num: Page number (0-indexed)
        """
        self.page = page
        self.page_num = page_num
        page_rect = page.rect
        self.page_width = page_rect.width
        self.page_height = page_rect.height
        # Result of the most recent detect() call that found drawings.
        self.graphic_regions: List[GraphicRegionInfo] = []
        # Filled lazily by _get_drawings().
        self._drawings_cache: Optional[List[Dict]] = None

    def detect(self) -> List[GraphicRegionInfo]:
        """
        Run graphic region detection on the page.

        Returns:
            List of GraphicRegionInfo classified as graphics; an empty list
            when the page has no drawings.
        """
        drawings = self._get_drawings()
        if not drawings:
            return []

        # Cluster first, then score every cluster in place.
        candidates = self._cluster_drawings(drawings)
        for candidate in candidates:
            self._analyze_region(candidate)

        # Keep only the clusters the scoring marked as graphics.
        self.graphic_regions = [c for c in candidates if c.is_graphic]

        logger.debug(f"[GraphicDetector] Page {self.page_num + 1}: Found {len(self.graphic_regions)} graphic regions")

        return self.graphic_regions

    def _get_drawings(self) -> List[Dict]:
        """Return the page drawings, fetching them once and caching the result."""
        cached = self._drawings_cache
        if cached is None:
            cached = self.page.get_drawings()
            self._drawings_cache = cached
        return cached

    def _cluster_drawings(self, drawings: List[Dict]) -> List[GraphicRegionInfo]:
        """
        Group adjacent drawings into candidate regions.

        Drawings whose rect is empty or infinite are skipped. Each remaining
        drawing is folded into the first existing cluster it is close to
        (see _should_merge_regions) or starts a new cluster; the clusters are
        then merged again via _iterative_merge.

        Returns:
            One GraphicRegionInfo per cluster, not yet classified
            (is_graphic=False, confidence=0.0).
        """
        clusters: List[Dict] = []

        for drawing in drawings:
            rect = drawing.get("rect", fitz.Rect())
            if rect.is_empty or rect.is_infinite:
                continue

            fill = drawing.get("fill")
            stroke = drawing.get("color")

            # Tally path items by their type tag in a single pass.
            curve_total = line_total = rect_total = 0
            for item in drawing.get("items", []):
                kind = item[0]
                if kind == 'c':
                    curve_total += 1
                elif kind == 'l':
                    line_total += 1
                elif kind == 're':
                    rect_total += 1

            # Distinct fill/stroke colors; sequences become hashable tuples.
            palette = set()
            for paint in (fill, stroke):
                if paint:
                    palette.add(tuple(paint) if isinstance(paint, (list, tuple)) else paint)

            region_data = {
                'bbox': tuple(rect),
                'curve_count': curve_total,
                'line_count': line_total,
                'rect_count': rect_total,
                'fill_count': 1 if fill else 0,
                'colors': palette,
            }

            # Fold into the first nearby cluster; otherwise open a new one.
            for existing in clusters:
                if self._should_merge_regions(existing['bbox'], region_data['bbox']):
                    self._merge_region_data(existing, region_data)
                    break
            else:
                clusters.append(region_data)

        # Clusters that grew may now touch each other; merge them as well.
        clusters = self._iterative_merge(clusters)

        return [
            GraphicRegionInfo(
                bbox=cluster['bbox'],
                curve_count=cluster['curve_count'],
                line_count=cluster['line_count'],
                rect_count=cluster['rect_count'],
                fill_count=cluster['fill_count'],
                color_count=len(cluster['colors']),
                is_graphic=False,
                confidence=0.0
            )
            for cluster in clusters
        ]

    def _should_merge_regions(self, bbox1: Tuple, bbox2: Tuple, margin: float = 20.0) -> bool:
        """
        Tell whether two (x0, y0, x1, y1) boxes overlap or lie within
        ``margin`` of each other on both axes.
        """
        ax0, ay0, ax1, ay1 = bbox1
        bx0, by0, bx1, by1 = bbox2

        near_on_x = ax0 - margin <= bx1 and ax1 + margin >= bx0
        near_on_y = ay0 - margin <= by1 and ay1 + margin >= by0
        return near_on_x and near_on_y

    def _merge_region_data(self, target: Dict, source: Dict):
        """
        Fold ``source`` into ``target`` in place: bbox becomes the union box,
        counts are summed and color sets are united. ``source`` is not modified.
        """
        t_box = target['bbox']
        s_box = source['bbox']
        target['bbox'] = (
            min(t_box[0], s_box[0]),
            min(t_box[1], s_box[1]),
            max(t_box[2], s_box[2]),
            max(t_box[3], s_box[3]),
        )

        for key in ('curve_count', 'line_count', 'rect_count', 'fill_count'):
            target[key] += source[key]
        target['colors'].update(source['colors'])

    def _iterative_merge(self, regions: List[Dict], max_iterations: int = 5) -> List[Dict]:
        """
        Repeatedly merge regions that are close to each other.

        Each pass walks the regions in order, absorbing every later region
        that is close to the (growing) current one. Passes stop as soon as
        one merges nothing, or after ``max_iterations`` passes. The input
        dicts are copied before merging, so they are left unchanged.
        """
        for _ in range(max_iterations):
            absorbed = set()
            merged_pass = []

            for idx, seed in enumerate(regions):
                if idx in absorbed:
                    continue

                # Work on a copy (with its own color set) of the seed region.
                combined = seed.copy()
                combined['colors'] = seed['colors'].copy()

                for other_idx in range(idx + 1, len(regions)):
                    if other_idx in absorbed:
                        continue
                    other = regions[other_idx]
                    if self._should_merge_regions(combined['bbox'], other['bbox']):
                        self._merge_region_data(combined, other)
                        absorbed.add(other_idx)

                merged_pass.append(combined)

            regions = merged_pass

            # Nothing absorbed in this pass means the result is stable.
            if not absorbed:
                break

        return regions

    def _analyze_region(self, region: GraphicRegionInfo):
        """
        Score a region and set its is_graphic / confidence / reason in place.

        Scoring (thresholds come from PDFConfig):
        1. Curve ratio at or above the threshold: +0.4
        2. Curve count at or above the minimum: +0.2
        3. Fill ratio at or above the threshold: +0.2
        4. Color count at or above the variety threshold: +0.2
        5. At least 5 curves together with at least 3 fills: +0.3
        6. Rectangles only (>= 5, no curves, no lines): +0.2 when at least
           3 colors are used, otherwise -0.3 (likely table cells)
        7. Region covering more than 90% of page width and height: score 0
        8. Area below 500: score halved

        A region is a graphic when the final score is >= 0.5; confidence is
        the score clamped to [0.0, 1.0]. Regions without any counted item
        are marked non-graphic immediately (reason is left untouched).
        """
        total_items = region.curve_count + region.line_count + region.rect_count

        if total_items == 0:
            region.is_graphic = False
            region.confidence = 0.0
            return

        reasons = []
        score = 0.0

        # 1. Curve ratio check (pie charts, curved graphs, etc.)
        if total_items > 0:
            curve_ratio = region.curve_count / total_items
        else:
            curve_ratio = 0
        if curve_ratio >= PDFConfig.GRAPHIC_CURVE_RATIO_THRESHOLD:
            score += 0.4
            reasons.append(f"curve_ratio={curve_ratio:.2f}")

        # 2. Minimum curve count check
        if region.curve_count >= PDFConfig.GRAPHIC_MIN_CURVE_COUNT:
            score += 0.2
            reasons.append(f"curves={region.curve_count}")

        # 3. Filled shape ratio; total_items // 10 is a rough shape-count estimate
        estimated_shapes = max(1, total_items // 10)
        fill_ratio = region.fill_count / estimated_shapes
        if fill_ratio >= PDFConfig.GRAPHIC_FILL_RATIO_THRESHOLD:
            score += 0.2
            reasons.append(f"fills={region.fill_count}")

        # 4. Color diversity (charts usually use multiple colors)
        if region.color_count >= PDFConfig.GRAPHIC_COLOR_VARIETY_THRESHOLD:
            score += 0.2
            reasons.append(f"colors={region.color_count}")

        # 5. Chart pattern: curves together with several fills
        if region.curve_count >= 5 and region.fill_count >= 3:
            score += 0.3
            reasons.append(f"chart_pattern(curves={region.curve_count}, fills={region.fill_count})")

        # 6. Rectangles only - table cells unless several colors are used
        rectangles_only = (
            region.rect_count >= 5
            and region.curve_count == 0
            and region.line_count == 0
        )
        if rectangles_only:
            if region.color_count >= 3:
                # Multiple colors = possibly a chart
                score += 0.2
                reasons.append(f"colored_rects(rects={region.rect_count}, colors={region.color_count})")
            else:
                # Few colors = high probability of table cells
                score -= 0.3
                reasons.append(f"likely_table_cells(rects={region.rect_count}, single_color)")

        # 7. Exclude page background (nearly full page size)
        bbox_width = region.bbox[2] - region.bbox[0]
        bbox_height = region.bbox[3] - region.bbox[1]
        covers_page = (
            bbox_width > self.page_width * 0.9
            and bbox_height > self.page_height * 0.9
        )
        if covers_page:
            score = 0.0
            reasons = ["page_background"]

        # 8. Very small regions (less than approximately 22x22pt) count half
        if bbox_width * bbox_height < 500:
            score *= 0.5

        region.confidence = min(1.0, max(0.0, score))
        region.is_graphic = score >= 0.5
        region.reason = ", ".join(reasons) if reasons else "not_graphic"

        if region.is_graphic:
            logger.debug(f"[GraphicDetector] Graphic region detected: {region.bbox}, score={score:.2f}, {region.reason}")

    def is_bbox_in_graphic_region(self, bbox: Tuple[float, float, float, float],
                                   threshold: float = 0.3) -> bool:
        """
        Check if the given bbox is within a graphic region.

        Only regions stored in ``self.graphic_regions`` are considered.

        Args:
            bbox: The region to check
            threshold: Minimum share of ``bbox`` that must be covered by a
                single graphic region

        Returns:
            True if within a graphic region
        """
        return any(
            self._calculate_overlap_ratio(bbox, graphic.bbox) >= threshold
            for graphic in self.graphic_regions
        )

    def _calculate_overlap_ratio(self, bbox1: Tuple, bbox2: Tuple) -> float:
        """
        Return the share of ``bbox1``'s area covered by ``bbox2``.

        0.0 is returned when the boxes do not intersect or when ``bbox1``
        has no positive area.
        """
        left = max(bbox1[0], bbox2[0])
        top = max(bbox1[1], bbox2[1])
        right = min(bbox1[2], bbox2[2])
        bottom = min(bbox1[3], bbox2[3])

        if right <= left or bottom <= top:
            return 0.0

        bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
        if bbox1_area <= 0:
            return 0.0

        return (right - left) * (bottom - top) / bbox1_area
324
+
325
+
326
+ # ============================================================================
327
+ # Export
328
+ # ============================================================================
329
+
330
# Public API of this module: only the detector class is exported.
__all__ = ["GraphicRegionDetector"]