xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py
|
|
2
|
+
"""
|
|
3
|
+
PDFFileConverter - PDF file format converter
|
|
4
|
+
|
|
5
|
+
Converts binary PDF data to fitz.Document object using PyMuPDF.
|
|
6
|
+
"""
|
|
7
|
+
from typing import Any, Optional, BinaryIO
|
|
8
|
+
|
|
9
|
+
from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class PDFFileConverter(BaseFileConverter):
    """
    File converter for PDF input, backed by PyMuPDF (fitz).

    Turns raw PDF bytes into a fitz.Document and offers a cheap
    signature check plus a cleanup hook for the opened document.
    """

    # Leading bytes that every accepted PDF must start with
    PDF_MAGIC = b'%PDF'

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> Any:
        """
        Open the given PDF bytes as a fitz.Document.

        Args:
            file_data: Raw binary PDF data
            file_stream: Accepted for interface compatibility; ignored,
                the document is always opened from ``file_data``
            **kwargs: Additional options (currently ignored)

        Returns:
            fitz.Document object

        Raises:
            RuntimeError: If PDF cannot be opened
        """
        # Imported lazily so the module can be loaded without PyMuPDF
        # being touched until a conversion is actually requested.
        import fitz

        document = fitz.open(stream=file_data, filetype="pdf")
        return document

    def get_format_name(self) -> str:
        """Return the human-readable name of the handled format."""
        return "PDF Document"

    def validate(self, file_data: bytes) -> bool:
        """
        Check whether the data starts with the PDF signature.

        Args:
            file_data: Raw binary file data

        Returns:
            True if the first four bytes equal ``PDF_MAGIC``; False for
            empty, missing, or too-short data
        """
        if file_data and len(file_data) >= 4:
            return file_data[:4] == self.PDF_MAGIC
        return False

    def close(self, converted_object: Any) -> None:
        """
        Close a previously converted document, if there is one.

        Args:
            converted_object: fitz.Document to close; ``None`` and
                objects without a ``close`` attribute are ignored
        """
        if converted_object is None:
            return
        if not hasattr(converted_object, 'close'):
            return
        converted_object.close()
|
|
72
|
+
|
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Graphic Region Detector for PDF Handler
|
|
3
|
+
|
|
4
|
+
Detects graphic regions (charts, diagrams, icons, etc.) in PDF pages.
|
|
5
|
+
These regions are filtered to avoid being misidentified as tables.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from typing import List, Dict, Tuple, Optional
|
|
10
|
+
|
|
11
|
+
import fitz
|
|
12
|
+
|
|
13
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.types import GraphicRegionInfo, PDFConfig
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# ============================================================================
|
|
19
|
+
# Graphic Region Detector
|
|
20
|
+
# ============================================================================
|
|
21
|
+
|
|
22
|
+
class GraphicRegionDetector:
    """
    Graphic Region Detector

    Detects graphic regions such as charts, diagrams, and icons in PDF pages.
    These regions should be excluded from table detection.

    Criteria for identifying graphics:
    1. High ratio of curves (Bezier curves) - tables are mostly straight lines
    2. Many filled shapes - areas filled with colors
    3. Use of various colors - tables are usually monochromatic
    4. High density of curves/lines within the region

    Typical use: construct with a page, call ``detect()`` once, then query
    ``is_bbox_in_graphic_region()`` (which only sees regions found by the
    last ``detect()`` call that had drawings to analyze).
    """

    def __init__(self, page, page_num: int):
        """
        Args:
            page: PyMuPDF page object
            page_num: Page number (0-indexed)
        """
        self.page = page
        self.page_num = page_num
        self.page_width = page.rect.width
        self.page_height = page.rect.height
        self.graphic_regions: List[GraphicRegionInfo] = []
        self._drawings_cache: Optional[List[Dict]] = None

    def detect(self) -> "List[GraphicRegionInfo]":
        """
        Perform graphic region detection

        Returns:
            List of GraphicRegionInfo judged to be graphics. An empty list
            is returned when the page has no drawings; in that case
            ``self.graphic_regions`` is left as it was.
        """
        drawings = self._get_drawings()
        if not drawings:
            return []

        # Cluster drawings into candidate regions, then score each one
        regions = self._cluster_drawings(drawings)
        for region in regions:
            self._analyze_region(region)

        # Keep only regions identified as graphics
        self.graphic_regions = [r for r in regions if r.is_graphic]

        # Lazy logger arguments: the message is only formatted when DEBUG is on
        logger.debug(
            "[GraphicDetector] Page %s: Found %s graphic regions",
            self.page_num + 1,
            len(self.graphic_regions),
        )

        return self.graphic_regions

    def _get_drawings(self) -> List[Dict]:
        """Return the page's drawings, fetching them from the page only once."""
        if self._drawings_cache is None:
            self._drawings_cache = self.page.get_drawings()
        return self._drawings_cache

    def _cluster_drawings(self, drawings: List[Dict]) -> "List[GraphicRegionInfo]":
        """
        Cluster adjacent drawings into a single region

        Each drawing contributes its item counts (by the item's first
        element: 'c', 'l', 're'), a fill flag and its fill/stroke colors.
        Drawings whose bounding boxes touch within the merge margin are
        accumulated into one region.

        Args:
            drawings: Drawing dicts as returned by ``page.get_drawings()``

        Returns:
            One GraphicRegionInfo per merged region, not yet analyzed
            (``is_graphic=False``, ``confidence=0.0``)
        """
        regions: List[Dict] = []

        for drawing in drawings:
            rect = drawing.get("rect")
            # A missing/None rect is treated like an empty one: nothing to place
            if rect is None or rect.is_empty or rect.is_infinite:
                continue

            items = drawing.get("items") or []
            fill = drawing.get("fill")
            stroke = drawing.get("color")

            # Count each item type in a single pass
            curve_count = line_count = rect_count = 0
            for item in items:
                item_type = item[0]
                if item_type == 'c':
                    curve_count += 1
                elif item_type == 'l':
                    line_count += 1
                elif item_type == 're':
                    rect_count += 1

            # Collect colors (sequences are converted to tuples so they hash)
            colors = set()
            for color in (fill, stroke):
                if color:
                    colors.add(tuple(color) if isinstance(color, (list, tuple)) else color)

            region_data = {
                'bbox': tuple(rect),
                'curve_count': curve_count,
                'line_count': line_count,
                'rect_count': rect_count,
                'fill_count': 1 if fill else 0,
                'colors': colors,
            }

            # Fold into the first existing region it is adjacent to, if any
            for existing in regions:
                if self._should_merge_regions(existing['bbox'], region_data['bbox']):
                    self._merge_region_data(existing, region_data)
                    break
            else:
                regions.append(region_data)

        # Regions may have grown into each other; merge again until stable
        regions = self._iterative_merge(regions)

        # Convert to GraphicRegionInfo
        return [
            GraphicRegionInfo(
                bbox=r['bbox'],
                curve_count=r['curve_count'],
                line_count=r['line_count'],
                rect_count=r['rect_count'],
                fill_count=r['fill_count'],
                color_count=len(r['colors']),
                is_graphic=False,
                confidence=0.0
            )
            for r in regions
        ]

    def _should_merge_regions(self, bbox1: Tuple, bbox2: Tuple, margin: float = 20.0) -> bool:
        """
        Check if two regions should be merged

        Args:
            bbox1: (x0, y0, x1, y1) of the first region
            bbox2: (x0, y0, x1, y1) of the second region
            margin: Gap tolerated between the boxes on each axis

        Returns:
            True if the boxes overlap once bbox1 is grown by ``margin``
            on every side (touching exactly at the margin counts)
        """
        x0_1, y0_1, x1_1, y1_1 = bbox1
        x0_2, y0_2, x1_2, y1_2 = bbox2

        return (x0_1 - margin <= x1_2 and x1_1 + margin >= x0_2 and
                y0_1 - margin <= y1_2 and y1_1 + margin >= y0_2)

    def _merge_region_data(self, target: Dict, source: Dict) -> None:
        """
        Merge ``source`` into ``target`` in place

        ``target`` gets the union bounding box, summed counts and the
        union of colors; ``source`` is not modified.
        """
        # Merge bboxes
        t_bbox, s_bbox = target['bbox'], source['bbox']
        target['bbox'] = (
            min(t_bbox[0], s_bbox[0]),
            min(t_bbox[1], s_bbox[1]),
            max(t_bbox[2], s_bbox[2]),
            max(t_bbox[3], s_bbox[3]),
        )

        # Accumulate counts
        for key in ('curve_count', 'line_count', 'rect_count', 'fill_count'):
            target[key] += source[key]
        target['colors'].update(source['colors'])

    def _iterative_merge(self, regions: List[Dict], max_iterations: int = 5) -> List[Dict]:
        """
        Iteratively merge adjacent regions

        Each pass merges every later region that is adjacent to the
        (growing) current one. Passes repeat until one makes no merge or
        ``max_iterations`` passes have run.

        Args:
            regions: Region dicts as built by ``_cluster_drawings``
            max_iterations: Upper bound on the number of passes

        Returns:
            New list of region dicts; the input dicts are not mutated
        """
        for _ in range(max_iterations):
            merged_any = False
            new_regions = []
            used = set()

            for i, r1 in enumerate(regions):
                if i in used:
                    continue

                # Work on a copy (with its own color set) so inputs stay intact
                current = r1.copy()
                current['colors'] = r1['colors'].copy()

                for j in range(i + 1, len(regions)):
                    if j in used:
                        continue

                    r2 = regions[j]
                    if self._should_merge_regions(current['bbox'], r2['bbox']):
                        self._merge_region_data(current, r2)
                        used.add(j)
                        merged_any = True

                new_regions.append(current)

            regions = new_regions

            if not merged_any:
                break

        return regions

    def _analyze_region(self, region: "GraphicRegionInfo"):
        """
        Analyze whether the region is a graphic

        Sets ``region.is_graphic``, ``region.confidence`` and
        ``region.reason`` in place.

        Criteria for identifying graphics:
        1. High ratio of curves (Bezier)
        2. Many filled shapes
        3. Use of various colors
        4. High line/curve density relative to region size
        5. Chart pattern detection (curve + fill combination)

        Table cells (grid-shaped rectangles) are excluded from graphics.
        """
        total_items = region.curve_count + region.line_count + region.rect_count

        if total_items == 0:
            # Nothing countable: not a graphic. The reason is set here too so
            # every analyzed region carries all three verdict fields.
            region.is_graphic = False
            region.confidence = 0.0
            region.reason = "not_graphic"
            return

        reasons = []
        score = 0.0

        # 1. Curve ratio check (pie charts, curved graphs, etc.)
        curve_ratio = region.curve_count / total_items
        if curve_ratio >= PDFConfig.GRAPHIC_CURVE_RATIO_THRESHOLD:
            score += 0.4
            reasons.append(f"curve_ratio={curve_ratio:.2f}")

        # 2. Minimum curve count check
        if region.curve_count >= PDFConfig.GRAPHIC_MIN_CURVE_COUNT:
            score += 0.2
            reasons.append(f"curves={region.curve_count}")

        # 3. Filled shape ratio
        fill_ratio = region.fill_count / max(1, total_items // 10)  # Rough estimate of shape count
        if fill_ratio >= PDFConfig.GRAPHIC_FILL_RATIO_THRESHOLD:
            score += 0.2
            reasons.append(f"fills={region.fill_count}")

        # 4. Color diversity (charts usually use multiple colors)
        if region.color_count >= PDFConfig.GRAPHIC_COLOR_VARIETY_THRESHOLD:
            score += 0.2
            reasons.append(f"colors={region.color_count}")

        # 5. Chart pattern with curves
        # If curves exist with many fills, high probability of being a chart
        if region.curve_count >= 5 and region.fill_count >= 3:
            score += 0.3
            reasons.append(f"chart_pattern(curves={region.curve_count}, fills={region.fill_count})")

        # 6. Only rectangles with no curves - possibly table cells!
        # Table cells are not graphics
        if region.rect_count >= 5 and region.curve_count == 0 and region.line_count == 0:
            if region.color_count >= 3:
                # Multiple colors = possibly a chart
                score += 0.2
                reasons.append(f"colored_rects(rects={region.rect_count}, colors={region.color_count})")
            else:
                # Single-colored rectangles only = high probability of table cells
                score -= 0.3
                reasons.append(f"likely_table_cells(rects={region.rect_count}, single_color)")

        # 7. Exclude page background (full page size)
        bbox_width = region.bbox[2] - region.bbox[0]
        bbox_height = region.bbox[3] - region.bbox[1]
        if (bbox_width > self.page_width * 0.9 and
                bbox_height > self.page_height * 0.9):
            score = 0.0
            reasons = ["page_background"]

        # 8. Too small regions are not graphics (excluding icons)
        area = bbox_width * bbox_height
        if area < 500:  # Less than approximately 22x22pt
            score *= 0.5

        # Confidence is the score clamped to [0, 1]; the verdict uses the raw score
        region.confidence = min(1.0, max(0.0, score))
        region.is_graphic = score >= 0.5
        region.reason = ", ".join(reasons) if reasons else "not_graphic"

        if region.is_graphic:
            logger.debug(
                "[GraphicDetector] Graphic region detected: %s, score=%.2f, %s",
                region.bbox,
                score,
                region.reason,
            )

    def is_bbox_in_graphic_region(self, bbox: Tuple[float, float, float, float],
                                  threshold: float = 0.3) -> bool:
        """
        Check if the given bbox is within a graphic region

        Args:
            bbox: The region to check
            threshold: Minimum fraction of ``bbox``'s own area that must be
                covered by a single graphic region

        Returns:
            True if within a graphic region
        """
        return any(
            self._calculate_overlap_ratio(bbox, graphic.bbox) >= threshold
            for graphic in self.graphic_regions
        )

    def _calculate_overlap_ratio(self, bbox1: Tuple, bbox2: Tuple) -> float:
        """
        Calculate overlap ratio between two bboxes

        Returns:
            Intersection area divided by the area of ``bbox1`` (not a
            symmetric IoU); 0.0 when the boxes do not overlap or
            ``bbox1`` has no area
        """
        x0 = max(bbox1[0], bbox2[0])
        y0 = max(bbox1[1], bbox2[1])
        x1 = min(bbox1[2], bbox2[2])
        y1 = min(bbox1[3], bbox2[3])

        if x1 <= x0 or y1 <= y0:
            return 0.0

        bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
        if bbox1_area <= 0:
            return 0.0

        overlap_area = (x1 - x0) * (y1 - y0)
        return overlap_area / bbox1_area
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
# ============================================================================
|
|
327
|
+
# Export
|
|
328
|
+
# ============================================================================
|
|
329
|
+
|
|
330
|
+
__all__ = [
|
|
331
|
+
'GraphicRegionDetector',
|
|
332
|
+
]
|