natural-pdf 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +8 -0
- natural_pdf/analyzers/checkbox/__init__.py +6 -0
- natural_pdf/analyzers/checkbox/base.py +265 -0
- natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
- natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
- natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
- natural_pdf/analyzers/checkbox/mixin.py +95 -0
- natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
- natural_pdf/collections/mixins.py +14 -5
- natural_pdf/core/element_manager.py +5 -1
- natural_pdf/core/page.py +61 -0
- natural_pdf/core/page_collection.py +41 -1
- natural_pdf/core/pdf.py +24 -1
- natural_pdf/describe/base.py +20 -0
- natural_pdf/elements/base.py +152 -10
- natural_pdf/elements/element_collection.py +41 -2
- natural_pdf/elements/region.py +115 -2
- natural_pdf/judge.py +1509 -0
- natural_pdf/selectors/parser.py +42 -1
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/RECORD +41 -17
- temp/check_model.py +49 -0
- temp/check_pdf_content.py +9 -0
- temp/checkbox_checks.py +590 -0
- temp/checkbox_simple.py +117 -0
- temp/checkbox_ux_ideas.py +400 -0
- temp/context_manager_prototype.py +177 -0
- temp/convert_to_hf.py +60 -0
- temp/demo_text_closest.py +66 -0
- temp/inspect_model.py +43 -0
- temp/rtdetr_dinov2_test.py +49 -0
- temp/test_closest_debug.py +26 -0
- temp/test_closest_debug2.py +22 -0
- temp/test_context_exploration.py +85 -0
- temp/test_durham.py +30 -0
- temp/test_empty_string.py +16 -0
- temp/test_similarity.py +15 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/top_level.txt +0 -0
natural_pdf/__init__.py
CHANGED
@@ -66,6 +66,7 @@ class Options:
|
|
66
66
|
self.layout = ConfigSection(
|
67
67
|
directional_offset=0.01, # Offset in points when using directional methods
|
68
68
|
auto_multipage=False, # Whether directional methods span pages by default
|
69
|
+
directional_within=None, # Region to constrain directional operations to
|
69
70
|
)
|
70
71
|
|
71
72
|
|
@@ -126,6 +127,9 @@ from natural_pdf.elements.region import Region
|
|
126
127
|
from natural_pdf.flows.flow import Flow
|
127
128
|
from natural_pdf.flows.region import FlowRegion
|
128
129
|
|
130
|
+
# Judge for visual classification
|
131
|
+
from natural_pdf.judge import Decision, Judge, JudgeError, PickResult
|
132
|
+
|
129
133
|
# Search options (if extras installed)
|
130
134
|
try:
|
131
135
|
from natural_pdf.search.search_options import (
|
@@ -165,6 +169,10 @@ __all__ = [
|
|
165
169
|
"Flow",
|
166
170
|
"FlowRegion",
|
167
171
|
"Guides",
|
172
|
+
"Judge",
|
173
|
+
"Decision",
|
174
|
+
"PickResult",
|
175
|
+
"JudgeError",
|
168
176
|
"TextSearchOptions",
|
169
177
|
"MultiModalSearchOptions",
|
170
178
|
"BaseSearchOptions",
|
@@ -0,0 +1,265 @@
|
|
1
|
+
"""Base class for checkbox detection engines."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
from abc import ABC, abstractmethod
|
5
|
+
from typing import Any, Dict, List, Set
|
6
|
+
|
7
|
+
from PIL import Image
|
8
|
+
|
9
|
+
from .checkbox_options import CheckboxOptions
|
10
|
+
|
11
|
+
logger = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
class CheckboxDetector(ABC):
    """Abstract base class for checkbox detection engines.

    This class defines the standard interface that all checkbox detection engines
    must implement in natural-pdf. Checkbox detectors analyze document images to
    identify checkboxes and their states (checked/unchecked).

    Subclasses must implement:
    - detect(): Core checkbox detection for a single image
    - is_available(): Check if engine dependencies are installed
    - _load_model_from_options(): Load and configure the detection model

    Subclasses may override:
    - _get_cache_key(): The base implementation only keys on class name and
      device; engines with several models should add model specifics.

    Attributes:
        logger: Logger instance for the specific detector.
        _model_cache: Dictionary cache for loaded model instances.
    """

    # Substrings that mark a label as "checked" in the fallback heuristic of
    # _map_label_to_state (used only when the label is not in label_mapping).
    _CHECKED_TERMS = ("checked", "tick", "filled")

    def __init__(self):
        """Initialize the base checkbox detector."""
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
        self.logger.info(f"Initializing {self.__class__.__name__}")
        self._model_cache: Dict[str, Any] = {}  # Cache for initialized models

    @abstractmethod
    def detect(self, image: "Image.Image", options: "CheckboxOptions") -> List[Dict[str, Any]]:
        """
        Detect checkboxes in a given PIL Image.

        Args:
            image: PIL Image of the page/region to analyze.
            options: Instance of CheckboxOptions with configuration.

        Returns:
            List of detection dictionaries with:
            - 'bbox': Tuple[float, float, float, float] - (x0, y0, x1, y1) relative to image
            - 'class': str - Original class name from model (e.g., 'checkbox', 'checked_checkbox')
            - 'normalized_class': str - Always 'checkbox'
            - 'is_checked': bool - Whether checkbox is checked
            - 'checkbox_state': str - 'checked' or 'unchecked'
            - 'confidence': float - Confidence score (0.0-1.0)
            - 'model': str - Name of the model used
            - 'source': str - Always 'checkbox'
        """
        raise NotImplementedError("Subclasses must implement this method")

    @classmethod
    @abstractmethod
    def is_available(cls) -> bool:
        """
        Check if the detector's dependencies are installed and usable.

        Returns:
            True if the detector is available, False otherwise.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _get_cache_key(self, options: "CheckboxOptions") -> str:
        """
        Generate a cache key for model loading based on relevant options.

        Args:
            options: The options dataclass instance; only ``options.device`` is read here.

        Returns:
            A string of the form ``"<ClassName>_<device lower-cased>"``.
        """
        # Base key includes device, subclasses should add model specifics
        device_key = str(options.device).lower()
        return f"{self.__class__.__name__}_{device_key}"

    def _get_model(self, options: "CheckboxOptions") -> Any:
        """
        Get or initialize the underlying model based on options, using caching.

        Args:
            options: Options used both to build the cache key and to load the model.

        Returns:
            The cached model object for the computed cache key.

        Raises:
            RuntimeError: If ``is_available()`` reports missing dependencies.
            Exception: Any error raised by ``_load_model_from_options`` is logged
                and re-raised unchanged.
        """
        cache_key = self._get_cache_key(options)
        if cache_key in self._model_cache:
            self.logger.debug(f"Using cached model for key: {cache_key}")
            return self._model_cache[cache_key]

        self.logger.info(f"Loading model for cache key: {cache_key}")
        try:
            # Ensure dependencies are met before loading
            if not self.is_available():
                raise RuntimeError(f"{self.__class__.__name__} dependencies are not met.")
            self._model_cache[cache_key] = self._load_model_from_options(options)
            self.logger.info(f"Model loaded successfully for key: {cache_key}")
        except Exception as e:
            self.logger.error(f"Failed to load model for key {cache_key}: {e}", exc_info=True)
            # Remove potentially corrupted cache entry
            self._model_cache.pop(cache_key, None)
            raise
        return self._model_cache[cache_key]

    @abstractmethod
    def _load_model_from_options(self, options: "CheckboxOptions") -> Any:
        """
        Load and configure the detection model based on provided options.

        Args:
            options: The options dataclass instance.

        Returns:
            The loaded model object(s).
        """
        raise NotImplementedError("Subclasses must implement _load_model_from_options")

    def _map_label_to_state(self, label: str, options: "CheckboxOptions") -> tuple[bool, str]:
        """
        Map model output label to checkbox state.

        The label is lower-cased and stripped, then looked up in
        ``options.label_mapping``. If it is not mapped, a substring heuristic is
        used: a "checked"/"tick"/"filled" term means checked unless that
        occurrence is negated ("un..." directly before it, or "not" before it,
        optionally separated by spaces, underscores or hyphens). A label with
        no such term is treated as checked only if it contains "1".

        Args:
            label: Raw label from model (e.g., 'checked_checkbox', '1')
            options: Options containing label mapping

        Returns:
            Tuple of (is_checked: bool, state: str)
        """
        normalized_label = str(label).lower().strip()

        # An explicit mapping always wins over the heuristic.
        if normalized_label in options.label_mapping:
            state = options.label_mapping[normalized_label]
            return state == "checked", state

        # Fallback heuristic. A plain substring test would classify
        # "unchecked" as checked (it contains "checked"), so every occurrence
        # is inspected for a negating prefix.
        saw_negated_term = False
        for term in self._CHECKED_TERMS:
            start = normalized_label.find(term)
            while start != -1:
                prefix = normalized_label[:start]
                if prefix.endswith("un") or prefix.rstrip(" _-").endswith("not"):
                    saw_negated_term = True
                else:
                    return True, "checked"
                start = normalized_label.find(term, start + 1)

        # Numeric-style labels: "1" means checked, but never override an
        # explicit negated term such as "unchecked_1".
        if not saw_negated_term and "1" in normalized_label:
            return True, "checked"
        return False, "unchecked"

    def _apply_nms(
        self, detections: List[Dict[str, Any]], iou_threshold: float
    ) -> List[Dict[str, Any]]:
        """
        Apply non-maximum suppression to remove overlapping detections.
        For checkboxes, we reject ANY meaningful overlap.

        Detections are visited from highest to lowest confidence (ties broken
        by smaller area first); a detection is kept only if it does not overlap
        any detection already kept.

        Args:
            detections: List of detection dictionaries; each needs 'bbox' and 'confidence'.
            iou_threshold: IoU threshold for suppression (ignored for checkboxes - we use stricter rules)

        Returns:
            Filtered list of detections, ordered by descending confidence.
            An empty input is returned as-is.
        """
        if not detections:
            return detections

        def sort_key(det):
            # Confidence descending, then area ascending to prefer smaller boxes.
            x0, y0, x1, y1 = det["bbox"]
            return (-det["confidence"], (x1 - x0) * (y1 - y0))

        keep: List[Dict[str, Any]] = []
        for det in sorted(detections, key=sort_key):
            det_bbox = det["bbox"]
            # First already-kept box that overlaps this one, if any.
            blocking_bbox = next(
                (
                    kept_det["bbox"]
                    for kept_det in keep
                    if self._boxes_overlap(det_bbox, kept_det["bbox"])
                ),
                None,
            )
            if blocking_bbox is not None:
                self.logger.debug(f"Rejecting box {det_bbox} due to overlap with {blocking_bbox}")
                continue

            keep.append(det)
            self.logger.debug(f"Keeping box {det_bbox} with confidence {det['confidence']}")

        self.logger.info(f"NMS: Reduced {len(detections)} detections to {len(keep)}")
        return keep

    def _boxes_overlap(self, box1: tuple, box2: tuple) -> bool:
        """Check if two (x0, y0, x1, y1) boxes overlap; touching edges do not count."""
        x1_min, y1_min, x1_max, y1_max = box1
        x2_min, y2_min, x2_max, y2_max = box2

        separated_horizontally = x1_max <= x2_min or x2_max <= x1_min
        separated_vertically = y1_max <= y2_min or y2_max <= y1_min
        return not (separated_horizontally or separated_vertically)

    @staticmethod
    def _intersection_area(box1: tuple, box2: tuple) -> float:
        """Return the area shared by two (x0, y0, x1, y1) boxes, 0.0 if disjoint."""
        x1_min, y1_min, x1_max, y1_max = box1
        x2_min, y2_min, x2_max, y2_max = box2

        inter_width = min(x1_max, x2_max) - max(x1_min, x2_min)
        inter_height = min(y1_max, y2_max) - max(y1_min, y2_min)
        if inter_width < 0 or inter_height < 0:
            return 0.0
        return inter_width * inter_height

    @staticmethod
    def _box_area(box: tuple) -> float:
        """Return width * height of an (x0, y0, x1, y1) box."""
        x_min, y_min, x_max, y_max = box
        return (x_max - x_min) * (y_max - y_min)

    def _compute_intersection_ratio(self, box1: tuple, box2: tuple) -> float:
        """
        Compute intersection ratio relative to the smaller box.
        This is more aggressive than IoU for checkbox detection.

        Returns:
            intersection area / area of the smaller box, or 0.0 when the boxes
            are disjoint or the smaller box has zero area.
        """
        inter_area = self._intersection_area(box1, box2)
        smaller_area = min(self._box_area(box1), self._box_area(box2))
        if inter_area == 0 or smaller_area == 0:
            return 0.0
        return inter_area / smaller_area

    def _compute_iou(self, box1: tuple, box2: tuple) -> float:
        """
        Compute IoU between two boxes.

        Returns:
            intersection area / union area, or 0.0 when the boxes are disjoint
            or the union has zero area.
        """
        inter_area = self._intersection_area(box1, box2)
        union_area = self._box_area(box1) + self._box_area(box2) - inter_area
        if inter_area == 0 or union_area == 0:
            return 0.0
        return inter_area / union_area

    def __del__(self):
        """Cleanup resources."""
        # __del__ also runs for instances whose __init__ never completed, so
        # neither attribute may be assumed to exist.
        model_cache = getattr(self, "_model_cache", None)
        if model_cache is not None:
            model_cache.clear()

        detector_logger = getattr(self, "logger", None)
        if detector_logger is not None:
            try:
                detector_logger.info(f"Cleaning up {self.__class__.__name__} resources.")
            except Exception:
                # Best effort only: a finalizer must never raise, and logging
                # can be partially torn down while the interpreter exits.
                pass
|
@@ -0,0 +1,329 @@
|
|
1
|
+
"""Checkbox analyzer for PDF pages and regions."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
from typing import Any, Dict, List, Optional, Union
|
5
|
+
|
6
|
+
from PIL import Image
|
7
|
+
|
8
|
+
from natural_pdf.elements.region import Region
|
9
|
+
|
10
|
+
from .checkbox_manager import CheckboxManager
|
11
|
+
from .checkbox_options import CheckboxOptions
|
12
|
+
|
13
|
+
logger = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
class CheckboxAnalyzer:
    """
    Handles checkbox analysis for PDF pages and regions, including image rendering,
    coordinate scaling, region creation, and result storage.
    """

    # Rendering DPI used when neither the caller nor the options supply one.
    _DEFAULT_RESOLUTION = 150

    def __init__(self, element, checkbox_manager: Optional[CheckboxManager] = None):
        """
        Initialize the checkbox analyzer.

        Args:
            element: The Page or Region object to analyze
            checkbox_manager: Optional CheckboxManager instance. If None, creates a new one.
        """
        self._element = element
        self._checkbox_manager = checkbox_manager or CheckboxManager()

        # Duck-typed classification: an element with `bbox` and `_page` is
        # handled as a region, anything else as a page.
        self._is_page = hasattr(element, "number") and hasattr(element, "_parent")
        self._is_region = hasattr(element, "bbox") and hasattr(element, "_page")

        self._page = element._page if self._is_region else element

    @staticmethod
    def _read_option(options, name: str, default: Any = None) -> Any:
        """Read option *name* from a dict or an options object, else *default*."""
        if not options:
            return default
        if isinstance(options, dict):
            return options.get(name, default)
        return getattr(options, name, default)

    def detect_checkboxes(
        self,
        engine: Optional[str] = None,
        options: Optional[Union[CheckboxOptions, Dict[str, Any]]] = None,
        confidence: Optional[float] = None,
        resolution: Optional[int] = None,
        device: Optional[str] = None,
        existing: str = "replace",
        limit: Optional[int] = None,
        **kwargs,
    ) -> List[Region]:
        """
        Detect checkboxes in the page or region.

        The page is rendered once. For a region the rendered page image is
        cropped to the region's bbox before detection, and detections are
        mapped back to PDF coordinates through the page image's scale.

        Args:
            engine: Name of the detection engine (default: 'rtdetr')
            options: CheckboxOptions instance or dict of options
            confidence: Minimum confidence threshold
            resolution: DPI for rendering (default: 150). Falls back to the
                ``resolution`` entry/attribute of ``options`` when not given.
            device: Device for inference
            existing: How to handle existing checkbox regions: 'replace' (default) or 'append'.
                Any value other than 'append' (case-insensitive) is treated as replace.
                Replace swaps out every checkbox region stored on the page, even
                when a region is being analyzed.
            limit: Maximum number of checkboxes to detect
            **kwargs: Additional engine-specific arguments. ``reject_with_text``
                (default True) drops detections whose area contains more than one
                character of text, or a single alphanumeric character; when
                given here it takes precedence over the value in ``options``.

        Returns:
            List of created Region objects representing checkboxes. An empty
            list is returned if rendering or detection fails (the error is logged).
        """
        logger.info(
            f"Detecting checkboxes (Engine: {engine or 'default'}, "
            f"Element type: {'region' if self._is_region else 'page'})"
        )

        # Explicit simple arguments and extra kwargs are forwarded to the
        # manager as overrides, whether or not `options` was supplied.
        final_options = options
        final_kwargs: Dict[str, Any] = {}
        if confidence is not None:
            final_kwargs["confidence"] = confidence
        if resolution is not None:
            final_kwargs["resolution"] = resolution
        if device is not None:
            final_kwargs["device"] = device
        final_kwargs.update(kwargs)

        resolution_val = (
            resolution
            or self._read_option(final_options, "resolution")
            or self._DEFAULT_RESOLUTION
        )

        # Loop-invariant: decide once whether text-bearing detections are dropped.
        if "reject_with_text" in kwargs:
            reject_with_text = kwargs["reject_with_text"]
        else:
            reject_with_text = self._read_option(final_options, "reject_with_text", True)

        # Render image
        try:
            page_image = self._page.render(resolution=resolution_val)
            if not page_image:
                raise ValueError("Page rendering returned None")

            # Page-image pixels -> PDF units; shared by the page and region paths.
            px_to_pdf_x = self._page.width / page_image.width
            px_to_pdf_y = self._page.height / page_image.height

            if self._is_region:
                # For regions, crop the page image to just the region bounds
                img_scale_x = page_image.width / self._page.width
                img_scale_y = page_image.height / self._page.height

                x0, y0, x1, y1 = self._element.bbox
                img_x0 = int(x0 * img_scale_x)
                img_y0 = int(y0 * img_scale_y)
                img_x1 = int(x1 * img_scale_x)
                img_y1 = int(y1 * img_scale_y)

                image = page_image.crop((img_x0, img_y0, img_x1, img_y1))

                # Detections are relative to the crop; this shifts them back
                # into page-image coordinates.
                crop_offset = (img_x0, img_y0)
            else:
                # For pages, use the full image
                image = page_image
                crop_offset = (0, 0)

            logger.debug(f"Rendered image size: {image.width}x{image.height}")

        except Exception as e:
            logger.error(f"Failed to render image: {e}", exc_info=True)
            return []

        if image.width == 0 or image.height == 0:
            # A degenerate region bbox yields an empty crop; nothing to detect.
            logger.warning("Rendered image is empty; skipping checkbox detection")
            return []

        # Run detection
        try:
            detections = self._checkbox_manager.detect_checkboxes(
                image=image, engine=engine, options=final_options, **final_kwargs
            )
            logger.info(f"Detected {len(detections)} checkboxes")
        except Exception as e:
            logger.error(f"Checkbox detection failed: {e}", exc_info=True)
            return []

        # Process detections into regions
        checkbox_regions = []

        for detection in detections:
            try:
                # Get image coordinates
                img_x0, img_y0, img_x1, img_y1 = detection["bbox"]

                # Crop-relative pixels -> page-image pixels -> PDF coordinates.
                pdf_x0 = (img_x0 + crop_offset[0]) * px_to_pdf_x
                pdf_y0 = (img_y0 + crop_offset[1]) * px_to_pdf_y
                pdf_x1 = (img_x1 + crop_offset[0]) * px_to_pdf_x
                pdf_y1 = (img_y1 + crop_offset[1]) * px_to_pdf_y

                # Ensure valid bounds: order the corners, then clamp to the page.
                pdf_x0, pdf_x1 = min(pdf_x0, pdf_x1), max(pdf_x0, pdf_x1)
                pdf_y0, pdf_y1 = min(pdf_y0, pdf_y1), max(pdf_y0, pdf_y1)
                pdf_x0 = max(0, pdf_x0)
                pdf_y0 = max(0, pdf_y0)
                pdf_x1 = min(self._page.width, pdf_x1)
                pdf_y1 = min(self._page.height, pdf_y1)

                # For region detection, skip checkboxes whose center lies
                # outside the region bounds.
                if self._is_region:
                    region_x0, region_y0, region_x1, region_y1 = self._element.bbox
                    cb_center_x = (pdf_x0 + pdf_x1) / 2
                    cb_center_y = (pdf_y0 + pdf_y1) / 2
                    if not (
                        region_x0 <= cb_center_x <= region_x1
                        and region_y0 <= cb_center_y <= region_y1
                    ):
                        continue  # Skip this checkbox

                # Create region
                region = Region(self._page, (pdf_x0, pdf_y0, pdf_x1, pdf_y1))
                region.region_type = "checkbox"
                region.normalized_type = "checkbox"
                region.is_checked = detection.get("is_checked", False)
                region.checkbox_state = detection.get("checkbox_state", "unchecked")
                region.confidence = detection.get("confidence", 0.0)
                region.model = detection.get("model", "checkbox_detector")
                region.source = "checkbox"

                # Store original class for debugging
                region.original_class = detection.get("class", "unknown")

                # A region that contains text is probably not a checkbox.
                if reject_with_text:
                    text_in_region = region.extract_text().strip()
                    if text_in_region:
                        # Allow only single characters that might be check marks
                        if len(text_in_region) > 1 or text_in_region.isalnum():
                            logger.debug(
                                f"Rejecting checkbox at {region.bbox} - contains text: '{text_in_region}'"
                            )
                            continue

                checkbox_regions.append(region)

            except Exception as e:
                # One malformed detection must not discard the others.
                logger.warning(f"Could not process checkbox detection: {detection}. Error: {e}")
                continue

        # Apply limit if specified
        if limit is not None and len(checkbox_regions) > limit:
            # Sort by confidence (highest first) and take top N
            checkbox_regions = sorted(checkbox_regions, key=lambda r: r.confidence, reverse=True)[
                :limit
            ]

        # Final cleanup - ensure no overlapping boxes (this shouldn't be needed if NMS worked)
        cleaned_regions = []
        for region in checkbox_regions:
            overlaps = False
            for kept_region in cleaned_regions:
                r1 = region.bbox
                r2 = kept_region.bbox
                # Boxes that merely touch along an edge are not overlapping.
                if not (r1[2] <= r2[0] or r2[2] <= r1[0] or r1[3] <= r2[1] or r2[3] <= r1[1]):
                    overlaps = True
                    logger.warning(
                        f"Found overlapping checkbox regions after NMS: {r1} overlaps {r2}"
                    )
                    break
            if not overlaps:
                cleaned_regions.append(region)

        if len(cleaned_regions) < len(checkbox_regions):
            logger.warning(
                f"Removed {len(checkbox_regions) - len(cleaned_regions)} overlapping checkboxes in final cleanup"
            )
            checkbox_regions = cleaned_regions

        # Store results
        logger.debug(f"Storing {len(checkbox_regions)} checkbox regions (mode: {existing})")

        # Initialize storage if needed
        if not hasattr(self._page, "_regions"):
            self._page._regions = {}

        # Handle existing regions
        if existing.lower() == "append":
            self._page._regions.setdefault("checkbox", []).extend(checkbox_regions)
        else:  # replace
            # Remove old checkbox regions from element manager
            if "checkbox" in self._page._regions:
                old_checkboxes = self._page._regions["checkbox"]
                if (
                    hasattr(self._page._element_mgr, "_elements")
                    and self._page._element_mgr._elements
                ):
                    current_regions = self._page._element_mgr._elements.get("regions", [])
                    self._page._element_mgr._elements["regions"] = [
                        r for r in current_regions if r not in old_checkboxes
                    ]
            self._page._regions["checkbox"] = checkbox_regions

        # Add to element manager
        for region in checkbox_regions:
            self._page._element_mgr.add_region(region)

        # Store for easy access
        self._page.detected_checkbox_regions = self._page._regions.get("checkbox", [])

        logger.info(f"Checkbox detection complete. Found {len(checkbox_regions)} checkboxes.")

        return checkbox_regions
|