natural-pdf 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. natural_pdf/__init__.py +8 -0
  2. natural_pdf/analyzers/checkbox/__init__.py +6 -0
  3. natural_pdf/analyzers/checkbox/base.py +265 -0
  4. natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
  5. natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
  6. natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
  7. natural_pdf/analyzers/checkbox/mixin.py +95 -0
  8. natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
  9. natural_pdf/collections/mixins.py +14 -5
  10. natural_pdf/core/element_manager.py +5 -1
  11. natural_pdf/core/page.py +61 -0
  12. natural_pdf/core/page_collection.py +41 -1
  13. natural_pdf/core/pdf.py +24 -1
  14. natural_pdf/describe/base.py +20 -0
  15. natural_pdf/elements/base.py +152 -10
  16. natural_pdf/elements/element_collection.py +41 -2
  17. natural_pdf/elements/region.py +115 -2
  18. natural_pdf/judge.py +1509 -0
  19. natural_pdf/selectors/parser.py +42 -1
  20. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/METADATA +1 -1
  21. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/RECORD +41 -17
  22. temp/check_model.py +49 -0
  23. temp/check_pdf_content.py +9 -0
  24. temp/checkbox_checks.py +590 -0
  25. temp/checkbox_simple.py +117 -0
  26. temp/checkbox_ux_ideas.py +400 -0
  27. temp/context_manager_prototype.py +177 -0
  28. temp/convert_to_hf.py +60 -0
  29. temp/demo_text_closest.py +66 -0
  30. temp/inspect_model.py +43 -0
  31. temp/rtdetr_dinov2_test.py +49 -0
  32. temp/test_closest_debug.py +26 -0
  33. temp/test_closest_debug2.py +22 -0
  34. temp/test_context_exploration.py +85 -0
  35. temp/test_durham.py +30 -0
  36. temp/test_empty_string.py +16 -0
  37. temp/test_similarity.py +15 -0
  38. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/WHEEL +0 -0
  39. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/entry_points.txt +0 -0
  40. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/licenses/LICENSE +0 -0
  41. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/top_level.txt +0 -0
natural_pdf/__init__.py CHANGED
@@ -66,6 +66,7 @@ class Options:
66
66
  self.layout = ConfigSection(
67
67
  directional_offset=0.01, # Offset in points when using directional methods
68
68
  auto_multipage=False, # Whether directional methods span pages by default
69
+ directional_within=None, # Region to constrain directional operations to
69
70
  )
70
71
 
71
72
 
@@ -126,6 +127,9 @@ from natural_pdf.elements.region import Region
126
127
  from natural_pdf.flows.flow import Flow
127
128
  from natural_pdf.flows.region import FlowRegion
128
129
 
130
+ # Judge for visual classification
131
+ from natural_pdf.judge import Decision, Judge, JudgeError, PickResult
132
+
129
133
  # Search options (if extras installed)
130
134
  try:
131
135
  from natural_pdf.search.search_options import (
@@ -165,6 +169,10 @@ __all__ = [
165
169
  "Flow",
166
170
  "FlowRegion",
167
171
  "Guides",
172
+ "Judge",
173
+ "Decision",
174
+ "PickResult",
175
+ "JudgeError",
168
176
  "TextSearchOptions",
169
177
  "MultiModalSearchOptions",
170
178
  "BaseSearchOptions",
@@ -0,0 +1,6 @@
1
+ """Checkbox detection analyzers for natural-pdf."""
2
+
3
+ from .checkbox_manager import CheckboxManager
4
+ from .checkbox_options import CheckboxOptions, RTDETRCheckboxOptions
5
+
6
+ __all__ = ["CheckboxManager", "CheckboxOptions", "RTDETRCheckboxOptions"]
@@ -0,0 +1,265 @@
1
+ """Base class for checkbox detection engines."""
2
+
3
+ import logging
4
+ from abc import ABC, abstractmethod
5
+ from typing import Any, Dict, List, Set
6
+
7
+ from PIL import Image
8
+
9
+ from .checkbox_options import CheckboxOptions
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class CheckboxDetector(ABC):
    """Abstract base class for checkbox detection engines.

    This class defines the standard interface that all checkbox detection engines
    must implement in natural-pdf. Checkbox detectors analyze document images to
    identify checkboxes and their states (checked/unchecked).

    Subclasses must implement:
    - detect(): Core checkbox detection for a single image
    - is_available(): Check if engine dependencies are installed
    - _load_model_from_options(): Load and configure the detection model

    Subclasses may override:
    - _get_cache_key(): Generate cache keys for model instances (the default
      key only distinguishes detector class and device)

    Attributes:
        logger: Logger instance for the specific detector.
        _model_cache: Dictionary cache for loaded model instances.
    """

    def __init__(self):
        """Initialize the base checkbox detector."""
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
        self.logger.info(f"Initializing {self.__class__.__name__}")
        self._model_cache: Dict[str, Any] = {}  # Cache for initialized models

    @abstractmethod
    def detect(self, image: Image.Image, options: CheckboxOptions) -> List[Dict[str, Any]]:
        """
        Detect checkboxes in a given PIL Image.

        Args:
            image: PIL Image of the page/region to analyze.
            options: Instance of CheckboxOptions with configuration.

        Returns:
            List of detection dictionaries with:
            - 'bbox': Tuple[float, float, float, float] - (x0, y0, x1, y1) relative to image
            - 'class': str - Original class name from model (e.g., 'checkbox', 'checked_checkbox')
            - 'normalized_class': str - Always 'checkbox'
            - 'is_checked': bool - Whether checkbox is checked
            - 'checkbox_state': str - 'checked' or 'unchecked'
            - 'confidence': float - Confidence score (0.0-1.0)
            - 'model': str - Name of the model used
            - 'source': str - Always 'checkbox'
        """
        raise NotImplementedError("Subclasses must implement this method")

    @classmethod
    @abstractmethod
    def is_available(cls) -> bool:
        """
        Check if the detector's dependencies are installed and usable.

        Returns:
            True if the detector is available, False otherwise.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _get_cache_key(self, options: CheckboxOptions) -> str:
        """
        Generate a cache key for model loading based on relevant options.

        Args:
            options: The options dataclass instance.

        Returns:
            A string cache key.
        """
        # Base key includes device, subclasses should add model specifics
        device_key = str(options.device).lower()
        return f"{self.__class__.__name__}_{device_key}"

    def _get_model(self, options: CheckboxOptions) -> Any:
        """
        Get or initialize the underlying model based on options, using caching.

        Args:
            options: The options dataclass instance.

        Returns:
            The cached (or freshly loaded) model object.

        Raises:
            RuntimeError: If is_available() reports missing dependencies.
            Exception: Any error raised by _load_model_from_options() is logged
                and re-raised unchanged.
        """
        cache_key = self._get_cache_key(options)
        if cache_key in self._model_cache:
            self.logger.debug(f"Using cached model for key: {cache_key}")
            return self._model_cache[cache_key]

        self.logger.info(f"Loading model for cache key: {cache_key}")
        try:
            # Ensure dependencies are met before loading
            if not self.is_available():
                raise RuntimeError(f"{self.__class__.__name__} dependencies are not met.")
            self._model_cache[cache_key] = self._load_model_from_options(options)
            self.logger.info(f"Model loaded successfully for key: {cache_key}")
        except Exception as e:
            self.logger.error(f"Failed to load model for key {cache_key}: {e}", exc_info=True)
            # Remove potentially corrupted cache entry
            self._model_cache.pop(cache_key, None)
            raise
        return self._model_cache[cache_key]

    @abstractmethod
    def _load_model_from_options(self, options: CheckboxOptions) -> Any:
        """
        Load and configure the detection model based on provided options.

        Args:
            options: The options dataclass instance.

        Returns:
            The loaded model object(s).
        """
        raise NotImplementedError("Subclasses must implement _load_model_from_options")

    def _map_label_to_state(self, label: str, options: CheckboxOptions) -> tuple[bool, str]:
        """
        Map model output label to checkbox state.

        An explicit entry in ``options.label_mapping`` always wins. Otherwise a
        keyword heuristic is used; negated forms are tested first because
        "unchecked" contains "checked" (and likewise "unticked"/"unfilled"),
        so a plain substring test would misreport them as checked.

        Args:
            label: Raw label from model (e.g., 'checked_checkbox', '1')
            options: Options containing label mapping

        Returns:
            Tuple of (is_checked: bool, state: str)
        """
        # Normalize label
        normalized_label = str(label).lower().strip()

        # Check mapping
        if normalized_label in options.label_mapping:
            state = options.label_mapping[normalized_label]
            return state == "checked", state

        # Default heuristic if not in mapping: negations first, then positives
        negated_terms = (
            "unchecked",
            "unticked",
            "unfilled",
            "not_checked",
            "not-checked",
            "not checked",
        )
        if any(term in normalized_label for term in negated_terms):
            return False, "unchecked"

        if any(term in normalized_label for term in ("checked", "tick", "filled", "1")):
            return True, "checked"
        return False, "unchecked"

    def _apply_nms(
        self, detections: List[Dict[str, Any]], iou_threshold: float
    ) -> List[Dict[str, Any]]:
        """
        Apply non-maximum suppression to remove overlapping detections.
        For checkboxes, we reject ANY meaningful overlap.

        Args:
            detections: List of detection dictionaries
            iou_threshold: IoU threshold for suppression (ignored for checkboxes - we use stricter rules)

        Returns:
            Filtered list of detections
        """
        if not detections:
            return detections

        # Sort by confidence (descending), then by area (ascending) to prefer smaller boxes
        def sort_key(det):
            bbox = det["bbox"]
            area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
            return (-det["confidence"], area)

        keep = []
        for det in sorted(detections, key=sort_key):
            det_bbox = det["bbox"]

            for kept_det in keep:
                kept_bbox = kept_det["bbox"]
                # Check for ANY overlap at all
                if self._boxes_overlap(det_bbox, kept_bbox):
                    logger.debug(f"Rejecting box {det_bbox} due to overlap with {kept_bbox}")
                    break
            else:
                # No already-kept (higher priority) box overlaps this one
                keep.append(det)
                logger.debug(f"Keeping box {det_bbox} with confidence {det['confidence']}")

        logger.info(f"NMS: Reduced {len(detections)} detections to {len(keep)}")
        return keep

    def _boxes_overlap(self, box1: tuple, box2: tuple) -> bool:
        """Check if two boxes have any overlap at all.

        Boxes that merely share an edge or a corner are NOT considered
        overlapping.
        """
        x1_min, y1_min, x1_max, y1_max = box1
        x2_min, y2_min, x2_max, y2_max = box2

        # Check if boxes are separated
        if x1_max <= x2_min or x2_max <= x1_min:
            return False
        if y1_max <= y2_min or y2_max <= y1_min:
            return False

        # If we get here, boxes overlap
        return True

    @staticmethod
    def _intersection_area(box1: tuple, box2: tuple) -> float:
        """Return the area of the intersection of two (x0, y0, x1, y1) boxes, 0.0 if disjoint."""
        x1_min, y1_min, x1_max, y1_max = box1
        x2_min, y2_min, x2_max, y2_max = box2

        inter_xmin = max(x1_min, x2_min)
        inter_ymin = max(y1_min, y2_min)
        inter_xmax = min(x1_max, x2_max)
        inter_ymax = min(y1_max, y2_max)

        if inter_xmax < inter_xmin or inter_ymax < inter_ymin:
            return 0.0

        return (inter_xmax - inter_xmin) * (inter_ymax - inter_ymin)

    def _compute_intersection_ratio(self, box1: tuple, box2: tuple) -> float:
        """
        Compute intersection ratio relative to the smaller box.
        This is more aggressive than IoU for checkbox detection.
        """
        inter_area = CheckboxDetector._intersection_area(box1, box2)
        if not inter_area:
            return 0.0

        # Areas of both boxes
        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])

        # Ratio relative to smaller box
        smaller_area = min(area1, area2)
        if smaller_area == 0:
            return 0.0

        return inter_area / smaller_area

    def _compute_iou(self, box1: tuple, box2: tuple) -> float:
        """Compute IoU between two boxes."""
        inter_area = CheckboxDetector._intersection_area(box1, box2)
        if not inter_area:
            return 0.0

        # Union
        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union_area = area1 + area2 - inter_area

        if union_area == 0:
            return 0.0

        return inter_area / union_area

    def __del__(self):
        """Cleanup resources.

        Attributes are fetched defensively: __del__ also runs for instances
        whose __init__ never completed (e.g. a subclass raised before calling
        super().__init__()), in which case they do not exist.
        """
        log = getattr(self, "logger", None)
        if log is not None:
            log.info(f"Cleaning up {self.__class__.__name__} resources.")
        cache = getattr(self, "_model_cache", None)
        if cache is not None:
            cache.clear()
@@ -0,0 +1,329 @@
1
+ """Checkbox analyzer for PDF pages and regions."""
2
+
3
+ import logging
4
+ from typing import Any, Dict, List, Optional, Union
5
+
6
+ from PIL import Image
7
+
8
+ from natural_pdf.elements.region import Region
9
+
10
+ from .checkbox_manager import CheckboxManager
11
+ from .checkbox_options import CheckboxOptions
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class CheckboxAnalyzer:
    """
    Handles checkbox analysis for PDF pages and regions, including image rendering,
    coordinate scaling, region creation, and result storage.
    """

    # Rendering DPI used when neither the caller nor the options specify one.
    _DEFAULT_RESOLUTION = 150

    def __init__(self, element, checkbox_manager: Optional[CheckboxManager] = None):
        """
        Initialize the checkbox analyzer.

        Args:
            element: The Page or Region object to analyze
            checkbox_manager: Optional CheckboxManager instance. If None, creates a new one.
        """
        self._element = element
        self._checkbox_manager = checkbox_manager or CheckboxManager()

        # Determine if element is a page or region (duck-typed on attributes)
        self._is_page = hasattr(element, "number") and hasattr(element, "_parent")
        self._is_region = hasattr(element, "bbox") and hasattr(element, "_page")

        # Regions are analyzed through their owning page
        self._page = element._page if self._is_region else element

    @staticmethod
    def _option_value(options: Any, name: str, default: Any = None) -> Any:
        """Read a setting from either an options object or a plain dict of options."""
        if isinstance(options, dict):
            return options.get(name, default)
        return getattr(options, name, default)

    def detect_checkboxes(
        self,
        engine: Optional[str] = None,
        options: Optional[Union[CheckboxOptions, Dict[str, Any]]] = None,
        confidence: Optional[float] = None,
        resolution: Optional[int] = None,
        device: Optional[str] = None,
        existing: str = "replace",
        limit: Optional[int] = None,
        **kwargs,
    ) -> List[Region]:
        """
        Detect checkboxes in the page or region.

        Rendering or detection failures are logged and result in an empty
        list rather than an exception.

        Args:
            engine: Name of the detection engine (default: 'rtdetr')
            options: CheckboxOptions instance or dict of options. ``resolution`` and
                ``reject_with_text`` are read from either form.
            confidence: Minimum confidence threshold
            resolution: DPI for rendering (default: 150)
            device: Device for inference
            existing: How to handle existing checkbox regions: 'replace' (default) or 'append'
            limit: Maximum number of checkboxes to detect
            **kwargs: Additional engine-specific arguments

        Returns:
            List of created Region objects representing checkboxes
        """
        logger.info(
            f"Detecting checkboxes (Engine: {engine or 'default'}, "
            f"Element type: {'region' if self._is_region else 'page'})"
        )

        # Simple arguments are forwarded to the manager either as the basis for
        # new options (options is None) or as overrides of the provided options.
        final_options = options
        final_kwargs: Dict[str, Any] = {}
        if confidence is not None:
            final_kwargs["confidence"] = confidence
        if resolution is not None:
            final_kwargs["resolution"] = resolution
        if device is not None:
            final_kwargs["device"] = device
        final_kwargs.update(kwargs)

        # Render image
        try:
            resolution_val = (
                resolution
                or (self._option_value(final_options, "resolution") if final_options else None)
                or self._DEFAULT_RESOLUTION
            )
            image, crop_offset, pdf_scale = self._render_for_detection(resolution_val)
            logger.debug(f"Rendered image size: {image.width}x{image.height}")
        except Exception as e:
            logger.error(f"Failed to render image: {e}", exc_info=True)
            return []

        # Run detection
        try:
            detections = self._checkbox_manager.detect_checkboxes(
                image=image, engine=engine, options=final_options, **final_kwargs
            )
            logger.info(f"Detected {len(detections)} checkboxes")
        except Exception as e:
            logger.error(f"Checkbox detection failed: {e}", exc_info=True)
            return []

        # Get reject_with_text setting from options or kwargs, default to True
        if final_options:
            reject_with_text = self._option_value(final_options, "reject_with_text", True)
        else:
            reject_with_text = kwargs.get("reject_with_text", True)

        # Process detections into regions
        checkbox_regions = []
        for detection in detections:
            try:
                region = self._detection_to_region(
                    detection, crop_offset, pdf_scale, reject_with_text
                )
            except Exception as e:
                logger.warning(f"Could not process checkbox detection: {detection}. Error: {e}")
                continue
            if region is not None:
                checkbox_regions.append(region)

        # Apply limit if specified
        if limit is not None and len(checkbox_regions) > limit:
            # Sort by confidence (highest first) and take top N
            checkbox_regions = sorted(checkbox_regions, key=lambda r: r.confidence, reverse=True)[
                :limit
            ]

        checkbox_regions = self._drop_overlapping(checkbox_regions)

        self._store_regions(checkbox_regions, existing)

        logger.info(f"Checkbox detection complete. Found {len(checkbox_regions)} checkboxes.")

        return checkbox_regions

    def _render_for_detection(self, resolution_val):
        """Render the page once and return (image, crop_offset, pdf_scale).

        ``image`` is the full page render for pages, or the crop of it covering
        the region. ``crop_offset`` is the crop's top-left corner in page-image
        pixels and ``pdf_scale`` the (x, y) factors from page-image pixels to PDF
        coordinates, so ``pdf = (pixel + crop_offset) * pdf_scale`` holds for
        both pages and regions.
        """
        page_image = self._page.render(resolution=resolution_val)
        if not page_image:
            raise ValueError("Page rendering returned None")

        # Computed once here instead of re-rendering the page per detection
        pdf_scale = (
            self._page.width / page_image.width,
            self._page.height / page_image.height,
        )

        if not self._is_region:
            # For pages, use the full image
            return page_image, (0, 0), pdf_scale

        # Calculate region bounds in image coordinates
        img_scale_x = page_image.width / self._page.width
        img_scale_y = page_image.height / self._page.height

        x0, y0, x1, y1 = self._element.bbox
        img_x0 = int(x0 * img_scale_x)
        img_y0 = int(y0 * img_scale_y)
        img_x1 = int(x1 * img_scale_x)
        img_y1 = int(y1 * img_scale_y)

        # Crop to region
        image = page_image.crop((img_x0, img_y0, img_x1, img_y1))
        if image.width == 0 or image.height == 0:
            # A degenerate region yields nothing to analyze
            raise ValueError("Region crop produced an empty image")

        return image, (img_x0, img_y0), pdf_scale

    def _detection_to_region(self, detection, crop_offset, pdf_scale, reject_with_text):
        """Convert one raw detection into a checkbox Region, or None if it is filtered out."""
        # Get image coordinates
        img_x0, img_y0, img_x1, img_y1 = detection["bbox"]
        offset_x, offset_y = crop_offset
        scale_x, scale_y = pdf_scale

        # Shift into page-image pixels, then scale to PDF coordinates
        pdf_x0 = (img_x0 + offset_x) * scale_x
        pdf_y0 = (img_y0 + offset_y) * scale_y
        pdf_x1 = (img_x1 + offset_x) * scale_x
        pdf_y1 = (img_y1 + offset_y) * scale_y

        # Ensure valid bounds
        pdf_x0, pdf_x1 = min(pdf_x0, pdf_x1), max(pdf_x0, pdf_x1)
        pdf_y0, pdf_y1 = min(pdf_y0, pdf_y1), max(pdf_y0, pdf_y1)
        pdf_x0 = max(0, pdf_x0)
        pdf_y0 = max(0, pdf_y0)
        pdf_x1 = min(self._page.width, pdf_x1)
        pdf_y1 = min(self._page.height, pdf_y1)

        # For region detection, skip checkboxes outside the region bounds
        if self._is_region:
            region_x0, region_y0, region_x1, region_y1 = self._element.bbox
            # Check if checkbox center is within region
            cb_center_x = (pdf_x0 + pdf_x1) / 2
            cb_center_y = (pdf_y0 + pdf_y1) / 2
            if not (
                region_x0 <= cb_center_x <= region_x1 and region_y0 <= cb_center_y <= region_y1
            ):
                return None  # Skip this checkbox

        # Create region
        region = Region(self._page, (pdf_x0, pdf_y0, pdf_x1, pdf_y1))
        region.region_type = "checkbox"
        region.normalized_type = "checkbox"
        region.is_checked = detection.get("is_checked", False)
        region.checkbox_state = detection.get("checkbox_state", "unchecked")
        region.confidence = detection.get("confidence", 0.0)
        region.model = detection.get("model", "checkbox_detector")
        region.source = "checkbox"

        # Store original class for debugging
        region.original_class = detection.get("class", "unknown")

        # Check if region contains text - if so, it's probably not a checkbox
        if reject_with_text:
            text_in_region = region.extract_text().strip()
            # Allow only single characters that might be check marks
            if text_in_region and (len(text_in_region) > 1 or text_in_region.isalnum()):
                logger.debug(
                    f"Rejecting checkbox at {region.bbox} - contains text: '{text_in_region}'"
                )
                return None

        return region

    def _drop_overlapping(self, checkbox_regions):
        """Final cleanup: keep each region only if it overlaps none of the earlier kept ones."""
        # This shouldn't be needed if NMS worked
        cleaned_regions = []
        for region in checkbox_regions:
            overlaps = False
            for kept_region in cleaned_regions:
                # Check if bboxes overlap
                r1 = region.bbox
                r2 = kept_region.bbox
                if not (r1[2] <= r2[0] or r2[2] <= r1[0] or r1[3] <= r2[1] or r2[3] <= r1[1]):
                    overlaps = True
                    logger.warning(
                        f"Found overlapping checkbox regions after NMS: {r1} overlaps {r2}"
                    )
                    break
            if not overlaps:
                cleaned_regions.append(region)

        if len(cleaned_regions) < len(checkbox_regions):
            logger.warning(
                f"Removed {len(checkbox_regions) - len(cleaned_regions)} overlapping checkboxes in final cleanup"
            )
            return cleaned_regions
        return checkbox_regions

    def _store_regions(self, checkbox_regions, existing):
        """Record the regions on the page and its element manager per the ``existing`` mode."""
        logger.debug(f"Storing {len(checkbox_regions)} checkbox regions (mode: {existing})")

        # Initialize storage if needed
        if not hasattr(self._page, "_regions"):
            self._page._regions = {}

        # Handle existing regions
        if existing.lower() == "append":
            self._page._regions.setdefault("checkbox", []).extend(checkbox_regions)
        else:  # replace
            # Remove old checkbox regions from element manager
            if "checkbox" in self._page._regions:
                old_checkboxes = self._page._regions["checkbox"]
                if (
                    hasattr(self._page._element_mgr, "_elements")
                    and self._page._element_mgr._elements
                ):
                    current_regions = self._page._element_mgr._elements.get("regions", [])
                    # Remove old checkbox regions
                    self._page._element_mgr._elements["regions"] = [
                        r for r in current_regions if r not in old_checkboxes
                    ]
            self._page._regions["checkbox"] = checkbox_regions

        # Add to element manager
        for region in checkbox_regions:
            self._page._element_mgr.add_region(region)

        # Store for easy access
        self._page.detected_checkbox_regions = self._page._regions.get("checkbox", [])