PyPI - natural-pdf - Versions diffs - 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl - Mend

natural-pdf 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

natural_pdf/__init__.py +8 -0
natural_pdf/analyzers/checkbox/__init__.py +6 -0
natural_pdf/analyzers/checkbox/base.py +265 -0
natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
natural_pdf/analyzers/checkbox/mixin.py +95 -0
natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
natural_pdf/collections/mixins.py +14 -5
natural_pdf/core/element_manager.py +5 -1
natural_pdf/core/page.py +103 -9
natural_pdf/core/page_collection.py +41 -1
natural_pdf/core/pdf.py +24 -1
natural_pdf/describe/base.py +20 -0
natural_pdf/elements/base.py +152 -10
natural_pdf/elements/element_collection.py +41 -2
natural_pdf/elements/region.py +115 -2
natural_pdf/judge.py +1509 -0
natural_pdf/selectors/parser.py +42 -1
natural_pdf/utils/spatial.py +42 -39
{natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/METADATA +1 -1
{natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/RECORD +42 -18
temp/check_model.py +49 -0
temp/check_pdf_content.py +9 -0
temp/checkbox_checks.py +590 -0
temp/checkbox_simple.py +117 -0
temp/checkbox_ux_ideas.py +400 -0
temp/context_manager_prototype.py +177 -0
temp/convert_to_hf.py +60 -0
temp/demo_text_closest.py +66 -0
temp/inspect_model.py +43 -0
temp/rtdetr_dinov2_test.py +49 -0
temp/test_closest_debug.py +26 -0
temp/test_closest_debug2.py +22 -0
temp/test_context_exploration.py +85 -0
temp/test_durham.py +30 -0
temp/test_empty_string.py +16 -0
temp/test_similarity.py +15 -0
{natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/WHEEL +0 -0
{natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/top_level.txt +0 -0

natural_pdf/analyzers/checkbox/checkbox_manager.py ADDED Viewed

@@ -0,0 +1,166 @@
+"""Manager for checkbox detection engines."""
+import logging
+from typing import Any, Dict, List, Optional, Type, Union
+from PIL import Image
+from .base import CheckboxDetector
+from .checkbox_options import CheckboxOptions, RTDETRCheckboxOptions
+logger = logging.getLogger(__name__)
+def _lazy_import_rtdetr_detector():
+    """Lazy import RT-DETR detector to avoid heavy dependencies at module load."""
+    from .rtdetr import RTDETRCheckboxDetector
+    return RTDETRCheckboxDetector
+class CheckboxManager:
+    """Manages checkbox detection engines and provides a unified interface."""
+    # Registry of available engines
+    ENGINE_REGISTRY = {
+        "rtdetr": {
+            "class": _lazy_import_rtdetr_detector,
+            "options_class": RTDETRCheckboxOptions,
+        },
+        "wendys": {  # Alias for the default model
+            "class": _lazy_import_rtdetr_detector,
+            "options_class": RTDETRCheckboxOptions,
+        },
+    }
+    def __init__(self):
+        """Initialize the checkbox manager."""
+        self.logger = logging.getLogger(__name__)
+        self._detector_cache: Dict[str, CheckboxDetector] = {}
+    def detect_checkboxes(
+        self,
+        image: Image.Image,
+        engine: Optional[str] = None,
+        options: Optional[Union[CheckboxOptions, Dict[str, Any]]] = None,
+        **kwargs,
+    ) -> List[Dict[str, Any]]:
+        """
+        Detect checkboxes in an image using the specified engine.
+        Args:
+            image: PIL Image to analyze
+            engine: Name of the detection engine (default: 'rtdetr')
+            options: CheckboxOptions instance or dict of options
+            **kwargs: Additional options to override
+        Returns:
+            List of detection dictionaries
+        """
+        # Determine engine and options
+        if options is None:
+            if engine is None:
+                engine = "rtdetr"  # Default engine
+            options = self._create_options(engine, **kwargs)
+        elif isinstance(options, dict):
+            if engine is None:
+                engine = "rtdetr"
+            options = self._create_options(engine, **options, **kwargs)
+        else:
+            # options is a CheckboxOptions instance
+            # Determine engine from options type if not specified
+            if engine is None:
+                engine = self._get_engine_from_options(options)
+            # Apply any kwargs as overrides
+            if kwargs:
+                options = self._override_options(options, **kwargs)
+        # Get detector
+        detector = self._get_detector(engine)
+        # Run detection
+        try:
+            return detector.detect(image, options)
+        except Exception as e:
+            self.logger.error(f"Checkbox detection failed with {engine}: {e}", exc_info=True)
+            raise
+    def _get_engine_from_options(self, options: CheckboxOptions) -> str:
+        """Determine engine from options type."""
+        for engine_name, engine_info in self.ENGINE_REGISTRY.items():
+            if isinstance(options, engine_info["options_class"]):
+                return engine_name
+        # Default if can't determine
+        return "rtdetr"
+    def _create_options(self, engine: str, **kwargs) -> CheckboxOptions:
+        """Create options instance for the specified engine."""
+        if engine not in self.ENGINE_REGISTRY:
+            raise ValueError(
+                f"Unknown checkbox detection engine: {engine}. "
+                f"Available: {list(self.ENGINE_REGISTRY.keys())}"
+            )
+        options_class = self.ENGINE_REGISTRY[engine]["options_class"]
+        return options_class(**kwargs)
+    def _override_options(self, options: CheckboxOptions, **kwargs) -> CheckboxOptions:
+        """Create a new options instance with overrides applied."""
+        # Get current values as dict
+        import dataclasses
+        current_values = dataclasses.asdict(options)
+        # Apply overrides
+        current_values.update(kwargs)
+        # Create new instance
+        return type(options)(**current_values)
+    def _get_detector(self, engine: str) -> CheckboxDetector:
+        """Get or create a detector instance for the specified engine."""
+        if engine not in self._detector_cache:
+            if engine not in self.ENGINE_REGISTRY:
+                raise ValueError(
+                    f"Unknown checkbox detection engine: {engine}. "
+                    f"Available: {list(self.ENGINE_REGISTRY.keys())}"
+                )
+            # Get detector class (lazy import)
+            detector_class = self.ENGINE_REGISTRY[engine]["class"]
+            if callable(detector_class):
+                detector_class = detector_class()  # Call factory function
+            # Check availability
+            if not detector_class.is_available():
+                raise RuntimeError(
+                    f"Checkbox detection engine '{engine}' is not available. "
+                    f"Please install required dependencies."
+                )
+            # Create instance
+            self._detector_cache[engine] = detector_class()
+            self.logger.info(f"Initialized checkbox detector: {engine}")
+        return self._detector_cache[engine]
+    def is_engine_available(self, engine: str) -> bool:
+        """Check if a specific engine is available."""
+        if engine not in self.ENGINE_REGISTRY:
+            return False
+        try:
+            detector_class = self.ENGINE_REGISTRY[engine]["class"]
+            if callable(detector_class):
+                detector_class = detector_class()
+            return detector_class.is_available()
+        except Exception:
+            return False
+    def list_available_engines(self) -> List[str]:
+        """List all available checkbox detection engines."""
+        available = []
+        for engine in self.ENGINE_REGISTRY:
+            if self.is_engine_available(engine):
+                available.append(engine)
+        return available

natural_pdf/analyzers/checkbox/checkbox_options.py ADDED Viewed

@@ -0,0 +1,60 @@
+"""Options classes for checkbox detection engines."""
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Dict, Optional
+logger = logging.getLogger(__name__)
+@dataclass
+class CheckboxOptions:
+    """Base options for checkbox detection engines.
+    Engine-specific option classes subclass this and add their own fields.
+    """
+    # Minimum detection score to keep.
+    confidence: float = 0.02  # Default very low confidence for DETR models
+    resolution: int = 150  # DPI for rendering pages to images
+    device: Optional[str] = "cpu"  # Preferred device ('cpu', 'cuda', 'mps', etc.)
+    # Label mapping from model outputs to standard states.
+    # Keys are raw model label strings; values are "checked" or "unchecked".
+    label_mapping: Dict[str, str] = field(
+        default_factory=lambda: {
+            # Common mappings
+            "checkbox": "unchecked",
+            "checked_checkbox": "checked",
+            "checkbox_checked": "checked",
+            "unchecked_checkbox": "unchecked",
+            "checkbox_unchecked": "unchecked",
+            # Numeric mappings
+            "0": "unchecked",
+            "1": "checked",
+            # Descriptive mappings
+            "empty": "unchecked",
+            "tick": "checked",
+            "filled": "checked",
+            "blank": "unchecked",
+        }
+    )
+    # Non-max suppression parameters
+    nms_threshold: float = 0.1  # IoU threshold for overlapping boxes (low for checkboxes)
+    # Text filtering
+    # NOTE(review): presumably applied by the analyzer layer rather than the detector — confirm.
+    reject_with_text: bool = (
+        True  # Reject detections that contain text (checkboxes should be empty)
+    )
+    # Extra arguments for engine-specific parameters (defaults to a fresh empty dict per instance)
+    extra_args: Dict[str, Any] = field(default_factory=dict)
+@dataclass
+class RTDETRCheckboxOptions(CheckboxOptions):
+    """Options specific to RT-DETR checkbox detection models.
+    Extends CheckboxOptions with the model location and RT-DETR tuning fields.
+    """
+    model_repo: str = "wendys-llc/rtdetr-v2-r50-chkbx"  # Default checkbox model
+    model_revision: Optional[str] = None  # Specific model revision
+    image_processor_repo: Optional[str] = None  # Override image processor if needed
+    # RT-DETR specific parameters
+    max_detections: int = 100  # Maximum number of detections per image
+    post_process_threshold: float = 0.0  # Threshold for post-processing (0.0 for all)

natural_pdf/analyzers/checkbox/mixin.py ADDED Viewed

@@ -0,0 +1,95 @@
+"""Checkbox detection mixin for Page and Region classes."""
+import logging
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+if TYPE_CHECKING:
+    from natural_pdf.analyzers.checkbox.checkbox_options import CheckboxOptions
+    from natural_pdf.elements.element_collection import ElementCollection
+    from natural_pdf.elements.region import Region
+logger = logging.getLogger(__name__)
+class CheckboxDetectionMixin:
+    """Mixin to add checkbox detection capabilities to Page and Region classes."""
+    def detect_checkboxes(
+        self,
+        engine: Optional[str] = None,
+        options: Optional[Union["CheckboxOptions", Dict[str, Any]]] = None,
+        confidence: Optional[float] = None,
+        resolution: Optional[int] = None,
+        device: Optional[str] = None,
+        existing: str = "replace",
+        limit: Optional[int] = None,
+        **kwargs,
+    ) -> "ElementCollection[Region]":
+        """
+        Detect checkboxes in the page or region.
+        This method identifies checkboxes and their states (checked/unchecked) using
+        computer vision models. Detected checkboxes are added as Region objects with
+        type="checkbox" and can be accessed via selectors like page.find_all('checkbox').
+        Args:
+            engine: Name of the detection engine (default: 'rtdetr' for wendys model)
+            options: CheckboxOptions instance or dict of options for advanced configuration
+            confidence: Minimum confidence threshold (default: 0.02 for DETR models)
+            resolution: DPI for rendering pages to images (default: 150)
+            device: Device for inference ('cpu', 'cuda', 'mps', etc.)
+            existing: How to handle existing checkbox regions: 'replace' (default) or 'append'
+            limit: Maximum number of checkboxes to detect (useful when you know the expected count)
+            **kwargs: Additional engine-specific arguments
+        Returns:
+            ElementCollection containing detected checkbox Region objects with attributes:
+            - region_type: "checkbox"
+            - is_checked: bool indicating if checkbox is checked
+            - checkbox_state: "checked" or "unchecked"
+            - confidence: detection confidence score
+        Examples:
+            # Basic detection
+            checkboxes = page.detect_checkboxes()
+            # Find checked boxes
+            checked = page.find_all('checkbox:checked')
+            unchecked = page.find_all('checkbox:unchecked')
+            # Limit to expected number
+            checkboxes = page.detect_checkboxes(limit=10)
+            # High confidence detection
+            checkboxes = page.detect_checkboxes(confidence=0.9)
+            # GPU acceleration
+            checkboxes = page.detect_checkboxes(device='cuda')
+            # Custom model
+            from natural_pdf import CheckboxOptions
+            options = CheckboxOptions(model_repo="your-org/your-checkbox-model")
+            checkboxes = page.detect_checkboxes(options=options)
+        """
+        # Lazy import to avoid circular dependencies
+        from natural_pdf.analyzers.checkbox.checkbox_analyzer import CheckboxAnalyzer
+        # Create analyzer
+        analyzer = CheckboxAnalyzer(self)
+        # Run detection
+        regions = analyzer.detect_checkboxes(
+            engine=engine,
+            options=options,
+            confidence=confidence,
+            resolution=resolution,
+            device=device,
+            existing=existing,
+            limit=limit,
+            **kwargs,
+        )
+        # Return as ElementCollection
+        from natural_pdf.elements.element_collection import ElementCollection
+        return ElementCollection(regions)

natural_pdf/analyzers/checkbox/rtdetr.py ADDED Viewed

@@ -0,0 +1,201 @@
+"""RT-DETR based checkbox detector implementation."""
+import logging
+from typing import Any, Dict, List, Optional, Tuple
+from PIL import Image
+from .base import CheckboxDetector
+from .checkbox_options import RTDETRCheckboxOptions
+logger = logging.getLogger(__name__)
+# Lazy imports cache
+_transformers_cache = None
+def _get_transformers():
+    """Lazy import transformers to avoid heavy dependency at module load."""
+    global _transformers_cache
+    if _transformers_cache is None:
+        try:
+            from transformers import AutoImageProcessor, AutoModelForObjectDetection
+            _transformers_cache = (AutoImageProcessor, AutoModelForObjectDetection)
+        except ImportError:
+            raise ImportError(
+                "transformers library is required for RT-DETR checkbox detection. "
+                "Install it with: pip install transformers"
+            )
+    return _transformers_cache
+def _get_torch():
+    """Lazy import torch."""
+    try:
+        import torch
+        return torch
+    except ImportError:
+        raise ImportError(
+            "torch is required for RT-DETR checkbox detection. "
+            "Install it with: pip install torch"
+        )
+class RTDETRCheckboxDetector(CheckboxDetector):
+    """RT-DETR based checkbox detector using HuggingFace transformers."""
+    def __init__(self):
+        """Initialize the RT-DETR checkbox detector."""
+        super().__init__()
+    @classmethod
+    def is_available(cls) -> bool:
+        """Check if transformers and torch are available."""
+        try:
+            _get_transformers()
+            _get_torch()
+            return True
+        except ImportError:
+            return False
+    def _get_cache_key(self, options: RTDETRCheckboxOptions) -> str:
+        """Generate cache key including model repo and revision."""
+        base_key = super()._get_cache_key(options)
+        model_key = options.model_repo.replace("/", "_")
+        revision_key = options.model_revision or "default"
+        return f"{base_key}_{model_key}_{revision_key}"
+    def _load_model_from_options(self, options: RTDETRCheckboxOptions) -> Dict[str, Any]:
+        """Load RT-DETR model and processor from HuggingFace."""
+        AutoImageProcessor, AutoModelForObjectDetection = _get_transformers()
+        torch = _get_torch()
+        try:
+            # Load image processor
+            if options.image_processor_repo:
+                image_processor = AutoImageProcessor.from_pretrained(
+                    options.image_processor_repo, revision=options.model_revision
+                )
+            else:
+                image_processor = AutoImageProcessor.from_pretrained(
+                    options.model_repo, revision=options.model_revision
+                )
+            # Load model
+            model = AutoModelForObjectDetection.from_pretrained(
+                options.model_repo, revision=options.model_revision
+            )
+            # Move to device
+            if options.device and options.device != "cpu":
+                if options.device == "cuda" and torch.cuda.is_available():
+                    model = model.to("cuda")
+                elif options.device == "mps" and torch.backends.mps.is_available():
+                    model = model.to("mps")
+                else:
+                    self.logger.warning(
+                        f"Requested device '{options.device}' not available, using CPU"
+                    )
+                    model = model.to("cpu")
+            else:
+                model = model.to("cpu")
+            # Set to eval mode
+            model.eval()
+            return {
+                "model": model,
+                "processor": image_processor,
+                "device": next(model.parameters()).device,
+            }
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to load checkbox model '{options.model_repo}'. "
+                f"This may be due to network issues or missing credentials. "
+                f"Original error: {e}"
+            )
+    def detect(self, image: Image.Image, options: RTDETRCheckboxOptions) -> List[Dict[str, Any]]:
+        """
+        Detect checkboxes in the given image using RT-DETR.
+        Args:
+            image: PIL Image to analyze
+            options: RT-DETR specific options
+        Returns:
+            List of standardized detection dictionaries
+        """
+        torch = _get_torch()
+        # Get cached model
+        model_dict = self._get_model(options)
+        model = model_dict["model"]
+        processor = model_dict["processor"]
+        device = model_dict["device"]
+        # Prepare inputs
+        inputs = processor(images=[image], return_tensors="pt")
+        if device.type != "cpu":
+            inputs = {k: v.to(device) for k, v in inputs.items()}
+        # Run inference
+        with torch.no_grad():
+            outputs = model(**inputs)
+        # Post-process results
+        target_sizes = torch.tensor([image.size[::-1]])  # (height, width)
+        if device.type != "cpu":
+            target_sizes = target_sizes.to(device)
+        results = processor.post_process_object_detection(
+            outputs, threshold=options.post_process_threshold, target_sizes=target_sizes
+        )[0]
+        # Convert to standardized format
+        detections = []
+        for i in range(len(results["scores"])):
+            score = results["scores"][i].item()
+            # Apply confidence threshold
+            if score < options.confidence:
+                continue
+            label = results["labels"][i].item()
+            box = results["boxes"][i].tolist()
+            # Get label text from model config
+            if hasattr(model.config, "id2label") and label in model.config.id2label:
+                label_text = model.config.id2label[label]
+            else:
+                label_text = str(label)
+            # Map to checkbox state
+            is_checked, state = self._map_label_to_state(label_text, options)
+            detection = {
+                "bbox": tuple(box),  # (x0, y0, x1, y1)
+                "class": label_text,
+                "normalized_class": "checkbox",
+                "is_checked": is_checked,
+                "checkbox_state": state,
+                "confidence": score,
+                "model": options.model_repo.split("/")[-1],  # Short model name
+                "source": "checkbox",
+            }
+            detections.append(detection)
+        # Apply NMS if needed
+        if options.nms_threshold > 0:
+            detections = self._apply_nms(detections, options.nms_threshold)
+        # Limit detections if specified
+        if options.max_detections > 0 and len(detections) > options.max_detections:
+            # Sort by confidence and keep top N
+            detections = sorted(detections, key=lambda x: x["confidence"], reverse=True)
+            detections = detections[: options.max_detections]
+        return detections

natural_pdf/collections/mixins.py CHANGED Viewed

@@ -99,10 +99,6 @@ class ApplyMixin:
         results = [func(item, *args, **kwargs) for item in items_iterable]
-        # If results is empty, return an empty list
-        if not results:
-            return []
         # Import here to avoid circular imports
         from natural_pdf import PDF, Page
         from natural_pdf.core.page_collection import PageCollection
@@ -111,11 +107,24 @@ class ApplyMixin:
         from natural_pdf.elements.element_collection import ElementCollection
         from natural_pdf.elements.region import Region
+        # Determine the return type based on the input collection type
+        # This handles empty results correctly
+        if self.__class__.__name__ == "ElementCollection":
+            return ElementCollection(results)
+        elif self.__class__.__name__ == "PageCollection":
+            return PageCollection(results)
+        elif self.__class__.__name__ == "PDFCollection":
+            return PDFCollection(results)
+        # If not a known collection type, try to infer from results
+        if not results:
+            return []
         first_non_none = next((r for r in results if r is not None), None)
         first_type = type(first_non_none) if first_non_none is not None else None
         # Return the appropriate collection based on result type (...generally)
-        if issubclass(first_type, Element) or issubclass(first_type, Region):
+        if first_type and (issubclass(first_type, Element) or issubclass(first_type, Region)):
             return ElementCollection(results)
         elif first_type == PDF:
             return PDFCollection(results)

natural_pdf/core/element_manager.py CHANGED Viewed

@@ -584,13 +584,17 @@ class ElementManager:
         # Add regions if they exist
         if hasattr(self._page, "_regions") and (
-            "detected" in self._page._regions or "named" in self._page._regions
+            "detected" in self._page._regions
+            or "named" in self._page._regions
+            or "checkbox" in self._page._regions
         ):
             regions = []
             if "detected" in self._page._regions:
                 regions.extend(self._page._regions["detected"])
             if "named" in self._page._regions:
                 regions.extend(self._page._regions["named"].values())
+            if "checkbox" in self._page._regions:
+                regions.extend(self._page._regions["checkbox"])
             self._elements["regions"] = regions
             logger.debug(f"Page {self._page.number}: Added {len(regions)} regions.")
         else:

natural-pdf 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl

natural-pdf 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl