PyPI - natural-pdf - Versions diffs - 0.1.0__py3-none-any.whl - Mend

natural-pdf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

natural_pdf/__init__.py +55 -0
natural_pdf/analyzers/__init__.py +6 -0
natural_pdf/analyzers/layout/__init__.py +1 -0
natural_pdf/analyzers/layout/base.py +151 -0
natural_pdf/analyzers/layout/docling.py +247 -0
natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
natural_pdf/analyzers/layout/layout_manager.py +200 -0
natural_pdf/analyzers/layout/layout_options.py +78 -0
natural_pdf/analyzers/layout/paddle.py +240 -0
natural_pdf/analyzers/layout/surya.py +151 -0
natural_pdf/analyzers/layout/tatr.py +251 -0
natural_pdf/analyzers/layout/yolo.py +165 -0
natural_pdf/analyzers/text_options.py +60 -0
natural_pdf/analyzers/text_structure.py +270 -0
natural_pdf/analyzers/utils.py +57 -0
natural_pdf/core/__init__.py +3 -0
natural_pdf/core/element_manager.py +457 -0
natural_pdf/core/highlighting_service.py +698 -0
natural_pdf/core/page.py +1444 -0
natural_pdf/core/pdf.py +653 -0
natural_pdf/elements/__init__.py +3 -0
natural_pdf/elements/base.py +761 -0
natural_pdf/elements/collections.py +1345 -0
natural_pdf/elements/line.py +140 -0
natural_pdf/elements/rect.py +122 -0
natural_pdf/elements/region.py +1793 -0
natural_pdf/elements/text.py +304 -0
natural_pdf/ocr/__init__.py +56 -0
natural_pdf/ocr/engine.py +104 -0
natural_pdf/ocr/engine_easyocr.py +179 -0
natural_pdf/ocr/engine_paddle.py +204 -0
natural_pdf/ocr/engine_surya.py +171 -0
natural_pdf/ocr/ocr_manager.py +191 -0
natural_pdf/ocr/ocr_options.py +114 -0
natural_pdf/qa/__init__.py +3 -0
natural_pdf/qa/document_qa.py +396 -0
natural_pdf/selectors/__init__.py +4 -0
natural_pdf/selectors/parser.py +354 -0
natural_pdf/templates/__init__.py +1 -0
natural_pdf/templates/ocr_debug.html +517 -0
natural_pdf/utils/__init__.py +3 -0
natural_pdf/utils/highlighting.py +12 -0
natural_pdf/utils/reading_order.py +227 -0
natural_pdf/utils/visualization.py +223 -0
natural_pdf/widgets/__init__.py +4 -0
natural_pdf/widgets/frontend/viewer.js +88 -0
natural_pdf/widgets/viewer.py +765 -0
natural_pdf-0.1.0.dist-info/METADATA +295 -0
natural_pdf-0.1.0.dist-info/RECORD +52 -0
natural_pdf-0.1.0.dist-info/WHEEL +5 -0
natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
natural_pdf-0.1.0.dist-info/top_level.txt +1 -0

natural_pdf/analyzers/layout/tatr.py ADDED Viewed

@@ -0,0 +1,251 @@
+# natural_pdf/analyzers/layout/tatr.py
+import logging
+import importlib.util
+import os
+import tempfile
+from typing import List, Dict, Any, Optional, Tuple
+from PIL import Image
+# Assuming base class and options are importable
+from .base import LayoutDetector
+from .layout_options import TATRLayoutOptions, BaseLayoutOptions
+logger = logging.getLogger(__name__)
+# Check for dependencies
+torch_spec = importlib.util.find_spec("torch")
+torchvision_spec = importlib.util.find_spec("torchvision")
+transformers_spec = importlib.util.find_spec("transformers")
+torch = None
+transforms = None
+AutoModelForObjectDetection = None
+if torch_spec and torchvision_spec and transformers_spec:
+    try:
+        import torch
+        from torchvision import transforms
+        from transformers import AutoModelForObjectDetection
+    except ImportError as e:
+        logger.warning(f"Could not import TATR dependencies (torch, torchvision, transformers): {e}")
+else:
+    logger.warning("torch, torchvision, or transformers not found. TableTransformerDetector will not be available.")
+class TableTransformerDetector(LayoutDetector):
+    """Table structure detector using Microsoft's Table Transformer (TATR) models."""
+    # Custom resize transform (keep as nested class or move outside)
+    class MaxResize(object):
+        def __init__(self, max_size=800):
+            self.max_size = max_size
+        def __call__(self, image):
+            width, height = image.size
+            current_max_size = max(width, height)
+            scale = self.max_size / current_max_size
+            # Use LANCZOS for resizing
+            resized_image = image.resize((int(round(scale*width)), int(round(scale*height))), Image.Resampling.LANCZOS)
+            return resized_image
+    def __init__(self):
+        super().__init__()
+        self.supported_classes = {
+            'table', 'table row', 'table column', 'table column header', 'table projected row header', 'table spanning cell' # Add others if supported by models used
+        }
+        # Models are loaded via _get_model
+    def is_available(self) -> bool:
+        """Check if dependencies are installed."""
+        return torch is not None and transforms is not None and AutoModelForObjectDetection is not None
+    def _get_cache_key(self, options: TATRLayoutOptions) -> str:
+        """Generate cache key based on model IDs and device."""
+        if not isinstance(options, TATRLayoutOptions):
+            options = TATRLayoutOptions(device=options.device)
+        device_key = str(options.device).lower()
+        det_model_key = options.detection_model.replace('/','_')
+        struct_model_key = options.structure_model.replace('/','_')
+        return f"{self.__class__.__name__}_{device_key}_{det_model_key}_{struct_model_key}"
+    def _load_model_from_options(self, options: TATRLayoutOptions) -> Dict[str, Any]:
+        """Load the TATR detection and structure models."""
+        if not self.is_available():
+             raise RuntimeError("TATR dependencies (torch, torchvision, transformers) not installed.")
+        device = options.device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.logger.info(f"Loading TATR models: Detection='{options.detection_model}', Structure='{options.structure_model}' onto device='{device}'")
+        try:
+            detection_model = AutoModelForObjectDetection.from_pretrained(
+                options.detection_model, revision="no_timm" # Important revision for some versions
+            ).to(device)
+            structure_model = AutoModelForObjectDetection.from_pretrained(
+                options.structure_model
+            ).to(device)
+            self.logger.info("TATR models loaded.")
+            return {'detection': detection_model, 'structure': structure_model}
+        except Exception as e:
+            self.logger.error(f"Failed to load TATR models: {e}", exc_info=True)
+            raise
+    # --- Helper methods (box_cxcywh_to_xyxy, rescale_bboxes, outputs_to_objects) ---
+    # Keep these as defined in the original tatr.txt file, making them instance methods
+    def box_cxcywh_to_xyxy(self, x):
+        x_c, y_c, w, h = x.unbind(-1)
+        b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
+        return torch.stack(b, dim=1)
+    def rescale_bboxes(self, out_bbox, size):
+        img_w, img_h = size
+        boxes = self.box_cxcywh_to_xyxy(out_bbox)
+        boxes = boxes * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32).to(out_bbox.device) # Ensure tensor on correct device
+        return boxes
+    def outputs_to_objects(self, outputs, img_size, id2label):
+        logits = outputs.logits
+        bboxes = outputs.pred_boxes
+        # Use softmax activation function
+        prob = logits.softmax(-1)[0, :, :-1] # Exclude the "no object" class
+        scores, labels = prob.max(-1)
+        # Convert to absolute coordinates
+        img_w, img_h = img_size
+        boxes = self.rescale_bboxes(bboxes[0, ...], (img_w, img_h)) # Pass tuple size
+        # Move results to CPU for list comprehension
+        scores = scores.cpu().tolist()
+        labels = labels.cpu().tolist()
+        boxes = boxes.cpu().tolist()
+        objects = []
+        for score, label_idx, bbox in zip(scores, labels, boxes):
+            class_label = id2label.get(label_idx, 'unknown') # Use get with default
+            if class_label != 'no object' and class_label != 'unknown':
+                objects.append({
+                    'label': class_label,
+                    'score': float(score),
+                    'bbox': [round(float(c), 2) for c in bbox] # Round coordinates
+                })
+        return objects
+    # --- End Helper Methods ---
+    def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
+        """Detect tables and their structure in an image."""
+        if not self.is_available():
+            raise RuntimeError("TATR dependencies (torch, torchvision, transformers) not installed.")
+        if not isinstance(options, TATRLayoutOptions):
+             self.logger.warning("Received BaseLayoutOptions, expected TATRLayoutOptions. Using defaults.")
+             options = TATRLayoutOptions(
+                 confidence=options.confidence, classes=options.classes,
+                 exclude_classes=options.exclude_classes, device=options.device,
+                 extra_args=options.extra_args
+             )
+        self.validate_classes(options.classes or [])
+        if options.exclude_classes:
+            self.validate_classes(options.exclude_classes)
+        models = self._get_model(options)
+        detection_model = models['detection']
+        structure_model = models['structure']
+        device = options.device or ("cuda" if torch.cuda.is_available() else "cpu")
+        # Prepare transforms based on options
+        detection_transform = transforms.Compose([
+            self.MaxResize(options.max_detection_size),
+            transforms.ToTensor(),
+            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+        ])
+        structure_transform = transforms.Compose([
+            self.MaxResize(options.max_structure_size),
+            transforms.ToTensor(),
+            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+        ])
+        # --- Detect Tables ---
+        self.logger.debug("Running TATR table detection...")
+        pixel_values = detection_transform(image.convert("RGB")).unsqueeze(0).to(device)
+        with torch.no_grad():
+            outputs = detection_model(pixel_values)
+        id2label_det = detection_model.config.id2label
+        id2label_det[detection_model.config.num_labels] = "no object" # Add no object class
+        tables = self.outputs_to_objects(outputs, image.size, id2label_det)
+        tables = [t for t in tables if t['score'] >= options.confidence and t['label'] == 'table'] # Filter for tables
+        self.logger.debug(f"Detected {len(tables)} table regions.")
+        all_detections = []
+        # Add table detections if requested
+        normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
+        normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()
+        if normalized_classes_req is None or 'table' in normalized_classes_req:
+             if 'table' not in normalized_classes_excl:
+                 for table in tables:
+                     all_detections.append({
+                         'bbox': tuple(table['bbox']),
+                         'class': 'table',
+                         'confidence': float(table['score']),
+                         'normalized_class': 'table',
+                         'source': 'layout',
+                         'model': 'tatr'
+                     })
+        # --- Process Structure ---
+        structure_class_names = {'table row', 'table column', 'table column header', 'table projected row header', 'table spanning cell'}
+        normalized_structure_classes = {self._normalize_class_name(c) for c in structure_class_names}
+        needed_structure = False
+        if normalized_classes_req is None: # If no specific classes requested
+             needed_structure = any(norm_cls not in normalized_classes_excl for norm_cls in normalized_structure_classes)
+        else: # Specific classes requested
+             needed_structure = any(norm_cls in normalized_classes_req for norm_cls in normalized_structure_classes)
+        if needed_structure and tables:
+            self.logger.debug("Running TATR structure recognition...")
+            id2label_struct = structure_model.config.id2label
+            id2label_struct[structure_model.config.num_labels] = "no object"
+            for table in tables:
+                x_min, y_min, x_max, y_max = map(int, table['bbox'])
+                # Ensure coordinates are within image bounds
+                x_min, y_min = max(0, x_min), max(0, y_min)
+                x_max, y_max = min(image.width, x_max), min(image.height, y_max)
+                if x_max <= x_min or y_max <= y_min: continue # Skip invalid crop
+                cropped_table = image.crop((x_min, y_min, x_max, y_max))
+                if cropped_table.width == 0 or cropped_table.height == 0: continue # Skip empty crop
+                pixel_values_struct = structure_transform(cropped_table).unsqueeze(0).to(device)
+                with torch.no_grad():
+                    outputs_struct = structure_model(pixel_values_struct)
+                structure_elements = self.outputs_to_objects(outputs_struct, cropped_table.size, id2label_struct)
+                structure_elements = [e for e in structure_elements if e['score'] >= options.confidence]
+                for element in structure_elements:
+                    element_class_orig = element['label']
+                    normalized_class = self._normalize_class_name(element_class_orig)
+                    # Apply class filtering
+                    if normalized_classes_req and normalized_class not in normalized_classes_req: continue
+                    if normalized_class in normalized_classes_excl: continue
+                    # Adjust coordinates
+                    ex0, ey0, ex1, ey1 = element['bbox']
+                    adj_bbox = (ex0 + x_min, ey0 + y_min, ex1 + x_min, ey1 + y_min)
+                    all_detections.append({
+                        'bbox': adj_bbox,
+                        'class': element_class_orig,
+                        'confidence': float(element['score']),
+                        'normalized_class': normalized_class,
+                        'source': 'layout',
+                        'model': 'tatr'
+                    })
+            self.logger.debug(f"Added {len(all_detections) - len(tables)} structure elements.")
+        self.logger.info(f"TATR detected {len(all_detections)} layout elements matching criteria.")
+        return all_detections

natural_pdf/analyzers/layout/yolo.py ADDED Viewed

@@ -0,0 +1,165 @@
+# natural_pdf/analyzers/layout/yolo.py
+import logging
+import importlib.util
+import os
+import tempfile
+from typing import List, Dict, Any, Optional
+from PIL import Image
+# Assuming base class and options are importable
+try:
+    from .base import LayoutDetector
+    from .layout_options import YOLOLayoutOptions, BaseLayoutOptions
+except ImportError:
+    # Placeholders if run standalone or imports fail
+    class BaseLayoutOptions: pass
+    class YOLOLayoutOptions(BaseLayoutOptions): pass
+    class LayoutDetector:
+         def __init__(self): self.logger=logging.getLogger(); self.supported_classes=set()
+         def _get_model(self, options): raise NotImplementedError
+         def _normalize_class_name(self, n): return n
+         def validate_classes(self, c): pass
+    logging.basicConfig()
+logger = logging.getLogger(__name__)
+# Check for dependencies
+yolo_spec = importlib.util.find_spec("doclayout_yolo")
+hf_spec = importlib.util.find_spec("huggingface_hub")
+YOLOv10 = None
+hf_hub_download = None
+if yolo_spec and hf_spec:
+    try:
+        from doclayout_yolo import YOLOv10
+        from huggingface_hub import hf_hub_download
+    except ImportError as e:
+        logger.warning(f"Could not import YOLO dependencies: {e}")
+else:
+    logger.warning("doclayout_yolo or huggingface_hub not found. YOLODocLayoutDetector will not be available.")
+class YOLODocLayoutDetector(LayoutDetector):
+    """Document layout detector using YOLO model."""
+    def __init__(self):
+        super().__init__()
+        self.supported_classes = {
+            'title', 'plain text', 'abandon', 'figure', 'figure_caption',
+            'table', 'table_caption', 'table_footnote', 'isolate_formula',
+            'formula_caption'
+        }
+    def is_available(self) -> bool:
+        """Check if dependencies are installed."""
+        return YOLOv10 is not None and hf_hub_download is not None
+    def _get_cache_key(self, options: YOLOLayoutOptions) -> str:
+        """Generate cache key based on model repo/file and device."""
+        # Ensure options is the correct type
+        if not isinstance(options, YOLOLayoutOptions):
+             # This shouldn't happen if called correctly, but handle defensively
+             options = YOLOLayoutOptions(device=options.device) # Use base device
+        device_key = str(options.device).lower()
+        model_key = f"{options.model_repo.replace('/','_')}_{options.model_file}"
+        return f"{self.__class__.__name__}_{device_key}_{model_key}"
+    def _load_model_from_options(self, options: YOLOLayoutOptions) -> Any:
+        """Load the YOLOv10 model based on options."""
+        if not self.is_available():
+             raise RuntimeError("YOLO dependencies (doclayout_yolo, huggingface_hub) not installed.")
+        self.logger.info(f"Loading YOLO model: {options.model_repo}/{options.model_file}")
+        try:
+            model_path = hf_hub_download(repo_id=options.model_repo, filename=options.model_file)
+            model = YOLOv10(model_path)
+            self.logger.info("YOLO model loaded.")
+            return model
+        except Exception as e:
+            self.logger.error(f"Failed to download or load YOLO model: {e}", exc_info=True)
+            raise
+    def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
+        """Detect layout elements in an image using YOLO."""
+        if not self.is_available():
+            raise RuntimeError("YOLO dependencies (doclayout_yolo, huggingface_hub) not installed.")
+        # Ensure options are the correct type, falling back to defaults if base type passed
+        if not isinstance(options, YOLOLayoutOptions):
+             self.logger.warning("Received BaseLayoutOptions, expected YOLOLayoutOptions. Using defaults.")
+             options = YOLOLayoutOptions(
+                 confidence=options.confidence, classes=options.classes,
+                 exclude_classes=options.exclude_classes, device=options.device,
+                 extra_args=options.extra_args
+             )
+        # Validate classes before proceeding
+        self.validate_classes(options.classes or [])
+        if options.exclude_classes:
+            self.validate_classes(options.exclude_classes)
+        # Get the cached/loaded model
+        model = self._get_model(options)
+        # YOLOv10 predict method requires an image path. Save temp file.
+        detections = []
+        # Use a context manager for robust temp file handling
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_image_path = os.path.join(temp_dir, "temp_layout_image.png")
+            try:
+                self.logger.debug(f"Saving temporary image for YOLO detector to: {temp_image_path}")
+                image.convert("RGB").save(temp_image_path) # Ensure RGB
+                # Run model prediction
+                self.logger.debug(f"Running YOLO prediction (imgsz={options.image_size}, conf={options.confidence}, device={options.device})...")
+                results = model.predict(
+                    temp_image_path,
+                    imgsz=options.image_size,
+                    conf=options.confidence,
+                    device=options.device or 'cpu' # Default to cpu if None
+                    # Add other predict args from options.extra_args if needed
+                    # **options.extra_args
+                )
+                self.logger.debug(f"YOLO prediction returned {len(results)} result objects.")
+                # Process results into standardized format
+                img_width, img_height = image.size # Get original image size for context if needed
+                for result in results:
+                    if result.boxes is None: continue
+                    boxes = result.boxes.xyxy
+                    labels = result.boxes.cls
+                    scores = result.boxes.conf
+                    class_names = result.names # Dictionary mapping index to name
+                    for box, label_idx_tensor, score_tensor in zip(boxes, labels, scores):
+                        x_min, y_min, x_max, y_max = map(float, box.tolist())
+                        label_idx = int(label_idx_tensor.item()) # Get int index
+                        score = float(score_tensor.item()) # Get float score
+                        if label_idx not in class_names:
+                             self.logger.warning(f"Label index {label_idx} not found in model names dict. Skipping.")
+                             continue
+                        label_name = class_names[label_idx]
+                        normalized_class = self._normalize_class_name(label_name)
+                        # Apply class filtering (using normalized names)
+                        if options.classes and normalized_class not in [self._normalize_class_name(c) for c in options.classes]:
+                            continue
+                        if options.exclude_classes and normalized_class in [self._normalize_class_name(c) for c in options.exclude_classes]:
+                            continue
+                        detections.append({
+                            'bbox': (x_min, y_min, x_max, y_max),
+                            'class': label_name,
+                            'confidence': score,
+                            'normalized_class': normalized_class,
+                            'source': 'layout',
+                            'model': 'yolo'
+                        })
+                self.logger.info(f"YOLO detected {len(detections)} layout elements matching criteria.")
+            except Exception as e:
+                self.logger.error(f"Error during YOLO detection: {e}", exc_info=True)
+                raise # Re-raise the exception
+        return detections

natural_pdf/analyzers/text_options.py ADDED Viewed

@@ -0,0 +1,60 @@
+import logging
+from dataclasses import dataclass, field
+from typing import List, Optional
+logger = logging.getLogger(__name__)
+@dataclass
+class TextStyleOptions:
+    """Options for configuring text style analysis."""
+    # Properties to consider when grouping elements by style
+    group_by: List[str] = field(default_factory=lambda: ['size', 'fontname', 'is_bold', 'is_italic', 'color'])
+    # Tolerance for comparing font sizes (e.g., 0.5 rounds to nearest 0.5 point)
+    size_tolerance: float = 0.5
+    # If True, ignores text color during grouping
+    ignore_color: bool = False
+    # If True, ignores small variations often found in font names (e.g., '+ArialMT')
+    normalize_fontname: bool = True
+    # If True, generates descriptive labels (e.g., "12pt-Bold-Arial")
+    # If False, uses simple numeric labels ("Style 1")
+    descriptive_labels: bool = True
+    # Prefix for generated labels (used if descriptive_labels is False or as fallback)
+    label_prefix: str = "Style"
+    # Format string for descriptive labels. Placeholders match keys in style_properties dict.
+    # Example: "{size}pt {weight}{style} {family} ({color})"
+    # Available keys: size, fontname, is_bold, is_italic, color, weight, style, family
+    label_format: str = "{size}pt {weight}{style} {family}" # Default format without color
+    def __post_init__(self):
+        # Validate size_tolerance
+        if self.size_tolerance <= 0:
+            logger.warning(f"size_tolerance must be positive, setting to 0.1. Original value: {self.size_tolerance}")
+            self.size_tolerance = 0.1
+        # Ensure 'size' is always considered if tolerance is relevant
+        if 'size' not in self.group_by and self.size_tolerance > 0:
+            logger.debug("Adding 'size' to group_by keys because size_tolerance is set.")
+            if 'size' not in self.group_by: self.group_by.append('size')
+        if self.ignore_color and 'color' in self.group_by:
+            logger.debug("Removing 'color' from group_by keys because ignore_color is True.")
+            self.group_by = [key for key in self.group_by if key != 'color']
+        elif not self.ignore_color and 'color' not in self.group_by:
+             # If color isn't ignored, ensure it's included if requested in label format?
+             # For now, just rely on explicit group_by setting.
+             pass
+        # Basic validation for group_by keys
+        allowed_keys = {'size', 'fontname', 'is_bold', 'is_italic', 'color'}
+        invalid_keys = set(self.group_by) - allowed_keys
+        if invalid_keys:
+            logger.warning(f"Invalid keys found in group_by: {invalid_keys}. Allowed keys: {allowed_keys}. Ignoring invalid keys.")
+            self.group_by = [key for key in self.group_by if key in allowed_keys]