PyPI - natural-pdf - Versions diffs - 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl - Mend

natural-pdf 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

natural_pdf/__init__.py +8 -0
natural_pdf/analyzers/checkbox/__init__.py +6 -0
natural_pdf/analyzers/checkbox/base.py +265 -0
natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
natural_pdf/analyzers/checkbox/mixin.py +95 -0
natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
natural_pdf/collections/mixins.py +14 -5
natural_pdf/core/element_manager.py +5 -1
natural_pdf/core/page.py +61 -0
natural_pdf/core/page_collection.py +41 -1
natural_pdf/core/pdf.py +24 -1
natural_pdf/describe/base.py +20 -0
natural_pdf/elements/base.py +152 -10
natural_pdf/elements/element_collection.py +41 -2
natural_pdf/elements/region.py +115 -2
natural_pdf/judge.py +1509 -0
natural_pdf/selectors/parser.py +42 -1
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/METADATA +1 -1
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/RECORD +41 -17
temp/check_model.py +49 -0
temp/check_pdf_content.py +9 -0
temp/checkbox_checks.py +590 -0
temp/checkbox_simple.py +117 -0
temp/checkbox_ux_ideas.py +400 -0
temp/context_manager_prototype.py +177 -0
temp/convert_to_hf.py +60 -0
temp/demo_text_closest.py +66 -0
temp/inspect_model.py +43 -0
temp/rtdetr_dinov2_test.py +49 -0
temp/test_closest_debug.py +26 -0
temp/test_closest_debug2.py +22 -0
temp/test_context_exploration.py +85 -0
temp/test_durham.py +30 -0
temp/test_empty_string.py +16 -0
temp/test_similarity.py +15 -0
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/WHEEL +0 -0
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/top_level.txt +0 -0

temp/checkbox_ux_ideas.py ADDED Viewed

@@ -0,0 +1,400 @@
+"""
+Practical UX patterns for checkbox detection in form processing.
+"""
+import json
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from typing import List, Dict, Tuple, Optional
+from collections import defaultdict
+class CheckboxCalibrator:
+    """
+    Learn optimal thresholds from user-provided examples.
+    """
+    def __init__(self):
+        self.checked_examples = []
+        self.unchecked_examples = []
+        self.thresholds = {}
+    def add_example(self, region, is_checked: bool):
+        """Add a labeled example."""
+        metrics = self._extract_metrics(region)
+        if is_checked:
+            self.checked_examples.append(metrics)
+        else:
+            self.unchecked_examples.append(metrics)
+    def calibrate(self):
+        """Find optimal thresholds based on examples."""
+        if not self.checked_examples or not self.unchecked_examples:
+            raise ValueError("Need both checked and unchecked examples")
+        # For each metric, find threshold that best separates checked/unchecked
+        checked_df = pd.DataFrame(self.checked_examples)
+        unchecked_df = pd.DataFrame(self.unchecked_examples)
+        for metric in checked_df.columns:
+            checked_vals = checked_df[metric].values
+            unchecked_vals = unchecked_df[metric].values
+            # Find threshold that maximizes separation
+            all_vals = np.concatenate([checked_vals, unchecked_vals])
+            best_threshold = None
+            best_score = 0
+            for threshold in np.percentile(all_vals, [10, 20, 30, 40, 50, 60, 70, 80, 90]):
+                # Score based on correct classification
+                correct = np.sum(checked_vals < threshold) + np.sum(unchecked_vals >= threshold)
+                score = correct / len(all_vals)
+                if score > best_score:
+                    best_score = score
+                    best_threshold = threshold
+            self.thresholds[metric] = {
+                'value': best_threshold,
+                'accuracy': best_score,
+                'checked_mean': np.mean(checked_vals),
+                'unchecked_mean': np.mean(unchecked_vals)
+            }
+    def predict(self, region, confidence_threshold=0.7):
+        """Predict if checkbox is checked based on calibrated thresholds."""
+        metrics = self._extract_metrics(region)
+        votes = 0
+        total_weight = 0
+        for metric, value in metrics.items():
+            if metric in self.thresholds:
+                threshold_info = self.thresholds[metric]
+                weight = threshold_info['accuracy']
+                if weight > confidence_threshold:
+                    if value < threshold_info['value']:
+                        votes += weight
+                    total_weight += weight
+        confidence = votes / total_weight if total_weight > 0 else 0
+        return confidence > 0.5, confidence
+    def save(self, path: str):
+        """Save calibration to file."""
+        with open(path, 'w') as f:
+            json.dump({
+                'thresholds': self.thresholds,
+                'n_checked': len(self.checked_examples),
+                'n_unchecked': len(self.unchecked_examples)
+            }, f, indent=2)
+    def load(self, path: str):
+        """Load calibration from file."""
+        with open(path, 'r') as f:
+            data = json.load(f)
+            self.thresholds = data['thresholds']
+    def _extract_metrics(self, region):
+        """Extract key metrics from region."""
+        img = np.array(region.render(crop=True).convert('L'))
+        h, w = img.shape
+        cy, cx = h // 2, w // 2
+        # Center sample
+        center = img[max(0, cy-2):min(h, cy+3), max(0, cx-2):min(w, cx+3)]
+        return {
+            'center_darkness': np.mean(center),
+            'dark_pixel_ratio': np.sum(img < 200) / img.size,
+            'std_intensity': np.std(img),
+            'edge_center_ratio': np.mean(img[0:2, :]) / np.mean(center) if np.mean(center) > 0 else 1
+        }
+class FormCheckboxProcessor:
+    """
+    Process multiple instances of the same form with checkboxes.
+    """
+    def __init__(self, template_config: Dict):
+        """
+        template_config = {
+            'checkboxes': {
+                'option1': {'find': 'text=Acceptable', 'direction': 'left', 'width': 15},
+                'option2': {'find': 'text=Deficient', 'direction': 'left', 'width': 15},
+                'option3': {'find': 'text=At-Risk', 'direction': 'left', 'width': 15},
+            },
+            'constraints': {
+                'exactly_one': True,  # Exactly one must be checked
+                'min_checked': 1,     # At least this many
+                'max_checked': 1,     # At most this many
+            }
+        }
+        """
+        self.config = template_config
+        self.calibrator = CheckboxCalibrator()
+        self.results = []
+    def process_form(self, page, form_id: str) -> Dict:
+        """Process a single form instance."""
+        results = {'form_id': form_id, 'checkboxes': {}, 'valid': True, 'confidence': 1.0}
+        # Find all checkboxes
+        for name, config in self.config['checkboxes'].items():
+            try:
+                # Find the checkbox region
+                ref = page.find(config['find'])
+                if not ref:
+                    results['checkboxes'][name] = {'found': False}
+                    results['valid'] = False
+                    continue
+                # Navigate to checkbox
+                direction = config.get('direction', 'left')
+                width = config.get('width', 15)
+                if direction == 'left':
+                    cb = ref.left(width=width)
+                elif direction == 'right':
+                    cb = ref.right(width=width)
+                elif direction == 'above':
+                    cb = ref.above(height=width)
+                elif direction == 'below':
+                    cb = ref.below(height=width)
+                # Analyze checkbox
+                if self.calibrator.thresholds:
+                    # Use calibrated prediction
+                    is_checked, confidence = self.calibrator.predict(cb)
+                else:
+                    # Use default analysis
+                    from temp.checkbox_checks import analyze_checkbox
+                    is_checked = analyze_checkbox(cb, method='all')
+                    confidence = 0.8  # Default confidence
+                results['checkboxes'][name] = {
+                    'found': True,
+                    'checked': is_checked,
+                    'confidence': confidence,
+                    'bbox': cb.bbox
+                }
+                results['confidence'] = min(results['confidence'], confidence)
+            except Exception as e:
+                results['checkboxes'][name] = {'found': False, 'error': str(e)}
+                results['valid'] = False
+        # Check constraints
+        if results['valid'] and 'constraints' in self.config:
+            checked_count = sum(1 for cb in results['checkboxes'].values()
+                              if cb.get('found') and cb.get('checked'))
+            constraints = self.config['constraints']
+            if 'exactly_one' in constraints and constraints['exactly_one']:
+                results['constraint_met'] = checked_count == 1
+            elif 'min_checked' in constraints:
+                results['constraint_met'] = checked_count >= constraints['min_checked']
+                if 'max_checked' in constraints:
+                    results['constraint_met'] &= checked_count <= constraints['max_checked']
+            else:
+                results['constraint_met'] = True
+            results['checked_count'] = checked_count
+        self.results.append(results)
+        return results
+    def process_batch(self, pdf_paths: List[str], page_num: int = 0):
+        """Process multiple PDFs."""
+        import natural_pdf as npdf
+        for pdf_path in pdf_paths:
+            pdf = npdf.PDF(pdf_path)
+            page = pdf[page_num]
+            form_id = Path(pdf_path).stem
+            yield self.process_form(page, form_id)
+    def get_summary(self) -> pd.DataFrame:
+        """Get summary of all processed forms."""
+        data = []
+        for result in self.results:
+            row = {'form_id': result['form_id'], 'valid': result['valid']}
+            for name, info in result['checkboxes'].items():
+                row[f'{name}_checked'] = info.get('checked', False)
+                row[f'{name}_confidence'] = info.get('confidence', 0)
+            row['constraint_met'] = result.get('constraint_met', False)
+            row['overall_confidence'] = result.get('confidence', 0)
+            data.append(row)
+        return pd.DataFrame(data)
+    def flag_for_review(self, confidence_threshold: float = 0.8) -> List[str]:
+        """Get forms that need human review."""
+        return [r['form_id'] for r in self.results
+                if r['confidence'] < confidence_threshold or not r['valid']]
+class InteractiveCheckboxReviewer:
+    """
+    Interactive tool for reviewing uncertain cases.
+    """
+    def __init__(self, processor: FormCheckboxProcessor):
+        self.processor = processor
+        self.corrections = {}
+    def review_uncertain(self, confidence_threshold: float = 0.8):
+        """Show uncertain cases for review."""
+        uncertain = [r for r in self.processor.results
+                    if r['confidence'] < confidence_threshold]
+        print(f"Found {len(uncertain)} uncertain cases to review")
+        for i, result in enumerate(uncertain):
+            print(f"\n--- Form {i+1}/{len(uncertain)}: {result['form_id']} ---")
+            print(f"Overall confidence: {result['confidence']:.2f}")
+            for name, info in result['checkboxes'].items():
+                if info.get('found'):
+                    status = "✓" if info['checked'] else "✗"
+                    conf = info['confidence']
+                    print(f"{name}: {status} (confidence: {conf:.2f})")
+            # In a real implementation, show the actual checkbox images
+            correction = input("Correct? (y/n/skip): ").lower()
+            if correction == 'n':
+                # Get corrections
+                for name in result['checkboxes']:
+                    if result['checkboxes'][name].get('found'):
+                        checked = input(f"Is {name} checked? (y/n): ").lower() == 'y'
+                        self.corrections[f"{result['form_id']}_{name}"] = checked
+    def export_training_data(self, output_dir: str):
+        """Export examples for future training."""
+        # Implementation would save checkbox images with labels
+class CheckboxExampleManager:
+    """
+    Manage a library of checkbox examples.
+    """
+    def __init__(self, examples_dir: str):
+        self.examples_dir = Path(examples_dir)
+        self.examples_dir.mkdir(exist_ok=True)
+    def save_example(self, region, label: str, is_checked: bool):
+        """Save a checkbox example."""
+        subdir = self.examples_dir / ('checked' if is_checked else 'unchecked')
+        subdir.mkdir(exist_ok=True)
+        # Save image
+        img = region.render(crop=True)
+        filename = f"{label}_{len(list(subdir.glob('*.png')))}.png"
+        img.save(subdir / filename)
+        # Save metadata
+        meta = {
+            'label': label,
+            'is_checked': is_checked,
+            'bbox': region.bbox,
+            'timestamp': pd.Timestamp.now().isoformat()
+        }
+        with open(subdir / f"{filename}.json", 'w') as f:
+            json.dump(meta, f)
+    def load_examples(self) -> Tuple[List, List]:
+        """Load all examples."""
+        checked = list((self.examples_dir / 'checked').glob('*.png'))
+        unchecked = list((self.examples_dir / 'unchecked').glob('*.png'))
+        return checked, unchecked
+    def create_calibrator(self) -> CheckboxCalibrator:
+        """Create calibrator from saved examples."""
+        calibrator = CheckboxCalibrator()
+        # Mock implementation - would load actual images
+        checked, unchecked = self.load_examples()
+        print(f"Loading {len(checked)} checked and {len(unchecked)} unchecked examples")
+        return calibrator
+# Example usage patterns
+if __name__ == "__main__":
+    print("UX Pattern Examples")
+    print("=" * 60)
+    # Pattern 1: Simple pairwise comparison
+    print("\n1. PAIRWISE COMPARISON")
+    print("Pros: Simple, no setup needed")
+    print("Cons: No learning, must process each time")
+    print("""
+    from temp.checkbox_checks import compare_checkboxes
+    df = compare_checkboxes([cb1, cb2, cb3], ['A', 'B', 'C'])
+    winner = df.loc['vote_score'].idxmax()
+    print(f"Checked: {winner}")
+    """)
+    # Pattern 2: Calibration-based
+    print("\n2. CALIBRATION-BASED")
+    print("Pros: Learns from examples, improves over time")
+    print("Cons: Requires initial setup")
+    print("""
+    calibrator = CheckboxCalibrator()
+    # Add examples
+    calibrator.add_example(checked_cb, is_checked=True)
+    calibrator.add_example(unchecked_cb, is_checked=False)
+    calibrator.calibrate()
+    # Use on new checkboxes
+    is_checked, confidence = calibrator.predict(new_cb)
+    """)
+    # Pattern 3: Template-based batch processing
+    print("\n3. TEMPLATE-BASED BATCH")
+    print("Pros: Handles many forms, constraint checking")
+    print("Cons: Requires template definition")
+    print("""
+    config = {
+        'checkboxes': {
+            'acceptable': {'find': 'text=Acceptable', 'direction': 'left'},
+            'deficient': {'find': 'text=Deficient', 'direction': 'left'},
+        },
+        'constraints': {'exactly_one': True}
+    }
+    processor = FormCheckboxProcessor(config)
+    for result in processor.process_batch(pdf_files):
+        if not result['constraint_met']:
+            print(f"Invalid: {result['form_id']}")
+    """)
+    # Pattern 4: Confidence-based triage
+    print("\n4. CONFIDENCE-BASED TRIAGE")
+    print("Pros: Focuses human review on uncertain cases")
+    print("Cons: Still requires some manual review")
+    print("""
+    # Process all forms
+    results = processor.process_batch(pdfs)
+    # Auto-accept high confidence
+    high_conf = [r for r in results if r['confidence'] > 0.9]
+    # Flag for review
+    needs_review = processor.flag_for_review(confidence_threshold=0.8)
+    print(f"Review needed: {len(needs_review)} forms")
+    """)
+    # Pattern 5: Active learning
+    print("\n5. ACTIVE LEARNING")
+    print("Pros: Improves accuracy with minimal human input")
+    print("Cons: More complex implementation")
+    print("""
+    reviewer = InteractiveCheckboxReviewer(processor)
+    # Review uncertain cases
+    reviewer.review_uncertain(confidence_threshold=0.7)
+    # Update calibrator with corrections
+    for correction in reviewer.corrections:
+        calibrator.add_example(region, is_checked=correction)
+    calibrator.calibrate()
+    """)

temp/context_manager_prototype.py ADDED Viewed

@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+"""
+Prototype implementation of context manager for Natural PDF directional options.
+This demonstrates how Option 1 (Global Context Manager) would work in practice.
+"""
+import natural_pdf as npdf
+from contextlib import contextmanager
+from typing import Any, Dict, Optional
+@contextmanager
+def with_directional_options(
+    directional_offset: Optional[float] = None,
+    auto_multipage: Optional[bool] = None,
+    **kwargs
+):
+    """
+    Context manager to temporarily override directional method options.
+    Parameters
+    ----------
+    directional_offset : float, optional
+        Temporary offset in points for directional methods (default: keep current)
+    auto_multipage : bool, optional
+        Temporary setting for automatic multipage navigation (default: keep current)
+    **kwargs : dict
+        Additional layout options to override
+    Examples
+    --------
+    >>> # Temporarily use larger offset
+    >>> with with_directional_options(directional_offset=5.0):
+    ...     region = element.below()  # Uses 5.0 offset
+    >>> # Multiple options
+    >>> with with_directional_options(directional_offset=10.0, auto_multipage=True):
+    ...     region = element.below(until="Section 2")  # Can span pages
+    >>> # Nested contexts
+    >>> with with_directional_options(directional_offset=5.0):
+    ...     with with_directional_options(auto_multipage=True):
+    ...         # Both options are active here
+    ...         region = element.below()
+    Warning
+    -------
+    This context manager modifies global state and is NOT thread-safe.
+    Do not use in multi-threaded applications.
+    """
+    # Build options dict
+    options = {}
+    if directional_offset is not None:
+        options['directional_offset'] = directional_offset
+    if auto_multipage is not None:
+        options['auto_multipage'] = auto_multipage
+    options.update(kwargs)
+    # Store original values
+    original_values = {}
+    layout_options = npdf.options.layout
+    for key, value in options.items():
+        if hasattr(layout_options, key):
+            original_values[key] = getattr(layout_options, key)
+            setattr(layout_options, key, value)
+        else:
+            raise ValueError(f"Unknown layout option: {key}")
+    try:
+        yield
+    finally:
+        # Restore original values
+        for key, original_value in original_values.items():
+            setattr(layout_options, key, original_value)
+# Convenience functions for common use cases
+@contextmanager
+def with_offset(offset: float):
+    """Temporarily set directional offset."""
+    with with_directional_options(directional_offset=offset):
+        yield
+@contextmanager
+def with_multipage():
+    """Temporarily enable multipage navigation."""
+    with with_directional_options(auto_multipage=True):
+        yield
+@contextmanager
+def no_offset():
+    """Temporarily disable directional offset (set to 0)."""
+    with with_directional_options(directional_offset=0.0):
+        yield
+# Demo usage
+if __name__ == "__main__":
+    import tempfile
+    from reportlab.pdfgen import canvas
+    from reportlab.lib.pagesizes import letter
+    # Create a test PDF
+    with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp:
+        c = canvas.Canvas(tmp.name, pagesize=letter)
+        # Page 1
+        c.drawString(100, 700, "Header 1")
+        c.drawString(100, 600, "Content 1")
+        c.drawString(100, 500, "Footer 1")
+        # Page 2
+        c.showPage()
+        c.drawString(100, 700, "Header 2")
+        c.drawString(100, 600, "Content 2")
+        c.save()
+        # Test the context managers
+        print("=== Testing Context Managers ===")
+        pdf = npdf.PDF(tmp.name)
+        page = pdf.pages[0]
+        # Find header
+        header = page.find('text:contains("Header 1")')
+        # Test 1: Default behavior
+        print(f"\n1. Default offset: {npdf.options.layout.directional_offset}")
+        region1 = header.below(height=50)
+        print(f"   Region bbox: {region1.bbox}")
+        # Test 2: With custom offset
+        print(f"\n2. Using with_offset(10.0):")
+        with with_offset(10.0):
+            print(f"   Inside context: offset={npdf.options.layout.directional_offset}")
+            region2 = header.below(height=50)
+            print(f"   Region bbox: {region2.bbox}")
+            print(f"   Y difference: {region2.top - region1.top}")
+        print(f"   After context: offset={npdf.options.layout.directional_offset}")
+        # Test 3: No offset
+        print(f"\n3. Using no_offset():")
+        with no_offset():
+            region3 = header.below(height=50)
+            print(f"   Region bbox: {region3.bbox}")
+            print(f"   Includes header: {region3.top <= header.bottom}")
+        # Test 4: Multipage navigation
+        print(f"\n4. Testing multipage:")
+        print(f"   Default auto_multipage: {npdf.options.layout.auto_multipage}")
+        # This would normally stop at page boundary
+        region4 = header.below(until='text:contains("Header 2")')
+        print(f"   Without multipage: stops at page boundary")
+        # This can cross pages
+        with with_multipage():
+            print(f"   Inside context: auto_multipage={npdf.options.layout.auto_multipage}")
+            # Note: This would work if multipage was fully implemented
+            # region5 = header.below(until="Header 2")
+            # print(f"   Region type: {type(region5).__name__}")  # Would be FlowRegion
+        # Test 5: Nested contexts
+        print(f"\n5. Nested contexts:")
+        with with_offset(5.0):
+            print(f"   Outer: offset={npdf.options.layout.directional_offset}")
+            with with_offset(10.0):
+                print(f"   Inner: offset={npdf.options.layout.directional_offset}")
+            print(f"   Back to outer: offset={npdf.options.layout.directional_offset}")
+        print(f"   Back to default: offset={npdf.options.layout.directional_offset}")
+        # Cleanup
+        import os
+        os.unlink(tmp.name)

temp/convert_to_hf.py ADDED Viewed

@@ -0,0 +1,60 @@
+"""Convert RF-DETR checkpoint to HuggingFace format"""
+import torch
+import json
+from pathlib import Path
+# Load your checkpoint
+checkpoint = torch.load("model-weights/checkbox-nano.pt", map_location='cpu', weights_only=False)
+# Extract model info
+model_state = checkpoint['model']
+args = checkpoint['args']
+print(f"Model has {len(model_state)} parameters")
+print(f"Classes: {args.class_names}")
+print(f"Number of classes: {args.num_classes}")
+# Create HF-style config
+config = {
+    "architectures": ["RFDetrForObjectDetection"],
+    "model_type": "rf-detr",
+    "num_labels": args.num_classes,
+    "id2label": {str(i): label for i, label in enumerate(args.class_names)},
+    "label2id": {label: str(i) for i, label in enumerate(args.class_names)},
+    # RF-DETR specific
+    "encoder": args.encoder,  # dinov2_windowed_small
+    "hidden_dim": args.hidden_dim,
+    "num_queries": args.num_queries,
+    "dec_layers": args.dec_layers,
+    "dim_feedforward": args.dim_feedforward,
+    "dropout": args.dropout,
+    "sa_nheads": args.sa_nheads,
+    "ca_nheads": args.ca_nheads,
+    "two_stage": args.two_stage,
+    # Detection specific
+    "bbox_loss_coef": args.bbox_loss_coef,
+    "giou_loss_coef": args.giou_loss_coef,
+    "cls_loss_coef": args.cls_loss_coef,
+    # Training config
+    "resolution": args.resolution,
+    "pretrained_encoder": args.encoder,
+}
+# Save config
+output_dir = Path("temp/checkbox-rf-detr-hf")
+output_dir.mkdir(exist_ok=True)
+with open(output_dir / "config.json", "w") as f:
+    json.dump(config, f, indent=2)
+# Save model weights in HF format
+torch.save(model_state, output_dir / "pytorch_model.bin")
+print(f"\nSaved to {output_dir}")
+print("Next steps:")
+print("1. Copy the custom RF-DETR implementation files from Thastp/rf-detr-base")
+print("2. Upload to HuggingFace Hub with these files")
+print("3. Use with transformers library")

natural-pdf 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl

natural-pdf 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl