PyPI - doctra - Versions diffs - 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

doctra 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

doctra/__init__.py +4 -0
doctra/cli/main.py +168 -0
doctra/engines/image_restoration/__init__.py +10 -0
doctra/engines/image_restoration/docres_engine.py +566 -0
doctra/engines/vlm/service.py +0 -12
doctra/parsers/enhanced_pdf_parser.py +370 -0
doctra/parsers/structured_pdf_parser.py +11 -60
doctra/parsers/table_chart_extractor.py +8 -44
doctra/third_party/docres/data/MBD/MBD.py +110 -0
doctra/third_party/docres/data/MBD/MBD_utils.py +291 -0
doctra/third_party/docres/data/MBD/infer.py +151 -0
doctra/third_party/docres/data/MBD/model/deep_lab_model/aspp.py +95 -0
doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/__init__.py +13 -0
doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/drn.py +402 -0
doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/mobilenet.py +151 -0
doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/resnet.py +170 -0
doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/xception.py +288 -0
doctra/third_party/docres/data/MBD/model/deep_lab_model/decoder.py +59 -0
doctra/third_party/docres/data/MBD/model/deep_lab_model/deeplab.py +81 -0
doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/__init__.py +12 -0
doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/batchnorm.py +282 -0
doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/comm.py +129 -0
doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/replicate.py +88 -0
doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.py +29 -0
doctra/third_party/docres/data/preprocess/crop_merge_image.py +142 -0
doctra/third_party/docres/inference.py +370 -0
doctra/third_party/docres/models/restormer_arch.py +308 -0
doctra/third_party/docres/utils.py +464 -0
doctra/ui/app.py +5 -32
doctra/utils/progress.py +13 -98
doctra/utils/structured_utils.py +45 -49
doctra/version.py +1 -1
{doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/METADATA +1 -1
doctra-0.4.0.dist-info/RECORD +67 -0
doctra-0.3.2.dist-info/RECORD +0 -44
{doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/WHEEL +0 -0
{doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/licenses/LICENSE +0 -0
{doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/top_level.txt +0 -0

doctra/parsers/enhanced_pdf_parser.py ADDED Viewed

@@ -0,0 +1,370 @@
+"""
+Enhanced PDF Parser with Image Restoration
+This module provides an enhanced PDF parser that combines the structured parsing
+capabilities with DocRes image restoration for improved document processing.
+"""
+from __future__ import annotations
+import os
+import sys
+from typing import List, Dict, Any, Optional, Union
+from contextlib import ExitStack
+from PIL import Image
+from tqdm import tqdm
+from doctra.parsers.structured_pdf_parser import StructuredPDFParser
+from doctra.engines.image_restoration import DocResEngine
+from doctra.utils.pdf_io import render_pdf_to_images
+from doctra.utils.constants import IMAGE_SUBDIRS
+from doctra.utils.file_ops import ensure_output_dirs
+from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
+class EnhancedPDFParser(StructuredPDFParser):
+    """
+    Enhanced PDF Parser with Image Restoration capabilities.
+
+    Extends the StructuredPDFParser with DocRes image restoration to improve
+    document quality before processing. This is particularly useful for:
+    - Scanned documents with shadows or distortion
+    - Low-quality PDFs that need enhancement
+    - Documents with perspective issues
+
+    Layout detection is run on the original PDF at ``dpi``; restored pages are
+    rendered at ``restoration_dpi`` and then resized to the ``dpi`` pixel frame
+    so that the detected boxes can be cropped from them.
+
+    :param use_image_restoration: Whether to apply DocRes image restoration (default: True)
+    :param restoration_task: DocRes task to use ("dewarping", "deshadowing", "appearance", "deblurring", "binarization", "end2end", default: "appearance")
+    :param restoration_device: Device for DocRes processing ("cuda", "cpu", or None for auto-detect, default: None)
+    :param restoration_dpi: DPI for restoration processing (default: 200)
+    :param use_vlm: Whether to use VLM for structured data extraction (default: False)
+    :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
+    :param vlm_model: Model name to use (defaults to provider-specific defaults)
+    :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
+    :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
+    :param dpi: DPI for PDF rendering (default: 200)
+    :param min_score: Minimum confidence score for layout detection (default: 0.0)
+    :param ocr_lang: OCR language code (default: "eng")
+    :param ocr_psm: Tesseract page segmentation mode (default: 4)
+    :param ocr_oem: Tesseract OCR engine mode (default: 3)
+    :param ocr_extra_config: Additional Tesseract configuration (default: "")
+    :param box_separator: Separator between text boxes in output (default: "\\n")
+    """
+
+    def __init__(
+        self,
+        *,
+        use_image_restoration: bool = True,
+        restoration_task: str = "appearance",
+        restoration_device: Optional[str] = None,
+        restoration_dpi: int = 200,
+        use_vlm: bool = False,
+        vlm_provider: str = "gemini",
+        vlm_model: str | None = None,
+        vlm_api_key: str | None = None,
+        layout_model_name: str = "PP-DocLayout_plus-L",
+        dpi: int = 200,
+        min_score: float = 0.0,
+        ocr_lang: str = "eng",
+        ocr_psm: int = 4,
+        ocr_oem: int = 3,
+        ocr_extra_config: str = "",
+        box_separator: str = "\n",
+    ):
+        """
+        Initialize the Enhanced PDF Parser with image restoration capabilities.
+
+        The parent parser is configured first; the DocRes engine is then created
+        when restoration is requested. If the engine cannot be created, a warning
+        is printed and the parser continues with restoration disabled.
+        """
+        super().__init__(
+            use_vlm=use_vlm,
+            vlm_provider=vlm_provider,
+            vlm_model=vlm_model,
+            vlm_api_key=vlm_api_key,
+            layout_model_name=layout_model_name,
+            dpi=dpi,
+            min_score=min_score,
+            ocr_lang=ocr_lang,
+            ocr_psm=ocr_psm,
+            ocr_oem=ocr_oem,
+            ocr_extra_config=ocr_extra_config,
+            box_separator=box_separator,
+        )
+        # Image restoration settings
+        self.use_image_restoration = use_image_restoration
+        self.restoration_task = restoration_task
+        self.restoration_device = restoration_device
+        self.restoration_dpi = restoration_dpi
+        # The engine stays None unless restoration is enabled and it loads.
+        self.docres_engine = None
+        if self.use_image_restoration:
+            try:
+                self.docres_engine = DocResEngine(
+                    device=restoration_device,
+                    use_half_precision=True
+                )
+                print(f"✅ DocRes engine initialized with task: {restoration_task}")
+            except Exception as e:
+                # Best effort: a missing model or device must not prevent plain parsing.
+                print(f"⚠️ DocRes initialization failed: {e}")
+                print("   Continuing without image restoration...")
+                self.use_image_restoration = False
+                self.docres_engine = None
+
+    def parse(self, pdf_path: str, enhanced_output_dir: Optional[str] = None) -> None:
+        """
+        Parse a PDF document with optional image restoration.
+
+        :param pdf_path: Path to the input PDF file
+        :param enhanced_output_dir: Directory for enhanced images (if None, uses
+            ``outputs/<pdf name>/enhanced_parse``)
+        :return: None
+        """
+        pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
+        # Set up output directories
+        if enhanced_output_dir is None:
+            out_dir = f"outputs/{pdf_filename}/enhanced_parse"
+        else:
+            out_dir = enhanced_output_dir
+        os.makedirs(out_dir, exist_ok=True)
+        ensure_output_dirs(out_dir, IMAGE_SUBDIRS)
+        # Page images used for cropping and OCR: restored when possible, raw otherwise.
+        if self.use_image_restoration and self.docres_engine:
+            print(f"🔄 Processing PDF with image restoration: {os.path.basename(pdf_path)}")
+            pil_pages = self._process_pages_with_restoration(pdf_path, out_dir)
+        else:
+            print(f"🔄 Processing PDF without image restoration: {os.path.basename(pdf_path)}")
+            pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
+        # NOTE: detection is given the PDF path, so it runs on the original
+        # rendering at self.dpi; only cropping and OCR use the restored pages.
+        print("🔍 Running layout detection on enhanced pages...")
+        pages = self.layout_engine.predict_pdf(
+            pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
+        )
+        self._process_parsing_logic(pages, pil_pages, out_dir, pdf_filename)
+
+    @staticmethod
+    def _new_progress_bar(total: int, desc: str):
+        """Create the notebook or terminal progress bar, depending on the loaded modules."""
+        if "ipykernel" in sys.modules or "jupyter" in sys.modules:
+            return create_notebook_friendly_bar(total=total, desc=desc)
+        return create_beautiful_progress_bar(total=total, desc=desc, leave=True)
+
+    def _fit_to_layout_frame(self, image: Image.Image, rendered_size) -> Image.Image:
+        """
+        Resize ``image`` to the pixel frame the layout boxes are expressed in.
+
+        :param image: Restored (or original) page image
+        :param rendered_size: ``(width, height)`` of the page as rendered at ``restoration_dpi``
+        :return: ``image`` itself when it already has the target size, else a resized copy
+        """
+        # Layout detection is run at self.dpi, so scale the restoration-DPI size
+        # by the DPI ratio; rounding may differ from a real render by a pixel.
+        scale = self.dpi / self.restoration_dpi
+        target = (
+            max(1, round(rendered_size[0] * scale)),
+            max(1, round(rendered_size[1] * scale)),
+        )
+        if image.size == target:
+            return image
+        return image.resize(target, Image.LANCZOS)
+
+    def _process_pages_with_restoration(self, pdf_path: str, out_dir: str) -> List[Image.Image]:
+        """
+        Process PDF pages with DocRes image restoration.
+
+        Exactly one image is returned per rendered page, in page order: the
+        restored page, or the original rendering when restoration fails. Every
+        returned image is resized to the ``self.dpi`` pixel frame. The restored
+        pages are also written, at their native size, to ``<out_dir>/enhanced_pages``.
+
+        :param pdf_path: Path to the input PDF file
+        :param out_dir: Output directory for enhanced images
+        :return: List of enhanced PIL images
+        """
+        import numpy as np
+
+        # Render original pages
+        original_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.restoration_dpi)]
+        if not original_pages:
+            print("❌ No pages found in PDF")
+            return []
+        total = len(original_pages)
+        progress_bar = self._new_progress_bar(total, f"🔄 DocRes {self.restoration_task}")
+        enhanced_pages: List[Image.Image] = []
+        enhanced_dir = os.path.join(out_dir, "enhanced_pages")
+        os.makedirs(enhanced_dir, exist_ok=True)
+        try:
+            with progress_bar:
+                for i, page_img in enumerate(original_pages):
+                    try:
+                        restored_img, _ = self.docres_engine.restore_image(
+                            np.array(page_img),
+                            task=self.restoration_task
+                        )
+                        enhanced_page = Image.fromarray(restored_img)
+                    except Exception as e:
+                        print(f"  ⚠️ Page {i+1} restoration failed: {e}, using original")
+                        enhanced_pages.append(self._fit_to_layout_frame(page_img, page_img.size))
+                        progress_bar.set_description(f"⚠️ Page {i+1} failed, using original")
+                        progress_bar.update(1)
+                        continue
+                    # Saving is only for reference; a failure here must not add a
+                    # second entry for the page or discard the restored image.
+                    try:
+                        enhanced_path = os.path.join(enhanced_dir, f"page_{i+1:03d}_enhanced.jpg")
+                        enhanced_page.save(enhanced_path, "JPEG", quality=95)
+                    except Exception as e:
+                        print(f"  ⚠️ Page {i+1} enhanced image could not be saved: {e}")
+                    enhanced_pages.append(self._fit_to_layout_frame(enhanced_page, page_img.size))
+                    progress_bar.set_description(f"✅ Page {i+1}/{total} enhanced")
+                    progress_bar.update(1)
+        finally:
+            # Kept from the original: close explicitly in case leaving the
+            # `with` block does not already do so.
+            if hasattr(progress_bar, 'close'):
+                progress_bar.close()
+        print(f"✅ Image restoration completed. Enhanced pages saved to: {enhanced_dir}")
+        return enhanced_pages
+
+    def _vlm_table_markdown(self, kind: str, img_path: str, page_num: int,
+                            structured_items: List[Dict[str, Any]]) -> Optional[str]:
+        """
+        Convert a cropped chart/table image into a Markdown table through the VLM.
+
+        :param kind: ``"chart"`` or ``"table"``; selects ``extract_chart`` or ``extract_table``
+        :param img_path: Absolute path of the cropped image
+        :param page_num: Page number, used only in the warning message
+        :param structured_items: List that receives the structured item on success
+        :return: Markdown table text, or None when extraction yields nothing or fails
+        """
+        from doctra.utils.structured_utils import to_structured_dict
+        from doctra.exporters.markdown_table import render_markdown_table
+        try:
+            extract = self.vlm.extract_chart if kind == "chart" else self.vlm.extract_table
+            item = to_structured_dict(extract(img_path))
+            if item:
+                structured_items.append(item)
+                return render_markdown_table(item.get("headers"), item.get("rows"),
+                                             title=item.get("title"))
+        except Exception as e:
+            # Best effort: the caller falls back to embedding the image.
+            print(f"  ⚠️ VLM {kind} extraction failed on page {page_num}: {e}, using image")
+        return None
+
+    def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename):
+        """
+        Process the parsing logic with enhanced pages.
+
+        This is extracted from the parent class to allow customization. Boxes
+        whose label is in ``EXCLUDE_LABELS`` are saved as images (charts and
+        tables are converted to Markdown tables when the VLM is enabled); all
+        other boxes are OCR'd. Markdown and HTML are then written to ``out_dir``,
+        plus ``tables.xlsx`` / ``tables.html`` when structured items were extracted.
+
+        :param pages: Layout pages, each exposing ``page_index`` and ``boxes``
+        :param pil_pages: Page images, indexed by ``page_index - 1``
+        :param out_dir: Output directory
+        :param pdf_filename: PDF file name without extension (not used here)
+        """
+        from doctra.utils.constants import EXCLUDE_LABELS
+        from doctra.parsers.layout_order import reading_order_key
+        from doctra.utils.ocr_utils import ocr_box_text
+        from doctra.exporters.image_saver import save_box_image
+        from doctra.exporters.markdown_writer import write_markdown
+        from doctra.exporters.html_writer import write_html
+        from doctra.exporters.excel_writer import write_structured_excel
+        from doctra.exporters.html_writer import write_structured_html
+
+        def count_label(label: str) -> int:
+            return sum(1 for p in pages for b in p.boxes if b.label == label)
+
+        fig_count = count_label("figure")
+        chart_count = count_label("chart")
+        table_count = count_label("table")
+        md_lines: List[str] = ["# Enhanced Document Content\n"]
+        structured_items: List[Dict[str, Any]] = []
+        charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
+        tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
+        figures_desc = "Figures (cropped)"
+        use_vlm = bool(self.use_vlm and self.vlm)
+        with ExitStack() as stack:
+            def open_bar(total: int, desc: str):
+                # No bar at all when there is nothing to count.
+                return stack.enter_context(self._new_progress_bar(total, desc)) if total else None
+
+            charts_bar = open_bar(chart_count, charts_desc)
+            tables_bar = open_bar(table_count, tables_desc)
+            figures_bar = open_bar(fig_count, figures_desc)
+            for p in pages:
+                page_num = p.page_index
+                page_img: Image.Image = pil_pages[page_num - 1]
+                md_lines.append(f"\n## Page {page_num}\n")
+                for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
+                    if box.label not in EXCLUDE_LABELS:
+                        text = ocr_box_text(self.ocr_engine, page_img, box)
+                        if text:
+                            md_lines.append(text)
+                            md_lines.append(self.box_separator or "")
+                        continue
+                    img_path = save_box_image(page_img, box, out_dir, page_num, i, IMAGE_SUBDIRS)
+                    abs_img_path = os.path.abspath(img_path)
+                    rel = os.path.relpath(abs_img_path, out_dir)
+                    if box.label == "figure":
+                        md_lines.append(f"![Figure — page {page_num}]({rel})\n")
+                        if figures_bar:
+                            figures_bar.update(1)
+                    elif box.label == "chart":
+                        table_md = None
+                        if use_vlm:
+                            table_md = self._vlm_table_markdown(
+                                "chart", abs_img_path, page_num, structured_items)
+                        if table_md is None:
+                            table_md = f"![Chart — page {page_num}]({rel})\n"
+                        md_lines.append(table_md)
+                        if charts_bar:
+                            charts_bar.update(1)
+                    elif box.label == "table":
+                        table_md = None
+                        if use_vlm:
+                            table_md = self._vlm_table_markdown(
+                                "table", abs_img_path, page_num, structured_items)
+                        if table_md is None:
+                            table_md = f"![Table — page {page_num}]({rel})\n"
+                        md_lines.append(table_md)
+                        if tables_bar:
+                            tables_bar.update(1)
+        write_markdown(md_lines, out_dir)
+        write_html(md_lines, out_dir)
+        if self.use_vlm and structured_items:
+            write_structured_excel(os.path.join(out_dir, "tables.xlsx"), structured_items)
+            write_structured_html(os.path.join(out_dir, "tables.html"), structured_items)
+        print("✅ Enhanced parsing completed successfully!")
+        print(f"📁 Output directory: {out_dir}")
+
+    def restore_pdf_only(self, pdf_path: str, output_path: Optional[str] = None,
+                         task: Optional[str] = None) -> Optional[str]:
+        """
+        Apply DocRes restoration to a PDF without parsing.
+
+        :param pdf_path: Path to the input PDF file
+        :param output_path: Path for the enhanced PDF (if None, auto-generates)
+        :param task: DocRes restoration task (if None, uses instance default)
+        :return: Path to the enhanced PDF or None if failed
+        :raises RuntimeError: If restoration is disabled or the DocRes engine is unavailable
+        """
+        if not self.use_image_restoration or not self.docres_engine:
+            raise RuntimeError("Image restoration is not enabled or DocRes engine is not available")
+        task = task or self.restoration_task
+        return self.docres_engine.restore_pdf(pdf_path, output_path, task, self.restoration_dpi)
+
+    def get_restoration_info(self) -> Dict[str, Any]:
+        """
+        Get information about the current restoration configuration.
+
+        :return: Dictionary with restoration settings and status
+        """
+        engine = self.docres_engine
+        return {
+            'enabled': self.use_image_restoration,
+            'task': self.restoration_task,
+            'device': self.restoration_device,
+            'dpi': self.restoration_dpi,
+            'engine_available': engine is not None,
+            'supported_tasks': engine.get_supported_tasks() if engine else []
+        }

doctra/parsers/structured_pdf_parser.py CHANGED Viewed

@@ -64,22 +64,19 @@ class StructuredPDFParser:
     ):
         """
         Initialize the StructuredPDFParser with processing configuration.
-        Sets up the layout detection engine, OCR engine, and optionally
-        the VLM service for comprehensive document processing.
-        :param use_vlm: Whether to use VLM for structured data extraction
-        :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
+        :param use_vlm: Whether to use VLM for structured data extraction (default: False)
+        :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
         :param vlm_model: Model name to use (defaults to provider-specific defaults)
-        :param vlm_api_key: API key for VLM provider
-        :param layout_model_name: Layout detection model name
-        :param dpi: DPI for PDF rendering
-        :param min_score: Minimum confidence score for layout detection
-        :param ocr_lang: OCR language code
-        :param ocr_psm: Tesseract page segmentation mode
-        :param ocr_oem: Tesseract OCR engine mode
-        :param ocr_extra_config: Additional Tesseract configuration
-        :param box_separator: Separator between text boxes in output
+        :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
+        :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
+        :param dpi: DPI for PDF rendering (default: 200)
+        :param min_score: Minimum confidence score for layout detection (default: 0.0)
+        :param ocr_lang: OCR language code (default: "eng")
+        :param ocr_psm: Tesseract page segmentation mode (default: 4)
+        :param ocr_oem: Tesseract OCR engine mode (default: 3)
+        :param ocr_extra_config: Additional Tesseract configuration (default: "")
+        :param box_separator: Separator between text boxes in output (default: "\n")
         """
         self.layout_engine = PaddleLayoutEngine(model_name=layout_model_name)
         self.dpi = dpi
@@ -100,15 +97,10 @@ class StructuredPDFParser:
     def parse(self, pdf_path: str) -> None:
         """
         Parse a PDF document and extract all content types.
-        Processes the PDF through layout detection, extracts text using OCR,
-        saves images for visual elements, and optionally converts charts/tables
-        to structured data using VLM.
         :param pdf_path: Path to the input PDF file
         :return: None
         """
-        # Extract filename without extension and create output directory
         pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
         out_dir = f"outputs/{pdf_filename}/full_parse"
@@ -120,7 +112,6 @@ class StructuredPDFParser:
         )
         pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
-        # Count for progress bars
         fig_count = sum(sum(1 for b in p.boxes if b.label == "figure") for p in pages)
         chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages)
         table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
@@ -133,11 +124,8 @@ class StructuredPDFParser:
         figures_desc = "Figures (cropped)"
         with ExitStack() as stack:
-            # Enhanced environment detection
             is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
             is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
-            # Use appropriate progress bars based on environment
             if is_notebook:
                 charts_bar = stack.enter_context(
                     create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
@@ -165,13 +153,11 @@ class StructuredPDFParser:
                         rel = os.path.relpath(abs_img_path, out_dir)
                         if box.label == "figure":
-                            # Figures are always images in MD
                             md_lines.append(f"![Figure — page {page_num}]({rel})\n")
                             if figures_bar: figures_bar.update(1)
                         elif box.label == "chart":
                             if self.use_vlm and self.vlm:
-                                # Try structured → Markdown table; fallback to image if it fails
                                 wrote_table = False
                                 try:
                                     chart = self.vlm.extract_chart(abs_img_path)
@@ -193,7 +179,6 @@ class StructuredPDFParser:
                         elif box.label == "table":
                             if self.use_vlm and self.vlm:
-                                # Try structured → Markdown table; fallback to image if it fails
                                 wrote_table = False
                                 try:
                                     table = self.vlm.extract_table(abs_img_path)
@@ -229,7 +214,6 @@ class StructuredPDFParser:
             html_structured_path = os.path.join(out_dir, "tables.html")
             write_structured_html(html_structured_path, structured_items)
-        # Print completion message with output directory
         print(f"✅ Parsing completed successfully!")
         print(f"📁 Output directory: {out_dir}")
@@ -249,30 +233,25 @@ class StructuredPDFParser:
         :param save_path: Optional path to save the visualization (if None, displays only)
         :return: None
         """
-        # Get layout predictions
         pages: List[LayoutPage] = self.layout_engine.predict_pdf(
             pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
         )
         pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
-        # Limit to requested number of pages
         pages_to_show = min(num_pages, len(pages))
         if pages_to_show == 0:
             print("No pages to display")
             return
-        # Calculate grid dimensions
         rows = (pages_to_show + cols - 1) // cols
-        # Collect unique labels from the processed pages and assign colors
         used_labels = set()
         for idx in range(pages_to_show):
             page = pages[idx]
             for box in page.boxes:
                 used_labels.add(box.label.lower())
-        # Create dynamic color assignment for all detected labels
         base_colors = ['#3B82F6', '#EF4444', '#10B981', '#F59E0B', '#8B5CF6',
                        '#F97316', '#EC4899', '#6B7280', '#84CC16', '#06B6D4',
                        '#DC2626', '#059669', '#7C3AED', '#DB2777', '#0891B2']
@@ -281,22 +260,18 @@ class StructuredPDFParser:
         for i, label in enumerate(sorted(used_labels)):
             dynamic_label_colors[label] = base_colors[i % len(base_colors)]
-        # Process each page and add bounding boxes
         processed_pages = []
         for idx in range(pages_to_show):
             page = pages[idx]
             page_img = pil_pages[idx].copy()
-            # Calculate scale factor to resize to target width
             scale_factor = page_width / page_img.width
             new_height = int(page_img.height * scale_factor)
             page_img = page_img.resize((page_width, new_height), Image.LANCZOS)
-            # Create drawing context
             draw = ImageDraw.Draw(page_img)
-            # Try to load a nice font, fallback to default
             try:
                 font = ImageFont.truetype("arial.ttf", 24)
                 small_font = ImageFont.truetype("arial.ttf", 18)
@@ -308,21 +283,16 @@ class StructuredPDFParser:
                     font = None
                     small_font = None
-            # Draw bounding boxes
             for box in page.boxes:
-                # Scale coordinates
                 x1 = int(box.x1 * scale_factor)
                 y1 = int(box.y1 * scale_factor)
                 x2 = int(box.x2 * scale_factor)
                 y2 = int(box.y2 * scale_factor)
-                # Get color for this label from dynamic assignment
                 color = dynamic_label_colors.get(box.label.lower(), '#000000')
-                # Draw rectangle with rounded corners effect
                 draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
-                # Draw label background
                 label_text = f"{box.label} ({box.score:.2f})"
                 if font:
                     bbox = draw.textbbox((0, 0), label_text, font=small_font)
@@ -332,11 +302,9 @@ class StructuredPDFParser:
                     text_width = len(label_text) * 8
                     text_height = 15
-                # Position label above the box
                 label_x = x1
                 label_y = max(0, y1 - text_height - 8)
-                # Draw label background with padding
                 padding = 4
                 draw.rectangle([
                     label_x - padding,
@@ -345,10 +313,8 @@ class StructuredPDFParser:
                     label_y + text_height + padding
                 ], fill='white', outline=color, width=2)
-                # Draw label text
                 draw.text((label_x, label_y), label_text, fill=color, font=small_font)
-            # Add page title
             title_text = f"Page {page.page_index} ({len(page.boxes)} boxes)"
             if font:
                 title_bbox = draw.textbbox((0, 0), title_text, font=font)
@@ -356,7 +322,6 @@ class StructuredPDFParser:
             else:
                 title_width = len(title_text) * 12
-            # Draw title background
             title_x = (page_width - title_width) // 2
             title_y = 10
             draw.rectangle([title_x - 10, title_y - 5, title_x + title_width + 10, title_y + 35],
@@ -365,16 +330,13 @@ class StructuredPDFParser:
             processed_pages.append(page_img)
-        # Create grid layout with space for legend
         legend_width = 250
         grid_width = cols * page_width + (cols - 1) * spacing
         total_width = grid_width + legend_width + spacing
         grid_height = rows * (processed_pages[0].height if processed_pages else 600) + (rows - 1) * spacing
-        # Create final grid image with modern background
         final_img = Image.new('RGB', (total_width, grid_height), '#F8FAFC')
-        # Place pages in grid
         for idx, page_img in enumerate(processed_pages):
             row = idx // cols
             col = idx % cols
@@ -384,13 +346,11 @@ class StructuredPDFParser:
             final_img.paste(page_img, (x_pos, y_pos))
-        # Create legend
         legend_x = grid_width + spacing
         legend_y = 20
         draw_legend = ImageDraw.Draw(final_img)
-        # Legend title
         legend_title = "Element Types"
         if font:
             title_bbox = draw_legend.textbbox((0, 0), legend_title, font=font)
@@ -400,47 +360,38 @@ class StructuredPDFParser:
             title_width = len(legend_title) * 12
             title_height = 20
-        # Draw legend background
         legend_bg_height = len(used_labels) * 35 + title_height + 40
         draw_legend.rectangle([legend_x - 10, legend_y - 10,
                                legend_x + legend_width - 10, legend_y + legend_bg_height],
                               fill='white', outline='#E5E7EB', width=2)
-        # Draw legend title
         draw_legend.text((legend_x + 10, legend_y + 5), legend_title,
                          fill='#1F2937', font=font)
-        # Draw legend items - now using dynamic colors for actually detected labels
         current_y = legend_y + title_height + 20
         for label in sorted(used_labels):
             color = dynamic_label_colors[label]
-            # Draw color square
             square_size = 20
             draw_legend.rectangle([legend_x + 10, current_y,
                                    legend_x + 10 + square_size, current_y + square_size],
                                   fill=color, outline='#6B7280', width=1)
-            # Draw label text
             draw_legend.text((legend_x + 40, current_y + 2), label.title(),
                              fill='#374151', font=small_font)
             current_y += 30
-        # Save or display
         if save_path:
             final_img.save(save_path, quality=95, optimize=True)
             print(f"Layout visualization saved to: {save_path}")
         else:
-            # Display using PIL's default viewer
             final_img.show()
-        # Print summary statistics
         print(f"\n📊 Layout Detection Summary for {os.path.basename(pdf_path)}:")
         print(f"Pages processed: {pages_to_show}")
-        # Create summary by label across all pages
         total_counts = {}
         for idx in range(pages_to_show):
             page = pages[idx]

doctra 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

doctra 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl