PyPI - doctra - Versions diffs - 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl - Mend

doctra 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

doctra/cli/main.py +5 -12
doctra/cli/utils.py +2 -3
doctra/engines/image_restoration/docres_engine.py +6 -11
doctra/engines/vlm/outlines_types.py +13 -9
doctra/engines/vlm/service.py +4 -2
doctra/exporters/excel_writer.py +89 -0
doctra/exporters/html_writer.py +206 -1
doctra/parsers/enhanced_pdf_parser.py +124 -31
doctra/parsers/structured_pdf_parser.py +58 -15
doctra/parsers/table_chart_extractor.py +290 -284
doctra/ui/app.py +39 -960
doctra/ui/docres_ui.py +338 -0
doctra/ui/docres_wrapper.py +120 -0
doctra/ui/enhanced_parser_ui.py +483 -0
doctra/ui/full_parse_ui.py +539 -0
doctra/ui/tables_charts_ui.py +445 -0
doctra/ui/ui_helpers.py +435 -0
doctra/utils/progress.py +7 -7
doctra/utils/structured_utils.py +5 -2
doctra/version.py +1 -1
{doctra-0.4.0.dist-info → doctra-0.4.2.dist-info}/METADATA +1 -1
{doctra-0.4.0.dist-info → doctra-0.4.2.dist-info}/RECORD +25 -19
{doctra-0.4.0.dist-info → doctra-0.4.2.dist-info}/WHEEL +0 -0
{doctra-0.4.0.dist-info → doctra-0.4.2.dist-info}/licenses/LICENSE +0 -0
{doctra-0.4.0.dist-info → doctra-0.4.2.dist-info}/top_level.txt +0 -0

doctra/parsers/enhanced_pdf_parser.py CHANGED Viewed

@@ -8,6 +8,7 @@ capabilities with DocRes image restoration for improved document processing.
 from __future__ import annotations
 import os
 import sys
+import numpy as np
 from typing import List, Dict, Any, Optional, Union
 from contextlib import ExitStack
 from PIL import Image
@@ -16,9 +17,17 @@ from tqdm import tqdm
 from doctra.parsers.structured_pdf_parser import StructuredPDFParser
 from doctra.engines.image_restoration import DocResEngine
 from doctra.utils.pdf_io import render_pdf_to_images
-from doctra.utils.constants import IMAGE_SUBDIRS
+from doctra.utils.constants import IMAGE_SUBDIRS, EXCLUDE_LABELS
 from doctra.utils.file_ops import ensure_output_dirs
 from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
+from doctra.parsers.layout_order import reading_order_key
+from doctra.utils.ocr_utils import ocr_box_text
+from doctra.exporters.image_saver import save_box_image
+from doctra.exporters.markdown_writer import write_markdown
+from doctra.exporters.html_writer import write_html, write_structured_html, render_html_table, write_html_from_lines
+from doctra.exporters.excel_writer import write_structured_excel
+from doctra.utils.structured_utils import to_structured_dict
+from doctra.exporters.markdown_table import render_markdown_table
 class EnhancedPDFParser(StructuredPDFParser):
@@ -132,6 +141,13 @@ class EnhancedPDFParser(StructuredPDFParser):
         if self.use_image_restoration and self.docres_engine:
             print(f"🔄 Processing PDF with image restoration: {os.path.basename(pdf_path)}")
             enhanced_pages = self._process_pages_with_restoration(pdf_path, out_dir)
+            # Create enhanced PDF file using the already processed enhanced pages
+            enhanced_pdf_path = os.path.join(out_dir, f"{pdf_filename}_enhanced.pdf")
+            try:
+                self._create_enhanced_pdf_from_pages(enhanced_pages, enhanced_pdf_path)
+            except Exception as e:
+                print(f"⚠️ Failed to create enhanced PDF: {e}")
         else:
             print(f"🔄 Processing PDF without image restoration: {os.path.basename(pdf_path)}")
             enhanced_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
@@ -146,7 +162,7 @@ class EnhancedPDFParser(StructuredPDFParser):
         pil_pages = enhanced_pages
         # Continue with standard parsing logic
-        self._process_parsing_logic(pages, pil_pages, out_dir, pdf_filename)
+        self._process_parsing_logic(pages, pil_pages, out_dir, pdf_filename, pdf_path)
     def _process_pages_with_restoration(self, pdf_path: str, out_dir: str) -> List[Image.Image]:
         """
@@ -168,12 +184,12 @@ class EnhancedPDFParser(StructuredPDFParser):
         if is_notebook:
             progress_bar = create_notebook_friendly_bar(
                 total=len(original_pages),
-                desc=f"🔄 DocRes {self.restoration_task}"
+                desc=f"DocRes {self.restoration_task}"
             )
         else:
             progress_bar = create_beautiful_progress_bar(
                 total=len(original_pages),
-                desc=f"🔄 DocRes {self.restoration_task}",
+                desc=f"DocRes {self.restoration_task}",
                 leave=True
             )
@@ -186,7 +202,6 @@ class EnhancedPDFParser(StructuredPDFParser):
                 for i, page_img in enumerate(original_pages):
                     try:
                         # Convert PIL to numpy array
-                        import numpy as np
                         img_array = np.array(page_img)
                         # Apply DocRes restoration
@@ -216,31 +231,22 @@ class EnhancedPDFParser(StructuredPDFParser):
             if hasattr(progress_bar, 'close'):
                 progress_bar.close()
-        print(f"✅ Image restoration completed. Enhanced pages saved to: {enhanced_dir}")
         return enhanced_pages
-    def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename):
+    def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename, pdf_path):
         """
         Process the parsing logic with enhanced pages.
         This is extracted from the parent class to allow customization.
         """
-        from doctra.utils.constants import EXCLUDE_LABELS
-        from doctra.parsers.layout_order import reading_order_key
-        from doctra.utils.ocr_utils import ocr_box_text
-        from doctra.exporters.image_saver import save_box_image
-        from doctra.exporters.markdown_writer import write_markdown
-        from doctra.exporters.html_writer import write_html
-        from doctra.exporters.excel_writer import write_structured_excel
-        from doctra.exporters.html_writer import write_structured_html
-        from doctra.utils.structured_utils import to_structured_dict
-        from doctra.exporters.markdown_table import render_markdown_table
         fig_count = sum(sum(1 for b in p.boxes if b.label == "figure") for p in pages)
         chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages)
         table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
         md_lines: List[str] = ["# Enhanced Document Content\n"]
+        html_lines: List[str] = ["<h1>Enhanced Document Content</h1>"]  # For direct HTML generation
         structured_items: List[Dict[str, Any]] = []
+        page_content: Dict[int, List[str]] = {}  # Store content by page
         charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
         tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
@@ -263,10 +269,15 @@ class EnhancedPDFParser(StructuredPDFParser):
                 figures_bar = stack.enter_context(
                     create_beautiful_progress_bar(total=fig_count, desc=figures_desc, leave=True)) if fig_count else None
+            # Initialize page content for all pages first
+            for page_num in range(1, len(pil_pages) + 1):
+                page_content[page_num] = [f"# Page {page_num} Content\n"]
             for p in pages:
                 page_num = p.page_index
                 page_img: Image.Image = pil_pages[page_num - 1]
                 md_lines.append(f"\n## Page {page_num}\n")
+                html_lines.append(f"<h2>Page {page_num}</h2>")
                 for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
                     if box.label in EXCLUDE_LABELS:
@@ -275,7 +286,11 @@ class EnhancedPDFParser(StructuredPDFParser):
                         rel = os.path.relpath(abs_img_path, out_dir)
                         if box.label == "figure":
-                            md_lines.append(f"![Figure — page {page_num}]({rel})\n")
+                            figure_md = f"![Figure — page {page_num}]({rel})\n"
+                            figure_html = f'<img src="{rel}" alt="Figure — page {page_num}" />'
+                            md_lines.append(figure_md)
+                            html_lines.append(figure_html)
+                            page_content[page_num].append(figure_md)
                             if figures_bar: figures_bar.update(1)
                         elif box.label == "chart":
@@ -285,18 +300,35 @@ class EnhancedPDFParser(StructuredPDFParser):
                                     chart = self.vlm.extract_chart(abs_img_path)
                                     item = to_structured_dict(chart)
                                     if item:
+                                        # Add page and type information to structured item
+                                        item["page"] = page_num
+                                        item["type"] = "Chart"
                                         structured_items.append(item)
-                                        md_lines.append(
-                                            render_markdown_table(item.get("headers"), item.get("rows"),
-                                                                  title=item.get("title"))
-                                        )
+                                        # Generate both markdown and HTML tables
+                                        table_md = render_markdown_table(item.get("headers"), item.get("rows"),
+                                                                         title=item.get("title"))
+                                        table_html = render_html_table(item.get("headers"), item.get("rows"),
+                                                                       title=item.get("title"))
+                                        md_lines.append(table_md)
+                                        html_lines.append(table_html)
+                                        page_content[page_num].append(table_md)
                                         wrote_table = True
                                 except Exception as e:
                                     pass
                                 if not wrote_table:
-                                    md_lines.append(f"![Chart — page {page_num}]({rel})\n")
+                                    chart_md = f"![Chart — page {page_num}]({rel})\n"
+                                    chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
+                                    md_lines.append(chart_md)
+                                    html_lines.append(chart_html)
+                                    page_content[page_num].append(chart_md)
                             else:
-                                md_lines.append(f"![Chart — page {page_num}]({rel})\n")
+                                chart_md = f"![Chart — page {page_num}]({rel})\n"
+                                chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
+                                md_lines.append(chart_md)
+                                html_lines.append(chart_html)
+                                page_content[page_num].append(chart_md)
                             if charts_bar: charts_bar.update(1)
                         elif box.label == "table":
@@ -306,27 +338,64 @@ class EnhancedPDFParser(StructuredPDFParser):
                                     table = self.vlm.extract_table(abs_img_path)
                                     item = to_structured_dict(table)
                                     if item:
+                                        # Add page and type information to structured item
+                                        item["page"] = page_num
+                                        item["type"] = "Table"
                                         structured_items.append(item)
-                                        md_lines.append(
-                                            render_markdown_table(item.get("headers"), item.get("rows"),
-                                                                  title=item.get("title"))
-                                        )
+                                        # Generate both markdown and HTML tables
+                                        table_md = render_markdown_table(item.get("headers"), item.get("rows"),
+                                                                         title=item.get("title"))
+                                        table_html = render_html_table(item.get("headers"), item.get("rows"),
+                                                                       title=item.get("title"))
+                                        md_lines.append(table_md)
+                                        html_lines.append(table_html)
+                                        page_content[page_num].append(table_md)
                                         wrote_table = True
                                 except Exception as e:
                                     pass
                                 if not wrote_table:
-                                    md_lines.append(f"![Table — page {page_num}]({rel})\n")
+                                    table_md = f"![Table — page {page_num}]({rel})\n"
+                                    table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
+                                    md_lines.append(table_md)
+                                    html_lines.append(table_html)
+                                    page_content[page_num].append(table_md)
                             else:
-                                md_lines.append(f"![Table — page {page_num}]({rel})\n")
+                                table_md = f"![Table — page {page_num}]({rel})\n"
+                                table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
+                                md_lines.append(table_md)
+                                html_lines.append(table_html)
+                                page_content[page_num].append(table_md)
                             if tables_bar: tables_bar.update(1)
                     else:
                         text = ocr_box_text(self.ocr_engine, page_img, box)
                         if text:
                             md_lines.append(text)
                             md_lines.append(self.box_separator if self.box_separator else "")
+                            # Convert text to HTML (basic conversion)
+                            html_text = text.replace('\n', '<br>')
+                            html_lines.append(f"<p>{html_text}</p>")
+                            if self.box_separator:
+                                html_lines.append("<br>")
+                            page_content[page_num].append(text)
+                            page_content[page_num].append(self.box_separator if self.box_separator else "")
         md_path = write_markdown(md_lines, out_dir)
-        html_path = write_html(md_lines, out_dir)
+        # Use HTML lines if VLM is enabled for better table formatting
+        if self.use_vlm and html_lines:
+            html_path = write_html_from_lines(html_lines, out_dir)
+        else:
+            html_path = write_html(md_lines, out_dir)
+        # Create pages folder and save individual page markdown files
+        pages_dir = os.path.join(out_dir, "pages")
+        os.makedirs(pages_dir, exist_ok=True)
+        for page_num, content_lines in page_content.items():
+            page_md_path = os.path.join(pages_dir, f"page_{page_num:03d}.md")
+            write_markdown(content_lines, os.path.dirname(page_md_path), os.path.basename(page_md_path))
         excel_path = None
         html_structured_path = None
@@ -339,6 +408,30 @@ class EnhancedPDFParser(StructuredPDFParser):
         print(f"✅ Enhanced parsing completed successfully!")
         print(f"📁 Output directory: {out_dir}")
+    def _create_enhanced_pdf_from_pages(self, enhanced_pages: List[Image.Image], output_path: str) -> None:
+        """
+        Create an enhanced PDF from already processed enhanced pages.
+        :param enhanced_pages: List of enhanced PIL images
+        :param output_path: Path for the enhanced PDF
+        """
+        if not enhanced_pages:
+            raise ValueError("No enhanced pages provided")
+        try:
+            # Create enhanced PDF from the processed pages
+            enhanced_pages[0].save(
+                output_path,
+                "PDF",
+                resolution=100.0,
+                save_all=True,
+                append_images=enhanced_pages[1:] if len(enhanced_pages) > 1 else []
+            )
+            print(f"✅ Enhanced PDF saved from processed pages: {output_path}")
+        except Exception as e:
+            print(f"❌ Error creating enhanced PDF from pages: {e}")
+            raise
     def restore_pdf_only(self, pdf_path: str, output_path: str = None, task: str = None) -> str:
         """
         Apply DocRes restoration to a PDF without parsing.

doctra/parsers/structured_pdf_parser.py CHANGED Viewed

@@ -20,7 +20,7 @@ from doctra.exporters.excel_writer import write_structured_excel
 from doctra.utils.structured_utils import to_structured_dict
 from doctra.exporters.markdown_table import render_markdown_table
 from doctra.exporters.markdown_writer import write_markdown
-from doctra.exporters.html_writer import write_html, write_structured_html
+from doctra.exporters.html_writer import write_html, write_structured_html, render_html_table, write_html_from_lines
 from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
@@ -117,6 +117,7 @@ class StructuredPDFParser:
         table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
         md_lines: List[str] = ["# Extracted Content\n"]
+        html_lines: List[str] = ["<h1>Extracted Content</h1>"]  # For direct HTML generation
         structured_items: List[Dict[str, Any]] = []
         charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
@@ -145,6 +146,7 @@ class StructuredPDFParser:
                 page_num = p.page_index
                 page_img: Image.Image = pil_pages[page_num - 1]
                 md_lines.append(f"\n## Page {page_num}\n")
+                html_lines.append(f"<h2>Page {page_num}</h2>")
                 for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
                     if box.label in EXCLUDE_LABELS:
@@ -153,7 +155,10 @@ class StructuredPDFParser:
                         rel = os.path.relpath(abs_img_path, out_dir)
                         if box.label == "figure":
-                            md_lines.append(f"![Figure — page {page_num}]({rel})\n")
+                            figure_md = f"![Figure — page {page_num}]({rel})\n"
+                            figure_html = f'<img src="{rel}" alt="Figure — page {page_num}" />'
+                            md_lines.append(figure_md)
+                            html_lines.append(figure_html)
                             if figures_bar: figures_bar.update(1)
                         elif box.label == "chart":
@@ -163,18 +168,32 @@ class StructuredPDFParser:
                                     chart = self.vlm.extract_chart(abs_img_path)
                                     item = to_structured_dict(chart)
                                     if item:
+                                        # Add page and type information to structured item
+                                        item["page"] = page_num
+                                        item["type"] = "Chart"
                                         structured_items.append(item)
-                                        md_lines.append(
-                                            render_markdown_table(item.get("headers"), item.get("rows"),
-                                                                  title=item.get("title"))
-                                        )
+                                        # Generate both markdown and HTML tables
+                                        table_md = render_markdown_table(item.get("headers"), item.get("rows"),
+                                                                         title=item.get("title"))
+                                        table_html = render_html_table(item.get("headers"), item.get("rows"),
+                                                                       title=item.get("title"))
+                                        md_lines.append(table_md)
+                                        html_lines.append(table_html)
                                         wrote_table = True
                                 except Exception as e:
                                     pass
                                 if not wrote_table:
-                                    md_lines.append(f"![Chart — page {page_num}]({rel})\n")
+                                    chart_md = f"![Chart — page {page_num}]({rel})\n"
+                                    chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
+                                    md_lines.append(chart_md)
+                                    html_lines.append(chart_html)
                             else:
-                                md_lines.append(f"![Chart — page {page_num}]({rel})\n")
+                                chart_md = f"![Chart — page {page_num}]({rel})\n"
+                                chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
+                                md_lines.append(chart_md)
+                                html_lines.append(chart_html)
                             if charts_bar: charts_bar.update(1)
                         elif box.label == "table":
@@ -184,27 +203,51 @@ class StructuredPDFParser:
                                     table = self.vlm.extract_table(abs_img_path)
                                     item = to_structured_dict(table)
                                     if item:
+                                        # Add page and type information to structured item
+                                        item["page"] = page_num
+                                        item["type"] = "Table"
                                         structured_items.append(item)
-                                        md_lines.append(
-                                            render_markdown_table(item.get("headers"), item.get("rows"),
-                                                                  title=item.get("title"))
-                                        )
+                                        # Generate both markdown and HTML tables
+                                        table_md = render_markdown_table(item.get("headers"), item.get("rows"),
+                                                                         title=item.get("title"))
+                                        table_html = render_html_table(item.get("headers"), item.get("rows"),
+                                                                       title=item.get("title"))
+                                        md_lines.append(table_md)
+                                        html_lines.append(table_html)
                                         wrote_table = True
                                 except Exception as e:
                                     pass
                                 if not wrote_table:
-                                    md_lines.append(f"![Table — page {page_num}]({rel})\n")
+                                    table_md = f"![Table — page {page_num}]({rel})\n"
+                                    table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
+                                    md_lines.append(table_md)
+                                    html_lines.append(table_html)
                             else:
-                                md_lines.append(f"![Table — page {page_num}]({rel})\n")
+                                table_md = f"![Table — page {page_num}]({rel})\n"
+                                table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
+                                md_lines.append(table_md)
+                                html_lines.append(table_html)
                             if tables_bar: tables_bar.update(1)
                     else:
                         text = ocr_box_text(self.ocr_engine, page_img, box)
                         if text:
                             md_lines.append(text)
                             md_lines.append(self.box_separator if self.box_separator else "")
+                            # Convert text to HTML (basic conversion)
+                            html_text = text.replace('\n', '<br>')
+                            html_lines.append(f"<p>{html_text}</p>")
+                            if self.box_separator:
+                                html_lines.append("<br>")
         md_path = write_markdown(md_lines, out_dir)
-        html_path = write_html(md_lines, out_dir)
+        # Use HTML lines if VLM is enabled for better table formatting
+        if self.use_vlm and html_lines:
+            html_path = write_html_from_lines(html_lines, out_dir)
+        else:
+            html_path = write_html(md_lines, out_dir)
         excel_path = None
         html_structured_path = None

doctra 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

doctra 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl