PyPI - doctra - Versions diffs - 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl - Mend

doctra 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

doctra/engines/image_restoration/docres_engine.py +4 -4
doctra/exporters/html_writer.py +206 -1
doctra/parsers/enhanced_pdf_parser.py +107 -18
doctra/parsers/structured_pdf_parser.py +52 -15
doctra/parsers/table_chart_extractor.py +290 -290
doctra/ui/app.py +39 -954
doctra/ui/docres_ui.py +338 -0
doctra/ui/docres_wrapper.py +120 -0
doctra/ui/enhanced_parser_ui.py +483 -0
doctra/ui/full_parse_ui.py +539 -0
doctra/ui/tables_charts_ui.py +445 -0
doctra/ui/ui_helpers.py +435 -0
doctra/utils/progress.py +7 -7
doctra/version.py +1 -1
{doctra-0.4.1.dist-info → doctra-0.4.3.dist-info}/METADATA +331 -74
{doctra-0.4.1.dist-info → doctra-0.4.3.dist-info}/RECORD +20 -13
doctra-0.4.3.dist-info/entry_points.txt +2 -0
{doctra-0.4.1.dist-info → doctra-0.4.3.dist-info}/WHEEL +0 -0
{doctra-0.4.1.dist-info → doctra-0.4.3.dist-info}/licenses/LICENSE +0 -0
{doctra-0.4.1.dist-info → doctra-0.4.3.dist-info}/top_level.txt +0 -0

doctra/engines/image_restoration/docres_engine.py CHANGED Viewed

@@ -87,12 +87,12 @@ def load_docres_weights_from_hf():
         if is_notebook:
             progress_bar = create_notebook_friendly_bar(
                 total=2,
-                desc="🔄 Downloading DocRes models from Hugging Face Hub"
+                desc="Downloading DocRes models from Hugging Face Hub"
             )
         else:
             progress_bar = create_beautiful_progress_bar(
                 total=2,
-                desc="🔄 Downloading DocRes models from Hugging Face Hub",
+                desc="Downloading DocRes models from Hugging Face Hub",
                 leave=True
             )
@@ -505,12 +505,12 @@ class DocResEngine:
             if is_notebook:
                 progress_bar = create_notebook_friendly_bar(
                     total=len(pil_pages),
-                    desc="🔄 Processing pages"
+                    desc="Processing pages"
                 )
             else:
                 progress_bar = create_beautiful_progress_bar(
                     total=len(pil_pages),
-                    desc="🔄 Processing pages",
+                    desc="Processing pages",
                     leave=True
                 )

doctra/exporters/html_writer.py CHANGED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 import os
 import re
 import base64
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Optional
 from markdown_it import MarkdownIt
@@ -64,6 +64,114 @@ def _process_image_paths(md_content: str, out_dir: str) -> str:
     return processed_content
+def write_html_from_lines(html_lines: List[str], out_dir: str, filename: str = "result.html") -> str:
+    """
+    Join pre-rendered HTML fragments into one styled HTML document and save it.
+    This function is used when VLM is enabled to ensure proper HTML table formatting
+    instead of markdown-to-HTML conversion.
+    The fragments are joined with newlines, runs of three or more newlines are
+    collapsed to a single blank line, and the result is passed through
+    ``_process_image_paths`` and ``_add_table_styling`` before being embedded in
+    the page template. The fragments are not escaped by this function, so callers
+    must supply markup that is already safe to embed.
+    :param html_lines: List of HTML strings to join into a single file
+    :param out_dir: Directory where the HTML file will be saved (created if missing)
+    :param filename: Name of the HTML file (default: "result.html")
+    :return: The absolute path of the written HTML file
+    """
+    os.makedirs(out_dir, exist_ok=True)
+    # Join HTML lines and clean up excessive blank lines
+    html_content = "\n".join(html_lines).strip() + "\n"
+    html_content = re.sub(r"\n{3,}", "\n\n", html_content)
+    # Process image paths to convert relative paths to absolute paths or base64
+    # NOTE(review): exact rewriting rules live in _process_image_paths, not shown here
+    html_content = _process_image_paths(html_content, out_dir)
+    # Always apply table styling to ensure all tables are properly formatted
+    html_content = _add_table_styling(html_content)
+    # Create complete HTML document with modern styling
+    # The template is an f-string, so literal JavaScript braces are doubled ({{ }});
+    # only _get_css_styles() and html_content are interpolated.
+    html_document = f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Document Analysis Results</title>
+    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&display=swap" rel="stylesheet">
+    <style>
+        {_get_css_styles()}
+    </style>
+</head>
+<body>
+    <button class="theme-toggle" onclick="toggleTheme()" title="Toggle dark mode"></button>
+    <div class="container">
+        <header class="header">
+            <div class="header-content">
+                <div class="header-text">
+                    <h1>Document Analysis Results</h1>
+                    <p class="subtitle">Intelligent Document Processing & Analysis</p>
+                </div>
+                <div class="header-badge">
+                    Generated by Doctra
+                </div>
+            </div>
+        </header>
+        <main class="content">
+            {html_content}
+        </main>
+        <footer class="footer">
+            <div class="footer-content">
+                <div class="footer-brand">Doctra</div>
+                <div class="footer-info">
+                    <span>Intelligent Document Processing</span>
+                    <a href="https://github.com/AdemBoukhris457/Doctra" target="_blank">GitHub</a>
+                </div>
+            </div>
+        </footer>
+    </div>
+    <script>
+        // Theme toggle functionality
+        function toggleTheme() {{
+            const body = document.body;
+            const currentTheme = body.getAttribute('data-theme');
+            const newTheme = currentTheme === 'dark' ? 'light' : 'dark';
+            body.setAttribute('data-theme', newTheme);
+            localStorage.setItem('doctra-theme', newTheme);
+            // Add smooth transition
+            body.style.transition = 'all 0.3s ease';
+            setTimeout(() => {{
+                body.style.transition = '';
+            }}, 300);
+        }}
+        // Load saved theme on page load
+        document.addEventListener('DOMContentLoaded', function() {{
+            const savedTheme = localStorage.getItem('doctra-theme') || 'light';
+            document.body.setAttribute('data-theme', savedTheme);
+        }});
+        // Add smooth scroll behavior
+        document.documentElement.style.scrollBehavior = 'smooth';
+        // Add loading animation
+        window.addEventListener('load', function() {{
+            document.body.style.opacity = '0';
+            document.body.style.transition = 'opacity 0.5s ease';
+            setTimeout(() => {{
+                document.body.style.opacity = '1';
+            }}, 100);
+        }});
+    </script>
+</body>
+</html>"""
+    # Write as UTF-8 to match the charset declared in the template's <meta> tag
+    html_path = os.path.join(out_dir, filename)
+    with open(html_path, "w", encoding="utf-8") as f:
+        f.write(html_document)
+    return os.path.abspath(html_path)
 def write_html(md_lines: List[str], out_dir: str, filename: str = "result.html") -> str:
     """
     Convert collected Markdown lines into a single HTML file and save it.
@@ -414,6 +522,54 @@ def _create_html_table(headers: List[str], rows: List[List]) -> str:
     """
+def render_html_table(
+    headers: List[str] | None,
+    rows: List[List[str]] | None,
+    title: Optional[str] = None,
+) -> str:
+    """
+    Build an HTML table, optionally wrapped in a titled section, from tabular data.
+    Used for VLM-extracted tables so they are emitted as real HTML tables
+    rather than markdown.
+    :param headers: Column headers; when missing or empty, "Column 1..N" names are
+        generated, N being the length of the longest row
+    :param rows: Data rows, where each row is a list of cell values
+    :param title: Optional heading placed above the table (passed through ``_escape_html``)
+    :return: HTML string; a "No data available" paragraph when there are neither
+        headers nor rows
+    """
+    column_names = headers if headers else []
+    data_rows = rows if rows else []
+    # Nothing to render at all: emit the placeholder paragraph
+    if not (column_names or data_rows):
+        return "<p class='no-data'>No data available</p>"
+    # No headers supplied: synthesize one name per column of the widest row
+    if not column_names:
+        widest = max(map(len, data_rows), default=1)
+        column_names = [f"Column {position}" for position in range(1, widest + 1)]
+    # Normalize headers and rows (mismatched dimensions) before rendering
+    fitted_headers, fitted_rows = _normalize_data(column_names, data_rows)
+    table_markup = _create_html_table(fitted_headers, fitted_rows)
+    if not title:
+        return table_markup
+    # Titled tables are wrapped in a section carrying the heading
+    return f"""
+        <div class="table-section">
+            <h3 class="table-title">{_escape_html(title)}</h3>
+            {table_markup}
+        </div>
+        """
 def _add_table_styling(html_content: str) -> str:
     """
     Add table styling wrapper to HTML content.
@@ -884,6 +1040,55 @@ def _get_css_styles() -> str:
             content: '☀️';
         }
+        /* Dark mode table styles */
+        [data-theme="dark"] .markdown-table,
+        [data-theme="dark"] table {
+            background: var(--card-bg);
+            border-color: var(--border-color);
+        }
+        [data-theme="dark"] .markdown-table th,
+        [data-theme="dark"] table th {
+            background: #374151;
+            color: #f9fafb;
+            border-bottom-color: var(--accent-color);
+        }
+        [data-theme="dark"] .markdown-table td,
+        [data-theme="dark"] table td {
+            color: #f9fafb;
+            border-bottom-color: var(--border-color);
+        }
+        [data-theme="dark"] .markdown-table tr:nth-child(even),
+        [data-theme="dark"] table tr:nth-child(even) {
+            background: #374151;
+        }
+        [data-theme="dark"] .markdown-table tr:hover,
+        [data-theme="dark"] table tr:hover {
+            background: #4b5563;
+        }
+        /* Dark mode footer styles to match header */
+        [data-theme="dark"] .footer {
+            background: var(--primary-color);
+            color: white;
+            border-top-color: var(--accent-color);
+        }
+        [data-theme="dark"] .footer-brand {
+            color: white;
+        }
+        [data-theme="dark"] .footer a {
+            color: rgba(255, 255, 255, 0.8);
+        }
+        [data-theme="dark"] .footer a:hover {
+            color: white;
+        }
         /* Professional scrollbar */
         ::-webkit-scrollbar {
             width: 8px;

doctra/parsers/enhanced_pdf_parser.py CHANGED Viewed

@@ -24,7 +24,7 @@ from doctra.parsers.layout_order import reading_order_key
 from doctra.utils.ocr_utils import ocr_box_text
 from doctra.exporters.image_saver import save_box_image
 from doctra.exporters.markdown_writer import write_markdown
-from doctra.exporters.html_writer import write_html, write_structured_html
+from doctra.exporters.html_writer import write_html, write_structured_html, render_html_table, write_html_from_lines
 from doctra.exporters.excel_writer import write_structured_excel
 from doctra.utils.structured_utils import to_structured_dict
 from doctra.exporters.markdown_table import render_markdown_table
@@ -141,6 +141,13 @@ class EnhancedPDFParser(StructuredPDFParser):
         if self.use_image_restoration and self.docres_engine:
             print(f"🔄 Processing PDF with image restoration: {os.path.basename(pdf_path)}")
             enhanced_pages = self._process_pages_with_restoration(pdf_path, out_dir)
+            # Create enhanced PDF file using the already processed enhanced pages
+            enhanced_pdf_path = os.path.join(out_dir, f"{pdf_filename}_enhanced.pdf")
+            try:
+                self._create_enhanced_pdf_from_pages(enhanced_pages, enhanced_pdf_path)
+            except Exception as e:
+                print(f"⚠️ Failed to create enhanced PDF: {e}")
         else:
             print(f"🔄 Processing PDF without image restoration: {os.path.basename(pdf_path)}")
             enhanced_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
@@ -177,12 +184,12 @@ class EnhancedPDFParser(StructuredPDFParser):
         if is_notebook:
             progress_bar = create_notebook_friendly_bar(
                 total=len(original_pages),
-                desc=f"🔄 DocRes {self.restoration_task}"
+                desc=f"DocRes {self.restoration_task}"
             )
         else:
             progress_bar = create_beautiful_progress_bar(
                 total=len(original_pages),
-                desc=f"🔄 DocRes {self.restoration_task}",
+                desc=f"DocRes {self.restoration_task}",
                 leave=True
             )
@@ -224,7 +231,6 @@ class EnhancedPDFParser(StructuredPDFParser):
             if hasattr(progress_bar, 'close'):
                 progress_bar.close()
-        print(f"✅ Image restoration completed. Enhanced pages saved to: {enhanced_dir}")
         return enhanced_pages
     def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename, pdf_path):
@@ -238,7 +244,9 @@ class EnhancedPDFParser(StructuredPDFParser):
         table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
         md_lines: List[str] = ["# Enhanced Document Content\n"]
+        html_lines: List[str] = ["<h1>Enhanced Document Content</h1>"]  # For direct HTML generation
         structured_items: List[Dict[str, Any]] = []
+        page_content: Dict[int, List[str]] = {}  # Store content by page
         charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
         tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
@@ -261,10 +269,15 @@ class EnhancedPDFParser(StructuredPDFParser):
                 figures_bar = stack.enter_context(
                     create_beautiful_progress_bar(total=fig_count, desc=figures_desc, leave=True)) if fig_count else None
+            # Initialize page content for all pages first
+            for page_num in range(1, len(pil_pages) + 1):
+                page_content[page_num] = [f"# Page {page_num} Content\n"]
             for p in pages:
                 page_num = p.page_index
                 page_img: Image.Image = pil_pages[page_num - 1]
                 md_lines.append(f"\n## Page {page_num}\n")
+                html_lines.append(f"<h2>Page {page_num}</h2>")
                 for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
                     if box.label in EXCLUDE_LABELS:
@@ -273,7 +286,11 @@ class EnhancedPDFParser(StructuredPDFParser):
                         rel = os.path.relpath(abs_img_path, out_dir)
                         if box.label == "figure":
-                            md_lines.append(f"![Figure — page {page_num}]({rel})\n")
+                            figure_md = f"![Figure — page {page_num}]({rel})\n"
+                            figure_html = f'<img src="{rel}" alt="Figure — page {page_num}" />'
+                            md_lines.append(figure_md)
+                            html_lines.append(figure_html)
+                            page_content[page_num].append(figure_md)
                             if figures_bar: figures_bar.update(1)
                         elif box.label == "chart":
@@ -287,17 +304,31 @@ class EnhancedPDFParser(StructuredPDFParser):
                                         item["page"] = page_num
                                         item["type"] = "Chart"
                                         structured_items.append(item)
-                                        md_lines.append(
-                                            render_markdown_table(item.get("headers"), item.get("rows"),
-                                                                  title=item.get("title"))
-                                        )
+                                        # Generate both markdown and HTML tables
+                                        table_md = render_markdown_table(item.get("headers"), item.get("rows"),
+                                                                         title=item.get("title"))
+                                        table_html = render_html_table(item.get("headers"), item.get("rows"),
+                                                                       title=item.get("title"))
+                                        md_lines.append(table_md)
+                                        html_lines.append(table_html)
+                                        page_content[page_num].append(table_md)
                                         wrote_table = True
                                 except Exception as e:
                                     pass
                                 if not wrote_table:
-                                    md_lines.append(f"![Chart — page {page_num}]({rel})\n")
+                                    chart_md = f"![Chart — page {page_num}]({rel})\n"
+                                    chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
+                                    md_lines.append(chart_md)
+                                    html_lines.append(chart_html)
+                                    page_content[page_num].append(chart_md)
                             else:
-                                md_lines.append(f"![Chart — page {page_num}]({rel})\n")
+                                chart_md = f"![Chart — page {page_num}]({rel})\n"
+                                chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
+                                md_lines.append(chart_md)
+                                html_lines.append(chart_html)
+                                page_content[page_num].append(chart_md)
                             if charts_bar: charts_bar.update(1)
                         elif box.label == "table":
@@ -311,26 +342,60 @@ class EnhancedPDFParser(StructuredPDFParser):
                                         item["page"] = page_num
                                         item["type"] = "Table"
                                         structured_items.append(item)
-                                        md_lines.append(
-                                            render_markdown_table(item.get("headers"), item.get("rows"),
-                                                                  title=item.get("title"))
-                                        )
+                                        # Generate both markdown and HTML tables
+                                        table_md = render_markdown_table(item.get("headers"), item.get("rows"),
+                                                                         title=item.get("title"))
+                                        table_html = render_html_table(item.get("headers"), item.get("rows"),
+                                                                       title=item.get("title"))
+                                        md_lines.append(table_md)
+                                        html_lines.append(table_html)
+                                        page_content[page_num].append(table_md)
                                         wrote_table = True
                                 except Exception as e:
                                     pass
                                 if not wrote_table:
-                                    md_lines.append(f"![Table — page {page_num}]({rel})\n")
+                                    table_md = f"![Table — page {page_num}]({rel})\n"
+                                    table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
+                                    md_lines.append(table_md)
+                                    html_lines.append(table_html)
+                                    page_content[page_num].append(table_md)
                             else:
-                                md_lines.append(f"![Table — page {page_num}]({rel})\n")
+                                table_md = f"![Table — page {page_num}]({rel})\n"
+                                table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
+                                md_lines.append(table_md)
+                                html_lines.append(table_html)
+                                page_content[page_num].append(table_md)
                             if tables_bar: tables_bar.update(1)
                     else:
                         text = ocr_box_text(self.ocr_engine, page_img, box)
                         if text:
                             md_lines.append(text)
                             md_lines.append(self.box_separator if self.box_separator else "")
+                            # Convert text to HTML (basic conversion)
+                            html_text = text.replace('\n', '<br>')
+                            html_lines.append(f"<p>{html_text}</p>")
+                            if self.box_separator:
+                                html_lines.append("<br>")
+                            page_content[page_num].append(text)
+                            page_content[page_num].append(self.box_separator if self.box_separator else "")
         md_path = write_markdown(md_lines, out_dir)
-        html_path = write_html(md_lines, out_dir)
+        # Use HTML lines if VLM is enabled for better table formatting
+        if self.use_vlm and html_lines:
+            html_path = write_html_from_lines(html_lines, out_dir)
+        else:
+            html_path = write_html(md_lines, out_dir)
+        # Create pages folder and save individual page markdown files
+        pages_dir = os.path.join(out_dir, "pages")
+        os.makedirs(pages_dir, exist_ok=True)
+        for page_num, content_lines in page_content.items():
+            page_md_path = os.path.join(pages_dir, f"page_{page_num:03d}.md")
+            write_markdown(content_lines, os.path.dirname(page_md_path), os.path.basename(page_md_path))
         excel_path = None
         html_structured_path = None
@@ -343,6 +408,30 @@ class EnhancedPDFParser(StructuredPDFParser):
         print(f"✅ Enhanced parsing completed successfully!")
         print(f"📁 Output directory: {out_dir}")
+    def _create_enhanced_pdf_from_pages(self, enhanced_pages: List[Image.Image], output_path: str) -> None:
+        """
+        Write already-enhanced page images into a single multi-page PDF.
+        :param enhanced_pages: List of enhanced PIL images to store as PDF pages
+        :param output_path: Path for the enhanced PDF
+        :raises ValueError: If ``enhanced_pages`` is empty
+        """
+        if not enhanced_pages:
+            raise ValueError("No enhanced pages provided")
+        try:
+            # The first image drives the save; the remaining ones are appended to it
+            cover_page = enhanced_pages[0]
+            following_pages = enhanced_pages[1:] or []
+            save_options = {
+                "resolution": 100.0,
+                "save_all": True,
+                "append_images": following_pages,
+            }
+            cover_page.save(output_path, "PDF", **save_options)
+            print(f"✅ Enhanced PDF saved from processed pages: {output_path}")
+        except Exception as exc:
+            # Report the failure here, then re-raise so the caller can handle it
+            print(f"❌ Error creating enhanced PDF from pages: {exc}")
+            raise
     def restore_pdf_only(self, pdf_path: str, output_path: str = None, task: str = None) -> str:
         """
         Apply DocRes restoration to a PDF without parsing.

doctra/parsers/structured_pdf_parser.py CHANGED Viewed

@@ -20,7 +20,7 @@ from doctra.exporters.excel_writer import write_structured_excel
 from doctra.utils.structured_utils import to_structured_dict
 from doctra.exporters.markdown_table import render_markdown_table
 from doctra.exporters.markdown_writer import write_markdown
-from doctra.exporters.html_writer import write_html, write_structured_html
+from doctra.exporters.html_writer import write_html, write_structured_html, render_html_table, write_html_from_lines
 from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
@@ -117,6 +117,7 @@ class StructuredPDFParser:
         table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
         md_lines: List[str] = ["# Extracted Content\n"]
+        html_lines: List[str] = ["<h1>Extracted Content</h1>"]  # For direct HTML generation
         structured_items: List[Dict[str, Any]] = []
         charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
@@ -145,6 +146,7 @@ class StructuredPDFParser:
                 page_num = p.page_index
                 page_img: Image.Image = pil_pages[page_num - 1]
                 md_lines.append(f"\n## Page {page_num}\n")
+                html_lines.append(f"<h2>Page {page_num}</h2>")
                 for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
                     if box.label in EXCLUDE_LABELS:
@@ -153,7 +155,10 @@ class StructuredPDFParser:
                         rel = os.path.relpath(abs_img_path, out_dir)
                         if box.label == "figure":
-                            md_lines.append(f"![Figure — page {page_num}]({rel})\n")
+                            figure_md = f"![Figure — page {page_num}]({rel})\n"
+                            figure_html = f'<img src="{rel}" alt="Figure — page {page_num}" />'
+                            md_lines.append(figure_md)
+                            html_lines.append(figure_html)
                             if figures_bar: figures_bar.update(1)
                         elif box.label == "chart":
@@ -167,17 +172,28 @@ class StructuredPDFParser:
                                         item["page"] = page_num
                                         item["type"] = "Chart"
                                         structured_items.append(item)
-                                        md_lines.append(
-                                            render_markdown_table(item.get("headers"), item.get("rows"),
-                                                                  title=item.get("title"))
-                                        )
+                                        # Generate both markdown and HTML tables
+                                        table_md = render_markdown_table(item.get("headers"), item.get("rows"),
+                                                                         title=item.get("title"))
+                                        table_html = render_html_table(item.get("headers"), item.get("rows"),
+                                                                       title=item.get("title"))
+                                        md_lines.append(table_md)
+                                        html_lines.append(table_html)
                                         wrote_table = True
                                 except Exception as e:
                                     pass
                                 if not wrote_table:
-                                    md_lines.append(f"![Chart — page {page_num}]({rel})\n")
+                                    chart_md = f"![Chart — page {page_num}]({rel})\n"
+                                    chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
+                                    md_lines.append(chart_md)
+                                    html_lines.append(chart_html)
                             else:
-                                md_lines.append(f"![Chart — page {page_num}]({rel})\n")
+                                chart_md = f"![Chart — page {page_num}]({rel})\n"
+                                chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
+                                md_lines.append(chart_md)
+                                html_lines.append(chart_html)
                             if charts_bar: charts_bar.update(1)
                         elif box.label == "table":
@@ -191,26 +207,47 @@ class StructuredPDFParser:
                                         item["page"] = page_num
                                         item["type"] = "Table"
                                         structured_items.append(item)
-                                        md_lines.append(
-                                            render_markdown_table(item.get("headers"), item.get("rows"),
-                                                                  title=item.get("title"))
-                                        )
+                                        # Generate both markdown and HTML tables
+                                        table_md = render_markdown_table(item.get("headers"), item.get("rows"),
+                                                                         title=item.get("title"))
+                                        table_html = render_html_table(item.get("headers"), item.get("rows"),
+                                                                       title=item.get("title"))
+                                        md_lines.append(table_md)
+                                        html_lines.append(table_html)
                                         wrote_table = True
                                 except Exception as e:
                                     pass
                                 if not wrote_table:
-                                    md_lines.append(f"![Table — page {page_num}]({rel})\n")
+                                    table_md = f"![Table — page {page_num}]({rel})\n"
+                                    table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
+                                    md_lines.append(table_md)
+                                    html_lines.append(table_html)
                             else:
-                                md_lines.append(f"![Table — page {page_num}]({rel})\n")
+                                table_md = f"![Table — page {page_num}]({rel})\n"
+                                table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
+                                md_lines.append(table_md)
+                                html_lines.append(table_html)
                             if tables_bar: tables_bar.update(1)
                     else:
                         text = ocr_box_text(self.ocr_engine, page_img, box)
                         if text:
                             md_lines.append(text)
                             md_lines.append(self.box_separator if self.box_separator else "")
+                            # Convert text to HTML (basic conversion)
+                            html_text = text.replace('\n', '<br>')
+                            html_lines.append(f"<p>{html_text}</p>")
+                            if self.box_separator:
+                                html_lines.append("<br>")
         md_path = write_markdown(md_lines, out_dir)
-        html_path = write_html(md_lines, out_dir)
+        # Use HTML lines if VLM is enabled for better table formatting
+        if self.use_vlm and html_lines:
+            html_path = write_html_from_lines(html_lines, out_dir)
+        else:
+            html_path = write_html(md_lines, out_dir)
         excel_path = None
         html_structured_path = None

doctra 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

doctra 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl