PyPI - doctra - Versions diffs - 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl - Mend

doctra-0.4.0-py3-none-any.whl → doctra-0.4.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

doctra/cli/main.py +5 -12
doctra/cli/utils.py +2 -3
doctra/engines/image_restoration/docres_engine.py +2 -7
doctra/engines/vlm/outlines_types.py +13 -9
doctra/engines/vlm/service.py +4 -2
doctra/exporters/excel_writer.py +89 -0
doctra/parsers/enhanced_pdf_parser.py +18 -14
doctra/parsers/structured_pdf_parser.py +6 -0
doctra/parsers/table_chart_extractor.py +6 -0
doctra/ui/app.py +8 -14
doctra/utils/structured_utils.py +5 -2
doctra/version.py +1 -1
{doctra-0.4.0.dist-info → doctra-0.4.1.dist-info}/METADATA +1 -1
{doctra-0.4.0.dist-info → doctra-0.4.1.dist-info}/RECORD +17 -17
{doctra-0.4.0.dist-info → doctra-0.4.1.dist-info}/WHEEL +0 -0
{doctra-0.4.0.dist-info → doctra-0.4.1.dist-info}/licenses/LICENSE +0 -0
{doctra-0.4.0.dist-info → doctra-0.4.1.dist-info}/top_level.txt +0 -0

doctra/cli/main.py CHANGED Viewed

@@ -9,6 +9,7 @@ detection results, and analyze document structure from the command line.
 import click
 import os
 import sys
+import traceback
 from pathlib import Path
 from typing import Optional
@@ -25,6 +26,10 @@ except ImportError:
     from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
     from doctra.parsers.table_chart_extractor import ChartTablePDFParser
+# Import additional modules
+from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
+from doctra.engines.image_restoration import DocResEngine
 @click.group(invoke_without_command=True)
 @click.pass_context
@@ -247,7 +252,6 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
     except Exception as e:
         click.echo(f"❌ Error initializing parser: {e}", err=True)
         if verbose:
-            import traceback
             click.echo(traceback.format_exc(), err=True)
         sys.exit(1)
@@ -271,7 +275,6 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
     except Exception as e:
         click.echo(f"❌ Error during parsing: {e}", err=True)
         if verbose:
-            import traceback
             click.echo(traceback.format_exc(), err=True)
         sys.exit(1)
     finally:
@@ -394,7 +397,6 @@ def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
     except Exception as e:
         click.echo(f"❌ Error initializing enhanced parser: {e}", err=True)
         if verbose:
-            import traceback
             click.echo(traceback.format_exc(), err=True)
         sys.exit(1)
@@ -418,7 +420,6 @@ def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
     except Exception as e:
         click.echo(f"❌ Error during enhanced parsing: {e}", err=True)
         if verbose:
-            import traceback
             click.echo(traceback.format_exc(), err=True)
         sys.exit(1)
     finally:
@@ -526,7 +527,6 @@ def charts(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
     except Exception as e:
         click.echo(f"❌ Error during chart extraction: {e}", err=True)
         if verbose:
-            import traceback
             click.echo(traceback.format_exc(), err=True)
         sys.exit(1)
@@ -604,7 +604,6 @@ def tables(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
     except Exception as e:
         click.echo(f"❌ Error during table extraction: {e}", err=True)
         if verbose:
-            import traceback
             click.echo(traceback.format_exc(), err=True)
         sys.exit(1)
@@ -683,7 +682,6 @@ def both(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
     except Exception as e:
         click.echo(f"❌ Error during extraction: {e}", err=True)
         if verbose:
-            import traceback
             click.echo(traceback.format_exc(), err=True)
         sys.exit(1)
@@ -772,7 +770,6 @@ def visualize(pdf_path: Path, pages: int, columns: int, width: int,
     except Exception as e:
         click.echo(f"❌ Error creating visualization: {e}", err=True)
         if verbose:
-            import traceback
             click.echo(traceback.format_exc(), err=True)
         sys.exit(1)
@@ -805,7 +802,6 @@ def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbo
         click.echo(f"🔍 Analyzing: {pdf_path.name}")
         # Create layout engine for analysis only
-        from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
         if verbose:
             click.echo(f"   Using model: {layout_model}")
@@ -903,7 +899,6 @@ def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbo
     except Exception as e:
         click.echo(f"❌ Error analyzing PDF: {e}", err=True)
         if verbose:
-            import traceback
             click.echo(traceback.format_exc(), err=True)
         sys.exit(1)
@@ -922,7 +917,6 @@ def info():
     click.echo("=" * 50)
     # Check Python version
-    import sys
     python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
     click.echo(f"Python version: {python_version}")
@@ -1003,7 +997,6 @@ def info():
     # DocRes information
     click.echo("\nDocRes Image Restoration:")
     try:
-        from doctra.engines.image_restoration import DocResEngine
         docres = DocResEngine()
         click.echo(f"  ✅ DocRes available - {len(docres.get_supported_tasks())} restoration tasks")
         click.echo("  Tasks: dewarping, deshadowing, appearance, deblurring, binarization, end2end")

doctra/cli/utils.py CHANGED Viewed

@@ -7,8 +7,10 @@ different CLI commands.
 import click
 import sys
+import traceback
 from typing import Optional, Dict, Any
 from pathlib import Path
+from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
 def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str]) -> None:
@@ -58,7 +60,6 @@ def handle_exception(e: Exception, verbose: bool = False) -> None:
     """
     click.echo(f"❌ Error: {e}", err=True)
     if verbose:
-        import traceback
         click.echo(traceback.format_exc(), err=True)
     sys.exit(1)
@@ -271,8 +272,6 @@ def create_progress_callback(description: str, total: int):
     :return: Callable progress callback function that takes an integer
              representing the number of completed items
     """
-    import sys
-    from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
     # Enhanced environment detection
     is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules

doctra/engines/image_restoration/docres_engine.py CHANGED Viewed

@@ -18,6 +18,8 @@ import sys
 import cv2
 import numpy as np
 import torch
+import tempfile
+import time
 from pathlib import Path
 from typing import Union, List, Tuple, Optional, Dict, Any
@@ -308,8 +310,6 @@ class DocResEngine:
     def _run_single_task(self, img_array: np.ndarray, task: str, save_prompts: bool) -> Tuple[np.ndarray, Dict]:
         """Run a single restoration task"""
-        import tempfile
-        import time
         # Create temporary file for inference
         with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp_file:
@@ -322,7 +322,6 @@ class DocResEngine:
             os.chdir(str(docres_dir))
             # Set global DEVICE variable that DocRes inference expects
-            import torch
             import inference  # Import the inference module to set its global DEVICE
             inference.DEVICE = self.device
@@ -364,8 +363,6 @@ class DocResEngine:
     def _run_end2end_pipeline(self, img_array: np.ndarray, save_prompts: bool) -> Tuple[np.ndarray, Dict]:
         """Run the end2end pipeline: dewarping → deshadowing → appearance"""
-        import tempfile
-        import time
         intermediate_steps = {}
@@ -374,7 +371,6 @@ class DocResEngine:
         os.chdir(str(docres_dir))
         # Set global DEVICE variable that DocRes inference expects
-        import torch
         import inference  # Import the inference module to set its global DEVICE
         inference.DEVICE = self.device
@@ -482,7 +478,6 @@ class DocResEngine:
         """
         try:
             from PIL import Image
-            import numpy as np
             from doctra.utils.pdf_io import render_pdf_to_images
             # Generate output path if not provided

doctra/engines/vlm/outlines_types.py CHANGED Viewed

@@ -1,17 +1,19 @@
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 class Chart(BaseModel):
     """
     Structured representation of a chart extracted from an image.
-    Contains the title, headers, and data rows extracted from a chart
-    using VLM (Vision Language Model) processing.
+    Includes a title, a short description, column headers, and data rows
+    identified using VLM (Vision Language Model) processing.
-    :param title: Title or caption of the chart
+    :param title: Title or caption of the chart (max 31 characters)
+    :param description: Short description of the chart (max 300 characters)
     :param headers: Column headers for the chart data
     :param rows: Data rows containing the chart values
     """
-    title: str
+    title: str = Field(max_length=31)
+    description: str = Field(max_length=300)
     headers: list[str]
     rows: list[list[str]]
@@ -19,13 +21,15 @@ class Table(BaseModel):
     """
     Structured representation of a table extracted from an image.
-    Contains the title, headers, and data rows extracted from a table
-    using VLM (Vision Language Model) processing.
+    Includes a title, a short description, column headers, and data rows
+    identified using VLM (Vision Language Model) processing.
-    :param title: Title or caption of the table
+    :param title: Title or caption of the table (max 31 characters)
+    :param description: Short description of the table (max 300 characters)
     :param headers: Column headers for the table data
     :param rows: Data rows containing the table values
     """
-    title: str
+    title: str = Field(max_length=31)
+    description: str = Field(max_length=300)
     headers: list[str]
     rows: list[list[str]]

doctra/engines/vlm/service.py CHANGED Viewed

@@ -73,7 +73,7 @@ class VLMStructuredExtractor:
         Extract structured chart data from an image.
         :param image_path: Path to the chart image file
-        :return: Chart object containing extracted title, headers, and data rows
+        :return: Chart object containing extracted title, description, headers, and data rows
         :raises Exception: If image processing or VLM extraction fails
         """
         prompt_text = (
@@ -81,6 +81,7 @@ class VLMStructuredExtractor:
             "If the title is not present in the image, generate a suitable title. "
             "Ensure that the table represents the data from the chart accurately."
             "The number of columns in the headers must match the number of columns in each row."
+            "Also provide a short description (max 300 characters) of the chart."
         )
         return self._call(prompt_text, image_path, Chart)
@@ -89,7 +90,7 @@ class VLMStructuredExtractor:
         Extract structured table data from an image.
         :param image_path: Path to the table image file
-        :return: Table object containing extracted title, headers, and data rows
+        :return: Table object containing extracted title, description, headers, and data rows
         :raises Exception: If image processing or VLM extraction fails
         """
         prompt_text = (
@@ -97,5 +98,6 @@ class VLMStructuredExtractor:
             "Provide the headers and rows of the table, ensuring accuracy in the extraction. "
             "If the title is not present in the image, generate a suitable title."
             "The number of columns in the headers must match the number of columns in each row."
+            "Also provide a short description (max 300 characters) of the table."
         )
         return self._call(prompt_text, image_path, Table)

doctra/exporters/excel_writer.py CHANGED Viewed

@@ -5,6 +5,7 @@ from typing import Dict, Any, List, Set
 import pandas as pd  # pip install pandas openpyxl
 from openpyxl.styles import PatternFill, Font, Alignment
 from openpyxl.utils import get_column_letter
+from openpyxl.worksheet.hyperlink import Hyperlink
 _INVALID_SHEET_CHARS = r'[:\\/*?\[\]]'  # Excel-invalid characters
 _MAX_SHEET_LEN = 31
@@ -85,6 +86,61 @@ def _autosize_columns(ws, df: pd.DataFrame) -> None:
         ws.column_dimensions[get_column_letter(i)].width = min(max(10, max_len + 2), 60)
+def _style_summary_sheet(ws, df: pd.DataFrame, sheet_mapping: dict = None) -> None:
+    """
+    Apply special styling to the summary sheet with text wrapping for descriptions.
+    Add hyperlinks to table titles that link to their corresponding sheets.
+    :param ws: OpenPyXL worksheet object to style
+    :param df: Pandas DataFrame containing the summary data
+    :param sheet_mapping: Dictionary mapping table titles to their sheet names
+    :return: None
+    """
+    # Style header row
+    _style_header(ws, ncols=df.shape[1])
+    # Apply text wrapping to all data cells
+    wrap_alignment = Alignment(wrap_text=True, vertical="top")
+    # Apply wrapping to all data rows (skip header row)
+    for row_idx in range(2, len(df) + 2):  # Start from row 2 (after header)
+        for col_idx in range(1, df.shape[1] + 1):
+            cell = ws.cell(row=row_idx, column=col_idx)
+            cell.alignment = wrap_alignment
+            # Add hyperlink to table title column (column A)
+            if col_idx == 1 and sheet_mapping:  # Table Title column
+                table_title = cell.value
+                if table_title and table_title in sheet_mapping:
+                    sheet_name = sheet_mapping[table_title]
+                    # Create hyperlink to the sheet using proper Excel format
+                    # Escape sheet name if it contains spaces or special characters
+                    if ' ' in sheet_name or any(char in sheet_name for char in ['[', ']', '*', '?', ':', '\\', '/']):
+                        hyperlink_ref = f"#'{sheet_name}'!A1"
+                    else:
+                        hyperlink_ref = f"#{sheet_name}!A1"
+                    # Use Hyperlink class with proper parameters
+                    cell.hyperlink = Hyperlink(ref=hyperlink_ref, target=hyperlink_ref)
+                    # Style the hyperlink
+                    cell.font = Font(color="0000FF", underline="single")
+    # Set specific column widths for summary sheet
+    # Table Title column - narrower
+    ws.column_dimensions['A'].width = 30
+    # Description column - wider to accommodate wrapped text
+    ws.column_dimensions['B'].width = 60
+    # Page column - narrow for page numbers
+    ws.column_dimensions['C'].width = 10
+    # Type column - narrow for Table/Chart
+    ws.column_dimensions['D'].width = 12
+    # Set row heights to accommodate wrapped text
+    for row_idx in range(2, len(df) + 2):
+        ws.row_dimensions[row_idx].height = 60  # Allow for multiple lines
 def _normalize_data(headers: List[str], rows: List[List]) -> tuple[List[str], List[List]]:
     """
     Normalize headers and rows to ensure consistent dimensions.
@@ -159,6 +215,31 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
     taken: Set[str] = set()
     with pd.ExcelWriter(excel_path, engine="openpyxl", mode="w") as writer:
+        # Create summary sheet first
+        summary_data = []
+        sheet_mapping = {}  # Map table titles to their sheet names
+        for item in valid_items:
+            title = item.get("title") or "Untitled"
+            description = item.get("description") or "No description available"
+            page_number = item.get("page", "Unknown")
+            item_type = item.get("type", "Table")  # Default to "Table" if not specified
+            summary_data.append({
+                "Table Title": title,
+                "Description": description,
+                "Page": page_number,
+                "Type": item_type
+            })
+        # Create summary sheet first (but without hyperlinks initially)
+        if summary_data:
+            summary_df = pd.DataFrame(summary_data)
+            summary_df.to_excel(writer, sheet_name="Table Summary", index=False)
+            taken.add("Table Summary")
+        # Process individual table sheets to build sheet mapping
         for item in valid_items:
             try:
                 title = item.get("title") or "Untitled"
@@ -166,6 +247,9 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
                 rows = item.get("rows") or []
                 sheet_name = _safe_sheet_name(title, taken)
+                # Add to sheet mapping for hyperlinks
+                sheet_mapping[title] = sheet_name
                 # Normalize data to handle mismatched dimensions
                 normalized_headers, normalized_rows = _normalize_data(headers, rows)
@@ -194,4 +278,9 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
                 print(f"Error processing item '{item.get('title', 'Unknown')}': {e}")
                 continue
+        # Now add hyperlinks to the summary sheet (after all sheets are created)
+        if summary_data and sheet_mapping:
+            summary_ws = writer.sheets["Table Summary"]
+            _style_summary_sheet(summary_ws, summary_df, sheet_mapping)
     return excel_path

doctra/parsers/enhanced_pdf_parser.py CHANGED Viewed

@@ -8,6 +8,7 @@ capabilities with DocRes image restoration for improved document processing.
 from __future__ import annotations
 import os
 import sys
+import numpy as np
 from typing import List, Dict, Any, Optional, Union
 from contextlib import ExitStack
 from PIL import Image
@@ -16,9 +17,17 @@ from tqdm import tqdm
 from doctra.parsers.structured_pdf_parser import StructuredPDFParser
 from doctra.engines.image_restoration import DocResEngine
 from doctra.utils.pdf_io import render_pdf_to_images
-from doctra.utils.constants import IMAGE_SUBDIRS
+from doctra.utils.constants import IMAGE_SUBDIRS, EXCLUDE_LABELS
 from doctra.utils.file_ops import ensure_output_dirs
 from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
+from doctra.parsers.layout_order import reading_order_key
+from doctra.utils.ocr_utils import ocr_box_text
+from doctra.exporters.image_saver import save_box_image
+from doctra.exporters.markdown_writer import write_markdown
+from doctra.exporters.html_writer import write_html, write_structured_html
+from doctra.exporters.excel_writer import write_structured_excel
+from doctra.utils.structured_utils import to_structured_dict
+from doctra.exporters.markdown_table import render_markdown_table
 class EnhancedPDFParser(StructuredPDFParser):
@@ -146,7 +155,7 @@ class EnhancedPDFParser(StructuredPDFParser):
         pil_pages = enhanced_pages
         # Continue with standard parsing logic
-        self._process_parsing_logic(pages, pil_pages, out_dir, pdf_filename)
+        self._process_parsing_logic(pages, pil_pages, out_dir, pdf_filename, pdf_path)
     def _process_pages_with_restoration(self, pdf_path: str, out_dir: str) -> List[Image.Image]:
         """
@@ -186,7 +195,6 @@ class EnhancedPDFParser(StructuredPDFParser):
                 for i, page_img in enumerate(original_pages):
                     try:
                         # Convert PIL to numpy array
-                        import numpy as np
                         img_array = np.array(page_img)
                         # Apply DocRes restoration
@@ -219,21 +227,11 @@ class EnhancedPDFParser(StructuredPDFParser):
         print(f"✅ Image restoration completed. Enhanced pages saved to: {enhanced_dir}")
         return enhanced_pages
-    def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename):
+    def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename, pdf_path):
         """
         Process the parsing logic with enhanced pages.
         This is extracted from the parent class to allow customization.
         """
-        from doctra.utils.constants import EXCLUDE_LABELS
-        from doctra.parsers.layout_order import reading_order_key
-        from doctra.utils.ocr_utils import ocr_box_text
-        from doctra.exporters.image_saver import save_box_image
-        from doctra.exporters.markdown_writer import write_markdown
-        from doctra.exporters.html_writer import write_html
-        from doctra.exporters.excel_writer import write_structured_excel
-        from doctra.exporters.html_writer import write_structured_html
-        from doctra.utils.structured_utils import to_structured_dict
-        from doctra.exporters.markdown_table import render_markdown_table
         fig_count = sum(sum(1 for b in p.boxes if b.label == "figure") for p in pages)
         chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages)
@@ -285,6 +283,9 @@ class EnhancedPDFParser(StructuredPDFParser):
                                     chart = self.vlm.extract_chart(abs_img_path)
                                     item = to_structured_dict(chart)
                                     if item:
+                                        # Add page and type information to structured item
+                                        item["page"] = page_num
+                                        item["type"] = "Chart"
                                         structured_items.append(item)
                                         md_lines.append(
                                             render_markdown_table(item.get("headers"), item.get("rows"),
@@ -306,6 +307,9 @@ class EnhancedPDFParser(StructuredPDFParser):
                                     table = self.vlm.extract_table(abs_img_path)
                                     item = to_structured_dict(table)
                                     if item:
+                                        # Add page and type information to structured item
+                                        item["page"] = page_num
+                                        item["type"] = "Table"
                                         structured_items.append(item)
                                         md_lines.append(
                                             render_markdown_table(item.get("headers"), item.get("rows"),

doctra/parsers/structured_pdf_parser.py CHANGED Viewed

@@ -163,6 +163,9 @@ class StructuredPDFParser:
                                     chart = self.vlm.extract_chart(abs_img_path)
                                     item = to_structured_dict(chart)
                                     if item:
+                                        # Add page and type information to structured item
+                                        item["page"] = page_num
+                                        item["type"] = "Chart"
                                         structured_items.append(item)
                                         md_lines.append(
                                             render_markdown_table(item.get("headers"), item.get("rows"),
@@ -184,6 +187,9 @@ class StructuredPDFParser:
                                     table = self.vlm.extract_table(abs_img_path)
                                     item = to_structured_dict(table)
                                     if item:
+                                        # Add page and type information to structured item
+                                        item["page"] = page_num
+                                        item["type"] = "Table"
                                         structured_items.append(item)
                                         md_lines.append(
                                             render_markdown_table(item.get("headers"), item.get("rows"),

doctra/parsers/table_chart_extractor.py CHANGED Viewed

@@ -178,6 +178,9 @@ class ChartTablePDFParser:
                                 extracted_chart = self.vlm.extract_chart(chart_path)
                                 structured_item = to_structured_dict(extracted_chart)
                                 if structured_item:
+                                    # Add page and type information to structured item
+                                    structured_item["page"] = page_num
+                                    structured_item["type"] = "Chart"
                                     structured_items.append(structured_item)
                                     vlm_items.append({
                                         "kind": "chart",
@@ -221,6 +224,9 @@ class ChartTablePDFParser:
                                 extracted_table = self.vlm.extract_table(table_path)
                                 structured_item = to_structured_dict(extracted_table)
                                 if structured_item:
+                                    # Add page and type information to structured item
+                                    structured_item["page"] = page_num
+                                    structured_item["type"] = "Table"
                                     structured_items.append(structured_item)
                                     vlm_items.append({
                                         "kind": "table",

doctra/ui/app.py CHANGED Viewed

@@ -2,6 +2,11 @@ import os
 import shutil
 import tempfile
 import re
+import traceback
+import pandas as pd
+import html as _html
+import base64
+import json
 from pathlib import Path
 from typing import Optional, Tuple, List, Dict, Any
@@ -9,6 +14,7 @@ import gradio as gr
 from doctra.parsers.structured_pdf_parser import StructuredPDFParser
 from doctra.parsers.table_chart_extractor import ChartTablePDFParser
+from doctra.utils.pdf_io import render_pdf_to_images
 def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zip_filename: Optional[str] = None, is_structured_parsing: bool = False) -> Tuple[List[tuple[str, str]], List[str], str]:
@@ -100,7 +106,6 @@ def _parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
     Parse markdown content and organize it by pages.
     Returns a list of page dictionaries with content, tables, charts, and figures.
     """
-    import re
     pages = []
     current_page = None
@@ -209,7 +214,6 @@ def run_full_parse(
     try:
         parser.parse(str(input_pdf))
     except Exception as e:
-        import traceback
         traceback.print_exc()
         # Safely encode error message for return value
         try:
@@ -325,8 +329,6 @@ def run_extract(
             if excel_filename:
                 excel_path = out_dir / excel_filename
                 if excel_path.exists():
-                    import pandas as pd
-                    import html as _html
                     # Read Excel file and create HTML tables
                     xl_file = pd.ExcelFile(excel_path)
@@ -489,7 +491,6 @@ def build_demo() -> gr.Blocks:
             def parse_markdown_by_pages(md_content: str):
                 """Parse markdown content and organize it by pages."""
-                import re
                 pages = []
                 current_page = None
@@ -548,7 +549,6 @@ def build_demo() -> gr.Blocks:
                     return "Page not found", None
                 # Build HTML with inline base64 images, render markdown tables, and preserve paragraphs/line breaks
-                import html as _html, base64, re as _re
                 base_dir = None
                 try:
                     stem = Path(pdf_path).stem if pdf_path else ""
@@ -589,7 +589,7 @@ def build_demo() -> gr.Blocks:
                     stripped = line.strip()
                     if stripped.startswith('![') and ('](images/' in stripped or '](images\\' in stripped):
                         flush_paragraph()
-                        match = _re.match(r'!\[([^\]]+)\]\(([^)]+)\)', stripped)
+                        match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', stripped)
                         if match and base_dir is not None:
                             caption = match.group(1)
                             rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
@@ -646,7 +646,6 @@ def build_demo() -> gr.Blocks:
                 # Ensure page images are prepared
                 try:
                     if pdf_path and not page_images:
-                        from doctra.utils.pdf_io import render_pdf_to_images
                         tmp_img_dir = Path(tempfile.mkdtemp(prefix="doctra_pages_"))
                         pil_pages = render_pdf_to_images(pdf_path)
                         saved_paths: List[str] = []
@@ -726,7 +725,6 @@ def build_demo() -> gr.Blocks:
                     for page in pages_data:
                         for line in page['content']:
                             if line.strip().startswith('![') and ('](images/' in line or '](images\\' in line):
-                                import re
                                 match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line.strip())
                                 if match:
                                     caption = match.group(1)
@@ -745,7 +743,6 @@ def build_demo() -> gr.Blocks:
                 saved_paths: List[str] = []
                 try:
                     if input_pdf_path:
-                        from doctra.utils.pdf_io import render_pdf_to_images
                         tmp_img_dir = Path(tempfile.mkdtemp(prefix="doctra_pages_"))
                         pil_pages = render_pdf_to_images(input_pdf_path)
                         for idx, (im, _, _) in enumerate(pil_pages, start=1):
@@ -759,7 +756,6 @@ def build_demo() -> gr.Blocks:
                 # Build initial HTML with inline images and proper blocks for first page
                 if pages_data:
-                    import html as _html, base64, re as _re
                     base_dir = None
                     try:
                         stem = Path(input_pdf_path).stem if input_pdf_path else ""
@@ -771,7 +767,7 @@ def build_demo() -> gr.Blocks:
                     for raw_line in pages_data[0]['content']:
                         line = raw_line.strip()
                         if line.startswith('![') and ('](images/' in line or '](images\\' in line):
-                            match = _re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line)
+                            match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line)
                             if match and base_dir is not None:
                                 caption = match.group(1)
                                 rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
@@ -874,7 +870,6 @@ def build_demo() -> gr.Blocks:
                     if not mapping.exists():
                         return gr.Dropdown(choices=[], value=None, visible=False)
-                    import json
                     data = json.loads(mapping.read_text(encoding="utf-8"))
                     choices = []
@@ -902,7 +897,6 @@ def build_demo() -> gr.Blocks:
                     if not mapping.exists():
                         return "", None
-                    import json, html as _html
                     data = json.loads(mapping.read_text(encoding="utf-8"))
                     for entry in data:

doctra/utils/structured_utils.py CHANGED Viewed

@@ -14,7 +14,7 @@ def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
       - JSON string
       - dict
       - Pydantic BaseModel (v1 .dict() or v2 .model_dump())
-    Returns a normalized dict with keys: title, headers, rows — or None.
+    Returns a normalized dict with keys: title, description, headers, rows, page, type — or None.
     """
     if obj is None:
         return None
@@ -36,10 +36,13 @@ def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
     if isinstance(obj, dict):
         title = obj.get("title") or "Untitled"
+        description = obj.get("description") or ""
         headers = obj.get("headers") or []
         rows = obj.get("rows") or []
+        page = obj.get("page", "Unknown")
+        item_type = obj.get("type", "Table")
         if not isinstance(headers, list) or not isinstance(rows, list):
             return None
-        return {"title": title, "headers": headers, "rows": rows}
+        return {"title": title, "description": description, "headers": headers, "rows": rows, "page": page, "type": item_type}
     return None

doctra/version.py CHANGED Viewed

@@ -1,2 +1,2 @@
 """Version information for Doctra."""
-__version__ = '0.4.0'
+__version__ = '0.4.1'

{doctra-0.4.0.dist-info → doctra-0.4.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: doctra
-Version: 0.4.0
+Version: 0.4.1
 Summary: Parse, extract, and analyze documents with ease
 Home-page: https://github.com/AdemBoukhris457/Doctra
 Author: Adem Boukhris

{doctra-0.4.0.dist-info → doctra-0.4.1.dist-info}/RECORD RENAMED Viewed

@@ -1,11 +1,11 @@
 doctra/__init__.py,sha256=rNLCyODOpaPb_TTP6qmQnuWZJW9JPXrxg1IfKnvb1No,773
-doctra/version.py,sha256=PSDo-SLZhu8_cWgmtvzLjHyKr7C8D_F61M1tiywnuKY,62
+doctra/version.py,sha256=gJX4jQdS3czcKE2h1k17fJPgWzxHyGH2oFP9nW9cTLw,62
 doctra/cli/__init__.py,sha256=4PTujjYRShOOUlZ7PwuWckShPWLC4v4CYIhJpzgyv1k,911
-doctra/cli/main.py,sha256=6b415qg-8gV4M2Uf0WvdU_nFx65DYFgRu5Q3Ys_LvAo,43756
-doctra/cli/utils.py,sha256=IghiUZQCOmXODC5-5smHGz2KeV4xqbP4avmA1Mggln0,11800
+doctra/cli/main.py,sha256=_gvG8bm-Mn1tIEw6eJUgqz9dYEo9klXGiJDJzjqgPyo,43503
+doctra/cli/utils.py,sha256=w3Bxyzczcbl_cs1Cea8C3ehv7dkGl_wecprYZXrcGhk,11772
 doctra/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 doctra/engines/image_restoration/__init__.py,sha256=vzcN6Rw7_U-5jIK2pdo2NlgqdLdXDShigrOGM7QLNEE,263
-doctra/engines/image_restoration/docres_engine.py,sha256=6j2LfoqirmEEmLTOsz8nkhqaHUQHjYbJr-2MR01i6Gc,21754
+doctra/engines/image_restoration/docres_engine.py,sha256=n9Pr0R7dbu_UHv51urGv_wC6ZYW-43bmXxiyTCOEOMo,21612
 doctra/engines/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 doctra/engines/layout/layout_models.py,sha256=vuTzjWd3FD-SkFPngktmUVhOJ6Xvff6ufwFEq796PQs,3162
 doctra/engines/layout/paddle_layout.py,sha256=P2-Gk8wHpWoA5Jpmo_3OLI59zWq3HeAOBOUKKVdXu8I,6792
@@ -14,20 +14,20 @@ doctra/engines/ocr/api.py,sha256=YOBKDLExXpvSiOsc_TDJasaMPxzdVx1llQCtYlsruWo,128
 doctra/engines/ocr/path_resolver.py,sha256=2_7Nsekt3dCDU3oVsgdr62iMrlAhbGNfYwgh4G7S3pA,1492
 doctra/engines/ocr/pytesseract_engine.py,sha256=Imz2uwju6himkBiS8CH7DLxBRe-LtmMYZiOdb_6PoQw,2911
 doctra/engines/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-doctra/engines/vlm/outlines_types.py,sha256=qL-G6MNiA5mxp1qAPVEFhOANp4NqVt_MQKseJCr_xXE,970
+doctra/engines/vlm/outlines_types.py,sha256=fQK6ru7XiXHaa8JPpaTTBaTk_zQ93ZyhFp4SyAnUdVU,1337
 doctra/engines/vlm/provider.py,sha256=aE8Eo1U-8XqAimakNlT0-T4etIyCV8rZ3DwxdqbFeTc,3131
-doctra/engines/vlm/service.py,sha256=4ExDbLmyyC3ICXxr7OSIqvbOdrwbIJek-DE54vAUgDA,4151
+doctra/engines/vlm/service.py,sha256=nygxMe7uTq6Bv70ycBPL59F2a0ESp1Hix4j833p6rUM,4343
 doctra/exporters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-doctra/exporters/excel_writer.py,sha256=U5Eb5SF7_ll1QveUapSWSkCRt3OEoisKEVUQ_7X8Wjo,7762
+doctra/exporters/excel_writer.py,sha256=rwyqlH73P7z413BELovQY_pS6IMkkqHEho6mbPrJ2Sk,11857
 doctra/exporters/html_writer.py,sha256=OlW24Eg5bZcjldRHtd3GDD7RrajuRXj43EJpXIJkYf8,38810
 doctra/exporters/image_saver.py,sha256=zsPoQ0CwoE643ui4iZMdXk96kv5mU8L_zC2JfF22N1A,1639
 doctra/exporters/markdown_table.py,sha256=4_OJIwG_WoIPYBzJx1njy_3tNVdkK6QKSP-P9r-b0zw,2030
 doctra/exporters/markdown_writer.py,sha256=L7EjF2MB8jYX7XkZ3a3NeeEC8gnb0qzRPTzIN9tdfuw,1027
 doctra/parsers/__init__.py,sha256=8M6LVzcWGpuTIK_1SMXML3ll7zK1CTHXGI5qXvqdm-A,206
-doctra/parsers/enhanced_pdf_parser.py,sha256=7KfkQexXTxbi8Naen7HFlFaeoEGpfdbYbvRqkTXw22A,18095
+doctra/parsers/enhanced_pdf_parser.py,sha256=NBBopYdSIHWd_O96J0qR3DqZvbAt3CfK1hwUkXu8540,18377
 doctra/parsers/layout_order.py,sha256=W6b-T11H907RZ2FaZwNvnYhmvH11rpUzxC5yLkdf28k,640
-doctra/parsers/structured_pdf_parser.py,sha256=QIZIS5SAaIdGiT8o7G_a4D-Cht7nVLGeSuVzqSYLn14,19160
-doctra/parsers/table_chart_extractor.py,sha256=kSubqX0n0kVu_3jzX6QUyKmEGs9sG3Bg9kzUzn2wPHo,13733
+doctra/parsers/structured_pdf_parser.py,sha256=AU6yLW2kpd8bxZjelmm73L4CVBysnVAdKxwPkTV1Fzk,19602
+doctra/parsers/table_chart_extractor.py,sha256=ePmk9m9n-mvkqOvxpWC42ElxbnKMmDnq-e6SWiNqgzA,14195
 doctra/third_party/docres/inference.py,sha256=krD5EQDiqki-5uTMqqHYivhL38sfSOhYgaihI751070,13576
 doctra/third_party/docres/utils.py,sha256=N0ZVmOTB3wsinFlYu5hT84C4_MhWGdc98T8LTG-S9dA,14566
 doctra/third_party/docres/data/MBD/MBD.py,sha256=-d6cVQX1FVcGmQ_yJ5l-PQ3xKmkmveQQBytZ38pEGfY,4653
@@ -49,7 +49,7 @@ doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.
 doctra/third_party/docres/data/preprocess/crop_merge_image.py,sha256=f2NANY92s6IQ1hl1MAXfftFPIyIrj24O4TONjg7SXEc,4747
 doctra/third_party/docres/models/restormer_arch.py,sha256=BSwv_odCcp4HUZj3gv21e4IzFRBiyk8FjKAO8kF4YS8,12510
 doctra/ui/__init__.py,sha256=XzOOKeGSBnUREuDQiCIWds1asFSa2nypFQTJXwclROA,85
-doctra/ui/app.py,sha256=WpXUWHSs7wSYNjY4iBOZJHsKGQ88jDytvOFIjuhqAGE,44031
+doctra/ui/app.py,sha256=iFSAVZacL7iHB1SHhcUzperJGNQVWqUhvOYdlgjjt50,43623
 doctra/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 doctra/utils/bbox.py,sha256=R2-95p0KiWvet3TH27TQVvCar7WJg6z0u3L21iEDF-A,674
 doctra/utils/constants.py,sha256=ZWOvNDrvETbQ_pxHiX7vUW4J5Oj8_qnov0QacUOBizI,189
@@ -59,9 +59,9 @@ doctra/utils/ocr_utils.py,sha256=Doa1uYBg3kRgRYd2aPq9fICHgHfrM_efdhZfI7jl6OM,780
 doctra/utils/pdf_io.py,sha256=c8EY47Z1iqVtlLFHS_n0qGuXJ5ERFaMUd84ivXV0b9E,706
 doctra/utils/progress.py,sha256=IKQ_YErWSEd4hddYMUiCORy0_kW4TOYJM891HUEq2_E,11901
 doctra/utils/quiet.py,sha256=5XPS-1CtJ0sVk6qgSQctdhr_wR8mP1xoJLoUbmkXROA,387
-doctra/utils/structured_utils.py,sha256=znC2zr80rZMfIV58lipZ8M4zPq6IF070pdwLBve1qiE,1251
-doctra-0.4.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
-doctra-0.4.0.dist-info/METADATA,sha256=nlIT-QfxcwWi97jbQIastNHty8if3CyUv0LaDGiK7tk,28298
-doctra-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-doctra-0.4.0.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
-doctra-0.4.0.dist-info/RECORD,,
+doctra/utils/structured_utils.py,sha256=vU84dsD8wIlTyMsA9hitorGH-eroQiVuWEpBTQBUT24,1478
+doctra-0.4.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+doctra-0.4.1.dist-info/METADATA,sha256=wXduiq7VJS5vf-TXdxpYFCKGfPyGYr5jGK0mwH3OjUw,28298
+doctra-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+doctra-0.4.1.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
+doctra-0.4.1.dist-info/RECORD,,

{doctra-0.4.0.dist-info → doctra-0.4.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{doctra-0.4.0.dist-info → doctra-0.4.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{doctra-0.4.0.dist-info → doctra-0.4.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

doctra 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

doctra 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl