PyPI - doctra - Versions diffs - 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

doctra 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

doctra/__init__.py +21 -18
doctra/cli/main.py +3 -0
doctra/engines/layout/paddle_layout.py +11 -77
doctra/engines/vlm/provider.py +85 -85
doctra/engines/vlm/service.py +6 -13
doctra/exporters/html_writer.py +1235 -0
doctra/parsers/structured_pdf_parser.py +12 -7
doctra/parsers/table_chart_extractor.py +47 -22
doctra/ui/__init__.py +5 -0
doctra/ui/app.py +1012 -0
doctra/utils/progress.py +200 -49
doctra/utils/structured_utils.py +49 -49
doctra/version.py +1 -1
{doctra-0.2.0.dist-info → doctra-0.3.0.dist-info}/METADATA +38 -1
{doctra-0.2.0.dist-info → doctra-0.3.0.dist-info}/RECORD +18 -15
{doctra-0.2.0.dist-info → doctra-0.3.0.dist-info}/WHEEL +0 -0
{doctra-0.2.0.dist-info → doctra-0.3.0.dist-info}/licenses/LICENSE +0 -0
{doctra-0.2.0.dist-info → doctra-0.3.0.dist-info}/top_level.txt +0 -0

doctra/parsers/structured_pdf_parser.py CHANGED Viewed

@@ -20,6 +20,7 @@ from doctra.exporters.excel_writer import write_structured_excel
 from doctra.utils.structured_utils import to_structured_dict
 from doctra.exporters.markdown_table import render_markdown_table
 from doctra.exporters.markdown_writer import write_markdown
+from doctra.exporters.html_writer import write_html, write_structured_html
 from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
@@ -109,7 +110,7 @@ class StructuredPDFParser:
         """
         # Extract filename without extension and create output directory
         pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
-        out_dir = f"outputs/{pdf_filename}"
+        out_dir = f"outputs/{pdf_filename}/full_parse"
         os.makedirs(out_dir, exist_ok=True)
         ensure_output_dirs(out_dir, IMAGE_SUBDIRS)
@@ -182,7 +183,7 @@ class StructuredPDFParser:
                                                                   title=item.get("title"))
                                         )
                                         wrote_table = True
-                                except Exception:
+                                except Exception as e:
                                     pass
                                 if not wrote_table:
                                     md_lines.append(f"![Chart — page {page_num}]({rel})\n")
@@ -204,7 +205,7 @@ class StructuredPDFParser:
                                                                   title=item.get("title"))
                                         )
                                         wrote_table = True
-                                except Exception:
+                                except Exception as e:
                                     pass
                                 if not wrote_table:
                                     md_lines.append(f"![Table — page {page_num}]({rel})\n")
@@ -218,15 +219,19 @@ class StructuredPDFParser:
                             md_lines.append(self.box_separator if self.box_separator else "")
         md_path = write_markdown(md_lines, out_dir)
+        html_path = write_html(md_lines, out_dir)
         excel_path = None
+        html_structured_path = None
         if self.use_vlm and structured_items:
             excel_path = os.path.join(out_dir, "tables.xlsx")
             write_structured_excel(excel_path, structured_items)
+            html_structured_path = os.path.join(out_dir, "tables.html")
+            write_structured_html(html_structured_path, structured_items)
-        if excel_path:
-            print(f"Parsing completed successfully.\n- Markdown: {md_path}\n- Excel:    {excel_path}")
-        else:
-            print(f"Parsing completed successfully.\n- Markdown: {md_path}")
+        # Print completion message with output directory
+        print(f"✅ Parsing completed successfully!")
+        print(f"📁 Output directory: {out_dir}")
     def display_pages_with_boxes(self, pdf_path: str, num_pages: int = 3, cols: int = 2,
                                  page_width: int = 800, spacing: int = 40, save_path: str = None) -> None:

doctra/parsers/table_chart_extractor.py CHANGED Viewed

@@ -23,6 +23,8 @@ from doctra.exporters.excel_writer import write_structured_excel
 from doctra.utils.structured_utils import to_structured_dict
 from doctra.exporters.markdown_table import render_markdown_table
 from doctra.exporters.markdown_writer import write_markdown
+from doctra.exporters.html_writer import write_structured_html
+import json
 class ChartTablePDFParser:
@@ -105,9 +107,9 @@ class ChartTablePDFParser:
         :param output_base_dir: Base directory for output files (default: "outputs")
         :return: None
         """
-        # Create output directory structure: outputs/structured_doc/<filename>/
+        # Create output directory structure: outputs/<filename>/structured_parsing/
         pdf_name = Path(pdf_path).stem
-        out_dir = os.path.join(output_base_dir, pdf_name)
+        out_dir = os.path.join(output_base_dir, pdf_name, "structured_parsing")
         os.makedirs(out_dir, exist_ok=True)
         # Create subdirectories based on what we're extracting
@@ -142,6 +144,7 @@ class ChartTablePDFParser:
         if self.use_vlm:
             md_lines: List[str] = ["# Extracted Charts and Tables\n"]
             structured_items: List[Dict[str, Any]] = []
+            vlm_items: List[Dict[str, Any]] = []
         # Progress bar descriptions
         charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
@@ -197,6 +200,14 @@ class ChartTablePDFParser:
                                 structured_item = to_structured_dict(extracted_chart)
                                 if structured_item:
                                     structured_items.append(structured_item)
+                                    vlm_items.append({
+                                        "kind": "chart",
+                                        "page": page_num,
+                                        "image_rel_path": rel_path,
+                                        "title": structured_item.get("title"),
+                                        "headers": structured_item.get("headers"),
+                                        "rows": structured_item.get("rows"),
+                                    })
                                     md_lines.append(
                                         render_markdown_table(
                                             structured_item.get("headers"),
@@ -235,6 +246,14 @@ class ChartTablePDFParser:
                                 structured_item = to_structured_dict(extracted_table)
                                 if structured_item:
                                     structured_items.append(structured_item)
+                                    vlm_items.append({
+                                        "kind": "table",
+                                        "page": page_num,
+                                        "image_rel_path": rel_path,
+                                        "title": structured_item.get("title"),
+                                        "headers": structured_item.get("headers"),
+                                        "rows": structured_item.get("rows"),
+                                    })
                                     md_lines.append(
                                         render_markdown_table(
                                             structured_item.get("headers"),
@@ -266,8 +285,29 @@ class ChartTablePDFParser:
             # Write Excel file if we have structured data
             if structured_items:
-                excel_path = os.path.join(out_dir, "charts.xlsx")
+                # Determine Excel filename based on extraction target
+                if self.extract_charts and self.extract_tables:
+                    excel_filename = "parsed_tables_charts.xlsx"
+                elif self.extract_charts:
+                    excel_filename = "parsed_charts.xlsx"
+                elif self.extract_tables:
+                    excel_filename = "parsed_tables.xlsx"
+                else:
+                    excel_filename = "parsed_data.xlsx"  # fallback
+                excel_path = os.path.join(out_dir, excel_filename)
                 write_structured_excel(excel_path, structured_items)
+                # Also create HTML version
+                html_filename = excel_filename.replace('.xlsx', '.html')
+                html_path = os.path.join(out_dir, html_filename)
+                write_structured_html(html_path, structured_items)
+            # Write VLM items mapping for UI linkage
+            if 'vlm_items' in locals() and vlm_items:
+                with open(os.path.join(out_dir, "vlm_items.json"), 'w', encoding='utf-8') as jf:
+                    json.dump(vlm_items, jf, ensure_ascii=False, indent=2)
         # Print results
         extraction_types = []
@@ -275,22 +315,7 @@ class ChartTablePDFParser:
             extraction_types.append("charts")
         if self.extract_tables:
             extraction_types.append("tables")
-        print(f"{' and '.join(extraction_types).title()} extraction completed successfully.")
-        print(f"- Output directory: {out_dir}")
-        if charts_dir and self.extract_charts:
-            print(f"- Charts directory: {charts_dir}")
-            print(f"- Charts extracted: {chart_counter - 1}")
-        if tables_dir and self.extract_tables:
-            print(f"- Tables directory: {tables_dir}")
-            print(f"- Tables extracted: {table_counter - 1}")
-        if md_path:
-            print(f"- Markdown file: {md_path}")
-        if excel_path:
-            print(f"- Excel file: {excel_path}")
-        if not self.use_vlm:
-            print("- Note: VLM disabled - only cropped images saved")
+        # Print completion message with output directory
+        print(f"✅ Parsing completed successfully!")
+        print(f"📁 Output directory: {out_dir}")

doctra/ui/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .app import build_demo, launch_ui
+__all__ = ["build_demo", "launch_ui"]

doctra 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

doctra 0.2.0py3-none-any.whl → 0.3.0py3-none-any.whl