doctra 0.4.1__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {doctra-0.4.1/doctra.egg-info → doctra-0.4.2}/PKG-INFO +1 -1
  2. {doctra-0.4.1 → doctra-0.4.2}/doctra/engines/image_restoration/docres_engine.py +4 -4
  3. {doctra-0.4.1 → doctra-0.4.2}/doctra/exporters/html_writer.py +206 -1
  4. {doctra-0.4.1 → doctra-0.4.2}/doctra/parsers/enhanced_pdf_parser.py +107 -18
  5. {doctra-0.4.1 → doctra-0.4.2}/doctra/parsers/structured_pdf_parser.py +52 -15
  6. {doctra-0.4.1 → doctra-0.4.2}/doctra/parsers/table_chart_extractor.py +290 -290
  7. doctra-0.4.2/doctra/ui/app.py +64 -0
  8. doctra-0.4.2/doctra/ui/docres_ui.py +338 -0
  9. doctra-0.4.2/doctra/ui/docres_wrapper.py +120 -0
  10. doctra-0.4.2/doctra/ui/enhanced_parser_ui.py +483 -0
  11. doctra-0.4.2/doctra/ui/full_parse_ui.py +539 -0
  12. doctra-0.4.2/doctra/ui/tables_charts_ui.py +445 -0
  13. doctra-0.4.2/doctra/ui/ui_helpers.py +435 -0
  14. {doctra-0.4.1 → doctra-0.4.2}/doctra/utils/progress.py +7 -7
  15. {doctra-0.4.1 → doctra-0.4.2}/doctra/version.py +1 -1
  16. {doctra-0.4.1 → doctra-0.4.2/doctra.egg-info}/PKG-INFO +1 -1
  17. {doctra-0.4.1 → doctra-0.4.2}/doctra.egg-info/SOURCES.txt +6 -0
  18. doctra-0.4.1/doctra/ui/app.py +0 -979
  19. {doctra-0.4.1 → doctra-0.4.2}/LICENSE +0 -0
  20. {doctra-0.4.1 → doctra-0.4.2}/MANIFEST.in +0 -0
  21. {doctra-0.4.1 → doctra-0.4.2}/README.md +0 -0
  22. {doctra-0.4.1 → doctra-0.4.2}/doctra/__init__.py +0 -0
  23. {doctra-0.4.1 → doctra-0.4.2}/doctra/cli/__init__.py +0 -0
  24. {doctra-0.4.1 → doctra-0.4.2}/doctra/cli/main.py +0 -0
  25. {doctra-0.4.1 → doctra-0.4.2}/doctra/cli/utils.py +0 -0
  26. {doctra-0.4.1 → doctra-0.4.2}/doctra/engines/__init__.py +0 -0
  27. {doctra-0.4.1 → doctra-0.4.2}/doctra/engines/image_restoration/__init__.py +0 -0
  28. {doctra-0.4.1 → doctra-0.4.2}/doctra/engines/layout/__init__.py +0 -0
  29. {doctra-0.4.1 → doctra-0.4.2}/doctra/engines/layout/layout_models.py +0 -0
  30. {doctra-0.4.1 → doctra-0.4.2}/doctra/engines/layout/paddle_layout.py +0 -0
  31. {doctra-0.4.1 → doctra-0.4.2}/doctra/engines/ocr/__init__.py +0 -0
  32. {doctra-0.4.1 → doctra-0.4.2}/doctra/engines/ocr/api.py +0 -0
  33. {doctra-0.4.1 → doctra-0.4.2}/doctra/engines/ocr/path_resolver.py +0 -0
  34. {doctra-0.4.1 → doctra-0.4.2}/doctra/engines/ocr/pytesseract_engine.py +0 -0
  35. {doctra-0.4.1 → doctra-0.4.2}/doctra/engines/vlm/__init__.py +0 -0
  36. {doctra-0.4.1 → doctra-0.4.2}/doctra/engines/vlm/outlines_types.py +0 -0
  37. {doctra-0.4.1 → doctra-0.4.2}/doctra/engines/vlm/provider.py +0 -0
  38. {doctra-0.4.1 → doctra-0.4.2}/doctra/engines/vlm/service.py +0 -0
  39. {doctra-0.4.1 → doctra-0.4.2}/doctra/exporters/__init__.py +0 -0
  40. {doctra-0.4.1 → doctra-0.4.2}/doctra/exporters/excel_writer.py +0 -0
  41. {doctra-0.4.1 → doctra-0.4.2}/doctra/exporters/image_saver.py +0 -0
  42. {doctra-0.4.1 → doctra-0.4.2}/doctra/exporters/markdown_table.py +0 -0
  43. {doctra-0.4.1 → doctra-0.4.2}/doctra/exporters/markdown_writer.py +0 -0
  44. {doctra-0.4.1 → doctra-0.4.2}/doctra/parsers/__init__.py +0 -0
  45. {doctra-0.4.1 → doctra-0.4.2}/doctra/parsers/layout_order.py +0 -0
  46. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/MBD.py +0 -0
  47. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/MBD_utils.py +0 -0
  48. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/infer.py +0 -0
  49. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/model/deep_lab_model/aspp.py +0 -0
  50. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/__init__.py +0 -0
  51. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/drn.py +0 -0
  52. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/mobilenet.py +0 -0
  53. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/resnet.py +0 -0
  54. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/xception.py +0 -0
  55. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/model/deep_lab_model/decoder.py +0 -0
  56. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/model/deep_lab_model/deeplab.py +0 -0
  57. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/__init__.py +0 -0
  58. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/batchnorm.py +0 -0
  59. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/comm.py +0 -0
  60. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/replicate.py +0 -0
  61. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.py +0 -0
  62. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/data/preprocess/crop_merge_image.py +0 -0
  63. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/inference.py +0 -0
  64. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/models/restormer_arch.py +0 -0
  65. {doctra-0.4.1 → doctra-0.4.2}/doctra/third_party/docres/utils.py +0 -0
  66. {doctra-0.4.1 → doctra-0.4.2}/doctra/ui/__init__.py +0 -0
  67. {doctra-0.4.1 → doctra-0.4.2}/doctra/utils/__init__.py +0 -0
  68. {doctra-0.4.1 → doctra-0.4.2}/doctra/utils/bbox.py +0 -0
  69. {doctra-0.4.1 → doctra-0.4.2}/doctra/utils/constants.py +0 -0
  70. {doctra-0.4.1 → doctra-0.4.2}/doctra/utils/file_ops.py +0 -0
  71. {doctra-0.4.1 → doctra-0.4.2}/doctra/utils/io_utils.py +0 -0
  72. {doctra-0.4.1 → doctra-0.4.2}/doctra/utils/ocr_utils.py +0 -0
  73. {doctra-0.4.1 → doctra-0.4.2}/doctra/utils/pdf_io.py +0 -0
  74. {doctra-0.4.1 → doctra-0.4.2}/doctra/utils/quiet.py +0 -0
  75. {doctra-0.4.1 → doctra-0.4.2}/doctra/utils/structured_utils.py +0 -0
  76. {doctra-0.4.1 → doctra-0.4.2}/doctra.egg-info/dependency_links.txt +0 -0
  77. {doctra-0.4.1 → doctra-0.4.2}/doctra.egg-info/not-zip-safe +0 -0
  78. {doctra-0.4.1 → doctra-0.4.2}/doctra.egg-info/requires.txt +0 -0
  79. {doctra-0.4.1 → doctra-0.4.2}/doctra.egg-info/top_level.txt +0 -0
  80. {doctra-0.4.1 → doctra-0.4.2}/pyproject.toml +0 -0
  81. {doctra-0.4.1 → doctra-0.4.2}/requirements.txt +0 -0
  82. {doctra-0.4.1 → doctra-0.4.2}/setup.cfg +0 -0
  83. {doctra-0.4.1 → doctra-0.4.2}/setup.py +0 -0
  84. {doctra-0.4.1 → doctra-0.4.2}/tests/test_structured_pdf_parser.py +0 -0
  85. {doctra-0.4.1 → doctra-0.4.2}/tests/test_table_chart_extractor.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: doctra
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Parse, extract, and analyze documents with ease
5
5
  Home-page: https://github.com/AdemBoukhris457/Doctra
6
6
  Author: Adem Boukhris
@@ -87,12 +87,12 @@ def load_docres_weights_from_hf():
87
87
  if is_notebook:
88
88
  progress_bar = create_notebook_friendly_bar(
89
89
  total=2,
90
- desc="🔄 Downloading DocRes models from Hugging Face Hub"
90
+ desc="Downloading DocRes models from Hugging Face Hub"
91
91
  )
92
92
  else:
93
93
  progress_bar = create_beautiful_progress_bar(
94
94
  total=2,
95
- desc="🔄 Downloading DocRes models from Hugging Face Hub",
95
+ desc="Downloading DocRes models from Hugging Face Hub",
96
96
  leave=True
97
97
  )
98
98
 
@@ -505,12 +505,12 @@ class DocResEngine:
505
505
  if is_notebook:
506
506
  progress_bar = create_notebook_friendly_bar(
507
507
  total=len(pil_pages),
508
- desc="🔄 Processing pages"
508
+ desc="Processing pages"
509
509
  )
510
510
  else:
511
511
  progress_bar = create_beautiful_progress_bar(
512
512
  total=len(pil_pages),
513
- desc="🔄 Processing pages",
513
+ desc="Processing pages",
514
514
  leave=True
515
515
  )
516
516
 
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
  import os
3
3
  import re
4
4
  import base64
5
- from typing import List, Dict, Any
5
+ from typing import List, Dict, Any, Optional
6
6
  from markdown_it import MarkdownIt
7
7
 
8
8
 
@@ -64,6 +64,114 @@ def _process_image_paths(md_content: str, out_dir: str) -> str:
64
64
  return processed_content
65
65
 
66
66
 
67
+ def write_html_from_lines(html_lines: List[str], out_dir: str, filename: str = "result.html") -> str:
68
+ """
69
+ Convert HTML lines directly into a single HTML file and save it.
70
+
71
+ This function is used when VLM is enabled to ensure proper HTML table formatting
72
+ instead of markdown-to-HTML conversion.
73
+
74
+ :param html_lines: List of HTML strings to join into a single file
75
+ :param out_dir: Directory where the HTML file will be saved
76
+ :param filename: Name of the HTML file (default: "result.html")
77
+ :return: The absolute path of the written HTML file
78
+ """
79
+ os.makedirs(out_dir, exist_ok=True)
80
+
81
+ # Join HTML lines and clean up excessive blank lines
82
+ html_content = "\n".join(html_lines).strip() + "\n"
83
+ html_content = re.sub(r"\n{3,}", "\n\n", html_content)
84
+
85
+ # Process image paths to convert relative paths to absolute paths or base64
86
+ html_content = _process_image_paths(html_content, out_dir)
87
+
88
+ # Always apply table styling to ensure all tables are properly formatted
89
+ html_content = _add_table_styling(html_content)
90
+
91
+ # Create complete HTML document with modern styling
92
+ html_document = f"""<!DOCTYPE html>
93
+ <html lang="en">
94
+ <head>
95
+ <meta charset="UTF-8">
96
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
97
+ <title>Document Analysis Results</title>
98
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&display=swap" rel="stylesheet">
99
+ <style>
100
+ {_get_css_styles()}
101
+ </style>
102
+ </head>
103
+ <body>
104
+ <button class="theme-toggle" onclick="toggleTheme()" title="Toggle dark mode"></button>
105
+ <div class="container">
106
+ <header class="header">
107
+ <div class="header-content">
108
+ <div class="header-text">
109
+ <h1>Document Analysis Results</h1>
110
+ <p class="subtitle">Intelligent Document Processing & Analysis</p>
111
+ </div>
112
+ <div class="header-badge">
113
+ Generated by Doctra
114
+ </div>
115
+ </div>
116
+ </header>
117
+ <main class="content">
118
+ {html_content}
119
+ </main>
120
+ <footer class="footer">
121
+ <div class="footer-content">
122
+ <div class="footer-brand">Doctra</div>
123
+ <div class="footer-info">
124
+ <span>Intelligent Document Processing</span>
125
+ <a href="https://github.com/AdemBoukhris457/Doctra" target="_blank">GitHub</a>
126
+ </div>
127
+ </div>
128
+ </footer>
129
+ </div>
130
+ <script>
131
+ // Theme toggle functionality
132
+ function toggleTheme() {{
133
+ const body = document.body;
134
+ const currentTheme = body.getAttribute('data-theme');
135
+ const newTheme = currentTheme === 'dark' ? 'light' : 'dark';
136
+
137
+ body.setAttribute('data-theme', newTheme);
138
+ localStorage.setItem('doctra-theme', newTheme);
139
+
140
+ // Add smooth transition
141
+ body.style.transition = 'all 0.3s ease';
142
+ setTimeout(() => {{
143
+ body.style.transition = '';
144
+ }}, 300);
145
+ }}
146
+
147
+ // Load saved theme on page load
148
+ document.addEventListener('DOMContentLoaded', function() {{
149
+ const savedTheme = localStorage.getItem('doctra-theme') || 'light';
150
+ document.body.setAttribute('data-theme', savedTheme);
151
+ }});
152
+
153
+ // Add smooth scroll behavior
154
+ document.documentElement.style.scrollBehavior = 'smooth';
155
+
156
+ // Add loading animation
157
+ window.addEventListener('load', function() {{
158
+ document.body.style.opacity = '0';
159
+ document.body.style.transition = 'opacity 0.5s ease';
160
+ setTimeout(() => {{
161
+ document.body.style.opacity = '1';
162
+ }}, 100);
163
+ }});
164
+ </script>
165
+ </body>
166
+ </html>"""
167
+
168
+ html_path = os.path.join(out_dir, filename)
169
+ with open(html_path, "w", encoding="utf-8") as f:
170
+ f.write(html_document)
171
+
172
+ return os.path.abspath(html_path)
173
+
174
+
67
175
  def write_html(md_lines: List[str], out_dir: str, filename: str = "result.html") -> str:
68
176
  """
69
177
  Convert collected Markdown lines into a single HTML file and save it.
@@ -414,6 +522,54 @@ def _create_html_table(headers: List[str], rows: List[List]) -> str:
414
522
  """
415
523
 
416
524
 
525
+ def render_html_table(
526
+ headers: List[str] | None,
527
+ rows: List[List[str]] | None,
528
+ title: Optional[str] = None,
529
+ ) -> str:
530
+ """
531
+ Render an HTML table from headers, rows, and optional title.
532
+
533
+ Creates a properly formatted HTML table with headers, data rows,
534
+ and optional title. This is used for VLM-extracted tables to ensure
535
+ they display as proper HTML tables instead of markdown.
536
+
537
+ :param headers: List of column headers (optional, will be auto-generated if None)
538
+ :param rows: List of data rows, where each row is a list of cell values
539
+ :param title: Optional title to display above the table
540
+ :return: Formatted HTML table string
541
+ """
542
+ headers = headers or []
543
+ rows = rows or []
544
+
545
+ if not headers and not rows:
546
+ return "<p class='no-data'>No data available</p>"
547
+
548
+ # Determine width
549
+ width = len(headers) if headers else (max((len(r) for r in rows), default=1))
550
+
551
+ # Generate headers if not provided
552
+ if not headers:
553
+ headers = [f"Column {i+1}" for i in range(width)]
554
+
555
+ # Normalize data to handle mismatched dimensions
556
+ normalized_headers, normalized_rows = _normalize_data(headers, rows)
557
+
558
+ # Create HTML table
559
+ table_html = _create_html_table(normalized_headers, normalized_rows)
560
+
561
+ # Add title if provided
562
+ if title:
563
+ return f"""
564
+ <div class="table-section">
565
+ <h3 class="table-title">{_escape_html(title)}</h3>
566
+ {table_html}
567
+ </div>
568
+ """
569
+ else:
570
+ return table_html
571
+
572
+
417
573
  def _add_table_styling(html_content: str) -> str:
418
574
  """
419
575
  Add table styling wrapper to HTML content.
@@ -884,6 +1040,55 @@ def _get_css_styles() -> str:
884
1040
  content: '☀️';
885
1041
  }
886
1042
 
1043
+ /* Dark mode table styles */
1044
+ [data-theme="dark"] .markdown-table,
1045
+ [data-theme="dark"] table {
1046
+ background: var(--card-bg);
1047
+ border-color: var(--border-color);
1048
+ }
1049
+
1050
+ [data-theme="dark"] .markdown-table th,
1051
+ [data-theme="dark"] table th {
1052
+ background: #374151;
1053
+ color: #f9fafb;
1054
+ border-bottom-color: var(--accent-color);
1055
+ }
1056
+
1057
+ [data-theme="dark"] .markdown-table td,
1058
+ [data-theme="dark"] table td {
1059
+ color: #f9fafb;
1060
+ border-bottom-color: var(--border-color);
1061
+ }
1062
+
1063
+ [data-theme="dark"] .markdown-table tr:nth-child(even),
1064
+ [data-theme="dark"] table tr:nth-child(even) {
1065
+ background: #374151;
1066
+ }
1067
+
1068
+ [data-theme="dark"] .markdown-table tr:hover,
1069
+ [data-theme="dark"] table tr:hover {
1070
+ background: #4b5563;
1071
+ }
1072
+
1073
+ /* Dark mode footer styles to match header */
1074
+ [data-theme="dark"] .footer {
1075
+ background: var(--primary-color);
1076
+ color: white;
1077
+ border-top-color: var(--accent-color);
1078
+ }
1079
+
1080
+ [data-theme="dark"] .footer-brand {
1081
+ color: white;
1082
+ }
1083
+
1084
+ [data-theme="dark"] .footer a {
1085
+ color: rgba(255, 255, 255, 0.8);
1086
+ }
1087
+
1088
+ [data-theme="dark"] .footer a:hover {
1089
+ color: white;
1090
+ }
1091
+
887
1092
  /* Professional scrollbar */
888
1093
  ::-webkit-scrollbar {
889
1094
  width: 8px;
@@ -24,7 +24,7 @@ from doctra.parsers.layout_order import reading_order_key
24
24
  from doctra.utils.ocr_utils import ocr_box_text
25
25
  from doctra.exporters.image_saver import save_box_image
26
26
  from doctra.exporters.markdown_writer import write_markdown
27
- from doctra.exporters.html_writer import write_html, write_structured_html
27
+ from doctra.exporters.html_writer import write_html, write_structured_html, render_html_table, write_html_from_lines
28
28
  from doctra.exporters.excel_writer import write_structured_excel
29
29
  from doctra.utils.structured_utils import to_structured_dict
30
30
  from doctra.exporters.markdown_table import render_markdown_table
@@ -141,6 +141,13 @@ class EnhancedPDFParser(StructuredPDFParser):
141
141
  if self.use_image_restoration and self.docres_engine:
142
142
  print(f"🔄 Processing PDF with image restoration: {os.path.basename(pdf_path)}")
143
143
  enhanced_pages = self._process_pages_with_restoration(pdf_path, out_dir)
144
+
145
+ # Create enhanced PDF file using the already processed enhanced pages
146
+ enhanced_pdf_path = os.path.join(out_dir, f"{pdf_filename}_enhanced.pdf")
147
+ try:
148
+ self._create_enhanced_pdf_from_pages(enhanced_pages, enhanced_pdf_path)
149
+ except Exception as e:
150
+ print(f"⚠️ Failed to create enhanced PDF: {e}")
144
151
  else:
145
152
  print(f"🔄 Processing PDF without image restoration: {os.path.basename(pdf_path)}")
146
153
  enhanced_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
@@ -177,12 +184,12 @@ class EnhancedPDFParser(StructuredPDFParser):
177
184
  if is_notebook:
178
185
  progress_bar = create_notebook_friendly_bar(
179
186
  total=len(original_pages),
180
- desc=f"🔄 DocRes {self.restoration_task}"
187
+ desc=f"DocRes {self.restoration_task}"
181
188
  )
182
189
  else:
183
190
  progress_bar = create_beautiful_progress_bar(
184
191
  total=len(original_pages),
185
- desc=f"🔄 DocRes {self.restoration_task}",
192
+ desc=f"DocRes {self.restoration_task}",
186
193
  leave=True
187
194
  )
188
195
 
@@ -224,7 +231,6 @@ class EnhancedPDFParser(StructuredPDFParser):
224
231
  if hasattr(progress_bar, 'close'):
225
232
  progress_bar.close()
226
233
 
227
- print(f"✅ Image restoration completed. Enhanced pages saved to: {enhanced_dir}")
228
234
  return enhanced_pages
229
235
 
230
236
  def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename, pdf_path):
@@ -238,7 +244,9 @@ class EnhancedPDFParser(StructuredPDFParser):
238
244
  table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
239
245
 
240
246
  md_lines: List[str] = ["# Enhanced Document Content\n"]
247
+ html_lines: List[str] = ["<h1>Enhanced Document Content</h1>"] # For direct HTML generation
241
248
  structured_items: List[Dict[str, Any]] = []
249
+ page_content: Dict[int, List[str]] = {} # Store content by page
242
250
 
243
251
  charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
244
252
  tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
@@ -261,10 +269,15 @@ class EnhancedPDFParser(StructuredPDFParser):
261
269
  figures_bar = stack.enter_context(
262
270
  create_beautiful_progress_bar(total=fig_count, desc=figures_desc, leave=True)) if fig_count else None
263
271
 
272
+ # Initialize page content for all pages first
273
+ for page_num in range(1, len(pil_pages) + 1):
274
+ page_content[page_num] = [f"# Page {page_num} Content\n"]
275
+
264
276
  for p in pages:
265
277
  page_num = p.page_index
266
278
  page_img: Image.Image = pil_pages[page_num - 1]
267
279
  md_lines.append(f"\n## Page {page_num}\n")
280
+ html_lines.append(f"<h2>Page {page_num}</h2>")
268
281
 
269
282
  for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
270
283
  if box.label in EXCLUDE_LABELS:
@@ -273,7 +286,11 @@ class EnhancedPDFParser(StructuredPDFParser):
273
286
  rel = os.path.relpath(abs_img_path, out_dir)
274
287
 
275
288
  if box.label == "figure":
276
- md_lines.append(f"![Figure — page {page_num}]({rel})\n")
289
+ figure_md = f"![Figure — page {page_num}]({rel})\n"
290
+ figure_html = f'<img src="{rel}" alt="Figure — page {page_num}" />'
291
+ md_lines.append(figure_md)
292
+ html_lines.append(figure_html)
293
+ page_content[page_num].append(figure_md)
277
294
  if figures_bar: figures_bar.update(1)
278
295
 
279
296
  elif box.label == "chart":
@@ -287,17 +304,31 @@ class EnhancedPDFParser(StructuredPDFParser):
287
304
  item["page"] = page_num
288
305
  item["type"] = "Chart"
289
306
  structured_items.append(item)
290
- md_lines.append(
291
- render_markdown_table(item.get("headers"), item.get("rows"),
292
- title=item.get("title"))
293
- )
307
+
308
+ # Generate both markdown and HTML tables
309
+ table_md = render_markdown_table(item.get("headers"), item.get("rows"),
310
+ title=item.get("title"))
311
+ table_html = render_html_table(item.get("headers"), item.get("rows"),
312
+ title=item.get("title"))
313
+
314
+ md_lines.append(table_md)
315
+ html_lines.append(table_html)
316
+ page_content[page_num].append(table_md)
294
317
  wrote_table = True
295
318
  except Exception as e:
296
319
  pass
297
320
  if not wrote_table:
298
- md_lines.append(f"![Chart — page {page_num}]({rel})\n")
321
+ chart_md = f"![Chart — page {page_num}]({rel})\n"
322
+ chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
323
+ md_lines.append(chart_md)
324
+ html_lines.append(chart_html)
325
+ page_content[page_num].append(chart_md)
299
326
  else:
300
- md_lines.append(f"![Chart — page {page_num}]({rel})\n")
327
+ chart_md = f"![Chart — page {page_num}]({rel})\n"
328
+ chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
329
+ md_lines.append(chart_md)
330
+ html_lines.append(chart_html)
331
+ page_content[page_num].append(chart_md)
301
332
  if charts_bar: charts_bar.update(1)
302
333
 
303
334
  elif box.label == "table":
@@ -311,26 +342,60 @@ class EnhancedPDFParser(StructuredPDFParser):
311
342
  item["page"] = page_num
312
343
  item["type"] = "Table"
313
344
  structured_items.append(item)
314
- md_lines.append(
315
- render_markdown_table(item.get("headers"), item.get("rows"),
316
- title=item.get("title"))
317
- )
345
+
346
+ # Generate both markdown and HTML tables
347
+ table_md = render_markdown_table(item.get("headers"), item.get("rows"),
348
+ title=item.get("title"))
349
+ table_html = render_html_table(item.get("headers"), item.get("rows"),
350
+ title=item.get("title"))
351
+
352
+ md_lines.append(table_md)
353
+ html_lines.append(table_html)
354
+ page_content[page_num].append(table_md)
318
355
  wrote_table = True
319
356
  except Exception as e:
320
357
  pass
321
358
  if not wrote_table:
322
- md_lines.append(f"![Table — page {page_num}]({rel})\n")
359
+ table_md = f"![Table — page {page_num}]({rel})\n"
360
+ table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
361
+ md_lines.append(table_md)
362
+ html_lines.append(table_html)
363
+ page_content[page_num].append(table_md)
323
364
  else:
324
- md_lines.append(f"![Table — page {page_num}]({rel})\n")
365
+ table_md = f"![Table — page {page_num}]({rel})\n"
366
+ table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
367
+ md_lines.append(table_md)
368
+ html_lines.append(table_html)
369
+ page_content[page_num].append(table_md)
325
370
  if tables_bar: tables_bar.update(1)
326
371
  else:
327
372
  text = ocr_box_text(self.ocr_engine, page_img, box)
328
373
  if text:
329
374
  md_lines.append(text)
330
375
  md_lines.append(self.box_separator if self.box_separator else "")
376
+ # Convert text to HTML (basic conversion)
377
+ html_text = text.replace('\n', '<br>')
378
+ html_lines.append(f"<p>{html_text}</p>")
379
+ if self.box_separator:
380
+ html_lines.append("<br>")
381
+ page_content[page_num].append(text)
382
+ page_content[page_num].append(self.box_separator if self.box_separator else "")
331
383
 
332
384
  md_path = write_markdown(md_lines, out_dir)
333
- html_path = write_html(md_lines, out_dir)
385
+
386
+ # Use HTML lines if VLM is enabled for better table formatting
387
+ if self.use_vlm and html_lines:
388
+ html_path = write_html_from_lines(html_lines, out_dir)
389
+ else:
390
+ html_path = write_html(md_lines, out_dir)
391
+
392
+ # Create pages folder and save individual page markdown files
393
+ pages_dir = os.path.join(out_dir, "pages")
394
+ os.makedirs(pages_dir, exist_ok=True)
395
+
396
+ for page_num, content_lines in page_content.items():
397
+ page_md_path = os.path.join(pages_dir, f"page_{page_num:03d}.md")
398
+ write_markdown(content_lines, os.path.dirname(page_md_path), os.path.basename(page_md_path))
334
399
 
335
400
  excel_path = None
336
401
  html_structured_path = None
@@ -343,6 +408,30 @@ class EnhancedPDFParser(StructuredPDFParser):
343
408
  print(f"✅ Enhanced parsing completed successfully!")
344
409
  print(f"📁 Output directory: {out_dir}")
345
410
 
411
+ def _create_enhanced_pdf_from_pages(self, enhanced_pages: List[Image.Image], output_path: str) -> None:
412
+ """
413
+ Create an enhanced PDF from already processed enhanced pages.
414
+
415
+ :param enhanced_pages: List of enhanced PIL images
416
+ :param output_path: Path for the enhanced PDF
417
+ """
418
+ if not enhanced_pages:
419
+ raise ValueError("No enhanced pages provided")
420
+
421
+ try:
422
+ # Create enhanced PDF from the processed pages
423
+ enhanced_pages[0].save(
424
+ output_path,
425
+ "PDF",
426
+ resolution=100.0,
427
+ save_all=True,
428
+ append_images=enhanced_pages[1:] if len(enhanced_pages) > 1 else []
429
+ )
430
+ print(f"✅ Enhanced PDF saved from processed pages: {output_path}")
431
+ except Exception as e:
432
+ print(f"❌ Error creating enhanced PDF from pages: {e}")
433
+ raise
434
+
346
435
  def restore_pdf_only(self, pdf_path: str, output_path: str = None, task: str = None) -> str:
347
436
  """
348
437
  Apply DocRes restoration to a PDF without parsing.
@@ -20,7 +20,7 @@ from doctra.exporters.excel_writer import write_structured_excel
20
20
  from doctra.utils.structured_utils import to_structured_dict
21
21
  from doctra.exporters.markdown_table import render_markdown_table
22
22
  from doctra.exporters.markdown_writer import write_markdown
23
- from doctra.exporters.html_writer import write_html, write_structured_html
23
+ from doctra.exporters.html_writer import write_html, write_structured_html, render_html_table, write_html_from_lines
24
24
  from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
25
25
 
26
26
 
@@ -117,6 +117,7 @@ class StructuredPDFParser:
117
117
  table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
118
118
 
119
119
  md_lines: List[str] = ["# Extracted Content\n"]
120
+ html_lines: List[str] = ["<h1>Extracted Content</h1>"] # For direct HTML generation
120
121
  structured_items: List[Dict[str, Any]] = []
121
122
 
122
123
  charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
@@ -145,6 +146,7 @@ class StructuredPDFParser:
145
146
  page_num = p.page_index
146
147
  page_img: Image.Image = pil_pages[page_num - 1]
147
148
  md_lines.append(f"\n## Page {page_num}\n")
149
+ html_lines.append(f"<h2>Page {page_num}</h2>")
148
150
 
149
151
  for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
150
152
  if box.label in EXCLUDE_LABELS:
@@ -153,7 +155,10 @@ class StructuredPDFParser:
153
155
  rel = os.path.relpath(abs_img_path, out_dir)
154
156
 
155
157
  if box.label == "figure":
156
- md_lines.append(f"![Figure — page {page_num}]({rel})\n")
158
+ figure_md = f"![Figure — page {page_num}]({rel})\n"
159
+ figure_html = f'<img src="{rel}" alt="Figure — page {page_num}" />'
160
+ md_lines.append(figure_md)
161
+ html_lines.append(figure_html)
157
162
  if figures_bar: figures_bar.update(1)
158
163
 
159
164
  elif box.label == "chart":
@@ -167,17 +172,28 @@ class StructuredPDFParser:
167
172
  item["page"] = page_num
168
173
  item["type"] = "Chart"
169
174
  structured_items.append(item)
170
- md_lines.append(
171
- render_markdown_table(item.get("headers"), item.get("rows"),
172
- title=item.get("title"))
173
- )
175
+
176
+ # Generate both markdown and HTML tables
177
+ table_md = render_markdown_table(item.get("headers"), item.get("rows"),
178
+ title=item.get("title"))
179
+ table_html = render_html_table(item.get("headers"), item.get("rows"),
180
+ title=item.get("title"))
181
+
182
+ md_lines.append(table_md)
183
+ html_lines.append(table_html)
174
184
  wrote_table = True
175
185
  except Exception as e:
176
186
  pass
177
187
  if not wrote_table:
178
- md_lines.append(f"![Chart — page {page_num}]({rel})\n")
188
+ chart_md = f"![Chart — page {page_num}]({rel})\n"
189
+ chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
190
+ md_lines.append(chart_md)
191
+ html_lines.append(chart_html)
179
192
  else:
180
- md_lines.append(f"![Chart — page {page_num}]({rel})\n")
193
+ chart_md = f"![Chart — page {page_num}]({rel})\n"
194
+ chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
195
+ md_lines.append(chart_md)
196
+ html_lines.append(chart_html)
181
197
  if charts_bar: charts_bar.update(1)
182
198
 
183
199
  elif box.label == "table":
@@ -191,26 +207,47 @@ class StructuredPDFParser:
191
207
  item["page"] = page_num
192
208
  item["type"] = "Table"
193
209
  structured_items.append(item)
194
- md_lines.append(
195
- render_markdown_table(item.get("headers"), item.get("rows"),
196
- title=item.get("title"))
197
- )
210
+
211
+ # Generate both markdown and HTML tables
212
+ table_md = render_markdown_table(item.get("headers"), item.get("rows"),
213
+ title=item.get("title"))
214
+ table_html = render_html_table(item.get("headers"), item.get("rows"),
215
+ title=item.get("title"))
216
+
217
+ md_lines.append(table_md)
218
+ html_lines.append(table_html)
198
219
  wrote_table = True
199
220
  except Exception as e:
200
221
  pass
201
222
  if not wrote_table:
202
- md_lines.append(f"![Table — page {page_num}]({rel})\n")
223
+ table_md = f"![Table — page {page_num}]({rel})\n"
224
+ table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
225
+ md_lines.append(table_md)
226
+ html_lines.append(table_html)
203
227
  else:
204
- md_lines.append(f"![Table — page {page_num}]({rel})\n")
228
+ table_md = f"![Table — page {page_num}]({rel})\n"
229
+ table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
230
+ md_lines.append(table_md)
231
+ html_lines.append(table_html)
205
232
  if tables_bar: tables_bar.update(1)
206
233
  else:
207
234
  text = ocr_box_text(self.ocr_engine, page_img, box)
208
235
  if text:
209
236
  md_lines.append(text)
210
237
  md_lines.append(self.box_separator if self.box_separator else "")
238
+ # Convert text to HTML (basic conversion)
239
+ html_text = text.replace('\n', '<br>')
240
+ html_lines.append(f"<p>{html_text}</p>")
241
+ if self.box_separator:
242
+ html_lines.append("<br>")
211
243
 
212
244
  md_path = write_markdown(md_lines, out_dir)
213
- html_path = write_html(md_lines, out_dir)
245
+
246
+ # Use HTML lines if VLM is enabled for better table formatting
247
+ if self.use_vlm and html_lines:
248
+ html_path = write_html_from_lines(html_lines, out_dir)
249
+ else:
250
+ html_path = write_html(md_lines, out_dir)
214
251
 
215
252
  excel_path = None
216
253
  html_structured_path = None