doctra 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/engines/image_restoration/docres_engine.py +4 -4
- doctra/exporters/html_writer.py +206 -1
- doctra/parsers/enhanced_pdf_parser.py +107 -18
- doctra/parsers/structured_pdf_parser.py +52 -15
- doctra/parsers/table_chart_extractor.py +290 -290
- doctra/ui/app.py +39 -954
- doctra/ui/docres_ui.py +338 -0
- doctra/ui/docres_wrapper.py +120 -0
- doctra/ui/enhanced_parser_ui.py +483 -0
- doctra/ui/full_parse_ui.py +539 -0
- doctra/ui/tables_charts_ui.py +445 -0
- doctra/ui/ui_helpers.py +435 -0
- doctra/utils/progress.py +7 -7
- doctra/version.py +1 -1
- {doctra-0.4.1.dist-info → doctra-0.4.3.dist-info}/METADATA +331 -74
- {doctra-0.4.1.dist-info → doctra-0.4.3.dist-info}/RECORD +20 -13
- doctra-0.4.3.dist-info/entry_points.txt +2 -0
- {doctra-0.4.1.dist-info → doctra-0.4.3.dist-info}/WHEEL +0 -0
- {doctra-0.4.1.dist-info → doctra-0.4.3.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.4.1.dist-info → doctra-0.4.3.dist-info}/top_level.txt +0 -0
@@ -87,12 +87,12 @@ def load_docres_weights_from_hf():
|
|
87
87
|
if is_notebook:
|
88
88
|
progress_bar = create_notebook_friendly_bar(
|
89
89
|
total=2,
|
90
|
-
desc="
|
90
|
+
desc="Downloading DocRes models from Hugging Face Hub"
|
91
91
|
)
|
92
92
|
else:
|
93
93
|
progress_bar = create_beautiful_progress_bar(
|
94
94
|
total=2,
|
95
|
-
desc="
|
95
|
+
desc="Downloading DocRes models from Hugging Face Hub",
|
96
96
|
leave=True
|
97
97
|
)
|
98
98
|
|
@@ -505,12 +505,12 @@ class DocResEngine:
|
|
505
505
|
if is_notebook:
|
506
506
|
progress_bar = create_notebook_friendly_bar(
|
507
507
|
total=len(pil_pages),
|
508
|
-
desc="
|
508
|
+
desc="Processing pages"
|
509
509
|
)
|
510
510
|
else:
|
511
511
|
progress_bar = create_beautiful_progress_bar(
|
512
512
|
total=len(pil_pages),
|
513
|
-
desc="
|
513
|
+
desc="Processing pages",
|
514
514
|
leave=True
|
515
515
|
)
|
516
516
|
|
doctra/exporters/html_writer.py
CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
2
2
|
import os
|
3
3
|
import re
|
4
4
|
import base64
|
5
|
-
from typing import List, Dict, Any
|
5
|
+
from typing import List, Dict, Any, Optional
|
6
6
|
from markdown_it import MarkdownIt
|
7
7
|
|
8
8
|
|
@@ -64,6 +64,114 @@ def _process_image_paths(md_content: str, out_dir: str) -> str:
|
|
64
64
|
return processed_content
|
65
65
|
|
66
66
|
|
67
|
+
def write_html_from_lines(html_lines: List[str], out_dir: str, filename: str = "result.html") -> str:
|
68
|
+
"""
|
69
|
+
Convert HTML lines directly into a single HTML file and save it.
|
70
|
+
|
71
|
+
This function is used when VLM is enabled to ensure proper HTML table formatting
|
72
|
+
instead of markdown-to-HTML conversion.
|
73
|
+
|
74
|
+
:param html_lines: List of HTML strings to join into a single file
|
75
|
+
:param out_dir: Directory where the HTML file will be saved
|
76
|
+
:param filename: Name of the HTML file (default: "result.html")
|
77
|
+
:return: The absolute path of the written HTML file
|
78
|
+
"""
|
79
|
+
os.makedirs(out_dir, exist_ok=True)
|
80
|
+
|
81
|
+
# Join HTML lines and clean up excessive blank lines
|
82
|
+
html_content = "\n".join(html_lines).strip() + "\n"
|
83
|
+
html_content = re.sub(r"\n{3,}", "\n\n", html_content)
|
84
|
+
|
85
|
+
# Process image paths to convert relative paths to absolute paths or base64
|
86
|
+
html_content = _process_image_paths(html_content, out_dir)
|
87
|
+
|
88
|
+
# Always apply table styling to ensure all tables are properly formatted
|
89
|
+
html_content = _add_table_styling(html_content)
|
90
|
+
|
91
|
+
# Create complete HTML document with modern styling
|
92
|
+
html_document = f"""<!DOCTYPE html>
|
93
|
+
<html lang="en">
|
94
|
+
<head>
|
95
|
+
<meta charset="UTF-8">
|
96
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
97
|
+
<title>Document Analysis Results</title>
|
98
|
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&display=swap" rel="stylesheet">
|
99
|
+
<style>
|
100
|
+
{_get_css_styles()}
|
101
|
+
</style>
|
102
|
+
</head>
|
103
|
+
<body>
|
104
|
+
<button class="theme-toggle" onclick="toggleTheme()" title="Toggle dark mode"></button>
|
105
|
+
<div class="container">
|
106
|
+
<header class="header">
|
107
|
+
<div class="header-content">
|
108
|
+
<div class="header-text">
|
109
|
+
<h1>Document Analysis Results</h1>
|
110
|
+
<p class="subtitle">Intelligent Document Processing & Analysis</p>
|
111
|
+
</div>
|
112
|
+
<div class="header-badge">
|
113
|
+
Generated by Doctra
|
114
|
+
</div>
|
115
|
+
</div>
|
116
|
+
</header>
|
117
|
+
<main class="content">
|
118
|
+
{html_content}
|
119
|
+
</main>
|
120
|
+
<footer class="footer">
|
121
|
+
<div class="footer-content">
|
122
|
+
<div class="footer-brand">Doctra</div>
|
123
|
+
<div class="footer-info">
|
124
|
+
<span>Intelligent Document Processing</span>
|
125
|
+
<a href="https://github.com/AdemBoukhris457/Doctra" target="_blank">GitHub</a>
|
126
|
+
</div>
|
127
|
+
</div>
|
128
|
+
</footer>
|
129
|
+
</div>
|
130
|
+
<script>
|
131
|
+
// Theme toggle functionality
|
132
|
+
function toggleTheme() {{
|
133
|
+
const body = document.body;
|
134
|
+
const currentTheme = body.getAttribute('data-theme');
|
135
|
+
const newTheme = currentTheme === 'dark' ? 'light' : 'dark';
|
136
|
+
|
137
|
+
body.setAttribute('data-theme', newTheme);
|
138
|
+
localStorage.setItem('doctra-theme', newTheme);
|
139
|
+
|
140
|
+
// Add smooth transition
|
141
|
+
body.style.transition = 'all 0.3s ease';
|
142
|
+
setTimeout(() => {{
|
143
|
+
body.style.transition = '';
|
144
|
+
}}, 300);
|
145
|
+
}}
|
146
|
+
|
147
|
+
// Load saved theme on page load
|
148
|
+
document.addEventListener('DOMContentLoaded', function() {{
|
149
|
+
const savedTheme = localStorage.getItem('doctra-theme') || 'light';
|
150
|
+
document.body.setAttribute('data-theme', savedTheme);
|
151
|
+
}});
|
152
|
+
|
153
|
+
// Add smooth scroll behavior
|
154
|
+
document.documentElement.style.scrollBehavior = 'smooth';
|
155
|
+
|
156
|
+
// Add loading animation
|
157
|
+
window.addEventListener('load', function() {{
|
158
|
+
document.body.style.opacity = '0';
|
159
|
+
document.body.style.transition = 'opacity 0.5s ease';
|
160
|
+
setTimeout(() => {{
|
161
|
+
document.body.style.opacity = '1';
|
162
|
+
}}, 100);
|
163
|
+
}});
|
164
|
+
</script>
|
165
|
+
</body>
|
166
|
+
</html>"""
|
167
|
+
|
168
|
+
html_path = os.path.join(out_dir, filename)
|
169
|
+
with open(html_path, "w", encoding="utf-8") as f:
|
170
|
+
f.write(html_document)
|
171
|
+
|
172
|
+
return os.path.abspath(html_path)
|
173
|
+
|
174
|
+
|
67
175
|
def write_html(md_lines: List[str], out_dir: str, filename: str = "result.html") -> str:
|
68
176
|
"""
|
69
177
|
Convert collected Markdown lines into a single HTML file and save it.
|
@@ -414,6 +522,54 @@ def _create_html_table(headers: List[str], rows: List[List]) -> str:
|
|
414
522
|
"""
|
415
523
|
|
416
524
|
|
525
|
+
def render_html_table(
|
526
|
+
headers: List[str] | None,
|
527
|
+
rows: List[List[str]] | None,
|
528
|
+
title: Optional[str] = None,
|
529
|
+
) -> str:
|
530
|
+
"""
|
531
|
+
Render an HTML table from headers, rows, and optional title.
|
532
|
+
|
533
|
+
Creates a properly formatted HTML table with headers, data rows,
|
534
|
+
and optional title. This is used for VLM-extracted tables to ensure
|
535
|
+
they display as proper HTML tables instead of markdown.
|
536
|
+
|
537
|
+
:param headers: List of column headers (optional, will be auto-generated if None)
|
538
|
+
:param rows: List of data rows, where each row is a list of cell values
|
539
|
+
:param title: Optional title to display above the table
|
540
|
+
:return: Formatted HTML table string
|
541
|
+
"""
|
542
|
+
headers = headers or []
|
543
|
+
rows = rows or []
|
544
|
+
|
545
|
+
if not headers and not rows:
|
546
|
+
return "<p class='no-data'>No data available</p>"
|
547
|
+
|
548
|
+
# Determine width
|
549
|
+
width = len(headers) if headers else (max((len(r) for r in rows), default=1))
|
550
|
+
|
551
|
+
# Generate headers if not provided
|
552
|
+
if not headers:
|
553
|
+
headers = [f"Column {i+1}" for i in range(width)]
|
554
|
+
|
555
|
+
# Normalize data to handle mismatched dimensions
|
556
|
+
normalized_headers, normalized_rows = _normalize_data(headers, rows)
|
557
|
+
|
558
|
+
# Create HTML table
|
559
|
+
table_html = _create_html_table(normalized_headers, normalized_rows)
|
560
|
+
|
561
|
+
# Add title if provided
|
562
|
+
if title:
|
563
|
+
return f"""
|
564
|
+
<div class="table-section">
|
565
|
+
<h3 class="table-title">{_escape_html(title)}</h3>
|
566
|
+
{table_html}
|
567
|
+
</div>
|
568
|
+
"""
|
569
|
+
else:
|
570
|
+
return table_html
|
571
|
+
|
572
|
+
|
417
573
|
def _add_table_styling(html_content: str) -> str:
|
418
574
|
"""
|
419
575
|
Add table styling wrapper to HTML content.
|
@@ -884,6 +1040,55 @@ def _get_css_styles() -> str:
|
|
884
1040
|
content: '☀️';
|
885
1041
|
}
|
886
1042
|
|
1043
|
+
/* Dark mode table styles */
|
1044
|
+
[data-theme="dark"] .markdown-table,
|
1045
|
+
[data-theme="dark"] table {
|
1046
|
+
background: var(--card-bg);
|
1047
|
+
border-color: var(--border-color);
|
1048
|
+
}
|
1049
|
+
|
1050
|
+
[data-theme="dark"] .markdown-table th,
|
1051
|
+
[data-theme="dark"] table th {
|
1052
|
+
background: #374151;
|
1053
|
+
color: #f9fafb;
|
1054
|
+
border-bottom-color: var(--accent-color);
|
1055
|
+
}
|
1056
|
+
|
1057
|
+
[data-theme="dark"] .markdown-table td,
|
1058
|
+
[data-theme="dark"] table td {
|
1059
|
+
color: #f9fafb;
|
1060
|
+
border-bottom-color: var(--border-color);
|
1061
|
+
}
|
1062
|
+
|
1063
|
+
[data-theme="dark"] .markdown-table tr:nth-child(even),
|
1064
|
+
[data-theme="dark"] table tr:nth-child(even) {
|
1065
|
+
background: #374151;
|
1066
|
+
}
|
1067
|
+
|
1068
|
+
[data-theme="dark"] .markdown-table tr:hover,
|
1069
|
+
[data-theme="dark"] table tr:hover {
|
1070
|
+
background: #4b5563;
|
1071
|
+
}
|
1072
|
+
|
1073
|
+
/* Dark mode footer styles to match header */
|
1074
|
+
[data-theme="dark"] .footer {
|
1075
|
+
background: var(--primary-color);
|
1076
|
+
color: white;
|
1077
|
+
border-top-color: var(--accent-color);
|
1078
|
+
}
|
1079
|
+
|
1080
|
+
[data-theme="dark"] .footer-brand {
|
1081
|
+
color: white;
|
1082
|
+
}
|
1083
|
+
|
1084
|
+
[data-theme="dark"] .footer a {
|
1085
|
+
color: rgba(255, 255, 255, 0.8);
|
1086
|
+
}
|
1087
|
+
|
1088
|
+
[data-theme="dark"] .footer a:hover {
|
1089
|
+
color: white;
|
1090
|
+
}
|
1091
|
+
|
887
1092
|
/* Professional scrollbar */
|
888
1093
|
::-webkit-scrollbar {
|
889
1094
|
width: 8px;
|
@@ -24,7 +24,7 @@ from doctra.parsers.layout_order import reading_order_key
|
|
24
24
|
from doctra.utils.ocr_utils import ocr_box_text
|
25
25
|
from doctra.exporters.image_saver import save_box_image
|
26
26
|
from doctra.exporters.markdown_writer import write_markdown
|
27
|
-
from doctra.exporters.html_writer import write_html, write_structured_html
|
27
|
+
from doctra.exporters.html_writer import write_html, write_structured_html, render_html_table, write_html_from_lines
|
28
28
|
from doctra.exporters.excel_writer import write_structured_excel
|
29
29
|
from doctra.utils.structured_utils import to_structured_dict
|
30
30
|
from doctra.exporters.markdown_table import render_markdown_table
|
@@ -141,6 +141,13 @@ class EnhancedPDFParser(StructuredPDFParser):
|
|
141
141
|
if self.use_image_restoration and self.docres_engine:
|
142
142
|
print(f"🔄 Processing PDF with image restoration: {os.path.basename(pdf_path)}")
|
143
143
|
enhanced_pages = self._process_pages_with_restoration(pdf_path, out_dir)
|
144
|
+
|
145
|
+
# Create enhanced PDF file using the already processed enhanced pages
|
146
|
+
enhanced_pdf_path = os.path.join(out_dir, f"{pdf_filename}_enhanced.pdf")
|
147
|
+
try:
|
148
|
+
self._create_enhanced_pdf_from_pages(enhanced_pages, enhanced_pdf_path)
|
149
|
+
except Exception as e:
|
150
|
+
print(f"⚠️ Failed to create enhanced PDF: {e}")
|
144
151
|
else:
|
145
152
|
print(f"🔄 Processing PDF without image restoration: {os.path.basename(pdf_path)}")
|
146
153
|
enhanced_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
|
@@ -177,12 +184,12 @@ class EnhancedPDFParser(StructuredPDFParser):
|
|
177
184
|
if is_notebook:
|
178
185
|
progress_bar = create_notebook_friendly_bar(
|
179
186
|
total=len(original_pages),
|
180
|
-
desc=f"
|
187
|
+
desc=f"DocRes {self.restoration_task}"
|
181
188
|
)
|
182
189
|
else:
|
183
190
|
progress_bar = create_beautiful_progress_bar(
|
184
191
|
total=len(original_pages),
|
185
|
-
desc=f"
|
192
|
+
desc=f"DocRes {self.restoration_task}",
|
186
193
|
leave=True
|
187
194
|
)
|
188
195
|
|
@@ -224,7 +231,6 @@ class EnhancedPDFParser(StructuredPDFParser):
|
|
224
231
|
if hasattr(progress_bar, 'close'):
|
225
232
|
progress_bar.close()
|
226
233
|
|
227
|
-
print(f"✅ Image restoration completed. Enhanced pages saved to: {enhanced_dir}")
|
228
234
|
return enhanced_pages
|
229
235
|
|
230
236
|
def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename, pdf_path):
|
@@ -238,7 +244,9 @@ class EnhancedPDFParser(StructuredPDFParser):
|
|
238
244
|
table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
|
239
245
|
|
240
246
|
md_lines: List[str] = ["# Enhanced Document Content\n"]
|
247
|
+
html_lines: List[str] = ["<h1>Enhanced Document Content</h1>"] # For direct HTML generation
|
241
248
|
structured_items: List[Dict[str, Any]] = []
|
249
|
+
page_content: Dict[int, List[str]] = {} # Store content by page
|
242
250
|
|
243
251
|
charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
|
244
252
|
tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
|
@@ -261,10 +269,15 @@ class EnhancedPDFParser(StructuredPDFParser):
|
|
261
269
|
figures_bar = stack.enter_context(
|
262
270
|
create_beautiful_progress_bar(total=fig_count, desc=figures_desc, leave=True)) if fig_count else None
|
263
271
|
|
272
|
+
# Initialize page content for all pages first
|
273
|
+
for page_num in range(1, len(pil_pages) + 1):
|
274
|
+
page_content[page_num] = [f"# Page {page_num} Content\n"]
|
275
|
+
|
264
276
|
for p in pages:
|
265
277
|
page_num = p.page_index
|
266
278
|
page_img: Image.Image = pil_pages[page_num - 1]
|
267
279
|
md_lines.append(f"\n## Page {page_num}\n")
|
280
|
+
html_lines.append(f"<h2>Page {page_num}</h2>")
|
268
281
|
|
269
282
|
for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
|
270
283
|
if box.label in EXCLUDE_LABELS:
|
@@ -273,7 +286,11 @@ class EnhancedPDFParser(StructuredPDFParser):
|
|
273
286
|
rel = os.path.relpath(abs_img_path, out_dir)
|
274
287
|
|
275
288
|
if box.label == "figure":
|
276
|
-
|
289
|
+
figure_md = f"![Figure — page {page_num}]({rel})\n"
|
290
|
+
figure_html = f'<img src="{rel}" alt="Figure — page {page_num}" />'
|
291
|
+
md_lines.append(figure_md)
|
292
|
+
html_lines.append(figure_html)
|
293
|
+
page_content[page_num].append(figure_md)
|
277
294
|
if figures_bar: figures_bar.update(1)
|
278
295
|
|
279
296
|
elif box.label == "chart":
|
@@ -287,17 +304,31 @@ class EnhancedPDFParser(StructuredPDFParser):
|
|
287
304
|
item["page"] = page_num
|
288
305
|
item["type"] = "Chart"
|
289
306
|
structured_items.append(item)
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
307
|
+
|
308
|
+
# Generate both markdown and HTML tables
|
309
|
+
table_md = render_markdown_table(item.get("headers"), item.get("rows"),
|
310
|
+
title=item.get("title"))
|
311
|
+
table_html = render_html_table(item.get("headers"), item.get("rows"),
|
312
|
+
title=item.get("title"))
|
313
|
+
|
314
|
+
md_lines.append(table_md)
|
315
|
+
html_lines.append(table_html)
|
316
|
+
page_content[page_num].append(table_md)
|
294
317
|
wrote_table = True
|
295
318
|
except Exception as e:
|
296
319
|
pass
|
297
320
|
if not wrote_table:
|
298
|
-
|
321
|
+
chart_md = f"![Chart — page {page_num}]({rel})\n"
|
322
|
+
chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
|
323
|
+
md_lines.append(chart_md)
|
324
|
+
html_lines.append(chart_html)
|
325
|
+
page_content[page_num].append(chart_md)
|
299
326
|
else:
|
300
|
-
|
327
|
+
chart_md = f"![Chart — page {page_num}]({rel})\n"
|
328
|
+
chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
|
329
|
+
md_lines.append(chart_md)
|
330
|
+
html_lines.append(chart_html)
|
331
|
+
page_content[page_num].append(chart_md)
|
301
332
|
if charts_bar: charts_bar.update(1)
|
302
333
|
|
303
334
|
elif box.label == "table":
|
@@ -311,26 +342,60 @@ class EnhancedPDFParser(StructuredPDFParser):
|
|
311
342
|
item["page"] = page_num
|
312
343
|
item["type"] = "Table"
|
313
344
|
structured_items.append(item)
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
345
|
+
|
346
|
+
# Generate both markdown and HTML tables
|
347
|
+
table_md = render_markdown_table(item.get("headers"), item.get("rows"),
|
348
|
+
title=item.get("title"))
|
349
|
+
table_html = render_html_table(item.get("headers"), item.get("rows"),
|
350
|
+
title=item.get("title"))
|
351
|
+
|
352
|
+
md_lines.append(table_md)
|
353
|
+
html_lines.append(table_html)
|
354
|
+
page_content[page_num].append(table_md)
|
318
355
|
wrote_table = True
|
319
356
|
except Exception as e:
|
320
357
|
pass
|
321
358
|
if not wrote_table:
|
322
|
-
|
359
|
+
table_md = f"![Table — page {page_num}]({rel})\n"
|
360
|
+
table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
|
361
|
+
md_lines.append(table_md)
|
362
|
+
html_lines.append(table_html)
|
363
|
+
page_content[page_num].append(table_md)
|
323
364
|
else:
|
324
|
-
|
365
|
+
table_md = f"![Table — page {page_num}]({rel})\n"
|
366
|
+
table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
|
367
|
+
md_lines.append(table_md)
|
368
|
+
html_lines.append(table_html)
|
369
|
+
page_content[page_num].append(table_md)
|
325
370
|
if tables_bar: tables_bar.update(1)
|
326
371
|
else:
|
327
372
|
text = ocr_box_text(self.ocr_engine, page_img, box)
|
328
373
|
if text:
|
329
374
|
md_lines.append(text)
|
330
375
|
md_lines.append(self.box_separator if self.box_separator else "")
|
376
|
+
# Convert text to HTML (basic conversion)
|
377
|
+
html_text = text.replace('\n', '<br>')
|
378
|
+
html_lines.append(f"<p>{html_text}</p>")
|
379
|
+
if self.box_separator:
|
380
|
+
html_lines.append("<br>")
|
381
|
+
page_content[page_num].append(text)
|
382
|
+
page_content[page_num].append(self.box_separator if self.box_separator else "")
|
331
383
|
|
332
384
|
md_path = write_markdown(md_lines, out_dir)
|
333
|
-
|
385
|
+
|
386
|
+
# Use HTML lines if VLM is enabled for better table formatting
|
387
|
+
if self.use_vlm and html_lines:
|
388
|
+
html_path = write_html_from_lines(html_lines, out_dir)
|
389
|
+
else:
|
390
|
+
html_path = write_html(md_lines, out_dir)
|
391
|
+
|
392
|
+
# Create pages folder and save individual page markdown files
|
393
|
+
pages_dir = os.path.join(out_dir, "pages")
|
394
|
+
os.makedirs(pages_dir, exist_ok=True)
|
395
|
+
|
396
|
+
for page_num, content_lines in page_content.items():
|
397
|
+
page_md_path = os.path.join(pages_dir, f"page_{page_num:03d}.md")
|
398
|
+
write_markdown(content_lines, os.path.dirname(page_md_path), os.path.basename(page_md_path))
|
334
399
|
|
335
400
|
excel_path = None
|
336
401
|
html_structured_path = None
|
@@ -343,6 +408,30 @@ class EnhancedPDFParser(StructuredPDFParser):
|
|
343
408
|
print(f"✅ Enhanced parsing completed successfully!")
|
344
409
|
print(f"📁 Output directory: {out_dir}")
|
345
410
|
|
411
|
+
def _create_enhanced_pdf_from_pages(self, enhanced_pages: List[Image.Image], output_path: str) -> None:
|
412
|
+
"""
|
413
|
+
Create an enhanced PDF from already processed enhanced pages.
|
414
|
+
|
415
|
+
:param enhanced_pages: List of enhanced PIL images
|
416
|
+
:param output_path: Path for the enhanced PDF
|
417
|
+
"""
|
418
|
+
if not enhanced_pages:
|
419
|
+
raise ValueError("No enhanced pages provided")
|
420
|
+
|
421
|
+
try:
|
422
|
+
# Create enhanced PDF from the processed pages
|
423
|
+
enhanced_pages[0].save(
|
424
|
+
output_path,
|
425
|
+
"PDF",
|
426
|
+
resolution=100.0,
|
427
|
+
save_all=True,
|
428
|
+
append_images=enhanced_pages[1:] if len(enhanced_pages) > 1 else []
|
429
|
+
)
|
430
|
+
print(f"✅ Enhanced PDF saved from processed pages: {output_path}")
|
431
|
+
except Exception as e:
|
432
|
+
print(f"❌ Error creating enhanced PDF from pages: {e}")
|
433
|
+
raise
|
434
|
+
|
346
435
|
def restore_pdf_only(self, pdf_path: str, output_path: str = None, task: str = None) -> str:
|
347
436
|
"""
|
348
437
|
Apply DocRes restoration to a PDF without parsing.
|
@@ -20,7 +20,7 @@ from doctra.exporters.excel_writer import write_structured_excel
|
|
20
20
|
from doctra.utils.structured_utils import to_structured_dict
|
21
21
|
from doctra.exporters.markdown_table import render_markdown_table
|
22
22
|
from doctra.exporters.markdown_writer import write_markdown
|
23
|
-
from doctra.exporters.html_writer import write_html, write_structured_html
|
23
|
+
from doctra.exporters.html_writer import write_html, write_structured_html, render_html_table, write_html_from_lines
|
24
24
|
from doctra.utils.progress import create_beautiful_progress_bar, create_multi_progress_bars, create_notebook_friendly_bar
|
25
25
|
|
26
26
|
|
@@ -117,6 +117,7 @@ class StructuredPDFParser:
|
|
117
117
|
table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
|
118
118
|
|
119
119
|
md_lines: List[str] = ["# Extracted Content\n"]
|
120
|
+
html_lines: List[str] = ["<h1>Extracted Content</h1>"] # For direct HTML generation
|
120
121
|
structured_items: List[Dict[str, Any]] = []
|
121
122
|
|
122
123
|
charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
|
@@ -145,6 +146,7 @@ class StructuredPDFParser:
|
|
145
146
|
page_num = p.page_index
|
146
147
|
page_img: Image.Image = pil_pages[page_num - 1]
|
147
148
|
md_lines.append(f"\n## Page {page_num}\n")
|
149
|
+
html_lines.append(f"<h2>Page {page_num}</h2>")
|
148
150
|
|
149
151
|
for i, box in enumerate(sorted(p.boxes, key=reading_order_key), start=1):
|
150
152
|
if box.label in EXCLUDE_LABELS:
|
@@ -153,7 +155,10 @@ class StructuredPDFParser:
|
|
153
155
|
rel = os.path.relpath(abs_img_path, out_dir)
|
154
156
|
|
155
157
|
if box.label == "figure":
|
156
|
-
|
158
|
+
figure_md = f"![Figure — page {page_num}]({rel})\n"
|
159
|
+
figure_html = f'<img src="{rel}" alt="Figure — page {page_num}" />'
|
160
|
+
md_lines.append(figure_md)
|
161
|
+
html_lines.append(figure_html)
|
157
162
|
if figures_bar: figures_bar.update(1)
|
158
163
|
|
159
164
|
elif box.label == "chart":
|
@@ -167,17 +172,28 @@ class StructuredPDFParser:
|
|
167
172
|
item["page"] = page_num
|
168
173
|
item["type"] = "Chart"
|
169
174
|
structured_items.append(item)
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
175
|
+
|
176
|
+
# Generate both markdown and HTML tables
|
177
|
+
table_md = render_markdown_table(item.get("headers"), item.get("rows"),
|
178
|
+
title=item.get("title"))
|
179
|
+
table_html = render_html_table(item.get("headers"), item.get("rows"),
|
180
|
+
title=item.get("title"))
|
181
|
+
|
182
|
+
md_lines.append(table_md)
|
183
|
+
html_lines.append(table_html)
|
174
184
|
wrote_table = True
|
175
185
|
except Exception as e:
|
176
186
|
pass
|
177
187
|
if not wrote_table:
|
178
|
-
|
188
|
+
chart_md = f"![Chart — page {page_num}]({rel})\n"
|
189
|
+
chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
|
190
|
+
md_lines.append(chart_md)
|
191
|
+
html_lines.append(chart_html)
|
179
192
|
else:
|
180
|
-
|
193
|
+
chart_md = f"![Chart — page {page_num}]({rel})\n"
|
194
|
+
chart_html = f'<img src="{rel}" alt="Chart — page {page_num}" />'
|
195
|
+
md_lines.append(chart_md)
|
196
|
+
html_lines.append(chart_html)
|
181
197
|
if charts_bar: charts_bar.update(1)
|
182
198
|
|
183
199
|
elif box.label == "table":
|
@@ -191,26 +207,47 @@ class StructuredPDFParser:
|
|
191
207
|
item["page"] = page_num
|
192
208
|
item["type"] = "Table"
|
193
209
|
structured_items.append(item)
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
210
|
+
|
211
|
+
# Generate both markdown and HTML tables
|
212
|
+
table_md = render_markdown_table(item.get("headers"), item.get("rows"),
|
213
|
+
title=item.get("title"))
|
214
|
+
table_html = render_html_table(item.get("headers"), item.get("rows"),
|
215
|
+
title=item.get("title"))
|
216
|
+
|
217
|
+
md_lines.append(table_md)
|
218
|
+
html_lines.append(table_html)
|
198
219
|
wrote_table = True
|
199
220
|
except Exception as e:
|
200
221
|
pass
|
201
222
|
if not wrote_table:
|
202
|
-
|
223
|
+
table_md = f"![Table — page {page_num}]({rel})\n"
|
224
|
+
table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
|
225
|
+
md_lines.append(table_md)
|
226
|
+
html_lines.append(table_html)
|
203
227
|
else:
|
204
|
-
|
228
|
+
table_md = f"![Table — page {page_num}]({rel})\n"
|
229
|
+
table_html = f'<img src="{rel}" alt="Table — page {page_num}" />'
|
230
|
+
md_lines.append(table_md)
|
231
|
+
html_lines.append(table_html)
|
205
232
|
if tables_bar: tables_bar.update(1)
|
206
233
|
else:
|
207
234
|
text = ocr_box_text(self.ocr_engine, page_img, box)
|
208
235
|
if text:
|
209
236
|
md_lines.append(text)
|
210
237
|
md_lines.append(self.box_separator if self.box_separator else "")
|
238
|
+
# Convert text to HTML (basic conversion)
|
239
|
+
html_text = text.replace('\n', '<br>')
|
240
|
+
html_lines.append(f"<p>{html_text}</p>")
|
241
|
+
if self.box_separator:
|
242
|
+
html_lines.append("<br>")
|
211
243
|
|
212
244
|
md_path = write_markdown(md_lines, out_dir)
|
213
|
-
|
245
|
+
|
246
|
+
# Use HTML lines if VLM is enabled for better table formatting
|
247
|
+
if self.use_vlm and html_lines:
|
248
|
+
html_path = write_html_from_lines(html_lines, out_dir)
|
249
|
+
else:
|
250
|
+
html_path = write_html(md_lines, out_dir)
|
214
251
|
|
215
252
|
excel_path = None
|
216
253
|
html_structured_path = None
|