doctra 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/cli/main.py +5 -12
- doctra/cli/utils.py +2 -3
- doctra/engines/image_restoration/docres_engine.py +6 -11
- doctra/engines/vlm/outlines_types.py +13 -9
- doctra/engines/vlm/service.py +4 -2
- doctra/exporters/excel_writer.py +89 -0
- doctra/exporters/html_writer.py +206 -1
- doctra/parsers/enhanced_pdf_parser.py +124 -31
- doctra/parsers/structured_pdf_parser.py +58 -15
- doctra/parsers/table_chart_extractor.py +290 -284
- doctra/ui/app.py +39 -960
- doctra/ui/docres_ui.py +338 -0
- doctra/ui/docres_wrapper.py +120 -0
- doctra/ui/enhanced_parser_ui.py +483 -0
- doctra/ui/full_parse_ui.py +539 -0
- doctra/ui/tables_charts_ui.py +445 -0
- doctra/ui/ui_helpers.py +435 -0
- doctra/utils/progress.py +7 -7
- doctra/utils/structured_utils.py +5 -2
- doctra/version.py +1 -1
- {doctra-0.4.0.dist-info → doctra-0.4.2.dist-info}/METADATA +1 -1
- {doctra-0.4.0.dist-info → doctra-0.4.2.dist-info}/RECORD +25 -19
- {doctra-0.4.0.dist-info → doctra-0.4.2.dist-info}/WHEEL +0 -0
- {doctra-0.4.0.dist-info → doctra-0.4.2.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.4.0.dist-info → doctra-0.4.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,445 @@
|
|
1
|
+
"""
|
2
|
+
Tables & Charts Parser UI Module
|
3
|
+
|
4
|
+
This module contains all functionality for the Tables & Charts extraction tab in the Doctra Gradio interface.
|
5
|
+
It handles table and chart extraction, VLM integration, and structured data display.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import json
|
9
|
+
import tempfile
|
10
|
+
from pathlib import Path
|
11
|
+
from typing import Tuple, List, Optional
|
12
|
+
|
13
|
+
import gradio as gr
|
14
|
+
import pandas as pd
|
15
|
+
import html as _html
|
16
|
+
|
17
|
+
from doctra.parsers.table_chart_extractor import ChartTablePDFParser
|
18
|
+
from doctra.ui.ui_helpers import gather_outputs, validate_vlm_config
|
19
|
+
|
20
|
+
|
21
|
+
def run_extract(
    pdf_file: str,
    target: str,
    use_vlm: bool,
    vlm_provider: str,
    vlm_api_key: str,
    layout_model_name: str,
    dpi: int,
    min_score: float,
) -> Tuple[str, str, List[str], Optional[str]]:
    """
    Run table/chart extraction from PDF.

    Every return path yields exactly four values so the result always lines up
    with the four Gradio output components wired to this callback
    (status, tables preview, individual files, ZIP).

    Args:
        pdf_file: Path to input PDF file
        target: Extraction target ("tables", "charts", or "both")
        use_vlm: Whether to use Vision Language Model
        vlm_provider: VLM provider name
        vlm_api_key: API key for VLM provider
        layout_model_name: Layout detection model name
        dpi: DPI for image processing
        min_score: Minimum confidence score for layout detection

    Returns:
        Tuple of (status_message, tables_html, file_paths, zip_path).
        On the early-exit paths (no file, invalid VLM configuration) the
        preview is empty, the file list is empty and zip_path is None.
    """
    if not pdf_file:
        return ("No file provided.", "", [], None)

    # Validate VLM configuration
    vlm_error = validate_vlm_config(use_vlm, vlm_api_key)
    if vlm_error:
        return (vlm_error, "", [], None)

    import shutil

    # Extract filename from the uploaded file path
    original_filename = Path(pdf_file).stem
    output_base = Path("outputs")

    # Work on a private copy named after the original document so the parser
    # derives its output folder from that name.
    tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
    try:
        input_pdf = tmp_dir / f"{original_filename}.pdf"
        shutil.copy2(pdf_file, input_pdf)

        # Initialize parser with configuration
        parser = ChartTablePDFParser(
            extract_charts=(target in ("charts", "both")),
            extract_tables=(target in ("tables", "both")),
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_api_key=vlm_api_key or None,
            layout_model_name=layout_model_name,
            dpi=int(dpi),
            min_score=float(min_score),
        )

        # Run extraction
        parser.parse(str(input_pdf), str(output_base))
    finally:
        # The temporary directory only holds the input copy; results are
        # written under output_base, so it can be dropped once parsing ends.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    # Find output directory
    out_dir = _find_structured_output_dir(output_base, original_filename)

    # Determine which kinds to include in outputs based on target selection
    allowed_kinds: Optional[List[str]] = None
    if target in ("tables", "charts"):
        allowed_kinds = [target]
    elif target == "both":
        allowed_kinds = ["tables", "charts"]

    # Gather output files and create ZIP (gallery items are rebuilt later from
    # the file list, so they are not part of this function's result)
    _gallery_items, file_paths, zip_path = gather_outputs(
        out_dir,
        allowed_kinds,
        zip_filename=original_filename,
        is_structured_parsing=True,
    )

    # Build tables HTML preview from Excel data (when VLM enabled)
    tables_html = ""
    if use_vlm:
        try:
            tables_html = _build_excel_preview(out_dir, target)
        except Exception as e:
            # The preview is best-effort: a broken workbook must not fail the
            # whole extraction. Safely encode the message so unusual Unicode
            # characters cannot break the log line itself.
            try:
                error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
                print(f"Error building tables HTML: {error_msg}")
            except Exception:
                print("Error building tables HTML: <Unicode encoding error>")
            tables_html = ""

    return (
        f"✅ Parsing completed successfully!\n📁 Output directory: {out_dir}",
        tables_html,
        file_paths,
        zip_path,
    )


def _find_structured_output_dir(output_base: Path, document_stem: str) -> Path:
    """Locate the directory the parser wrote to, falling back to the newest output folder."""
    expected = output_base / document_stem / "structured_parsing"
    if expected.exists():
        return expected

    if not output_base.exists():
        output_base.mkdir(parents=True, exist_ok=True)
        return output_base

    # Only real directories are candidates; stray files in the output root
    # must not be mistaken for a document folder.
    subdirs = [p for p in output_base.iterdir() if p.is_dir()]
    if not subdirs:
        return output_base
    newest = max(subdirs, key=lambda p: p.stat().st_mtime)
    return newest / "structured_parsing"


def _build_excel_preview(out_dir: Path, target: str) -> str:
    """Render every non-empty sheet of the target's Excel workbook as a titled HTML table."""
    excel_names = {
        "tables": "parsed_tables.xlsx",
        "charts": "parsed_charts.xlsx",
        "both": "parsed_tables_charts.xlsx",
    }
    excel_filename = excel_names.get(target)
    if not excel_filename:
        return ""

    excel_path = out_dir / excel_filename
    if not excel_path.exists():
        return ""

    html_blocks = []
    # The context manager closes the workbook handle even if a sheet fails.
    with pd.ExcelFile(excel_path) as workbook:
        for sheet_name in workbook.sheet_names:
            df = workbook.parse(sheet_name)
            if df.empty:
                continue
            title = f"<h3>{_html.escape(str(sheet_name))}</h3>"
            table_html = df.to_html(
                classes="doc-table",
                table_id=None,
                escape=True,
                index=False,
                na_rep="",
            )
            html_blocks.append(title + table_html)

    return "\n".join(html_blocks)
|
163
|
+
|
164
|
+
|
165
|
+
def capture_out_dir(status_text: str) -> str:
    """
    Pull the output directory out of a status message.

    Everything after the first "Output directory:" marker is returned with
    surrounding whitespace removed.

    Args:
        status_text: Status message that may contain the output directory

    Returns:
        The directory path, or an empty string when the message is empty,
        has no marker, or cannot be processed as text
    """
    if not status_text:
        return ""

    marker = "Output directory:"
    try:
        _before, found, remainder = status_text.partition(marker)
        if found:
            return remainder.strip()
    except Exception:
        # Non-text input is treated the same as "no directory present".
        pass
    return ""
|
183
|
+
|
184
|
+
|
185
|
+
def build_item_selector(out_dir_path: str, target: str, use_vlm: bool) -> gr.Dropdown:
    """
    Build item selector dropdown based on VLM output data.

    Reads "vlm_items.json" from the output directory and offers one choice per
    entry matching the target. Malformed entries are skipped individually so a
    single bad record does not hide the valid ones.

    Args:
        out_dir_path: Path to output directory
        target: Extraction target type ("tables", "charts", or "both")
        use_vlm: Whether VLM was used

    Returns:
        Updated dropdown component; hidden and empty when VLM was not used,
        the mapping file is missing/unreadable, or no entry matches
    """
    if not out_dir_path or not use_vlm:
        return gr.Dropdown(choices=[], value=None, visible=False)

    try:
        mapping = Path(out_dir_path) / "vlm_items.json"
        data = json.loads(mapping.read_text(encoding="utf-8"))
    except (OSError, ValueError, TypeError):
        # Missing file, unreadable file, invalid JSON/encoding or a bad path:
        # there is nothing to select from.
        return gr.Dropdown(choices=[], value=None, visible=False)

    if not isinstance(data, list):
        return gr.Dropdown(choices=[], value=None, visible=False)

    # Entry "kind" values are singular while the target names are plural.
    wanted_kind = {"tables": "table", "charts": "chart"}.get(target)

    choices = []
    for entry in data:
        if not isinstance(entry, dict):
            continue

        kind = entry.get("kind")
        # Filter based on target
        if target != "both" and (wanted_kind is None or kind != wanted_kind):
            continue

        # The relative image path is the dropdown value used to look the
        # entry up again; without it the choice could never be displayed.
        rel_path = entry.get("image_rel_path")
        if not rel_path:
            continue

        title = entry.get("title") or (str(kind).title() if kind else "Item")
        page = entry.get("page")
        label = f"{title} — Page {page}"
        choices.append((label, rel_path))

    return gr.Dropdown(
        choices=choices,
        value=choices[0][1] if choices else None,
        visible=bool(choices),
    )
|
222
|
+
|
223
|
+
|
224
|
+
def show_selected_item(rel_path: str, out_dir_path: str) -> Tuple[str, Optional[str]]:
    """
    Show selected item data and image.

    Looks up the entry whose "image_rel_path" equals rel_path in
    "vlm_items.json" and renders its headers/rows as an HTML table.

    Args:
        rel_path: Relative path to the item image
        out_dir_path: Path to output directory

    Returns:
        Tuple of (html_table, image_path). Both are empty ("" and None) when
        nothing is selected, the mapping file is missing or unreadable, or no
        entry matches. image_path is None when the image file does not exist.
    """
    if not rel_path or not out_dir_path:
        return "", None

    try:
        out_dir = Path(out_dir_path)
        mapping = out_dir / "vlm_items.json"
        if not mapping.exists():
            return "", None

        data = json.loads(mapping.read_text(encoding="utf-8"))

        # Non-dict records are ignored so they cannot abort the lookup.
        entry = next(
            (
                item
                for item in data
                if isinstance(item, dict) and item.get("image_rel_path") == rel_path
            ),
            None,
        )
        if entry is None:
            return "", None

        headers = entry.get("headers") or []
        rows = entry.get("rows") or []
        # Title and kind come from the JSON file, so coerce to text and escape
        # both; a null kind falls back to "table" like a missing one does.
        title = _html.escape(str(entry.get("title") or "Data"))
        kind_label = _html.escape(str(entry.get("kind") or "table").title())
        heading = f'<h3>{title} ({kind_label})</h3>'

        # Create HTML table
        if headers and rows:
            thead = '<thead><tr>' + ''.join(
                f'<th>{_html.escape(str(h))}</th>' for h in headers
            ) + '</tr></thead>'
            tbody = '<tbody>' + ''.join(
                '<tr>' + ''.join(f'<td>{_html.escape(str(c))}</td>' for c in r) + '</tr>'
                for r in rows
            ) + '</tbody>'
            html_table = f'{heading}<table class="doc-table">{thead}{tbody}</table>'
        else:
            html_table = f'{heading}<p>No structured data available</p>'

        # Get image path; hand back None instead of a path that does not
        # exist so the data can still be shown without the image.
        img_path = (out_dir / rel_path).resolve()
        img_abs = str(img_path) if img_path.exists() else None
        return html_table, img_abs
    except Exception:
        # Best-effort UI callback: any unexpected data shape clears the view
        # rather than surfacing an error in the interface.
        return "", None
|
268
|
+
|
269
|
+
|
270
|
+
def update_content_visibility(use_vlm: bool) -> Tuple[gr.Column, gr.Column]:
    """
    Toggle which result column is shown, depending on VLM usage.

    Exactly one of the two columns is visible: the VLM column (data plus
    selected image) when VLM is on, otherwise the non-VLM column (scrollable
    gallery).

    Args:
        use_vlm: Whether VLM is being used

    Returns:
        Tuple of (vlm_content, non_vlm_content)
    """
    show_vlm = bool(use_vlm)
    vlm_column = gr.Column(visible=show_vlm)
    gallery_column = gr.Column(visible=not show_vlm)
    return vlm_column, gallery_column
|
286
|
+
|
287
|
+
|
288
|
+
def populate_scrollable_gallery(file_paths: Optional[List[str]], target: str) -> List[tuple[str, str]]:
    """
    Populate the scrollable gallery with image files from the extraction results, filtered by target.

    Args:
        file_paths: List of file paths from extraction. None (what an empty
            file component provides) is treated as an empty list. Entries that
            are not plain paths but expose a ``name`` attribute holding the
            path are accepted as well.
        target: Extraction target ("tables", "charts", or "both")

    Returns:
        List of (image_path, caption) tuples for image files. For "tables" /
        "charts" only images whose file name contains "table" / "chart" are
        kept; any other target value yields an empty list.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
    # Substring that must occur in the file name for the single-kind targets.
    keyword_by_target = {"tables": "table", "charts": "chart"}

    gallery_items = []
    for file_path in file_paths or []:
        if isinstance(file_path, (str, Path)):
            path_str = str(file_path)
        else:
            # Uploaded-file wrappers carry the on-disk path in ``name``.
            path_str = str(getattr(file_path, "name", file_path))

        if not path_str.lower().endswith(image_suffixes):
            continue

        caption = Path(path_str).name
        if target == "both":
            # Include all images
            should_include = True
        elif target in keyword_by_target:
            should_include = keyword_by_target[target] in caption.lower()
        else:
            should_include = False

        if should_include:
            gallery_items.append((path_str, caption))

    return gallery_items
|
322
|
+
|
323
|
+
|
324
|
+
def create_tables_charts_tab() -> Tuple[gr.Tab, dict]:
    """
    Build the "Extract Tables/Charts" tab: inputs, result views, downloads and event wiring.

    Returns:
        Tuple of (tab_component, state_variables_dict), where the dict maps
        each component's name to the component for external access
    """

    def _on_run(uploaded, chosen_target, vlm_enabled, provider, api_key, layout_model, dpi_value, score_floor):
        # The file component hands over an object exposing the path as
        # ``name``; an empty upload becomes an empty path.
        source_path = uploaded.name if uploaded else ""
        return run_extract(
            source_path,
            chosen_target,
            vlm_enabled,
            provider,
            api_key,
            layout_model,
            dpi_value,
            score_floor,
        )

    with gr.Tab("Extract Tables/Charts") as tab:
        # Input controls
        with gr.Row():
            pdf_e = gr.File(file_types=[".pdf"], label="PDF")
            target = gr.Dropdown(["tables", "charts", "both"], value="both", label="Target")
            use_vlm_e = gr.Checkbox(label="Use VLM (optional)", value=False)
            vlm_provider_e = gr.Dropdown(
                ["gemini", "openai", "anthropic", "openrouter"],
                value="gemini",
                label="VLM Provider",
            )
            vlm_api_key_e = gr.Textbox(
                type="password",
                label="VLM API Key",
                placeholder="Optional if VLM disabled",
            )

        # Advanced settings accordion
        with gr.Accordion("Advanced", open=False):
            with gr.Row():
                layout_model_e = gr.Textbox(value="PP-DocLayout_plus-L", label="Layout model")
                dpi_e = gr.Slider(100, 400, value=200, step=10, label="DPI")
                min_score_e = gr.Slider(0, 1, value=0.0, step=0.05, label="Min layout score")

        # Action button and status line
        run_btn_e = gr.Button("▶ Run Extraction", variant="primary")
        status_e = gr.Textbox(label="Status")

        # Item selector for VLM outputs
        item_selector_e = gr.Dropdown(label="Select Item", visible=False, interactive=True)

        # Result area: two alternative columns, switched after each run
        with gr.Row():
            # VLM mode: extracted data together with the selected image
            with gr.Column(visible=True) as vlm_content:
                tables_preview_e = gr.HTML(label="Extracted Data", elem_classes=["page-content"])
                image_e = gr.Image(label="Selected Image", interactive=False)

            # Non-VLM mode: scrollable gallery of extracted images
            with gr.Column(visible=False) as non_vlm_content:
                scrollable_gallery_e = gr.Gallery(
                    label="Extracted Images",
                    columns=2,
                    height=600,
                    preview=True,
                    show_label=True,
                    elem_classes=["scrollable-gallery"],
                )

        # Downloads
        files_out_e = gr.Files(label="Download individual output files")
        zip_out_e = gr.File(label="Download all outputs (ZIP)")

        # State variable for output directory
        out_dir_state = gr.State("")

        # Event pipeline for the run button; each step starts after the
        # previous one has finished.
        run_inputs = [
            pdf_e,
            target,
            use_vlm_e,
            vlm_provider_e,
            vlm_api_key_e,
            layout_model_e,
            dpi_e,
            min_score_e,
        ]
        selection_inputs = [item_selector_e, out_dir_state]
        selection_outputs = [tables_preview_e, image_e]

        pipeline = run_btn_e.click(
            fn=_on_run,
            inputs=run_inputs,
            outputs=[status_e, tables_preview_e, files_out_e, zip_out_e],
        )
        # Remember where the results were written
        pipeline = pipeline.then(
            fn=capture_out_dir,
            inputs=[status_e],
            outputs=[out_dir_state],
        )
        # Offer the extracted items for selection
        pipeline = pipeline.then(
            fn=build_item_selector,
            inputs=[out_dir_state, target, use_vlm_e],
            outputs=[item_selector_e],
        )
        # Display the initially selected item
        pipeline = pipeline.then(
            fn=show_selected_item,
            inputs=selection_inputs,
            outputs=selection_outputs,
        )
        # Switch between the VLM view and the gallery view
        pipeline = pipeline.then(
            fn=update_content_visibility,
            inputs=[use_vlm_e],
            outputs=[vlm_content, non_vlm_content],
        )
        # Fill the gallery from the produced files
        pipeline.then(
            fn=populate_scrollable_gallery,
            inputs=[files_out_e, target],
            outputs=[scrollable_gallery_e],
        )

        # Handle dropdown selection changes
        item_selector_e.change(
            fn=show_selected_item,
            inputs=selection_inputs,
            outputs=selection_outputs,
        )

    # Return state variables for external access
    state_vars = dict(
        pdf_e=pdf_e,
        target=target,
        use_vlm_e=use_vlm_e,
        vlm_provider_e=vlm_provider_e,
        vlm_api_key_e=vlm_api_key_e,
        layout_model_e=layout_model_e,
        dpi_e=dpi_e,
        min_score_e=min_score_e,
        run_btn_e=run_btn_e,
        status_e=status_e,
        item_selector_e=item_selector_e,
        tables_preview_e=tables_preview_e,
        image_e=image_e,
        files_out_e=files_out_e,
        zip_out_e=zip_out_e,
        out_dir_state=out_dir_state,
        vlm_content=vlm_content,
        non_vlm_content=non_vlm_content,
        scrollable_gallery_e=scrollable_gallery_e,
    )

    return tab, state_vars
|