doctra 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,445 @@
1
+ """
2
+ Tables & Charts Parser UI Module
3
+
4
+ This module contains all functionality for the Tables & Charts extraction tab in the Doctra Gradio interface.
5
+ It handles table and chart extraction, VLM integration, and structured data display.
6
+ """
7
+
8
+ import json
9
+ import tempfile
10
+ from pathlib import Path
11
+ from typing import Tuple, List, Optional
12
+
13
+ import gradio as gr
14
+ import pandas as pd
15
+ import html as _html
16
+
17
+ from doctra.parsers.table_chart_extractor import ChartTablePDFParser
18
+ from doctra.ui.ui_helpers import gather_outputs, validate_vlm_config
19
+
20
+
21
def run_extract(
    pdf_file: str,
    target: str,
    use_vlm: bool,
    vlm_provider: str,
    vlm_api_key: str,
    layout_model_name: str,
    dpi: int,
    min_score: float,
) -> Tuple[str, str, List[str], Optional[str]]:
    """
    Run table/chart extraction from PDF.

    Every return path yields the same four values so that the result lines up
    with the four Gradio outputs this handler feeds (status, tables preview,
    individual files, ZIP archive).

    Args:
        pdf_file: Path to input PDF file
        target: Extraction target ("tables", "charts", or "both")
        use_vlm: Whether to use Vision Language Model
        vlm_provider: VLM provider name
        vlm_api_key: API key for VLM provider
        layout_model_name: Layout detection model name
        dpi: DPI for image processing
        min_score: Minimum confidence score for layout detection

    Returns:
        Tuple of (status_message, tables_html, file_paths, zip_path).
        On early exit (no file / invalid VLM config) tables_html is empty,
        file_paths is an empty list and zip_path is None.
    """
    if not pdf_file:
        return ("No file provided.", "", [], None)

    # Validate VLM configuration
    vlm_error = validate_vlm_config(use_vlm, vlm_api_key)
    if vlm_error:
        return (vlm_error, "", [], None)

    # The parser names its output folder after the input file, so the upload is
    # copied to "<stem>.pdf" inside a scratch directory before parsing.
    original_filename = Path(pdf_file).stem
    output_base = Path("outputs")

    import shutil

    tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
    try:
        input_pdf = tmp_dir / f"{original_filename}.pdf"
        shutil.copy2(pdf_file, input_pdf)

        # Initialize parser with configuration
        parser = ChartTablePDFParser(
            extract_charts=(target in ("charts", "both")),
            extract_tables=(target in ("tables", "both")),
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_api_key=vlm_api_key or None,
            layout_model_name=layout_model_name,
            dpi=int(dpi),
            min_score=float(min_score),
        )

        # Run extraction; results are written below output_base, not tmp_dir
        parser.parse(str(input_pdf), str(output_base))
    finally:
        # The scratch copy is only needed while parsing; do not leak it.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    out_dir = _resolve_output_dir(output_base, original_filename)

    # Determine which kinds to include in outputs based on target selection
    allowed_kinds: Optional[List[str]] = None
    if target in ("tables", "charts"):
        allowed_kinds = [target]
    elif target == "both":
        allowed_kinds = ["tables", "charts"]

    # Gather output files and create ZIP (gallery items are not used here)
    _gallery_items, file_paths, zip_path = gather_outputs(
        out_dir,
        allowed_kinds,
        zip_filename=original_filename,
        is_structured_parsing=True,
    )

    # Build tables HTML preview from Excel data (only produced when VLM enabled)
    tables_html = ""
    if use_vlm:
        try:
            tables_html = _build_tables_html(out_dir, target)
        except Exception as e:
            # Preview is best-effort: report the problem and continue without it.
            try:
                error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
                print(f"Error building tables HTML: {error_msg}")
            except Exception:
                print("Error building tables HTML: <Unicode encoding error>")
            tables_html = ""

    return (
        f"✅ Parsing completed successfully!\n📁 Output directory: {out_dir}",
        tables_html,
        file_paths,
        zip_path,
    )


def _resolve_output_dir(outputs_root: Path, stem: str) -> Path:
    """Locate the structured-parsing output folder for *stem* under *outputs_root*."""
    expected = outputs_root / stem / "structured_parsing"
    if expected.exists():
        return expected

    if not outputs_root.exists():
        outputs_root.mkdir(parents=True, exist_ok=True)
        return outputs_root

    # Fall back to the most recently modified sub-directory. Plain files are
    # excluded explicitly because a "*/" glob does not filter them on every
    # Python version.
    candidates = [p for p in outputs_root.iterdir() if p.is_dir()]
    if not candidates:
        return outputs_root
    newest = max(candidates, key=lambda p: p.stat().st_mtime)
    return newest / "structured_parsing"


def _build_tables_html(out_dir: Path, target: str) -> str:
    """Render every non-empty sheet of the target's Excel workbook as an HTML table."""
    excel_filename = {
        "tables": "parsed_tables.xlsx",
        "charts": "parsed_charts.xlsx",
        "both": "parsed_tables_charts.xlsx",
    }.get(target)
    if not excel_filename:
        return ""

    excel_path = out_dir / excel_filename
    if not excel_path.exists():
        return ""

    html_blocks = []
    # Open the workbook once and close it deterministically.
    with pd.ExcelFile(excel_path) as workbook:
        for sheet_name in workbook.sheet_names:
            df = workbook.parse(sheet_name)
            if df.empty:
                continue
            title = f"<h3>{_html.escape(str(sheet_name))}</h3>"
            table_html = df.to_html(
                classes="doc-table",
                table_id=None,
                escape=True,
                index=False,
                na_rep="",
            )
            html_blocks.append(title + table_html)

    return "\n".join(html_blocks)
163
+
164
+
165
def capture_out_dir(status_text: str) -> str:
    """
    Pull the output directory out of a status message.

    Args:
        status_text: Status message; the directory is whatever follows the
            first "Output directory:" marker

    Returns:
        The trimmed text after the marker, or an empty string when the
        message is empty, has no marker, or cannot be inspected
    """
    if not status_text:
        return ""

    marker = "Output directory:"
    try:
        _before, found, remainder = status_text.partition(marker)
    except Exception:
        # Anything that is not string-like simply yields "no directory".
        return ""

    return remainder.strip() if found else ""
183
+
184
+
185
def build_item_selector(out_dir_path: str, target: str, use_vlm: bool) -> gr.Dropdown:
    """
    Build item selector dropdown based on VLM output data.

    Reads "vlm_items.json" from the output directory and offers one choice per
    entry that matches the target and has an image path. A single malformed
    entry (not an object, no kind, no image path) is skipped instead of
    hiding the whole selector.

    Args:
        out_dir_path: Path to output directory
        target: Extraction target type ("tables", "charts", or "both")
        use_vlm: Whether VLM was used

    Returns:
        Updated dropdown component; hidden and empty when VLM was not used,
        the mapping file is missing/unreadable, or no entry qualifies
    """
    if not out_dir_path or not use_vlm:
        return gr.Dropdown(choices=[], value=None, visible=False)

    try:
        mapping = Path(out_dir_path) / "vlm_items.json"
        if not mapping.exists():
            return gr.Dropdown(choices=[], value=None, visible=False)

        data = json.loads(mapping.read_text(encoding="utf-8"))

        # "tables"/"charts" select one singular kind; "both" accepts any kind.
        wanted_kind = {"tables": "table", "charts": "chart"}.get(target)
        choices = []

        for entry in data:
            if not isinstance(entry, dict):
                continue
            kind = entry.get("kind")
            if target != "both" and (wanted_kind is None or kind != wanted_kind):
                continue

            rel_path = entry.get("image_rel_path")
            if not rel_path:
                # Without an image path the item cannot be looked up later.
                continue

            # Fall back to the kind as title; tolerate a missing kind.
            title = entry.get("title") or str(kind or "item").title()
            page = entry.get("page")
            label = f"{title} — Page {page}"
            choices.append((label, rel_path))

        return gr.Dropdown(choices=choices, value=choices[0][1] if choices else None, visible=bool(choices))
    except Exception:
        # UI boundary: an unreadable mapping file must not break the event chain.
        return gr.Dropdown(choices=[], value=None, visible=False)
222
+
223
+
224
def show_selected_item(rel_path: str, out_dir_path: str) -> Tuple[str, Optional[str]]:
    """
    Show selected item data and image.

    Looks up the entry whose "image_rel_path" equals rel_path in
    "vlm_items.json" and renders its headers/rows as an HTML table.

    Args:
        rel_path: Relative path to the item image
        out_dir_path: Path to output directory

    Returns:
        Tuple of (html_table, image_path). Both are empty ("" and None) when
        nothing matches or the mapping cannot be read; image_path alone is
        None when the entry exists but its image file does not.
    """
    if not rel_path or not out_dir_path:
        return "", None

    def _cell(value) -> str:
        # JSON null should show as an empty cell, not the text "None".
        return "" if value is None else _html.escape(str(value))

    try:
        out_dir = Path(out_dir_path)
        mapping = out_dir / "vlm_items.json"
        if not mapping.exists():
            return "", None

        data = json.loads(mapping.read_text(encoding="utf-8"))
        entry = next(
            (e for e in data if isinstance(e, dict) and e.get("image_rel_path") == rel_path),
            None,
        )
        if entry is None:
            return "", None

        headers = entry.get("headers") or []
        rows = entry.get("rows") or []
        title = entry.get("title") or "Data"
        # `or` (rather than a .get default) also covers an explicit null kind;
        # the value comes from a data file, so escape it like everything else.
        kind_label = _html.escape(str(entry.get("kind") or "table").title())
        heading = f'<h3>{_html.escape(str(title))} ({kind_label})</h3>'

        # Create HTML table
        if headers and rows:
            thead = '<thead><tr>' + ''.join(f'<th>{_cell(h)}</th>' for h in headers) + '</tr></thead>'
            tbody = '<tbody>' + ''.join(
                '<tr>' + ''.join(f'<td>{_cell(c)}</td>' for c in r) + '</tr>' for r in rows
            ) + '</tbody>'
            html_table = f'{heading}<table class="doc-table">{thead}{tbody}</table>'
        else:
            html_table = f'{heading}<p>No structured data available</p>'

        # Only hand back an image path that actually exists, so a missing file
        # does not take the data table down with it.
        img_path = (out_dir / rel_path).resolve()
        return html_table, (str(img_path) if img_path.is_file() else None)
    except Exception:
        # UI boundary: unreadable or malformed mapping data yields an empty view.
        return "", None
268
+
269
+
270
def update_content_visibility(use_vlm: bool) -> Tuple[gr.Column, gr.Column]:
    """
    Toggle between the VLM view and the plain gallery view.

    Args:
        use_vlm: Whether VLM is being used

    Returns:
        Tuple of (vlm_content, non_vlm_content) column updates; exactly one
        of the two is visible
    """
    # VLM on  -> data + selected image column is shown.
    # VLM off -> scrollable gallery column is shown.
    show_vlm = bool(use_vlm)
    return gr.Column(visible=show_vlm), gr.Column(visible=not show_vlm)
286
+
287
+
288
def populate_scrollable_gallery(file_paths: List[str], target: str) -> List[tuple[str, str]]:
    """
    Populate the scrollable gallery with image files from the extraction results, filtered by target.

    Args:
        file_paths: List of file paths from extraction; None (nothing
            extracted yet) is treated as an empty list
        target: Extraction target ("tables", "charts", or "both")

    Returns:
        List of (image_path, caption) tuples for image files, where the
        caption is the file name. "both" keeps every image; "tables" and
        "charts" keep images whose lower-cased file name contains "table"
        or "chart" respectively; any other target keeps nothing.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

    # "tables" contains "table" (same for charts), so one keyword per target
    # covers both singular and plural file names.
    keyword = {"tables": "table", "charts": "chart"}.get(target)
    if target != "both" and keyword is None:
        return []

    gallery_items = []
    for file_path in file_paths or []:
        if not file_path.lower().endswith(image_suffixes):
            continue
        name = Path(file_path).name
        if target == "both" or keyword in name.lower():
            gallery_items.append((file_path, name))

    return gallery_items
322
+
323
+
324
def create_tables_charts_tab() -> Tuple[gr.Tab, dict]:
    """
    Build the "Extract Tables/Charts" tab: its widgets and its event wiring.

    Clicking the run button starts a chain: run the extraction, capture the
    output directory from the status text, rebuild the item selector, show
    the selected item, switch the visible content column, and fill the
    gallery. Changing the item selector re-renders the selected item.

    Returns:
        Tuple of (tab_component, state_variables_dict), where the dict maps
        each component's variable name to the component
    """
    with gr.Tab("Extract Tables/Charts") as tab:
        # --- Inputs -------------------------------------------------------
        with gr.Row():
            pdf_e = gr.File(file_types=[".pdf"], label="PDF")
            target = gr.Dropdown(["tables", "charts", "both"], value="both", label="Target")
            use_vlm_e = gr.Checkbox(label="Use VLM (optional)", value=False)
            vlm_provider_e = gr.Dropdown(["gemini", "openai", "anthropic", "openrouter"], value="gemini", label="VLM Provider")
            vlm_api_key_e = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")

        # --- Advanced settings ---------------------------------------------
        with gr.Accordion("Advanced", open=False):
            with gr.Row():
                layout_model_e = gr.Textbox(value="PP-DocLayout_plus-L", label="Layout model")
                dpi_e = gr.Slider(100, 400, value=200, step=10, label="DPI")
                min_score_e = gr.Slider(0, 1, value=0.0, step=0.05, label="Min layout score")

        # --- Action + status -----------------------------------------------
        run_btn_e = gr.Button("▶ Run Extraction", variant="primary")
        status_e = gr.Textbox(label="Status")

        # Selector for individual VLM results (hidden until there are items)
        item_selector_e = gr.Dropdown(label="Select Item", visible=False, interactive=True)

        # --- Result area: one of the two columns is shown after a run -------
        with gr.Row():
            # VLM mode: extracted data next to the selected image
            with gr.Column(visible=True) as vlm_content:
                tables_preview_e = gr.HTML(label="Extracted Data", elem_classes=["page-content"])
                image_e = gr.Image(label="Selected Image", interactive=False)

            # Non-VLM mode: scrollable gallery of the extracted images
            with gr.Column(visible=False) as non_vlm_content:
                scrollable_gallery_e = gr.Gallery(
                    label="Extracted Images",
                    columns=2,
                    height=600,
                    preview=True,
                    show_label=True,
                    elem_classes=["scrollable-gallery"],
                )

        # --- Downloads -----------------------------------------------------
        files_out_e = gr.Files(label="Download individual output files")
        zip_out_e = gr.File(label="Download all outputs (ZIP)")

        # Holds the output directory of the latest run
        out_dir_state = gr.State("")

        # --- Event wiring ----------------------------------------------------
        # Step 1: run the extraction (the uploaded file is passed by its .name)
        chain = run_btn_e.click(
            fn=lambda f, t, a, b, c, d, e, g: run_extract(
                f.name if f else "", t, a, b, c, d, e, g
            ),
            inputs=[
                pdf_e,
                target,
                use_vlm_e,
                vlm_provider_e,
                vlm_api_key_e,
                layout_model_e,
                dpi_e,
                min_score_e,
            ],
            outputs=[status_e, tables_preview_e, files_out_e, zip_out_e],
        )
        # Step 2: remember where the outputs were written
        chain = chain.then(fn=capture_out_dir, inputs=[status_e], outputs=[out_dir_state])
        # Step 3: rebuild the item selector for the new results
        chain = chain.then(
            fn=build_item_selector,
            inputs=[out_dir_state, target, use_vlm_e],
            outputs=[item_selector_e],
        )
        # Step 4: show the selector's current item
        chain = chain.then(
            fn=show_selected_item,
            inputs=[item_selector_e, out_dir_state],
            outputs=[tables_preview_e, image_e],
        )
        # Step 5: pick the VLM or gallery column
        chain = chain.then(
            fn=update_content_visibility,
            inputs=[use_vlm_e],
            outputs=[vlm_content, non_vlm_content],
        )
        # Step 6: fill the gallery from the produced files
        chain.then(
            fn=populate_scrollable_gallery,
            inputs=[files_out_e, target],
            outputs=[scrollable_gallery_e],
        )

        # Re-render whenever a different item is picked
        item_selector_e.change(
            fn=show_selected_item,
            inputs=[item_selector_e, out_dir_state],
            outputs=[tables_preview_e, image_e],
        )

    # Expose every component by name for external access
    state_vars = dict(
        pdf_e=pdf_e,
        target=target,
        use_vlm_e=use_vlm_e,
        vlm_provider_e=vlm_provider_e,
        vlm_api_key_e=vlm_api_key_e,
        layout_model_e=layout_model_e,
        dpi_e=dpi_e,
        min_score_e=min_score_e,
        run_btn_e=run_btn_e,
        status_e=status_e,
        item_selector_e=item_selector_e,
        tables_preview_e=tables_preview_e,
        image_e=image_e,
        files_out_e=files_out_e,
        zip_out_e=zip_out_e,
        out_dir_state=out_dir_state,
        vlm_content=vlm_content,
        non_vlm_content=non_vlm_content,
        scrollable_gallery_e=scrollable_gallery_e,
    )

    return tab, state_vars