doctra 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/cli/main.py +5 -12
- doctra/cli/utils.py +2 -3
- doctra/engines/image_restoration/docres_engine.py +2 -7
- doctra/engines/vlm/outlines_types.py +13 -9
- doctra/engines/vlm/service.py +4 -2
- doctra/exporters/excel_writer.py +89 -0
- doctra/parsers/enhanced_pdf_parser.py +18 -14
- doctra/parsers/structured_pdf_parser.py +6 -0
- doctra/parsers/table_chart_extractor.py +6 -0
- doctra/ui/app.py +8 -14
- doctra/utils/structured_utils.py +5 -2
- doctra/version.py +1 -1
- {doctra-0.4.0.dist-info → doctra-0.4.1.dist-info}/METADATA +1 -1
- {doctra-0.4.0.dist-info → doctra-0.4.1.dist-info}/RECORD +17 -17
- {doctra-0.4.0.dist-info → doctra-0.4.1.dist-info}/WHEEL +0 -0
- {doctra-0.4.0.dist-info → doctra-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.4.0.dist-info → doctra-0.4.1.dist-info}/top_level.txt +0 -0
doctra/cli/main.py
CHANGED
@@ -9,6 +9,7 @@ detection results, and analyze document structure from the command line.
|
|
9
9
|
import click
|
10
10
|
import os
|
11
11
|
import sys
|
12
|
+
import traceback
|
12
13
|
from pathlib import Path
|
13
14
|
from typing import Optional
|
14
15
|
|
@@ -25,6 +26,10 @@ except ImportError:
|
|
25
26
|
from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
|
26
27
|
from doctra.parsers.table_chart_extractor import ChartTablePDFParser
|
27
28
|
|
29
|
+
# Import additional modules
|
30
|
+
from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
|
31
|
+
from doctra.engines.image_restoration import DocResEngine
|
32
|
+
|
28
33
|
|
29
34
|
@click.group(invoke_without_command=True)
|
30
35
|
@click.pass_context
|
@@ -247,7 +252,6 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
|
|
247
252
|
except Exception as e:
|
248
253
|
click.echo(f"❌ Error initializing parser: {e}", err=True)
|
249
254
|
if verbose:
|
250
|
-
import traceback
|
251
255
|
click.echo(traceback.format_exc(), err=True)
|
252
256
|
sys.exit(1)
|
253
257
|
|
@@ -271,7 +275,6 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
|
|
271
275
|
except Exception as e:
|
272
276
|
click.echo(f"❌ Error during parsing: {e}", err=True)
|
273
277
|
if verbose:
|
274
|
-
import traceback
|
275
278
|
click.echo(traceback.format_exc(), err=True)
|
276
279
|
sys.exit(1)
|
277
280
|
finally:
|
@@ -394,7 +397,6 @@ def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
|
|
394
397
|
except Exception as e:
|
395
398
|
click.echo(f"❌ Error initializing enhanced parser: {e}", err=True)
|
396
399
|
if verbose:
|
397
|
-
import traceback
|
398
400
|
click.echo(traceback.format_exc(), err=True)
|
399
401
|
sys.exit(1)
|
400
402
|
|
@@ -418,7 +420,6 @@ def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
|
|
418
420
|
except Exception as e:
|
419
421
|
click.echo(f"❌ Error during enhanced parsing: {e}", err=True)
|
420
422
|
if verbose:
|
421
|
-
import traceback
|
422
423
|
click.echo(traceback.format_exc(), err=True)
|
423
424
|
sys.exit(1)
|
424
425
|
finally:
|
@@ -526,7 +527,6 @@ def charts(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
|
|
526
527
|
except Exception as e:
|
527
528
|
click.echo(f"❌ Error during chart extraction: {e}", err=True)
|
528
529
|
if verbose:
|
529
|
-
import traceback
|
530
530
|
click.echo(traceback.format_exc(), err=True)
|
531
531
|
sys.exit(1)
|
532
532
|
|
@@ -604,7 +604,6 @@ def tables(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
|
|
604
604
|
except Exception as e:
|
605
605
|
click.echo(f"❌ Error during table extraction: {e}", err=True)
|
606
606
|
if verbose:
|
607
|
-
import traceback
|
608
607
|
click.echo(traceback.format_exc(), err=True)
|
609
608
|
sys.exit(1)
|
610
609
|
|
@@ -683,7 +682,6 @@ def both(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
|
|
683
682
|
except Exception as e:
|
684
683
|
click.echo(f"❌ Error during extraction: {e}", err=True)
|
685
684
|
if verbose:
|
686
|
-
import traceback
|
687
685
|
click.echo(traceback.format_exc(), err=True)
|
688
686
|
sys.exit(1)
|
689
687
|
|
@@ -772,7 +770,6 @@ def visualize(pdf_path: Path, pages: int, columns: int, width: int,
|
|
772
770
|
except Exception as e:
|
773
771
|
click.echo(f"❌ Error creating visualization: {e}", err=True)
|
774
772
|
if verbose:
|
775
|
-
import traceback
|
776
773
|
click.echo(traceback.format_exc(), err=True)
|
777
774
|
sys.exit(1)
|
778
775
|
|
@@ -805,7 +802,6 @@ def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbo
|
|
805
802
|
click.echo(f"🔍 Analyzing: {pdf_path.name}")
|
806
803
|
|
807
804
|
# Create layout engine for analysis only
|
808
|
-
from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
|
809
805
|
|
810
806
|
if verbose:
|
811
807
|
click.echo(f" Using model: {layout_model}")
|
@@ -903,7 +899,6 @@ def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbo
|
|
903
899
|
except Exception as e:
|
904
900
|
click.echo(f"❌ Error analyzing PDF: {e}", err=True)
|
905
901
|
if verbose:
|
906
|
-
import traceback
|
907
902
|
click.echo(traceback.format_exc(), err=True)
|
908
903
|
sys.exit(1)
|
909
904
|
|
@@ -922,7 +917,6 @@ def info():
|
|
922
917
|
click.echo("=" * 50)
|
923
918
|
|
924
919
|
# Check Python version
|
925
|
-
import sys
|
926
920
|
python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
|
927
921
|
click.echo(f"Python version: {python_version}")
|
928
922
|
|
@@ -1003,7 +997,6 @@ def info():
|
|
1003
997
|
# DocRes information
|
1004
998
|
click.echo("\nDocRes Image Restoration:")
|
1005
999
|
try:
|
1006
|
-
from doctra.engines.image_restoration import DocResEngine
|
1007
1000
|
docres = DocResEngine()
|
1008
1001
|
click.echo(f" ✅ DocRes available - {len(docres.get_supported_tasks())} restoration tasks")
|
1009
1002
|
click.echo(" Tasks: dewarping, deshadowing, appearance, deblurring, binarization, end2end")
|
doctra/cli/utils.py
CHANGED
@@ -7,8 +7,10 @@ different CLI commands.
|
|
7
7
|
|
8
8
|
import click
|
9
9
|
import sys
|
10
|
+
import traceback
|
10
11
|
from typing import Optional, Dict, Any
|
11
12
|
from pathlib import Path
|
13
|
+
from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
|
12
14
|
|
13
15
|
|
14
16
|
def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str]) -> None:
|
@@ -58,7 +60,6 @@ def handle_exception(e: Exception, verbose: bool = False) -> None:
|
|
58
60
|
"""
|
59
61
|
click.echo(f"❌ Error: {e}", err=True)
|
60
62
|
if verbose:
|
61
|
-
import traceback
|
62
63
|
click.echo(traceback.format_exc(), err=True)
|
63
64
|
sys.exit(1)
|
64
65
|
|
@@ -271,8 +272,6 @@ def create_progress_callback(description: str, total: int):
|
|
271
272
|
:return: Callable progress callback function that takes an integer
|
272
273
|
representing the number of completed items
|
273
274
|
"""
|
274
|
-
import sys
|
275
|
-
from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
|
276
275
|
|
277
276
|
# Enhanced environment detection
|
278
277
|
is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
|
@@ -18,6 +18,8 @@ import sys
|
|
18
18
|
import cv2
|
19
19
|
import numpy as np
|
20
20
|
import torch
|
21
|
+
import tempfile
|
22
|
+
import time
|
21
23
|
from pathlib import Path
|
22
24
|
from typing import Union, List, Tuple, Optional, Dict, Any
|
23
25
|
|
@@ -308,8 +310,6 @@ class DocResEngine:
|
|
308
310
|
|
309
311
|
def _run_single_task(self, img_array: np.ndarray, task: str, save_prompts: bool) -> Tuple[np.ndarray, Dict]:
|
310
312
|
"""Run a single restoration task"""
|
311
|
-
import tempfile
|
312
|
-
import time
|
313
313
|
|
314
314
|
# Create temporary file for inference
|
315
315
|
with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp_file:
|
@@ -322,7 +322,6 @@ class DocResEngine:
|
|
322
322
|
os.chdir(str(docres_dir))
|
323
323
|
|
324
324
|
# Set global DEVICE variable that DocRes inference expects
|
325
|
-
import torch
|
326
325
|
import inference # Import the inference module to set its global DEVICE
|
327
326
|
inference.DEVICE = self.device
|
328
327
|
|
@@ -364,8 +363,6 @@ class DocResEngine:
|
|
364
363
|
|
365
364
|
def _run_end2end_pipeline(self, img_array: np.ndarray, save_prompts: bool) -> Tuple[np.ndarray, Dict]:
|
366
365
|
"""Run the end2end pipeline: dewarping → deshadowing → appearance"""
|
367
|
-
import tempfile
|
368
|
-
import time
|
369
366
|
|
370
367
|
intermediate_steps = {}
|
371
368
|
|
@@ -374,7 +371,6 @@ class DocResEngine:
|
|
374
371
|
os.chdir(str(docres_dir))
|
375
372
|
|
376
373
|
# Set global DEVICE variable that DocRes inference expects
|
377
|
-
import torch
|
378
374
|
import inference # Import the inference module to set its global DEVICE
|
379
375
|
inference.DEVICE = self.device
|
380
376
|
|
@@ -482,7 +478,6 @@ class DocResEngine:
|
|
482
478
|
"""
|
483
479
|
try:
|
484
480
|
from PIL import Image
|
485
|
-
import numpy as np
|
486
481
|
from doctra.utils.pdf_io import render_pdf_to_images
|
487
482
|
|
488
483
|
# Generate output path if not provided
|
@@ -1,17 +1,19 @@
|
|
1
|
-
from pydantic import BaseModel
|
1
|
+
from pydantic import BaseModel, Field
|
2
2
|
|
3
3
|
class Chart(BaseModel):
|
4
4
|
"""
|
5
5
|
Structured representation of a chart extracted from an image.
|
6
6
|
|
7
|
-
|
8
|
-
using VLM (Vision Language Model) processing.
|
7
|
+
Includes a title, a short description, column headers, and data rows
|
8
|
+
identified using VLM (Vision Language Model) processing.
|
9
9
|
|
10
|
-
:param title: Title or caption of the chart
|
10
|
+
:param title: Title or caption of the chart (max 31 characters)
|
11
|
+
:param description: Short description of the chart (max 300 characters)
|
11
12
|
:param headers: Column headers for the chart data
|
12
13
|
:param rows: Data rows containing the chart values
|
13
14
|
"""
|
14
|
-
title: str
|
15
|
+
title: str = Field(max_length=31)
|
16
|
+
description: str = Field(max_length=300)
|
15
17
|
headers: list[str]
|
16
18
|
rows: list[list[str]]
|
17
19
|
|
@@ -19,13 +21,15 @@ class Table(BaseModel):
|
|
19
21
|
"""
|
20
22
|
Structured representation of a table extracted from an image.
|
21
23
|
|
22
|
-
|
23
|
-
using VLM (Vision Language Model) processing.
|
24
|
+
Includes a title, a short description, column headers, and data rows
|
25
|
+
identified using VLM (Vision Language Model) processing.
|
24
26
|
|
25
|
-
:param title: Title or caption of the table
|
27
|
+
:param title: Title or caption of the table (max 31 characters)
|
28
|
+
:param description: Short description of the table (max 300 characters)
|
26
29
|
:param headers: Column headers for the table data
|
27
30
|
:param rows: Data rows containing the table values
|
28
31
|
"""
|
29
|
-
title: str
|
32
|
+
title: str = Field(max_length=31)
|
33
|
+
description: str = Field(max_length=300)
|
30
34
|
headers: list[str]
|
31
35
|
rows: list[list[str]]
|
doctra/engines/vlm/service.py
CHANGED
@@ -73,7 +73,7 @@ class VLMStructuredExtractor:
|
|
73
73
|
Extract structured chart data from an image.
|
74
74
|
|
75
75
|
:param image_path: Path to the chart image file
|
76
|
-
:return: Chart object containing extracted title, headers, and data rows
|
76
|
+
:return: Chart object containing extracted title, description, headers, and data rows
|
77
77
|
:raises Exception: If image processing or VLM extraction fails
|
78
78
|
"""
|
79
79
|
prompt_text = (
|
@@ -81,6 +81,7 @@ class VLMStructuredExtractor:
|
|
81
81
|
"If the title is not present in the image, generate a suitable title. "
|
82
82
|
"Ensure that the table represents the data from the chart accurately."
|
83
83
|
"The number of columns in the headers must match the number of columns in each row."
|
84
|
+
"Also provide a short description (max 300 characters) of the chart."
|
84
85
|
)
|
85
86
|
return self._call(prompt_text, image_path, Chart)
|
86
87
|
|
@@ -89,7 +90,7 @@ class VLMStructuredExtractor:
|
|
89
90
|
Extract structured table data from an image.
|
90
91
|
|
91
92
|
:param image_path: Path to the table image file
|
92
|
-
:return: Table object containing extracted title, headers, and data rows
|
93
|
+
:return: Table object containing extracted title, description, headers, and data rows
|
93
94
|
:raises Exception: If image processing or VLM extraction fails
|
94
95
|
"""
|
95
96
|
prompt_text = (
|
@@ -97,5 +98,6 @@ class VLMStructuredExtractor:
|
|
97
98
|
"Provide the headers and rows of the table, ensuring accuracy in the extraction. "
|
98
99
|
"If the title is not present in the image, generate a suitable title."
|
99
100
|
"The number of columns in the headers must match the number of columns in each row."
|
101
|
+
"Also provide a short description (max 300 characters) of the table."
|
100
102
|
)
|
101
103
|
return self._call(prompt_text, image_path, Table)
|
doctra/exporters/excel_writer.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Dict, Any, List, Set
|
|
5
5
|
import pandas as pd # pip install pandas openpyxl
|
6
6
|
from openpyxl.styles import PatternFill, Font, Alignment
|
7
7
|
from openpyxl.utils import get_column_letter
|
8
|
+
from openpyxl.worksheet.hyperlink import Hyperlink
|
8
9
|
|
9
10
|
_INVALID_SHEET_CHARS = r'[:\\/*?\[\]]' # Excel-invalid characters
|
10
11
|
_MAX_SHEET_LEN = 31
|
@@ -85,6 +86,61 @@ def _autosize_columns(ws, df: pd.DataFrame) -> None:
|
|
85
86
|
ws.column_dimensions[get_column_letter(i)].width = min(max(10, max_len + 2), 60)
|
86
87
|
|
87
88
|
|
89
|
+
def _style_summary_sheet(ws, df: pd.DataFrame, sheet_mapping: dict = None) -> None:
|
90
|
+
"""
|
91
|
+
Apply special styling to the summary sheet with text wrapping for descriptions.
|
92
|
+
Add hyperlinks to table titles that link to their corresponding sheets.
|
93
|
+
|
94
|
+
:param ws: OpenPyXL worksheet object to style
|
95
|
+
:param df: Pandas DataFrame containing the summary data
|
96
|
+
:param sheet_mapping: Dictionary mapping table titles to their sheet names
|
97
|
+
:return: None
|
98
|
+
"""
|
99
|
+
# Style header row
|
100
|
+
_style_header(ws, ncols=df.shape[1])
|
101
|
+
|
102
|
+
# Apply text wrapping to all data cells
|
103
|
+
wrap_alignment = Alignment(wrap_text=True, vertical="top")
|
104
|
+
|
105
|
+
# Apply wrapping to all data rows (skip header row)
|
106
|
+
for row_idx in range(2, len(df) + 2): # Start from row 2 (after header)
|
107
|
+
for col_idx in range(1, df.shape[1] + 1):
|
108
|
+
cell = ws.cell(row=row_idx, column=col_idx)
|
109
|
+
cell.alignment = wrap_alignment
|
110
|
+
|
111
|
+
# Add hyperlink to table title column (column A)
|
112
|
+
if col_idx == 1 and sheet_mapping: # Table Title column
|
113
|
+
table_title = cell.value
|
114
|
+
if table_title and table_title in sheet_mapping:
|
115
|
+
sheet_name = sheet_mapping[table_title]
|
116
|
+
|
117
|
+
# Create hyperlink to the sheet using proper Excel format
|
118
|
+
# Escape sheet name if it contains spaces or special characters
|
119
|
+
if ' ' in sheet_name or any(char in sheet_name for char in ['[', ']', '*', '?', ':', '\\', '/']):
|
120
|
+
hyperlink_ref = f"#'{sheet_name}'!A1"
|
121
|
+
else:
|
122
|
+
hyperlink_ref = f"#{sheet_name}!A1"
|
123
|
+
|
124
|
+
# Use Hyperlink class with proper parameters
|
125
|
+
cell.hyperlink = Hyperlink(ref=hyperlink_ref, target=hyperlink_ref)
|
126
|
+
# Style the hyperlink
|
127
|
+
cell.font = Font(color="0000FF", underline="single")
|
128
|
+
|
129
|
+
# Set specific column widths for summary sheet
|
130
|
+
# Table Title column - narrower
|
131
|
+
ws.column_dimensions['A'].width = 30
|
132
|
+
# Description column - wider to accommodate wrapped text
|
133
|
+
ws.column_dimensions['B'].width = 60
|
134
|
+
# Page column - narrow for page numbers
|
135
|
+
ws.column_dimensions['C'].width = 10
|
136
|
+
# Type column - narrow for Table/Chart
|
137
|
+
ws.column_dimensions['D'].width = 12
|
138
|
+
|
139
|
+
# Set row heights to accommodate wrapped text
|
140
|
+
for row_idx in range(2, len(df) + 2):
|
141
|
+
ws.row_dimensions[row_idx].height = 60 # Allow for multiple lines
|
142
|
+
|
143
|
+
|
88
144
|
def _normalize_data(headers: List[str], rows: List[List]) -> tuple[List[str], List[List]]:
|
89
145
|
"""
|
90
146
|
Normalize headers and rows to ensure consistent dimensions.
|
@@ -159,6 +215,31 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
|
|
159
215
|
taken: Set[str] = set()
|
160
216
|
|
161
217
|
with pd.ExcelWriter(excel_path, engine="openpyxl", mode="w") as writer:
|
218
|
+
# Create summary sheet first
|
219
|
+
summary_data = []
|
220
|
+
sheet_mapping = {} # Map table titles to their sheet names
|
221
|
+
|
222
|
+
for item in valid_items:
|
223
|
+
title = item.get("title") or "Untitled"
|
224
|
+
description = item.get("description") or "No description available"
|
225
|
+
page_number = item.get("page", "Unknown")
|
226
|
+
item_type = item.get("type", "Table") # Default to "Table" if not specified
|
227
|
+
|
228
|
+
|
229
|
+
summary_data.append({
|
230
|
+
"Table Title": title,
|
231
|
+
"Description": description,
|
232
|
+
"Page": page_number,
|
233
|
+
"Type": item_type
|
234
|
+
})
|
235
|
+
|
236
|
+
# Create summary sheet first (but without hyperlinks initially)
|
237
|
+
if summary_data:
|
238
|
+
summary_df = pd.DataFrame(summary_data)
|
239
|
+
summary_df.to_excel(writer, sheet_name="Table Summary", index=False)
|
240
|
+
taken.add("Table Summary")
|
241
|
+
|
242
|
+
# Process individual table sheets to build sheet mapping
|
162
243
|
for item in valid_items:
|
163
244
|
try:
|
164
245
|
title = item.get("title") or "Untitled"
|
@@ -166,6 +247,9 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
|
|
166
247
|
rows = item.get("rows") or []
|
167
248
|
|
168
249
|
sheet_name = _safe_sheet_name(title, taken)
|
250
|
+
|
251
|
+
# Add to sheet mapping for hyperlinks
|
252
|
+
sheet_mapping[title] = sheet_name
|
169
253
|
|
170
254
|
# Normalize data to handle mismatched dimensions
|
171
255
|
normalized_headers, normalized_rows = _normalize_data(headers, rows)
|
@@ -194,4 +278,9 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
|
|
194
278
|
print(f"Error processing item '{item.get('title', 'Unknown')}': {e}")
|
195
279
|
continue
|
196
280
|
|
281
|
+
# Now add hyperlinks to the summary sheet (after all sheets are created)
|
282
|
+
if summary_data and sheet_mapping:
|
283
|
+
summary_ws = writer.sheets["Table Summary"]
|
284
|
+
_style_summary_sheet(summary_ws, summary_df, sheet_mapping)
|
285
|
+
|
197
286
|
return excel_path
|
@@ -8,6 +8,7 @@ capabilities with DocRes image restoration for improved document processing.
|
|
8
8
|
from __future__ import annotations
|
9
9
|
import os
|
10
10
|
import sys
|
11
|
+
import numpy as np
|
11
12
|
from typing import List, Dict, Any, Optional, Union
|
12
13
|
from contextlib import ExitStack
|
13
14
|
from PIL import Image
|
@@ -16,9 +17,17 @@ from tqdm import tqdm
|
|
16
17
|
from doctra.parsers.structured_pdf_parser import StructuredPDFParser
|
17
18
|
from doctra.engines.image_restoration import DocResEngine
|
18
19
|
from doctra.utils.pdf_io import render_pdf_to_images
|
19
|
-
from doctra.utils.constants import IMAGE_SUBDIRS
|
20
|
+
from doctra.utils.constants import IMAGE_SUBDIRS, EXCLUDE_LABELS
|
20
21
|
from doctra.utils.file_ops import ensure_output_dirs
|
21
22
|
from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
|
23
|
+
from doctra.parsers.layout_order import reading_order_key
|
24
|
+
from doctra.utils.ocr_utils import ocr_box_text
|
25
|
+
from doctra.exporters.image_saver import save_box_image
|
26
|
+
from doctra.exporters.markdown_writer import write_markdown
|
27
|
+
from doctra.exporters.html_writer import write_html, write_structured_html
|
28
|
+
from doctra.exporters.excel_writer import write_structured_excel
|
29
|
+
from doctra.utils.structured_utils import to_structured_dict
|
30
|
+
from doctra.exporters.markdown_table import render_markdown_table
|
22
31
|
|
23
32
|
|
24
33
|
class EnhancedPDFParser(StructuredPDFParser):
|
@@ -146,7 +155,7 @@ class EnhancedPDFParser(StructuredPDFParser):
|
|
146
155
|
pil_pages = enhanced_pages
|
147
156
|
|
148
157
|
# Continue with standard parsing logic
|
149
|
-
self._process_parsing_logic(pages, pil_pages, out_dir, pdf_filename)
|
158
|
+
self._process_parsing_logic(pages, pil_pages, out_dir, pdf_filename, pdf_path)
|
150
159
|
|
151
160
|
def _process_pages_with_restoration(self, pdf_path: str, out_dir: str) -> List[Image.Image]:
|
152
161
|
"""
|
@@ -186,7 +195,6 @@ class EnhancedPDFParser(StructuredPDFParser):
|
|
186
195
|
for i, page_img in enumerate(original_pages):
|
187
196
|
try:
|
188
197
|
# Convert PIL to numpy array
|
189
|
-
import numpy as np
|
190
198
|
img_array = np.array(page_img)
|
191
199
|
|
192
200
|
# Apply DocRes restoration
|
@@ -219,21 +227,11 @@ class EnhancedPDFParser(StructuredPDFParser):
|
|
219
227
|
print(f"✅ Image restoration completed. Enhanced pages saved to: {enhanced_dir}")
|
220
228
|
return enhanced_pages
|
221
229
|
|
222
|
-
def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename):
|
230
|
+
def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename, pdf_path):
|
223
231
|
"""
|
224
232
|
Process the parsing logic with enhanced pages.
|
225
233
|
This is extracted from the parent class to allow customization.
|
226
234
|
"""
|
227
|
-
from doctra.utils.constants import EXCLUDE_LABELS
|
228
|
-
from doctra.parsers.layout_order import reading_order_key
|
229
|
-
from doctra.utils.ocr_utils import ocr_box_text
|
230
|
-
from doctra.exporters.image_saver import save_box_image
|
231
|
-
from doctra.exporters.markdown_writer import write_markdown
|
232
|
-
from doctra.exporters.html_writer import write_html
|
233
|
-
from doctra.exporters.excel_writer import write_structured_excel
|
234
|
-
from doctra.exporters.html_writer import write_structured_html
|
235
|
-
from doctra.utils.structured_utils import to_structured_dict
|
236
|
-
from doctra.exporters.markdown_table import render_markdown_table
|
237
235
|
|
238
236
|
fig_count = sum(sum(1 for b in p.boxes if b.label == "figure") for p in pages)
|
239
237
|
chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages)
|
@@ -285,6 +283,9 @@ class EnhancedPDFParser(StructuredPDFParser):
|
|
285
283
|
chart = self.vlm.extract_chart(abs_img_path)
|
286
284
|
item = to_structured_dict(chart)
|
287
285
|
if item:
|
286
|
+
# Add page and type information to structured item
|
287
|
+
item["page"] = page_num
|
288
|
+
item["type"] = "Chart"
|
288
289
|
structured_items.append(item)
|
289
290
|
md_lines.append(
|
290
291
|
render_markdown_table(item.get("headers"), item.get("rows"),
|
@@ -306,6 +307,9 @@ class EnhancedPDFParser(StructuredPDFParser):
|
|
306
307
|
table = self.vlm.extract_table(abs_img_path)
|
307
308
|
item = to_structured_dict(table)
|
308
309
|
if item:
|
310
|
+
# Add page and type information to structured item
|
311
|
+
item["page"] = page_num
|
312
|
+
item["type"] = "Table"
|
309
313
|
structured_items.append(item)
|
310
314
|
md_lines.append(
|
311
315
|
render_markdown_table(item.get("headers"), item.get("rows"),
|
@@ -163,6 +163,9 @@ class StructuredPDFParser:
|
|
163
163
|
chart = self.vlm.extract_chart(abs_img_path)
|
164
164
|
item = to_structured_dict(chart)
|
165
165
|
if item:
|
166
|
+
# Add page and type information to structured item
|
167
|
+
item["page"] = page_num
|
168
|
+
item["type"] = "Chart"
|
166
169
|
structured_items.append(item)
|
167
170
|
md_lines.append(
|
168
171
|
render_markdown_table(item.get("headers"), item.get("rows"),
|
@@ -184,6 +187,9 @@ class StructuredPDFParser:
|
|
184
187
|
table = self.vlm.extract_table(abs_img_path)
|
185
188
|
item = to_structured_dict(table)
|
186
189
|
if item:
|
190
|
+
# Add page and type information to structured item
|
191
|
+
item["page"] = page_num
|
192
|
+
item["type"] = "Table"
|
187
193
|
structured_items.append(item)
|
188
194
|
md_lines.append(
|
189
195
|
render_markdown_table(item.get("headers"), item.get("rows"),
|
@@ -178,6 +178,9 @@ class ChartTablePDFParser:
|
|
178
178
|
extracted_chart = self.vlm.extract_chart(chart_path)
|
179
179
|
structured_item = to_structured_dict(extracted_chart)
|
180
180
|
if structured_item:
|
181
|
+
# Add page and type information to structured item
|
182
|
+
structured_item["page"] = page_num
|
183
|
+
structured_item["type"] = "Chart"
|
181
184
|
structured_items.append(structured_item)
|
182
185
|
vlm_items.append({
|
183
186
|
"kind": "chart",
|
@@ -221,6 +224,9 @@ class ChartTablePDFParser:
|
|
221
224
|
extracted_table = self.vlm.extract_table(table_path)
|
222
225
|
structured_item = to_structured_dict(extracted_table)
|
223
226
|
if structured_item:
|
227
|
+
# Add page and type information to structured item
|
228
|
+
structured_item["page"] = page_num
|
229
|
+
structured_item["type"] = "Table"
|
224
230
|
structured_items.append(structured_item)
|
225
231
|
vlm_items.append({
|
226
232
|
"kind": "table",
|
doctra/ui/app.py
CHANGED
@@ -2,6 +2,11 @@ import os
|
|
2
2
|
import shutil
|
3
3
|
import tempfile
|
4
4
|
import re
|
5
|
+
import traceback
|
6
|
+
import pandas as pd
|
7
|
+
import html as _html
|
8
|
+
import base64
|
9
|
+
import json
|
5
10
|
from pathlib import Path
|
6
11
|
from typing import Optional, Tuple, List, Dict, Any
|
7
12
|
|
@@ -9,6 +14,7 @@ import gradio as gr
|
|
9
14
|
|
10
15
|
from doctra.parsers.structured_pdf_parser import StructuredPDFParser
|
11
16
|
from doctra.parsers.table_chart_extractor import ChartTablePDFParser
|
17
|
+
from doctra.utils.pdf_io import render_pdf_to_images
|
12
18
|
|
13
19
|
|
14
20
|
def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zip_filename: Optional[str] = None, is_structured_parsing: bool = False) -> Tuple[List[tuple[str, str]], List[str], str]:
|
@@ -100,7 +106,6 @@ def _parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
|
|
100
106
|
Parse markdown content and organize it by pages.
|
101
107
|
Returns a list of page dictionaries with content, tables, charts, and figures.
|
102
108
|
"""
|
103
|
-
import re
|
104
109
|
|
105
110
|
pages = []
|
106
111
|
current_page = None
|
@@ -209,7 +214,6 @@ def run_full_parse(
|
|
209
214
|
try:
|
210
215
|
parser.parse(str(input_pdf))
|
211
216
|
except Exception as e:
|
212
|
-
import traceback
|
213
217
|
traceback.print_exc()
|
214
218
|
# Safely encode error message for return value
|
215
219
|
try:
|
@@ -325,8 +329,6 @@ def run_extract(
|
|
325
329
|
if excel_filename:
|
326
330
|
excel_path = out_dir / excel_filename
|
327
331
|
if excel_path.exists():
|
328
|
-
import pandas as pd
|
329
|
-
import html as _html
|
330
332
|
|
331
333
|
# Read Excel file and create HTML tables
|
332
334
|
xl_file = pd.ExcelFile(excel_path)
|
@@ -489,7 +491,6 @@ def build_demo() -> gr.Blocks:
|
|
489
491
|
|
490
492
|
def parse_markdown_by_pages(md_content: str):
|
491
493
|
"""Parse markdown content and organize it by pages."""
|
492
|
-
import re
|
493
494
|
|
494
495
|
pages = []
|
495
496
|
current_page = None
|
@@ -548,7 +549,6 @@ def build_demo() -> gr.Blocks:
|
|
548
549
|
return "Page not found", None
|
549
550
|
|
550
551
|
# Build HTML with inline base64 images, render markdown tables, and preserve paragraphs/line breaks
|
551
|
-
import html as _html, base64, re as _re
|
552
552
|
base_dir = None
|
553
553
|
try:
|
554
554
|
stem = Path(pdf_path).stem if pdf_path else ""
|
@@ -589,7 +589,7 @@ def build_demo() -> gr.Blocks:
|
|
589
589
|
stripped = line.strip()
|
590
590
|
if stripped.startswith(':
|
591
591
|
flush_paragraph()
|
592
|
-
match = _re.match(r'!\[([^\]]+)\]\(([^)]+)\)', stripped)
|
592
|
+
match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', stripped)
|
593
593
|
if match and base_dir is not None:
|
594
594
|
caption = match.group(1)
|
595
595
|
rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
|
@@ -646,7 +646,6 @@ def build_demo() -> gr.Blocks:
|
|
646
646
|
# Ensure page images are prepared
|
647
647
|
try:
|
648
648
|
if pdf_path and not page_images:
|
649
|
-
from doctra.utils.pdf_io import render_pdf_to_images
|
650
649
|
tmp_img_dir = Path(tempfile.mkdtemp(prefix="doctra_pages_"))
|
651
650
|
pil_pages = render_pdf_to_images(pdf_path)
|
652
651
|
saved_paths: List[str] = []
|
@@ -726,7 +725,6 @@ def build_demo() -> gr.Blocks:
|
|
726
725
|
for page in pages_data:
|
727
726
|
for line in page['content']:
|
728
727
|
if line.strip().startswith(':
|
729
|
-
import re
|
730
728
|
match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line.strip())
|
731
729
|
if match:
|
732
730
|
caption = match.group(1)
|
@@ -745,7 +743,6 @@ def build_demo() -> gr.Blocks:
|
|
745
743
|
saved_paths: List[str] = []
|
746
744
|
try:
|
747
745
|
if input_pdf_path:
|
748
|
-
from doctra.utils.pdf_io import render_pdf_to_images
|
749
746
|
tmp_img_dir = Path(tempfile.mkdtemp(prefix="doctra_pages_"))
|
750
747
|
pil_pages = render_pdf_to_images(input_pdf_path)
|
751
748
|
for idx, (im, _, _) in enumerate(pil_pages, start=1):
|
@@ -759,7 +756,6 @@ def build_demo() -> gr.Blocks:
|
|
759
756
|
|
760
757
|
# Build initial HTML with inline images and proper blocks for first page
|
761
758
|
if pages_data:
|
762
|
-
import html as _html, base64, re as _re
|
763
759
|
base_dir = None
|
764
760
|
try:
|
765
761
|
stem = Path(input_pdf_path).stem if input_pdf_path else ""
|
@@ -771,7 +767,7 @@ def build_demo() -> gr.Blocks:
|
|
771
767
|
for raw_line in pages_data[0]['content']:
|
772
768
|
line = raw_line.strip()
|
773
769
|
if line.startswith(':
|
774
|
-
match = _re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line)
|
770
|
+
match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line)
|
775
771
|
if match and base_dir is not None:
|
776
772
|
caption = match.group(1)
|
777
773
|
rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
|
@@ -874,7 +870,6 @@ def build_demo() -> gr.Blocks:
|
|
874
870
|
if not mapping.exists():
|
875
871
|
return gr.Dropdown(choices=[], value=None, visible=False)
|
876
872
|
|
877
|
-
import json
|
878
873
|
data = json.loads(mapping.read_text(encoding="utf-8"))
|
879
874
|
choices = []
|
880
875
|
|
@@ -902,7 +897,6 @@ def build_demo() -> gr.Blocks:
|
|
902
897
|
if not mapping.exists():
|
903
898
|
return "", None
|
904
899
|
|
905
|
-
import json, html as _html
|
906
900
|
data = json.loads(mapping.read_text(encoding="utf-8"))
|
907
901
|
|
908
902
|
for entry in data:
|
doctra/utils/structured_utils.py
CHANGED
@@ -14,7 +14,7 @@ def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
|
|
14
14
|
- JSON string
|
15
15
|
- dict
|
16
16
|
- Pydantic BaseModel (v1 .dict() or v2 .model_dump())
|
17
|
-
Returns a normalized dict with keys: title, headers, rows — or None.
|
17
|
+
Returns a normalized dict with keys: title, description, headers, rows, page, type — or None.
|
18
18
|
"""
|
19
19
|
if obj is None:
|
20
20
|
return None
|
@@ -36,10 +36,13 @@ def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
|
|
36
36
|
|
37
37
|
if isinstance(obj, dict):
|
38
38
|
title = obj.get("title") or "Untitled"
|
39
|
+
description = obj.get("description") or ""
|
39
40
|
headers = obj.get("headers") or []
|
40
41
|
rows = obj.get("rows") or []
|
42
|
+
page = obj.get("page", "Unknown")
|
43
|
+
item_type = obj.get("type", "Table")
|
41
44
|
if not isinstance(headers, list) or not isinstance(rows, list):
|
42
45
|
return None
|
43
|
-
return {"title": title, "headers": headers, "rows": rows}
|
46
|
+
return {"title": title, "description": description, "headers": headers, "rows": rows, "page": page, "type": item_type}
|
44
47
|
|
45
48
|
return None
|
doctra/version.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
"""Version information for Doctra."""
|
2
|
-
__version__ = '0.4.0'
|
2
|
+
__version__ = '0.4.1'
|
@@ -1,11 +1,11 @@
|
|
1
1
|
doctra/__init__.py,sha256=rNLCyODOpaPb_TTP6qmQnuWZJW9JPXrxg1IfKnvb1No,773
|
2
|
-
doctra/version.py,sha256=
|
2
|
+
doctra/version.py,sha256=gJX4jQdS3czcKE2h1k17fJPgWzxHyGH2oFP9nW9cTLw,62
|
3
3
|
doctra/cli/__init__.py,sha256=4PTujjYRShOOUlZ7PwuWckShPWLC4v4CYIhJpzgyv1k,911
|
4
|
-
doctra/cli/main.py,sha256=
|
5
|
-
doctra/cli/utils.py,sha256=
|
4
|
+
doctra/cli/main.py,sha256=_gvG8bm-Mn1tIEw6eJUgqz9dYEo9klXGiJDJzjqgPyo,43503
|
5
|
+
doctra/cli/utils.py,sha256=w3Bxyzczcbl_cs1Cea8C3ehv7dkGl_wecprYZXrcGhk,11772
|
6
6
|
doctra/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
7
|
doctra/engines/image_restoration/__init__.py,sha256=vzcN6Rw7_U-5jIK2pdo2NlgqdLdXDShigrOGM7QLNEE,263
|
8
|
-
doctra/engines/image_restoration/docres_engine.py,sha256=
|
8
|
+
doctra/engines/image_restoration/docres_engine.py,sha256=n9Pr0R7dbu_UHv51urGv_wC6ZYW-43bmXxiyTCOEOMo,21612
|
9
9
|
doctra/engines/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
doctra/engines/layout/layout_models.py,sha256=vuTzjWd3FD-SkFPngktmUVhOJ6Xvff6ufwFEq796PQs,3162
|
11
11
|
doctra/engines/layout/paddle_layout.py,sha256=P2-Gk8wHpWoA5Jpmo_3OLI59zWq3HeAOBOUKKVdXu8I,6792
|
@@ -14,20 +14,20 @@ doctra/engines/ocr/api.py,sha256=YOBKDLExXpvSiOsc_TDJasaMPxzdVx1llQCtYlsruWo,128
|
|
14
14
|
doctra/engines/ocr/path_resolver.py,sha256=2_7Nsekt3dCDU3oVsgdr62iMrlAhbGNfYwgh4G7S3pA,1492
|
15
15
|
doctra/engines/ocr/pytesseract_engine.py,sha256=Imz2uwju6himkBiS8CH7DLxBRe-LtmMYZiOdb_6PoQw,2911
|
16
16
|
doctra/engines/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
-
doctra/engines/vlm/outlines_types.py,sha256=
|
17
|
+
doctra/engines/vlm/outlines_types.py,sha256=fQK6ru7XiXHaa8JPpaTTBaTk_zQ93ZyhFp4SyAnUdVU,1337
|
18
18
|
doctra/engines/vlm/provider.py,sha256=aE8Eo1U-8XqAimakNlT0-T4etIyCV8rZ3DwxdqbFeTc,3131
|
19
|
-
doctra/engines/vlm/service.py,sha256=
|
19
|
+
doctra/engines/vlm/service.py,sha256=nygxMe7uTq6Bv70ycBPL59F2a0ESp1Hix4j833p6rUM,4343
|
20
20
|
doctra/exporters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
|
-
doctra/exporters/excel_writer.py,sha256=
|
21
|
+
doctra/exporters/excel_writer.py,sha256=rwyqlH73P7z413BELovQY_pS6IMkkqHEho6mbPrJ2Sk,11857
|
22
22
|
doctra/exporters/html_writer.py,sha256=OlW24Eg5bZcjldRHtd3GDD7RrajuRXj43EJpXIJkYf8,38810
|
23
23
|
doctra/exporters/image_saver.py,sha256=zsPoQ0CwoE643ui4iZMdXk96kv5mU8L_zC2JfF22N1A,1639
|
24
24
|
doctra/exporters/markdown_table.py,sha256=4_OJIwG_WoIPYBzJx1njy_3tNVdkK6QKSP-P9r-b0zw,2030
|
25
25
|
doctra/exporters/markdown_writer.py,sha256=L7EjF2MB8jYX7XkZ3a3NeeEC8gnb0qzRPTzIN9tdfuw,1027
|
26
26
|
doctra/parsers/__init__.py,sha256=8M6LVzcWGpuTIK_1SMXML3ll7zK1CTHXGI5qXvqdm-A,206
|
27
|
-
doctra/parsers/enhanced_pdf_parser.py,sha256=
|
27
|
+
doctra/parsers/enhanced_pdf_parser.py,sha256=NBBopYdSIHWd_O96J0qR3DqZvbAt3CfK1hwUkXu8540,18377
|
28
28
|
doctra/parsers/layout_order.py,sha256=W6b-T11H907RZ2FaZwNvnYhmvH11rpUzxC5yLkdf28k,640
|
29
|
-
doctra/parsers/structured_pdf_parser.py,sha256=
|
30
|
-
doctra/parsers/table_chart_extractor.py,sha256=
|
29
|
+
doctra/parsers/structured_pdf_parser.py,sha256=AU6yLW2kpd8bxZjelmm73L4CVBysnVAdKxwPkTV1Fzk,19602
|
30
|
+
doctra/parsers/table_chart_extractor.py,sha256=ePmk9m9n-mvkqOvxpWC42ElxbnKMmDnq-e6SWiNqgzA,14195
|
31
31
|
doctra/third_party/docres/inference.py,sha256=krD5EQDiqki-5uTMqqHYivhL38sfSOhYgaihI751070,13576
|
32
32
|
doctra/third_party/docres/utils.py,sha256=N0ZVmOTB3wsinFlYu5hT84C4_MhWGdc98T8LTG-S9dA,14566
|
33
33
|
doctra/third_party/docres/data/MBD/MBD.py,sha256=-d6cVQX1FVcGmQ_yJ5l-PQ3xKmkmveQQBytZ38pEGfY,4653
|
@@ -49,7 +49,7 @@ doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.
|
|
49
49
|
doctra/third_party/docres/data/preprocess/crop_merge_image.py,sha256=f2NANY92s6IQ1hl1MAXfftFPIyIrj24O4TONjg7SXEc,4747
|
50
50
|
doctra/third_party/docres/models/restormer_arch.py,sha256=BSwv_odCcp4HUZj3gv21e4IzFRBiyk8FjKAO8kF4YS8,12510
|
51
51
|
doctra/ui/__init__.py,sha256=XzOOKeGSBnUREuDQiCIWds1asFSa2nypFQTJXwclROA,85
|
52
|
-
doctra/ui/app.py,sha256=
|
52
|
+
doctra/ui/app.py,sha256=iFSAVZacL7iHB1SHhcUzperJGNQVWqUhvOYdlgjjt50,43623
|
53
53
|
doctra/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
54
54
|
doctra/utils/bbox.py,sha256=R2-95p0KiWvet3TH27TQVvCar7WJg6z0u3L21iEDF-A,674
|
55
55
|
doctra/utils/constants.py,sha256=ZWOvNDrvETbQ_pxHiX7vUW4J5Oj8_qnov0QacUOBizI,189
|
@@ -59,9 +59,9 @@ doctra/utils/ocr_utils.py,sha256=Doa1uYBg3kRgRYd2aPq9fICHgHfrM_efdhZfI7jl6OM,780
|
|
59
59
|
doctra/utils/pdf_io.py,sha256=c8EY47Z1iqVtlLFHS_n0qGuXJ5ERFaMUd84ivXV0b9E,706
|
60
60
|
doctra/utils/progress.py,sha256=IKQ_YErWSEd4hddYMUiCORy0_kW4TOYJM891HUEq2_E,11901
|
61
61
|
doctra/utils/quiet.py,sha256=5XPS-1CtJ0sVk6qgSQctdhr_wR8mP1xoJLoUbmkXROA,387
|
62
|
-
doctra/utils/structured_utils.py,sha256=
|
63
|
-
doctra-0.4.
|
64
|
-
doctra-0.4.
|
65
|
-
doctra-0.4.
|
66
|
-
doctra-0.4.
|
67
|
-
doctra-0.4.
|
62
|
+
doctra/utils/structured_utils.py,sha256=vU84dsD8wIlTyMsA9hitorGH-eroQiVuWEpBTQBUT24,1478
|
63
|
+
doctra-0.4.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
64
|
+
doctra-0.4.1.dist-info/METADATA,sha256=wXduiq7VJS5vf-TXdxpYFCKGfPyGYr5jGK0mwH3OjUw,28298
|
65
|
+
doctra-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
66
|
+
doctra-0.4.1.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
|
67
|
+
doctra-0.4.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|