doctra 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
doctra/cli/main.py CHANGED
@@ -9,6 +9,7 @@ detection results, and analyze document structure from the command line.
9
9
  import click
10
10
  import os
11
11
  import sys
12
+ import traceback
12
13
  from pathlib import Path
13
14
  from typing import Optional
14
15
 
@@ -25,6 +26,10 @@ except ImportError:
25
26
  from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
26
27
  from doctra.parsers.table_chart_extractor import ChartTablePDFParser
27
28
 
29
+ # Import additional modules
30
+ from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
31
+ from doctra.engines.image_restoration import DocResEngine
32
+
28
33
 
29
34
  @click.group(invoke_without_command=True)
30
35
  @click.pass_context
@@ -247,7 +252,6 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
247
252
  except Exception as e:
248
253
  click.echo(f"❌ Error initializing parser: {e}", err=True)
249
254
  if verbose:
250
- import traceback
251
255
  click.echo(traceback.format_exc(), err=True)
252
256
  sys.exit(1)
253
257
 
@@ -271,7 +275,6 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
271
275
  except Exception as e:
272
276
  click.echo(f"❌ Error during parsing: {e}", err=True)
273
277
  if verbose:
274
- import traceback
275
278
  click.echo(traceback.format_exc(), err=True)
276
279
  sys.exit(1)
277
280
  finally:
@@ -394,7 +397,6 @@ def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
394
397
  except Exception as e:
395
398
  click.echo(f"❌ Error initializing enhanced parser: {e}", err=True)
396
399
  if verbose:
397
- import traceback
398
400
  click.echo(traceback.format_exc(), err=True)
399
401
  sys.exit(1)
400
402
 
@@ -418,7 +420,6 @@ def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
418
420
  except Exception as e:
419
421
  click.echo(f"❌ Error during enhanced parsing: {e}", err=True)
420
422
  if verbose:
421
- import traceback
422
423
  click.echo(traceback.format_exc(), err=True)
423
424
  sys.exit(1)
424
425
  finally:
@@ -526,7 +527,6 @@ def charts(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
526
527
  except Exception as e:
527
528
  click.echo(f"❌ Error during chart extraction: {e}", err=True)
528
529
  if verbose:
529
- import traceback
530
530
  click.echo(traceback.format_exc(), err=True)
531
531
  sys.exit(1)
532
532
 
@@ -604,7 +604,6 @@ def tables(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
604
604
  except Exception as e:
605
605
  click.echo(f"❌ Error during table extraction: {e}", err=True)
606
606
  if verbose:
607
- import traceback
608
607
  click.echo(traceback.format_exc(), err=True)
609
608
  sys.exit(1)
610
609
 
@@ -683,7 +682,6 @@ def both(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
683
682
  except Exception as e:
684
683
  click.echo(f"❌ Error during extraction: {e}", err=True)
685
684
  if verbose:
686
- import traceback
687
685
  click.echo(traceback.format_exc(), err=True)
688
686
  sys.exit(1)
689
687
 
@@ -772,7 +770,6 @@ def visualize(pdf_path: Path, pages: int, columns: int, width: int,
772
770
  except Exception as e:
773
771
  click.echo(f"❌ Error creating visualization: {e}", err=True)
774
772
  if verbose:
775
- import traceback
776
773
  click.echo(traceback.format_exc(), err=True)
777
774
  sys.exit(1)
778
775
 
@@ -805,7 +802,6 @@ def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbo
805
802
  click.echo(f"🔍 Analyzing: {pdf_path.name}")
806
803
 
807
804
  # Create layout engine for analysis only
808
- from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
809
805
 
810
806
  if verbose:
811
807
  click.echo(f" Using model: {layout_model}")
@@ -903,7 +899,6 @@ def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbo
903
899
  except Exception as e:
904
900
  click.echo(f"❌ Error analyzing PDF: {e}", err=True)
905
901
  if verbose:
906
- import traceback
907
902
  click.echo(traceback.format_exc(), err=True)
908
903
  sys.exit(1)
909
904
 
@@ -922,7 +917,6 @@ def info():
922
917
  click.echo("=" * 50)
923
918
 
924
919
  # Check Python version
925
- import sys
926
920
  python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
927
921
  click.echo(f"Python version: {python_version}")
928
922
 
@@ -1003,7 +997,6 @@ def info():
1003
997
  # DocRes information
1004
998
  click.echo("\nDocRes Image Restoration:")
1005
999
  try:
1006
- from doctra.engines.image_restoration import DocResEngine
1007
1000
  docres = DocResEngine()
1008
1001
  click.echo(f" ✅ DocRes available - {len(docres.get_supported_tasks())} restoration tasks")
1009
1002
  click.echo(" Tasks: dewarping, deshadowing, appearance, deblurring, binarization, end2end")
doctra/cli/utils.py CHANGED
@@ -7,8 +7,10 @@ different CLI commands.
7
7
 
8
8
  import click
9
9
  import sys
10
+ import traceback
10
11
  from typing import Optional, Dict, Any
11
12
  from pathlib import Path
13
+ from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
12
14
 
13
15
 
14
16
  def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str]) -> None:
@@ -58,7 +60,6 @@ def handle_exception(e: Exception, verbose: bool = False) -> None:
58
60
  """
59
61
  click.echo(f"❌ Error: {e}", err=True)
60
62
  if verbose:
61
- import traceback
62
63
  click.echo(traceback.format_exc(), err=True)
63
64
  sys.exit(1)
64
65
 
@@ -271,8 +272,6 @@ def create_progress_callback(description: str, total: int):
271
272
  :return: Callable progress callback function that takes an integer
272
273
  representing the number of completed items
273
274
  """
274
- import sys
275
- from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
276
275
 
277
276
  # Enhanced environment detection
278
277
  is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
@@ -18,6 +18,8 @@ import sys
18
18
  import cv2
19
19
  import numpy as np
20
20
  import torch
21
+ import tempfile
22
+ import time
21
23
  from pathlib import Path
22
24
  from typing import Union, List, Tuple, Optional, Dict, Any
23
25
 
@@ -308,8 +310,6 @@ class DocResEngine:
308
310
 
309
311
  def _run_single_task(self, img_array: np.ndarray, task: str, save_prompts: bool) -> Tuple[np.ndarray, Dict]:
310
312
  """Run a single restoration task"""
311
- import tempfile
312
- import time
313
313
 
314
314
  # Create temporary file for inference
315
315
  with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp_file:
@@ -322,7 +322,6 @@ class DocResEngine:
322
322
  os.chdir(str(docres_dir))
323
323
 
324
324
  # Set global DEVICE variable that DocRes inference expects
325
- import torch
326
325
  import inference # Import the inference module to set its global DEVICE
327
326
  inference.DEVICE = self.device
328
327
 
@@ -364,8 +363,6 @@ class DocResEngine:
364
363
 
365
364
  def _run_end2end_pipeline(self, img_array: np.ndarray, save_prompts: bool) -> Tuple[np.ndarray, Dict]:
366
365
  """Run the end2end pipeline: dewarping → deshadowing → appearance"""
367
- import tempfile
368
- import time
369
366
 
370
367
  intermediate_steps = {}
371
368
 
@@ -374,7 +371,6 @@ class DocResEngine:
374
371
  os.chdir(str(docres_dir))
375
372
 
376
373
  # Set global DEVICE variable that DocRes inference expects
377
- import torch
378
374
  import inference # Import the inference module to set its global DEVICE
379
375
  inference.DEVICE = self.device
380
376
 
@@ -482,7 +478,6 @@ class DocResEngine:
482
478
  """
483
479
  try:
484
480
  from PIL import Image
485
- import numpy as np
486
481
  from doctra.utils.pdf_io import render_pdf_to_images
487
482
 
488
483
  # Generate output path if not provided
@@ -1,17 +1,19 @@
1
- from pydantic import BaseModel
1
+ from pydantic import BaseModel, Field
2
2
 
3
3
  class Chart(BaseModel):
4
4
  """
5
5
  Structured representation of a chart extracted from an image.
6
6
 
7
- Contains the title, headers, and data rows extracted from a chart
8
- using VLM (Vision Language Model) processing.
7
+ Includes a title, a short description, column headers, and data rows
8
+ identified using VLM (Vision Language Model) processing.
9
9
 
10
- :param title: Title or caption of the chart
10
+ :param title: Title or caption of the chart (max 31 characters)
11
+ :param description: Short description of the chart (max 300 characters)
11
12
  :param headers: Column headers for the chart data
12
13
  :param rows: Data rows containing the chart values
13
14
  """
14
- title: str
15
+ title: str = Field(max_length=31)
16
+ description: str = Field(max_length=300)
15
17
  headers: list[str]
16
18
  rows: list[list[str]]
17
19
 
@@ -19,13 +21,15 @@ class Table(BaseModel):
19
21
  """
20
22
  Structured representation of a table extracted from an image.
21
23
 
22
- Contains the title, headers, and data rows extracted from a table
23
- using VLM (Vision Language Model) processing.
24
+ Includes a title, a short description, column headers, and data rows
25
+ identified using VLM (Vision Language Model) processing.
24
26
 
25
- :param title: Title or caption of the table
27
+ :param title: Title or caption of the table (max 31 characters)
28
+ :param description: Short description of the table (max 300 characters)
26
29
  :param headers: Column headers for the table data
27
30
  :param rows: Data rows containing the table values
28
31
  """
29
- title: str
32
+ title: str = Field(max_length=31)
33
+ description: str = Field(max_length=300)
30
34
  headers: list[str]
31
35
  rows: list[list[str]]
@@ -73,7 +73,7 @@ class VLMStructuredExtractor:
73
73
  Extract structured chart data from an image.
74
74
 
75
75
  :param image_path: Path to the chart image file
76
- :return: Chart object containing extracted title, headers, and data rows
76
+ :return: Chart object containing extracted title, description, headers, and data rows
77
77
  :raises Exception: If image processing or VLM extraction fails
78
78
  """
79
79
  prompt_text = (
@@ -81,6 +81,7 @@ class VLMStructuredExtractor:
81
81
  "If the title is not present in the image, generate a suitable title. "
82
82
  "Ensure that the table represents the data from the chart accurately."
83
83
  "The number of columns in the headers must match the number of columns in each row."
84
+ "Also provide a short description (max 300 characters) of the chart."
84
85
  )
85
86
  return self._call(prompt_text, image_path, Chart)
86
87
 
@@ -89,7 +90,7 @@ class VLMStructuredExtractor:
89
90
  Extract structured table data from an image.
90
91
 
91
92
  :param image_path: Path to the table image file
92
- :return: Table object containing extracted title, headers, and data rows
93
+ :return: Table object containing extracted title, description, headers, and data rows
93
94
  :raises Exception: If image processing or VLM extraction fails
94
95
  """
95
96
  prompt_text = (
@@ -97,5 +98,6 @@ class VLMStructuredExtractor:
97
98
  "Provide the headers and rows of the table, ensuring accuracy in the extraction. "
98
99
  "If the title is not present in the image, generate a suitable title."
99
100
  "The number of columns in the headers must match the number of columns in each row."
101
+ "Also provide a short description (max 300 characters) of the table."
100
102
  )
101
103
  return self._call(prompt_text, image_path, Table)
@@ -5,6 +5,7 @@ from typing import Dict, Any, List, Set
5
5
  import pandas as pd # pip install pandas openpyxl
6
6
  from openpyxl.styles import PatternFill, Font, Alignment
7
7
  from openpyxl.utils import get_column_letter
8
+ from openpyxl.worksheet.hyperlink import Hyperlink
8
9
 
9
10
  _INVALID_SHEET_CHARS = r'[:\\/*?\[\]]' # Excel-invalid characters
10
11
  _MAX_SHEET_LEN = 31
@@ -85,6 +86,61 @@ def _autosize_columns(ws, df: pd.DataFrame) -> None:
85
86
  ws.column_dimensions[get_column_letter(i)].width = min(max(10, max_len + 2), 60)
86
87
 
87
88
 
89
+ def _style_summary_sheet(ws, df: pd.DataFrame, sheet_mapping: dict = None) -> None:
90
+ """
91
+ Apply special styling to the summary sheet with text wrapping for descriptions.
92
+ Add hyperlinks to table titles that link to their corresponding sheets.
93
+
94
+ :param ws: OpenPyXL worksheet object to style
95
+ :param df: Pandas DataFrame containing the summary data
96
+ :param sheet_mapping: Dictionary mapping table titles to their sheet names
97
+ :return: None
98
+ """
99
+ # Style header row
100
+ _style_header(ws, ncols=df.shape[1])
101
+
102
+ # Apply text wrapping to all data cells
103
+ wrap_alignment = Alignment(wrap_text=True, vertical="top")
104
+
105
+ # Apply wrapping to all data rows (skip header row)
106
+ for row_idx in range(2, len(df) + 2): # Start from row 2 (after header)
107
+ for col_idx in range(1, df.shape[1] + 1):
108
+ cell = ws.cell(row=row_idx, column=col_idx)
109
+ cell.alignment = wrap_alignment
110
+
111
+ # Add hyperlink to table title column (column A)
112
+ if col_idx == 1 and sheet_mapping: # Table Title column
113
+ table_title = cell.value
114
+ if table_title and table_title in sheet_mapping:
115
+ sheet_name = sheet_mapping[table_title]
116
+
117
+ # Create hyperlink to the sheet using proper Excel format
118
+ # Escape sheet name if it contains spaces or special characters
119
+ if ' ' in sheet_name or any(char in sheet_name for char in ['[', ']', '*', '?', ':', '\\', '/']):
120
+ hyperlink_ref = f"#'{sheet_name}'!A1"
121
+ else:
122
+ hyperlink_ref = f"#{sheet_name}!A1"
123
+
124
+ # Use Hyperlink class with proper parameters
125
+ cell.hyperlink = Hyperlink(ref=hyperlink_ref, target=hyperlink_ref)
126
+ # Style the hyperlink
127
+ cell.font = Font(color="0000FF", underline="single")
128
+
129
+ # Set specific column widths for summary sheet
130
+ # Table Title column - narrower
131
+ ws.column_dimensions['A'].width = 30
132
+ # Description column - wider to accommodate wrapped text
133
+ ws.column_dimensions['B'].width = 60
134
+ # Page column - narrow for page numbers
135
+ ws.column_dimensions['C'].width = 10
136
+ # Type column - narrow for Table/Chart
137
+ ws.column_dimensions['D'].width = 12
138
+
139
+ # Set row heights to accommodate wrapped text
140
+ for row_idx in range(2, len(df) + 2):
141
+ ws.row_dimensions[row_idx].height = 60 # Allow for multiple lines
142
+
143
+
88
144
  def _normalize_data(headers: List[str], rows: List[List]) -> tuple[List[str], List[List]]:
89
145
  """
90
146
  Normalize headers and rows to ensure consistent dimensions.
@@ -159,6 +215,31 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
159
215
  taken: Set[str] = set()
160
216
 
161
217
  with pd.ExcelWriter(excel_path, engine="openpyxl", mode="w") as writer:
218
+ # Create summary sheet first
219
+ summary_data = []
220
+ sheet_mapping = {} # Map table titles to their sheet names
221
+
222
+ for item in valid_items:
223
+ title = item.get("title") or "Untitled"
224
+ description = item.get("description") or "No description available"
225
+ page_number = item.get("page", "Unknown")
226
+ item_type = item.get("type", "Table") # Default to "Table" if not specified
227
+
228
+
229
+ summary_data.append({
230
+ "Table Title": title,
231
+ "Description": description,
232
+ "Page": page_number,
233
+ "Type": item_type
234
+ })
235
+
236
+ # Create summary sheet first (but without hyperlinks initially)
237
+ if summary_data:
238
+ summary_df = pd.DataFrame(summary_data)
239
+ summary_df.to_excel(writer, sheet_name="Table Summary", index=False)
240
+ taken.add("Table Summary")
241
+
242
+ # Process individual table sheets to build sheet mapping
162
243
  for item in valid_items:
163
244
  try:
164
245
  title = item.get("title") or "Untitled"
@@ -166,6 +247,9 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
166
247
  rows = item.get("rows") or []
167
248
 
168
249
  sheet_name = _safe_sheet_name(title, taken)
250
+
251
+ # Add to sheet mapping for hyperlinks
252
+ sheet_mapping[title] = sheet_name
169
253
 
170
254
  # Normalize data to handle mismatched dimensions
171
255
  normalized_headers, normalized_rows = _normalize_data(headers, rows)
@@ -194,4 +278,9 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
194
278
  print(f"Error processing item '{item.get('title', 'Unknown')}': {e}")
195
279
  continue
196
280
 
281
+ # Now add hyperlinks to the summary sheet (after all sheets are created)
282
+ if summary_data and sheet_mapping:
283
+ summary_ws = writer.sheets["Table Summary"]
284
+ _style_summary_sheet(summary_ws, summary_df, sheet_mapping)
285
+
197
286
  return excel_path
@@ -8,6 +8,7 @@ capabilities with DocRes image restoration for improved document processing.
8
8
  from __future__ import annotations
9
9
  import os
10
10
  import sys
11
+ import numpy as np
11
12
  from typing import List, Dict, Any, Optional, Union
12
13
  from contextlib import ExitStack
13
14
  from PIL import Image
@@ -16,9 +17,17 @@ from tqdm import tqdm
16
17
  from doctra.parsers.structured_pdf_parser import StructuredPDFParser
17
18
  from doctra.engines.image_restoration import DocResEngine
18
19
  from doctra.utils.pdf_io import render_pdf_to_images
19
- from doctra.utils.constants import IMAGE_SUBDIRS
20
+ from doctra.utils.constants import IMAGE_SUBDIRS, EXCLUDE_LABELS
20
21
  from doctra.utils.file_ops import ensure_output_dirs
21
22
  from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
23
+ from doctra.parsers.layout_order import reading_order_key
24
+ from doctra.utils.ocr_utils import ocr_box_text
25
+ from doctra.exporters.image_saver import save_box_image
26
+ from doctra.exporters.markdown_writer import write_markdown
27
+ from doctra.exporters.html_writer import write_html, write_structured_html
28
+ from doctra.exporters.excel_writer import write_structured_excel
29
+ from doctra.utils.structured_utils import to_structured_dict
30
+ from doctra.exporters.markdown_table import render_markdown_table
22
31
 
23
32
 
24
33
  class EnhancedPDFParser(StructuredPDFParser):
@@ -146,7 +155,7 @@ class EnhancedPDFParser(StructuredPDFParser):
146
155
  pil_pages = enhanced_pages
147
156
 
148
157
  # Continue with standard parsing logic
149
- self._process_parsing_logic(pages, pil_pages, out_dir, pdf_filename)
158
+ self._process_parsing_logic(pages, pil_pages, out_dir, pdf_filename, pdf_path)
150
159
 
151
160
  def _process_pages_with_restoration(self, pdf_path: str, out_dir: str) -> List[Image.Image]:
152
161
  """
@@ -186,7 +195,6 @@ class EnhancedPDFParser(StructuredPDFParser):
186
195
  for i, page_img in enumerate(original_pages):
187
196
  try:
188
197
  # Convert PIL to numpy array
189
- import numpy as np
190
198
  img_array = np.array(page_img)
191
199
 
192
200
  # Apply DocRes restoration
@@ -219,21 +227,11 @@ class EnhancedPDFParser(StructuredPDFParser):
219
227
  print(f"✅ Image restoration completed. Enhanced pages saved to: {enhanced_dir}")
220
228
  return enhanced_pages
221
229
 
222
- def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename):
230
+ def _process_parsing_logic(self, pages, pil_pages, out_dir, pdf_filename, pdf_path):
223
231
  """
224
232
  Process the parsing logic with enhanced pages.
225
233
  This is extracted from the parent class to allow customization.
226
234
  """
227
- from doctra.utils.constants import EXCLUDE_LABELS
228
- from doctra.parsers.layout_order import reading_order_key
229
- from doctra.utils.ocr_utils import ocr_box_text
230
- from doctra.exporters.image_saver import save_box_image
231
- from doctra.exporters.markdown_writer import write_markdown
232
- from doctra.exporters.html_writer import write_html
233
- from doctra.exporters.excel_writer import write_structured_excel
234
- from doctra.exporters.html_writer import write_structured_html
235
- from doctra.utils.structured_utils import to_structured_dict
236
- from doctra.exporters.markdown_table import render_markdown_table
237
235
 
238
236
  fig_count = sum(sum(1 for b in p.boxes if b.label == "figure") for p in pages)
239
237
  chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages)
@@ -285,6 +283,9 @@ class EnhancedPDFParser(StructuredPDFParser):
285
283
  chart = self.vlm.extract_chart(abs_img_path)
286
284
  item = to_structured_dict(chart)
287
285
  if item:
286
+ # Add page and type information to structured item
287
+ item["page"] = page_num
288
+ item["type"] = "Chart"
288
289
  structured_items.append(item)
289
290
  md_lines.append(
290
291
  render_markdown_table(item.get("headers"), item.get("rows"),
@@ -306,6 +307,9 @@ class EnhancedPDFParser(StructuredPDFParser):
306
307
  table = self.vlm.extract_table(abs_img_path)
307
308
  item = to_structured_dict(table)
308
309
  if item:
310
+ # Add page and type information to structured item
311
+ item["page"] = page_num
312
+ item["type"] = "Table"
309
313
  structured_items.append(item)
310
314
  md_lines.append(
311
315
  render_markdown_table(item.get("headers"), item.get("rows"),
@@ -163,6 +163,9 @@ class StructuredPDFParser:
163
163
  chart = self.vlm.extract_chart(abs_img_path)
164
164
  item = to_structured_dict(chart)
165
165
  if item:
166
+ # Add page and type information to structured item
167
+ item["page"] = page_num
168
+ item["type"] = "Chart"
166
169
  structured_items.append(item)
167
170
  md_lines.append(
168
171
  render_markdown_table(item.get("headers"), item.get("rows"),
@@ -184,6 +187,9 @@ class StructuredPDFParser:
184
187
  table = self.vlm.extract_table(abs_img_path)
185
188
  item = to_structured_dict(table)
186
189
  if item:
190
+ # Add page and type information to structured item
191
+ item["page"] = page_num
192
+ item["type"] = "Table"
187
193
  structured_items.append(item)
188
194
  md_lines.append(
189
195
  render_markdown_table(item.get("headers"), item.get("rows"),
@@ -178,6 +178,9 @@ class ChartTablePDFParser:
178
178
  extracted_chart = self.vlm.extract_chart(chart_path)
179
179
  structured_item = to_structured_dict(extracted_chart)
180
180
  if structured_item:
181
+ # Add page and type information to structured item
182
+ structured_item["page"] = page_num
183
+ structured_item["type"] = "Chart"
181
184
  structured_items.append(structured_item)
182
185
  vlm_items.append({
183
186
  "kind": "chart",
@@ -221,6 +224,9 @@ class ChartTablePDFParser:
221
224
  extracted_table = self.vlm.extract_table(table_path)
222
225
  structured_item = to_structured_dict(extracted_table)
223
226
  if structured_item:
227
+ # Add page and type information to structured item
228
+ structured_item["page"] = page_num
229
+ structured_item["type"] = "Table"
224
230
  structured_items.append(structured_item)
225
231
  vlm_items.append({
226
232
  "kind": "table",
doctra/ui/app.py CHANGED
@@ -2,6 +2,11 @@ import os
2
2
  import shutil
3
3
  import tempfile
4
4
  import re
5
+ import traceback
6
+ import pandas as pd
7
+ import html as _html
8
+ import base64
9
+ import json
5
10
  from pathlib import Path
6
11
  from typing import Optional, Tuple, List, Dict, Any
7
12
 
@@ -9,6 +14,7 @@ import gradio as gr
9
14
 
10
15
  from doctra.parsers.structured_pdf_parser import StructuredPDFParser
11
16
  from doctra.parsers.table_chart_extractor import ChartTablePDFParser
17
+ from doctra.utils.pdf_io import render_pdf_to_images
12
18
 
13
19
 
14
20
  def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zip_filename: Optional[str] = None, is_structured_parsing: bool = False) -> Tuple[List[tuple[str, str]], List[str], str]:
@@ -100,7 +106,6 @@ def _parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
100
106
  Parse markdown content and organize it by pages.
101
107
  Returns a list of page dictionaries with content, tables, charts, and figures.
102
108
  """
103
- import re
104
109
 
105
110
  pages = []
106
111
  current_page = None
@@ -209,7 +214,6 @@ def run_full_parse(
209
214
  try:
210
215
  parser.parse(str(input_pdf))
211
216
  except Exception as e:
212
- import traceback
213
217
  traceback.print_exc()
214
218
  # Safely encode error message for return value
215
219
  try:
@@ -325,8 +329,6 @@ def run_extract(
325
329
  if excel_filename:
326
330
  excel_path = out_dir / excel_filename
327
331
  if excel_path.exists():
328
- import pandas as pd
329
- import html as _html
330
332
 
331
333
  # Read Excel file and create HTML tables
332
334
  xl_file = pd.ExcelFile(excel_path)
@@ -489,7 +491,6 @@ def build_demo() -> gr.Blocks:
489
491
 
490
492
  def parse_markdown_by_pages(md_content: str):
491
493
  """Parse markdown content and organize it by pages."""
492
- import re
493
494
 
494
495
  pages = []
495
496
  current_page = None
@@ -548,7 +549,6 @@ def build_demo() -> gr.Blocks:
548
549
  return "Page not found", None
549
550
 
550
551
  # Build HTML with inline base64 images, render markdown tables, and preserve paragraphs/line breaks
551
- import html as _html, base64, re as _re
552
552
  base_dir = None
553
553
  try:
554
554
  stem = Path(pdf_path).stem if pdf_path else ""
@@ -589,7 +589,7 @@ def build_demo() -> gr.Blocks:
589
589
  stripped = line.strip()
590
590
  if stripped.startswith('![') and ('](images/' in stripped or '](images\\' in stripped):
591
591
  flush_paragraph()
592
- match = _re.match(r'!\[([^\]]+)\]\(([^)]+)\)', stripped)
592
+ match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', stripped)
593
593
  if match and base_dir is not None:
594
594
  caption = match.group(1)
595
595
  rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
@@ -646,7 +646,6 @@ def build_demo() -> gr.Blocks:
646
646
  # Ensure page images are prepared
647
647
  try:
648
648
  if pdf_path and not page_images:
649
- from doctra.utils.pdf_io import render_pdf_to_images
650
649
  tmp_img_dir = Path(tempfile.mkdtemp(prefix="doctra_pages_"))
651
650
  pil_pages = render_pdf_to_images(pdf_path)
652
651
  saved_paths: List[str] = []
@@ -726,7 +725,6 @@ def build_demo() -> gr.Blocks:
726
725
  for page in pages_data:
727
726
  for line in page['content']:
728
727
  if line.strip().startswith('![') and ('](images/' in line or '](images\\' in line):
729
- import re
730
728
  match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line.strip())
731
729
  if match:
732
730
  caption = match.group(1)
@@ -745,7 +743,6 @@ def build_demo() -> gr.Blocks:
745
743
  saved_paths: List[str] = []
746
744
  try:
747
745
  if input_pdf_path:
748
- from doctra.utils.pdf_io import render_pdf_to_images
749
746
  tmp_img_dir = Path(tempfile.mkdtemp(prefix="doctra_pages_"))
750
747
  pil_pages = render_pdf_to_images(input_pdf_path)
751
748
  for idx, (im, _, _) in enumerate(pil_pages, start=1):
@@ -759,7 +756,6 @@ def build_demo() -> gr.Blocks:
759
756
 
760
757
  # Build initial HTML with inline images and proper blocks for first page
761
758
  if pages_data:
762
- import html as _html, base64, re as _re
763
759
  base_dir = None
764
760
  try:
765
761
  stem = Path(input_pdf_path).stem if input_pdf_path else ""
@@ -771,7 +767,7 @@ def build_demo() -> gr.Blocks:
771
767
  for raw_line in pages_data[0]['content']:
772
768
  line = raw_line.strip()
773
769
  if line.startswith('![') and ('](images/' in line or '](images\\' in line):
774
- match = _re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line)
770
+ match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line)
775
771
  if match and base_dir is not None:
776
772
  caption = match.group(1)
777
773
  rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
@@ -874,7 +870,6 @@ def build_demo() -> gr.Blocks:
874
870
  if not mapping.exists():
875
871
  return gr.Dropdown(choices=[], value=None, visible=False)
876
872
 
877
- import json
878
873
  data = json.loads(mapping.read_text(encoding="utf-8"))
879
874
  choices = []
880
875
 
@@ -902,7 +897,6 @@ def build_demo() -> gr.Blocks:
902
897
  if not mapping.exists():
903
898
  return "", None
904
899
 
905
- import json, html as _html
906
900
  data = json.loads(mapping.read_text(encoding="utf-8"))
907
901
 
908
902
  for entry in data:
@@ -14,7 +14,7 @@ def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
14
14
  - JSON string
15
15
  - dict
16
16
  - Pydantic BaseModel (v1 .dict() or v2 .model_dump())
17
- Returns a normalized dict with keys: title, headers, rows — or None.
17
+ Returns a normalized dict with keys: title, description, headers, rows, page, type — or None.
18
18
  """
19
19
  if obj is None:
20
20
  return None
@@ -36,10 +36,13 @@ def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
36
36
 
37
37
  if isinstance(obj, dict):
38
38
  title = obj.get("title") or "Untitled"
39
+ description = obj.get("description") or ""
39
40
  headers = obj.get("headers") or []
40
41
  rows = obj.get("rows") or []
42
+ page = obj.get("page", "Unknown")
43
+ item_type = obj.get("type", "Table")
41
44
  if not isinstance(headers, list) or not isinstance(rows, list):
42
45
  return None
43
- return {"title": title, "headers": headers, "rows": rows}
46
+ return {"title": title, "description": description, "headers": headers, "rows": rows, "page": page, "type": item_type}
44
47
 
45
48
  return None
doctra/version.py CHANGED
@@ -1,2 +1,2 @@
1
1
  """Version information for Doctra."""
2
- __version__ = '0.4.0'
2
+ __version__ = '0.4.1'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: doctra
3
- Version: 0.4.0
3
+ Version: 0.4.1
4
4
  Summary: Parse, extract, and analyze documents with ease
5
5
  Home-page: https://github.com/AdemBoukhris457/Doctra
6
6
  Author: Adem Boukhris
@@ -1,11 +1,11 @@
1
1
  doctra/__init__.py,sha256=rNLCyODOpaPb_TTP6qmQnuWZJW9JPXrxg1IfKnvb1No,773
2
- doctra/version.py,sha256=PSDo-SLZhu8_cWgmtvzLjHyKr7C8D_F61M1tiywnuKY,62
2
+ doctra/version.py,sha256=gJX4jQdS3czcKE2h1k17fJPgWzxHyGH2oFP9nW9cTLw,62
3
3
  doctra/cli/__init__.py,sha256=4PTujjYRShOOUlZ7PwuWckShPWLC4v4CYIhJpzgyv1k,911
4
- doctra/cli/main.py,sha256=6b415qg-8gV4M2Uf0WvdU_nFx65DYFgRu5Q3Ys_LvAo,43756
5
- doctra/cli/utils.py,sha256=IghiUZQCOmXODC5-5smHGz2KeV4xqbP4avmA1Mggln0,11800
4
+ doctra/cli/main.py,sha256=_gvG8bm-Mn1tIEw6eJUgqz9dYEo9klXGiJDJzjqgPyo,43503
5
+ doctra/cli/utils.py,sha256=w3Bxyzczcbl_cs1Cea8C3ehv7dkGl_wecprYZXrcGhk,11772
6
6
  doctra/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  doctra/engines/image_restoration/__init__.py,sha256=vzcN6Rw7_U-5jIK2pdo2NlgqdLdXDShigrOGM7QLNEE,263
8
- doctra/engines/image_restoration/docres_engine.py,sha256=6j2LfoqirmEEmLTOsz8nkhqaHUQHjYbJr-2MR01i6Gc,21754
8
+ doctra/engines/image_restoration/docres_engine.py,sha256=n9Pr0R7dbu_UHv51urGv_wC6ZYW-43bmXxiyTCOEOMo,21612
9
9
  doctra/engines/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  doctra/engines/layout/layout_models.py,sha256=vuTzjWd3FD-SkFPngktmUVhOJ6Xvff6ufwFEq796PQs,3162
11
11
  doctra/engines/layout/paddle_layout.py,sha256=P2-Gk8wHpWoA5Jpmo_3OLI59zWq3HeAOBOUKKVdXu8I,6792
@@ -14,20 +14,20 @@ doctra/engines/ocr/api.py,sha256=YOBKDLExXpvSiOsc_TDJasaMPxzdVx1llQCtYlsruWo,128
14
14
  doctra/engines/ocr/path_resolver.py,sha256=2_7Nsekt3dCDU3oVsgdr62iMrlAhbGNfYwgh4G7S3pA,1492
15
15
  doctra/engines/ocr/pytesseract_engine.py,sha256=Imz2uwju6himkBiS8CH7DLxBRe-LtmMYZiOdb_6PoQw,2911
16
16
  doctra/engines/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- doctra/engines/vlm/outlines_types.py,sha256=qL-G6MNiA5mxp1qAPVEFhOANp4NqVt_MQKseJCr_xXE,970
17
+ doctra/engines/vlm/outlines_types.py,sha256=fQK6ru7XiXHaa8JPpaTTBaTk_zQ93ZyhFp4SyAnUdVU,1337
18
18
  doctra/engines/vlm/provider.py,sha256=aE8Eo1U-8XqAimakNlT0-T4etIyCV8rZ3DwxdqbFeTc,3131
19
- doctra/engines/vlm/service.py,sha256=4ExDbLmyyC3ICXxr7OSIqvbOdrwbIJek-DE54vAUgDA,4151
19
+ doctra/engines/vlm/service.py,sha256=nygxMe7uTq6Bv70ycBPL59F2a0ESp1Hix4j833p6rUM,4343
20
20
  doctra/exporters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
- doctra/exporters/excel_writer.py,sha256=U5Eb5SF7_ll1QveUapSWSkCRt3OEoisKEVUQ_7X8Wjo,7762
21
+ doctra/exporters/excel_writer.py,sha256=rwyqlH73P7z413BELovQY_pS6IMkkqHEho6mbPrJ2Sk,11857
22
22
  doctra/exporters/html_writer.py,sha256=OlW24Eg5bZcjldRHtd3GDD7RrajuRXj43EJpXIJkYf8,38810
23
23
  doctra/exporters/image_saver.py,sha256=zsPoQ0CwoE643ui4iZMdXk96kv5mU8L_zC2JfF22N1A,1639
24
24
  doctra/exporters/markdown_table.py,sha256=4_OJIwG_WoIPYBzJx1njy_3tNVdkK6QKSP-P9r-b0zw,2030
25
25
  doctra/exporters/markdown_writer.py,sha256=L7EjF2MB8jYX7XkZ3a3NeeEC8gnb0qzRPTzIN9tdfuw,1027
26
26
  doctra/parsers/__init__.py,sha256=8M6LVzcWGpuTIK_1SMXML3ll7zK1CTHXGI5qXvqdm-A,206
27
- doctra/parsers/enhanced_pdf_parser.py,sha256=7KfkQexXTxbi8Naen7HFlFaeoEGpfdbYbvRqkTXw22A,18095
27
+ doctra/parsers/enhanced_pdf_parser.py,sha256=NBBopYdSIHWd_O96J0qR3DqZvbAt3CfK1hwUkXu8540,18377
28
28
  doctra/parsers/layout_order.py,sha256=W6b-T11H907RZ2FaZwNvnYhmvH11rpUzxC5yLkdf28k,640
29
- doctra/parsers/structured_pdf_parser.py,sha256=QIZIS5SAaIdGiT8o7G_a4D-Cht7nVLGeSuVzqSYLn14,19160
30
- doctra/parsers/table_chart_extractor.py,sha256=kSubqX0n0kVu_3jzX6QUyKmEGs9sG3Bg9kzUzn2wPHo,13733
29
+ doctra/parsers/structured_pdf_parser.py,sha256=AU6yLW2kpd8bxZjelmm73L4CVBysnVAdKxwPkTV1Fzk,19602
30
+ doctra/parsers/table_chart_extractor.py,sha256=ePmk9m9n-mvkqOvxpWC42ElxbnKMmDnq-e6SWiNqgzA,14195
31
31
  doctra/third_party/docres/inference.py,sha256=krD5EQDiqki-5uTMqqHYivhL38sfSOhYgaihI751070,13576
32
32
  doctra/third_party/docres/utils.py,sha256=N0ZVmOTB3wsinFlYu5hT84C4_MhWGdc98T8LTG-S9dA,14566
33
33
  doctra/third_party/docres/data/MBD/MBD.py,sha256=-d6cVQX1FVcGmQ_yJ5l-PQ3xKmkmveQQBytZ38pEGfY,4653
@@ -49,7 +49,7 @@ doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.
49
49
  doctra/third_party/docres/data/preprocess/crop_merge_image.py,sha256=f2NANY92s6IQ1hl1MAXfftFPIyIrj24O4TONjg7SXEc,4747
50
50
  doctra/third_party/docres/models/restormer_arch.py,sha256=BSwv_odCcp4HUZj3gv21e4IzFRBiyk8FjKAO8kF4YS8,12510
51
51
  doctra/ui/__init__.py,sha256=XzOOKeGSBnUREuDQiCIWds1asFSa2nypFQTJXwclROA,85
52
- doctra/ui/app.py,sha256=WpXUWHSs7wSYNjY4iBOZJHsKGQ88jDytvOFIjuhqAGE,44031
52
+ doctra/ui/app.py,sha256=iFSAVZacL7iHB1SHhcUzperJGNQVWqUhvOYdlgjjt50,43623
53
53
  doctra/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
54
54
  doctra/utils/bbox.py,sha256=R2-95p0KiWvet3TH27TQVvCar7WJg6z0u3L21iEDF-A,674
55
55
  doctra/utils/constants.py,sha256=ZWOvNDrvETbQ_pxHiX7vUW4J5Oj8_qnov0QacUOBizI,189
@@ -59,9 +59,9 @@ doctra/utils/ocr_utils.py,sha256=Doa1uYBg3kRgRYd2aPq9fICHgHfrM_efdhZfI7jl6OM,780
59
59
  doctra/utils/pdf_io.py,sha256=c8EY47Z1iqVtlLFHS_n0qGuXJ5ERFaMUd84ivXV0b9E,706
60
60
  doctra/utils/progress.py,sha256=IKQ_YErWSEd4hddYMUiCORy0_kW4TOYJM891HUEq2_E,11901
61
61
  doctra/utils/quiet.py,sha256=5XPS-1CtJ0sVk6qgSQctdhr_wR8mP1xoJLoUbmkXROA,387
62
- doctra/utils/structured_utils.py,sha256=znC2zr80rZMfIV58lipZ8M4zPq6IF070pdwLBve1qiE,1251
63
- doctra-0.4.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
64
- doctra-0.4.0.dist-info/METADATA,sha256=nlIT-QfxcwWi97jbQIastNHty8if3CyUv0LaDGiK7tk,28298
65
- doctra-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
66
- doctra-0.4.0.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
67
- doctra-0.4.0.dist-info/RECORD,,
62
+ doctra/utils/structured_utils.py,sha256=vU84dsD8wIlTyMsA9hitorGH-eroQiVuWEpBTQBUT24,1478
63
+ doctra-0.4.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
64
+ doctra-0.4.1.dist-info/METADATA,sha256=wXduiq7VJS5vf-TXdxpYFCKGfPyGYr5jGK0mwH3OjUw,28298
65
+ doctra-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
66
+ doctra-0.4.1.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
67
+ doctra-0.4.1.dist-info/RECORD,,
File without changes