doctra 0.3.3__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/__init__.py +4 -0
- doctra/cli/main.py +170 -9
- doctra/cli/utils.py +2 -3
- doctra/engines/image_restoration/__init__.py +10 -0
- doctra/engines/image_restoration/docres_engine.py +561 -0
- doctra/engines/vlm/outlines_types.py +13 -9
- doctra/engines/vlm/service.py +4 -2
- doctra/exporters/excel_writer.py +89 -0
- doctra/parsers/enhanced_pdf_parser.py +374 -0
- doctra/parsers/structured_pdf_parser.py +6 -0
- doctra/parsers/table_chart_extractor.py +6 -0
- doctra/third_party/docres/data/MBD/MBD.py +110 -0
- doctra/third_party/docres/data/MBD/MBD_utils.py +291 -0
- doctra/third_party/docres/data/MBD/infer.py +151 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/aspp.py +95 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/__init__.py +13 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/drn.py +402 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/mobilenet.py +151 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/resnet.py +170 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/xception.py +288 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/decoder.py +59 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/deeplab.py +81 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/__init__.py +12 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/batchnorm.py +282 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/comm.py +129 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/replicate.py +88 -0
- doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.py +29 -0
- doctra/third_party/docres/data/preprocess/crop_merge_image.py +142 -0
- doctra/third_party/docres/inference.py +370 -0
- doctra/third_party/docres/models/restormer_arch.py +308 -0
- doctra/third_party/docres/utils.py +464 -0
- doctra/ui/app.py +8 -14
- doctra/utils/structured_utils.py +5 -2
- doctra/version.py +1 -1
- {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/METADATA +1 -1
- doctra-0.4.1.dist-info/RECORD +67 -0
- doctra-0.3.3.dist-info/RECORD +0 -44
- {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/WHEEL +0 -0
- {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/top_level.txt +0 -0
doctra/__init__.py
CHANGED
@@ -4,13 +4,17 @@ Parse, extract, and analyze documents with ease
|
|
4
4
|
"""
|
5
5
|
|
6
6
|
from .parsers.structured_pdf_parser import StructuredPDFParser
|
7
|
+
from .parsers.enhanced_pdf_parser import EnhancedPDFParser
|
7
8
|
from .parsers.table_chart_extractor import ChartTablePDFParser
|
9
|
+
from .engines.image_restoration import DocResEngine
|
8
10
|
from .version import __version__
|
9
11
|
from .ui import build_demo, launch_ui
|
10
12
|
|
11
13
|
__all__ = [
|
12
14
|
'StructuredPDFParser',
|
15
|
+
'EnhancedPDFParser',
|
13
16
|
'ChartTablePDFParser',
|
17
|
+
'DocResEngine',
|
14
18
|
'build_demo',
|
15
19
|
'launch_ui',
|
16
20
|
'__version__'
|
doctra/cli/main.py
CHANGED
@@ -9,20 +9,27 @@ detection results, and analyze document structure from the command line.
|
|
9
9
|
import click
|
10
10
|
import os
|
11
11
|
import sys
|
12
|
+
import traceback
|
12
13
|
from pathlib import Path
|
13
14
|
from typing import Optional
|
14
15
|
|
15
16
|
# Import parsers
|
16
17
|
try:
|
17
18
|
from doctra.parsers.structured_pdf_parser_enhancer import StructuredPDFParser
|
19
|
+
from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
|
18
20
|
from doctra.parsers.chart_table_pdf_parser import ChartTablePDFParser
|
19
21
|
except ImportError:
|
20
22
|
# Fallback for development/testing
|
21
23
|
project_root = Path(__file__).parent.parent.parent
|
22
24
|
sys.path.insert(0, str(project_root))
|
23
25
|
from doctra.parsers.structured_pdf_parser import StructuredPDFParser
|
26
|
+
from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
|
24
27
|
from doctra.parsers.table_chart_extractor import ChartTablePDFParser
|
25
28
|
|
29
|
+
# Import additional modules
|
30
|
+
from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
|
31
|
+
from doctra.engines.image_restoration import DocResEngine
|
32
|
+
|
26
33
|
|
27
34
|
@click.group(invoke_without_command=True)
|
28
35
|
@click.pass_context
|
@@ -37,6 +44,7 @@ def cli(ctx):
|
|
37
44
|
\b
|
38
45
|
Commands:
|
39
46
|
parse Full document parsing with text, tables, charts, and figures
|
47
|
+
enhance Enhanced parsing with DocRes image restoration
|
40
48
|
extract Extract only charts and/or tables from documents
|
41
49
|
visualize Visualize layout detection results
|
42
50
|
analyze Quick document analysis without processing
|
@@ -45,6 +53,7 @@ def cli(ctx):
|
|
45
53
|
\b
|
46
54
|
Examples:
|
47
55
|
doctra parse document.pdf # Full document parsing
|
56
|
+
doctra enhance document.pdf # Enhanced parsing with image restoration
|
48
57
|
doctra extract charts document.pdf # Extract only charts
|
49
58
|
doctra extract both document.pdf --use-vlm # Extract charts & tables with VLM
|
50
59
|
doctra visualize document.pdf # Visualize layout detection
|
@@ -243,7 +252,6 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
|
|
243
252
|
except Exception as e:
|
244
253
|
click.echo(f"❌ Error initializing parser: {e}", err=True)
|
245
254
|
if verbose:
|
246
|
-
import traceback
|
247
255
|
click.echo(traceback.format_exc(), err=True)
|
248
256
|
sys.exit(1)
|
249
257
|
|
@@ -267,7 +275,151 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
|
|
267
275
|
except Exception as e:
|
268
276
|
click.echo(f"❌ Error during parsing: {e}", err=True)
|
269
277
|
if verbose:
|
270
|
-
|
278
|
+
click.echo(traceback.format_exc(), err=True)
|
279
|
+
sys.exit(1)
|
280
|
+
finally:
|
281
|
+
# Restore original working directory
|
282
|
+
os.chdir(original_cwd)
|
283
|
+
|
284
|
+
|
285
|
+
@cli.command()
|
286
|
+
@click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
|
287
|
+
@click.option('--output-dir', '-o', type=click.Path(path_type=Path),
|
288
|
+
help='Output directory (default: outputs/{pdf_filename}_enhanced)')
|
289
|
+
@click.option('--restoration-task', type=click.Choice(['dewarping', 'deshadowing', 'appearance', 'deblurring', 'binarization', 'end2end']),
|
290
|
+
default='appearance', help='DocRes restoration task (default: appearance)')
|
291
|
+
@click.option('--restoration-device', type=click.Choice(['cuda', 'cpu']),
|
292
|
+
help='Device for DocRes processing (default: auto-detect)')
|
293
|
+
@click.option('--restoration-dpi', type=int, default=200,
|
294
|
+
help='DPI for restoration processing (default: 200)')
|
295
|
+
@vlm_options
|
296
|
+
@layout_options
|
297
|
+
@ocr_options
|
298
|
+
@click.option('--box-separator', default='\n',
|
299
|
+
help='Separator between text boxes in output (default: newline)')
|
300
|
+
@click.option('--verbose', '-v', is_flag=True,
|
301
|
+
help='Enable verbose output')
|
302
|
+
def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
|
303
|
+
restoration_device: Optional[str], restoration_dpi: int,
|
304
|
+
use_vlm: bool, vlm_provider: str, vlm_model: Optional[str], vlm_api_key: Optional[str],
|
305
|
+
layout_model: str, dpi: int, min_score: float,
|
306
|
+
ocr_lang: str, ocr_psm: int, ocr_oem: int, ocr_config: str,
|
307
|
+
box_separator: str, verbose: bool):
|
308
|
+
"""
|
309
|
+
Enhanced PDF parsing with DocRes image restoration.
|
310
|
+
|
311
|
+
Performs document processing with image restoration to improve quality
|
312
|
+
before layout detection and content extraction. Particularly useful for
|
313
|
+
scanned documents, low-quality PDFs, or documents with shadows/distortion.
|
314
|
+
|
315
|
+
\b
|
316
|
+
Restoration Tasks:
|
317
|
+
appearance - General appearance enhancement (default)
|
318
|
+
dewarping - Correct document perspective distortion
|
319
|
+
deshadowing - Remove shadows from documents
|
320
|
+
deblurring - Reduce blur in document images
|
321
|
+
binarization - Convert to clean black/white text
|
322
|
+
end2end - Complete pipeline: dewarping → deshadowing → appearance
|
323
|
+
|
324
|
+
\b
|
325
|
+
Examples:
|
326
|
+
doctra enhance document.pdf
|
327
|
+
doctra enhance document.pdf --restoration-task dewarping
|
328
|
+
doctra enhance document.pdf --restoration-task end2end --restoration-device cuda
|
329
|
+
doctra enhance document.pdf --use-vlm --vlm-api-key your_key
|
330
|
+
doctra enhance document.pdf -o ./enhanced_results --restoration-dpi 300
|
331
|
+
doctra enhance document.pdf --restoration-task deshadowing # Use different restoration task
|
332
|
+
|
333
|
+
:param pdf_path: Path to the input PDF file
|
334
|
+
:param output_dir: Output directory for results (optional)
|
335
|
+
:param restoration_task: DocRes restoration task to perform
|
336
|
+
:param restoration_device: Device for DocRes processing
|
337
|
+
:param restoration_dpi: DPI for restoration processing
|
338
|
+
:param use_vlm: Whether to use VLM for enhanced extraction
|
339
|
+
:param vlm_provider: VLM provider ('gemini' or 'openai')
|
340
|
+
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
341
|
+
:param vlm_api_key: API key for VLM provider
|
342
|
+
:param layout_model: Layout detection model name
|
343
|
+
:param dpi: DPI for PDF rendering
|
344
|
+
:param min_score: Minimum confidence score for layout detection
|
345
|
+
:param ocr_lang: OCR language code
|
346
|
+
:param ocr_psm: Tesseract page segmentation mode
|
347
|
+
:param ocr_oem: Tesseract OCR engine mode
|
348
|
+
:param ocr_config: Additional Tesseract configuration
|
349
|
+
:param box_separator: Separator between text boxes in output
|
350
|
+
:param verbose: Whether to enable verbose output
|
351
|
+
:return: None
|
352
|
+
"""
|
353
|
+
validate_vlm_config(use_vlm, vlm_api_key)
|
354
|
+
|
355
|
+
if verbose:
|
356
|
+
click.echo(f"🔍 Starting enhanced PDF parsing with DocRes...")
|
357
|
+
click.echo(f" Input: {pdf_path}")
|
358
|
+
click.echo(f" Restoration task: {restoration_task}")
|
359
|
+
click.echo(f" Restoration device: {restoration_device or 'auto-detect'}")
|
360
|
+
click.echo(f" Restoration DPI: {restoration_dpi}")
|
361
|
+
if output_dir:
|
362
|
+
click.echo(f" Output: {output_dir}")
|
363
|
+
|
364
|
+
# Create enhanced parser instance
|
365
|
+
try:
|
366
|
+
if verbose:
|
367
|
+
click.echo(f"🔧 Initializing enhanced parser with DocRes...")
|
368
|
+
if use_vlm:
|
369
|
+
click.echo(f" VLM Provider: {vlm_provider}")
|
370
|
+
click.echo(f" VLM Model: {vlm_model or 'default'}")
|
371
|
+
click.echo(f" Layout Model: {layout_model}")
|
372
|
+
click.echo(f" DPI: {dpi}")
|
373
|
+
click.echo(f" OCR Language: {ocr_lang}")
|
374
|
+
else:
|
375
|
+
click.echo(f"🔧 Initializing enhanced parser with DocRes...")
|
376
|
+
if use_vlm:
|
377
|
+
click.echo(f" Using VLM: {vlm_provider}")
|
378
|
+
|
379
|
+
parser = EnhancedPDFParser(
|
380
|
+
use_image_restoration=True,
|
381
|
+
restoration_task=restoration_task,
|
382
|
+
restoration_device=restoration_device,
|
383
|
+
restoration_dpi=restoration_dpi,
|
384
|
+
use_vlm=use_vlm,
|
385
|
+
vlm_provider=vlm_provider,
|
386
|
+
vlm_model=vlm_model,
|
387
|
+
vlm_api_key=vlm_api_key,
|
388
|
+
layout_model_name=layout_model,
|
389
|
+
dpi=dpi,
|
390
|
+
min_score=min_score,
|
391
|
+
ocr_lang=ocr_lang,
|
392
|
+
ocr_psm=ocr_psm,
|
393
|
+
ocr_oem=ocr_oem,
|
394
|
+
ocr_extra_config=ocr_config,
|
395
|
+
box_separator=box_separator
|
396
|
+
)
|
397
|
+
except Exception as e:
|
398
|
+
click.echo(f"❌ Error initializing enhanced parser: {e}", err=True)
|
399
|
+
if verbose:
|
400
|
+
click.echo(traceback.format_exc(), err=True)
|
401
|
+
sys.exit(1)
|
402
|
+
|
403
|
+
# Change to output directory if specified
|
404
|
+
original_cwd = os.getcwd()
|
405
|
+
if output_dir:
|
406
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
407
|
+
os.chdir(output_dir)
|
408
|
+
click.echo(f"📁 Output directory: {output_dir.absolute()}")
|
409
|
+
|
410
|
+
try:
|
411
|
+
# Parse the document with enhancement
|
412
|
+
click.echo(f"📄 Processing with enhancement: {pdf_path.name}")
|
413
|
+
parser.parse(str(pdf_path.absolute()), str(output_dir) if output_dir else None)
|
414
|
+
click.echo("✅ Enhanced document processing completed successfully!")
|
415
|
+
click.echo(f"📁 Output directory: {output_dir.absolute() if output_dir else 'outputs/'}")
|
416
|
+
|
417
|
+
except KeyboardInterrupt:
|
418
|
+
click.echo("\n⚠️ Processing interrupted by user", err=True)
|
419
|
+
sys.exit(130)
|
420
|
+
except Exception as e:
|
421
|
+
click.echo(f"❌ Error during enhanced parsing: {e}", err=True)
|
422
|
+
if verbose:
|
271
423
|
click.echo(traceback.format_exc(), err=True)
|
272
424
|
sys.exit(1)
|
273
425
|
finally:
|
@@ -375,7 +527,6 @@ def charts(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
|
|
375
527
|
except Exception as e:
|
376
528
|
click.echo(f"❌ Error during chart extraction: {e}", err=True)
|
377
529
|
if verbose:
|
378
|
-
import traceback
|
379
530
|
click.echo(traceback.format_exc(), err=True)
|
380
531
|
sys.exit(1)
|
381
532
|
|
@@ -453,7 +604,6 @@ def tables(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
|
|
453
604
|
except Exception as e:
|
454
605
|
click.echo(f"❌ Error during table extraction: {e}", err=True)
|
455
606
|
if verbose:
|
456
|
-
import traceback
|
457
607
|
click.echo(traceback.format_exc(), err=True)
|
458
608
|
sys.exit(1)
|
459
609
|
|
@@ -532,7 +682,6 @@ def both(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
|
|
532
682
|
except Exception as e:
|
533
683
|
click.echo(f"❌ Error during extraction: {e}", err=True)
|
534
684
|
if verbose:
|
535
|
-
import traceback
|
536
685
|
click.echo(traceback.format_exc(), err=True)
|
537
686
|
sys.exit(1)
|
538
687
|
|
@@ -621,7 +770,6 @@ def visualize(pdf_path: Path, pages: int, columns: int, width: int,
|
|
621
770
|
except Exception as e:
|
622
771
|
click.echo(f"❌ Error creating visualization: {e}", err=True)
|
623
772
|
if verbose:
|
624
|
-
import traceback
|
625
773
|
click.echo(traceback.format_exc(), err=True)
|
626
774
|
sys.exit(1)
|
627
775
|
|
@@ -654,7 +802,6 @@ def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbo
|
|
654
802
|
click.echo(f"🔍 Analyzing: {pdf_path.name}")
|
655
803
|
|
656
804
|
# Create layout engine for analysis only
|
657
|
-
from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
|
658
805
|
|
659
806
|
if verbose:
|
660
807
|
click.echo(f" Using model: {layout_model}")
|
@@ -752,7 +899,6 @@ def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbo
|
|
752
899
|
except Exception as e:
|
753
900
|
click.echo(f"❌ Error analyzing PDF: {e}", err=True)
|
754
901
|
if verbose:
|
755
|
-
import traceback
|
756
902
|
click.echo(traceback.format_exc(), err=True)
|
757
903
|
sys.exit(1)
|
758
904
|
|
@@ -771,7 +917,6 @@ def info():
|
|
771
917
|
click.echo("=" * 50)
|
772
918
|
|
773
919
|
# Check Python version
|
774
|
-
import sys
|
775
920
|
python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
|
776
921
|
click.echo(f"Python version: {python_version}")
|
777
922
|
|
@@ -782,6 +927,9 @@ def info():
|
|
782
927
|
('pytesseract', 'pytesseract', 'OCR engine'),
|
783
928
|
('tqdm', 'tqdm', 'Progress bars'),
|
784
929
|
('click', 'click', 'CLI framework'),
|
930
|
+
('skimage', 'scikit-image', 'DocRes image restoration'),
|
931
|
+
('torch', 'torch', 'DocRes neural networks'),
|
932
|
+
('huggingface_hub', 'huggingface_hub', 'Hugging Face model downloads'),
|
785
933
|
]
|
786
934
|
|
787
935
|
click.echo("\nCore Dependencies:")
|
@@ -811,6 +959,7 @@ def info():
|
|
811
959
|
# Available commands
|
812
960
|
click.echo("\nAvailable Commands:")
|
813
961
|
click.echo(" 📄 parse - Full document processing (text, tables, charts, figures)")
|
962
|
+
click.echo(" ✨ enhance - Enhanced parsing with DocRes image restoration")
|
814
963
|
click.echo(" 📊 extract - Chart/table extraction only")
|
815
964
|
click.echo(" ├─ charts - Extract only charts")
|
816
965
|
click.echo(" ├─ tables - Extract only tables")
|
@@ -845,9 +994,21 @@ def info():
|
|
845
994
|
else:
|
846
995
|
click.echo(" VLM_API_KEY: (not set)")
|
847
996
|
|
997
|
+
# DocRes information
|
998
|
+
click.echo("\nDocRes Image Restoration:")
|
999
|
+
try:
|
1000
|
+
docres = DocResEngine()
|
1001
|
+
click.echo(f" ✅ DocRes available - {len(docres.get_supported_tasks())} restoration tasks")
|
1002
|
+
click.echo(" Tasks: dewarping, deshadowing, appearance, deblurring, binarization, end2end")
|
1003
|
+
click.echo(" 📥 Models: Downloaded from Hugging Face Hub")
|
1004
|
+
except Exception as e:
|
1005
|
+
click.echo(f" ⚠️ DocRes not available - {str(e)[:50]}...")
|
1006
|
+
click.echo(" Install with: pip install scikit-image torch huggingface_hub")
|
1007
|
+
|
848
1008
|
# Usage examples
|
849
1009
|
click.echo("\n💡 Quick Start Examples:")
|
850
1010
|
click.echo(" doctra parse document.pdf # Full document parsing")
|
1011
|
+
click.echo(" doctra enhance document.pdf # Enhanced parsing with DocRes")
|
851
1012
|
click.echo(" doctra extract both document.pdf --use-vlm # Charts & tables with VLM")
|
852
1013
|
click.echo(" doctra extract charts document.pdf # Only charts")
|
853
1014
|
click.echo(" doctra extract tables document.pdf # Only tables")
|
doctra/cli/utils.py
CHANGED
@@ -7,8 +7,10 @@ different CLI commands.
|
|
7
7
|
|
8
8
|
import click
|
9
9
|
import sys
|
10
|
+
import traceback
|
10
11
|
from typing import Optional, Dict, Any
|
11
12
|
from pathlib import Path
|
13
|
+
from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
|
12
14
|
|
13
15
|
|
14
16
|
def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str]) -> None:
|
@@ -58,7 +60,6 @@ def handle_exception(e: Exception, verbose: bool = False) -> None:
|
|
58
60
|
"""
|
59
61
|
click.echo(f"❌ Error: {e}", err=True)
|
60
62
|
if verbose:
|
61
|
-
import traceback
|
62
63
|
click.echo(traceback.format_exc(), err=True)
|
63
64
|
sys.exit(1)
|
64
65
|
|
@@ -271,8 +272,6 @@ def create_progress_callback(description: str, total: int):
|
|
271
272
|
:return: Callable progress callback function that takes an integer
|
272
273
|
representing the number of completed items
|
273
274
|
"""
|
274
|
-
import sys
|
275
|
-
from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
|
276
275
|
|
277
276
|
# Enhanced environment detection
|
278
277
|
is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
|
doctra/engines/image_restoration/__init__.py
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
"""
|
2
|
+
Image Restoration Engines
|
3
|
+
|
4
|
+
This module provides image restoration capabilities for document processing.
|
5
|
+
Currently supports DocRes for various document image restoration tasks.
|
6
|
+
"""
|
7
|
+
|
8
|
+
from .docres_engine import DocResEngine
|
9
|
+
|
10
|
+
__all__ = ['DocResEngine']
|