doctra 0.4.2__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {doctra-0.4.2/doctra.egg-info → doctra-0.5.0}/PKG-INFO +332 -74
- doctra-0.5.0/README.md +689 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/cli/main.py +10 -23
- {doctra-0.4.2 → doctra-0.5.0}/doctra/cli/utils.py +7 -6
- doctra-0.5.0/doctra/engines/vlm/provider.py +257 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/parsers/structured_pdf_parser.py +8 -5
- {doctra-0.4.2 → doctra-0.5.0}/doctra/ui/enhanced_parser_ui.py +2 -2
- {doctra-0.4.2 → doctra-0.5.0}/doctra/ui/full_parse_ui.py +2 -2
- {doctra-0.4.2 → doctra-0.5.0}/doctra/ui/tables_charts_ui.py +2 -2
- {doctra-0.4.2 → doctra-0.5.0}/doctra/ui/ui_helpers.py +5 -4
- {doctra-0.4.2 → doctra-0.5.0}/doctra/version.py +1 -1
- {doctra-0.4.2 → doctra-0.5.0/doctra.egg-info}/PKG-INFO +332 -74
- {doctra-0.4.2 → doctra-0.5.0}/doctra.egg-info/SOURCES.txt +1 -0
- doctra-0.5.0/doctra.egg-info/entry_points.txt +2 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra.egg-info/requires.txt +1 -0
- {doctra-0.4.2 → doctra-0.5.0}/pyproject.toml +4 -0
- {doctra-0.4.2 → doctra-0.5.0}/setup.py +6 -0
- doctra-0.4.2/README.md +0 -432
- doctra-0.4.2/doctra/engines/vlm/provider.py +0 -86
- {doctra-0.4.2 → doctra-0.5.0}/LICENSE +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/MANIFEST.in +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/cli/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/engines/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/engines/image_restoration/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/engines/image_restoration/docres_engine.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/engines/layout/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/engines/layout/layout_models.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/engines/layout/paddle_layout.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/engines/ocr/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/engines/ocr/api.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/engines/ocr/path_resolver.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/engines/ocr/pytesseract_engine.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/engines/vlm/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/engines/vlm/outlines_types.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/engines/vlm/service.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/exporters/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/exporters/excel_writer.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/exporters/html_writer.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/exporters/image_saver.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/exporters/markdown_table.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/exporters/markdown_writer.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/parsers/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/parsers/enhanced_pdf_parser.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/parsers/layout_order.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/parsers/table_chart_extractor.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/MBD.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/MBD_utils.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/infer.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/model/deep_lab_model/aspp.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/drn.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/mobilenet.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/resnet.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/xception.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/model/deep_lab_model/decoder.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/model/deep_lab_model/deeplab.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/batchnorm.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/comm.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/replicate.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/data/preprocess/crop_merge_image.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/inference.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/models/restormer_arch.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/third_party/docres/utils.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/ui/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/ui/app.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/ui/docres_ui.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/ui/docres_wrapper.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/utils/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/utils/bbox.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/utils/constants.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/utils/file_ops.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/utils/io_utils.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/utils/ocr_utils.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/utils/pdf_io.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/utils/progress.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/utils/quiet.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra/utils/structured_utils.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra.egg-info/dependency_links.txt +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra.egg-info/not-zip-safe +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/doctra.egg-info/top_level.txt +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/requirements.txt +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/setup.cfg +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/tests/test_structured_pdf_parser.py +0 -0
- {doctra-0.4.2 → doctra-0.5.0}/tests/test_table_chart_extractor.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: doctra
|
3
|
-
Version: 0.4.2
|
3
|
+
Version: 0.5.0
|
4
4
|
Summary: Parse, extract, and analyze documents with ease
|
5
5
|
Home-page: https://github.com/AdemBoukhris457/Doctra
|
6
6
|
Author: Adem Boukhris
|
@@ -240,6 +240,7 @@ Requires-Dist: anthropic>=0.40.0
|
|
240
240
|
Requires-Dist: outlines>=0.0.34
|
241
241
|
Requires-Dist: tqdm>=4.62.0
|
242
242
|
Requires-Dist: matplotlib>=3.5.0
|
243
|
+
Requires-Dist: click>=8.0.0
|
243
244
|
Provides-Extra: openai
|
244
245
|
Requires-Dist: openai>=1.0.0; extra == "openai"
|
245
246
|
Provides-Extra: gemini
|
@@ -259,26 +260,30 @@ Dynamic: requires-python
|
|
259
260
|
|
260
261
|
# 🚀 **Doctra - Document Parser Library** 📑🔎
|
261
262
|
|
262
|
-

|
263
264
|
|
264
265
|
<div align="center">
|
265
266
|
|
266
267
|
[](https://github.com/AdemBoukhris457/Doctra)
|
267
268
|
[](https://github.com/AdemBoukhris457/Doctra)
|
268
269
|
[](https://pypi.org/project/doctra/)
|
270
|
+
[](https://ademboukhris457.github.io/Doctra/index.html)
|
269
271
|
</div>
|
270
272
|
|
271
273
|
## 📋 Table of Contents
|
272
274
|
|
273
|
-
- [Installation](
|
274
|
-
- [Quick Start](
|
275
|
-
- [Core Components](
|
275
|
+
- [Installation](#🛠️-installation)
|
276
|
+
- [Quick Start](#⚡-quick-start)
|
277
|
+
- [Core Components](#🔧-core-components)
|
276
278
|
- [StructuredPDFParser](#structuredpdfparser)
|
279
|
+
- [EnhancedPDFParser](#enhancedpdfparser)
|
277
280
|
- [ChartTablePDFParser](#charttablepdfparser)
|
278
|
-
- [
|
279
|
-
- [
|
280
|
-
- [
|
281
|
-
- [
|
281
|
+
- [DocResEngine](#docresengine)
|
282
|
+
- [Web UI (Gradio)](#🖥️-web-ui-gradio)
|
283
|
+
- [Command Line Interface](#command-line-interface)
|
284
|
+
- [Visualization](#🎨-visualization)
|
285
|
+
- [Usage Examples](#📖-usage-examples)
|
286
|
+
- [Features](#✨-features)
|
282
287
|
|
283
288
|
## 🛠️ Installation
|
284
289
|
|
@@ -391,6 +396,70 @@ parser = StructuredPDFParser(
|
|
391
396
|
)
|
392
397
|
```
|
393
398
|
|
399
|
+
### EnhancedPDFParser
|
400
|
+
|
401
|
+
The `EnhancedPDFParser` extends the `StructuredPDFParser` with advanced image restoration capabilities using DocRes. This parser is ideal for processing scanned documents, low-quality PDFs, or documents with visual distortions that need enhancement before parsing.
|
402
|
+
|
403
|
+
#### Key Features:
|
404
|
+
- **Image Restoration**: Uses DocRes for document enhancement before processing
|
405
|
+
- **Multiple Restoration Tasks**: Supports dewarping, deshadowing, appearance enhancement, deblurring, binarization, and end-to-end restoration
|
406
|
+
- **Enhanced Quality**: Improves document quality for better OCR and layout detection
|
407
|
+
- **All StructuredPDFParser Features**: Inherits all capabilities of the base parser
|
408
|
+
- **Flexible Configuration**: Extensive options for restoration and processing
|
409
|
+
|
410
|
+
#### Basic Usage:
|
411
|
+
|
412
|
+
```python
|
413
|
+
from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
|
414
|
+
|
415
|
+
# Basic enhanced parser with image restoration
|
416
|
+
parser = EnhancedPDFParser(
|
417
|
+
use_image_restoration=True,
|
418
|
+
restoration_task="appearance" # Default restoration task
|
419
|
+
)
|
420
|
+
|
421
|
+
# Parse document with enhancement
|
422
|
+
parser.parse("scanned_document.pdf")
|
423
|
+
```
|
424
|
+
|
425
|
+
#### Advanced Configuration:
|
426
|
+
|
427
|
+
```python
|
428
|
+
parser = EnhancedPDFParser(
|
429
|
+
# Image Restoration Settings
|
430
|
+
use_image_restoration=True,
|
431
|
+
restoration_task="dewarping", # Correct perspective distortion
|
432
|
+
restoration_device="cuda", # Use GPU for faster processing
|
433
|
+
restoration_dpi=300, # Higher DPI for better quality
|
434
|
+
|
435
|
+
# VLM Settings
|
436
|
+
use_vlm=True,
|
437
|
+
vlm_provider="openai",
|
438
|
+
vlm_model="gpt-4-vision",
|
439
|
+
vlm_api_key="your_api_key",
|
440
|
+
|
441
|
+
# Layout Detection Settings
|
442
|
+
layout_model_name="PP-DocLayout_plus-L",
|
443
|
+
dpi=200,
|
444
|
+
min_score=0.5,
|
445
|
+
|
446
|
+
# OCR Settings
|
447
|
+
ocr_lang="eng",
|
448
|
+
ocr_psm=6
|
449
|
+
)
|
450
|
+
```
|
451
|
+
|
452
|
+
#### DocRes Restoration Tasks:
|
453
|
+
|
454
|
+
| Task | Description | Best For |
|
455
|
+
|------|-------------|----------|
|
456
|
+
| `appearance` | General appearance enhancement | Most documents (default) |
|
457
|
+
| `dewarping` | Correct perspective distortion | Scanned documents with perspective issues |
|
458
|
+
| `deshadowing` | Remove shadows and lighting artifacts | Documents with shadow problems |
|
459
|
+
| `deblurring` | Reduce blur and improve sharpness | Blurry or low-quality scans |
|
460
|
+
| `binarization` | Convert to black and white | Documents needing clean binarization |
|
461
|
+
| `end2end` | Complete restoration pipeline | Severely degraded documents |
|
462
|
+
|
394
463
|
### ChartTablePDFParser
|
395
464
|
|
396
465
|
The `ChartTablePDFParser` is a specialized parser focused specifically on extracting charts and tables from PDF documents. It's optimized for scenarios where you only need these specific elements, providing faster processing and more targeted output.
|
@@ -444,6 +513,163 @@ parser = ChartTablePDFParser(
|
|
444
513
|
)
|
445
514
|
```
|
446
515
|
|
516
|
+
### DocResEngine
|
517
|
+
|
518
|
+
The `DocResEngine` provides direct access to DocRes image restoration capabilities. This engine is perfect for standalone image restoration tasks or when you need fine-grained control over the restoration process.
|
519
|
+
|
520
|
+
#### Key Features:
|
521
|
+
- **Direct Image Restoration**: Process individual images or entire PDFs
|
522
|
+
- **Multiple Restoration Tasks**: All 6 DocRes restoration tasks available
|
523
|
+
- **GPU Acceleration**: Automatic CUDA detection and optimization
|
524
|
+
- **Flexible Input/Output**: Support for various image formats and PDFs
|
525
|
+
- **Metadata Extraction**: Get detailed information about the restoration process
|
526
|
+
|
527
|
+
#### Basic Usage:
|
528
|
+
|
529
|
+
```python
|
530
|
+
from doctra.engines.image_restoration import DocResEngine
|
531
|
+
|
532
|
+
# Initialize DocRes engine
|
533
|
+
docres = DocResEngine(device="cuda") # or "cpu" or None for auto-detect
|
534
|
+
|
535
|
+
# Restore a single image
|
536
|
+
restored_img, metadata = docres.restore_image(
|
537
|
+
image="path/to/image.jpg",
|
538
|
+
task="appearance"
|
539
|
+
)
|
540
|
+
|
541
|
+
# Restore entire PDF
|
542
|
+
enhanced_pdf = docres.restore_pdf(
|
543
|
+
pdf_path="document.pdf",
|
544
|
+
output_path="enhanced_document.pdf",
|
545
|
+
task="appearance"
|
546
|
+
)
|
547
|
+
```
|
548
|
+
|
549
|
+
#### Advanced Usage:
|
550
|
+
|
551
|
+
```python
|
552
|
+
# Initialize with custom settings
|
553
|
+
docres = DocResEngine(
|
554
|
+
device="cuda", # Force GPU usage
|
555
|
+
use_half_precision=True, # Use half precision for faster processing
|
556
|
+
model_path="custom/model.pth", # Custom model path (optional)
|
557
|
+
mbd_path="custom/mbd.pth" # Custom MBD model path (optional)
|
558
|
+
)
|
559
|
+
|
560
|
+
# Process multiple images
|
561
|
+
images = ["doc1.jpg", "doc2.jpg", "doc3.jpg"]
|
562
|
+
for img_path in images:
|
563
|
+
restored_img, metadata = docres.restore_image(
|
564
|
+
image=img_path,
|
565
|
+
task="dewarping"
|
566
|
+
)
|
567
|
+
print(f"Processed {img_path}: {metadata}")
|
568
|
+
|
569
|
+
# Batch PDF processing
|
570
|
+
pdfs = ["report1.pdf", "report2.pdf"]
|
571
|
+
for pdf_path in pdfs:
|
572
|
+
output_path = f"enhanced_{os.path.basename(pdf_path)}"
|
573
|
+
docres.restore_pdf(
|
574
|
+
pdf_path=pdf_path,
|
575
|
+
output_path=output_path,
|
576
|
+
task="end2end" # Complete restoration pipeline
|
577
|
+
)
|
578
|
+
```
|
579
|
+
|
580
|
+
#### Supported Restoration Tasks:
|
581
|
+
|
582
|
+
| Task | Description | Use Case |
|
583
|
+
|------|-------------|----------|
|
584
|
+
| `appearance` | General appearance enhancement | Default choice for most documents |
|
585
|
+
| `dewarping` | Correct document perspective distortion | Scanned documents with perspective issues |
|
586
|
+
| `deshadowing` | Remove shadows and lighting artifacts | Documents with shadow problems |
|
587
|
+
| `deblurring` | Reduce blur and improve sharpness | Blurry or low-quality scans |
|
588
|
+
| `binarization` | Convert to black and white | Documents needing clean binarization |
|
589
|
+
| `end2end` | Complete restoration pipeline | Severely degraded documents |
|
590
|
+
|
591
|
+
## 🖥️ Web UI (Gradio)
|
592
|
+
|
593
|
+
Doctra provides a comprehensive web interface built with Gradio that makes document processing accessible to non-technical users.
|
594
|
+
|
595
|
+
#### Features:
|
596
|
+
- **Drag & Drop Interface**: Upload PDFs by dragging and dropping
|
597
|
+
- **Multiple Parsers**: Choose between full parsing, enhanced parsing, and chart/table extraction
|
598
|
+
- **Real-time Processing**: See progress as documents are processed
|
599
|
+
- **VLM Integration**: Configure API keys for AI features
|
600
|
+
- **Output Preview**: View results directly in the browser
|
601
|
+
- **Download Results**: Download processed files as ZIP archives
|
602
|
+
|
603
|
+
#### Launch the Web UI:
|
604
|
+
|
605
|
+
```python
|
606
|
+
from doctra.ui.app import launch_ui
|
607
|
+
|
608
|
+
# Launch the web interface
|
609
|
+
launch_ui()
|
610
|
+
```
|
611
|
+
|
612
|
+
Or from command line:
|
613
|
+
```bash
|
614
|
+
python gradio_app.py
|
615
|
+
```
|
616
|
+
|
617
|
+
#### Web UI Components:
|
618
|
+
|
619
|
+
1. **Full Parse Tab**: Complete document processing with page navigation
|
620
|
+
2. **Tables & Charts Tab**: Specialized extraction with VLM integration
|
621
|
+
3. **DocRes Tab**: Image restoration with before/after comparison
|
622
|
+
4. **Enhanced Parser Tab**: Enhanced parsing with DocRes integration
|
623
|
+
|
624
|
+
## Command Line Interface
|
625
|
+
|
626
|
+
Doctra includes a powerful CLI for batch processing and automation.
|
627
|
+
|
628
|
+
#### Available Commands:
|
629
|
+
|
630
|
+
```bash
|
631
|
+
# Full document parsing
|
632
|
+
doctra parse document.pdf
|
633
|
+
|
634
|
+
# Enhanced parsing with image restoration
|
635
|
+
doctra enhance document.pdf --restoration-task appearance
|
636
|
+
|
637
|
+
# Extract only charts and tables
|
638
|
+
doctra extract charts document.pdf
|
639
|
+
doctra extract tables document.pdf
|
640
|
+
doctra extract both document.pdf --use-vlm
|
641
|
+
|
642
|
+
# Visualize layout detection
|
643
|
+
doctra visualize document.pdf
|
644
|
+
|
645
|
+
# Quick document analysis
|
646
|
+
doctra analyze document.pdf
|
647
|
+
|
648
|
+
# System information
|
649
|
+
doctra info
|
650
|
+
```
|
651
|
+
|
652
|
+
#### CLI Examples:
|
653
|
+
|
654
|
+
```bash
|
655
|
+
# Enhanced parsing with custom settings
|
656
|
+
doctra enhance document.pdf \
|
657
|
+
--restoration-task dewarping \
|
658
|
+
--restoration-device cuda \
|
659
|
+
--use-vlm \
|
660
|
+
--vlm-provider openai \
|
661
|
+
--vlm-api-key your_key
|
662
|
+
|
663
|
+
# Extract charts with VLM
|
664
|
+
doctra extract charts document.pdf \
|
665
|
+
--use-vlm \
|
666
|
+
--vlm-provider gemini \
|
667
|
+
--vlm-api-key your_key
|
668
|
+
|
669
|
+
# Batch processing
|
670
|
+
doctra parse *.pdf --output-dir results/
|
671
|
+
```
|
672
|
+
|
447
673
|
## 🎨 Visualization
|
448
674
|
|
449
675
|
Doctra provides powerful visualization capabilities to help you understand how the layout detection works and verify the accuracy of element extraction.
|
@@ -540,7 +766,53 @@ parser.parse("financial_report.pdf")
|
|
540
766
|
# - Markdown file with all content
|
541
767
|
```
|
542
768
|
|
543
|
-
### Example 2:
|
769
|
+
### Example 2: Enhanced Parsing with Image Restoration
|
770
|
+
|
771
|
+
```python
|
772
|
+
from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
|
773
|
+
|
774
|
+
# Initialize enhanced parser with image restoration
|
775
|
+
parser = EnhancedPDFParser(
|
776
|
+
use_image_restoration=True,
|
777
|
+
restoration_task="dewarping", # Correct perspective distortion
|
778
|
+
restoration_device="cuda", # Use GPU for faster processing
|
779
|
+
use_vlm=True,
|
780
|
+
vlm_provider="openai",
|
781
|
+
vlm_api_key="your_api_key"
|
782
|
+
)
|
783
|
+
|
784
|
+
# Process scanned document with enhancement
|
785
|
+
parser.parse("scanned_document.pdf")
|
786
|
+
|
787
|
+
# Output will include:
|
788
|
+
# - Enhanced PDF with restored images
|
789
|
+
# - All standard parsing outputs
|
790
|
+
# - Improved OCR accuracy due to restoration
|
791
|
+
```
|
792
|
+
|
793
|
+
### Example 3: Direct Image Restoration
|
794
|
+
|
795
|
+
```python
|
796
|
+
from doctra.engines.image_restoration import DocResEngine
|
797
|
+
|
798
|
+
# Initialize DocRes engine
|
799
|
+
docres = DocResEngine(device="cuda")
|
800
|
+
|
801
|
+
# Restore individual images
|
802
|
+
restored_img, metadata = docres.restore_image(
|
803
|
+
image="blurry_document.jpg",
|
804
|
+
task="deblurring"
|
805
|
+
)
|
806
|
+
|
807
|
+
# Restore entire PDF
|
808
|
+
docres.restore_pdf(
|
809
|
+
pdf_path="low_quality.pdf",
|
810
|
+
output_path="enhanced.pdf",
|
811
|
+
task="appearance"
|
812
|
+
)
|
813
|
+
```
|
814
|
+
|
815
|
+
### Example 4: Chart and Table Extraction with VLM
|
544
816
|
|
545
817
|
```python
|
546
818
|
from doctra.parsers.table_chart_extractor import ChartTablePDFParser
|
@@ -563,29 +835,42 @@ parser.parse("data_report.pdf", output_base_dir="extracted_data")
|
|
563
835
|
# - Markdown tables with extracted data
|
564
836
|
```
|
565
837
|
|
566
|
-
### Example
|
838
|
+
### Example 5: Web UI Usage
|
567
839
|
|
568
840
|
```python
|
569
|
-
from doctra.
|
841
|
+
from doctra.ui.app import launch_ui
|
570
842
|
|
571
|
-
#
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
ocr_psm=6, # Uniform block of text
|
582
|
-
box_separator="\n\n" # Double line breaks between elements
|
583
|
-
)
|
843
|
+
# Launch the web interface
|
844
|
+
launch_ui()
|
845
|
+
|
846
|
+
# Or build the interface programmatically
|
847
|
+
from doctra.ui.app import build_demo
|
848
|
+
demo = build_demo()
|
849
|
+
demo.launch(share=True) # Share publicly
|
850
|
+
```
|
851
|
+
|
852
|
+
### Example 6: Command Line Usage
|
584
853
|
|
585
|
-
|
854
|
+
```bash
|
855
|
+
# Enhanced parsing with custom settings
|
856
|
+
doctra enhance document.pdf \
|
857
|
+
--restoration-task dewarping \
|
858
|
+
--restoration-device cuda \
|
859
|
+
--use-vlm \
|
860
|
+
--vlm-provider openai \
|
861
|
+
--vlm-api-key your_key
|
862
|
+
|
863
|
+
# Extract charts with VLM
|
864
|
+
doctra extract charts document.pdf \
|
865
|
+
--use-vlm \
|
866
|
+
--vlm-provider gemini \
|
867
|
+
--vlm-api-key your_key
|
868
|
+
|
869
|
+
# Batch processing
|
870
|
+
doctra parse *.pdf --output-dir results/
|
586
871
|
```
|
587
872
|
|
588
|
-
### Example
|
873
|
+
### Example 7: Layout Visualization
|
589
874
|
|
590
875
|
```python
|
591
876
|
from doctra.parsers.structured_pdf_parser import StructuredPDFParser
|
@@ -624,68 +909,41 @@ parser.display_pages_with_boxes("document.pdf")
|
|
624
909
|
- Organized output directory structure
|
625
910
|
- High-resolution image preservation
|
626
911
|
|
912
|
+
### 🔧 Image Restoration (DocRes)
|
913
|
+
- **6 Restoration Tasks**: Dewarping, deshadowing, appearance enhancement, deblurring, binarization, and end-to-end restoration
|
914
|
+
- **GPU Acceleration**: Automatic CUDA detection and optimization
|
915
|
+
- **Enhanced Quality**: Improves document quality for better OCR and layout detection
|
916
|
+
- **Flexible Processing**: Standalone image restoration or integrated with parsing
|
917
|
+
|
627
918
|
### 🤖 VLM Integration
|
628
919
|
- Vision Language Model support for structured data extraction
|
629
|
-
- Multiple provider options (Gemini,
|
920
|
+
- Multiple provider options (OpenAI, Gemini, Anthropic, OpenRouter)
|
630
921
|
- Automatic conversion of charts and tables to structured formats
|
631
922
|
|
632
923
|
### 📊 Multiple Output Formats
|
633
924
|
- **Markdown**: Human-readable document with embedded images and tables
|
634
925
|
- **Excel**: Structured data in spreadsheet format
|
635
926
|
- **JSON**: Programmatically accessible structured data
|
927
|
+
- **HTML**: Interactive web-ready documents
|
636
928
|
- **Images**: High-quality cropped visual elements
|
637
929
|
|
930
|
+
### 🖥️ User Interfaces
|
931
|
+
- **Web UI**: Gradio-based interface with drag & drop functionality
|
932
|
+
- **Command Line**: Powerful CLI for batch processing and automation
|
933
|
+
- **Multiple Tabs**: Full parsing, enhanced parsing, chart/table extraction, and image restoration
|
934
|
+
|
638
935
|
### ⚙️ Flexible Configuration
|
639
936
|
- Extensive customization options
|
640
937
|
- Performance tuning parameters
|
641
938
|
- Output format selection
|
939
|
+
- Device selection (CPU/GPU)
|
642
940
|
|
643
|
-
##
|
644
|
-
|
645
|
-
### Core Dependencies
|
646
|
-
- **PaddleOCR**: Document layout detection
|
647
|
-
- **Outlines**: Structured output generation
|
648
|
-
- **Tesseract**: OCR text extraction
|
649
|
-
- **Pillow**: Image processing
|
650
|
-
- **OpenCV**: Computer vision operations
|
651
|
-
- **Pandas**: Data manipulation
|
652
|
-
- **OpenPyXL**: Excel file generation
|
653
|
-
- **Google Generative AI**: For Gemini VLM integration
|
654
|
-
- **OpenAI**: For GPT-5 VLM integration
|
655
|
-
|
656
|
-
## 🖥️ Web Interface (Gradio)
|
657
|
-
|
658
|
-
You can try Doctra in a simple web UI powered by Gradio.
|
659
|
-
|
660
|
-
### Run locally
|
661
|
-
|
662
|
-
```bash
|
663
|
-
pip install -U gradio
|
664
|
-
python gradio_app.py
|
665
|
-
```
|
666
|
-
|
667
|
-
Then open the printed URL (default `http://127.0.0.1:7860`).
|
668
|
-
|
669
|
-
Notes:
|
670
|
-
- If using VLM, set the API key field in the UI or export `VLM_API_KEY`.
|
671
|
-
- Outputs are saved under `outputs/<pdf_stem>/` and previewed in the UI.
|
672
|
-
|
673
|
-
### Deploy on Hugging Face Spaces
|
674
|
-
|
675
|
-
1) Create a new Space (type: Gradio, SDK: Python).
|
676
|
-
|
677
|
-
2) Add these files to the Space repo:
|
678
|
-
- Your package code (or install from PyPI).
|
679
|
-
- `gradio_app.py` (entry point).
|
680
|
-
- `requirements.txt` with at least:
|
681
|
-
|
682
|
-
```text
|
683
|
-
doctra
|
684
|
-
gradio
|
685
|
-
```
|
941
|
+
## 🙏 Acknowledgments
|
686
942
|
|
687
|
-
|
943
|
+
Doctra builds upon several excellent open-source projects:
|
688
944
|
|
689
|
-
|
945
|
+
- **[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)** - Advanced document layout detection and OCR capabilities
|
946
|
+
- **[DocRes](https://github.com/ZZZHANG-jx/DocRes)** - State-of-the-art document image restoration model
|
947
|
+
- **[Outlines](https://github.com/dottxt-ai/outlines)** - Structured output generation for LLMs
|
690
948
|
|
691
|
-
|
949
|
+
We thank the developers and contributors of these projects for their valuable work that makes Doctra possible.
|