doctra 0.4.2__tar.gz → 0.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {doctra-0.4.2/doctra.egg-info → doctra-0.4.3}/PKG-INFO +331 -74
- doctra-0.4.3/README.md +688 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/version.py +1 -1
- {doctra-0.4.2 → doctra-0.4.3/doctra.egg-info}/PKG-INFO +331 -74
- {doctra-0.4.2 → doctra-0.4.3}/doctra.egg-info/SOURCES.txt +1 -0
- doctra-0.4.3/doctra.egg-info/entry_points.txt +2 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra.egg-info/requires.txt +1 -0
- {doctra-0.4.2 → doctra-0.4.3}/pyproject.toml +4 -0
- {doctra-0.4.2 → doctra-0.4.3}/setup.py +6 -0
- doctra-0.4.2/README.md +0 -432
- {doctra-0.4.2 → doctra-0.4.3}/LICENSE +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/MANIFEST.in +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/cli/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/cli/main.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/cli/utils.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/engines/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/engines/image_restoration/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/engines/image_restoration/docres_engine.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/engines/layout/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/engines/layout/layout_models.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/engines/layout/paddle_layout.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/engines/ocr/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/engines/ocr/api.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/engines/ocr/path_resolver.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/engines/ocr/pytesseract_engine.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/engines/vlm/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/engines/vlm/outlines_types.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/engines/vlm/provider.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/engines/vlm/service.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/exporters/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/exporters/excel_writer.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/exporters/html_writer.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/exporters/image_saver.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/exporters/markdown_table.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/exporters/markdown_writer.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/parsers/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/parsers/enhanced_pdf_parser.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/parsers/layout_order.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/parsers/structured_pdf_parser.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/parsers/table_chart_extractor.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/MBD.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/MBD_utils.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/infer.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/model/deep_lab_model/aspp.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/drn.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/mobilenet.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/resnet.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/xception.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/model/deep_lab_model/decoder.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/model/deep_lab_model/deeplab.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/batchnorm.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/comm.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/replicate.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/data/preprocess/crop_merge_image.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/inference.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/models/restormer_arch.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/third_party/docres/utils.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/ui/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/ui/app.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/ui/docres_ui.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/ui/docres_wrapper.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/ui/enhanced_parser_ui.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/ui/full_parse_ui.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/ui/tables_charts_ui.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/ui/ui_helpers.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/utils/__init__.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/utils/bbox.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/utils/constants.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/utils/file_ops.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/utils/io_utils.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/utils/ocr_utils.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/utils/pdf_io.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/utils/progress.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/utils/quiet.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra/utils/structured_utils.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra.egg-info/dependency_links.txt +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra.egg-info/not-zip-safe +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/doctra.egg-info/top_level.txt +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/requirements.txt +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/setup.cfg +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/tests/test_structured_pdf_parser.py +0 -0
- {doctra-0.4.2 → doctra-0.4.3}/tests/test_table_chart_extractor.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: doctra
|
3
|
-
Version: 0.4.2
|
3
|
+
Version: 0.4.3
|
4
4
|
Summary: Parse, extract, and analyze documents with ease
|
5
5
|
Home-page: https://github.com/AdemBoukhris457/Doctra
|
6
6
|
Author: Adem Boukhris
|
@@ -240,6 +240,7 @@ Requires-Dist: anthropic>=0.40.0
|
|
240
240
|
Requires-Dist: outlines>=0.0.34
|
241
241
|
Requires-Dist: tqdm>=4.62.0
|
242
242
|
Requires-Dist: matplotlib>=3.5.0
|
243
|
+
Requires-Dist: click>=8.0.0
|
243
244
|
Provides-Extra: openai
|
244
245
|
Requires-Dist: openai>=1.0.0; extra == "openai"
|
245
246
|
Provides-Extra: gemini
|
@@ -259,7 +260,7 @@ Dynamic: requires-python
|
|
259
260
|
|
260
261
|
# 🚀 **Doctra - Document Parser Library** 📑🔎
|
261
262
|
|
262
|
-

|
263
264
|
|
264
265
|
<div align="center">
|
265
266
|
|
@@ -270,15 +271,18 @@ Dynamic: requires-python
|
|
270
271
|
|
271
272
|
## 📋 Table of Contents
|
272
273
|
|
273
|
-
- [Installation](
|
274
|
-
- [Quick Start](
|
275
|
-
- [Core Components](
|
274
|
+
- [Installation](#🛠️-installation)
|
275
|
+
- [Quick Start](#⚡-quick-start)
|
276
|
+
- [Core Components](#🔧-core-components)
|
276
277
|
- [StructuredPDFParser](#structuredpdfparser)
|
278
|
+
- [EnhancedPDFParser](#enhancedpdfparser)
|
277
279
|
- [ChartTablePDFParser](#charttablepdfparser)
|
278
|
-
- [
|
279
|
-
- [
|
280
|
-
- [
|
281
|
-
- [
|
280
|
+
- [DocResEngine](#docresengine)
|
281
|
+
- [Web UI (Gradio)](#🖥️-web-ui-gradio)
|
282
|
+
- [Command Line Interface](#command-line-interface)
|
283
|
+
- [Visualization](#🎨-visualization)
|
284
|
+
- [Usage Examples](#📖-usage-examples)
|
285
|
+
- [Features](#✨-features)
|
282
286
|
|
283
287
|
## 🛠️ Installation
|
284
288
|
|
@@ -391,6 +395,70 @@ parser = StructuredPDFParser(
|
|
391
395
|
)
|
392
396
|
```
|
393
397
|
|
398
|
+
### EnhancedPDFParser
|
399
|
+
|
400
|
+
The `EnhancedPDFParser` extends the `StructuredPDFParser` with advanced image restoration capabilities using DocRes. This parser is ideal for processing scanned documents, low-quality PDFs, or documents with visual distortions that need enhancement before parsing.
|
401
|
+
|
402
|
+
#### Key Features:
|
403
|
+
- **Image Restoration**: Uses DocRes for document enhancement before processing
|
404
|
+
- **Multiple Restoration Tasks**: Supports dewarping, deshadowing, appearance enhancement, deblurring, binarization, and end-to-end restoration
|
405
|
+
- **Enhanced Quality**: Improves document quality for better OCR and layout detection
|
406
|
+
- **All StructuredPDFParser Features**: Inherits all capabilities of the base parser
|
407
|
+
- **Flexible Configuration**: Extensive options for restoration and processing
|
408
|
+
|
409
|
+
#### Basic Usage:
|
410
|
+
|
411
|
+
```python
|
412
|
+
from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
|
413
|
+
|
414
|
+
# Basic enhanced parser with image restoration
|
415
|
+
parser = EnhancedPDFParser(
|
416
|
+
use_image_restoration=True,
|
417
|
+
restoration_task="appearance" # Default restoration task
|
418
|
+
)
|
419
|
+
|
420
|
+
# Parse document with enhancement
|
421
|
+
parser.parse("scanned_document.pdf")
|
422
|
+
```
|
423
|
+
|
424
|
+
#### Advanced Configuration:
|
425
|
+
|
426
|
+
```python
|
427
|
+
parser = EnhancedPDFParser(
|
428
|
+
# Image Restoration Settings
|
429
|
+
use_image_restoration=True,
|
430
|
+
restoration_task="dewarping", # Correct perspective distortion
|
431
|
+
restoration_device="cuda", # Use GPU for faster processing
|
432
|
+
restoration_dpi=300, # Higher DPI for better quality
|
433
|
+
|
434
|
+
# VLM Settings
|
435
|
+
use_vlm=True,
|
436
|
+
vlm_provider="openai",
|
437
|
+
vlm_model="gpt-4-vision",
|
438
|
+
vlm_api_key="your_api_key",
|
439
|
+
|
440
|
+
# Layout Detection Settings
|
441
|
+
layout_model_name="PP-DocLayout_plus-L",
|
442
|
+
dpi=200,
|
443
|
+
min_score=0.5,
|
444
|
+
|
445
|
+
# OCR Settings
|
446
|
+
ocr_lang="eng",
|
447
|
+
ocr_psm=6
|
448
|
+
)
|
449
|
+
```
|
450
|
+
|
451
|
+
#### DocRes Restoration Tasks:
|
452
|
+
|
453
|
+
| Task | Description | Best For |
|
454
|
+
|------|-------------|----------|
|
455
|
+
| `appearance` | General appearance enhancement | Most documents (default) |
|
456
|
+
| `dewarping` | Correct perspective distortion | Scanned documents with perspective issues |
|
457
|
+
| `deshadowing` | Remove shadows and lighting artifacts | Documents with shadow problems |
|
458
|
+
| `deblurring` | Reduce blur and improve sharpness | Blurry or low-quality scans |
|
459
|
+
| `binarization` | Convert to black and white | Documents needing clean binarization |
|
460
|
+
| `end2end` | Complete restoration pipeline | Severely degraded documents |
|
461
|
+
|
394
462
|
### ChartTablePDFParser
|
395
463
|
|
396
464
|
The `ChartTablePDFParser` is a specialized parser focused specifically on extracting charts and tables from PDF documents. It's optimized for scenarios where you only need these specific elements, providing faster processing and more targeted output.
|
@@ -444,6 +512,163 @@ parser = ChartTablePDFParser(
|
|
444
512
|
)
|
445
513
|
```
|
446
514
|
|
515
|
+
### DocResEngine
|
516
|
+
|
517
|
+
The `DocResEngine` provides direct access to DocRes image restoration capabilities. This engine is perfect for standalone image restoration tasks or when you need fine-grained control over the restoration process.
|
518
|
+
|
519
|
+
#### Key Features:
|
520
|
+
- **Direct Image Restoration**: Process individual images or entire PDFs
|
521
|
+
- **Multiple Restoration Tasks**: All 6 DocRes restoration tasks available
|
522
|
+
- **GPU Acceleration**: Automatic CUDA detection and optimization
|
523
|
+
- **Flexible Input/Output**: Support for various image formats and PDFs
|
524
|
+
- **Metadata Extraction**: Get detailed information about restoration process
|
525
|
+
|
526
|
+
#### Basic Usage:
|
527
|
+
|
528
|
+
```python
|
529
|
+
from doctra.engines.image_restoration import DocResEngine
|
530
|
+
|
531
|
+
# Initialize DocRes engine
|
532
|
+
docres = DocResEngine(device="cuda") # or "cpu" or None for auto-detect
|
533
|
+
|
534
|
+
# Restore a single image
|
535
|
+
restored_img, metadata = docres.restore_image(
|
536
|
+
image="path/to/image.jpg",
|
537
|
+
task="appearance"
|
538
|
+
)
|
539
|
+
|
540
|
+
# Restore entire PDF
|
541
|
+
enhanced_pdf = docres.restore_pdf(
|
542
|
+
pdf_path="document.pdf",
|
543
|
+
output_path="enhanced_document.pdf",
|
544
|
+
task="appearance"
|
545
|
+
)
|
546
|
+
```
|
547
|
+
|
548
|
+
#### Advanced Usage:
|
549
|
+
|
550
|
+
```python
|
551
|
+
# Initialize with custom settings
|
552
|
+
docres = DocResEngine(
|
553
|
+
device="cuda", # Force GPU usage
|
554
|
+
use_half_precision=True, # Use half precision for faster processing
|
555
|
+
model_path="custom/model.pth", # Custom model path (optional)
|
556
|
+
mbd_path="custom/mbd.pth" # Custom MBD model path (optional)
|
557
|
+
)
|
558
|
+
|
559
|
+
# Process multiple images
|
560
|
+
images = ["doc1.jpg", "doc2.jpg", "doc3.jpg"]
|
561
|
+
for img_path in images:
|
562
|
+
    restored_img, metadata = docres.restore_image(
|
563
|
+
        image=img_path,
|
564
|
+
        task="dewarping"
|
565
|
+
    )
|
566
|
+
    print(f"Processed {img_path}: {metadata}")
|
567
|
+
|
568
|
+
# Batch PDF processing (requires `import os` at the top of the script)
|
569
|
+
pdfs = ["report1.pdf", "report2.pdf"]
|
570
|
+
for pdf_path in pdfs:
|
571
|
+
    output_path = f"enhanced_{os.path.basename(pdf_path)}"
|
572
|
+
    docres.restore_pdf(
|
573
|
+
        pdf_path=pdf_path,
|
574
|
+
        output_path=output_path,
|
575
|
+
        task="end2end" # Complete restoration pipeline
|
576
|
+
    )
|
577
|
+
```
|
578
|
+
|
579
|
+
#### Supported Restoration Tasks:
|
580
|
+
|
581
|
+
| Task | Description | Use Case |
|
582
|
+
|------|-------------|----------|
|
583
|
+
| `appearance` | General appearance enhancement | Default choice for most documents |
|
584
|
+
| `dewarping` | Correct document perspective distortion | Scanned documents with perspective issues |
|
585
|
+
| `deshadowing` | Remove shadows and lighting artifacts | Documents with shadow problems |
|
586
|
+
| `deblurring` | Reduce blur and improve sharpness | Blurry or low-quality scans |
|
587
|
+
| `binarization` | Convert to black and white | Documents needing clean binarization |
|
588
|
+
| `end2end` | Complete restoration pipeline | Severely degraded documents |
|
589
|
+
|
590
|
+
## 🖥️ Web UI (Gradio)
|
591
|
+
|
592
|
+
Doctra provides a comprehensive web interface built with Gradio that makes document processing accessible to non-technical users.
|
593
|
+
|
594
|
+
#### Features:
|
595
|
+
- **Drag & Drop Interface**: Upload PDFs by dragging and dropping
|
596
|
+
- **Multiple Parsers**: Choose between full parsing, enhanced parsing, and chart/table extraction
|
597
|
+
- **Real-time Processing**: See progress as documents are processed
|
598
|
+
- **VLM Integration**: Configure API keys for AI features
|
599
|
+
- **Output Preview**: View results directly in the browser
|
600
|
+
- **Download Results**: Download processed files as ZIP archives
|
601
|
+
|
602
|
+
#### Launch the Web UI:
|
603
|
+
|
604
|
+
```python
|
605
|
+
from doctra.ui.app import launch_ui
|
606
|
+
|
607
|
+
# Launch the web interface
|
608
|
+
launch_ui()
|
609
|
+
```
|
610
|
+
|
611
|
+
Or from the command line:
|
612
|
+
```bash
|
613
|
+
python gradio_app.py
|
614
|
+
```
|
615
|
+
|
616
|
+
#### Web UI Components:
|
617
|
+
|
618
|
+
1. **Full Parse Tab**: Complete document processing with page navigation
|
619
|
+
2. **Tables & Charts Tab**: Specialized extraction with VLM integration
|
620
|
+
3. **DocRes Tab**: Image restoration with before/after comparison
|
621
|
+
4. **Enhanced Parser Tab**: Enhanced parsing with DocRes integration
|
622
|
+
|
623
|
+
## Command Line Interface
|
624
|
+
|
625
|
+
Doctra includes a powerful CLI for batch processing and automation.
|
626
|
+
|
627
|
+
#### Available Commands:
|
628
|
+
|
629
|
+
```bash
|
630
|
+
# Full document parsing
|
631
|
+
doctra parse document.pdf
|
632
|
+
|
633
|
+
# Enhanced parsing with image restoration
|
634
|
+
doctra enhance document.pdf --restoration-task appearance
|
635
|
+
|
636
|
+
# Extract only charts and tables
|
637
|
+
doctra extract charts document.pdf
|
638
|
+
doctra extract tables document.pdf
|
639
|
+
doctra extract both document.pdf --use-vlm
|
640
|
+
|
641
|
+
# Visualize layout detection
|
642
|
+
doctra visualize document.pdf
|
643
|
+
|
644
|
+
# Quick document analysis
|
645
|
+
doctra analyze document.pdf
|
646
|
+
|
647
|
+
# System information
|
648
|
+
doctra info
|
649
|
+
```
|
650
|
+
|
651
|
+
#### CLI Examples:
|
652
|
+
|
653
|
+
```bash
|
654
|
+
# Enhanced parsing with custom settings
|
655
|
+
doctra enhance document.pdf \
|
656
|
+
--restoration-task dewarping \
|
657
|
+
--restoration-device cuda \
|
658
|
+
--use-vlm \
|
659
|
+
--vlm-provider openai \
|
660
|
+
--vlm-api-key your_key
|
661
|
+
|
662
|
+
# Extract charts with VLM
|
663
|
+
doctra extract charts document.pdf \
|
664
|
+
--use-vlm \
|
665
|
+
--vlm-provider gemini \
|
666
|
+
--vlm-api-key your_key
|
667
|
+
|
668
|
+
# Batch processing
|
669
|
+
doctra parse *.pdf --output-dir results/
|
670
|
+
```
|
671
|
+
|
447
672
|
## 🎨 Visualization
|
448
673
|
|
449
674
|
Doctra provides powerful visualization capabilities to help you understand how the layout detection works and verify the accuracy of element extraction.
|
@@ -540,7 +765,53 @@ parser.parse("financial_report.pdf")
|
|
540
765
|
# - Markdown file with all content
|
541
766
|
```
|
542
767
|
|
543
|
-
### Example 2:
|
768
|
+
### Example 2: Enhanced Parsing with Image Restoration
|
769
|
+
|
770
|
+
```python
|
771
|
+
from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
|
772
|
+
|
773
|
+
# Initialize enhanced parser with image restoration
|
774
|
+
parser = EnhancedPDFParser(
|
775
|
+
use_image_restoration=True,
|
776
|
+
restoration_task="dewarping", # Correct perspective distortion
|
777
|
+
restoration_device="cuda", # Use GPU for faster processing
|
778
|
+
use_vlm=True,
|
779
|
+
vlm_provider="openai",
|
780
|
+
vlm_api_key="your_api_key"
|
781
|
+
)
|
782
|
+
|
783
|
+
# Process scanned document with enhancement
|
784
|
+
parser.parse("scanned_document.pdf")
|
785
|
+
|
786
|
+
# Output will include:
|
787
|
+
# - Enhanced PDF with restored images
|
788
|
+
# - All standard parsing outputs
|
789
|
+
# - Improved OCR accuracy due to restoration
|
790
|
+
```
|
791
|
+
|
792
|
+
### Example 3: Direct Image Restoration
|
793
|
+
|
794
|
+
```python
|
795
|
+
from doctra.engines.image_restoration import DocResEngine
|
796
|
+
|
797
|
+
# Initialize DocRes engine
|
798
|
+
docres = DocResEngine(device="cuda")
|
799
|
+
|
800
|
+
# Restore individual images
|
801
|
+
restored_img, metadata = docres.restore_image(
|
802
|
+
image="blurry_document.jpg",
|
803
|
+
task="deblurring"
|
804
|
+
)
|
805
|
+
|
806
|
+
# Restore entire PDF
|
807
|
+
docres.restore_pdf(
|
808
|
+
pdf_path="low_quality.pdf",
|
809
|
+
output_path="enhanced.pdf",
|
810
|
+
task="appearance"
|
811
|
+
)
|
812
|
+
```
|
813
|
+
|
814
|
+
### Example 4: Chart and Table Extraction with VLM
|
544
815
|
|
545
816
|
```python
|
546
817
|
from doctra.parsers.table_chart_extractor import ChartTablePDFParser
|
@@ -563,29 +834,42 @@ parser.parse("data_report.pdf", output_base_dir="extracted_data")
|
|
563
834
|
# - Markdown tables with extracted data
|
564
835
|
```
|
565
836
|
|
566
|
-
### Example
|
837
|
+
### Example 5: Web UI Usage
|
567
838
|
|
568
839
|
```python
|
569
|
-
from doctra.
|
840
|
+
from doctra.ui.app import launch_ui
|
570
841
|
|
571
|
-
#
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
ocr_psm=6, # Uniform block of text
|
582
|
-
box_separator="\n\n" # Double line breaks between elements
|
583
|
-
)
|
842
|
+
# Launch the web interface
|
843
|
+
launch_ui()
|
844
|
+
|
845
|
+
# Or build the interface programmatically
|
846
|
+
from doctra.ui.app import build_demo
|
847
|
+
demo = build_demo()
|
848
|
+
demo.launch(share=True) # Share publicly
|
849
|
+
```
|
850
|
+
|
851
|
+
### Example 6: Command Line Usage
|
584
852
|
|
585
|
-
|
853
|
+
```bash
|
854
|
+
# Enhanced parsing with custom settings
|
855
|
+
doctra enhance document.pdf \
|
856
|
+
--restoration-task dewarping \
|
857
|
+
--restoration-device cuda \
|
858
|
+
--use-vlm \
|
859
|
+
--vlm-provider openai \
|
860
|
+
--vlm-api-key your_key
|
861
|
+
|
862
|
+
# Extract charts with VLM
|
863
|
+
doctra extract charts document.pdf \
|
864
|
+
--use-vlm \
|
865
|
+
--vlm-provider gemini \
|
866
|
+
--vlm-api-key your_key
|
867
|
+
|
868
|
+
# Batch processing
|
869
|
+
doctra parse *.pdf --output-dir results/
|
586
870
|
```
|
587
871
|
|
588
|
-
### Example
|
872
|
+
### Example 7: Layout Visualization
|
589
873
|
|
590
874
|
```python
|
591
875
|
from doctra.parsers.structured_pdf_parser import StructuredPDFParser
|
@@ -624,68 +908,41 @@ parser.display_pages_with_boxes("document.pdf")
|
|
624
908
|
- Organized output directory structure
|
625
909
|
- High-resolution image preservation
|
626
910
|
|
911
|
+
### 🔧 Image Restoration (DocRes)
|
912
|
+
- **6 Restoration Tasks**: Dewarping, deshadowing, appearance enhancement, deblurring, binarization, and end-to-end restoration
|
913
|
+
- **GPU Acceleration**: Automatic CUDA detection and optimization
|
914
|
+
- **Enhanced Quality**: Improves document quality for better OCR and layout detection
|
915
|
+
- **Flexible Processing**: Standalone image restoration or integrated with parsing
|
916
|
+
|
627
917
|
### 🤖 VLM Integration
|
628
918
|
- Vision Language Model support for structured data extraction
|
629
|
-
- Multiple provider options (Gemini,
|
919
|
+
- Multiple provider options (OpenAI, Gemini, Anthropic, OpenRouter)
|
630
920
|
- Automatic conversion of charts and tables to structured formats
|
631
921
|
|
632
922
|
### 📊 Multiple Output Formats
|
633
923
|
- **Markdown**: Human-readable document with embedded images and tables
|
634
924
|
- **Excel**: Structured data in spreadsheet format
|
635
925
|
- **JSON**: Programmatically accessible structured data
|
926
|
+
- **HTML**: Interactive web-ready documents
|
636
927
|
- **Images**: High-quality cropped visual elements
|
637
928
|
|
929
|
+
### 🖥️ User Interfaces
|
930
|
+
- **Web UI**: Gradio-based interface with drag & drop functionality
|
931
|
+
- **Command Line**: Powerful CLI for batch processing and automation
|
932
|
+
- **Multiple Tabs**: Full parsing, enhanced parsing, chart/table extraction, and image restoration
|
933
|
+
|
638
934
|
### ⚙️ Flexible Configuration
|
639
935
|
- Extensive customization options
|
640
936
|
- Performance tuning parameters
|
641
937
|
- Output format selection
|
938
|
+
- Device selection (CPU/GPU)
|
642
939
|
|
643
|
-
##
|
644
|
-
|
645
|
-
### Core Dependencies
|
646
|
-
- **PaddleOCR**: Document layout detection
|
647
|
-
- **Outlines**: Structured output generation
|
648
|
-
- **Tesseract**: OCR text extraction
|
649
|
-
- **Pillow**: Image processing
|
650
|
-
- **OpenCV**: Computer vision operations
|
651
|
-
- **Pandas**: Data manipulation
|
652
|
-
- **OpenPyXL**: Excel file generation
|
653
|
-
- **Google Generative AI**: For Gemini VLM integration
|
654
|
-
- **OpenAI**: For GPT-5 VLM integration
|
655
|
-
|
656
|
-
## 🖥️ Web Interface (Gradio)
|
657
|
-
|
658
|
-
You can try Doctra in a simple web UI powered by Gradio.
|
659
|
-
|
660
|
-
### Run locally
|
661
|
-
|
662
|
-
```bash
|
663
|
-
pip install -U gradio
|
664
|
-
python gradio_app.py
|
665
|
-
```
|
666
|
-
|
667
|
-
Then open the printed URL (default `http://127.0.0.1:7860`).
|
668
|
-
|
669
|
-
Notes:
|
670
|
-
- If using VLM, set the API key field in the UI or export `VLM_API_KEY`.
|
671
|
-
- Outputs are saved under `outputs/<pdf_stem>/` and previewed in the UI.
|
672
|
-
|
673
|
-
### Deploy on Hugging Face Spaces
|
674
|
-
|
675
|
-
1) Create a new Space (type: Gradio, SDK: Python).
|
676
|
-
|
677
|
-
2) Add these files to the Space repo:
|
678
|
-
- Your package code (or install from PyPI).
|
679
|
-
- `gradio_app.py` (entry point).
|
680
|
-
- `requirements.txt` with at least:
|
681
|
-
|
682
|
-
```text
|
683
|
-
doctra
|
684
|
-
gradio
|
685
|
-
```
|
940
|
+
## 🙏 Acknowledgments
|
686
941
|
|
687
|
-
|
942
|
+
Doctra builds upon several excellent open-source projects:
|
688
943
|
|
689
|
-
|
944
|
+
- **[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)** - Advanced document layout detection and OCR capabilities
|
945
|
+
- **[DocRes](https://github.com/ZZZHANG-jx/DocRes)** - State-of-the-art document image restoration model
|
946
|
+
- **[Outlines](https://github.com/dottxt-ai/outlines)** - Structured output generation for LLMs
|
690
947
|
|
691
|
-
|
948
|
+
We thank the developers and contributors of these projects for their valuable work that makes Doctra possible.
|