doctra-0.3.3-py3-none-any.whl → doctra-0.4.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. doctra/__init__.py +4 -0
  2. doctra/cli/main.py +170 -9
  3. doctra/cli/utils.py +2 -3
  4. doctra/engines/image_restoration/__init__.py +10 -0
  5. doctra/engines/image_restoration/docres_engine.py +561 -0
  6. doctra/engines/vlm/outlines_types.py +13 -9
  7. doctra/engines/vlm/service.py +4 -2
  8. doctra/exporters/excel_writer.py +89 -0
  9. doctra/parsers/enhanced_pdf_parser.py +374 -0
  10. doctra/parsers/structured_pdf_parser.py +6 -0
  11. doctra/parsers/table_chart_extractor.py +6 -0
  12. doctra/third_party/docres/data/MBD/MBD.py +110 -0
  13. doctra/third_party/docres/data/MBD/MBD_utils.py +291 -0
  14. doctra/third_party/docres/data/MBD/infer.py +151 -0
  15. doctra/third_party/docres/data/MBD/model/deep_lab_model/aspp.py +95 -0
  16. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/__init__.py +13 -0
  17. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/drn.py +402 -0
  18. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/mobilenet.py +151 -0
  19. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/resnet.py +170 -0
  20. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/xception.py +288 -0
  21. doctra/third_party/docres/data/MBD/model/deep_lab_model/decoder.py +59 -0
  22. doctra/third_party/docres/data/MBD/model/deep_lab_model/deeplab.py +81 -0
  23. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/__init__.py +12 -0
  24. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/batchnorm.py +282 -0
  25. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/comm.py +129 -0
  26. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/replicate.py +88 -0
  27. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.py +29 -0
  28. doctra/third_party/docres/data/preprocess/crop_merge_image.py +142 -0
  29. doctra/third_party/docres/inference.py +370 -0
  30. doctra/third_party/docres/models/restormer_arch.py +308 -0
  31. doctra/third_party/docres/utils.py +464 -0
  32. doctra/ui/app.py +8 -14
  33. doctra/utils/structured_utils.py +5 -2
  34. doctra/version.py +1 -1
  35. {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/METADATA +1 -1
  36. doctra-0.4.1.dist-info/RECORD +67 -0
  37. doctra-0.3.3.dist-info/RECORD +0 -44
  38. {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/WHEEL +0 -0
  39. {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/licenses/LICENSE +0 -0
  40. {doctra-0.3.3.dist-info → doctra-0.4.1.dist-info}/top_level.txt +0 -0
doctra/__init__.py CHANGED
@@ -4,13 +4,17 @@ Parse, extract, and analyze documents with ease
4
4
  """
5
5
 
6
6
  from .parsers.structured_pdf_parser import StructuredPDFParser
7
+ from .parsers.enhanced_pdf_parser import EnhancedPDFParser
7
8
  from .parsers.table_chart_extractor import ChartTablePDFParser
9
+ from .engines.image_restoration import DocResEngine
8
10
  from .version import __version__
9
11
  from .ui import build_demo, launch_ui
10
12
 
11
13
  __all__ = [
12
14
  'StructuredPDFParser',
15
+ 'EnhancedPDFParser',
13
16
  'ChartTablePDFParser',
17
+ 'DocResEngine',
14
18
  'build_demo',
15
19
  'launch_ui',
16
20
  '__version__'
doctra/cli/main.py CHANGED
@@ -9,20 +9,27 @@ detection results, and analyze document structure from the command line.
9
9
  import click
10
10
  import os
11
11
  import sys
12
+ import traceback
12
13
  from pathlib import Path
13
14
  from typing import Optional
14
15
 
15
16
  # Import parsers
16
17
  try:
17
18
  from doctra.parsers.structured_pdf_parser_enhancer import StructuredPDFParser
19
+ from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
18
20
  from doctra.parsers.chart_table_pdf_parser import ChartTablePDFParser
19
21
  except ImportError:
20
22
  # Fallback for development/testing
21
23
  project_root = Path(__file__).parent.parent.parent
22
24
  sys.path.insert(0, str(project_root))
23
25
  from doctra.parsers.structured_pdf_parser import StructuredPDFParser
26
+ from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
24
27
  from doctra.parsers.table_chart_extractor import ChartTablePDFParser
25
28
 
29
+ # Import additional modules
30
+ from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
31
+ from doctra.engines.image_restoration import DocResEngine
32
+
26
33
 
27
34
  @click.group(invoke_without_command=True)
28
35
  @click.pass_context
@@ -37,6 +44,7 @@ def cli(ctx):
37
44
  \b
38
45
  Commands:
39
46
  parse Full document parsing with text, tables, charts, and figures
47
+ enhance Enhanced parsing with DocRes image restoration
40
48
  extract Extract only charts and/or tables from documents
41
49
  visualize Visualize layout detection results
42
50
  analyze Quick document analysis without processing
@@ -45,6 +53,7 @@ def cli(ctx):
45
53
  \b
46
54
  Examples:
47
55
  doctra parse document.pdf # Full document parsing
56
+ doctra enhance document.pdf # Enhanced parsing with image restoration
48
57
  doctra extract charts document.pdf # Extract only charts
49
58
  doctra extract both document.pdf --use-vlm # Extract charts & tables with VLM
50
59
  doctra visualize document.pdf # Visualize layout detection
@@ -243,7 +252,6 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
243
252
  except Exception as e:
244
253
  click.echo(f"❌ Error initializing parser: {e}", err=True)
245
254
  if verbose:
246
- import traceback
247
255
  click.echo(traceback.format_exc(), err=True)
248
256
  sys.exit(1)
249
257
 
@@ -267,7 +275,151 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
267
275
  except Exception as e:
268
276
  click.echo(f"❌ Error during parsing: {e}", err=True)
269
277
  if verbose:
270
- import traceback
278
+ click.echo(traceback.format_exc(), err=True)
279
+ sys.exit(1)
280
+ finally:
281
+ # Restore original working directory
282
+ os.chdir(original_cwd)
283
+
284
+
285
+ @cli.command()
286
+ @click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
287
+ @click.option('--output-dir', '-o', type=click.Path(path_type=Path),
288
+ help='Output directory (default: outputs/{pdf_filename}_enhanced)')
289
+ @click.option('--restoration-task', type=click.Choice(['dewarping', 'deshadowing', 'appearance', 'deblurring', 'binarization', 'end2end']),
290
+ default='appearance', help='DocRes restoration task (default: appearance)')
291
+ @click.option('--restoration-device', type=click.Choice(['cuda', 'cpu']),
292
+ help='Device for DocRes processing (default: auto-detect)')
293
+ @click.option('--restoration-dpi', type=int, default=200,
294
+ help='DPI for restoration processing (default: 200)')
295
+ @vlm_options
296
+ @layout_options
297
+ @ocr_options
298
+ @click.option('--box-separator', default='\n',
299
+ help='Separator between text boxes in output (default: newline)')
300
+ @click.option('--verbose', '-v', is_flag=True,
301
+ help='Enable verbose output')
302
+ def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
303
+ restoration_device: Optional[str], restoration_dpi: int,
304
+ use_vlm: bool, vlm_provider: str, vlm_model: Optional[str], vlm_api_key: Optional[str],
305
+ layout_model: str, dpi: int, min_score: float,
306
+ ocr_lang: str, ocr_psm: int, ocr_oem: int, ocr_config: str,
307
+ box_separator: str, verbose: bool):
308
+ """
309
+ Enhanced PDF parsing with DocRes image restoration.
310
+
311
+ Performs document processing with image restoration to improve quality
312
+ before layout detection and content extraction. Particularly useful for
313
+ scanned documents, low-quality PDFs, or documents with shadows/distortion.
314
+
315
+ \b
316
+ Restoration Tasks:
317
+ appearance - General appearance enhancement (default)
318
+ dewarping - Correct document perspective distortion
319
+ deshadowing - Remove shadows from documents
320
+ deblurring - Reduce blur in document images
321
+ binarization - Convert to clean black/white text
322
+ end2end - Complete pipeline: dewarping → deshadowing → appearance
323
+
324
+ \b
325
+ Examples:
326
+ doctra enhance document.pdf
327
+ doctra enhance document.pdf --restoration-task dewarping
328
+ doctra enhance document.pdf --restoration-task end2end --restoration-device cuda
329
+ doctra enhance document.pdf --use-vlm --vlm-api-key your_key
330
+ doctra enhance document.pdf -o ./enhanced_results --restoration-dpi 300
331
+ doctra enhance document.pdf --restoration-task deshadowing # Use different restoration task
332
+
333
+ :param pdf_path: Path to the input PDF file
334
+ :param output_dir: Output directory for results (optional)
335
+ :param restoration_task: DocRes restoration task to perform
336
+ :param restoration_device: Device for DocRes processing
337
+ :param restoration_dpi: DPI for restoration processing
338
+ :param use_vlm: Whether to use VLM for enhanced extraction
339
+ :param vlm_provider: VLM provider ('gemini' or 'openai')
340
+ :param vlm_model: Model name to use (defaults to provider-specific defaults)
341
+ :param vlm_api_key: API key for VLM provider
342
+ :param layout_model: Layout detection model name
343
+ :param dpi: DPI for PDF rendering
344
+ :param min_score: Minimum confidence score for layout detection
345
+ :param ocr_lang: OCR language code
346
+ :param ocr_psm: Tesseract page segmentation mode
347
+ :param ocr_oem: Tesseract OCR engine mode
348
+ :param ocr_config: Additional Tesseract configuration
349
+ :param box_separator: Separator between text boxes in output
350
+ :param verbose: Whether to enable verbose output
351
+ :return: None
352
+ """
353
+ validate_vlm_config(use_vlm, vlm_api_key)
354
+
355
+ if verbose:
356
+ click.echo(f"🔍 Starting enhanced PDF parsing with DocRes...")
357
+ click.echo(f" Input: {pdf_path}")
358
+ click.echo(f" Restoration task: {restoration_task}")
359
+ click.echo(f" Restoration device: {restoration_device or 'auto-detect'}")
360
+ click.echo(f" Restoration DPI: {restoration_dpi}")
361
+ if output_dir:
362
+ click.echo(f" Output: {output_dir}")
363
+
364
+ # Create enhanced parser instance
365
+ try:
366
+ if verbose:
367
+ click.echo(f"🔧 Initializing enhanced parser with DocRes...")
368
+ if use_vlm:
369
+ click.echo(f" VLM Provider: {vlm_provider}")
370
+ click.echo(f" VLM Model: {vlm_model or 'default'}")
371
+ click.echo(f" Layout Model: {layout_model}")
372
+ click.echo(f" DPI: {dpi}")
373
+ click.echo(f" OCR Language: {ocr_lang}")
374
+ else:
375
+ click.echo(f"🔧 Initializing enhanced parser with DocRes...")
376
+ if use_vlm:
377
+ click.echo(f" Using VLM: {vlm_provider}")
378
+
379
+ parser = EnhancedPDFParser(
380
+ use_image_restoration=True,
381
+ restoration_task=restoration_task,
382
+ restoration_device=restoration_device,
383
+ restoration_dpi=restoration_dpi,
384
+ use_vlm=use_vlm,
385
+ vlm_provider=vlm_provider,
386
+ vlm_model=vlm_model,
387
+ vlm_api_key=vlm_api_key,
388
+ layout_model_name=layout_model,
389
+ dpi=dpi,
390
+ min_score=min_score,
391
+ ocr_lang=ocr_lang,
392
+ ocr_psm=ocr_psm,
393
+ ocr_oem=ocr_oem,
394
+ ocr_extra_config=ocr_config,
395
+ box_separator=box_separator
396
+ )
397
+ except Exception as e:
398
+ click.echo(f"❌ Error initializing enhanced parser: {e}", err=True)
399
+ if verbose:
400
+ click.echo(traceback.format_exc(), err=True)
401
+ sys.exit(1)
402
+
403
+ # Change to output directory if specified
404
+ original_cwd = os.getcwd()
405
+ if output_dir:
406
+ output_dir.mkdir(parents=True, exist_ok=True)
407
+ os.chdir(output_dir)
408
+ click.echo(f"📁 Output directory: {output_dir.absolute()}")
409
+
410
+ try:
411
+ # Parse the document with enhancement
412
+ click.echo(f"📄 Processing with enhancement: {pdf_path.name}")
413
+ parser.parse(str(pdf_path.absolute()), str(output_dir) if output_dir else None)
414
+ click.echo("✅ Enhanced document processing completed successfully!")
415
+ click.echo(f"📁 Output directory: {output_dir.absolute() if output_dir else 'outputs/'}")
416
+
417
+ except KeyboardInterrupt:
418
+ click.echo("\n⚠️ Processing interrupted by user", err=True)
419
+ sys.exit(130)
420
+ except Exception as e:
421
+ click.echo(f"❌ Error during enhanced parsing: {e}", err=True)
422
+ if verbose:
271
423
  click.echo(traceback.format_exc(), err=True)
272
424
  sys.exit(1)
273
425
  finally:
@@ -375,7 +527,6 @@ def charts(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
375
527
  except Exception as e:
376
528
  click.echo(f"❌ Error during chart extraction: {e}", err=True)
377
529
  if verbose:
378
- import traceback
379
530
  click.echo(traceback.format_exc(), err=True)
380
531
  sys.exit(1)
381
532
 
@@ -453,7 +604,6 @@ def tables(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
453
604
  except Exception as e:
454
605
  click.echo(f"❌ Error during table extraction: {e}", err=True)
455
606
  if verbose:
456
- import traceback
457
607
  click.echo(traceback.format_exc(), err=True)
458
608
  sys.exit(1)
459
609
 
@@ -532,7 +682,6 @@ def both(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
532
682
  except Exception as e:
533
683
  click.echo(f"❌ Error during extraction: {e}", err=True)
534
684
  if verbose:
535
- import traceback
536
685
  click.echo(traceback.format_exc(), err=True)
537
686
  sys.exit(1)
538
687
 
@@ -621,7 +770,6 @@ def visualize(pdf_path: Path, pages: int, columns: int, width: int,
621
770
  except Exception as e:
622
771
  click.echo(f"❌ Error creating visualization: {e}", err=True)
623
772
  if verbose:
624
- import traceback
625
773
  click.echo(traceback.format_exc(), err=True)
626
774
  sys.exit(1)
627
775
 
@@ -654,7 +802,6 @@ def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbo
654
802
  click.echo(f"🔍 Analyzing: {pdf_path.name}")
655
803
 
656
804
  # Create layout engine for analysis only
657
- from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
658
805
 
659
806
  if verbose:
660
807
  click.echo(f" Using model: {layout_model}")
@@ -752,7 +899,6 @@ def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbo
752
899
  except Exception as e:
753
900
  click.echo(f"❌ Error analyzing PDF: {e}", err=True)
754
901
  if verbose:
755
- import traceback
756
902
  click.echo(traceback.format_exc(), err=True)
757
903
  sys.exit(1)
758
904
 
@@ -771,7 +917,6 @@ def info():
771
917
  click.echo("=" * 50)
772
918
 
773
919
  # Check Python version
774
- import sys
775
920
  python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
776
921
  click.echo(f"Python version: {python_version}")
777
922
 
@@ -782,6 +927,9 @@ def info():
782
927
  ('pytesseract', 'pytesseract', 'OCR engine'),
783
928
  ('tqdm', 'tqdm', 'Progress bars'),
784
929
  ('click', 'click', 'CLI framework'),
930
+ ('skimage', 'scikit-image', 'DocRes image restoration'),
931
+ ('torch', 'torch', 'DocRes neural networks'),
932
+ ('huggingface_hub', 'huggingface_hub', 'Hugging Face model downloads'),
785
933
  ]
786
934
 
787
935
  click.echo("\nCore Dependencies:")
@@ -811,6 +959,7 @@ def info():
811
959
  # Available commands
812
960
  click.echo("\nAvailable Commands:")
813
961
  click.echo(" 📄 parse - Full document processing (text, tables, charts, figures)")
962
+ click.echo(" ✨ enhance - Enhanced parsing with DocRes image restoration")
814
963
  click.echo(" 📊 extract - Chart/table extraction only")
815
964
  click.echo(" ├─ charts - Extract only charts")
816
965
  click.echo(" ├─ tables - Extract only tables")
@@ -845,9 +994,21 @@ def info():
845
994
  else:
846
995
  click.echo(" VLM_API_KEY: (not set)")
847
996
 
997
+ # DocRes information
998
+ click.echo("\nDocRes Image Restoration:")
999
+ try:
1000
+ docres = DocResEngine()
1001
+ click.echo(f" ✅ DocRes available - {len(docres.get_supported_tasks())} restoration tasks")
1002
+ click.echo(" Tasks: dewarping, deshadowing, appearance, deblurring, binarization, end2end")
1003
+ click.echo(" 📥 Models: Downloaded from Hugging Face Hub")
1004
+ except Exception as e:
1005
+ click.echo(f" ⚠️ DocRes not available - {str(e)[:50]}...")
1006
+ click.echo(" Install with: pip install scikit-image torch huggingface_hub")
1007
+
848
1008
  # Usage examples
849
1009
  click.echo("\n💡 Quick Start Examples:")
850
1010
  click.echo(" doctra parse document.pdf # Full document parsing")
1011
+ click.echo(" doctra enhance document.pdf # Enhanced parsing with DocRes")
851
1012
  click.echo(" doctra extract both document.pdf --use-vlm # Charts & tables with VLM")
852
1013
  click.echo(" doctra extract charts document.pdf # Only charts")
853
1014
  click.echo(" doctra extract tables document.pdf # Only tables")
doctra/cli/utils.py CHANGED
@@ -7,8 +7,10 @@ different CLI commands.
7
7
 
8
8
  import click
9
9
  import sys
10
+ import traceback
10
11
  from typing import Optional, Dict, Any
11
12
  from pathlib import Path
13
+ from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
12
14
 
13
15
 
14
16
  def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str]) -> None:
@@ -58,7 +60,6 @@ def handle_exception(e: Exception, verbose: bool = False) -> None:
58
60
  """
59
61
  click.echo(f"❌ Error: {e}", err=True)
60
62
  if verbose:
61
- import traceback
62
63
  click.echo(traceback.format_exc(), err=True)
63
64
  sys.exit(1)
64
65
 
@@ -271,8 +272,6 @@ def create_progress_callback(description: str, total: int):
271
272
  :return: Callable progress callback function that takes an integer
272
273
  representing the number of completed items
273
274
  """
274
- import sys
275
- from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
276
275
 
277
276
  # Enhanced environment detection
278
277
  is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
doctra/engines/image_restoration/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """
2
+ Image Restoration Engines
3
+
4
+ This module provides image restoration capabilities for document processing.
5
+ Currently supports DocRes for various document image restoration tasks.
6
+ """
7
+
8
+ from .docres_engine import DocResEngine
9
+
10
+ __all__ = ['DocResEngine']