doctra 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. doctra/__init__.py +4 -0
  2. doctra/cli/main.py +168 -0
  3. doctra/engines/image_restoration/__init__.py +10 -0
  4. doctra/engines/image_restoration/docres_engine.py +566 -0
  5. doctra/engines/vlm/service.py +0 -12
  6. doctra/parsers/enhanced_pdf_parser.py +370 -0
  7. doctra/parsers/structured_pdf_parser.py +11 -60
  8. doctra/parsers/table_chart_extractor.py +8 -44
  9. doctra/third_party/docres/data/MBD/MBD.py +110 -0
  10. doctra/third_party/docres/data/MBD/MBD_utils.py +291 -0
  11. doctra/third_party/docres/data/MBD/infer.py +151 -0
  12. doctra/third_party/docres/data/MBD/model/deep_lab_model/aspp.py +95 -0
  13. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/__init__.py +13 -0
  14. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/drn.py +402 -0
  15. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/mobilenet.py +151 -0
  16. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/resnet.py +170 -0
  17. doctra/third_party/docres/data/MBD/model/deep_lab_model/backbone/xception.py +288 -0
  18. doctra/third_party/docres/data/MBD/model/deep_lab_model/decoder.py +59 -0
  19. doctra/third_party/docres/data/MBD/model/deep_lab_model/deeplab.py +81 -0
  20. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/__init__.py +12 -0
  21. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/batchnorm.py +282 -0
  22. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/comm.py +129 -0
  23. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/replicate.py +88 -0
  24. doctra/third_party/docres/data/MBD/model/deep_lab_model/sync_batchnorm/unittest.py +29 -0
  25. doctra/third_party/docres/data/preprocess/crop_merge_image.py +142 -0
  26. doctra/third_party/docres/inference.py +370 -0
  27. doctra/third_party/docres/models/restormer_arch.py +308 -0
  28. doctra/third_party/docres/utils.py +464 -0
  29. doctra/ui/app.py +5 -32
  30. doctra/utils/progress.py +13 -98
  31. doctra/utils/structured_utils.py +45 -49
  32. doctra/version.py +1 -1
  33. {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/METADATA +1 -1
  34. doctra-0.4.0.dist-info/RECORD +67 -0
  35. doctra-0.3.2.dist-info/RECORD +0 -44
  36. {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/WHEEL +0 -0
  37. {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/licenses/LICENSE +0 -0
  38. {doctra-0.3.2.dist-info → doctra-0.4.0.dist-info}/top_level.txt +0 -0
doctra/__init__.py CHANGED
@@ -4,13 +4,17 @@ Parse, extract, and analyze documents with ease
4
4
  """
5
5
 
6
6
  from .parsers.structured_pdf_parser import StructuredPDFParser
7
+ from .parsers.enhanced_pdf_parser import EnhancedPDFParser
7
8
  from .parsers.table_chart_extractor import ChartTablePDFParser
9
+ from .engines.image_restoration import DocResEngine
8
10
  from .version import __version__
9
11
  from .ui import build_demo, launch_ui
10
12
 
11
13
  __all__ = [
12
14
  'StructuredPDFParser',
15
+ 'EnhancedPDFParser',
13
16
  'ChartTablePDFParser',
17
+ 'DocResEngine',
14
18
  'build_demo',
15
19
  'launch_ui',
16
20
  '__version__'
doctra/cli/main.py CHANGED
@@ -15,12 +15,14 @@ from typing import Optional
15
15
  # Import parsers
16
16
  try:
17
17
  from doctra.parsers.structured_pdf_parser_enhancer import StructuredPDFParser
18
+ from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
18
19
  from doctra.parsers.chart_table_pdf_parser import ChartTablePDFParser
19
20
  except ImportError:
20
21
  # Fallback for development/testing
21
22
  project_root = Path(__file__).parent.parent.parent
22
23
  sys.path.insert(0, str(project_root))
23
24
  from doctra.parsers.structured_pdf_parser import StructuredPDFParser
25
+ from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
24
26
  from doctra.parsers.table_chart_extractor import ChartTablePDFParser
25
27
 
26
28
 
@@ -37,6 +39,7 @@ def cli(ctx):
37
39
  \b
38
40
  Commands:
39
41
  parse Full document parsing with text, tables, charts, and figures
42
+ enhance Enhanced parsing with DocRes image restoration
40
43
  extract Extract only charts and/or tables from documents
41
44
  visualize Visualize layout detection results
42
45
  analyze Quick document analysis without processing
@@ -45,6 +48,7 @@ def cli(ctx):
45
48
  \b
46
49
  Examples:
47
50
  doctra parse document.pdf # Full document parsing
51
+ doctra enhance document.pdf # Enhanced parsing with image restoration
48
52
  doctra extract charts document.pdf # Extract only charts
49
53
  doctra extract both document.pdf --use-vlm # Extract charts & tables with VLM
50
54
  doctra visualize document.pdf # Visualize layout detection
@@ -275,6 +279,153 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
275
279
  os.chdir(original_cwd)
276
280
 
277
281
 
282
+ @cli.command()
283
+ @click.argument('pdf_path', type=click.Path(exists=True, path_type=Path))
284
+ @click.option('--output-dir', '-o', type=click.Path(path_type=Path),
285
+ help='Output directory (default: outputs/{pdf_filename}_enhanced)')
286
+ @click.option('--restoration-task', type=click.Choice(['dewarping', 'deshadowing', 'appearance', 'deblurring', 'binarization', 'end2end']),
287
+ default='appearance', help='DocRes restoration task (default: appearance)')
288
+ @click.option('--restoration-device', type=click.Choice(['cuda', 'cpu']),
289
+ help='Device for DocRes processing (default: auto-detect)')
290
+ @click.option('--restoration-dpi', type=int, default=200,
291
+ help='DPI for restoration processing (default: 200)')
292
+ @vlm_options
293
+ @layout_options
294
+ @ocr_options
295
+ @click.option('--box-separator', default='\n',
296
+ help='Separator between text boxes in output (default: newline)')
297
+ @click.option('--verbose', '-v', is_flag=True,
298
+ help='Enable verbose output')
299
+ def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
300
+ restoration_device: Optional[str], restoration_dpi: int,
301
+ use_vlm: bool, vlm_provider: str, vlm_model: Optional[str], vlm_api_key: Optional[str],
302
+ layout_model: str, dpi: int, min_score: float,
303
+ ocr_lang: str, ocr_psm: int, ocr_oem: int, ocr_config: str,
304
+ box_separator: str, verbose: bool):
305
+ """
306
+ Enhanced PDF parsing with DocRes image restoration.
307
+
308
+ Performs document processing with image restoration to improve quality
309
+ before layout detection and content extraction. Particularly useful for
310
+ scanned documents, low-quality PDFs, or documents with shadows/distortion.
311
+
312
+ \b
313
+ Restoration Tasks:
314
+ appearance - General appearance enhancement (default)
315
+ dewarping - Correct document perspective distortion
316
+ deshadowing - Remove shadows from documents
317
+ deblurring - Reduce blur in document images
318
+ binarization - Convert to clean black/white text
319
+ end2end - Complete pipeline: dewarping → deshadowing → appearance
320
+
321
+ \b
322
+ Examples:
323
+ doctra enhance document.pdf
324
+ doctra enhance document.pdf --restoration-task dewarping
325
+ doctra enhance document.pdf --restoration-task end2end --restoration-device cuda
326
+ doctra enhance document.pdf --use-vlm --vlm-api-key your_key
327
+ doctra enhance document.pdf -o ./enhanced_results --restoration-dpi 300
328
+ doctra enhance document.pdf --restoration-task deshadowing # Use different restoration task
329
+
330
+ :param pdf_path: Path to the input PDF file
331
+ :param output_dir: Output directory for results (optional)
332
+ :param restoration_task: DocRes restoration task to perform
333
+ :param restoration_device: Device for DocRes processing
334
+ :param restoration_dpi: DPI for restoration processing
335
+ :param use_vlm: Whether to use VLM for enhanced extraction
336
+ :param vlm_provider: VLM provider ('gemini' or 'openai')
337
+ :param vlm_model: Model name to use (defaults to provider-specific defaults)
338
+ :param vlm_api_key: API key for VLM provider
339
+ :param layout_model: Layout detection model name
340
+ :param dpi: DPI for PDF rendering
341
+ :param min_score: Minimum confidence score for layout detection
342
+ :param ocr_lang: OCR language code
343
+ :param ocr_psm: Tesseract page segmentation mode
344
+ :param ocr_oem: Tesseract OCR engine mode
345
+ :param ocr_config: Additional Tesseract configuration
346
+ :param box_separator: Separator between text boxes in output
347
+ :param verbose: Whether to enable verbose output
348
+ :return: None
349
+ """
350
+ validate_vlm_config(use_vlm, vlm_api_key)
351
+
352
+ if verbose:
353
+ click.echo(f"🔍 Starting enhanced PDF parsing with DocRes...")
354
+ click.echo(f" Input: {pdf_path}")
355
+ click.echo(f" Restoration task: {restoration_task}")
356
+ click.echo(f" Restoration device: {restoration_device or 'auto-detect'}")
357
+ click.echo(f" Restoration DPI: {restoration_dpi}")
358
+ if output_dir:
359
+ click.echo(f" Output: {output_dir}")
360
+
361
+ # Create enhanced parser instance
362
+ try:
363
+ if verbose:
364
+ click.echo(f"🔧 Initializing enhanced parser with DocRes...")
365
+ if use_vlm:
366
+ click.echo(f" VLM Provider: {vlm_provider}")
367
+ click.echo(f" VLM Model: {vlm_model or 'default'}")
368
+ click.echo(f" Layout Model: {layout_model}")
369
+ click.echo(f" DPI: {dpi}")
370
+ click.echo(f" OCR Language: {ocr_lang}")
371
+ else:
372
+ click.echo(f"🔧 Initializing enhanced parser with DocRes...")
373
+ if use_vlm:
374
+ click.echo(f" Using VLM: {vlm_provider}")
375
+
376
+ parser = EnhancedPDFParser(
377
+ use_image_restoration=True,
378
+ restoration_task=restoration_task,
379
+ restoration_device=restoration_device,
380
+ restoration_dpi=restoration_dpi,
381
+ use_vlm=use_vlm,
382
+ vlm_provider=vlm_provider,
383
+ vlm_model=vlm_model,
384
+ vlm_api_key=vlm_api_key,
385
+ layout_model_name=layout_model,
386
+ dpi=dpi,
387
+ min_score=min_score,
388
+ ocr_lang=ocr_lang,
389
+ ocr_psm=ocr_psm,
390
+ ocr_oem=ocr_oem,
391
+ ocr_extra_config=ocr_config,
392
+ box_separator=box_separator
393
+ )
394
+ except Exception as e:
395
+ click.echo(f"❌ Error initializing enhanced parser: {e}", err=True)
396
+ if verbose:
397
+ import traceback
398
+ click.echo(traceback.format_exc(), err=True)
399
+ sys.exit(1)
400
+
401
+ # Change to output directory if specified
402
+ original_cwd = os.getcwd()
403
+ if output_dir:
404
+ output_dir.mkdir(parents=True, exist_ok=True)
405
+ os.chdir(output_dir)
406
+ click.echo(f"📁 Output directory: {output_dir.absolute()}")
407
+
408
+ try:
409
+ # Parse the document with enhancement
410
+ click.echo(f"📄 Processing with enhancement: {pdf_path.name}")
411
+ parser.parse(str(pdf_path.absolute()), str(output_dir) if output_dir else None)
412
+ click.echo("✅ Enhanced document processing completed successfully!")
413
+ click.echo(f"📁 Output directory: {output_dir.absolute() if output_dir else 'outputs/'}")
414
+
415
+ except KeyboardInterrupt:
416
+ click.echo("\n⚠️ Processing interrupted by user", err=True)
417
+ sys.exit(130)
418
+ except Exception as e:
419
+ click.echo(f"❌ Error during enhanced parsing: {e}", err=True)
420
+ if verbose:
421
+ import traceback
422
+ click.echo(traceback.format_exc(), err=True)
423
+ sys.exit(1)
424
+ finally:
425
+ # Restore original working directory
426
+ os.chdir(original_cwd)
427
+
428
+
278
429
  @cli.group(invoke_without_command=True)
279
430
  @click.pass_context
280
431
  def extract(ctx):
@@ -782,6 +933,9 @@ def info():
782
933
  ('pytesseract', 'pytesseract', 'OCR engine'),
783
934
  ('tqdm', 'tqdm', 'Progress bars'),
784
935
  ('click', 'click', 'CLI framework'),
936
+ ('skimage', 'scikit-image', 'DocRes image restoration'),
937
+ ('torch', 'torch', 'DocRes neural networks'),
938
+ ('huggingface_hub', 'huggingface_hub', 'Hugging Face model downloads'),
785
939
  ]
786
940
 
787
941
  click.echo("\nCore Dependencies:")
@@ -811,6 +965,7 @@ def info():
811
965
  # Available commands
812
966
  click.echo("\nAvailable Commands:")
813
967
  click.echo(" 📄 parse - Full document processing (text, tables, charts, figures)")
968
+ click.echo(" ✨ enhance - Enhanced parsing with DocRes image restoration")
814
969
  click.echo(" 📊 extract - Chart/table extraction only")
815
970
  click.echo(" ├─ charts - Extract only charts")
816
971
  click.echo(" ├─ tables - Extract only tables")
@@ -845,9 +1000,22 @@ def info():
845
1000
  else:
846
1001
  click.echo(" VLM_API_KEY: (not set)")
847
1002
 
1003
+ # DocRes information
1004
+ click.echo("\nDocRes Image Restoration:")
1005
+ try:
1006
+ from doctra.engines.image_restoration import DocResEngine
1007
+ docres = DocResEngine()
1008
+ click.echo(f" ✅ DocRes available - {len(docres.get_supported_tasks())} restoration tasks")
1009
+ click.echo(" Tasks: dewarping, deshadowing, appearance, deblurring, binarization, end2end")
1010
+ click.echo(" 📥 Models: Downloaded from Hugging Face Hub")
1011
+ except Exception as e:
1012
+ click.echo(f" ⚠️ DocRes not available - {str(e)[:50]}...")
1013
+ click.echo(" Install with: pip install scikit-image torch huggingface_hub")
1014
+
848
1015
  # Usage examples
849
1016
  click.echo("\n💡 Quick Start Examples:")
850
1017
  click.echo(" doctra parse document.pdf # Full document parsing")
1018
+ click.echo(" doctra enhance document.pdf # Enhanced parsing with DocRes")
851
1019
  click.echo(" doctra extract both document.pdf --use-vlm # Charts & tables with VLM")
852
1020
  click.echo(" doctra extract charts document.pdf # Only charts")
853
1021
  click.echo(" doctra extract tables document.pdf # Only tables")
doctra/engines/image_restoration/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """
2
+ Image Restoration Engines
3
+
4
+ This module provides image restoration capabilities for document processing.
5
+ Currently supports DocRes for various document image restoration tasks.
6
+ """
7
+
8
+ from .docres_engine import DocResEngine
9
+
10
+ __all__ = ['DocResEngine']