doctra-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. doctra/__init__.py +19 -0
  2. doctra/cli/__init__.py +27 -0
  3. doctra/cli/main.py +856 -0
  4. doctra/cli/utils.py +340 -0
  5. doctra/engines/__init__.py +0 -0
  6. doctra/engines/layout/__init__.py +0 -0
  7. doctra/engines/layout/layout_models.py +90 -0
  8. doctra/engines/layout/paddle_layout.py +225 -0
  9. doctra/engines/ocr/__init__.py +4 -0
  10. doctra/engines/ocr/api.py +36 -0
  11. doctra/engines/ocr/path_resolver.py +48 -0
  12. doctra/engines/ocr/pytesseract_engine.py +76 -0
  13. doctra/engines/vlm/__init__.py +0 -0
  14. doctra/engines/vlm/outlines_types.py +31 -0
  15. doctra/engines/vlm/provider.py +58 -0
  16. doctra/engines/vlm/service.py +117 -0
  17. doctra/exporters/__init__.py +0 -0
  18. doctra/exporters/excel_writer.py +197 -0
  19. doctra/exporters/image_saver.py +42 -0
  20. doctra/exporters/markdown_table.py +56 -0
  21. doctra/exporters/markdown_writer.py +29 -0
  22. doctra/parsers/__init__.py +6 -0
  23. doctra/parsers/layout_order.py +16 -0
  24. doctra/parsers/structured_pdf_parser.py +434 -0
  25. doctra/parsers/table_chart_extractor.py +283 -0
  26. doctra/utils/__init__.py +0 -0
  27. doctra/utils/bbox.py +18 -0
  28. doctra/utils/constants.py +8 -0
  29. doctra/utils/file_ops.py +26 -0
  30. doctra/utils/io_utils.py +10 -0
  31. doctra/utils/ocr_utils.py +20 -0
  32. doctra/utils/pdf_io.py +19 -0
  33. doctra/utils/quiet.py +13 -0
  34. doctra/utils/structured_utils.py +49 -0
  35. doctra/version.py +2 -0
  36. doctra-0.1.0.dist-info/METADATA +626 -0
  37. doctra-0.1.0.dist-info/RECORD +40 -0
  38. doctra-0.1.0.dist-info/WHEEL +5 -0
  39. doctra-0.1.0.dist-info/licenses/LICENSE +201 -0
  40. doctra-0.1.0.dist-info/top_level.txt +1 -0
doctra/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ """
2
+ Doctra - Document Parsing Library
3
+ Parse, extract, and analyze documents with ease
4
+ """
5
+
6
+ from .parsers.structured_pdf_parser import StructuredPDFParser
7
+ from .parsers.table_chart_extractor import ChartTablePDFParser
8
+ from .version import __version__
9
+
10
+ __all__ = [ # Names re-exported as the package's public API (all imported above)
11
+ 'StructuredPDFParser',
12
+ 'ChartTablePDFParser',
13
+ '__version__'
14
+ ]
15
+
16
+ # Package metadata: author name, author contact email, and short description
17
+ __author__ = 'Adem Boukhris'
18
+ __email__ = 'boukhrisadam98@gmail.com' # Author contact email
19
+ __description__ = 'Parse, extract, and analyze documents with ease'
doctra/cli/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ """
2
+ Doctra CLI module
3
+
4
+ This module provides command-line interface functionality for the Doctra library.
5
+ It exposes the main CLI entry point and related utilities for document processing,
6
+ chart/table extraction, layout visualization, and document analysis.
7
+ """
8
+
9
+ from .main import cli
10
+
11
+ __all__ = ['cli']
12
+ __version__ = '1.0.0' # NOTE(review): the wheel and dist-info are named 0.1.0 - confirm the CLI is meant to carry its own version
13
+
14
+ # Maps each top-level CLI command name to the one-line description used in help documentation
15
+ COMMANDS = {
16
+ 'parse': 'Full document processing with text, tables, charts, and figures',
17
+ 'extract': 'Extract charts and/or tables from PDF documents',
18
+ 'visualize': 'Visualize layout detection results',
19
+ 'analyze': 'Quick document analysis without processing',
20
+ 'info': 'Show system information and dependencies'
21
+ }
22
+
23
+ EXTRACT_SUBCOMMANDS = { # One-line descriptions keyed by name; presumably the modes of the 'extract' command - verify against main.py
24
+ 'charts': 'Extract only charts from the document',
25
+ 'tables': 'Extract only tables from the document',
26
+ 'both': 'Extract both charts and tables'
27
+ }