doctra 0.2.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {doctra-0.2.0/doctra.egg-info → doctra-0.3.1}/PKG-INFO +66 -3
  2. {doctra-0.2.0 → doctra-0.3.1}/README.md +432 -370
  3. {doctra-0.2.0 → doctra-0.3.1}/doctra/__init__.py +21 -18
  4. {doctra-0.2.0 → doctra-0.3.1}/doctra/cli/main.py +3 -0
  5. {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/layout/paddle_layout.py +11 -77
  6. {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/vlm/provider.py +85 -85
  7. {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/vlm/service.py +6 -13
  8. doctra-0.3.1/doctra/exporters/html_writer.py +1235 -0
  9. {doctra-0.2.0 → doctra-0.3.1}/doctra/parsers/structured_pdf_parser.py +12 -7
  10. {doctra-0.2.0 → doctra-0.3.1}/doctra/parsers/table_chart_extractor.py +47 -22
  11. doctra-0.3.1/doctra/ui/__init__.py +5 -0
  12. doctra-0.3.1/doctra/ui/app.py +1012 -0
  13. doctra-0.3.1/doctra/utils/progress.py +428 -0
  14. {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/structured_utils.py +49 -49
  15. {doctra-0.2.0 → doctra-0.3.1}/doctra/version.py +1 -1
  16. {doctra-0.2.0 → doctra-0.3.1/doctra.egg-info}/PKG-INFO +66 -3
  17. {doctra-0.2.0 → doctra-0.3.1}/doctra.egg-info/SOURCES.txt +3 -0
  18. {doctra-0.2.0 → doctra-0.3.1}/doctra.egg-info/requires.txt +3 -3
  19. {doctra-0.2.0 → doctra-0.3.1}/pyproject.toml +82 -80
  20. doctra-0.3.1/requirements.txt +0 -0
  21. {doctra-0.2.0 → doctra-0.3.1}/setup.py +67 -65
  22. doctra-0.2.0/doctra/utils/progress.py +0 -277
  23. doctra-0.2.0/requirements.txt +0 -0
  24. {doctra-0.2.0 → doctra-0.3.1}/LICENSE +0 -0
  25. {doctra-0.2.0 → doctra-0.3.1}/MANIFEST.in +0 -0
  26. {doctra-0.2.0 → doctra-0.3.1}/doctra/cli/__init__.py +0 -0
  27. {doctra-0.2.0 → doctra-0.3.1}/doctra/cli/utils.py +0 -0
  28. {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/__init__.py +0 -0
  29. {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/layout/__init__.py +0 -0
  30. {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/layout/layout_models.py +0 -0
  31. {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/ocr/__init__.py +0 -0
  32. {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/ocr/api.py +0 -0
  33. {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/ocr/path_resolver.py +0 -0
  34. {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/ocr/pytesseract_engine.py +0 -0
  35. {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/vlm/__init__.py +0 -0
  36. {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/vlm/outlines_types.py +0 -0
  37. {doctra-0.2.0 → doctra-0.3.1}/doctra/exporters/__init__.py +0 -0
  38. {doctra-0.2.0 → doctra-0.3.1}/doctra/exporters/excel_writer.py +0 -0
  39. {doctra-0.2.0 → doctra-0.3.1}/doctra/exporters/image_saver.py +0 -0
  40. {doctra-0.2.0 → doctra-0.3.1}/doctra/exporters/markdown_table.py +0 -0
  41. {doctra-0.2.0 → doctra-0.3.1}/doctra/exporters/markdown_writer.py +0 -0
  42. {doctra-0.2.0 → doctra-0.3.1}/doctra/parsers/__init__.py +0 -0
  43. {doctra-0.2.0 → doctra-0.3.1}/doctra/parsers/layout_order.py +0 -0
  44. {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/__init__.py +0 -0
  45. {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/bbox.py +0 -0
  46. {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/constants.py +0 -0
  47. {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/file_ops.py +0 -0
  48. {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/io_utils.py +0 -0
  49. {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/ocr_utils.py +0 -0
  50. {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/pdf_io.py +0 -0
  51. {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/quiet.py +0 -0
  52. {doctra-0.2.0 → doctra-0.3.1}/doctra.egg-info/dependency_links.txt +0 -0
  53. {doctra-0.2.0 → doctra-0.3.1}/doctra.egg-info/not-zip-safe +0 -0
  54. {doctra-0.2.0 → doctra-0.3.1}/doctra.egg-info/top_level.txt +0 -0
  55. {doctra-0.2.0 → doctra-0.3.1}/setup.cfg +0 -0
  56. {doctra-0.2.0 → doctra-0.3.1}/tests/test_structured_pdf_parser.py +0 -0
  57. {doctra-0.2.0 → doctra-0.3.1}/tests/test_table_chart_extractor.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: doctra
3
- Version: 0.2.0
3
+ Version: 0.3.1
4
4
  Summary: Parse, extract, and analyze documents with ease
5
5
  Home-page: https://github.com/AdemBoukhris457/Doctra
6
6
  Author: Adem Boukhris
@@ -234,6 +234,9 @@ Requires-Dist: opencv-python>=4.5.0
234
234
  Requires-Dist: pandas>=1.3.0
235
235
  Requires-Dist: openpyxl>=3.0.0
236
236
  Requires-Dist: tesseract>=0.1.3
237
+ Requires-Dist: pytesseract>=0.3.10
238
+ Requires-Dist: pdf2image>=1.16.0
239
+ Requires-Dist: anthropic>=0.40.0
237
240
  Requires-Dist: outlines>=0.0.34
238
241
  Requires-Dist: tqdm>=4.62.0
239
242
  Requires-Dist: matplotlib>=3.5.0
@@ -241,8 +244,6 @@ Provides-Extra: openai
241
244
  Requires-Dist: openai>=1.0.0; extra == "openai"
242
245
  Provides-Extra: gemini
243
246
  Requires-Dist: google-generativeai>=0.3.0; extra == "gemini"
244
- Provides-Extra: anthropic
245
- Requires-Dist: anthropic>=0.40.0; extra == "anthropic"
246
247
  Provides-Extra: dev
247
248
  Requires-Dist: pytest>=6.0; extra == "dev"
248
249
  Requires-Dist: pytest-cov>=2.0; extra == "dev"
@@ -295,6 +296,31 @@ cd Doctra
295
296
  pip install .
296
297
  ```
297
298
 
299
+ ### System Dependencies
300
+
301
+ Doctra requires **Poppler** for PDF processing. Install it based on your operating system:
302
+
303
+ #### Ubuntu/Debian
304
+ ```bash
305
+ sudo apt install poppler-utils
306
+ ```
307
+
308
+ #### macOS
309
+ ```bash
310
+ brew install poppler
311
+ ```
312
+
313
+ #### Windows
314
+ Download and install from [Poppler for Windows](http://blog.alivate.com.au/poppler-windows/) or use conda:
315
+ ```bash
316
+ conda install -c conda-forge poppler
317
+ ```
318
+
319
+ #### Google Colab
320
+ ```bash
321
+ !sudo apt install poppler-utils
322
+ ```
323
+
298
324
  ## ⚡ Quick Start
299
325
 
300
326
  ```python
@@ -626,3 +652,40 @@ parser.display_pages_with_boxes("document.pdf")
626
652
  - **OpenPyXL**: Excel file generation
627
653
  - **Google Generative AI**: For Gemini VLM integration
628
654
  - **OpenAI**: For GPT-5 VLM integration
655
+
656
+ ## 🖥️ Web Interface (Gradio)
657
+
658
+ You can try Doctra in a simple web UI powered by Gradio.
659
+
660
+ ### Run locally
661
+
662
+ ```bash
663
+ pip install -U gradio
664
+ python gradio_app.py
665
+ ```
666
+
667
+ Then open the printed URL (default `http://127.0.0.1:7860`).
668
+
669
+ Notes:
670
+ - If using VLM, set the API key field in the UI or export `VLM_API_KEY`.
671
+ - Outputs are saved under `outputs/<pdf_stem>/` and previewed in the UI.
672
+
673
+ ### Deploy on Hugging Face Spaces
674
+
675
+ 1) Create a new Space and select Gradio as the SDK.
676
+
677
+ 2) Add these files to the Space repo:
678
+ - Your package code (or install from PyPI).
679
+ - `gradio_app.py` (entry point).
680
+ - `requirements.txt` with at least:
681
+
682
+ ```text
683
+ doctra
684
+ gradio
685
+ ```
686
+
687
+ 3) Set a secret named `VLM_API_KEY` if you want VLM features.
688
+
689
+ 4) In Space settings, set `python gradio_app.py` as the run command (or rely on auto-detect).
690
+
691
+ The Space will build and expose the same interface for uploads and processing.