doctra 0.2.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {doctra-0.2.0/doctra.egg-info → doctra-0.3.1}/PKG-INFO +66 -3
- {doctra-0.2.0 → doctra-0.3.1}/README.md +432 -370
- {doctra-0.2.0 → doctra-0.3.1}/doctra/__init__.py +21 -18
- {doctra-0.2.0 → doctra-0.3.1}/doctra/cli/main.py +3 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/layout/paddle_layout.py +11 -77
- {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/vlm/provider.py +85 -85
- {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/vlm/service.py +6 -13
- doctra-0.3.1/doctra/exporters/html_writer.py +1235 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/parsers/structured_pdf_parser.py +12 -7
- {doctra-0.2.0 → doctra-0.3.1}/doctra/parsers/table_chart_extractor.py +47 -22
- doctra-0.3.1/doctra/ui/__init__.py +5 -0
- doctra-0.3.1/doctra/ui/app.py +1012 -0
- doctra-0.3.1/doctra/utils/progress.py +428 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/structured_utils.py +49 -49
- {doctra-0.2.0 → doctra-0.3.1}/doctra/version.py +1 -1
- {doctra-0.2.0 → doctra-0.3.1/doctra.egg-info}/PKG-INFO +66 -3
- {doctra-0.2.0 → doctra-0.3.1}/doctra.egg-info/SOURCES.txt +3 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra.egg-info/requires.txt +3 -3
- {doctra-0.2.0 → doctra-0.3.1}/pyproject.toml +82 -80
- doctra-0.3.1/requirements.txt +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/setup.py +67 -65
- doctra-0.2.0/doctra/utils/progress.py +0 -277
- doctra-0.2.0/requirements.txt +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/LICENSE +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/MANIFEST.in +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/cli/__init__.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/cli/utils.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/__init__.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/layout/__init__.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/layout/layout_models.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/ocr/__init__.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/ocr/api.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/ocr/path_resolver.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/ocr/pytesseract_engine.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/vlm/__init__.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/engines/vlm/outlines_types.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/exporters/__init__.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/exporters/excel_writer.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/exporters/image_saver.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/exporters/markdown_table.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/exporters/markdown_writer.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/parsers/__init__.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/parsers/layout_order.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/__init__.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/bbox.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/constants.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/file_ops.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/io_utils.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/ocr_utils.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/pdf_io.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra/utils/quiet.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra.egg-info/dependency_links.txt +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra.egg-info/not-zip-safe +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/doctra.egg-info/top_level.txt +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/setup.cfg +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/tests/test_structured_pdf_parser.py +0 -0
- {doctra-0.2.0 → doctra-0.3.1}/tests/test_table_chart_extractor.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: doctra
|
3
|
-
Version: 0.2.0
|
3
|
+
Version: 0.3.1
|
4
4
|
Summary: Parse, extract, and analyze documents with ease
|
5
5
|
Home-page: https://github.com/AdemBoukhris457/Doctra
|
6
6
|
Author: Adem Boukhris
|
@@ -234,6 +234,9 @@ Requires-Dist: opencv-python>=4.5.0
|
|
234
234
|
Requires-Dist: pandas>=1.3.0
|
235
235
|
Requires-Dist: openpyxl>=3.0.0
|
236
236
|
Requires-Dist: tesseract>=0.1.3
|
237
|
+
Requires-Dist: pytesseract>=0.3.10
|
238
|
+
Requires-Dist: pdf2image>=1.16.0
|
239
|
+
Requires-Dist: anthropic>=0.40.0
|
237
240
|
Requires-Dist: outlines>=0.0.34
|
238
241
|
Requires-Dist: tqdm>=4.62.0
|
239
242
|
Requires-Dist: matplotlib>=3.5.0
|
@@ -241,8 +244,6 @@ Provides-Extra: openai
|
|
241
244
|
Requires-Dist: openai>=1.0.0; extra == "openai"
|
242
245
|
Provides-Extra: gemini
|
243
246
|
Requires-Dist: google-generativeai>=0.3.0; extra == "gemini"
|
244
|
-
Provides-Extra: anthropic
|
245
|
-
Requires-Dist: anthropic>=0.40.0; extra == "anthropic"
|
246
247
|
Provides-Extra: dev
|
247
248
|
Requires-Dist: pytest>=6.0; extra == "dev"
|
248
249
|
Requires-Dist: pytest-cov>=2.0; extra == "dev"
|
@@ -295,6 +296,31 @@ cd Doctra
|
|
295
296
|
pip install .
|
296
297
|
```
|
297
298
|
|
299
|
+
### System Dependencies
|
300
|
+
|
301
|
+
Doctra requires **Poppler** for PDF processing. Install it based on your operating system:
|
302
|
+
|
303
|
+
#### Ubuntu/Debian
|
304
|
+
```bash
|
305
|
+
sudo apt install poppler-utils
|
306
|
+
```
|
307
|
+
|
308
|
+
#### macOS
|
309
|
+
```bash
|
310
|
+
brew install poppler
|
311
|
+
```
|
312
|
+
|
313
|
+
#### Windows
|
314
|
+
Download and install from [Poppler for Windows](http://blog.alivate.com.au/poppler-windows/) or use conda:
|
315
|
+
```bash
|
316
|
+
conda install -c conda-forge poppler
|
317
|
+
```
|
318
|
+
|
319
|
+
#### Google Colab
|
320
|
+
```bash
|
321
|
+
!sudo apt install poppler-utils
|
322
|
+
```
|
323
|
+
|
298
324
|
## ⚡ Quick Start
|
299
325
|
|
300
326
|
```python
|
@@ -626,3 +652,40 @@ parser.display_pages_with_boxes("document.pdf")
|
|
626
652
|
- **OpenPyXL**: Excel file generation
|
627
653
|
- **Google Generative AI**: For Gemini VLM integration
|
628
654
|
- **OpenAI**: For GPT-5 VLM integration
|
655
|
+
|
656
|
+
## 🖥️ Web Interface (Gradio)
|
657
|
+
|
658
|
+
You can try Doctra in a simple web UI powered by Gradio.
|
659
|
+
|
660
|
+
### Run locally
|
661
|
+
|
662
|
+
```bash
|
663
|
+
pip install -U gradio
|
664
|
+
python gradio_app.py
|
665
|
+
```
|
666
|
+
|
667
|
+
Then open the printed URL (default `http://127.0.0.1:7860`).
|
668
|
+
|
669
|
+
Notes:
|
670
|
+
- If using VLM, set the API key field in the UI or export `VLM_API_KEY`.
|
671
|
+
- Outputs are saved under `outputs/<pdf_stem>/` and previewed in the UI.
|
672
|
+
|
673
|
+
### Deploy on Hugging Face Spaces
|
674
|
+
|
675
|
+
1) Create a new Space (type: Gradio, SDK: Python).
|
676
|
+
|
677
|
+
2) Add these files to the Space repo:
|
678
|
+
- Your package code (or install from PyPI).
|
679
|
+
- `gradio_app.py` (entry point).
|
680
|
+
- `requirements.txt` with at least:
|
681
|
+
|
682
|
+
```text
|
683
|
+
doctra
|
684
|
+
gradio
|
685
|
+
```
|
686
|
+
|
687
|
+
3) Set a secret named `VLM_API_KEY` if you want VLM features.
|
688
|
+
|
689
|
+
4) In Space settings, set `python gradio_app.py` as the run command (or rely on auto-detect).
|
690
|
+
|
691
|
+
The Space will build and expose the same interface for uploads and processing.
|