doctra 0.1.1__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {doctra-0.1.1/doctra.egg-info → doctra-0.3.0}/PKG-INFO +45 -6
- {doctra-0.1.1 → doctra-0.3.0}/README.md +407 -370
- {doctra-0.1.1 → doctra-0.3.0}/doctra/__init__.py +21 -18
- {doctra-0.1.1 → doctra-0.3.0}/doctra/cli/main.py +5 -2
- {doctra-0.1.1 → doctra-0.3.0}/doctra/cli/utils.py +12 -3
- {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/layout/paddle_layout.py +13 -78
- {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/vlm/provider.py +86 -58
- {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/vlm/service.py +10 -14
- doctra-0.3.0/doctra/exporters/html_writer.py +1235 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/parsers/structured_pdf_parser.py +35 -15
- {doctra-0.1.1 → doctra-0.3.0}/doctra/parsers/table_chart_extractor.py +66 -28
- doctra-0.3.0/doctra/ui/__init__.py +5 -0
- doctra-0.3.0/doctra/ui/app.py +1012 -0
- doctra-0.3.0/doctra/utils/progress.py +428 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/structured_utils.py +49 -49
- {doctra-0.1.1 → doctra-0.3.0}/doctra/version.py +1 -1
- {doctra-0.1.1 → doctra-0.3.0/doctra.egg-info}/PKG-INFO +45 -6
- {doctra-0.1.1 → doctra-0.3.0}/doctra.egg-info/SOURCES.txt +4 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra.egg-info/requires.txt +3 -0
- {doctra-0.1.1 → doctra-0.3.0}/pyproject.toml +1 -0
- doctra-0.3.0/requirements.txt +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/setup.py +1 -0
- doctra-0.1.1/requirements.txt +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/LICENSE +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/MANIFEST.in +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/cli/__init__.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/__init__.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/layout/__init__.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/layout/layout_models.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/ocr/__init__.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/ocr/api.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/ocr/path_resolver.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/ocr/pytesseract_engine.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/vlm/__init__.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/vlm/outlines_types.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/exporters/__init__.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/exporters/excel_writer.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/exporters/image_saver.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/exporters/markdown_table.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/exporters/markdown_writer.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/parsers/__init__.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/parsers/layout_order.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/__init__.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/bbox.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/constants.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/file_ops.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/io_utils.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/ocr_utils.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/pdf_io.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/quiet.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra.egg-info/dependency_links.txt +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra.egg-info/not-zip-safe +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/doctra.egg-info/top_level.txt +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/setup.cfg +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/tests/test_structured_pdf_parser.py +0 -0
- {doctra-0.1.1 → doctra-0.3.0}/tests/test_table_chart_extractor.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: doctra
|
3
|
-
Version: 0.1.1
|
3
|
+
Version: 0.3.0
|
4
4
|
Summary: Parse, extract, and analyze documents with ease
|
5
5
|
Home-page: https://github.com/AdemBoukhris457/Doctra
|
6
6
|
Author: Adem Boukhris
|
@@ -241,6 +241,8 @@ Provides-Extra: openai
|
|
241
241
|
Requires-Dist: openai>=1.0.0; extra == "openai"
|
242
242
|
Provides-Extra: gemini
|
243
243
|
Requires-Dist: google-generativeai>=0.3.0; extra == "gemini"
|
244
|
+
Provides-Extra: anthropic
|
245
|
+
Requires-Dist: anthropic>=0.40.0; extra == "anthropic"
|
244
246
|
Provides-Extra: dev
|
245
247
|
Requires-Dist: pytest>=6.0; extra == "dev"
|
246
248
|
Requires-Dist: pytest-cov>=2.0; extra == "dev"
|
@@ -329,7 +331,7 @@ parser = StructuredPDFParser()
|
|
329
331
|
# Parser with VLM for structured data extraction
|
330
332
|
parser = StructuredPDFParser(
|
331
333
|
use_vlm=True,
|
332
|
-
vlm_provider="openai", # or "gemini"
|
334
|
+
vlm_provider="openai", # or "gemini" or "anthropic" or "openrouter"
|
333
335
|
vlm_api_key="your_api_key_here"
|
334
336
|
)
|
335
337
|
|
@@ -344,7 +346,7 @@ parser = StructuredPDFParser(
|
|
344
346
|
# VLM Settings
|
345
347
|
use_vlm=True,
|
346
348
|
vlm_provider="openai",
|
347
|
-
vlm_model="gpt-
|
349
|
+
vlm_model="gpt-5",
|
348
350
|
vlm_api_key="your_api_key",
|
349
351
|
|
350
352
|
# Layout Detection Settings
|
@@ -406,7 +408,7 @@ parser = ChartTablePDFParser(
|
|
406
408
|
# VLM Settings
|
407
409
|
use_vlm=True,
|
408
410
|
vlm_provider="openai",
|
409
|
-
vlm_model="gpt-
|
411
|
+
vlm_model="gpt-5",
|
410
412
|
vlm_api_key="your_api_key",
|
411
413
|
|
412
414
|
# Layout Detection Settings
|
@@ -545,7 +547,7 @@ parser = StructuredPDFParser(
|
|
545
547
|
use_vlm=True,
|
546
548
|
vlm_provider="openai",
|
547
549
|
vlm_api_key="your_openai_api_key",
|
548
|
-
vlm__model="gpt-
|
550
|
+
vlm_model="gpt-5",
|
549
551
|
layout_model_name="PP-DocLayout_plus-L",
|
550
552
|
dpi=300, # Higher DPI for better quality
|
551
553
|
min_score=0.5, # Higher confidence threshold
|
@@ -623,4 +625,41 @@ parser.display_pages_with_boxes("document.pdf")
|
|
623
625
|
- **Pandas**: Data manipulation
|
624
626
|
- **OpenPyXL**: Excel file generation
|
625
627
|
- **Google Generative AI**: For Gemini VLM integration
|
626
|
-
- **OpenAI**: For GPT-
|
628
|
+
- **OpenAI**: For GPT-5 VLM integration
|
629
|
+
|
630
|
+
## 🖥️ Web Interface (Gradio)
|
631
|
+
|
632
|
+
You can try Doctra in a simple web UI powered by Gradio.
|
633
|
+
|
634
|
+
### Run locally
|
635
|
+
|
636
|
+
```bash
|
637
|
+
pip install -U gradio
|
638
|
+
python gradio_app.py
|
639
|
+
```
|
640
|
+
|
641
|
+
Then open the printed URL (default `http://127.0.0.1:7860`).
|
642
|
+
|
643
|
+
Notes:
|
644
|
+
- If using VLM, set the API key field in the UI or export `VLM_API_KEY`.
|
645
|
+
- Outputs are saved under `outputs/<pdf_stem>/` and previewed in the UI.
|
646
|
+
|
647
|
+
### Deploy on Hugging Face Spaces
|
648
|
+
|
649
|
+
1) Create a new Space (SDK: Gradio).
|
650
|
+
|
651
|
+
2) Add these files to the Space repo:
|
652
|
+
- Your package code (or install from PyPI).
|
653
|
+
- `gradio_app.py` (entry point).
|
654
|
+
- `requirements.txt` with at least:
|
655
|
+
|
656
|
+
```text
|
657
|
+
doctra
|
658
|
+
gradio
|
659
|
+
```
|
660
|
+
|
661
|
+
3) Set a secret named `VLM_API_KEY` if you want VLM features.
|
662
|
+
|
663
|
+
4) In Space settings, set `python gradio_app.py` as the run command (or rely on auto-detect).
|
664
|
+
|
665
|
+
The Space will build and expose the same interface for uploads and processing.
|