doctra 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {doctra-0.1.0/doctra.egg-info → doctra-0.2.0}/PKG-INFO +12 -10
- {doctra-0.1.0 → doctra-0.2.0}/README.md +370 -370
- {doctra-0.1.0 → doctra-0.2.0}/doctra/cli/main.py +2 -2
- {doctra-0.1.0 → doctra-0.2.0}/doctra/cli/utils.py +12 -3
- {doctra-0.1.0 → doctra-0.2.0}/doctra/engines/layout/paddle_layout.py +3 -2
- {doctra-0.1.0 → doctra-0.2.0}/doctra/engines/vlm/provider.py +34 -6
- {doctra-0.1.0 → doctra-0.2.0}/doctra/engines/vlm/service.py +5 -2
- {doctra-0.1.0 → doctra-0.2.0}/doctra/parsers/structured_pdf_parser.py +23 -8
- {doctra-0.1.0 → doctra-0.2.0}/doctra/parsers/table_chart_extractor.py +19 -6
- doctra-0.2.0/doctra/utils/progress.py +277 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/version.py +1 -1
- {doctra-0.1.0 → doctra-0.2.0/doctra.egg-info}/PKG-INFO +12 -10
- {doctra-0.1.0 → doctra-0.2.0}/doctra.egg-info/SOURCES.txt +1 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra.egg-info/requires.txt +3 -0
- {doctra-0.1.0 → doctra-0.2.0}/pyproject.toml +1 -0
- {doctra-0.1.0 → doctra-0.2.0}/requirements.txt +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/setup.py +1 -0
- {doctra-0.1.0 → doctra-0.2.0}/LICENSE +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/MANIFEST.in +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/__init__.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/cli/__init__.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/engines/__init__.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/engines/layout/__init__.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/engines/layout/layout_models.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/engines/ocr/__init__.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/engines/ocr/api.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/engines/ocr/path_resolver.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/engines/ocr/pytesseract_engine.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/engines/vlm/__init__.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/engines/vlm/outlines_types.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/exporters/__init__.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/exporters/excel_writer.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/exporters/image_saver.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/exporters/markdown_table.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/exporters/markdown_writer.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/parsers/__init__.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/parsers/layout_order.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/utils/__init__.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/utils/bbox.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/utils/constants.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/utils/file_ops.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/utils/io_utils.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/utils/ocr_utils.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/utils/pdf_io.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/utils/quiet.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra/utils/structured_utils.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra.egg-info/dependency_links.txt +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra.egg-info/not-zip-safe +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/doctra.egg-info/top_level.txt +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/setup.cfg +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/tests/test_structured_pdf_parser.py +0 -0
- {doctra-0.1.0 → doctra-0.2.0}/tests/test_table_chart_extractor.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: doctra
|
3
|
-
Version: 0.1.0
|
3
|
+
Version: 0.2.0
|
4
4
|
Summary: Parse, extract, and analyze documents with ease
|
5
5
|
Home-page: https://github.com/AdemBoukhris457/Doctra
|
6
6
|
Author: Adem Boukhris
|
@@ -241,6 +241,8 @@ Provides-Extra: openai
|
|
241
241
|
Requires-Dist: openai>=1.0.0; extra == "openai"
|
242
242
|
Provides-Extra: gemini
|
243
243
|
Requires-Dist: google-generativeai>=0.3.0; extra == "gemini"
|
244
|
+
Provides-Extra: anthropic
|
245
|
+
Requires-Dist: anthropic>=0.40.0; extra == "anthropic"
|
244
246
|
Provides-Extra: dev
|
245
247
|
Requires-Dist: pytest>=6.0; extra == "dev"
|
246
248
|
Requires-Dist: pytest-cov>=2.0; extra == "dev"
|
@@ -256,13 +258,13 @@ Dynamic: requires-python
|
|
256
258
|
|
257
259
|
# 🚀 **Doctra - Document Parser Library** 📑🔎
|
258
260
|
|
259
|
-

|
260
262
|
|
261
263
|
<div align="center">
|
262
264
|
|
263
|
-
[](https://github.com/AdemBoukhris457/Doctra)
|
266
|
+
[](https://github.com/AdemBoukhris457/Doctra)
|
267
|
+
[](https://pypi.org/project/doctra/)
|
266
268
|
</div>
|
267
269
|
|
268
270
|
## 📋 Table of Contents
|
@@ -329,7 +331,7 @@ parser = StructuredPDFParser()
|
|
329
331
|
# Parser with VLM for structured data extraction
|
330
332
|
parser = StructuredPDFParser(
|
331
333
|
use_vlm=True,
|
332
|
-
vlm_provider="openai", # or "gemini"
|
334
|
+
vlm_provider="openai",  # or "gemini", "anthropic", or "openrouter"
|
333
335
|
vlm_api_key="your_api_key_here"
|
334
336
|
)
|
335
337
|
|
@@ -344,7 +346,7 @@ parser = StructuredPDFParser(
|
|
344
346
|
# VLM Settings
|
345
347
|
use_vlm=True,
|
346
348
|
vlm_provider="openai",
|
347
|
-
vlm_model="gpt-
|
349
|
+
vlm_model="gpt-5",
|
348
350
|
vlm_api_key="your_api_key",
|
349
351
|
|
350
352
|
# Layout Detection Settings
|
@@ -406,7 +408,7 @@ parser = ChartTablePDFParser(
|
|
406
408
|
# VLM Settings
|
407
409
|
use_vlm=True,
|
408
410
|
vlm_provider="openai",
|
409
|
-
vlm_model="gpt-
|
411
|
+
vlm_model="gpt-5",
|
410
412
|
vlm_api_key="your_api_key",
|
411
413
|
|
412
414
|
# Layout Detection Settings
|
@@ -545,7 +547,7 @@ parser = StructuredPDFParser(
|
|
545
547
|
use_vlm=True,
|
546
548
|
vlm_provider="openai",
|
547
549
|
vlm_api_key="your_openai_api_key",
|
548
|
-
vlm__model="gpt-
|
550
|
+
vlm_model="gpt-5",
|
549
551
|
layout_model_name="PP-DocLayout_plus-L",
|
550
552
|
dpi=300, # Higher DPI for better quality
|
551
553
|
min_score=0.5, # Higher confidence threshold
|
@@ -623,4 +625,4 @@ parser.display_pages_with_boxes("document.pdf")
|
|
623
625
|
- **Pandas**: Data manipulation
|
624
626
|
- **OpenPyXL**: Excel file generation
|
625
627
|
- **Google Generative AI**: For Gemini VLM integration
|
626
|
-
- **OpenAI**: For GPT-
|
628
|
+
- **OpenAI**: For GPT-5 VLM integration
|