doctra 0.1.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {doctra-0.1.1/doctra.egg-info → doctra-0.3.0}/PKG-INFO +45 -6
  2. {doctra-0.1.1 → doctra-0.3.0}/README.md +407 -370
  3. {doctra-0.1.1 → doctra-0.3.0}/doctra/__init__.py +21 -18
  4. {doctra-0.1.1 → doctra-0.3.0}/doctra/cli/main.py +5 -2
  5. {doctra-0.1.1 → doctra-0.3.0}/doctra/cli/utils.py +12 -3
  6. {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/layout/paddle_layout.py +13 -78
  7. {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/vlm/provider.py +86 -58
  8. {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/vlm/service.py +10 -14
  9. doctra-0.3.0/doctra/exporters/html_writer.py +1235 -0
  10. {doctra-0.1.1 → doctra-0.3.0}/doctra/parsers/structured_pdf_parser.py +35 -15
  11. {doctra-0.1.1 → doctra-0.3.0}/doctra/parsers/table_chart_extractor.py +66 -28
  12. doctra-0.3.0/doctra/ui/__init__.py +5 -0
  13. doctra-0.3.0/doctra/ui/app.py +1012 -0
  14. doctra-0.3.0/doctra/utils/progress.py +428 -0
  15. {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/structured_utils.py +49 -49
  16. {doctra-0.1.1 → doctra-0.3.0}/doctra/version.py +1 -1
  17. {doctra-0.1.1 → doctra-0.3.0/doctra.egg-info}/PKG-INFO +45 -6
  18. {doctra-0.1.1 → doctra-0.3.0}/doctra.egg-info/SOURCES.txt +4 -0
  19. {doctra-0.1.1 → doctra-0.3.0}/doctra.egg-info/requires.txt +3 -0
  20. {doctra-0.1.1 → doctra-0.3.0}/pyproject.toml +1 -0
  21. doctra-0.3.0/requirements.txt +0 -0
  22. {doctra-0.1.1 → doctra-0.3.0}/setup.py +1 -0
  23. doctra-0.1.1/requirements.txt +0 -0
  24. {doctra-0.1.1 → doctra-0.3.0}/LICENSE +0 -0
  25. {doctra-0.1.1 → doctra-0.3.0}/MANIFEST.in +0 -0
  26. {doctra-0.1.1 → doctra-0.3.0}/doctra/cli/__init__.py +0 -0
  27. {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/__init__.py +0 -0
  28. {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/layout/__init__.py +0 -0
  29. {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/layout/layout_models.py +0 -0
  30. {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/ocr/__init__.py +0 -0
  31. {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/ocr/api.py +0 -0
  32. {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/ocr/path_resolver.py +0 -0
  33. {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/ocr/pytesseract_engine.py +0 -0
  34. {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/vlm/__init__.py +0 -0
  35. {doctra-0.1.1 → doctra-0.3.0}/doctra/engines/vlm/outlines_types.py +0 -0
  36. {doctra-0.1.1 → doctra-0.3.0}/doctra/exporters/__init__.py +0 -0
  37. {doctra-0.1.1 → doctra-0.3.0}/doctra/exporters/excel_writer.py +0 -0
  38. {doctra-0.1.1 → doctra-0.3.0}/doctra/exporters/image_saver.py +0 -0
  39. {doctra-0.1.1 → doctra-0.3.0}/doctra/exporters/markdown_table.py +0 -0
  40. {doctra-0.1.1 → doctra-0.3.0}/doctra/exporters/markdown_writer.py +0 -0
  41. {doctra-0.1.1 → doctra-0.3.0}/doctra/parsers/__init__.py +0 -0
  42. {doctra-0.1.1 → doctra-0.3.0}/doctra/parsers/layout_order.py +0 -0
  43. {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/__init__.py +0 -0
  44. {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/bbox.py +0 -0
  45. {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/constants.py +0 -0
  46. {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/file_ops.py +0 -0
  47. {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/io_utils.py +0 -0
  48. {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/ocr_utils.py +0 -0
  49. {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/pdf_io.py +0 -0
  50. {doctra-0.1.1 → doctra-0.3.0}/doctra/utils/quiet.py +0 -0
  51. {doctra-0.1.1 → doctra-0.3.0}/doctra.egg-info/dependency_links.txt +0 -0
  52. {doctra-0.1.1 → doctra-0.3.0}/doctra.egg-info/not-zip-safe +0 -0
  53. {doctra-0.1.1 → doctra-0.3.0}/doctra.egg-info/top_level.txt +0 -0
  54. {doctra-0.1.1 → doctra-0.3.0}/setup.cfg +0 -0
  55. {doctra-0.1.1 → doctra-0.3.0}/tests/test_structured_pdf_parser.py +0 -0
  56. {doctra-0.1.1 → doctra-0.3.0}/tests/test_table_chart_extractor.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: doctra
3
- Version: 0.1.1
3
+ Version: 0.3.0
4
4
  Summary: Parse, extract, and analyze documents with ease
5
5
  Home-page: https://github.com/AdemBoukhris457/Doctra
6
6
  Author: Adem Boukhris
@@ -241,6 +241,8 @@ Provides-Extra: openai
241
241
  Requires-Dist: openai>=1.0.0; extra == "openai"
242
242
  Provides-Extra: gemini
243
243
  Requires-Dist: google-generativeai>=0.3.0; extra == "gemini"
244
+ Provides-Extra: anthropic
245
+ Requires-Dist: anthropic>=0.40.0; extra == "anthropic"
244
246
  Provides-Extra: dev
245
247
  Requires-Dist: pytest>=6.0; extra == "dev"
246
248
  Requires-Dist: pytest-cov>=2.0; extra == "dev"
@@ -329,7 +331,7 @@ parser = StructuredPDFParser()
329
331
  # Parser with VLM for structured data extraction
330
332
  parser = StructuredPDFParser(
331
333
  use_vlm=True,
332
- vlm_provider="openai", # or "gemini"
334
+ vlm_provider="openai", # or "gemini", "anthropic", or "openrouter"
333
335
  vlm_api_key="your_api_key_here"
334
336
  )
335
337
 
@@ -344,7 +346,7 @@ parser = StructuredPDFParser(
344
346
  # VLM Settings
345
347
  use_vlm=True,
346
348
  vlm_provider="openai",
347
- vlm_model="gpt-4o",
349
+ vlm_model="gpt-5",
348
350
  vlm_api_key="your_api_key",
349
351
 
350
352
  # Layout Detection Settings
@@ -406,7 +408,7 @@ parser = ChartTablePDFParser(
406
408
  # VLM Settings
407
409
  use_vlm=True,
408
410
  vlm_provider="openai",
409
- vlm_model="gpt-4o",
411
+ vlm_model="gpt-5",
410
412
  vlm_api_key="your_api_key",
411
413
 
412
414
  # Layout Detection Settings
@@ -545,7 +547,7 @@ parser = StructuredPDFParser(
545
547
  use_vlm=True,
546
548
  vlm_provider="openai",
547
549
  vlm_api_key="your_openai_api_key",
548
- vlm__model="gpt-4o",
550
+ vlm_model="gpt-5",
549
551
  layout_model_name="PP-DocLayout_plus-L",
550
552
  dpi=300, # Higher DPI for better quality
551
553
  min_score=0.5, # Higher confidence threshold
@@ -623,4 +625,41 @@ parser.display_pages_with_boxes("document.pdf")
623
625
  - **Pandas**: Data manipulation
624
626
  - **OpenPyXL**: Excel file generation
625
627
  - **Google Generative AI**: For Gemini VLM integration
626
- - **OpenAI**: For GPT-4 VLM integration
628
+ - **OpenAI**: For GPT-5 VLM integration
629
+
630
+ ## 🖥️ Web Interface (Gradio)
631
+
632
+ You can try Doctra in a simple web UI powered by Gradio.
633
+
634
+ ### Run locally
635
+
636
+ ```bash
637
+ pip install -U gradio
638
+ python gradio_app.py
639
+ ```
640
+
641
+ Then open the printed URL (default `http://127.0.0.1:7860`).
642
+
643
+ Notes:
644
+ - If using VLM, set the API key field in the UI or export `VLM_API_KEY`.
645
+ - Outputs are saved under `outputs/<pdf_stem>/` and previewed in the UI.
646
+
647
+ ### Deploy on Hugging Face Spaces
648
+
649
+ 1) Create a new Space and choose Gradio as the SDK.
650
+
651
+ 2) Add these files to the Space repo:
652
+ - Your package code (or install from PyPI).
653
+ - `gradio_app.py` (entry point).
654
+ - `requirements.txt` with at least:
655
+
656
+ ```text
657
+ doctra
658
+ gradio
659
+ ```
660
+
661
+ 3) Set a secret named `VLM_API_KEY` if you want VLM features.
662
+
663
+ 4) In the Space's `README.md` metadata, set `app_file: gradio_app.py` so the Space launches it (the default entry point is `app.py`).
664
+
665
+ The Space will build and expose the same interface for uploads and processing.