docling 2.49.0__tar.gz → 2.51.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.49.0 → docling-2.51.0}/PKG-INFO +10 -6
- {docling-2.49.0 → docling-2.51.0}/README.md +7 -3
- {docling-2.49.0 → docling-2.51.0}/docling/backend/docling_parse_v4_backend.py +12 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/html_backend.py +3 -2
- {docling-2.49.0 → docling-2.51.0}/docling/datamodel/pipeline_options.py +4 -2
- {docling-2.49.0 → docling-2.51.0}/docling/models/layout_model.py +3 -3
- {docling-2.49.0 → docling-2.51.0}/docling/models/page_preprocessing_model.py +1 -1
- {docling-2.49.0 → docling-2.51.0}/docling/models/table_structure_model.py +1 -1
- {docling-2.49.0 → docling-2.51.0}/docling/utils/model_downloader.py +2 -1
- {docling-2.49.0 → docling-2.51.0}/docling.egg-info/PKG-INFO +10 -6
- {docling-2.49.0 → docling-2.51.0}/docling.egg-info/requires.txt +2 -2
- {docling-2.49.0 → docling-2.51.0}/pyproject.toml +3 -3
- {docling-2.49.0 → docling-2.51.0}/tests/test_e2e_conversion.py +9 -1
- {docling-2.49.0 → docling-2.51.0}/LICENSE +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/md_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/mets_gbs_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/chunking/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/cli/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/cli/main.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/cli/models.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/cli/tools.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/datamodel/document.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/datamodel/extraction.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/datamodel/settings.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/document_converter.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/document_extractor.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/exceptions.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/base_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/pipeline/base_extraction_pipeline.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/pipeline/extraction_vlm_pipeline.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/py.typed +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/utils/__init__.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/utils/export.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/utils/locks.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/utils/orientation.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/utils/profiling.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/utils/utils.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling/utils/visualization.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling.egg-info/SOURCES.txt +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.49.0 → docling-2.51.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.49.0 → docling-2.51.0}/setup.cfg +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_csv.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_html.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_jats.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_markdown.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_mets_gbs.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_msword.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_backend_webp.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_cli.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_code_formula.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_extraction.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_input_doc.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_interfaces.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_invalid_input.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_ocr_utils.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_options.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_settings_load.py +0 -0
- {docling-2.49.0 → docling-2.51.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.49.0
|
3
|
+
Version: 2.51.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -27,8 +27,8 @@ Description-Content-Type: text/markdown
|
|
27
27
|
License-File: LICENSE
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
29
|
Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
|
30
|
-
Requires-Dist: docling-parse<5.0.0,>=4.
|
31
|
-
Requires-Dist: docling-ibm-models<4,>=3.9.
|
30
|
+
Requires-Dist: docling-parse<5.0.0,>=4.4.0
|
31
|
+
Requires-Dist: docling-ibm-models<4,>=3.9.1
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
33
33
|
Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
|
34
34
|
Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
|
@@ -101,17 +101,20 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
101
101
|
|
102
102
|
## Features
|
103
103
|
|
104
|
-
* 🗂️
|
104
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
105
105
|
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
106
106
|
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
107
|
-
* ↪️
|
107
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
108
108
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
109
109
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
110
110
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
111
111
|
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
112
|
-
* 🎙️
|
112
|
+
* 🎙️ Audio support with Automatic Speech Recognition (ASR) models
|
113
113
|
* 💻 Simple and convenient CLI
|
114
114
|
|
115
|
+
### What's new
|
116
|
+
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
|
117
|
+
|
115
118
|
### Coming soon
|
116
119
|
|
117
120
|
* 📝 Metadata extraction, including title, authors, references & language
|
@@ -222,3 +225,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
|
|
222
225
|
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
223
226
|
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
224
227
|
[integrations]: https://docling-project.github.io/docling/integrations/
|
228
|
+
[extraction]: https://docling-project.github.io/docling/examples/extraction/
|
@@ -29,17 +29,20 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
29
29
|
|
30
30
|
## Features
|
31
31
|
|
32
|
-
* 🗂️
|
32
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
33
33
|
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
34
34
|
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
35
|
-
* ↪️
|
35
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
36
36
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
37
37
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
38
38
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
39
39
|
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
40
|
-
* 🎙️
|
40
|
+
* 🎙️ Audio support with Automatic Speech Recognition (ASR) models
|
41
41
|
* 💻 Simple and convenient CLI
|
42
42
|
|
43
|
+
### What's new
|
44
|
+
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
|
45
|
+
|
43
46
|
### Coming soon
|
44
47
|
|
45
48
|
* 📝 Metadata extraction, including title, authors, references & language
|
@@ -150,3 +153,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
|
|
150
153
|
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
151
154
|
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
152
155
|
[integrations]: https://docling-project.github.io/docling/integrations/
|
156
|
+
[extraction]: https://docling-project.github.io/docling/examples/extraction/
|
@@ -30,13 +30,21 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
30
30
|
page_no: int,
|
31
31
|
create_words: bool = True,
|
32
32
|
create_textlines: bool = True,
|
33
|
+
keep_chars: bool = False,
|
34
|
+
keep_lines: bool = False,
|
35
|
+
keep_images: bool = True,
|
33
36
|
):
|
34
37
|
self._ppage = page_obj
|
35
38
|
self._dp_doc = dp_doc
|
36
39
|
self._page_no = page_no
|
40
|
+
|
37
41
|
self._create_words = create_words
|
38
42
|
self._create_textlines = create_textlines
|
39
43
|
|
44
|
+
self._keep_chars = keep_chars
|
45
|
+
self._keep_lines = keep_lines
|
46
|
+
self._keep_images = keep_images
|
47
|
+
|
40
48
|
self._dpage: Optional[SegmentedPdfPage] = None
|
41
49
|
self._unloaded = False
|
42
50
|
self.valid = (self._ppage is not None) and (self._dp_doc is not None)
|
@@ -47,8 +55,12 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
47
55
|
|
48
56
|
seg_page = self._dp_doc.get_page(
|
49
57
|
self._page_no + 1,
|
58
|
+
keep_chars=self._keep_chars,
|
59
|
+
keep_lines=self._keep_lines,
|
60
|
+
keep_bitmaps=self._keep_images,
|
50
61
|
create_words=self._create_words,
|
51
62
|
create_textlines=self._create_textlines,
|
63
|
+
enforce_same_font=True,
|
52
64
|
)
|
53
65
|
|
54
66
|
# In Docling, all TextCell instances are expected with top-left origin.
|
@@ -467,13 +467,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
467
467
|
|
468
468
|
@contextmanager
|
469
469
|
def _use_hyperlink(self, tag: Tag):
|
470
|
+
old_hyperlink: Union[AnyUrl, Path, None] = None
|
471
|
+
new_hyperlink: Union[AnyUrl, Path, None] = None
|
470
472
|
this_href = tag.get("href")
|
471
473
|
if this_href is None:
|
472
474
|
yield None
|
473
475
|
else:
|
474
476
|
if isinstance(this_href, str) and this_href:
|
475
|
-
old_hyperlink
|
476
|
-
new_hyperlink: Union[AnyUrl, Path, None] = None
|
477
|
+
old_hyperlink = self.hyperlink
|
477
478
|
if self.original_url is not None:
|
478
479
|
this_href = urljoin(str(self.original_url), str(this_href))
|
479
480
|
# ugly fix for relative links since pydantic does not support them.
|
@@ -237,7 +237,9 @@ class PdfBackend(str, Enum):
|
|
237
237
|
|
238
238
|
|
239
239
|
# Define an enum for the ocr engines
|
240
|
-
@deprecated(
|
240
|
+
@deprecated(
|
241
|
+
"Use get_ocr_factory().registered_kind to get a list of registered OCR engines."
|
242
|
+
)
|
241
243
|
class OcrEngine(str, Enum):
|
242
244
|
"""Enum of valid OCR engines."""
|
243
245
|
|
@@ -283,10 +285,10 @@ class LayoutOptions(BaseModel):
|
|
283
285
|
keep_empty_clusters: bool = (
|
284
286
|
False # Whether to keep clusters that contain no text cells
|
285
287
|
)
|
288
|
+
model_spec: LayoutModelConfig = DOCLING_LAYOUT_HERON
|
286
289
|
skip_cell_assignment: bool = (
|
287
290
|
False # Skip cell-to-cluster assignment for VLM-only processing
|
288
291
|
)
|
289
|
-
model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
|
290
292
|
|
291
293
|
|
292
294
|
class AsrPipelineOptions(PipelineOptions):
|
@@ -91,7 +91,7 @@ class LayoutModel(BasePageModel):
|
|
91
91
|
local_dir: Optional[Path] = None,
|
92
92
|
force: bool = False,
|
93
93
|
progress: bool = False,
|
94
|
-
layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2,
|
94
|
+
layout_model_config: LayoutModelConfig = LayoutOptions().model_spec, # use default
|
95
95
|
) -> Path:
|
96
96
|
return download_hf_model(
|
97
97
|
repo_id=layout_model_config.repo_id,
|
@@ -122,8 +122,8 @@ class LayoutModel(BasePageModel):
|
|
122
122
|
left_clusters = [c for c in clusters if c.label not in exclude_labels]
|
123
123
|
right_clusters = [c for c in clusters if c.label in exclude_labels]
|
124
124
|
# Create a deep copy of the original image for both sides
|
125
|
-
left_image =
|
126
|
-
right_image =
|
125
|
+
left_image = page.image.copy()
|
126
|
+
right_image = page.image.copy()
|
127
127
|
|
128
128
|
# Draw clusters on both images
|
129
129
|
draw_clusters(left_image, left_clusters, scale_x, scale_y)
|
@@ -90,7 +90,7 @@ class PagePreprocessingModel(BasePageModel):
|
|
90
90
|
|
91
91
|
# DEBUG code:
|
92
92
|
def draw_text_boxes(image, cells, show: bool = False):
|
93
|
-
draw = ImageDraw.Draw(image)
|
93
|
+
draw = ImageDraw.Draw(image.copy())
|
94
94
|
for c in cells:
|
95
95
|
x0, y0, x1, y1 = (
|
96
96
|
c.to_bounding_box().l,
|
@@ -4,6 +4,7 @@ from typing import Optional
|
|
4
4
|
|
5
5
|
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2
|
6
6
|
from docling.datamodel.pipeline_options import (
|
7
|
+
LayoutOptions,
|
7
8
|
granite_picture_description,
|
8
9
|
smolvlm_picture_description,
|
9
10
|
)
|
@@ -47,7 +48,7 @@ def download_models(
|
|
47
48
|
if with_layout:
|
48
49
|
_log.info("Downloading layout model...")
|
49
50
|
LayoutModel.download_models(
|
50
|
-
local_dir=output_dir / DOCLING_LAYOUT_V2.model_repo_folder,
|
51
|
+
local_dir=output_dir / LayoutOptions().model_spec.model_repo_folder,
|
51
52
|
force=force,
|
52
53
|
progress=progress,
|
53
54
|
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.49.0
|
3
|
+
Version: 2.51.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -27,8 +27,8 @@ Description-Content-Type: text/markdown
|
|
27
27
|
License-File: LICENSE
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
29
|
Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
|
30
|
-
Requires-Dist: docling-parse<5.0.0,>=4.
|
31
|
-
Requires-Dist: docling-ibm-models<4,>=3.9.
|
30
|
+
Requires-Dist: docling-parse<5.0.0,>=4.4.0
|
31
|
+
Requires-Dist: docling-ibm-models<4,>=3.9.1
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
33
33
|
Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
|
34
34
|
Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
|
@@ -101,17 +101,20 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
101
101
|
|
102
102
|
## Features
|
103
103
|
|
104
|
-
* 🗂️
|
104
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
105
105
|
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
106
106
|
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
107
|
-
* ↪️
|
107
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
108
108
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
109
109
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
110
110
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
111
111
|
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
112
|
-
* 🎙️
|
112
|
+
* 🎙️ Audio support with Automatic Speech Recognition (ASR) models
|
113
113
|
* 💻 Simple and convenient CLI
|
114
114
|
|
115
|
+
### What's new
|
116
|
+
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
|
117
|
+
|
115
118
|
### Coming soon
|
116
119
|
|
117
120
|
* 📝 Metadata extraction, including title, authors, references & language
|
@@ -222,3 +225,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
|
|
222
225
|
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
223
226
|
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
224
227
|
[integrations]: https://docling-project.github.io/docling/integrations/
|
228
|
+
[extraction]: https://docling-project.github.io/docling/examples/extraction/
|
@@ -1,7 +1,7 @@
|
|
1
1
|
pydantic<3.0.0,>=2.0.0
|
2
2
|
docling-core[chunking]<3.0.0,>=2.42.0
|
3
|
-
docling-parse<5.0.0,>=4.
|
4
|
-
docling-ibm-models<4,>=3.9.
|
3
|
+
docling-parse<5.0.0,>=4.4.0
|
4
|
+
docling-ibm-models<4,>=3.9.1
|
5
5
|
filetype<2.0.0,>=1.2.0
|
6
6
|
pypdfium2!=4.30.1,<5.0.0,>=4.30.0
|
7
7
|
pydantic-settings<3.0.0,>=2.3.0
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.49.0" # DO NOT EDIT, updated automatically
|
3
|
+
version = "2.51.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
license = "MIT"
|
6
6
|
keywords = [
|
@@ -45,8 +45,8 @@ requires-python = '>=3.9,<4.0'
|
|
45
45
|
dependencies = [
|
46
46
|
'pydantic (>=2.0.0,<3.0.0)',
|
47
47
|
'docling-core[chunking] (>=2.42.0,<3.0.0)',
|
48
|
-
'docling-parse (>=4.
|
49
|
-
"docling-ibm-models>=3.9.
|
48
|
+
'docling-parse (>=4.4.0,<5.0.0)',
|
49
|
+
"docling-ibm-models>=3.9.1,<4",
|
50
50
|
'filetype (>=1.2.0,<2.0.0)',
|
51
51
|
'pypdfium2 (>=4.30.0,!=4.30.1,<5.0.0)',
|
52
52
|
'pydantic-settings (>=2.3.0,<3.0.0)',
|
@@ -11,6 +11,8 @@ from .verify_utils import verify_conversion_result_v2
|
|
11
11
|
|
12
12
|
GENERATE_V2 = GEN_TEST_DATA
|
13
13
|
|
14
|
+
SKIP_DOCTAGS_COMPARISON = ["2203.01017v2.pdf"]
|
15
|
+
|
14
16
|
|
15
17
|
def get_pdf_paths():
|
16
18
|
# Define the directory you want to search
|
@@ -50,6 +52,12 @@ def test_e2e_pdfs_conversions():
|
|
50
52
|
|
51
53
|
doc_result: ConversionResult = converter.convert(pdf_path)
|
52
54
|
|
55
|
+
# Decide if to skip doctags comparison
|
56
|
+
verify_doctags = pdf_path.name not in SKIP_DOCTAGS_COMPARISON
|
57
|
+
|
53
58
|
verify_conversion_result_v2(
|
54
|
-
input_path=pdf_path,
|
59
|
+
input_path=pdf_path,
|
60
|
+
doc_result=doc_result,
|
61
|
+
generate=GENERATE_V2,
|
62
|
+
verify_doctags=verify_doctags,
|
55
63
|
)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{docling-2.49.0 → docling-2.51.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|