docling 2.50.0__tar.gz → 2.51.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.50.0 → docling-2.51.0}/PKG-INFO +9 -5
- {docling-2.50.0 → docling-2.51.0}/README.md +7 -3
- {docling-2.50.0 → docling-2.51.0}/docling/backend/docling_parse_v4_backend.py +12 -0
- {docling-2.50.0 → docling-2.51.0}/docling/datamodel/pipeline_options.py +3 -1
- {docling-2.50.0 → docling-2.51.0}/docling.egg-info/PKG-INFO +9 -5
- {docling-2.50.0 → docling-2.51.0}/docling.egg-info/requires.txt +1 -1
- {docling-2.50.0 → docling-2.51.0}/pyproject.toml +2 -2
- {docling-2.50.0 → docling-2.51.0}/LICENSE +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/html_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/md_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/mets_gbs_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/chunking/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/cli/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/cli/main.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/cli/models.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/cli/tools.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/datamodel/document.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/datamodel/extraction.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/datamodel/settings.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/document_converter.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/document_extractor.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/exceptions.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/base_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/layout_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/pipeline/base_extraction_pipeline.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/pipeline/extraction_vlm_pipeline.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/py.typed +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/utils/__init__.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/utils/export.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/utils/locks.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/utils/orientation.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/utils/profiling.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/utils/utils.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling/utils/visualization.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling.egg-info/SOURCES.txt +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.50.0 → docling-2.51.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.50.0 → docling-2.51.0}/setup.cfg +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_csv.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_html.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_jats.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_markdown.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_mets_gbs.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_msword.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_backend_webp.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_cli.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_code_formula.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_e2e_conversion.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_extraction.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_input_doc.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_interfaces.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_invalid_input.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_ocr_utils.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_options.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_settings_load.py +0 -0
- {docling-2.50.0 → docling-2.51.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.50.0
|
3
|
+
Version: 2.51.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
|
|
27
27
|
License-File: LICENSE
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
29
|
Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
|
30
|
-
Requires-Dist: docling-parse<5.0.0,>=4.
|
30
|
+
Requires-Dist: docling-parse<5.0.0,>=4.4.0
|
31
31
|
Requires-Dist: docling-ibm-models<4,>=3.9.1
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
33
33
|
Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
|
@@ -101,17 +101,20 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
101
101
|
|
102
102
|
## Features
|
103
103
|
|
104
|
-
* 🗂️
|
104
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
105
105
|
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
106
106
|
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
107
|
-
* ↪️
|
107
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
108
108
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
109
109
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
110
110
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
111
111
|
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
112
|
-
* 🎙️
|
112
|
+
* 🎙️ Audio support with Automatic Speech Recognition (ASR) models
|
113
113
|
* 💻 Simple and convenient CLI
|
114
114
|
|
115
|
+
### What's new
|
116
|
+
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
|
117
|
+
|
115
118
|
### Coming soon
|
116
119
|
|
117
120
|
* 📝 Metadata extraction, including title, authors, references & language
|
@@ -222,3 +225,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
|
|
222
225
|
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
223
226
|
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
224
227
|
[integrations]: https://docling-project.github.io/docling/integrations/
|
228
|
+
[extraction]: https://docling-project.github.io/docling/examples/extraction/
|
@@ -29,17 +29,20 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
29
29
|
|
30
30
|
## Features
|
31
31
|
|
32
|
-
* 🗂️
|
32
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
33
33
|
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
34
34
|
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
35
|
-
* ↪️
|
35
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
36
36
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
37
37
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
38
38
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
39
39
|
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
40
|
-
* 🎙️
|
40
|
+
* 🎙️ Audio support with Automatic Speech Recognition (ASR) models
|
41
41
|
* 💻 Simple and convenient CLI
|
42
42
|
|
43
|
+
### What's new
|
44
|
+
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
|
45
|
+
|
43
46
|
### Coming soon
|
44
47
|
|
45
48
|
* 📝 Metadata extraction, including title, authors, references & language
|
@@ -150,3 +153,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
|
|
150
153
|
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
151
154
|
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
152
155
|
[integrations]: https://docling-project.github.io/docling/integrations/
|
156
|
+
[extraction]: https://docling-project.github.io/docling/examples/extraction/
|
@@ -30,13 +30,21 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
30
30
|
page_no: int,
|
31
31
|
create_words: bool = True,
|
32
32
|
create_textlines: bool = True,
|
33
|
+
keep_chars: bool = False,
|
34
|
+
keep_lines: bool = False,
|
35
|
+
keep_images: bool = True,
|
33
36
|
):
|
34
37
|
self._ppage = page_obj
|
35
38
|
self._dp_doc = dp_doc
|
36
39
|
self._page_no = page_no
|
40
|
+
|
37
41
|
self._create_words = create_words
|
38
42
|
self._create_textlines = create_textlines
|
39
43
|
|
44
|
+
self._keep_chars = keep_chars
|
45
|
+
self._keep_lines = keep_lines
|
46
|
+
self._keep_images = keep_images
|
47
|
+
|
40
48
|
self._dpage: Optional[SegmentedPdfPage] = None
|
41
49
|
self._unloaded = False
|
42
50
|
self.valid = (self._ppage is not None) and (self._dp_doc is not None)
|
@@ -47,8 +55,12 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
47
55
|
|
48
56
|
seg_page = self._dp_doc.get_page(
|
49
57
|
self._page_no + 1,
|
58
|
+
keep_chars=self._keep_chars,
|
59
|
+
keep_lines=self._keep_lines,
|
60
|
+
keep_bitmaps=self._keep_images,
|
50
61
|
create_words=self._create_words,
|
51
62
|
create_textlines=self._create_textlines,
|
63
|
+
enforce_same_font=True,
|
52
64
|
)
|
53
65
|
|
54
66
|
# In Docling, all TextCell instances are expected with top-left origin.
|
@@ -237,7 +237,9 @@ class PdfBackend(str, Enum):
|
|
237
237
|
|
238
238
|
|
239
239
|
# Define an enum for the ocr engines
|
240
|
-
@deprecated(
|
240
|
+
@deprecated(
|
241
|
+
"Use get_ocr_factory().registered_kind to get a list of registered OCR engines."
|
242
|
+
)
|
241
243
|
class OcrEngine(str, Enum):
|
242
244
|
"""Enum of valid OCR engines."""
|
243
245
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.50.0
|
3
|
+
Version: 2.51.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
|
|
27
27
|
License-File: LICENSE
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
29
|
Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
|
30
|
-
Requires-Dist: docling-parse<5.0.0,>=4.
|
30
|
+
Requires-Dist: docling-parse<5.0.0,>=4.4.0
|
31
31
|
Requires-Dist: docling-ibm-models<4,>=3.9.1
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
33
33
|
Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
|
@@ -101,17 +101,20 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
101
101
|
|
102
102
|
## Features
|
103
103
|
|
104
|
-
* 🗂️
|
104
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
105
105
|
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
106
106
|
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
107
|
-
* ↪️
|
107
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
108
108
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
109
109
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
110
110
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
111
111
|
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
112
|
-
* 🎙️
|
112
|
+
* 🎙️ Audio support with Automatic Speech Recognition (ASR) models
|
113
113
|
* 💻 Simple and convenient CLI
|
114
114
|
|
115
|
+
### What's new
|
116
|
+
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
|
117
|
+
|
115
118
|
### Coming soon
|
116
119
|
|
117
120
|
* 📝 Metadata extraction, including title, authors, references & language
|
@@ -222,3 +225,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
|
|
222
225
|
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
223
226
|
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
224
227
|
[integrations]: https://docling-project.github.io/docling/integrations/
|
228
|
+
[extraction]: https://docling-project.github.io/docling/examples/extraction/
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.50.0" # DO NOT EDIT, updated automatically
|
3
|
+
version = "2.51.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
license = "MIT"
|
6
6
|
keywords = [
|
@@ -45,7 +45,7 @@ requires-python = '>=3.9,<4.0'
|
|
45
45
|
dependencies = [
|
46
46
|
'pydantic (>=2.0.0,<3.0.0)',
|
47
47
|
'docling-core[chunking] (>=2.42.0,<3.0.0)',
|
48
|
-
'docling-parse (>=4.
|
48
|
+
'docling-parse (>=4.4.0,<5.0.0)',
|
49
49
|
"docling-ibm-models>=3.9.1,<4",
|
50
50
|
'filetype (>=1.2.0,<2.0.0)',
|
51
51
|
'pypdfium2 (>=4.30.0,!=4.30.1,<5.0.0)',
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{docling-2.50.0 → docling-2.51.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|