docling 2.51.0__tar.gz → 2.53.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- {docling-2.51.0 → docling-2.53.0}/PKG-INFO +10 -6
- {docling-2.51.0 → docling-2.53.0}/README.md +8 -4
- {docling-2.51.0 → docling-2.53.0}/docling/cli/main.py +44 -1
- {docling-2.51.0 → docling-2.53.0}/docling/cli/models.py +4 -0
- {docling-2.51.0 → docling-2.53.0}/docling/datamodel/pipeline_options.py +21 -12
- {docling-2.51.0 → docling-2.53.0}/docling/datamodel/vlm_model_specs.py +30 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/base_model.py +27 -2
- {docling-2.51.0 → docling-2.53.0}/docling/models/easyocr_model.py +19 -9
- {docling-2.51.0 → docling-2.53.0}/docling/models/picture_description_vlm_model.py +1 -1
- {docling-2.51.0 → docling-2.53.0}/docling/models/rapid_ocr_model.py +40 -25
- {docling-2.51.0 → docling-2.53.0}/docling/models/vlm_models_inline/hf_transformers_model.py +1 -1
- {docling-2.51.0 → docling-2.53.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py +1 -1
- {docling-2.51.0 → docling-2.53.0}/docling/pipeline/asr_pipeline.py +1 -13
- {docling-2.51.0 → docling-2.53.0}/docling/pipeline/base_extraction_pipeline.py +17 -3
- {docling-2.51.0 → docling-2.53.0}/docling/pipeline/base_pipeline.py +75 -9
- {docling-2.51.0 → docling-2.53.0}/docling/pipeline/extraction_vlm_pipeline.py +9 -16
- {docling-2.51.0 → docling-2.53.0}/docling/pipeline/simple_pipeline.py +6 -6
- {docling-2.51.0 → docling-2.53.0}/docling/pipeline/standard_pdf_pipeline.py +6 -55
- {docling-2.51.0 → docling-2.53.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +102 -62
- {docling-2.51.0 → docling-2.53.0}/docling/pipeline/vlm_pipeline.py +3 -15
- {docling-2.51.0 → docling-2.53.0}/docling/utils/model_downloader.py +22 -0
- {docling-2.51.0 → docling-2.53.0}/docling.egg-info/PKG-INFO +10 -6
- {docling-2.51.0 → docling-2.53.0}/docling.egg-info/requires.txt +1 -1
- {docling-2.51.0 → docling-2.53.0}/pyproject.toml +2 -2
- {docling-2.51.0 → docling-2.53.0}/tests/test_e2e_ocr_conversion.py +10 -0
- {docling-2.51.0 → docling-2.53.0}/LICENSE +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/html_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/md_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/mets_gbs_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/chunking/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/cli/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/cli/tools.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/datamodel/document.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/datamodel/extraction.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/datamodel/settings.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/document_converter.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/document_extractor.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/exceptions.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/layout_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/py.typed +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/utils/__init__.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/utils/export.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/utils/locks.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/utils/orientation.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/utils/profiling.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/utils/utils.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling/utils/visualization.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling.egg-info/SOURCES.txt +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.51.0 → docling-2.53.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.51.0 → docling-2.53.0}/setup.cfg +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_csv.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_html.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_jats.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_markdown.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_mets_gbs.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_msword.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_backend_webp.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_cli.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_code_formula.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_e2e_conversion.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_extraction.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_input_doc.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_interfaces.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_invalid_input.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_ocr_utils.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_options.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_settings_load.py +0 -0
- {docling-2.51.0 → docling-2.53.0}/tests/test_threaded_pipeline.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling
|
|
3
|
-
Version: 2.51.0
|
|
3
|
+
Version: 2.53.0
|
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
|
|
|
26
26
|
Description-Content-Type: text/markdown
|
|
27
27
|
License-File: LICENSE
|
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
|
29
|
-
Requires-Dist: docling-core[chunking]<3.0.0,>=2.
|
|
29
|
+
Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.0
|
|
30
30
|
Requires-Dist: docling-parse<5.0.0,>=4.4.0
|
|
31
31
|
Requires-Dist: docling-ibm-models<4,>=3.9.1
|
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
|
@@ -108,18 +108,22 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
|
108
108
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
|
109
109
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
|
110
110
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
|
111
|
-
* 👓 Support of several Visual Language Models ([
|
|
111
|
+
* 👓 Support of several Visual Language Models ([GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M))
|
|
112
112
|
* 🎙️ Audio support with Automatic Speech Recognition (ASR) models
|
|
113
|
+
* 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
|
|
113
114
|
* 💻 Simple and convenient CLI
|
|
114
115
|
|
|
115
116
|
### What's new
|
|
116
117
|
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
|
|
118
|
+
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
|
|
119
|
+
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
|
|
117
120
|
|
|
118
121
|
### Coming soon
|
|
119
122
|
|
|
120
123
|
* 📝 Metadata extraction, including title, authors, references & language
|
|
121
124
|
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
|
122
125
|
* 📝 Complex chemistry understanding (Molecular structures)
|
|
126
|
+
* 📝 Parsing of Web Video Text Tracks (WebVTT) files
|
|
123
127
|
|
|
124
128
|
## Installation
|
|
125
129
|
|
|
@@ -145,7 +149,7 @@ result = converter.convert(source)
|
|
|
145
149
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
|
146
150
|
```
|
|
147
151
|
|
|
148
|
-
More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
|
|
152
|
+
More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
|
|
149
153
|
the docs.
|
|
150
154
|
|
|
151
155
|
## CLI
|
|
@@ -156,9 +160,9 @@ Docling has a built-in CLI to run conversions.
|
|
|
156
160
|
docling https://arxiv.org/pdf/2206.01062
|
|
157
161
|
```
|
|
158
162
|
|
|
159
|
-
You can also use 🥚[
|
|
163
|
+
You can also use 🥚[GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M) and other VLMs via Docling CLI:
|
|
160
164
|
```bash
|
|
161
|
-
docling --pipeline vlm --vlm-model
|
|
165
|
+
docling --pipeline vlm --vlm-model granite_docling https://arxiv.org/pdf/2206.01062
|
|
162
166
|
```
|
|
163
167
|
This will use MLX acceleration on supported Apple Silicon hardware.
|
|
164
168
|
|
|
@@ -36,18 +36,22 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
|
36
36
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
|
37
37
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
|
38
38
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
|
39
|
-
* 👓 Support of several Visual Language Models ([
|
|
39
|
+
* 👓 Support of several Visual Language Models ([GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M))
|
|
40
40
|
* 🎙️ Audio support with Automatic Speech Recognition (ASR) models
|
|
41
|
+
* 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
|
|
41
42
|
* 💻 Simple and convenient CLI
|
|
42
43
|
|
|
43
44
|
### What's new
|
|
44
45
|
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
|
|
46
|
+
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
|
|
47
|
+
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
|
|
45
48
|
|
|
46
49
|
### Coming soon
|
|
47
50
|
|
|
48
51
|
* 📝 Metadata extraction, including title, authors, references & language
|
|
49
52
|
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
|
50
53
|
* 📝 Complex chemistry understanding (Molecular structures)
|
|
54
|
+
* 📝 Parsing of Web Video Text Tracks (WebVTT) files
|
|
51
55
|
|
|
52
56
|
## Installation
|
|
53
57
|
|
|
@@ -73,7 +77,7 @@ result = converter.convert(source)
|
|
|
73
77
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
|
74
78
|
```
|
|
75
79
|
|
|
76
|
-
More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
|
|
80
|
+
More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
|
|
77
81
|
the docs.
|
|
78
82
|
|
|
79
83
|
## CLI
|
|
@@ -84,9 +88,9 @@ Docling has a built-in CLI to run conversions.
|
|
|
84
88
|
docling https://arxiv.org/pdf/2206.01062
|
|
85
89
|
```
|
|
86
90
|
|
|
87
|
-
You can also use 🥚[
|
|
91
|
+
You can also use 🥚[GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M) and other VLMs via Docling CLI:
|
|
88
92
|
```bash
|
|
89
|
-
docling --pipeline vlm --vlm-model
|
|
93
|
+
docling --pipeline vlm --vlm-model granite_docling https://arxiv.org/pdf/2206.01062
|
|
90
94
|
```
|
|
91
95
|
This will use MLX acceleration on supported Apple Silicon hardware.
|
|
92
96
|
|
|
@@ -48,6 +48,7 @@ from docling.datamodel.base_models import (
|
|
|
48
48
|
from docling.datamodel.document import ConversionResult
|
|
49
49
|
from docling.datamodel.pipeline_options import (
|
|
50
50
|
AsrPipelineOptions,
|
|
51
|
+
ConvertPipelineOptions,
|
|
51
52
|
EasyOcrOptions,
|
|
52
53
|
OcrOptions,
|
|
53
54
|
PaginatedPipelineOptions,
|
|
@@ -63,6 +64,8 @@ from docling.datamodel.vlm_model_specs import (
|
|
|
63
64
|
GOT2_TRANSFORMERS,
|
|
64
65
|
GRANITE_VISION_OLLAMA,
|
|
65
66
|
GRANITE_VISION_TRANSFORMERS,
|
|
67
|
+
GRANITEDOCLING_MLX,
|
|
68
|
+
GRANITEDOCLING_TRANSFORMERS,
|
|
66
69
|
SMOLDOCLING_MLX,
|
|
67
70
|
SMOLDOCLING_TRANSFORMERS,
|
|
68
71
|
SMOLDOCLING_VLLM,
|
|
@@ -71,8 +74,13 @@ from docling.datamodel.vlm_model_specs import (
|
|
|
71
74
|
from docling.document_converter import (
|
|
72
75
|
AudioFormatOption,
|
|
73
76
|
DocumentConverter,
|
|
77
|
+
ExcelFormatOption,
|
|
74
78
|
FormatOption,
|
|
79
|
+
HTMLFormatOption,
|
|
80
|
+
MarkdownFormatOption,
|
|
75
81
|
PdfFormatOption,
|
|
82
|
+
PowerpointFormatOption,
|
|
83
|
+
WordFormatOption,
|
|
76
84
|
)
|
|
77
85
|
from docling.models.factories import get_ocr_factory
|
|
78
86
|
from docling.pipeline.asr_pipeline import AsrPipeline
|
|
@@ -328,7 +336,7 @@ def convert( # noqa: C901
|
|
|
328
336
|
vlm_model: Annotated[
|
|
329
337
|
VlmModelType,
|
|
330
338
|
typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
|
|
331
|
-
] = VlmModelType.SMOLDOCLING,
|
|
339
|
+
] = VlmModelType.GRANITEDOCLING,
|
|
332
340
|
asr_model: Annotated[
|
|
333
341
|
AsrModelType,
|
|
334
342
|
typer.Option(..., help="Choose the ASR model to use with audio/video files."),
|
|
@@ -626,10 +634,33 @@ def convert( # noqa: C901
|
|
|
626
634
|
backend=MetsGbsDocumentBackend,
|
|
627
635
|
)
|
|
628
636
|
|
|
637
|
+
# SimplePipeline options
|
|
638
|
+
simple_format_option = ConvertPipelineOptions(
|
|
639
|
+
do_picture_description=enrich_picture_description,
|
|
640
|
+
do_picture_classification=enrich_picture_classes,
|
|
641
|
+
)
|
|
642
|
+
if artifacts_path is not None:
|
|
643
|
+
simple_format_option.artifacts_path = artifacts_path
|
|
644
|
+
|
|
629
645
|
format_options = {
|
|
630
646
|
InputFormat.PDF: pdf_format_option,
|
|
631
647
|
InputFormat.IMAGE: pdf_format_option,
|
|
632
648
|
InputFormat.METS_GBS: mets_gbs_format_option,
|
|
649
|
+
InputFormat.DOCX: WordFormatOption(
|
|
650
|
+
pipeline_options=simple_format_option
|
|
651
|
+
),
|
|
652
|
+
InputFormat.PPTX: PowerpointFormatOption(
|
|
653
|
+
pipeline_options=simple_format_option
|
|
654
|
+
),
|
|
655
|
+
InputFormat.XLSX: ExcelFormatOption(
|
|
656
|
+
pipeline_options=simple_format_option
|
|
657
|
+
),
|
|
658
|
+
InputFormat.HTML: HTMLFormatOption(
|
|
659
|
+
pipeline_options=simple_format_option
|
|
660
|
+
),
|
|
661
|
+
InputFormat.MD: MarkdownFormatOption(
|
|
662
|
+
pipeline_options=simple_format_option
|
|
663
|
+
),
|
|
633
664
|
}
|
|
634
665
|
|
|
635
666
|
elif pipeline == ProcessingPipeline.VLM:
|
|
@@ -655,6 +686,18 @@ def convert( # noqa: C901
|
|
|
655
686
|
"To run SmolDocling faster, please install mlx-vlm:\n"
|
|
656
687
|
"pip install mlx-vlm"
|
|
657
688
|
)
|
|
689
|
+
elif vlm_model == VlmModelType.GRANITEDOCLING:
|
|
690
|
+
pipeline_options.vlm_options = GRANITEDOCLING_TRANSFORMERS
|
|
691
|
+
if sys.platform == "darwin":
|
|
692
|
+
try:
|
|
693
|
+
import mlx_vlm
|
|
694
|
+
|
|
695
|
+
pipeline_options.vlm_options = GRANITEDOCLING_MLX
|
|
696
|
+
except ImportError:
|
|
697
|
+
_log.warning(
|
|
698
|
+
"To run GraniteDocling faster, please install mlx-vlm:\n"
|
|
699
|
+
"pip install mlx-vlm"
|
|
700
|
+
)
|
|
658
701
|
elif vlm_model == VlmModelType.SMOLDOCLING_VLLM:
|
|
659
702
|
pipeline_options.vlm_options = SMOLDOCLING_VLLM
|
|
660
703
|
|
|
@@ -33,6 +33,8 @@ class _AvailableModels(str, Enum):
|
|
|
33
33
|
CODE_FORMULA = "code_formula"
|
|
34
34
|
PICTURE_CLASSIFIER = "picture_classifier"
|
|
35
35
|
SMOLVLM = "smolvlm"
|
|
36
|
+
GRANITEDOCLING = "granitedocling"
|
|
37
|
+
GRANITEDOCLING_MLX = "granitedocling_mlx"
|
|
36
38
|
SMOLDOCLING = "smoldocling"
|
|
37
39
|
SMOLDOCLING_MLX = "smoldocling_mlx"
|
|
38
40
|
GRANITE_VISION = "granite_vision"
|
|
@@ -108,6 +110,8 @@ def download(
|
|
|
108
110
|
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
|
|
109
111
|
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
|
|
110
112
|
with_smolvlm=_AvailableModels.SMOLVLM in to_download,
|
|
113
|
+
with_granitedocling=_AvailableModels.GRANITEDOCLING in to_download,
|
|
114
|
+
with_granitedocling_mlx=_AvailableModels.GRANITEDOCLING_MLX in to_download,
|
|
111
115
|
with_smoldocling=_AvailableModels.SMOLDOCLING in to_download,
|
|
112
116
|
with_smoldocling_mlx=_AvailableModels.SMOLDOCLING_MLX in to_download,
|
|
113
117
|
with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
|
|
@@ -12,7 +12,7 @@ from pydantic import (
|
|
|
12
12
|
)
|
|
13
13
|
from typing_extensions import deprecated
|
|
14
14
|
|
|
15
|
-
from docling.datamodel import asr_model_specs
|
|
15
|
+
from docling.datamodel import asr_model_specs, vlm_model_specs
|
|
16
16
|
|
|
17
17
|
# Import the following for backwards compatibility
|
|
18
18
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
|
@@ -114,7 +114,11 @@ class RapidOcrOptions(OcrOptions):
|
|
|
114
114
|
cls_model_path: Optional[str] = None # same default as rapidocr
|
|
115
115
|
rec_model_path: Optional[str] = None # same default as rapidocr
|
|
116
116
|
rec_keys_path: Optional[str] = None # same default as rapidocr
|
|
117
|
-
rec_font_path: Optional[str] = None #
|
|
117
|
+
rec_font_path: Optional[str] = None # Deprecated, please use font_path instead
|
|
118
|
+
font_path: Optional[str] = None # same default as rapidocr
|
|
119
|
+
|
|
120
|
+
# Dictionary to overwrite or pass-through additional parameters
|
|
121
|
+
rapidocr_params: Dict[str, Any] = Field(default_factory=dict)
|
|
118
122
|
|
|
119
123
|
model_config = ConfigDict(
|
|
120
124
|
extra="forbid",
|
|
@@ -135,6 +139,8 @@ class EasyOcrOptions(OcrOptions):
|
|
|
135
139
|
recog_network: Optional[str] = "standard"
|
|
136
140
|
download_enabled: bool = True
|
|
137
141
|
|
|
142
|
+
suppress_mps_warnings: bool = True
|
|
143
|
+
|
|
138
144
|
model_config = ConfigDict(
|
|
139
145
|
extra="forbid",
|
|
140
146
|
protected_namespaces=(),
|
|
@@ -257,11 +263,21 @@ class PipelineOptions(BaseOptions):
|
|
|
257
263
|
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
|
258
264
|
enable_remote_services: bool = False
|
|
259
265
|
allow_external_plugins: bool = False
|
|
266
|
+
artifacts_path: Optional[Union[Path, str]] = None
|
|
260
267
|
|
|
261
268
|
|
|
262
|
-
class PaginatedPipelineOptions(PipelineOptions):
|
|
263
|
-
|
|
269
|
+
class ConvertPipelineOptions(PipelineOptions):
|
|
270
|
+
"""Base convert pipeline options."""
|
|
271
|
+
|
|
272
|
+
do_picture_classification: bool = False # True: classify pictures in documents
|
|
264
273
|
|
|
274
|
+
do_picture_description: bool = False # True: run describe pictures in documents
|
|
275
|
+
picture_description_options: PictureDescriptionBaseOptions = (
|
|
276
|
+
smolvlm_picture_description
|
|
277
|
+
)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
class PaginatedPipelineOptions(ConvertPipelineOptions):
|
|
265
281
|
images_scale: float = 1.0
|
|
266
282
|
generate_page_images: bool = False
|
|
267
283
|
generate_picture_images: bool = False
|
|
@@ -274,7 +290,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
|
|
274
290
|
)
|
|
275
291
|
# If True, text from backend will be used instead of generated text
|
|
276
292
|
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
|
|
277
|
-
|
|
293
|
+
vlm_model_specs.GRANITEDOCLING_TRANSFORMERS
|
|
278
294
|
)
|
|
279
295
|
|
|
280
296
|
|
|
@@ -293,13 +309,11 @@ class LayoutOptions(BaseModel):
|
|
|
293
309
|
|
|
294
310
|
class AsrPipelineOptions(PipelineOptions):
|
|
295
311
|
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
|
|
296
|
-
artifacts_path: Optional[Union[Path, str]] = None
|
|
297
312
|
|
|
298
313
|
|
|
299
314
|
class VlmExtractionPipelineOptions(PipelineOptions):
|
|
300
315
|
"""Options for extraction pipeline."""
|
|
301
316
|
|
|
302
|
-
artifacts_path: Optional[Union[Path, str]] = None
|
|
303
317
|
vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
|
|
304
318
|
|
|
305
319
|
|
|
@@ -310,8 +324,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
|
310
324
|
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
|
311
325
|
do_code_enrichment: bool = False # True: perform code OCR
|
|
312
326
|
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
|
|
313
|
-
do_picture_classification: bool = False # True: classify pictures in documents
|
|
314
|
-
do_picture_description: bool = False # True: run describe pictures in documents
|
|
315
327
|
force_backend_text: bool = (
|
|
316
328
|
False # (To be used with vlms, or other generative models)
|
|
317
329
|
)
|
|
@@ -319,9 +331,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
|
319
331
|
|
|
320
332
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
|
321
333
|
ocr_options: OcrOptions = EasyOcrOptions()
|
|
322
|
-
picture_description_options: PictureDescriptionBaseOptions = (
|
|
323
|
-
smolvlm_picture_description
|
|
324
|
-
)
|
|
325
334
|
layout_options: LayoutOptions = LayoutOptions()
|
|
326
335
|
|
|
327
336
|
images_scale: float = 1.0
|
|
@@ -18,6 +18,35 @@ from docling.datamodel.pipeline_options_vlm_model import (
|
|
|
18
18
|
_log = logging.getLogger(__name__)
|
|
19
19
|
|
|
20
20
|
|
|
21
|
+
# Granite-Docling
|
|
22
|
+
GRANITEDOCLING_TRANSFORMERS = InlineVlmOptions(
|
|
23
|
+
repo_id="ibm-granite/granite-docling-258M",
|
|
24
|
+
prompt="Convert this page to docling.",
|
|
25
|
+
response_format=ResponseFormat.DOCTAGS,
|
|
26
|
+
inference_framework=InferenceFramework.TRANSFORMERS,
|
|
27
|
+
transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
|
|
28
|
+
supported_devices=[
|
|
29
|
+
AcceleratorDevice.CPU,
|
|
30
|
+
AcceleratorDevice.CUDA,
|
|
31
|
+
],
|
|
32
|
+
scale=2.0,
|
|
33
|
+
temperature=0.0,
|
|
34
|
+
max_new_tokens=8192,
|
|
35
|
+
stop_strings=["</doctag>", "<|end_of_text|>"],
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
GRANITEDOCLING_MLX = InlineVlmOptions(
|
|
39
|
+
repo_id="ibm-granite/granite-docling-258M-mlx",
|
|
40
|
+
prompt="Convert this page to docling.",
|
|
41
|
+
response_format=ResponseFormat.DOCTAGS,
|
|
42
|
+
inference_framework=InferenceFramework.MLX,
|
|
43
|
+
supported_devices=[AcceleratorDevice.MPS],
|
|
44
|
+
scale=2.0,
|
|
45
|
+
temperature=0.0,
|
|
46
|
+
max_new_tokens=8192,
|
|
47
|
+
stop_strings=["</doctag>", "<|end_of_text|>"],
|
|
48
|
+
)
|
|
49
|
+
|
|
21
50
|
# SmolDocling
|
|
22
51
|
SMOLDOCLING_MLX = InlineVlmOptions(
|
|
23
52
|
repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
|
|
@@ -272,3 +301,4 @@ class VlmModelType(str, Enum):
|
|
|
272
301
|
GRANITE_VISION_VLLM = "granite_vision_vllm"
|
|
273
302
|
GRANITE_VISION_OLLAMA = "granite_vision_ollama"
|
|
274
303
|
GOT_OCR_2 = "got_ocr_2"
|
|
304
|
+
GRANITEDOCLING = "granite_docling"
|
|
@@ -4,7 +4,13 @@ from collections.abc import Iterable
|
|
|
4
4
|
from typing import Any, Generic, Optional, Protocol, Type, Union
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
|
-
from docling_core.types.doc import
|
|
7
|
+
from docling_core.types.doc import (
|
|
8
|
+
BoundingBox,
|
|
9
|
+
DocItem,
|
|
10
|
+
DoclingDocument,
|
|
11
|
+
NodeItem,
|
|
12
|
+
PictureItem,
|
|
13
|
+
)
|
|
8
14
|
from PIL.Image import Image
|
|
9
15
|
from typing_extensions import TypeVar
|
|
10
16
|
|
|
@@ -164,8 +170,17 @@ class BaseItemAndImageEnrichmentModel(
|
|
|
164
170
|
return None
|
|
165
171
|
|
|
166
172
|
assert isinstance(element, DocItem)
|
|
167
|
-
element_prov = element.prov[0]
|
|
168
173
|
|
|
174
|
+
# Allow the case of documents without page images but embedded images (e.g. Word and HTML docs)
|
|
175
|
+
if len(element.prov) == 0 and isinstance(element, PictureItem):
|
|
176
|
+
embedded_im = element.get_image(conv_res.document)
|
|
177
|
+
if embedded_im is not None:
|
|
178
|
+
return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
|
|
179
|
+
else:
|
|
180
|
+
return None
|
|
181
|
+
|
|
182
|
+
# Crop the image from the page
|
|
183
|
+
element_prov = element.prov[0]
|
|
169
184
|
bbox = element_prov.bbox
|
|
170
185
|
width = bbox.r - bbox.l
|
|
171
186
|
height = bbox.t - bbox.b
|
|
@@ -183,4 +198,14 @@ class BaseItemAndImageEnrichmentModel(
|
|
|
183
198
|
cropped_image = conv_res.pages[page_ix].get_image(
|
|
184
199
|
scale=self.images_scale, cropbox=expanded_bbox
|
|
185
200
|
)
|
|
201
|
+
|
|
202
|
+
# Allow for images being embedded without the page backend or page images
|
|
203
|
+
if cropped_image is None and isinstance(element, PictureItem):
|
|
204
|
+
embedded_im = element.get_image(conv_res.document)
|
|
205
|
+
if embedded_im is not None:
|
|
206
|
+
return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
|
|
207
|
+
else:
|
|
208
|
+
return None
|
|
209
|
+
|
|
210
|
+
# Return the proper cropped image
|
|
186
211
|
return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
|
|
@@ -78,14 +78,17 @@ class EasyOcrModel(BaseOcrModel):
|
|
|
78
78
|
download_enabled = False
|
|
79
79
|
model_storage_directory = str(artifacts_path / self._model_repo_folder)
|
|
80
80
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
81
|
+
with warnings.catch_warnings():
|
|
82
|
+
if self.options.suppress_mps_warnings:
|
|
83
|
+
warnings.filterwarnings("ignore", message=".*pin_memory.*MPS.*")
|
|
84
|
+
self.reader = easyocr.Reader(
|
|
85
|
+
lang_list=self.options.lang,
|
|
86
|
+
gpu=use_gpu,
|
|
87
|
+
model_storage_directory=model_storage_directory,
|
|
88
|
+
recog_network=self.options.recog_network,
|
|
89
|
+
download_enabled=download_enabled,
|
|
90
|
+
verbose=False,
|
|
91
|
+
)
|
|
89
92
|
|
|
90
93
|
@staticmethod
|
|
91
94
|
def download_models(
|
|
@@ -147,7 +150,14 @@ class EasyOcrModel(BaseOcrModel):
|
|
|
147
150
|
scale=self.scale, cropbox=ocr_rect
|
|
148
151
|
)
|
|
149
152
|
im = numpy.array(high_res_image)
|
|
150
|
-
|
|
153
|
+
|
|
154
|
+
with warnings.catch_warnings():
|
|
155
|
+
if self.options.suppress_mps_warnings:
|
|
156
|
+
warnings.filterwarnings(
|
|
157
|
+
"ignore", message=".*pin_memory.*MPS.*"
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
result = self.reader.readtext(im)
|
|
151
161
|
|
|
152
162
|
del high_res_image
|
|
153
163
|
del im
|
|
@@ -67,7 +67,7 @@ class PictureDescriptionVlmModel(
|
|
|
67
67
|
self.model = AutoModelForImageTextToText.from_pretrained(
|
|
68
68
|
artifacts_path,
|
|
69
69
|
device_map=self.device,
|
|
70
|
-
|
|
70
|
+
dtype=torch.bfloat16,
|
|
71
71
|
_attn_implementation=(
|
|
72
72
|
"flash_attention_2"
|
|
73
73
|
if self.device.startswith("cuda")
|
|
@@ -62,32 +62,44 @@ class RapidOcrModel(BaseOcrModel):
|
|
|
62
62
|
}
|
|
63
63
|
backend_enum = _ALIASES.get(self.options.backend, EngineType.ONNXRUNTIME)
|
|
64
64
|
|
|
65
|
+
params = {
|
|
66
|
+
# Global settings (these are still correct)
|
|
67
|
+
"Global.text_score": self.options.text_score,
|
|
68
|
+
"Global.font_path": self.options.font_path,
|
|
69
|
+
# "Global.verbose": self.options.print_verbose,
|
|
70
|
+
# Detection model settings
|
|
71
|
+
"Det.model_path": self.options.det_model_path,
|
|
72
|
+
"Det.use_cuda": use_cuda,
|
|
73
|
+
"Det.use_dml": use_dml,
|
|
74
|
+
"Det.intra_op_num_threads": intra_op_num_threads,
|
|
75
|
+
# Classification model settings
|
|
76
|
+
"Cls.model_path": self.options.cls_model_path,
|
|
77
|
+
"Cls.use_cuda": use_cuda,
|
|
78
|
+
"Cls.use_dml": use_dml,
|
|
79
|
+
"Cls.intra_op_num_threads": intra_op_num_threads,
|
|
80
|
+
# Recognition model settings
|
|
81
|
+
"Rec.model_path": self.options.rec_model_path,
|
|
82
|
+
"Rec.font_path": self.options.rec_font_path,
|
|
83
|
+
"Rec.keys_path": self.options.rec_keys_path,
|
|
84
|
+
"Rec.use_cuda": use_cuda,
|
|
85
|
+
"Rec.use_dml": use_dml,
|
|
86
|
+
"Rec.intra_op_num_threads": intra_op_num_threads,
|
|
87
|
+
"Det.engine_type": backend_enum,
|
|
88
|
+
"Cls.engine_type": backend_enum,
|
|
89
|
+
"Rec.engine_type": backend_enum,
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
if self.options.rec_font_path is not None:
|
|
93
|
+
_log.warning(
|
|
94
|
+
"The 'rec_font_path' option for RapidOCR is deprecated. Please use 'font_path' instead."
|
|
95
|
+
)
|
|
96
|
+
user_params = self.options.rapidocr_params
|
|
97
|
+
if user_params:
|
|
98
|
+
_log.debug("Overwriting RapidOCR params with user-provided values.")
|
|
99
|
+
params.update(user_params)
|
|
100
|
+
|
|
65
101
|
self.reader = RapidOCR(
|
|
66
|
-
params=
|
|
67
|
-
# Global settings (these are still correct)
|
|
68
|
-
"Global.text_score": self.options.text_score,
|
|
69
|
-
# "Global.verbose": self.options.print_verbose,
|
|
70
|
-
# Detection model settings
|
|
71
|
-
"Det.model_path": self.options.det_model_path,
|
|
72
|
-
"Det.use_cuda": use_cuda,
|
|
73
|
-
"Det.use_dml": use_dml,
|
|
74
|
-
"Det.intra_op_num_threads": intra_op_num_threads,
|
|
75
|
-
# Classification model settings
|
|
76
|
-
"Cls.model_path": self.options.cls_model_path,
|
|
77
|
-
"Cls.use_cuda": use_cuda,
|
|
78
|
-
"Cls.use_dml": use_dml,
|
|
79
|
-
"Cls.intra_op_num_threads": intra_op_num_threads,
|
|
80
|
-
# Recognition model settings
|
|
81
|
-
"Rec.model_path": self.options.rec_model_path,
|
|
82
|
-
"Rec.font_path": self.options.rec_font_path,
|
|
83
|
-
"Rec.keys_path": self.options.rec_keys_path,
|
|
84
|
-
"Rec.use_cuda": use_cuda,
|
|
85
|
-
"Rec.use_dml": use_dml,
|
|
86
|
-
"Rec.intra_op_num_threads": intra_op_num_threads,
|
|
87
|
-
"Det.engine_type": backend_enum,
|
|
88
|
-
"Cls.engine_type": backend_enum,
|
|
89
|
-
"Rec.engine_type": backend_enum,
|
|
90
|
-
}
|
|
102
|
+
params=params,
|
|
91
103
|
)
|
|
92
104
|
|
|
93
105
|
def __call__(
|
|
@@ -120,6 +132,9 @@ class RapidOcrModel(BaseOcrModel):
|
|
|
120
132
|
use_cls=self.options.use_cls,
|
|
121
133
|
use_rec=self.options.use_rec,
|
|
122
134
|
)
|
|
135
|
+
if result is None or result.boxes is None:
|
|
136
|
+
_log.warning("RapidOCR returned empty result!")
|
|
137
|
+
continue
|
|
123
138
|
result = list(
|
|
124
139
|
zip(result.boxes.tolist(), result.txts, result.scores)
|
|
125
140
|
)
|
|
@@ -112,7 +112,7 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
|
|
|
112
112
|
self.vlm_model = model_cls.from_pretrained(
|
|
113
113
|
artifacts_path,
|
|
114
114
|
device_map=self.device,
|
|
115
|
-
|
|
115
|
+
dtype=self.vlm_options.torch_dtype,
|
|
116
116
|
_attn_implementation=(
|
|
117
117
|
"flash_attention_2"
|
|
118
118
|
if self.device.startswith("cuda")
|
{docling-2.51.0 → docling-2.53.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py
RENAMED
|
@@ -144,7 +144,7 @@ class NuExtractTransformersModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
|
|
|
144
144
|
self.vlm_model = AutoModelForImageTextToText.from_pretrained(
|
|
145
145
|
artifacts_path,
|
|
146
146
|
device_map=self.device,
|
|
147
|
-
|
|
147
|
+
dtype=self.vlm_options.torch_dtype,
|
|
148
148
|
_attn_implementation=(
|
|
149
149
|
"flash_attention_2"
|
|
150
150
|
if self.device.startswith("cuda")
|
|
@@ -208,25 +208,13 @@ class AsrPipeline(BasePipeline):
|
|
|
208
208
|
|
|
209
209
|
self.pipeline_options: AsrPipelineOptions = pipeline_options
|
|
210
210
|
|
|
211
|
-
artifacts_path: Optional[Path] = None
|
|
212
|
-
if pipeline_options.artifacts_path is not None:
|
|
213
|
-
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
|
214
|
-
elif settings.artifacts_path is not None:
|
|
215
|
-
artifacts_path = Path(settings.artifacts_path).expanduser()
|
|
216
|
-
|
|
217
|
-
if artifacts_path is not None and not artifacts_path.is_dir():
|
|
218
|
-
raise RuntimeError(
|
|
219
|
-
f"The value of {artifacts_path=} is not valid. "
|
|
220
|
-
"When defined, it must point to a folder containing all models required by the pipeline."
|
|
221
|
-
)
|
|
222
|
-
|
|
223
211
|
if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
|
|
224
212
|
asr_options: InlineAsrNativeWhisperOptions = (
|
|
225
213
|
self.pipeline_options.asr_options
|
|
226
214
|
)
|
|
227
215
|
self._model = _NativeWhisperModel(
|
|
228
216
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
|
229
|
-
artifacts_path=artifacts_path,
|
|
217
|
+
artifacts_path=self.artifacts_path,
|
|
230
218
|
accelerator_options=pipeline_options.accelerator_options,
|
|
231
219
|
asr_options=asr_options,
|
|
232
220
|
)
|
|
@@ -1,19 +1,33 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from abc import ABC, abstractmethod
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
from typing import Optional
|
|
4
5
|
|
|
5
6
|
from docling.datamodel.base_models import ConversionStatus, ErrorItem
|
|
6
7
|
from docling.datamodel.document import InputDocument
|
|
7
8
|
from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
|
|
8
|
-
from docling.datamodel.pipeline_options import BaseOptions
|
|
9
|
+
from docling.datamodel.pipeline_options import BaseOptions, PipelineOptions
|
|
10
|
+
from docling.datamodel.settings import settings
|
|
9
11
|
|
|
10
12
|
_log = logging.getLogger(__name__)
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
class BaseExtractionPipeline(ABC):
|
|
14
|
-
def __init__(self, pipeline_options:
|
|
16
|
+
def __init__(self, pipeline_options: PipelineOptions):
|
|
15
17
|
self.pipeline_options = pipeline_options
|
|
16
18
|
|
|
19
|
+
self.artifacts_path: Optional[Path] = None
|
|
20
|
+
if pipeline_options.artifacts_path is not None:
|
|
21
|
+
self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
|
22
|
+
elif settings.artifacts_path is not None:
|
|
23
|
+
self.artifacts_path = Path(settings.artifacts_path).expanduser()
|
|
24
|
+
|
|
25
|
+
if self.artifacts_path is not None and not self.artifacts_path.is_dir():
|
|
26
|
+
raise RuntimeError(
|
|
27
|
+
f"The value of {self.artifacts_path=} is not valid. "
|
|
28
|
+
"When defined, it must point to a folder containing all models required by the pipeline."
|
|
29
|
+
)
|
|
30
|
+
|
|
17
31
|
def execute(
|
|
18
32
|
self,
|
|
19
33
|
in_doc: InputDocument,
|
|
@@ -54,5 +68,5 @@ class BaseExtractionPipeline(ABC):
|
|
|
54
68
|
|
|
55
69
|
@classmethod
|
|
56
70
|
@abstractmethod
|
|
57
|
-
def get_default_options(cls) ->
|
|
71
|
+
def get_default_options(cls) -> PipelineOptions:
|
|
58
72
|
pass
|