docling 2.50.0__tar.gz → 2.52.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.50.0 → docling-2.52.0}/PKG-INFO +15 -7
- {docling-2.50.0 → docling-2.52.0}/README.md +12 -4
- {docling-2.50.0 → docling-2.52.0}/docling/backend/docling_parse_v4_backend.py +12 -0
- {docling-2.50.0 → docling-2.52.0}/docling/cli/main.py +29 -0
- {docling-2.50.0 → docling-2.52.0}/docling/datamodel/pipeline_options.py +17 -10
- {docling-2.50.0 → docling-2.52.0}/docling/models/base_model.py +27 -2
- {docling-2.50.0 → docling-2.52.0}/docling/models/easyocr_model.py +19 -9
- {docling-2.50.0 → docling-2.52.0}/docling/models/picture_description_vlm_model.py +1 -1
- {docling-2.50.0 → docling-2.52.0}/docling/models/vlm_models_inline/hf_transformers_model.py +1 -1
- {docling-2.50.0 → docling-2.52.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py +1 -1
- {docling-2.50.0 → docling-2.52.0}/docling/pipeline/asr_pipeline.py +1 -13
- {docling-2.50.0 → docling-2.52.0}/docling/pipeline/base_extraction_pipeline.py +17 -3
- {docling-2.50.0 → docling-2.52.0}/docling/pipeline/base_pipeline.py +75 -9
- {docling-2.50.0 → docling-2.52.0}/docling/pipeline/extraction_vlm_pipeline.py +9 -16
- {docling-2.50.0 → docling-2.52.0}/docling/pipeline/simple_pipeline.py +6 -6
- {docling-2.50.0 → docling-2.52.0}/docling/pipeline/standard_pdf_pipeline.py +6 -55
- {docling-2.50.0 → docling-2.52.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +102 -62
- {docling-2.50.0 → docling-2.52.0}/docling/pipeline/vlm_pipeline.py +3 -15
- {docling-2.50.0 → docling-2.52.0}/docling.egg-info/PKG-INFO +15 -7
- {docling-2.50.0 → docling-2.52.0}/docling.egg-info/requires.txt +2 -2
- {docling-2.50.0 → docling-2.52.0}/pyproject.toml +3 -3
- {docling-2.50.0 → docling-2.52.0}/LICENSE +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/html_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/md_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/mets_gbs_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/chunking/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/cli/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/cli/models.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/cli/tools.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/datamodel/document.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/datamodel/extraction.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/datamodel/settings.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/document_converter.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/document_extractor.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/exceptions.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/layout_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/py.typed +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/utils/__init__.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/utils/export.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/utils/locks.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/utils/orientation.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/utils/profiling.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/utils/utils.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling/utils/visualization.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling.egg-info/SOURCES.txt +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.50.0 → docling-2.52.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.50.0 → docling-2.52.0}/setup.cfg +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_csv.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_html.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_jats.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_markdown.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_mets_gbs.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_msword.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_backend_webp.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_cli.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_code_formula.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_e2e_conversion.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_extraction.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_input_doc.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_interfaces.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_invalid_input.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_ocr_utils.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_options.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_settings_load.py +0 -0
- {docling-2.50.0 → docling-2.52.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.50.0
|
3
|
+
Version: 2.52.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -26,8 +26,8 @@ Requires-Python: <4.0,>=3.9
|
|
26
26
|
Description-Content-Type: text/markdown
|
27
27
|
License-File: LICENSE
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
|
-
Requires-Dist: docling-core[chunking]<3.0.0,>=2.
|
30
|
-
Requires-Dist: docling-parse<5.0.0,>=4.
|
29
|
+
Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.0
|
30
|
+
Requires-Dist: docling-parse<5.0.0,>=4.4.0
|
31
31
|
Requires-Dist: docling-ibm-models<4,>=3.9.1
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
33
33
|
Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
|
@@ -101,22 +101,29 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
101
101
|
|
102
102
|
## Features
|
103
103
|
|
104
|
-
* 🗂️
|
104
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
105
105
|
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
106
106
|
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
107
|
-
* ↪️
|
107
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
108
108
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
109
109
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
110
110
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
111
111
|
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
112
|
-
* 🎙️
|
112
|
+
* 🎙️ Audio support with Automatic Speech Recognition (ASR) models
|
113
|
+
* 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
|
113
114
|
* 💻 Simple and convenient CLI
|
114
115
|
|
116
|
+
### What's new
|
117
|
+
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
|
118
|
+
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
|
119
|
+
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
|
120
|
+
|
115
121
|
### Coming soon
|
116
122
|
|
117
123
|
* 📝 Metadata extraction, including title, authors, references & language
|
118
124
|
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
119
125
|
* 📝 Complex chemistry understanding (Molecular structures)
|
126
|
+
* 📝 Parsing of Web Video Text Tracks (WebVTT) files
|
120
127
|
|
121
128
|
## Installation
|
122
129
|
|
@@ -142,7 +149,7 @@ result = converter.convert(source)
|
|
142
149
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
143
150
|
```
|
144
151
|
|
145
|
-
More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
|
152
|
+
More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
|
146
153
|
the docs.
|
147
154
|
|
148
155
|
## CLI
|
@@ -222,3 +229,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
|
|
222
229
|
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
223
230
|
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
224
231
|
[integrations]: https://docling-project.github.io/docling/integrations/
|
232
|
+
[extraction]: https://docling-project.github.io/docling/examples/extraction/
|
@@ -29,22 +29,29 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
29
29
|
|
30
30
|
## Features
|
31
31
|
|
32
|
-
* 🗂️
|
32
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
33
33
|
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
34
34
|
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
35
|
-
* ↪️
|
35
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
36
36
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
37
37
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
38
38
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
39
39
|
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
40
|
-
* 🎙️
|
40
|
+
* 🎙️ Audio support with Automatic Speech Recognition (ASR) models
|
41
|
+
* 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
|
41
42
|
* 💻 Simple and convenient CLI
|
42
43
|
|
44
|
+
### What's new
|
45
|
+
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
|
46
|
+
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
|
47
|
+
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
|
48
|
+
|
43
49
|
### Coming soon
|
44
50
|
|
45
51
|
* 📝 Metadata extraction, including title, authors, references & language
|
46
52
|
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
47
53
|
* 📝 Complex chemistry understanding (Molecular structures)
|
54
|
+
* 📝 Parsing of Web Video Text Tracks (WebVTT) files
|
48
55
|
|
49
56
|
## Installation
|
50
57
|
|
@@ -70,7 +77,7 @@ result = converter.convert(source)
|
|
70
77
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
71
78
|
```
|
72
79
|
|
73
|
-
More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
|
80
|
+
More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
|
74
81
|
the docs.
|
75
82
|
|
76
83
|
## CLI
|
@@ -150,3 +157,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
|
|
150
157
|
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
151
158
|
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
152
159
|
[integrations]: https://docling-project.github.io/docling/integrations/
|
160
|
+
[extraction]: https://docling-project.github.io/docling/examples/extraction/
|
@@ -30,13 +30,21 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
30
30
|
page_no: int,
|
31
31
|
create_words: bool = True,
|
32
32
|
create_textlines: bool = True,
|
33
|
+
keep_chars: bool = False,
|
34
|
+
keep_lines: bool = False,
|
35
|
+
keep_images: bool = True,
|
33
36
|
):
|
34
37
|
self._ppage = page_obj
|
35
38
|
self._dp_doc = dp_doc
|
36
39
|
self._page_no = page_no
|
40
|
+
|
37
41
|
self._create_words = create_words
|
38
42
|
self._create_textlines = create_textlines
|
39
43
|
|
44
|
+
self._keep_chars = keep_chars
|
45
|
+
self._keep_lines = keep_lines
|
46
|
+
self._keep_images = keep_images
|
47
|
+
|
40
48
|
self._dpage: Optional[SegmentedPdfPage] = None
|
41
49
|
self._unloaded = False
|
42
50
|
self.valid = (self._ppage is not None) and (self._dp_doc is not None)
|
@@ -47,8 +55,12 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
47
55
|
|
48
56
|
seg_page = self._dp_doc.get_page(
|
49
57
|
self._page_no + 1,
|
58
|
+
keep_chars=self._keep_chars,
|
59
|
+
keep_lines=self._keep_lines,
|
60
|
+
keep_bitmaps=self._keep_images,
|
50
61
|
create_words=self._create_words,
|
51
62
|
create_textlines=self._create_textlines,
|
63
|
+
enforce_same_font=True,
|
52
64
|
)
|
53
65
|
|
54
66
|
# In Docling, all TextCell instances are expected with top-left origin.
|
@@ -48,6 +48,7 @@ from docling.datamodel.base_models import (
|
|
48
48
|
from docling.datamodel.document import ConversionResult
|
49
49
|
from docling.datamodel.pipeline_options import (
|
50
50
|
AsrPipelineOptions,
|
51
|
+
ConvertPipelineOptions,
|
51
52
|
EasyOcrOptions,
|
52
53
|
OcrOptions,
|
53
54
|
PaginatedPipelineOptions,
|
@@ -71,8 +72,13 @@ from docling.datamodel.vlm_model_specs import (
|
|
71
72
|
from docling.document_converter import (
|
72
73
|
AudioFormatOption,
|
73
74
|
DocumentConverter,
|
75
|
+
ExcelFormatOption,
|
74
76
|
FormatOption,
|
77
|
+
HTMLFormatOption,
|
78
|
+
MarkdownFormatOption,
|
75
79
|
PdfFormatOption,
|
80
|
+
PowerpointFormatOption,
|
81
|
+
WordFormatOption,
|
76
82
|
)
|
77
83
|
from docling.models.factories import get_ocr_factory
|
78
84
|
from docling.pipeline.asr_pipeline import AsrPipeline
|
@@ -626,10 +632,33 @@ def convert( # noqa: C901
|
|
626
632
|
backend=MetsGbsDocumentBackend,
|
627
633
|
)
|
628
634
|
|
635
|
+
# SimplePipeline options
|
636
|
+
simple_format_option = ConvertPipelineOptions(
|
637
|
+
do_picture_description=enrich_picture_description,
|
638
|
+
do_picture_classification=enrich_picture_classes,
|
639
|
+
)
|
640
|
+
if artifacts_path is not None:
|
641
|
+
simple_format_option.artifacts_path = artifacts_path
|
642
|
+
|
629
643
|
format_options = {
|
630
644
|
InputFormat.PDF: pdf_format_option,
|
631
645
|
InputFormat.IMAGE: pdf_format_option,
|
632
646
|
InputFormat.METS_GBS: mets_gbs_format_option,
|
647
|
+
InputFormat.DOCX: WordFormatOption(
|
648
|
+
pipeline_options=simple_format_option
|
649
|
+
),
|
650
|
+
InputFormat.PPTX: PowerpointFormatOption(
|
651
|
+
pipeline_options=simple_format_option
|
652
|
+
),
|
653
|
+
InputFormat.XLSX: ExcelFormatOption(
|
654
|
+
pipeline_options=simple_format_option
|
655
|
+
),
|
656
|
+
InputFormat.HTML: HTMLFormatOption(
|
657
|
+
pipeline_options=simple_format_option
|
658
|
+
),
|
659
|
+
InputFormat.MD: MarkdownFormatOption(
|
660
|
+
pipeline_options=simple_format_option
|
661
|
+
),
|
633
662
|
}
|
634
663
|
|
635
664
|
elif pipeline == ProcessingPipeline.VLM:
|
@@ -135,6 +135,8 @@ class EasyOcrOptions(OcrOptions):
|
|
135
135
|
recog_network: Optional[str] = "standard"
|
136
136
|
download_enabled: bool = True
|
137
137
|
|
138
|
+
suppress_mps_warnings: bool = True
|
139
|
+
|
138
140
|
model_config = ConfigDict(
|
139
141
|
extra="forbid",
|
140
142
|
protected_namespaces=(),
|
@@ -237,7 +239,9 @@ class PdfBackend(str, Enum):
|
|
237
239
|
|
238
240
|
|
239
241
|
# Define an enum for the ocr engines
|
240
|
-
@deprecated(
|
242
|
+
@deprecated(
|
243
|
+
"Use get_ocr_factory().registered_kind to get a list of registered OCR engines."
|
244
|
+
)
|
241
245
|
class OcrEngine(str, Enum):
|
242
246
|
"""Enum of valid OCR engines."""
|
243
247
|
|
@@ -255,11 +259,21 @@ class PipelineOptions(BaseOptions):
|
|
255
259
|
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
256
260
|
enable_remote_services: bool = False
|
257
261
|
allow_external_plugins: bool = False
|
262
|
+
artifacts_path: Optional[Union[Path, str]] = None
|
258
263
|
|
259
264
|
|
260
|
-
class
|
261
|
-
|
265
|
+
class ConvertPipelineOptions(PipelineOptions):
|
266
|
+
"""Base convert pipeline options."""
|
267
|
+
|
268
|
+
do_picture_classification: bool = False # True: classify pictures in documents
|
262
269
|
|
270
|
+
do_picture_description: bool = False # True: run describe pictures in documents
|
271
|
+
picture_description_options: PictureDescriptionBaseOptions = (
|
272
|
+
smolvlm_picture_description
|
273
|
+
)
|
274
|
+
|
275
|
+
|
276
|
+
class PaginatedPipelineOptions(ConvertPipelineOptions):
|
263
277
|
images_scale: float = 1.0
|
264
278
|
generate_page_images: bool = False
|
265
279
|
generate_picture_images: bool = False
|
@@ -291,13 +305,11 @@ class LayoutOptions(BaseModel):
|
|
291
305
|
|
292
306
|
class AsrPipelineOptions(PipelineOptions):
|
293
307
|
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
|
294
|
-
artifacts_path: Optional[Union[Path, str]] = None
|
295
308
|
|
296
309
|
|
297
310
|
class VlmExtractionPipelineOptions(PipelineOptions):
|
298
311
|
"""Options for extraction pipeline."""
|
299
312
|
|
300
|
-
artifacts_path: Optional[Union[Path, str]] = None
|
301
313
|
vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
|
302
314
|
|
303
315
|
|
@@ -308,8 +320,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
308
320
|
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
309
321
|
do_code_enrichment: bool = False # True: perform code OCR
|
310
322
|
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
|
311
|
-
do_picture_classification: bool = False # True: classify pictures in documents
|
312
|
-
do_picture_description: bool = False # True: run describe pictures in documents
|
313
323
|
force_backend_text: bool = (
|
314
324
|
False # (To be used with vlms, or other generative models)
|
315
325
|
)
|
@@ -317,9 +327,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
317
327
|
|
318
328
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
319
329
|
ocr_options: OcrOptions = EasyOcrOptions()
|
320
|
-
picture_description_options: PictureDescriptionBaseOptions = (
|
321
|
-
smolvlm_picture_description
|
322
|
-
)
|
323
330
|
layout_options: LayoutOptions = LayoutOptions()
|
324
331
|
|
325
332
|
images_scale: float = 1.0
|
@@ -4,7 +4,13 @@ from collections.abc import Iterable
|
|
4
4
|
from typing import Any, Generic, Optional, Protocol, Type, Union
|
5
5
|
|
6
6
|
import numpy as np
|
7
|
-
from docling_core.types.doc import
|
7
|
+
from docling_core.types.doc import (
|
8
|
+
BoundingBox,
|
9
|
+
DocItem,
|
10
|
+
DoclingDocument,
|
11
|
+
NodeItem,
|
12
|
+
PictureItem,
|
13
|
+
)
|
8
14
|
from PIL.Image import Image
|
9
15
|
from typing_extensions import TypeVar
|
10
16
|
|
@@ -164,8 +170,17 @@ class BaseItemAndImageEnrichmentModel(
|
|
164
170
|
return None
|
165
171
|
|
166
172
|
assert isinstance(element, DocItem)
|
167
|
-
element_prov = element.prov[0]
|
168
173
|
|
174
|
+
# Allow the case of documents without page images but embedded images (e.g. Word and HTML docs)
|
175
|
+
if len(element.prov) == 0 and isinstance(element, PictureItem):
|
176
|
+
embedded_im = element.get_image(conv_res.document)
|
177
|
+
if embedded_im is not None:
|
178
|
+
return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
|
179
|
+
else:
|
180
|
+
return None
|
181
|
+
|
182
|
+
# Crop the image from the page
|
183
|
+
element_prov = element.prov[0]
|
169
184
|
bbox = element_prov.bbox
|
170
185
|
width = bbox.r - bbox.l
|
171
186
|
height = bbox.t - bbox.b
|
@@ -183,4 +198,14 @@ class BaseItemAndImageEnrichmentModel(
|
|
183
198
|
cropped_image = conv_res.pages[page_ix].get_image(
|
184
199
|
scale=self.images_scale, cropbox=expanded_bbox
|
185
200
|
)
|
201
|
+
|
202
|
+
# Allow for images being embedded without the page backend or page images
|
203
|
+
if cropped_image is None and isinstance(element, PictureItem):
|
204
|
+
embedded_im = element.get_image(conv_res.document)
|
205
|
+
if embedded_im is not None:
|
206
|
+
return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
|
207
|
+
else:
|
208
|
+
return None
|
209
|
+
|
210
|
+
# Return the proper cropped image
|
186
211
|
return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
|
@@ -78,14 +78,17 @@ class EasyOcrModel(BaseOcrModel):
|
|
78
78
|
download_enabled = False
|
79
79
|
model_storage_directory = str(artifacts_path / self._model_repo_folder)
|
80
80
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
81
|
+
with warnings.catch_warnings():
|
82
|
+
if self.options.suppress_mps_warnings:
|
83
|
+
warnings.filterwarnings("ignore", message=".*pin_memory.*MPS.*")
|
84
|
+
self.reader = easyocr.Reader(
|
85
|
+
lang_list=self.options.lang,
|
86
|
+
gpu=use_gpu,
|
87
|
+
model_storage_directory=model_storage_directory,
|
88
|
+
recog_network=self.options.recog_network,
|
89
|
+
download_enabled=download_enabled,
|
90
|
+
verbose=False,
|
91
|
+
)
|
89
92
|
|
90
93
|
@staticmethod
|
91
94
|
def download_models(
|
@@ -147,7 +150,14 @@ class EasyOcrModel(BaseOcrModel):
|
|
147
150
|
scale=self.scale, cropbox=ocr_rect
|
148
151
|
)
|
149
152
|
im = numpy.array(high_res_image)
|
150
|
-
|
153
|
+
|
154
|
+
with warnings.catch_warnings():
|
155
|
+
if self.options.suppress_mps_warnings:
|
156
|
+
warnings.filterwarnings(
|
157
|
+
"ignore", message=".*pin_memory.*MPS.*"
|
158
|
+
)
|
159
|
+
|
160
|
+
result = self.reader.readtext(im)
|
151
161
|
|
152
162
|
del high_res_image
|
153
163
|
del im
|
@@ -67,7 +67,7 @@ class PictureDescriptionVlmModel(
|
|
67
67
|
self.model = AutoModelForImageTextToText.from_pretrained(
|
68
68
|
artifacts_path,
|
69
69
|
device_map=self.device,
|
70
|
-
|
70
|
+
dtype=torch.bfloat16,
|
71
71
|
_attn_implementation=(
|
72
72
|
"flash_attention_2"
|
73
73
|
if self.device.startswith("cuda")
|
@@ -112,7 +112,7 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
|
|
112
112
|
self.vlm_model = model_cls.from_pretrained(
|
113
113
|
artifacts_path,
|
114
114
|
device_map=self.device,
|
115
|
-
|
115
|
+
dtype=self.vlm_options.torch_dtype,
|
116
116
|
_attn_implementation=(
|
117
117
|
"flash_attention_2"
|
118
118
|
if self.device.startswith("cuda")
|
{docling-2.50.0 → docling-2.52.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py
RENAMED
@@ -144,7 +144,7 @@ class NuExtractTransformersModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
|
|
144
144
|
self.vlm_model = AutoModelForImageTextToText.from_pretrained(
|
145
145
|
artifacts_path,
|
146
146
|
device_map=self.device,
|
147
|
-
|
147
|
+
dtype=self.vlm_options.torch_dtype,
|
148
148
|
_attn_implementation=(
|
149
149
|
"flash_attention_2"
|
150
150
|
if self.device.startswith("cuda")
|
@@ -208,25 +208,13 @@ class AsrPipeline(BasePipeline):
|
|
208
208
|
|
209
209
|
self.pipeline_options: AsrPipelineOptions = pipeline_options
|
210
210
|
|
211
|
-
artifacts_path: Optional[Path] = None
|
212
|
-
if pipeline_options.artifacts_path is not None:
|
213
|
-
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
214
|
-
elif settings.artifacts_path is not None:
|
215
|
-
artifacts_path = Path(settings.artifacts_path).expanduser()
|
216
|
-
|
217
|
-
if artifacts_path is not None and not artifacts_path.is_dir():
|
218
|
-
raise RuntimeError(
|
219
|
-
f"The value of {artifacts_path=} is not valid. "
|
220
|
-
"When defined, it must point to a folder containing all models required by the pipeline."
|
221
|
-
)
|
222
|
-
|
223
211
|
if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
|
224
212
|
asr_options: InlineAsrNativeWhisperOptions = (
|
225
213
|
self.pipeline_options.asr_options
|
226
214
|
)
|
227
215
|
self._model = _NativeWhisperModel(
|
228
216
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
229
|
-
artifacts_path=artifacts_path,
|
217
|
+
artifacts_path=self.artifacts_path,
|
230
218
|
accelerator_options=pipeline_options.accelerator_options,
|
231
219
|
asr_options=asr_options,
|
232
220
|
)
|
@@ -1,19 +1,33 @@
|
|
1
1
|
import logging
|
2
2
|
from abc import ABC, abstractmethod
|
3
|
+
from pathlib import Path
|
3
4
|
from typing import Optional
|
4
5
|
|
5
6
|
from docling.datamodel.base_models import ConversionStatus, ErrorItem
|
6
7
|
from docling.datamodel.document import InputDocument
|
7
8
|
from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
|
8
|
-
from docling.datamodel.pipeline_options import BaseOptions
|
9
|
+
from docling.datamodel.pipeline_options import BaseOptions, PipelineOptions
|
10
|
+
from docling.datamodel.settings import settings
|
9
11
|
|
10
12
|
_log = logging.getLogger(__name__)
|
11
13
|
|
12
14
|
|
13
15
|
class BaseExtractionPipeline(ABC):
|
14
|
-
def __init__(self, pipeline_options:
|
16
|
+
def __init__(self, pipeline_options: PipelineOptions):
|
15
17
|
self.pipeline_options = pipeline_options
|
16
18
|
|
19
|
+
self.artifacts_path: Optional[Path] = None
|
20
|
+
if pipeline_options.artifacts_path is not None:
|
21
|
+
self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
22
|
+
elif settings.artifacts_path is not None:
|
23
|
+
self.artifacts_path = Path(settings.artifacts_path).expanduser()
|
24
|
+
|
25
|
+
if self.artifacts_path is not None and not self.artifacts_path.is_dir():
|
26
|
+
raise RuntimeError(
|
27
|
+
f"The value of {self.artifacts_path=} is not valid. "
|
28
|
+
"When defined, it must point to a folder containing all models required by the pipeline."
|
29
|
+
)
|
30
|
+
|
17
31
|
def execute(
|
18
32
|
self,
|
19
33
|
in_doc: InputDocument,
|
@@ -54,5 +68,5 @@ class BaseExtractionPipeline(ABC):
|
|
54
68
|
|
55
69
|
@classmethod
|
56
70
|
@abstractmethod
|
57
|
-
def get_default_options(cls) -> BaseOptions:
|
71
|
+
def get_default_options(cls) -> PipelineOptions:
|
58
72
|
pass
|
@@ -4,7 +4,8 @@ import time
|
|
4
4
|
import traceback
|
5
5
|
from abc import ABC, abstractmethod
|
6
6
|
from collections.abc import Iterable
|
7
|
-
from typing import Any, Callable, List
|
7
|
+
from pathlib import Path
|
8
|
+
from typing import Any, Callable, List, Optional
|
8
9
|
|
9
10
|
from docling_core.types.doc import NodeItem
|
10
11
|
|
@@ -20,9 +21,19 @@ from docling.datamodel.base_models import (
|
|
20
21
|
Page,
|
21
22
|
)
|
22
23
|
from docling.datamodel.document import ConversionResult, InputDocument
|
23
|
-
from docling.datamodel.pipeline_options import
|
24
|
+
from docling.datamodel.pipeline_options import (
|
25
|
+
ConvertPipelineOptions,
|
26
|
+
PdfPipelineOptions,
|
27
|
+
PipelineOptions,
|
28
|
+
)
|
24
29
|
from docling.datamodel.settings import settings
|
25
30
|
from docling.models.base_model import GenericEnrichmentModel
|
31
|
+
from docling.models.document_picture_classifier import (
|
32
|
+
DocumentPictureClassifier,
|
33
|
+
DocumentPictureClassifierOptions,
|
34
|
+
)
|
35
|
+
from docling.models.factories import get_picture_description_factory
|
36
|
+
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
26
37
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
27
38
|
from docling.utils.utils import chunkify
|
28
39
|
|
@@ -36,6 +47,18 @@ class BasePipeline(ABC):
|
|
36
47
|
self.build_pipe: List[Callable] = []
|
37
48
|
self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
|
38
49
|
|
50
|
+
self.artifacts_path: Optional[Path] = None
|
51
|
+
if pipeline_options.artifacts_path is not None:
|
52
|
+
self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
53
|
+
elif settings.artifacts_path is not None:
|
54
|
+
self.artifacts_path = Path(settings.artifacts_path).expanduser()
|
55
|
+
|
56
|
+
if self.artifacts_path is not None and not self.artifacts_path.is_dir():
|
57
|
+
raise RuntimeError(
|
58
|
+
f"The value of {self.artifacts_path=} is not valid. "
|
59
|
+
"When defined, it must point to a folder containing all models required by the pipeline."
|
60
|
+
)
|
61
|
+
|
39
62
|
def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
|
40
63
|
conv_res = ConversionResult(input=in_doc)
|
41
64
|
|
@@ -108,15 +131,58 @@ class BasePipeline(ABC):
|
|
108
131
|
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
109
132
|
pass
|
110
133
|
|
111
|
-
# def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
|
112
|
-
# for model in self.build_pipe:
|
113
|
-
# element_batch = model(element_batch)
|
114
|
-
#
|
115
|
-
# yield from element_batch
|
116
134
|
|
135
|
+
class ConvertPipeline(BasePipeline):
|
136
|
+
def __init__(self, pipeline_options: ConvertPipelineOptions):
|
137
|
+
super().__init__(pipeline_options)
|
138
|
+
self.pipeline_options: ConvertPipelineOptions
|
117
139
|
|
118
|
-
|
119
|
-
|
140
|
+
# ------ Common enrichment models working on all backends
|
141
|
+
|
142
|
+
# Picture description model
|
143
|
+
if (
|
144
|
+
picture_description_model := self._get_picture_description_model(
|
145
|
+
artifacts_path=self.artifacts_path
|
146
|
+
)
|
147
|
+
) is None:
|
148
|
+
raise RuntimeError(
|
149
|
+
f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
|
150
|
+
)
|
151
|
+
|
152
|
+
self.enrichment_pipe = [
|
153
|
+
# Document Picture Classifier
|
154
|
+
DocumentPictureClassifier(
|
155
|
+
enabled=pipeline_options.do_picture_classification,
|
156
|
+
artifacts_path=self.artifacts_path,
|
157
|
+
options=DocumentPictureClassifierOptions(),
|
158
|
+
accelerator_options=pipeline_options.accelerator_options,
|
159
|
+
),
|
160
|
+
# Document Picture description
|
161
|
+
picture_description_model,
|
162
|
+
]
|
163
|
+
|
164
|
+
def _get_picture_description_model(
|
165
|
+
self, artifacts_path: Optional[Path] = None
|
166
|
+
) -> Optional[PictureDescriptionBaseModel]:
|
167
|
+
factory = get_picture_description_factory(
|
168
|
+
allow_external_plugins=self.pipeline_options.allow_external_plugins
|
169
|
+
)
|
170
|
+
return factory.create_instance(
|
171
|
+
options=self.pipeline_options.picture_description_options,
|
172
|
+
enabled=self.pipeline_options.do_picture_description,
|
173
|
+
enable_remote_services=self.pipeline_options.enable_remote_services,
|
174
|
+
artifacts_path=artifacts_path,
|
175
|
+
accelerator_options=self.pipeline_options.accelerator_options,
|
176
|
+
)
|
177
|
+
|
178
|
+
@classmethod
|
179
|
+
@abstractmethod
|
180
|
+
def get_default_options(cls) -> ConvertPipelineOptions:
|
181
|
+
pass
|
182
|
+
|
183
|
+
|
184
|
+
class PaginatedPipeline(ConvertPipeline): # TODO this is a bad name.
|
185
|
+
def __init__(self, pipeline_options: ConvertPipelineOptions):
|
120
186
|
super().__init__(pipeline_options)
|
121
187
|
self.keep_backend = False
|
122
188
|
|
@@ -1,7 +1,6 @@
|
|
1
1
|
import inspect
|
2
2
|
import json
|
3
3
|
import logging
|
4
|
-
from pathlib import Path
|
5
4
|
from typing import Optional
|
6
5
|
|
7
6
|
from PIL.Image import Image
|
@@ -16,7 +15,10 @@ from docling.datamodel.extraction import (
|
|
16
15
|
ExtractionResult,
|
17
16
|
ExtractionTemplateType,
|
18
17
|
)
|
19
|
-
from docling.datamodel.pipeline_options import BaseOptions, VlmExtractionPipelineOptions
|
18
|
+
from docling.datamodel.pipeline_options import (
|
19
|
+
PipelineOptions,
|
20
|
+
VlmExtractionPipelineOptions,
|
21
|
+
)
|
20
22
|
from docling.datamodel.settings import settings
|
21
23
|
from docling.models.vlm_models_inline.nuextract_transformers_model import (
|
22
24
|
NuExtractTransformersModel,
|
@@ -35,22 +37,10 @@ class ExtractionVlmPipeline(BaseExtractionPipeline):
|
|
35
37
|
self.accelerator_options = pipeline_options.accelerator_options
|
36
38
|
self.pipeline_options: VlmExtractionPipelineOptions
|
37
39
|
|
38
|
-
artifacts_path: Optional[Path] = None
|
39
|
-
if pipeline_options.artifacts_path is not None:
|
40
|
-
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
41
|
-
elif settings.artifacts_path is not None:
|
42
|
-
artifacts_path = Path(settings.artifacts_path).expanduser()
|
43
|
-
|
44
|
-
if artifacts_path is not None and not artifacts_path.is_dir():
|
45
|
-
raise RuntimeError(
|
46
|
-
f"The value of {artifacts_path=} is not valid. "
|
47
|
-
"When defined, it must point to a folder containing all models required by the pipeline."
|
48
|
-
)
|
49
|
-
|
50
40
|
# Create VLM model instance
|
51
41
|
self.vlm_model = NuExtractTransformersModel(
|
52
42
|
enabled=True,
|
53
|
-
artifacts_path=artifacts_path, # Will download automatically
|
43
|
+
artifacts_path=self.artifacts_path, # Will download automatically
|
54
44
|
accelerator_options=self.accelerator_options,
|
55
45
|
vlm_options=pipeline_options.vlm_options,
|
56
46
|
)
|
@@ -194,11 +184,14 @@ class ExtractionVlmPipeline(BaseExtractionPipeline):
|
|
194
184
|
class ExtractionTemplateFactory(ModelFactory[template]): # type: ignore
|
195
185
|
__use_examples__ = True # prefer Field(examples=...) when present
|
196
186
|
__use_defaults__ = True # use field defaults instead of random values
|
187
|
+
__check_model__ = (
|
188
|
+
True # setting the value to avoid deprecation warnings
|
189
|
+
)
|
197
190
|
|
198
191
|
return ExtractionTemplateFactory.build().model_dump_json(indent=2) # type: ignore
|
199
192
|
else:
|
200
193
|
raise ValueError(f"Unsupported template type: {type(template)}")
|
201
194
|
|
202
195
|
@classmethod
|
203
|
-
def get_default_options(cls) -> BaseOptions:
|
196
|
+
def get_default_options(cls) -> PipelineOptions:
|
204
197
|
return VlmExtractionPipelineOptions()
|