docling 2.34.0__tar.gz → 2.36.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.34.0 → docling-2.36.0}/PKG-INFO +54 -55
- {docling-2.34.0 → docling-2.36.0}/README.md +3 -4
- {docling-2.34.0 → docling-2.36.0}/docling/cli/main.py +48 -18
- docling-2.36.0/docling/datamodel/accelerator_options.py +68 -0
- {docling-2.34.0 → docling-2.36.0}/docling/datamodel/base_models.py +10 -8
- {docling-2.34.0 → docling-2.36.0}/docling/datamodel/document.py +7 -2
- {docling-2.34.0 → docling-2.36.0}/docling/datamodel/pipeline_options.py +29 -161
- docling-2.36.0/docling/datamodel/pipeline_options_vlm_model.py +81 -0
- docling-2.36.0/docling/datamodel/vlm_model_specs.py +144 -0
- {docling-2.34.0 → docling-2.36.0}/docling/document_converter.py +5 -0
- {docling-2.34.0 → docling-2.36.0}/docling/models/api_vlm_model.py +1 -1
- {docling-2.34.0 → docling-2.36.0}/docling/models/base_ocr_model.py +2 -1
- {docling-2.34.0 → docling-2.36.0}/docling/models/code_formula_model.py +6 -11
- {docling-2.34.0 → docling-2.36.0}/docling/models/document_picture_classifier.py +6 -11
- {docling-2.34.0 → docling-2.36.0}/docling/models/easyocr_model.py +1 -2
- {docling-2.34.0 → docling-2.36.0}/docling/models/layout_model.py +22 -17
- {docling-2.34.0 → docling-2.36.0}/docling/models/ocr_mac_model.py +1 -1
- {docling-2.34.0 → docling-2.36.0}/docling/models/page_preprocessing_model.py +11 -6
- {docling-2.34.0 → docling-2.36.0}/docling/models/picture_description_api_model.py +1 -1
- {docling-2.34.0 → docling-2.36.0}/docling/models/picture_description_base_model.py +1 -1
- {docling-2.34.0 → docling-2.36.0}/docling/models/picture_description_vlm_model.py +7 -22
- {docling-2.34.0 → docling-2.36.0}/docling/models/rapid_ocr_model.py +1 -2
- {docling-2.34.0 → docling-2.36.0}/docling/models/table_structure_model.py +6 -12
- {docling-2.34.0 → docling-2.36.0}/docling/models/tesseract_ocr_cli_model.py +1 -1
- {docling-2.34.0 → docling-2.36.0}/docling/models/tesseract_ocr_model.py +1 -1
- docling-2.36.0/docling/models/utils/hf_model_download.py +40 -0
- docling-2.36.0/docling/models/vlm_models_inline/hf_transformers_model.py +194 -0
- docling-2.34.0/docling/models/hf_mlx_model.py → docling-2.36.0/docling/models/vlm_models_inline/mlx_model.py +56 -44
- docling-2.36.0/docling/pipeline/__init__.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/pipeline/standard_pdf_pipeline.py +69 -57
- docling-2.36.0/docling/pipeline/vlm_pipeline.py +386 -0
- docling-2.36.0/docling/py.typed +1 -0
- docling-2.36.0/docling/utils/__init__.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/utils/accelerator_utils.py +17 -2
- {docling-2.34.0 → docling-2.36.0}/docling/utils/model_downloader.py +13 -12
- docling-2.36.0/docling.egg-info/PKG-INFO +216 -0
- docling-2.36.0/docling.egg-info/SOURCES.txt +124 -0
- docling-2.36.0/docling.egg-info/entry_points.txt +6 -0
- docling-2.36.0/docling.egg-info/requires.txt +47 -0
- docling-2.36.0/docling.egg-info/top_level.txt +1 -0
- docling-2.36.0/pyproject.toml +266 -0
- docling-2.36.0/setup.cfg +4 -0
- docling-2.36.0/tests/test_backend_asciidoc.py +50 -0
- docling-2.36.0/tests/test_backend_csv.py +87 -0
- docling-2.36.0/tests/test_backend_docling_json.py +58 -0
- docling-2.36.0/tests/test_backend_docling_parse.py +77 -0
- docling-2.36.0/tests/test_backend_docling_parse_v2.py +76 -0
- docling-2.36.0/tests/test_backend_docling_parse_v4.py +76 -0
- docling-2.36.0/tests/test_backend_html.py +149 -0
- docling-2.36.0/tests/test_backend_jats.py +62 -0
- docling-2.36.0/tests/test_backend_markdown.py +41 -0
- docling-2.36.0/tests/test_backend_msexcel.py +99 -0
- docling-2.36.0/tests/test_backend_msword.py +133 -0
- docling-2.36.0/tests/test_backend_patent_uspto.py +458 -0
- docling-2.36.0/tests/test_backend_pdfium.py +90 -0
- docling-2.36.0/tests/test_backend_pptx.py +55 -0
- docling-2.36.0/tests/test_backend_webp.py +82 -0
- docling-2.36.0/tests/test_cli.py +27 -0
- docling-2.36.0/tests/test_code_formula.py +62 -0
- docling-2.36.0/tests/test_data_gen_flag.py +9 -0
- docling-2.36.0/tests/test_document_picture_classifier.py +78 -0
- docling-2.36.0/tests/test_e2e_conversion.py +60 -0
- docling-2.36.0/tests/test_e2e_ocr_conversion.py +104 -0
- docling-2.36.0/tests/test_input_doc.py +245 -0
- docling-2.36.0/tests/test_interfaces.py +67 -0
- docling-2.36.0/tests/test_invalid_input.py +44 -0
- docling-2.36.0/tests/test_legacy_format_transform.py +52 -0
- docling-2.36.0/tests/test_options.py +172 -0
- docling-2.36.0/tests/test_settings_load.py +29 -0
- docling-2.34.0/docling/models/hf_vlm_model.py +0 -182
- docling-2.34.0/docling/pipeline/vlm_pipeline.py +0 -219
- docling-2.34.0/pyproject.toml +0 -285
- {docling-2.34.0 → docling-2.36.0}/LICENSE +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/__init__.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/__init__.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/html_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/md_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/chunking/__init__.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/cli/__init__.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/cli/models.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/cli/tools.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/datamodel/settings.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/exceptions.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/models/__init__.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/models/base_model.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.34.0/docling/pipeline → docling-2.36.0/docling/models/utils}/__init__.py +0 -0
- {docling-2.34.0/docling/utils → docling-2.36.0/docling/models/vlm_models_inline}/__init__.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/utils/export.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/utils/locks.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/utils/orientation.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/utils/profiling.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/utils/utils.py +0 -0
- {docling-2.34.0 → docling-2.36.0}/docling/utils/visualization.py +0 -0
- /docling-2.34.0/docling/py.typed → /docling-2.36.0/docling.egg-info/dependency_links.txt +0 -0
@@ -1,67 +1,68 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.36.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
|
-
|
6
|
-
License: MIT
|
5
|
+
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
|
+
License-Expression: MIT
|
7
|
+
Project-URL: homepage, https://github.com/docling-project/docling
|
8
|
+
Project-URL: repository, https://github.com/docling-project/docling
|
9
|
+
Project-URL: issues, https://github.com/docling-project/docling/issues
|
10
|
+
Project-URL: changelog, https://github.com/docling-project/docling/blob/main/CHANGELOG.md
|
7
11
|
Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
|
8
|
-
|
9
|
-
|
10
|
-
|
12
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
13
|
+
Classifier: Operating System :: POSIX :: Linux
|
14
|
+
Classifier: Operating System :: Microsoft :: Windows
|
11
15
|
Classifier: Development Status :: 5 - Production/Stable
|
12
16
|
Classifier: Intended Audience :: Developers
|
13
17
|
Classifier: Intended Audience :: Science/Research
|
14
|
-
Classifier:
|
15
|
-
Classifier: Operating System :: MacOS :: MacOS X
|
16
|
-
Classifier: Operating System :: POSIX :: Linux
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
17
19
|
Classifier: Programming Language :: Python :: 3
|
18
20
|
Classifier: Programming Language :: Python :: 3.9
|
19
21
|
Classifier: Programming Language :: Python :: 3.10
|
20
22
|
Classifier: Programming Language :: Python :: 3.11
|
21
23
|
Classifier: Programming Language :: Python :: 3.12
|
22
24
|
Classifier: Programming Language :: Python :: 3.13
|
23
|
-
|
24
|
-
|
25
|
-
|
25
|
+
Requires-Python: <4.0,>=3.9
|
26
|
+
Description-Content-Type: text/markdown
|
27
|
+
License-File: LICENSE
|
28
|
+
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
|
+
Requires-Dist: docling-core[chunking]<3.0.0,>=2.29.0
|
30
|
+
Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
|
31
|
+
Requires-Dist: docling-parse<5.0.0,>=4.0.0
|
32
|
+
Requires-Dist: filetype<2.0.0,>=1.2.0
|
33
|
+
Requires-Dist: pypdfium2<5.0.0,>=4.30.0
|
34
|
+
Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
|
35
|
+
Requires-Dist: huggingface_hub<1,>=0.23
|
36
|
+
Requires-Dist: requests<3.0.0,>=2.32.2
|
37
|
+
Requires-Dist: easyocr<2.0,>=1.7
|
38
|
+
Requires-Dist: certifi>=2024.7.4
|
39
|
+
Requires-Dist: rtree<2.0.0,>=1.3.0
|
40
|
+
Requires-Dist: typer<0.16.0,>=0.12.5
|
41
|
+
Requires-Dist: python-docx<2.0.0,>=1.1.2
|
42
|
+
Requires-Dist: python-pptx<2.0.0,>=1.0.2
|
43
|
+
Requires-Dist: beautifulsoup4<5.0.0,>=4.12.3
|
44
|
+
Requires-Dist: pandas<3.0.0,>=2.1.4
|
45
|
+
Requires-Dist: marko<3.0.0,>=2.1.2
|
46
|
+
Requires-Dist: openpyxl<4.0.0,>=3.1.5
|
47
|
+
Requires-Dist: lxml<6.0.0,>=4.0.0
|
48
|
+
Requires-Dist: pillow<12.0.0,>=10.0.0
|
49
|
+
Requires-Dist: tqdm<5.0.0,>=4.65.0
|
50
|
+
Requires-Dist: pluggy<2.0.0,>=1.0.0
|
51
|
+
Requires-Dist: pylatexenc<3.0,>=2.10
|
52
|
+
Requires-Dist: click<8.2.0
|
53
|
+
Requires-Dist: scipy<2.0.0,>=1.6.0
|
26
54
|
Provides-Extra: tesserocr
|
55
|
+
Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
|
56
|
+
Provides-Extra: ocrmac
|
57
|
+
Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrmac"
|
27
58
|
Provides-Extra: vlm
|
28
|
-
Requires-Dist:
|
29
|
-
Requires-Dist:
|
30
|
-
Requires-Dist:
|
31
|
-
|
32
|
-
Requires-Dist:
|
33
|
-
Requires-Dist:
|
34
|
-
|
35
|
-
Requires-Dist: easyocr (>=1.7,<2.0)
|
36
|
-
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
37
|
-
Requires-Dist: huggingface_hub (>=0.23,<1)
|
38
|
-
Requires-Dist: lxml (>=4.0.0,<6.0.0)
|
39
|
-
Requires-Dist: marko (>=2.1.2,<3.0.0)
|
40
|
-
Requires-Dist: ocrmac (>=1.0.0,<2.0.0) ; (sys_platform == "darwin") and (extra == "ocrmac")
|
41
|
-
Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (extra == "rapidocr")
|
42
|
-
Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
|
43
|
-
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
44
|
-
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
45
|
-
Requires-Dist: pillow (>=10.0.0,<12.0.0)
|
46
|
-
Requires-Dist: pluggy (>=1.0.0,<2.0.0)
|
47
|
-
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
48
|
-
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
49
|
-
Requires-Dist: pylatexenc (>=2.10,<3.0)
|
50
|
-
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
51
|
-
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
52
|
-
Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
|
53
|
-
Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
|
54
|
-
Requires-Dist: requests (>=2.32.2,<3.0.0)
|
55
|
-
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
56
|
-
Requires-Dist: scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"
|
57
|
-
Requires-Dist: scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"
|
58
|
-
Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
|
59
|
-
Requires-Dist: tqdm (>=4.65.0,<5.0.0)
|
60
|
-
Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
|
61
|
-
Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
|
62
|
-
Requires-Dist: typer (>=0.12.5,<0.16.0)
|
63
|
-
Project-URL: Repository, https://github.com/docling-project/docling
|
64
|
-
Description-Content-Type: text/markdown
|
59
|
+
Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
|
60
|
+
Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
|
61
|
+
Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
|
62
|
+
Provides-Extra: rapidocr
|
63
|
+
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
64
|
+
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
65
|
+
Dynamic: license-file
|
65
66
|
|
66
67
|
<p align="center">
|
67
68
|
<a href="https://github.com/docling-project/docling">
|
@@ -79,9 +80,8 @@ Description-Content-Type: text/markdown
|
|
79
80
|
[](https://docling-project.github.io/docling/)
|
80
81
|
[](https://pypi.org/project/docling/)
|
81
82
|
[](https://pypi.org/project/docling/)
|
82
|
-
[](https://pycqa.github.io/isort/)
|
83
|
+
[](https://github.com/astral-sh/uv)
|
84
|
+
[](https://github.com/astral-sh/ruff)
|
85
85
|
[](https://pydantic.dev)
|
86
86
|
[](https://github.com/pre-commit/pre-commit)
|
87
87
|
[](https://opensource.org/licenses/MIT)
|
@@ -101,7 +101,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
101
101
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
102
102
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
103
103
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
104
|
-
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
104
|
+
* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
105
105
|
* 💻 Simple and convenient CLI
|
106
106
|
|
107
107
|
### Coming soon
|
@@ -214,4 +214,3 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
|
|
214
214
|
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
215
215
|
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
216
216
|
[integrations]: https://docling-project.github.io/docling/integrations/
|
217
|
-
|
@@ -14,9 +14,8 @@
|
|
14
14
|
[](https://docling-project.github.io/docling/)
|
15
15
|
[](https://pypi.org/project/docling/)
|
16
16
|
[](https://pypi.org/project/docling/)
|
17
|
-
[](https://pycqa.github.io/isort/)
|
17
|
+
[](https://github.com/astral-sh/uv)
|
18
|
+
[](https://github.com/astral-sh/ruff)
|
20
19
|
[](https://pydantic.dev)
|
21
20
|
[](https://github.com/pre-commit/pre-commit)
|
22
21
|
[](https://opensource.org/licenses/MIT)
|
@@ -36,7 +35,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
36
35
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
37
36
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
38
37
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
39
|
-
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
38
|
+
* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
40
39
|
* 💻 Simple and convenient CLI
|
41
40
|
|
42
41
|
### Coming soon
|
@@ -12,6 +12,12 @@ from typing import Annotated, Dict, List, Optional, Type
|
|
12
12
|
|
13
13
|
import rich.table
|
14
14
|
import typer
|
15
|
+
from docling_core.transforms.serializer.html import (
|
16
|
+
HTMLDocSerializer,
|
17
|
+
HTMLOutputStyle,
|
18
|
+
HTMLParams,
|
19
|
+
)
|
20
|
+
from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
|
15
21
|
from docling_core.types.doc import ImageRefMode
|
16
22
|
from docling_core.utils.file import resolve_source_to_path
|
17
23
|
from pydantic import TypeAdapter
|
@@ -22,6 +28,7 @@ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBacke
|
|
22
28
|
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
23
29
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
24
30
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
31
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
25
32
|
from docling.datamodel.base_models import (
|
26
33
|
ConversionStatus,
|
27
34
|
FormatToExtensions,
|
@@ -30,8 +37,6 @@ from docling.datamodel.base_models import (
|
|
30
37
|
)
|
31
38
|
from docling.datamodel.document import ConversionResult
|
32
39
|
from docling.datamodel.pipeline_options import (
|
33
|
-
AcceleratorDevice,
|
34
|
-
AcceleratorOptions,
|
35
40
|
EasyOcrOptions,
|
36
41
|
OcrOptions,
|
37
42
|
PaginatedPipelineOptions,
|
@@ -39,14 +44,16 @@ from docling.datamodel.pipeline_options import (
|
|
39
44
|
PdfPipeline,
|
40
45
|
PdfPipelineOptions,
|
41
46
|
TableFormerMode,
|
42
|
-
VlmModelType,
|
43
47
|
VlmPipelineOptions,
|
44
|
-
granite_vision_vlm_conversion_options,
|
45
|
-
granite_vision_vlm_ollama_conversion_options,
|
46
|
-
smoldocling_vlm_conversion_options,
|
47
|
-
smoldocling_vlm_mlx_conversion_options,
|
48
48
|
)
|
49
49
|
from docling.datamodel.settings import settings
|
50
|
+
from docling.datamodel.vlm_model_specs import (
|
51
|
+
GRANITE_VISION_OLLAMA,
|
52
|
+
GRANITE_VISION_TRANSFORMERS,
|
53
|
+
SMOLDOCLING_MLX,
|
54
|
+
SMOLDOCLING_TRANSFORMERS,
|
55
|
+
VlmModelType,
|
56
|
+
)
|
50
57
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
51
58
|
from docling.models.factories import get_ocr_factory
|
52
59
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
@@ -156,6 +163,7 @@ def export_documents(
|
|
156
163
|
export_json: bool,
|
157
164
|
export_html: bool,
|
158
165
|
export_html_split_page: bool,
|
166
|
+
show_layout: bool,
|
159
167
|
export_md: bool,
|
160
168
|
export_txt: bool,
|
161
169
|
export_doctags: bool,
|
@@ -189,9 +197,27 @@ def export_documents(
|
|
189
197
|
if export_html_split_page:
|
190
198
|
fname = output_dir / f"{doc_filename}.html"
|
191
199
|
_log.info(f"writing HTML output to {fname}")
|
192
|
-
|
193
|
-
|
194
|
-
|
200
|
+
if show_layout:
|
201
|
+
ser = HTMLDocSerializer(
|
202
|
+
doc=conv_res.document,
|
203
|
+
params=HTMLParams(
|
204
|
+
image_mode=image_export_mode,
|
205
|
+
output_style=HTMLOutputStyle.SPLIT_PAGE,
|
206
|
+
),
|
207
|
+
)
|
208
|
+
visualizer = LayoutVisualizer()
|
209
|
+
visualizer.params.show_label = False
|
210
|
+
ser_res = ser.serialize(
|
211
|
+
visualizer=visualizer,
|
212
|
+
)
|
213
|
+
with open(fname, "w") as fw:
|
214
|
+
fw.write(ser_res.text)
|
215
|
+
else:
|
216
|
+
conv_res.document.save_as_html(
|
217
|
+
filename=fname,
|
218
|
+
image_mode=image_export_mode,
|
219
|
+
split_page_view=True,
|
220
|
+
)
|
195
221
|
|
196
222
|
# Export Text format:
|
197
223
|
if export_txt:
|
@@ -250,6 +276,13 @@ def convert( # noqa: C901
|
|
250
276
|
to_formats: List[OutputFormat] = typer.Option(
|
251
277
|
None, "--to", help="Specify output formats. Defaults to Markdown."
|
252
278
|
),
|
279
|
+
show_layout: Annotated[
|
280
|
+
bool,
|
281
|
+
typer.Option(
|
282
|
+
...,
|
283
|
+
help="If enabled, the page images will show the bounding-boxes of the items.",
|
284
|
+
),
|
285
|
+
] = False,
|
253
286
|
headers: str = typer.Option(
|
254
287
|
None,
|
255
288
|
"--headers",
|
@@ -547,20 +580,16 @@ def convert( # noqa: C901
|
|
547
580
|
)
|
548
581
|
|
549
582
|
if vlm_model == VlmModelType.GRANITE_VISION:
|
550
|
-
pipeline_options.vlm_options =
|
583
|
+
pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
|
551
584
|
elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
|
552
|
-
pipeline_options.vlm_options =
|
553
|
-
granite_vision_vlm_ollama_conversion_options
|
554
|
-
)
|
585
|
+
pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
|
555
586
|
elif vlm_model == VlmModelType.SMOLDOCLING:
|
556
|
-
pipeline_options.vlm_options =
|
587
|
+
pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
|
557
588
|
if sys.platform == "darwin":
|
558
589
|
try:
|
559
590
|
import mlx_vlm
|
560
591
|
|
561
|
-
pipeline_options.vlm_options =
|
562
|
-
smoldocling_vlm_mlx_conversion_options
|
563
|
-
)
|
592
|
+
pipeline_options.vlm_options = SMOLDOCLING_MLX
|
564
593
|
except ImportError:
|
565
594
|
_log.warning(
|
566
595
|
"To run SmolDocling faster, please install mlx-vlm:\n"
|
@@ -596,6 +625,7 @@ def convert( # noqa: C901
|
|
596
625
|
export_json=export_json,
|
597
626
|
export_html=export_html,
|
598
627
|
export_html_split_page=export_html_split_page,
|
628
|
+
show_layout=show_layout,
|
599
629
|
export_md=export_md,
|
600
630
|
export_txt=export_txt,
|
601
631
|
export_doctags=export_doctags,
|
@@ -0,0 +1,68 @@
|
|
1
|
+
import logging
|
2
|
+
import os
|
3
|
+
import re
|
4
|
+
from enum import Enum
|
5
|
+
from typing import Any, Union
|
6
|
+
|
7
|
+
from pydantic import field_validator, model_validator
|
8
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
9
|
+
|
10
|
+
_log = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class AcceleratorDevice(str, Enum):
|
14
|
+
"""Devices to run model inference"""
|
15
|
+
|
16
|
+
AUTO = "auto"
|
17
|
+
CPU = "cpu"
|
18
|
+
CUDA = "cuda"
|
19
|
+
MPS = "mps"
|
20
|
+
|
21
|
+
|
22
|
+
class AcceleratorOptions(BaseSettings):
|
23
|
+
model_config = SettingsConfigDict(
|
24
|
+
env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
|
25
|
+
)
|
26
|
+
|
27
|
+
num_threads: int = 4
|
28
|
+
device: Union[str, AcceleratorDevice] = "auto"
|
29
|
+
cuda_use_flash_attention2: bool = False
|
30
|
+
|
31
|
+
@field_validator("device")
|
32
|
+
def validate_device(cls, value):
|
33
|
+
# "auto", "cpu", "cuda", "mps", or "cuda:N"
|
34
|
+
if value in {d.value for d in AcceleratorDevice} or re.match(
|
35
|
+
r"^cuda(:\d+)?$", value
|
36
|
+
):
|
37
|
+
return value
|
38
|
+
raise ValueError(
|
39
|
+
"Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
|
40
|
+
)
|
41
|
+
|
42
|
+
@model_validator(mode="before")
|
43
|
+
@classmethod
|
44
|
+
def check_alternative_envvars(cls, data: Any) -> Any:
|
45
|
+
r"""
|
46
|
+
Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
|
47
|
+
The alternative envvar is used only if it is valid and the regular envvar is not set.
|
48
|
+
|
49
|
+
Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
|
50
|
+
the same functionality. In case the alias envvar is set and the user tries to override the
|
51
|
+
parameter in settings initialization, Pydantic treats the parameter provided in __init__()
|
52
|
+
as an extra input instead of simply overwriting the evvar value for that parameter.
|
53
|
+
"""
|
54
|
+
if isinstance(data, dict):
|
55
|
+
input_num_threads = data.get("num_threads")
|
56
|
+
# Check if to set the num_threads from the alternative envvar
|
57
|
+
if input_num_threads is None:
|
58
|
+
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
|
59
|
+
omp_num_threads = os.getenv("OMP_NUM_THREADS")
|
60
|
+
if docling_num_threads is None and omp_num_threads is not None:
|
61
|
+
try:
|
62
|
+
data["num_threads"] = int(omp_num_threads)
|
63
|
+
except ValueError:
|
64
|
+
_log.error(
|
65
|
+
"Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
|
66
|
+
omp_num_threads,
|
67
|
+
)
|
68
|
+
return data
|
@@ -13,11 +13,11 @@ from docling_core.types.doc import (
|
|
13
13
|
TableCell,
|
14
14
|
)
|
15
15
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
16
|
-
|
17
|
-
# DO NOT REMOVE; explicitly exposed from this location
|
18
16
|
from docling_core.types.io import (
|
19
17
|
DocumentStream,
|
20
18
|
)
|
19
|
+
|
20
|
+
# DO NOT REMOVE; explicitly exposed from this location
|
21
21
|
from PIL.Image import Image
|
22
22
|
from pydantic import BaseModel, ConfigDict, Field, computed_field
|
23
23
|
|
@@ -131,12 +131,6 @@ class ErrorItem(BaseModel):
|
|
131
131
|
error_message: str
|
132
132
|
|
133
133
|
|
134
|
-
# class Cell(BaseModel):
|
135
|
-
# id: int
|
136
|
-
# text: str
|
137
|
-
# bbox: BoundingBox
|
138
|
-
|
139
|
-
|
140
134
|
class Cluster(BaseModel):
|
141
135
|
id: int
|
142
136
|
label: DocItemLabel
|
@@ -158,8 +152,16 @@ class LayoutPrediction(BaseModel):
|
|
158
152
|
clusters: List[Cluster] = []
|
159
153
|
|
160
154
|
|
155
|
+
class VlmPredictionToken(BaseModel):
|
156
|
+
text: str = ""
|
157
|
+
token: int = -1
|
158
|
+
logprob: float = -1
|
159
|
+
|
160
|
+
|
161
161
|
class VlmPrediction(BaseModel):
|
162
162
|
text: str = ""
|
163
|
+
generated_tokens: list[VlmPredictionToken] = []
|
164
|
+
generation_time: float = -1
|
163
165
|
|
164
166
|
|
165
167
|
class ContainerElement(
|
@@ -334,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
|
|
334
334
|
) -> Optional[InputFormat]:
|
335
335
|
"""Guess the input format of a document by checking part of its content."""
|
336
336
|
input_format: Optional[InputFormat] = None
|
337
|
-
content_str = content.decode("utf-8")
|
338
337
|
|
339
338
|
if mime == "application/xml":
|
339
|
+
content_str = content.decode("utf-8")
|
340
340
|
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
341
341
|
if match_doctype:
|
342
342
|
xml_doctype = match_doctype.group()
|
@@ -358,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
|
|
358
358
|
input_format = InputFormat.XML_JATS
|
359
359
|
|
360
360
|
elif mime == "text/plain":
|
361
|
+
content_str = content.decode("utf-8")
|
361
362
|
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
362
363
|
input_format = InputFormat.XML_USPTO
|
363
364
|
|
@@ -411,7 +412,11 @@ class _DocumentConversionInput(BaseModel):
|
|
411
412
|
else:
|
412
413
|
return "application/xml"
|
413
414
|
|
414
|
-
if re.match(
|
415
|
+
if re.match(
|
416
|
+
r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
|
417
|
+
content_str,
|
418
|
+
re.DOTALL,
|
419
|
+
):
|
415
420
|
return "text/html"
|
416
421
|
|
417
422
|
p = re.compile(
|