docling 2.40.0__tar.gz → 2.41.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.40.0 → docling-2.41.0}/PKG-INFO +3 -3
- {docling-2.40.0 → docling-2.41.0}/docling/datamodel/asr_model_specs.py +6 -6
- {docling-2.40.0 → docling-2.41.0}/docling/datamodel/base_models.py +23 -1
- docling-2.41.0/docling/datamodel/layout_model_specs.py +90 -0
- {docling-2.40.0 → docling-2.41.0}/docling/datamodel/pipeline_options.py +10 -0
- {docling-2.40.0 → docling-2.41.0}/docling/datamodel/pipeline_options_vlm_model.py +11 -3
- {docling-2.40.0 → docling-2.41.0}/docling/models/api_vlm_model.py +7 -5
- {docling-2.40.0 → docling-2.41.0}/docling/models/document_picture_classifier.py +12 -13
- {docling-2.40.0 → docling-2.41.0}/docling/models/layout_model.py +17 -15
- {docling-2.40.0 → docling-2.41.0}/docling/models/vlm_models_inline/hf_transformers_model.py +39 -20
- {docling-2.40.0 → docling-2.41.0}/docling/models/vlm_models_inline/mlx_model.py +5 -3
- {docling-2.40.0 → docling-2.41.0}/docling/pipeline/standard_pdf_pipeline.py +2 -3
- {docling-2.40.0 → docling-2.41.0}/docling/pipeline/vlm_pipeline.py +1 -0
- {docling-2.40.0 → docling-2.41.0}/docling/utils/model_downloader.py +2 -1
- {docling-2.40.0 → docling-2.41.0}/docling/utils/ocr_utils.py +1 -1
- {docling-2.40.0 → docling-2.41.0}/docling/utils/orientation.py +22 -28
- {docling-2.40.0 → docling-2.41.0}/docling.egg-info/PKG-INFO +3 -3
- {docling-2.40.0 → docling-2.41.0}/docling.egg-info/SOURCES.txt +2 -0
- {docling-2.40.0 → docling-2.41.0}/docling.egg-info/requires.txt +2 -2
- {docling-2.40.0 → docling-2.41.0}/pyproject.toml +4 -5
- {docling-2.40.0 → docling-2.41.0}/tests/test_backend_markdown.py +6 -1
- {docling-2.40.0 → docling-2.41.0}/tests/test_backend_patent_uspto.py +11 -3
- {docling-2.40.0 → docling-2.41.0}/tests/test_document_picture_classifier.py +2 -1
- {docling-2.40.0 → docling-2.41.0}/tests/test_e2e_conversion.py +2 -8
- {docling-2.40.0 → docling-2.41.0}/tests/test_e2e_ocr_conversion.py +5 -10
- {docling-2.40.0 → docling-2.41.0}/tests/test_interfaces.py +2 -9
- {docling-2.40.0 → docling-2.41.0}/tests/test_legacy_format_transform.py +1 -0
- docling-2.41.0/tests/test_ocr_utils.py +80 -0
- {docling-2.40.0 → docling-2.41.0}/LICENSE +0 -0
- {docling-2.40.0 → docling-2.41.0}/README.md +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/html_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/md_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/chunking/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/cli/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/cli/main.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/cli/models.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/cli/tools.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/datamodel/document.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/datamodel/settings.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/document_converter.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/exceptions.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/base_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/py.typed +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/utils/__init__.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/utils/export.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/utils/locks.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/utils/profiling.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/utils/utils.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling/utils/visualization.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.40.0 → docling-2.41.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.40.0 → docling-2.41.0}/setup.cfg +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_backend_csv.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_backend_html.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_backend_jats.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_backend_msword.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_backend_webp.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_cli.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_code_formula.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_input_doc.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_invalid_input.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_options.py +0 -0
- {docling-2.40.0 → docling-2.41.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.40.0
|
3
|
+
Version: 2.41.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
|
|
26
26
|
Description-Content-Type: text/markdown
|
27
27
|
License-File: LICENSE
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
|
-
Requires-Dist: docling-core[chunking]<3.0.0,>=2.
|
29
|
+
Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
|
30
30
|
Requires-Dist: docling-parse<5.0.0,>=4.0.0
|
31
31
|
Requires-Dist: docling-ibm-models<4,>=3.6.0
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
@@ -62,7 +62,7 @@ Provides-Extra: rapidocr
|
|
62
62
|
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
63
63
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
64
64
|
Provides-Extra: asr
|
65
|
-
Requires-Dist: openai-whisper>=
|
65
|
+
Requires-Dist: openai-whisper>=20250625; extra == "asr"
|
66
66
|
Dynamic: license-file
|
67
67
|
|
68
68
|
<p align="center">
|
@@ -22,7 +22,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
|
|
22
22
|
verbose=True,
|
23
23
|
timestamps=True,
|
24
24
|
word_timestamps=True,
|
25
|
-
|
25
|
+
temperature=0.0,
|
26
26
|
max_new_tokens=256,
|
27
27
|
max_time_chunk=30.0,
|
28
28
|
)
|
@@ -33,7 +33,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
|
|
33
33
|
verbose=True,
|
34
34
|
timestamps=True,
|
35
35
|
word_timestamps=True,
|
36
|
-
|
36
|
+
temperature=0.0,
|
37
37
|
max_new_tokens=256,
|
38
38
|
max_time_chunk=30.0,
|
39
39
|
)
|
@@ -44,7 +44,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
|
|
44
44
|
verbose=True,
|
45
45
|
timestamps=True,
|
46
46
|
word_timestamps=True,
|
47
|
-
|
47
|
+
temperature=0.0,
|
48
48
|
max_new_tokens=256,
|
49
49
|
max_time_chunk=30.0,
|
50
50
|
)
|
@@ -55,7 +55,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
|
|
55
55
|
verbose=True,
|
56
56
|
timestamps=True,
|
57
57
|
word_timestamps=True,
|
58
|
-
|
58
|
+
temperature=0.0,
|
59
59
|
max_new_tokens=256,
|
60
60
|
max_time_chunk=30.0,
|
61
61
|
)
|
@@ -66,7 +66,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
|
|
66
66
|
verbose=True,
|
67
67
|
timestamps=True,
|
68
68
|
word_timestamps=True,
|
69
|
-
|
69
|
+
temperature=0.0,
|
70
70
|
max_new_tokens=256,
|
71
71
|
max_time_chunk=30.0,
|
72
72
|
)
|
@@ -77,7 +77,7 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
|
|
77
77
|
verbose=True,
|
78
78
|
timestamps=True,
|
79
79
|
word_timestamps=True,
|
80
|
-
|
80
|
+
temperature=0.0,
|
81
81
|
max_new_tokens=256,
|
82
82
|
max_time_chunk=30.0,
|
83
83
|
)
|
@@ -12,6 +12,7 @@ from docling_core.types.doc import (
|
|
12
12
|
Size,
|
13
13
|
TableCell,
|
14
14
|
)
|
15
|
+
from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
|
15
16
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
16
17
|
from docling_core.types.io import (
|
17
18
|
DocumentStream,
|
@@ -19,7 +20,14 @@ from docling_core.types.io import (
|
|
19
20
|
|
20
21
|
# DO NOT REMOVE; explicitly exposed from this location
|
21
22
|
from PIL.Image import Image
|
22
|
-
from pydantic import
|
23
|
+
from pydantic import (
|
24
|
+
BaseModel,
|
25
|
+
ConfigDict,
|
26
|
+
Field,
|
27
|
+
FieldSerializationInfo,
|
28
|
+
computed_field,
|
29
|
+
field_serializer,
|
30
|
+
)
|
23
31
|
|
24
32
|
if TYPE_CHECKING:
|
25
33
|
from docling.backend.pdf_backend import PdfPageBackend
|
@@ -142,6 +150,10 @@ class Cluster(BaseModel):
|
|
142
150
|
cells: List[TextCell] = []
|
143
151
|
children: List["Cluster"] = [] # Add child cluster support
|
144
152
|
|
153
|
+
@field_serializer("confidence")
|
154
|
+
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
|
155
|
+
return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
|
156
|
+
|
145
157
|
|
146
158
|
class BasePageElement(BaseModel):
|
147
159
|
label: DocItemLabel
|
@@ -194,6 +206,16 @@ class FigureElement(BasePageElement):
|
|
194
206
|
predicted_class: Optional[str] = None
|
195
207
|
confidence: Optional[float] = None
|
196
208
|
|
209
|
+
@field_serializer("confidence")
|
210
|
+
def _serialize(
|
211
|
+
self, value: Optional[float], info: FieldSerializationInfo
|
212
|
+
) -> Optional[float]:
|
213
|
+
return (
|
214
|
+
round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
|
215
|
+
if value is not None
|
216
|
+
else None
|
217
|
+
)
|
218
|
+
|
197
219
|
|
198
220
|
class FigureClassificationPrediction(BaseModel):
|
199
221
|
figure_count: int = 0
|
@@ -0,0 +1,90 @@
|
|
1
|
+
import logging
|
2
|
+
from enum import Enum
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Optional
|
5
|
+
|
6
|
+
from pydantic import BaseModel
|
7
|
+
|
8
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice
|
9
|
+
|
10
|
+
_log = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class LayoutModelConfig(BaseModel):
|
14
|
+
name: str
|
15
|
+
repo_id: str
|
16
|
+
revision: str
|
17
|
+
model_path: str
|
18
|
+
supported_devices: list[AcceleratorDevice] = [
|
19
|
+
AcceleratorDevice.CPU,
|
20
|
+
AcceleratorDevice.CUDA,
|
21
|
+
AcceleratorDevice.MPS,
|
22
|
+
]
|
23
|
+
|
24
|
+
@property
|
25
|
+
def model_repo_folder(self) -> str:
|
26
|
+
return self.repo_id.replace("/", "--")
|
27
|
+
|
28
|
+
|
29
|
+
# HuggingFace Layout Models
|
30
|
+
|
31
|
+
# Default Docling Layout Model
|
32
|
+
DOCLING_LAYOUT_V2 = LayoutModelConfig(
|
33
|
+
name="docling_layout_v2",
|
34
|
+
repo_id="ds4sd/docling-layout-old",
|
35
|
+
revision="main",
|
36
|
+
model_path="",
|
37
|
+
)
|
38
|
+
|
39
|
+
DOCLING_LAYOUT_HERON = LayoutModelConfig(
|
40
|
+
name="docling_layout_heron",
|
41
|
+
repo_id="ds4sd/docling-layout-heron",
|
42
|
+
revision="main",
|
43
|
+
model_path="",
|
44
|
+
)
|
45
|
+
|
46
|
+
DOCLING_LAYOUT_HERON_101 = LayoutModelConfig(
|
47
|
+
name="docling_layout_heron_101",
|
48
|
+
repo_id="ds4sd/docling-layout-heron-101",
|
49
|
+
revision="main",
|
50
|
+
model_path="",
|
51
|
+
)
|
52
|
+
|
53
|
+
DOCLING_LAYOUT_EGRET_MEDIUM = LayoutModelConfig(
|
54
|
+
name="docling_layout_egret_medium",
|
55
|
+
repo_id="ds4sd/docling-layout-egret-medium",
|
56
|
+
revision="main",
|
57
|
+
model_path="",
|
58
|
+
)
|
59
|
+
|
60
|
+
DOCLING_LAYOUT_EGRET_LARGE = LayoutModelConfig(
|
61
|
+
name="docling_layout_egret_large",
|
62
|
+
repo_id="ds4sd/docling-layout-egret-large",
|
63
|
+
revision="main",
|
64
|
+
model_path="",
|
65
|
+
)
|
66
|
+
|
67
|
+
DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig(
|
68
|
+
name="docling_layout_egret_xlarge",
|
69
|
+
repo_id="ds4sd/docling-layout-egret-xlarge",
|
70
|
+
revision="main",
|
71
|
+
model_path="",
|
72
|
+
)
|
73
|
+
|
74
|
+
# Example for a hypothetical alternative model
|
75
|
+
# ALTERNATIVE_LAYOUT = LayoutModelConfig(
|
76
|
+
# name="alternative_layout",
|
77
|
+
# repo_id="someorg/alternative-layout",
|
78
|
+
# revision="main",
|
79
|
+
# model_path="model_artifacts/layout_alt",
|
80
|
+
# )
|
81
|
+
|
82
|
+
|
83
|
+
class LayoutModelType(str, Enum):
|
84
|
+
DOCLING_LAYOUT_V2 = "docling_layout_v2"
|
85
|
+
DOCLING_LAYOUT_HERON = "docling_layout_heron"
|
86
|
+
DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101"
|
87
|
+
DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium"
|
88
|
+
DOCLING_LAYOUT_EGRET_LARGE = "docling_layout_egret_large"
|
89
|
+
DOCLING_LAYOUT_EGRET_XLARGE = "docling_layout_egret_xlarge"
|
90
|
+
# ALTERNATIVE_LAYOUT = "alternative_layout"
|
@@ -16,6 +16,15 @@ from docling.datamodel import asr_model_specs
|
|
16
16
|
|
17
17
|
# Import the following for backwards compatibility
|
18
18
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
19
|
+
from docling.datamodel.layout_model_specs import (
|
20
|
+
DOCLING_LAYOUT_EGRET_LARGE,
|
21
|
+
DOCLING_LAYOUT_EGRET_MEDIUM,
|
22
|
+
DOCLING_LAYOUT_EGRET_XLARGE,
|
23
|
+
DOCLING_LAYOUT_HERON,
|
24
|
+
DOCLING_LAYOUT_HERON_101,
|
25
|
+
DOCLING_LAYOUT_V2,
|
26
|
+
LayoutModelConfig,
|
27
|
+
)
|
19
28
|
from docling.datamodel.pipeline_options_asr_model import (
|
20
29
|
InlineAsrOptions,
|
21
30
|
)
|
@@ -270,6 +279,7 @@ class LayoutOptions(BaseModel):
|
|
270
279
|
"""Options for layout processing."""
|
271
280
|
|
272
281
|
create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells
|
282
|
+
model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
|
273
283
|
|
274
284
|
|
275
285
|
class AsrPipelineOptions(PipelineOptions):
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from enum import Enum
|
2
|
-
from typing import Any, Dict, List, Literal, Optional, Union
|
2
|
+
from typing import Any, Callable, Dict, List, Literal, Optional, Union
|
3
3
|
|
4
|
+
from docling_core.types.doc.page import SegmentedPage
|
4
5
|
from pydantic import AnyUrl, BaseModel
|
5
6
|
from typing_extensions import deprecated
|
6
7
|
|
@@ -9,9 +10,10 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
|
|
9
10
|
|
10
11
|
class BaseVlmOptions(BaseModel):
|
11
12
|
kind: str
|
12
|
-
prompt: str
|
13
|
+
prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
|
13
14
|
scale: float = 2.0
|
14
15
|
max_size: Optional[int] = None
|
16
|
+
temperature: float = 0.0
|
15
17
|
|
16
18
|
|
17
19
|
class ResponseFormat(str, Enum):
|
@@ -29,6 +31,12 @@ class TransformersModelType(str, Enum):
|
|
29
31
|
AUTOMODEL = "automodel"
|
30
32
|
AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
|
31
33
|
AUTOMODEL_CAUSALLM = "automodel-causallm"
|
34
|
+
AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
|
35
|
+
|
36
|
+
|
37
|
+
class TransformersPromptStyle(str, Enum):
|
38
|
+
CHAT = "chat"
|
39
|
+
RAW = "raw"
|
32
40
|
|
33
41
|
|
34
42
|
class InlineVlmOptions(BaseVlmOptions):
|
@@ -42,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):
|
|
42
50
|
|
43
51
|
inference_framework: InferenceFramework
|
44
52
|
transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
|
53
|
+
transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
|
45
54
|
response_format: ResponseFormat
|
46
55
|
|
47
56
|
torch_dtype: Optional[str] = None
|
@@ -51,7 +60,6 @@ class InlineVlmOptions(BaseVlmOptions):
|
|
51
60
|
AcceleratorDevice.MPS,
|
52
61
|
]
|
53
62
|
|
54
|
-
temperature: float = 0.0
|
55
63
|
stop_strings: List[str] = []
|
56
64
|
extra_generation_config: Dict[str, Any] = {}
|
57
65
|
|
@@ -29,12 +29,9 @@ class ApiVlmModel(BasePageModel):
|
|
29
29
|
|
30
30
|
self.timeout = self.vlm_options.timeout
|
31
31
|
self.concurrency = self.vlm_options.concurrency
|
32
|
-
self.prompt_content = (
|
33
|
-
f"This is a page from a document.\n{self.vlm_options.prompt}"
|
34
|
-
)
|
35
32
|
self.params = {
|
36
33
|
**self.vlm_options.params,
|
37
|
-
"temperature":
|
34
|
+
"temperature": self.vlm_options.temperature,
|
38
35
|
}
|
39
36
|
|
40
37
|
def __call__(
|
@@ -56,9 +53,14 @@ class ApiVlmModel(BasePageModel):
|
|
56
53
|
if hi_res_image.mode != "RGB":
|
57
54
|
hi_res_image = hi_res_image.convert("RGB")
|
58
55
|
|
56
|
+
if callable(self.vlm_options.prompt):
|
57
|
+
prompt = self.vlm_options.prompt(page.parsed_page)
|
58
|
+
else:
|
59
|
+
prompt = self.vlm_options.prompt
|
60
|
+
|
59
61
|
page_tags = api_image_request(
|
60
62
|
image=hi_res_image,
|
61
|
-
prompt=
|
63
|
+
prompt=prompt,
|
62
64
|
url=self.vlm_options.url,
|
63
65
|
timeout=self.timeout,
|
64
66
|
headers=self.vlm_options.headers,
|
@@ -14,7 +14,8 @@ from PIL import Image
|
|
14
14
|
from pydantic import BaseModel
|
15
15
|
|
16
16
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
17
|
-
from docling.
|
17
|
+
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
|
18
|
+
from docling.models.base_model import BaseItemAndImageEnrichmentModel
|
18
19
|
from docling.models.utils.hf_model_download import download_hf_model
|
19
20
|
from docling.utils.accelerator_utils import decide_device
|
20
21
|
|
@@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel):
|
|
32
33
|
kind: Literal["document_picture_classifier"] = "document_picture_classifier"
|
33
34
|
|
34
35
|
|
35
|
-
class DocumentPictureClassifier(BaseEnrichmentModel):
|
36
|
+
class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel):
|
36
37
|
"""
|
37
38
|
A model for classifying pictures in documents.
|
38
39
|
|
@@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|
135
136
|
def __call__(
|
136
137
|
self,
|
137
138
|
doc: DoclingDocument,
|
138
|
-
element_batch: Iterable[
|
139
|
+
element_batch: Iterable[ItemAndImageEnrichmentElement],
|
139
140
|
) -> Iterable[NodeItem]:
|
140
141
|
"""
|
141
142
|
Processes a batch of elements and enriches them with classification predictions.
|
@@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|
144
145
|
----------
|
145
146
|
doc : DoclingDocument
|
146
147
|
The document containing the elements to be processed.
|
147
|
-
element_batch : Iterable[
|
148
|
+
element_batch : Iterable[ItemAndImageEnrichmentElement]
|
148
149
|
A batch of pictures to classify.
|
149
150
|
|
150
151
|
Returns
|
@@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|
155
156
|
"""
|
156
157
|
if not self.enabled:
|
157
158
|
for element in element_batch:
|
158
|
-
yield element
|
159
|
+
yield element.item
|
159
160
|
return
|
160
161
|
|
161
162
|
images: List[Union[Image.Image, np.ndarray]] = []
|
162
163
|
elements: List[PictureItem] = []
|
163
164
|
for el in element_batch:
|
164
|
-
assert isinstance(el, PictureItem)
|
165
|
-
elements.append(el)
|
166
|
-
|
167
|
-
assert img is not None
|
168
|
-
images.append(img)
|
165
|
+
assert isinstance(el.item, PictureItem)
|
166
|
+
elements.append(el.item)
|
167
|
+
images.append(el.image)
|
169
168
|
|
170
169
|
outputs = self.document_picture_classifier.predict(images)
|
171
170
|
|
172
|
-
for
|
173
|
-
|
171
|
+
for item, output in zip(elements, outputs):
|
172
|
+
item.annotations.append(
|
174
173
|
PictureClassificationData(
|
175
174
|
provenance="DocumentPictureClassifier",
|
176
175
|
predicted_classes=[
|
@@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|
183
182
|
)
|
184
183
|
)
|
185
184
|
|
186
|
-
yield
|
185
|
+
yield item
|
@@ -12,6 +12,7 @@ from PIL import Image
|
|
12
12
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
13
13
|
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
|
14
14
|
from docling.datamodel.document import ConversionResult
|
15
|
+
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
|
15
16
|
from docling.datamodel.pipeline_options import LayoutOptions
|
16
17
|
from docling.datamodel.settings import settings
|
17
18
|
from docling.models.base_model import BasePageModel
|
@@ -25,9 +26,6 @@ _log = logging.getLogger(__name__)
|
|
25
26
|
|
26
27
|
|
27
28
|
class LayoutModel(BasePageModel):
|
28
|
-
_model_repo_folder = "ds4sd--docling-models"
|
29
|
-
_model_path = "model_artifacts/layout"
|
30
|
-
|
31
29
|
TEXT_ELEM_LABELS = [
|
32
30
|
DocItemLabel.TEXT,
|
33
31
|
DocItemLabel.FOOTNOTE,
|
@@ -59,25 +57,28 @@ class LayoutModel(BasePageModel):
|
|
59
57
|
self.options = options
|
60
58
|
|
61
59
|
device = decide_device(accelerator_options.device)
|
60
|
+
layout_model_config = options.model_spec
|
61
|
+
model_repo_folder = layout_model_config.model_repo_folder
|
62
|
+
model_path = layout_model_config.model_path
|
62
63
|
|
63
64
|
if artifacts_path is None:
|
64
|
-
artifacts_path =
|
65
|
+
artifacts_path = (
|
66
|
+
self.download_models(layout_model_config=layout_model_config)
|
67
|
+
/ model_path
|
68
|
+
)
|
65
69
|
else:
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
artifacts_path / self._model_repo_folder / self._model_path
|
70
|
-
)
|
71
|
-
elif (artifacts_path / self._model_path).exists():
|
70
|
+
if (artifacts_path / model_repo_folder).exists():
|
71
|
+
artifacts_path = artifacts_path / model_repo_folder / model_path
|
72
|
+
elif (artifacts_path / model_path).exists():
|
72
73
|
warnings.warn(
|
73
74
|
"The usage of artifacts_path containing directly "
|
74
|
-
f"{
|
75
|
+
f"{model_path} is deprecated. Please point "
|
75
76
|
"the artifacts_path to the parent containing "
|
76
|
-
f"the {
|
77
|
+
f"the {model_repo_folder} folder.",
|
77
78
|
DeprecationWarning,
|
78
79
|
stacklevel=3,
|
79
80
|
)
|
80
|
-
artifacts_path = artifacts_path /
|
81
|
+
artifacts_path = artifacts_path / model_path
|
81
82
|
|
82
83
|
self.layout_predictor = LayoutPredictor(
|
83
84
|
artifact_path=str(artifacts_path),
|
@@ -90,10 +91,11 @@ class LayoutModel(BasePageModel):
|
|
90
91
|
local_dir: Optional[Path] = None,
|
91
92
|
force: bool = False,
|
92
93
|
progress: bool = False,
|
94
|
+
layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2,
|
93
95
|
) -> Path:
|
94
96
|
return download_hf_model(
|
95
|
-
repo_id=
|
96
|
-
revision=
|
97
|
+
repo_id=layout_model_config.repo_id,
|
98
|
+
revision=layout_model_config.revision,
|
97
99
|
local_dir=local_dir,
|
98
100
|
force=force,
|
99
101
|
progress=progress,
|
@@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult
|
|
13
13
|
from docling.datamodel.pipeline_options_vlm_model import (
|
14
14
|
InlineVlmOptions,
|
15
15
|
TransformersModelType,
|
16
|
+
TransformersPromptStyle,
|
16
17
|
)
|
17
18
|
from docling.models.base_model import BasePageModel
|
18
19
|
from docling.models.utils.hf_model_download import (
|
@@ -41,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|
41
42
|
from transformers import (
|
42
43
|
AutoModel,
|
43
44
|
AutoModelForCausalLM,
|
45
|
+
AutoModelForImageTextToText,
|
44
46
|
AutoModelForVision2Seq,
|
45
47
|
AutoProcessor,
|
46
48
|
BitsAndBytesConfig,
|
@@ -91,6 +93,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|
91
93
|
== TransformersModelType.AUTOMODEL_VISION2SEQ
|
92
94
|
):
|
93
95
|
model_cls = AutoModelForVision2Seq
|
96
|
+
elif (
|
97
|
+
self.vlm_options.transformers_model_type
|
98
|
+
== TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT
|
99
|
+
):
|
100
|
+
model_cls = AutoModelForImageTextToText
|
94
101
|
|
95
102
|
self.processor = AutoProcessor.from_pretrained(
|
96
103
|
artifacts_path,
|
@@ -128,7 +135,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|
128
135
|
)
|
129
136
|
|
130
137
|
# Define prompt structure
|
131
|
-
|
138
|
+
if callable(self.vlm_options.prompt):
|
139
|
+
user_prompt = self.vlm_options.prompt(page.parsed_page)
|
140
|
+
else:
|
141
|
+
user_prompt = self.vlm_options.prompt
|
142
|
+
prompt = self.formulate_prompt(user_prompt)
|
132
143
|
|
133
144
|
inputs = self.processor(
|
134
145
|
text=prompt, images=[hi_res_image], return_tensors="pt"
|
@@ -162,10 +173,13 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|
162
173
|
|
163
174
|
yield page
|
164
175
|
|
165
|
-
def formulate_prompt(self) -> str:
|
176
|
+
def formulate_prompt(self, user_prompt: str) -> str:
|
166
177
|
"""Formulate a prompt for the VLM."""
|
167
178
|
|
168
|
-
if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
|
179
|
+
if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
|
180
|
+
return user_prompt
|
181
|
+
|
182
|
+
elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
|
169
183
|
_log.debug("Using specialized prompt for Phi-4")
|
170
184
|
# more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
|
171
185
|
|
@@ -173,25 +187,30 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|
173
187
|
assistant_prompt = "<|assistant|>"
|
174
188
|
prompt_suffix = "<|end|>"
|
175
189
|
|
176
|
-
prompt = f"{user_prompt}<|image_1|>{
|
190
|
+
prompt = f"{user_prompt}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
|
177
191
|
_log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
|
178
192
|
|
179
193
|
return prompt
|
180
194
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
195
|
+
elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
|
196
|
+
messages = [
|
197
|
+
{
|
198
|
+
"role": "user",
|
199
|
+
"content": [
|
200
|
+
{
|
201
|
+
"type": "text",
|
202
|
+
"text": "This is a page from a document.",
|
203
|
+
},
|
204
|
+
{"type": "image"},
|
205
|
+
{"type": "text", "text": user_prompt},
|
206
|
+
],
|
207
|
+
}
|
208
|
+
]
|
209
|
+
prompt = self.processor.apply_chat_template(
|
210
|
+
messages, add_generation_prompt=False
|
211
|
+
)
|
212
|
+
return prompt
|
213
|
+
|
214
|
+
raise RuntimeError(
|
215
|
+
f"Uknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
|
196
216
|
)
|
197
|
-
return prompt
|
@@ -56,8 +56,6 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|
56
56
|
elif (artifacts_path / repo_cache_folder).exists():
|
57
57
|
artifacts_path = artifacts_path / repo_cache_folder
|
58
58
|
|
59
|
-
self.param_question = vlm_options.prompt
|
60
|
-
|
61
59
|
## Load the model
|
62
60
|
self.vlm_model, self.processor = load(artifacts_path)
|
63
61
|
self.config = load_config(artifacts_path)
|
@@ -86,8 +84,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|
86
84
|
if hi_res_image.mode != "RGB":
|
87
85
|
hi_res_image = hi_res_image.convert("RGB")
|
88
86
|
|
87
|
+
if callable(self.vlm_options.prompt):
|
88
|
+
user_prompt = self.vlm_options.prompt(page.parsed_page)
|
89
|
+
else:
|
90
|
+
user_prompt = self.vlm_options.prompt
|
89
91
|
prompt = self.apply_chat_template(
|
90
|
-
self.processor, self.config,
|
92
|
+
self.processor, self.config, user_prompt, num_images=1
|
91
93
|
)
|
92
94
|
|
93
95
|
start_time = time.time()
|
@@ -10,6 +10,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
10
10
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
11
11
|
from docling.datamodel.base_models import AssembledUnit, Page
|
12
12
|
from docling.datamodel.document import ConversionResult
|
13
|
+
from docling.datamodel.layout_model_specs import LayoutModelConfig
|
13
14
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
14
15
|
from docling.datamodel.settings import settings
|
15
16
|
from docling.models.base_ocr_model import BaseOcrModel
|
@@ -36,9 +37,6 @@ _log = logging.getLogger(__name__)
|
|
36
37
|
|
37
38
|
|
38
39
|
class StandardPdfPipeline(PaginatedPipeline):
|
39
|
-
_layout_model_path = LayoutModel._model_path
|
40
|
-
_table_model_path = TableStructureModel._model_path
|
41
|
-
|
42
40
|
def __init__(self, pipeline_options: PdfPipelineOptions):
|
43
41
|
super().__init__(pipeline_options)
|
44
42
|
self.pipeline_options: PdfPipelineOptions
|
@@ -129,6 +127,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
129
127
|
if (
|
130
128
|
self.pipeline_options.do_formula_enrichment
|
131
129
|
or self.pipeline_options.do_code_enrichment
|
130
|
+
or self.pipeline_options.do_picture_classification
|
132
131
|
or self.pipeline_options.do_picture_description
|
133
132
|
):
|
134
133
|
self.keep_backend = True
|
@@ -117,6 +117,7 @@ class VlmPipeline(PaginatedPipeline):
|
|
117
117
|
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
|
118
118
|
if page._backend is not None and page._backend.is_valid():
|
119
119
|
page.size = page._backend.get_size()
|
120
|
+
page.parsed_page = page._backend.get_segmented_page()
|
120
121
|
|
121
122
|
return page
|
122
123
|
|
@@ -2,6 +2,7 @@ import logging
|
|
2
2
|
from pathlib import Path
|
3
3
|
from typing import Optional
|
4
4
|
|
5
|
+
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2
|
5
6
|
from docling.datamodel.pipeline_options import (
|
6
7
|
granite_picture_description,
|
7
8
|
smolvlm_picture_description,
|
@@ -46,7 +47,7 @@ def download_models(
|
|
46
47
|
if with_layout:
|
47
48
|
_log.info("Downloading layout model...")
|
48
49
|
LayoutModel.download_models(
|
49
|
-
local_dir=output_dir /
|
50
|
+
local_dir=output_dir / DOCLING_LAYOUT_V2.model_repo_folder,
|
50
51
|
force=force,
|
51
52
|
progress=progress,
|
52
53
|
)
|
@@ -41,7 +41,7 @@ def tesseract_box_to_bounding_rectangle(
|
|
41
41
|
im_size: Tuple[int, int],
|
42
42
|
) -> BoundingRectangle:
|
43
43
|
# box is in the top, left, height, width format, top left coordinates
|
44
|
-
rect = rotate_bounding_box(bbox, angle
|
44
|
+
rect = rotate_bounding_box(bbox, angle=orientation, im_size=im_size)
|
45
45
|
rect = BoundingRectangle(
|
46
46
|
r_x0=rect.r_x0 / scale,
|
47
47
|
r_y0=rect.r_y0 / scale,
|