docling-2.69.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
def ocr_engines():
    """Return the default OCR model classes under the ``"ocr_engines"`` key.

    The model modules are imported inside the function body, so they are only
    loaded when this function is actually called.
    """
    from docling.models.stages.ocr.auto_ocr_model import OcrAutoModel
    from docling.models.stages.ocr.easyocr_model import EasyOcrModel
    from docling.models.stages.ocr.ocr_mac_model import OcrMacModel
    from docling.models.stages.ocr.rapid_ocr_model import RapidOcrModel
    from docling.models.stages.ocr.tesseract_ocr_cli_model import TesseractOcrCliModel
    from docling.models.stages.ocr.tesseract_ocr_model import TesseractOcrModel

    # The list order is part of the returned value; keep it stable.
    engine_classes = [
        OcrAutoModel,
        EasyOcrModel,
        OcrMacModel,
        RapidOcrModel,
        TesseractOcrModel,
        TesseractOcrCliModel,
    ]
    return {"ocr_engines": engine_classes}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def picture_description():
    """Return the default picture-description model classes.

    The result is a dict with the single key ``"picture_description"``. The
    model modules are imported inside the function body, so they are only
    loaded when this function is actually called.
    """
    from docling.models.stages.picture_description.picture_description_api_model import (
        PictureDescriptionApiModel,
    )
    from docling.models.stages.picture_description.picture_description_vlm_model import (
        PictureDescriptionVlmModel,
    )

    # VLM model first, API model second, as in the returned list order.
    description_classes = [
        PictureDescriptionVlmModel,
        PictureDescriptionApiModel,
    ]
    return {"picture_description": description_classes}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def layout_engines():
    """Return the default layout model classes under the ``"layout_engines"`` key.

    The model modules are imported inside the function body, so they are only
    loaded when this function is actually called.
    """
    from docling.experimental.models.table_crops_layout_model import (
        TableCropsLayoutModel,
    )
    from docling.models.stages.layout.layout_model import LayoutModel

    layout_classes = [
        LayoutModel,
        TableCropsLayoutModel,
    ]
    return {"layout_engines": layout_classes}
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def table_structure_engines():
    """Return the default table-structure model classes.

    The result is a dict with the single key ``"table_structure_engines"``.
    The model module is imported inside the function body, so it is only
    loaded when this function is actually called.
    """
    from docling.models.stages.table_structure.table_structure_model import (
        TableStructureModel,
    )

    table_classes = [TableStructureModel]
    return {"table_structure_engines": table_classes}
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from collections.abc import Iterable
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import List, Literal, Optional, Tuple, Union
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from docling_core.types.doc import (
|
|
8
|
+
CodeItem,
|
|
9
|
+
DocItemLabel,
|
|
10
|
+
DoclingDocument,
|
|
11
|
+
NodeItem,
|
|
12
|
+
TextItem,
|
|
13
|
+
)
|
|
14
|
+
from docling_core.types.doc.labels import CodeLanguageLabel
|
|
15
|
+
from PIL import Image
|
|
16
|
+
from pydantic import BaseModel
|
|
17
|
+
|
|
18
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
|
19
|
+
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
|
|
20
|
+
from docling.models.base_model import BaseItemAndImageEnrichmentModel
|
|
21
|
+
from docling.models.utils.hf_model_download import download_hf_model
|
|
22
|
+
from docling.utils.accelerator_utils import decide_device
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class CodeFormulaModelOptions(BaseModel):
    """
    Switches that select which enrichments the CodeFormulaModel applies.

    Attributes
    ----------
    kind : Literal["code_formula"]
        Constant tag for this options type; the only accepted value is
        "code_formula".
    do_code_enrichment : bool
        Whether code items should be enriched. Defaults to True.
    do_formula_enrichment : bool
        Whether formula items should be enriched. Defaults to True.
    """

    kind: Literal["code_formula"] = "code_formula"
    do_code_enrichment: bool = True
    do_formula_enrichment: bool = True
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
    """
    Model for processing and enriching documents with code and formula predictions.

    For every processable element the model is prompted with the element image
    and a ``<code>`` or ``<formula>`` query; the decoded output replaces the
    element text. For code items the detected language is stored as well.

    Attributes
    ----------
    enabled : bool
        True if the model is enabled, False otherwise.
    options : CodeFormulaModelOptions
        Configuration options for the CodeFormulaModel.
    device
        Value returned by ``decide_device``. Only set when the model is enabled.
    _processor
        Processor loaded with ``AutoProcessor.from_pretrained``. Only set when
        the model is enabled.
    _model
        Model loaded with ``AutoModelForImageTextToText.from_pretrained`` and
        switched to eval mode. Only set when the model is enabled.
    _model_max_length : int
        ``model_max_length`` of the processor's tokenizer; used as the total
        token budget (prompt + generated) in ``__call__``.

    Methods
    -------
    __init__(self, enabled, artifacts_path, options, accelerator_options)
        Initializes the CodeFormulaModel with the given configuration options.
    download_models(local_dir, force, progress)
        Downloads the model repository and returns its local path.
    is_processable(self, doc, element)
        Determines if a given element in a document can be processed by the model.
    __call__(self, doc, element_batch)
        Processes the given batch of elements and enriches them with predictions.
    """

    _model_repo_folder = "docling-project--CodeFormulaV2"
    elements_batch_size = 5
    images_scale = 1.67  # = 120 dpi, aligned with training data resolution
    expansion_factor = 0.18

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        options: CodeFormulaModelOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """
        Initializes the CodeFormulaModel with the given configuration.

        Nothing is downloaded or loaded when ``enabled`` is False.

        Parameters
        ----------
        enabled : bool
            True if the model is enabled, False otherwise.
        artifacts_path : Optional[Path]
            Directory that contains the ``_model_repo_folder`` sub-directory with
            the model artifacts. If None, the artifacts are fetched through
            ``download_models()``.
        options : CodeFormulaModelOptions
            Configuration options for the model.
        accelerator_options : AcceleratorOptions
            Accelerator options; ``accelerator_options.device`` is passed to
            ``decide_device`` to pick the device.
        """
        self.enabled = enabled
        self.options = options

        if self.enabled:
            self.device = decide_device(
                accelerator_options.device,
                supported_devices=[
                    AcceleratorDevice.CPU,
                    AcceleratorDevice.CUDA,
                    AcceleratorDevice.XPU,
                ],
            )

            if artifacts_path is None:
                artifacts_path = self.download_models()
            else:
                artifacts_path = artifacts_path / self._model_repo_folder

            # Imported here so transformers is only required when the model is enabled.
            from transformers import AutoModelForImageTextToText, AutoProcessor

            self._processor = AutoProcessor.from_pretrained(
                artifacts_path,
            )
            self._model_max_length = self._processor.tokenizer.model_max_length
            self._model = AutoModelForImageTextToText.from_pretrained(
                artifacts_path, device_map=self.device
            )
            self._model.eval()

    @staticmethod
    def download_models(
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
    ) -> Path:
        """
        Downloads the ``docling-project/CodeFormulaV2`` repository (revision ``main``).

        Parameters
        ----------
        local_dir : Optional[Path]
            Forwarded to ``download_hf_model`` as ``local_dir``.
        force : bool
            Forwarded to ``download_hf_model`` as ``force``.
        progress : bool
            Forwarded to ``download_hf_model`` as ``progress``.

        Returns
        -------
        Path
            The path returned by ``download_hf_model``.
        """
        return download_hf_model(
            repo_id="docling-project/CodeFormulaV2",
            revision="main",
            local_dir=local_dir,
            force=force,
            progress=progress,
        )

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """
        Determines if a given element in a document can be processed by the model.

        Parameters
        ----------
        doc : DoclingDocument
            The document being processed.
        element : NodeItem
            The element within the document to check.

        Returns
        -------
        bool
            True if the model is enabled and the element is either a code item
            (with code enrichment on) or a text item labelled as formula (with
            formula enrichment on); False otherwise.
        """
        return self.enabled and (
            (isinstance(element, CodeItem) and self.options.do_code_enrichment)
            or (
                isinstance(element, TextItem)
                and element.label == DocItemLabel.FORMULA
                and self.options.do_formula_enrichment
            )
        )

    def _extract_code_language(self, input_string: str) -> Tuple[str, Optional[str]]:
        """
        Extracts a programming language from the beginning of a string.

        Checks whether the input starts with a marker of the form
        ``<_some_language_>``. The language name may not contain ``_`` or ``>``.

        Parameters
        ----------
        input_string : str
            The input string, which may start with ``<_language_>``.

        Returns
        -------
        Tuple[str, Optional[str]]
            ``(remainder, language)`` when the marker is present, where
            ``remainder`` is everything after the marker and any whitespace
            following it; ``(input_string, None)`` otherwise.
        """
        pattern = r"^<_([^_>]+)_>\s*(.*)"
        # DOTALL lets the remainder span multiple lines.
        match = re.match(pattern, input_string, flags=re.DOTALL)
        if match:
            language = str(match.group(1))  # the captured programming language
            remainder = str(match.group(2))  # everything after the <_language_>
            return remainder, language
        else:
            return input_string, None

    def _get_code_language_enum(self, value: Optional[str]) -> CodeLanguageLabel:
        """
        Converts a string to a corresponding `CodeLanguageLabel` enum member.

        Parameters
        ----------
        value : Optional[str]
            The string representation of the code language, or None.

        Returns
        -------
        CodeLanguageLabel
            The matching enum member, or `CodeLanguageLabel.UNKNOWN` when
            ``value`` is not a string or does not match any member value.
        """
        if not isinstance(value, str):
            return CodeLanguageLabel.UNKNOWN

        try:
            return CodeLanguageLabel(value)
        except ValueError:
            return CodeLanguageLabel.UNKNOWN

    def _get_prompt(self, label: str) -> str:
        """
        Constructs the prompt for the model based on the input label.

        Parameters
        ----------
        label : str
            The type of input, either 'code' or 'formula'.

        Returns
        -------
        str
            The chat-template prompt for one image followed by the ``<code>``
            or ``<formula>`` query.

        Raises
        ------
        NotImplementedError
            If the label is not 'code' or 'formula'.
        """
        if label == "code":
            query = "<code>"
        elif label == "formula":
            query = "<formula>"
        else:
            raise NotImplementedError("Label must be either code or formula")

        messages = [
            {
                "role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": query}],
            },
        ]

        prompt = self._processor.apply_chat_template(
            messages, add_generation_prompt=True
        )

        return prompt

    def _post_process(self, texts: list[str]) -> list[str]:
        """
        Cleans decoded model outputs.

        Each text is truncated at the first '<end_of_utterance>', stripped of a
        fixed set of unwanted substrings, and left-stripped of whitespace.

        Parameters
        ----------
        texts : list[str]
            A list of strings to be post-processed.

        Returns
        -------
        list[str]
            The cleaned strings, in the same order as the input.
        """
        to_remove = ["</code>", "</formula>", "<loc_0><loc_0><loc_500><loc_500>"]

        def clean_text(text: str) -> str:
            idx = text.find("<end_of_utterance>")
            if idx != -1:
                text = text[:idx]

            # str.replace is a no-op when the token is absent, so no membership
            # check is needed.
            for token in to_remove:
                text = text.replace(token, "")
            return text.lstrip()

        return [clean_text(t) for t in texts]

    def __call__(
        self,
        doc: DoclingDocument,
        element_batch: Iterable[ItemAndImageEnrichmentElement],
    ) -> Iterable[NodeItem]:
        """
        Processes the given batch of elements and enriches them with predictions.

        When the model is disabled the items are yielded unchanged. Otherwise
        every item's ``text`` is overwritten with the cleaned model output, and
        code items additionally get ``code_language`` set.

        Parameters
        ----------
        doc : DoclingDocument
            The document being processed.
        element_batch : Iterable[ItemAndImageEnrichmentElement]
            A batch of elements to be processed.

        Returns
        -------
        Iterable[NodeItem]
            The items of the batch, in input order.
        """
        if not self.enabled:
            for element in element_batch:
                yield element.item
            return

        labels: List[str] = []
        images: List[Union[Image.Image, np.ndarray]] = []
        elements: List[TextItem] = []
        for el in element_batch:
            elements.append(el.item)  # type: ignore[arg-type]
            labels.append(el.item.label)  # type: ignore[attr-defined]
            images.append(el.image)

        # Nothing to enrich: avoid calling the processor and the model with
        # empty prompt/image lists.
        if not elements:
            return

        prompts = [self._get_prompt(label) for label in labels]
        inputs = self._processor(
            text=prompts,
            images=images,
            return_tensors="pt",
        )
        inputs = inputs.to(self.device)

        prompt_length = inputs.input_ids.shape[1]
        gen_kwargs = dict(
            # Spend whatever is left of the tokenizer's max length on generation.
            max_new_tokens=self._model_max_length - prompt_length,
            use_cache=True,
            do_sample=False,
        )

        generated_ids = self._model.generate(**inputs, **gen_kwargs)

        # Decode only the tokens that follow the prompt portion. Special tokens
        # are kept because _post_process relies on '<end_of_utterance>'.
        outputs = self._processor.batch_decode(
            generated_ids[:, prompt_length:], skip_special_tokens=False
        )
        outputs = self._post_process(outputs)

        for item, output in zip(elements, outputs):
            if isinstance(item, CodeItem):
                output, code_language = self._extract_code_language(output)
                item.code_language = self._get_code_language_enum(code_language)
            item.text = output

            yield item
|
|
File without changes
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import logging
|
|
3
|
+
import warnings
|
|
4
|
+
from collections.abc import Sequence
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import List, Optional, Union
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from docling_core.types.doc import DocItemLabel
|
|
10
|
+
from PIL import Image
|
|
11
|
+
|
|
12
|
+
from docling.datamodel.accelerator_options import AcceleratorOptions
|
|
13
|
+
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
|
|
14
|
+
from docling.datamodel.document import ConversionResult
|
|
15
|
+
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
|
|
16
|
+
from docling.datamodel.pipeline_options import LayoutOptions
|
|
17
|
+
from docling.datamodel.settings import settings
|
|
18
|
+
from docling.models.base_layout_model import BaseLayoutModel
|
|
19
|
+
from docling.models.utils.hf_model_download import download_hf_model
|
|
20
|
+
from docling.utils.accelerator_utils import decide_device
|
|
21
|
+
from docling.utils.layout_postprocessor import LayoutPostprocessor
|
|
22
|
+
from docling.utils.profiling import TimeRecorder
|
|
23
|
+
from docling.utils.visualization import draw_clusters
|
|
24
|
+
|
|
25
|
+
_log = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class LayoutModel(BaseLayoutModel):
    """Layout stage that wraps ``LayoutPredictor`` from ``docling_ibm_models``.

    Pages are predicted in one batch, the raw predictions are turned into
    ``Cluster`` objects, post-processed with ``LayoutPostprocessor`` and stored
    on ``page.predictions.layout``.

    The label groups below are not referenced by the methods of this class;
    presumably they are consumed elsewhere.
    """

    TEXT_ELEM_LABELS = [
        DocItemLabel.TEXT,
        DocItemLabel.FOOTNOTE,
        DocItemLabel.CAPTION,
        DocItemLabel.CHECKBOX_UNSELECTED,
        DocItemLabel.CHECKBOX_SELECTED,
        DocItemLabel.SECTION_HEADER,
        DocItemLabel.PAGE_HEADER,
        DocItemLabel.PAGE_FOOTER,
        DocItemLabel.CODE,
        DocItemLabel.LIST_ITEM,
        DocItemLabel.FORMULA,
    ]
    PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]

    TABLE_LABELS = [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
    FIGURE_LABEL = DocItemLabel.PICTURE
    FORMULA_LABEL = DocItemLabel.FORMULA
    CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]

    def __init__(
        self,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        options: LayoutOptions,
    ):
        """Resolve the model location and build the ``LayoutPredictor``.

        Args:
            artifacts_path: Directory holding the model artifacts, or None to
                fetch them through ``download_models``.
            accelerator_options: Supplies the device passed to
                ``decide_device`` and the thread count for the predictor.
            options: Layout options; ``options.model_spec`` describes the model.
        """
        from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor

        self.options = options

        device = decide_device(accelerator_options.device)
        layout_model_config = options.model_spec
        model_repo_folder = layout_model_config.model_repo_folder
        model_path = layout_model_config.model_path

        if artifacts_path is not None:
            repo_dir = artifacts_path / model_repo_folder
            if repo_dir.exists():
                # Preferred layout: <artifacts_path>/<repo folder>/<model path>.
                artifacts_path = repo_dir / model_path
            elif (artifacts_path / model_path).exists():
                # Legacy layout: the model path sits directly in artifacts_path.
                # The warning stays inline so that stacklevel keeps its meaning.
                warnings.warn(
                    "The usage of artifacts_path containing directly "
                    f"{model_path} is deprecated. Please point "
                    "the artifacts_path to the parent containing "
                    f"the {model_repo_folder} folder.",
                    DeprecationWarning,
                    stacklevel=3,
                )
                artifacts_path = artifacts_path / model_path
            # Otherwise artifacts_path is handed to the predictor unchanged.
        else:
            download_root = self.download_models(
                layout_model_config=layout_model_config
            )
            artifacts_path = download_root / model_path

        self.layout_predictor = LayoutPredictor(
            artifact_path=str(artifacts_path),
            device=device,
            num_threads=accelerator_options.num_threads,
        )

    @classmethod
    def get_options_type(cls) -> type[LayoutOptions]:
        """Return the options class this model is configured with."""
        return LayoutOptions

    @staticmethod
    def download_models(
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
        layout_model_config: LayoutModelConfig = LayoutOptions().model_spec,  # use default
    ) -> Path:
        """Download the repository named by ``layout_model_config``.

        ``local_dir``, ``force`` and ``progress`` are forwarded unchanged to
        ``download_hf_model``, whose return value is returned.
        """
        repo_id = layout_model_config.repo_id
        revision = layout_model_config.revision
        return download_hf_model(
            repo_id=repo_id,
            revision=revision,
            local_dir=local_dir,
            force=force,
            progress=progress,
        )

    def draw_clusters_and_cells_side_by_side(
        self, conv_res, page, clusters, mode_prefix: str, show: bool = False
    ):
        """
        Render two annotated copies of the page image next to each other.

        - Left: clusters whose label is not FORM, KEY_VALUE_REGION or PICTURE.
        - Right: clusters whose label is FORM, KEY_VALUE_REGION or PICTURE.

        With ``show`` the combined image is displayed; otherwise it is saved as
        a PNG below ``settings.debug.debug_output_path``.
        """
        scale_x = page.image.width / page.size.width
        scale_y = page.image.height / page.size.height

        # Labels that are routed to the right-hand image.
        right_side_labels = {
            DocItemLabel.FORM,
            DocItemLabel.KEY_VALUE_REGION,
            DocItemLabel.PICTURE,
        }
        left_clusters = [c for c in clusters if c.label not in right_side_labels]
        right_clusters = [c for c in clusters if c.label in right_side_labels]

        # Independent copies so drawing never touches the page image itself.
        left_image = page.image.copy()
        right_image = page.image.copy()

        draw_clusters(left_image, left_clusters, scale_x, scale_y)
        draw_clusters(right_image, right_clusters, scale_x, scale_y)

        # Canvas wide enough for both copies.
        canvas_size = (left_image.width * 2, left_image.height)
        combined_image = Image.new("RGB", canvas_size)
        combined_image.paste(left_image, (0, 0))
        combined_image.paste(right_image, (left_image.width, 0))

        if show:
            combined_image.show()
            return

        out_path: Path = (
            Path(settings.debug.debug_output_path)
            / f"debug_{conv_res.input.file.stem}"
        )
        out_path.mkdir(parents=True, exist_ok=True)
        out_file = out_path / f"{mode_prefix}_layout_page_{page.page_no:05}.png"
        combined_image.save(str(out_file), format="png")

    def predict_layout(
        self,
        conv_res: ConversionResult,
        pages: Sequence[Page],
    ) -> Sequence[LayoutPrediction]:
        """Predict the layout of ``pages`` and return one prediction per page.

        Pages whose backend is not valid keep their existing layout prediction
        (or get an empty ``LayoutPrediction``). All other pages are predicted in
        a single batch, post-processed, and have their layout and OCR confidence
        scores written to ``conv_res.confidence``.
        """
        # Materialize once: the pages are walked twice below.
        pages = list(pages)

        # First pass: collect the images of all pages with a valid backend.
        valid_page_images: List[Union[Image.Image, np.ndarray]] = []
        for page in pages:
            assert page._backend is not None
            if not page._backend.is_valid():
                continue

            assert page.size is not None
            page_image = page.get_image(scale=1.0)
            assert page_image is not None

            valid_page_images.append(page_image)

        # One batched predictor call for all valid pages.
        batch_predictions = []
        if valid_page_images:
            with TimeRecorder(conv_res, "layout"):
                batch_predictions = self.layout_predictor.predict_batch(  # type: ignore[attr-defined]
                    valid_page_images
                )

        # Second pass: pair every valid page with its slot in the batch result.
        layout_predictions: list[LayoutPrediction] = []
        next_valid_idx = 0
        for page in pages:
            assert page._backend is not None
            if not page._backend.is_valid():
                fallback = page.predictions.layout or LayoutPrediction()
                page.predictions.layout = fallback
                layout_predictions.append(fallback)
                continue

            raw_items = batch_predictions[next_valid_idx]
            next_valid_idx += 1

            # Label strings are normalized before conversion.
            # Temporary, until docling-ibm-model uses docling-core types
            clusters = [
                Cluster(
                    id=ix,
                    label=DocItemLabel(
                        pred_item["label"].lower().replace(" ", "_").replace("-", "_")
                    ),
                    confidence=pred_item["confidence"],
                    bbox=BoundingBox.model_validate(pred_item),
                    cells=[],
                )
                for ix, pred_item in enumerate(raw_items)
            ]

            if settings.debug.visualize_raw_layout:
                self.draw_clusters_and_cells_side_by_side(
                    conv_res, page, clusters, mode_prefix="raw"
                )

            # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
            postprocessor = LayoutPostprocessor(page, clusters, self.options)
            processed_clusters, processed_cells = postprocessor.postprocess()

            with warnings.catch_warnings():
                # np.mean over an empty list warns; silence that here.
                warnings.filterwarnings(
                    "ignore",
                    "Mean of empty slice|invalid value encountered in scalar divide",
                    RuntimeWarning,
                    "numpy",
                )

                layout_score = float(
                    np.mean([c.confidence for c in processed_clusters])
                )
                conv_res.confidence.pages[page.page_no].layout_score = layout_score

                ocr_score = float(
                    np.mean([c.confidence for c in processed_cells if c.from_ocr])
                )
                conv_res.confidence.pages[page.page_no].ocr_score = ocr_score

            prediction = LayoutPrediction(clusters=processed_clusters)
            page.predictions.layout = prediction

            if settings.debug.visualize_layout:
                self.draw_clusters_and_cells_side_by_side(
                    conv_res, page, processed_clusters, mode_prefix="postprocessed"
                )

            layout_predictions.append(prediction)

        return layout_predictions
|
|
File without changes
|