docling 2.69.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0

docling/pipeline/vlm_pipeline.py
ADDED
@@ -0,0 +1,416 @@
import logging
import re
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Union, cast

from docling_core.types.doc import (
    BoundingBox,
    ContentLayer,
    DocItem,
    DoclingDocument,
    ImageRef,
    PictureItem,
    ProvenanceItem,
    TableCell,
    TableData,
    TextItem,
)
from docling_core.types.doc.base import (
    BoundingBox,
    Size,
)
from docling_core.types.doc.document import DocTagsDocument
from lxml import etree
from PIL import Image as PILImage

from docling.backend.abstract_backend import (
    AbstractDocumentBackend,
    DeclarativeDocumentBackend,
)
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import InputFormat, Page
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
)
from docling.datamodel.pipeline_options_vlm_model import (
    ApiVlmOptions,
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
)
from docling.datamodel.settings import settings
from docling.models.vlm_pipeline_models.api_vlm_model import ApiVlmModel
from docling.models.vlm_pipeline_models.hf_transformers_model import (
    HuggingFaceTransformersVlmModel,
)
from docling.models.vlm_pipeline_models.mlx_model import HuggingFaceMlxModel
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.deepseekocr_utils import parse_deepseekocr_markdown
from docling.utils.profiling import ProfilingScope, TimeRecorder

_log = logging.getLogger(__name__)


class VlmPipeline(PaginatedPipeline):
    def __init__(self, pipeline_options: VlmPipelineOptions):
        super().__init__(pipeline_options)
        self.keep_backend = True

        self.pipeline_options: VlmPipelineOptions

        # force_backend_text = False - use text that is coming from VLM response
        # force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling doctags
        self.force_backend_text = (
            pipeline_options.force_backend_text
            and pipeline_options.vlm_options.response_format == ResponseFormat.DOCTAGS
        )

        self.keep_images = self.pipeline_options.generate_page_images

        if isinstance(pipeline_options.vlm_options, ApiVlmOptions):
            self.build_pipe = [
                ApiVlmModel(
                    enabled=True,  # must be always enabled for this pipeline to make sense.
                    enable_remote_services=self.pipeline_options.enable_remote_services,
                    vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
                ),
            ]
        elif isinstance(self.pipeline_options.vlm_options, InlineVlmOptions):
            vlm_options = cast(InlineVlmOptions, self.pipeline_options.vlm_options)
            if vlm_options.inference_framework == InferenceFramework.MLX:
                self.build_pipe = [
                    HuggingFaceMlxModel(
                        enabled=True,  # must be always enabled for this pipeline to make sense.
                        artifacts_path=self.artifacts_path,
                        accelerator_options=pipeline_options.accelerator_options,
                        vlm_options=vlm_options,
                    ),
                ]
            elif vlm_options.inference_framework == InferenceFramework.TRANSFORMERS:
                self.build_pipe = [
                    HuggingFaceTransformersVlmModel(
                        enabled=True,  # must be always enabled for this pipeline to make sense.
                        artifacts_path=self.artifacts_path,
                        accelerator_options=pipeline_options.accelerator_options,
                        vlm_options=vlm_options,
                    ),
                ]
            elif vlm_options.inference_framework == InferenceFramework.VLLM:
                from docling.models.vlm_pipeline_models.vllm_model import VllmVlmModel

                self.build_pipe = [
                    VllmVlmModel(
                        enabled=True,  # must be always enabled for this pipeline to make sense.
                        artifacts_path=self.artifacts_path,
                        accelerator_options=pipeline_options.accelerator_options,
                        vlm_options=vlm_options,
                    ),
                ]
            else:
                raise ValueError(
                    f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
                )

        self.enrichment_pipe = [
            # Other models working on `NodeItem` elements in the DoclingDocument
        ]

    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
        with TimeRecorder(conv_res, "page_init"):
            images_scale = self.pipeline_options.images_scale
            if images_scale is not None:
                page._default_image_scale = images_scale
            page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
            if page._backend is not None and page._backend.is_valid():
                page.size = page._backend.get_size()

                if self.force_backend_text:
                    page.parsed_page = page._backend.get_segmented_page()

        return page

    def extract_text_from_backend(
        self, page: Page, bbox: Union[BoundingBox, None]
    ) -> str:
        # Convert bounding box normalized to 0-100 into page coordinates for cropping
        text = ""
        if bbox:
            if page.size:
                if page._backend:
                    text = page._backend.get_text_in_rect(bbox)
        return text

    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
            if (
                self.pipeline_options.vlm_options.response_format
                == ResponseFormat.DOCTAGS
            ):
                conv_res.document = self._turn_dt_into_doc(conv_res)

            elif (
                self.pipeline_options.vlm_options.response_format
                == ResponseFormat.DEEPSEEKOCR_MARKDOWN
            ):
                conv_res.document = self._parse_deepseekocr_markdown(conv_res)

            elif (
                self.pipeline_options.vlm_options.response_format
                == ResponseFormat.MARKDOWN
            ):
                conv_res.document = self._convert_text_with_backend(
                    conv_res, InputFormat.MD, MarkdownDocumentBackend
                )

            elif (
                self.pipeline_options.vlm_options.response_format == ResponseFormat.HTML
            ):
                conv_res.document = self._convert_text_with_backend(
                    conv_res, InputFormat.HTML, HTMLDocumentBackend
                )

            else:
                raise RuntimeError(
                    f"Unsupported VLM response format {self.pipeline_options.vlm_options.response_format}"
                )

            # Generate images of the requested element types
            if self.pipeline_options.generate_picture_images:
                scale = self.pipeline_options.images_scale
                for element, _level in conv_res.document.iterate_items():
                    if not isinstance(element, DocItem) or len(element.prov) == 0:
                        continue
                    if (
                        isinstance(element, PictureItem)
                        and self.pipeline_options.generate_picture_images
                    ):
                        page_ix = element.prov[0].page_no - 1
                        page = conv_res.pages[page_ix]
                        assert page.size is not None
                        assert page.image is not None

                        crop_bbox = (
                            element.prov[0]
                            .bbox.scaled(scale=scale)
                            .to_top_left_origin(page_height=page.size.height * scale)
                        )

                        cropped_im = page.image.crop(crop_bbox.as_tuple())
                        element.image = ImageRef.from_pil(
                            cropped_im, dpi=int(72 * scale)
                        )

        return conv_res

    def _turn_dt_into_doc(self, conv_res) -> DoclingDocument:
        doctags_list = []
        image_list = []
        for page in conv_res.pages:
            predicted_doctags = ""
            img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
            if page.predictions.vlm_response:
                predicted_doctags = page.predictions.vlm_response.text
            if page.image:
                img = page.image
            image_list.append(img)
            doctags_list.append(predicted_doctags)

        doctags_list_c = cast(List[Union[Path, str]], doctags_list)
        image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
            doctags_list_c, image_list_c
        )
        conv_res.document = DoclingDocument.load_from_doctags(
            doctag_document=doctags_doc
        )

        # If forced backend text, replace model predicted text with backend one
        if page.size:
            if self.force_backend_text:
                scale = self.pipeline_options.images_scale
                for element, _level in conv_res.document.iterate_items():
                    if not isinstance(element, TextItem) or len(element.prov) == 0:
                        continue
                    crop_bbox = (
                        element.prov[0]
                        .bbox.scaled(scale=scale)
                        .to_top_left_origin(page_height=page.size.height * scale)
                    )
                    txt = self.extract_text_from_backend(page, crop_bbox)
                    element.text = txt
                    element.orig = txt

        return conv_res.document

    def _parse_deepseekocr_markdown(
        self, conv_res: ConversionResult
    ) -> DoclingDocument:
        """Parse DeepSeek OCR markdown with label[[x1, y1, x2, y2]] format.

        Labels supported:
        - text: Standard body text
        - title: Main document or section titles
        - sub_title: Secondary headings or sub-headers
        - table: Tabular data
        - table_caption: Descriptive text for tables
        - figure: Image-based elements or diagrams
        - figure_caption: Titles or descriptions for figures/images
        - header / footer: Content at top or bottom margins of pages
        """
        page_docs = []

        for pg_idx, page in enumerate(conv_res.pages):
            predicted_text = ""
            if page.predictions.vlm_response:
                predicted_text = page.predictions.vlm_response.text

            assert page.size is not None

            # Parse single page using the utility function
            # Pass vlm_options.scale to convert bboxes from scaled image coords to original PDF coords
            page_doc = parse_deepseekocr_markdown(
                content=predicted_text,
                original_page_size=page.size,
                page_no=pg_idx + 1,
                filename=conv_res.input.file.name or "file",
                page_image=page.image,
            )
            page_docs.append(page_doc)

        # Add page metadata and concatenate
        return self._add_page_metadata_and_concatenate(page_docs, conv_res)

    def _extract_code_block(self, text: str) -> str:
        """
        Extracts text from markdown code blocks (enclosed in triple backticks).
        If no code blocks are found, returns the original text.

        Args:
            text (str): Input text that may contain markdown code blocks

        Returns:
            str: Extracted code if code blocks exist, otherwise original text
        """
        # Regex pattern to match content between triple backticks
        # This handles multiline content and optional language specifier
        pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"

        # Search with DOTALL flag to match across multiple lines
        mtch = re.search(pattern, text, re.DOTALL)

        if mtch:
            # Return only the content of the first capturing group
            return mtch.group(1)
        else:
            # No code blocks found, return original text
            return text

    def _add_page_metadata_and_concatenate(
        self,
        page_docs: List[DoclingDocument],
        conv_res: ConversionResult,
    ) -> DoclingDocument:
        """
        Add page metadata to page documents and concatenate them.

        Args:
            page_docs: List of page documents to process
            conv_res: Conversion result containing page information

        Returns:
            DoclingDocument: Concatenated document with page metadata
        """
        for pg_idx, (page_doc, page) in enumerate(zip(page_docs, conv_res.pages)):
            # Add page metadata to the page document before concatenation
            if page.image is not None:
                pg_width = page.image.width
                pg_height = page.image.height
            else:
                pg_width = 1
                pg_height = 1

            page_doc.add_page(
                page_no=pg_idx + 1,
                size=Size(width=pg_width, height=pg_height),
                image=ImageRef.from_pil(image=page.image, dpi=72)
                if page.image
                else None,
            )

        # Concatenate all page documents to preserve hierarchy
        return DoclingDocument.concatenate(docs=page_docs)

    def _convert_text_with_backend(
        self,
        conv_res: ConversionResult,
        input_format: InputFormat,
        backend_class: type[DeclarativeDocumentBackend],
    ) -> DoclingDocument:
        """
        Convert text-based formats (Markdown, HTML) into DoclingDocument using a backend.

        Args:
            conv_res: The conversion result containing pages with VLM predictions
            input_format: The format type (MD or HTML)
            backend_class: The backend class to use for conversion

        Returns:
            DoclingDocument: The assembled document
        """
        page_docs = []

        for pg_idx, page in enumerate(conv_res.pages):
            predicted_text = ""
            if page.predictions.vlm_response:
                predicted_text = page.predictions.vlm_response.text + "\n\n"

            # Extract content from code blocks if present
            predicted_text = self._extract_code_block(text=predicted_text)

            # Convert text to document using specified backend
            response_bytes = BytesIO(predicted_text.encode("utf8"))
            out_doc = InputDocument(
                path_or_stream=response_bytes,
                filename=conv_res.input.file.name,
                format=input_format,
                backend=backend_class,
            )
            backend = backend_class(
                in_doc=out_doc,
                path_or_stream=response_bytes,
            )
            page_doc = backend.convert()

            # Modify provenance in place for all items in the page document
            for item, level in page_doc.iterate_items(
                with_groups=True,
                traverse_pictures=True,
                included_content_layers=set(ContentLayer),
            ):
                if isinstance(item, DocItem):
                    item.prov = [
                        ProvenanceItem(
                            page_no=pg_idx + 1,
                            bbox=BoundingBox(
                                t=0.0, b=0.0, l=0.0, r=0.0
                            ),  # FIXME: would be nice not to have to "fake" it
                            charspan=[0, 0],
                        )
                    ]

            page_docs.append(page_doc)

        # Add page metadata and concatenate
        return self._add_page_metadata_and_concatenate(page_docs, conv_res)

    @classmethod
    def get_default_options(cls) -> VlmPipelineOptions:
        return VlmPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        return isinstance(backend, PdfDocumentBackend)
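For context, a minimal usage sketch follows, showing how this pipeline is typically selected through docling's converter API. It is not part of the wheel contents: DocumentConverter, PdfFormatOption, and the sample file path come from docling's public documentation and are assumptions that may differ slightly in this exact version.

# Hedged usage sketch (not part of the package diff). Assumes DocumentConverter and
# PdfFormatOption behave as in docling's public documentation; "report.pdf" is illustrative.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=VlmPipelineOptions(),
        )
    }
)
result = converter.convert("report.pdf")
print(result.document.export_to_markdown())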
docling/py.typed
ADDED
@@ -0,0 +1 @@
(empty marker file: a single blank line, no further content)
docling/utils/accelerator_utils.py
ADDED
@@ -0,0 +1,97 @@
import logging
from typing import List, Optional

from docling.datamodel.accelerator_options import AcceleratorDevice

_log = logging.getLogger(__name__)


def decide_device(
    accelerator_device: str, supported_devices: Optional[List[AcceleratorDevice]] = None
) -> str:
    r"""
    Resolve the device based on the acceleration options and the available devices in the system.

    Rules:
    1. AUTO: Check for the best available device on the system.
    2. User-defined: Check if the device actually exists, otherwise fall-back to CPU
    """
    import torch

    device = "cpu"

    has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
    has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
    has_xpu = hasattr(torch, "xpu") and torch.xpu.is_available()

    if supported_devices is not None:
        if has_cuda and AcceleratorDevice.CUDA not in supported_devices:
            _log.info(
                f"Removing CUDA from available devices because it is not in {supported_devices=}"
            )
            has_cuda = False
        if has_mps and AcceleratorDevice.MPS not in supported_devices:
            _log.info(
                f"Removing MPS from available devices because it is not in {supported_devices=}"
            )
            has_mps = False
        if has_xpu and AcceleratorDevice.XPU not in supported_devices:
            _log.info(
                f"Removing XPU from available devices because it is not in {supported_devices=}"
            )
            has_xpu = False

    if accelerator_device == AcceleratorDevice.AUTO.value:  # Handle 'auto'
        if has_cuda:
            device = "cuda:0"
        elif has_mps:
            device = "mps"
        elif has_xpu:
            device = "xpu"

    elif accelerator_device.startswith("cuda"):
        if has_cuda:
            # if cuda device index specified extract device id
            parts = accelerator_device.split(":")
            if len(parts) == 2 and parts[1].isdigit():
                # select cuda device's id
                cuda_index = int(parts[1])
                if cuda_index < torch.cuda.device_count():
                    device = f"cuda:{cuda_index}"
                else:
                    _log.warning(
                        "CUDA device 'cuda:%d' is not available. Fall back to 'CPU'.",
                        cuda_index,
                    )
            elif len(parts) == 1:  # just "cuda"
                device = "cuda:0"
            else:
                _log.warning(
                    "Invalid CUDA device format '%s'. Fall back to 'CPU'",
                    accelerator_device,
                )
        else:
            _log.warning("CUDA is not available in the system. Fall back to 'CPU'")

    elif accelerator_device == AcceleratorDevice.MPS.value:
        if has_mps:
            device = "mps"
        else:
            _log.warning("MPS is not available in the system. Fall back to 'CPU'")

    elif accelerator_device == AcceleratorDevice.XPU.value:
        if has_xpu:
            device = "xpu"
        else:
            _log.warning("XPU is not available in the system. Fall back to 'CPU'")

    elif accelerator_device == AcceleratorDevice.CPU.value:
        device = "cpu"

    else:
        _log.warning(
            "Unknown device option '%s'. Fall back to 'CPU'", accelerator_device
        )

    _log.info("Accelerator device: '%s'", device)
    return device
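As a quick illustration, the sketch below calls decide_device with the inputs handled by the branches above; it uses only names defined in this file plus AcceleratorDevice, and the expected return values are inferred from those branches rather than documented behavior.

# Sketch of decide_device usage, based only on the function shown above.
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.utils.accelerator_utils import decide_device

best = decide_device(AcceleratorDevice.AUTO.value)  # "cuda:0", "mps", "xpu" or "cpu", whichever is available
pinned = decide_device("cuda:1")  # "cuda:1" if that index exists, otherwise falls back to "cpu"
cpu_only = decide_device("cuda:0", supported_devices=[AcceleratorDevice.CPU])  # CUDA filtered out, returns "cpu"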