docling 2.34.0__py3-none-any.whl → 2.36.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/xml/jats_backend.py +0 -0
- docling/cli/main.py +48 -18
- docling/datamodel/accelerator_options.py +68 -0
- docling/datamodel/base_models.py +10 -8
- docling/datamodel/document.py +7 -2
- docling/datamodel/pipeline_options.py +29 -161
- docling/datamodel/pipeline_options_vlm_model.py +81 -0
- docling/datamodel/vlm_model_specs.py +144 -0
- docling/document_converter.py +5 -0
- docling/models/api_vlm_model.py +1 -1
- docling/models/base_ocr_model.py +2 -1
- docling/models/code_formula_model.py +6 -11
- docling/models/document_picture_classifier.py +6 -11
- docling/models/easyocr_model.py +1 -2
- docling/models/layout_model.py +22 -17
- docling/models/ocr_mac_model.py +1 -1
- docling/models/page_preprocessing_model.py +11 -6
- docling/models/picture_description_api_model.py +1 -1
- docling/models/picture_description_base_model.py +1 -1
- docling/models/picture_description_vlm_model.py +7 -22
- docling/models/rapid_ocr_model.py +1 -2
- docling/models/table_structure_model.py +6 -12
- docling/models/tesseract_ocr_cli_model.py +1 -1
- docling/models/tesseract_ocr_model.py +1 -1
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/hf_model_download.py +40 -0
- docling/models/vlm_models_inline/__init__.py +0 -0
- docling/models/vlm_models_inline/hf_transformers_model.py +194 -0
- docling/models/{hf_mlx_model.py → vlm_models_inline/mlx_model.py} +56 -44
- docling/pipeline/standard_pdf_pipeline.py +69 -57
- docling/pipeline/vlm_pipeline.py +228 -61
- docling/utils/accelerator_utils.py +17 -2
- docling/utils/model_downloader.py +13 -12
- {docling-2.34.0.dist-info → docling-2.36.0.dist-info}/METADATA +54 -55
- {docling-2.34.0.dist-info → docling-2.36.0.dist-info}/RECORD +48 -41
- {docling-2.34.0.dist-info → docling-2.36.0.dist-info}/WHEEL +2 -1
- docling-2.36.0.dist-info/entry_points.txt +6 -0
- docling-2.36.0.dist-info/top_level.txt +1 -0
- docling/models/hf_vlm_model.py +0 -182
- docling-2.34.0.dist-info/entry_points.txt +0 -7
- {docling-2.34.0.dist-info → docling-2.36.0.dist-info/licenses}/LICENSE +0 -0
docling/pipeline/vlm_pipeline.py
CHANGED
@@ -1,29 +1,46 @@
|
|
1
1
|
import logging
|
2
|
+
import re
|
2
3
|
from io import BytesIO
|
3
4
|
from pathlib import Path
|
4
5
|
from typing import List, Optional, Union, cast
|
5
6
|
|
6
|
-
from docling_core.types import
|
7
|
-
|
7
|
+
from docling_core.types.doc import (
|
8
|
+
BoundingBox,
|
9
|
+
DocItem,
|
10
|
+
DoclingDocument,
|
11
|
+
ImageRef,
|
12
|
+
PictureItem,
|
13
|
+
ProvenanceItem,
|
14
|
+
TextItem,
|
15
|
+
)
|
16
|
+
from docling_core.types.doc.base import (
|
17
|
+
BoundingBox,
|
18
|
+
Size,
|
19
|
+
)
|
8
20
|
from docling_core.types.doc.document import DocTagsDocument
|
9
21
|
from PIL import Image as PILImage
|
10
22
|
|
11
23
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
24
|
+
from docling.backend.html_backend import HTMLDocumentBackend
|
12
25
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
13
26
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
14
27
|
from docling.datamodel.base_models import InputFormat, Page
|
15
28
|
from docling.datamodel.document import ConversionResult, InputDocument
|
16
29
|
from docling.datamodel.pipeline_options import (
|
30
|
+
VlmPipelineOptions,
|
31
|
+
)
|
32
|
+
from docling.datamodel.pipeline_options_vlm_model import (
|
17
33
|
ApiVlmOptions,
|
18
|
-
HuggingFaceVlmOptions,
|
19
34
|
InferenceFramework,
|
35
|
+
InlineVlmOptions,
|
20
36
|
ResponseFormat,
|
21
|
-
VlmPipelineOptions,
|
22
37
|
)
|
23
38
|
from docling.datamodel.settings import settings
|
24
39
|
from docling.models.api_vlm_model import ApiVlmModel
|
25
|
-
from docling.models.
|
26
|
-
|
40
|
+
from docling.models.vlm_models_inline.hf_transformers_model import (
|
41
|
+
HuggingFaceTransformersVlmModel,
|
42
|
+
)
|
43
|
+
from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
|
27
44
|
from docling.pipeline.base_pipeline import PaginatedPipeline
|
28
45
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
29
46
|
|
@@ -66,8 +83,8 @@ class VlmPipeline(PaginatedPipeline):
|
|
66
83
|
vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
|
67
84
|
),
|
68
85
|
]
|
69
|
-
elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
|
70
|
-
vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
|
86
|
+
elif isinstance(self.pipeline_options.vlm_options, InlineVlmOptions):
|
87
|
+
vlm_options = cast(InlineVlmOptions, self.pipeline_options.vlm_options)
|
71
88
|
if vlm_options.inference_framework == InferenceFramework.MLX:
|
72
89
|
self.build_pipe = [
|
73
90
|
HuggingFaceMlxModel(
|
@@ -77,15 +94,19 @@ class VlmPipeline(PaginatedPipeline):
|
|
77
94
|
vlm_options=vlm_options,
|
78
95
|
),
|
79
96
|
]
|
80
|
-
|
97
|
+
elif vlm_options.inference_framework == InferenceFramework.TRANSFORMERS:
|
81
98
|
self.build_pipe = [
|
82
|
-
|
99
|
+
HuggingFaceTransformersVlmModel(
|
83
100
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
84
101
|
artifacts_path=artifacts_path,
|
85
102
|
accelerator_options=pipeline_options.accelerator_options,
|
86
103
|
vlm_options=vlm_options,
|
87
104
|
),
|
88
105
|
]
|
106
|
+
else:
|
107
|
+
raise ValueError(
|
108
|
+
f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
|
109
|
+
)
|
89
110
|
|
90
111
|
self.enrichment_pipe = [
|
91
112
|
# Other models working on `NodeItem` elements in the DoclingDocument
|
@@ -116,49 +137,19 @@ class VlmPipeline(PaginatedPipeline):
|
|
116
137
|
self.pipeline_options.vlm_options.response_format
|
117
138
|
== ResponseFormat.DOCTAGS
|
118
139
|
):
|
119
|
-
|
120
|
-
|
121
|
-
for page in conv_res.pages:
|
122
|
-
predicted_doctags = ""
|
123
|
-
img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
|
124
|
-
if page.predictions.vlm_response:
|
125
|
-
predicted_doctags = page.predictions.vlm_response.text
|
126
|
-
if page.image:
|
127
|
-
img = page.image
|
128
|
-
image_list.append(img)
|
129
|
-
doctags_list.append(predicted_doctags)
|
130
|
-
|
131
|
-
doctags_list_c = cast(List[Union[Path, str]], doctags_list)
|
132
|
-
image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
|
133
|
-
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
|
134
|
-
doctags_list_c, image_list_c
|
135
|
-
)
|
136
|
-
conv_res.document = DoclingDocument.load_from_doctags(doctags_doc)
|
137
|
-
|
138
|
-
# If forced backend text, replace model predicted text with backend one
|
139
|
-
if self.force_backend_text:
|
140
|
-
scale = self.pipeline_options.images_scale
|
141
|
-
for element, _level in conv_res.document.iterate_items():
|
142
|
-
if not isinstance(element, TextItem) or len(element.prov) == 0:
|
143
|
-
continue
|
144
|
-
page_ix = element.prov[0].page_no - 1
|
145
|
-
page = conv_res.pages[page_ix]
|
146
|
-
if not page.size:
|
147
|
-
continue
|
148
|
-
crop_bbox = (
|
149
|
-
element.prov[0]
|
150
|
-
.bbox.scaled(scale=scale)
|
151
|
-
.to_top_left_origin(page_height=page.size.height * scale)
|
152
|
-
)
|
153
|
-
txt = self.extract_text_from_backend(page, crop_bbox)
|
154
|
-
element.text = txt
|
155
|
-
element.orig = txt
|
140
|
+
conv_res.document = self._turn_dt_into_doc(conv_res)
|
141
|
+
|
156
142
|
elif (
|
157
143
|
self.pipeline_options.vlm_options.response_format
|
158
144
|
== ResponseFormat.MARKDOWN
|
159
145
|
):
|
160
146
|
conv_res.document = self._turn_md_into_doc(conv_res)
|
161
147
|
|
148
|
+
elif (
|
149
|
+
self.pipeline_options.vlm_options.response_format == ResponseFormat.HTML
|
150
|
+
):
|
151
|
+
conv_res.document = self._turn_html_into_doc(conv_res)
|
152
|
+
|
162
153
|
else:
|
163
154
|
raise RuntimeError(
|
164
155
|
f"Unsupported VLM response format {self.pipeline_options.vlm_options.response_format}"
|
@@ -192,23 +183,199 @@ class VlmPipeline(PaginatedPipeline):
|
|
192
183
|
|
193
184
|
return conv_res
|
194
185
|
|
195
|
-
def
|
196
|
-
|
197
|
-
|
186
|
+
def _turn_dt_into_doc(self, conv_res) -> DoclingDocument:
|
187
|
+
doctags_list = []
|
188
|
+
image_list = []
|
189
|
+
for page in conv_res.pages:
|
190
|
+
predicted_doctags = ""
|
191
|
+
img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
|
198
192
|
if page.predictions.vlm_response:
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
193
|
+
predicted_doctags = page.predictions.vlm_response.text
|
194
|
+
if page.image:
|
195
|
+
img = page.image
|
196
|
+
image_list.append(img)
|
197
|
+
doctags_list.append(predicted_doctags)
|
198
|
+
|
199
|
+
doctags_list_c = cast(List[Union[Path, str]], doctags_list)
|
200
|
+
image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
|
201
|
+
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
|
202
|
+
doctags_list_c, image_list_c
|
206
203
|
)
|
207
|
-
|
208
|
-
|
209
|
-
path_or_stream=response_bytes,
|
204
|
+
conv_res.document = DoclingDocument.load_from_doctags(
|
205
|
+
doctag_document=doctags_doc
|
210
206
|
)
|
211
|
-
|
207
|
+
|
208
|
+
# If forced backend text, replace model predicted text with backend one
|
209
|
+
if page.size:
|
210
|
+
if self.force_backend_text:
|
211
|
+
scale = self.pipeline_options.images_scale
|
212
|
+
for element, _level in conv_res.document.iterate_items():
|
213
|
+
if not isinstance(element, TextItem) or len(element.prov) == 0:
|
214
|
+
continue
|
215
|
+
crop_bbox = (
|
216
|
+
element.prov[0]
|
217
|
+
.bbox.scaled(scale=scale)
|
218
|
+
.to_top_left_origin(page_height=page.size.height * scale)
|
219
|
+
)
|
220
|
+
txt = self.extract_text_from_backend(page, crop_bbox)
|
221
|
+
element.text = txt
|
222
|
+
element.orig = txt
|
223
|
+
|
224
|
+
return conv_res.document
|
225
|
+
|
226
|
+
def _turn_md_into_doc(self, conv_res):
|
227
|
+
def _extract_markdown_code(text):
|
228
|
+
"""
|
229
|
+
Extracts text from markdown code blocks (enclosed in triple backticks).
|
230
|
+
If no code blocks are found, returns the original text.
|
231
|
+
|
232
|
+
Args:
|
233
|
+
text (str): Input text that may contain markdown code blocks
|
234
|
+
|
235
|
+
Returns:
|
236
|
+
str: Extracted code if code blocks exist, otherwise original text
|
237
|
+
"""
|
238
|
+
# Regex pattern to match content between triple backticks
|
239
|
+
# This handles multiline content and optional language specifier
|
240
|
+
pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
|
241
|
+
|
242
|
+
# Search with DOTALL flag to match across multiple lines
|
243
|
+
mtch = re.search(pattern, text, re.DOTALL)
|
244
|
+
|
245
|
+
if mtch:
|
246
|
+
# Return only the content of the first capturing group
|
247
|
+
return mtch.group(1)
|
248
|
+
else:
|
249
|
+
# No code blocks found, return original text
|
250
|
+
return text
|
251
|
+
|
252
|
+
for pg_idx, page in enumerate(conv_res.pages):
|
253
|
+
page_no = pg_idx + 1 # FIXME: might be incorrect
|
254
|
+
|
255
|
+
predicted_text = ""
|
256
|
+
if page.predictions.vlm_response:
|
257
|
+
predicted_text = page.predictions.vlm_response.text + "\n\n"
|
258
|
+
|
259
|
+
predicted_text = _extract_markdown_code(text=predicted_text)
|
260
|
+
|
261
|
+
response_bytes = BytesIO(predicted_text.encode("utf8"))
|
262
|
+
out_doc = InputDocument(
|
263
|
+
path_or_stream=response_bytes,
|
264
|
+
filename=conv_res.input.file.name,
|
265
|
+
format=InputFormat.MD,
|
266
|
+
backend=MarkdownDocumentBackend,
|
267
|
+
)
|
268
|
+
backend = MarkdownDocumentBackend(
|
269
|
+
in_doc=out_doc,
|
270
|
+
path_or_stream=response_bytes,
|
271
|
+
)
|
272
|
+
page_doc = backend.convert()
|
273
|
+
|
274
|
+
if page.image is not None:
|
275
|
+
pg_width = page.image.width
|
276
|
+
pg_height = page.image.height
|
277
|
+
else:
|
278
|
+
pg_width = 1
|
279
|
+
pg_height = 1
|
280
|
+
|
281
|
+
conv_res.document.add_page(
|
282
|
+
page_no=page_no,
|
283
|
+
size=Size(width=pg_width, height=pg_height),
|
284
|
+
image=ImageRef.from_pil(image=page.image, dpi=72)
|
285
|
+
if page.image
|
286
|
+
else None,
|
287
|
+
)
|
288
|
+
|
289
|
+
for item, level in page_doc.iterate_items():
|
290
|
+
item.prov = [
|
291
|
+
ProvenanceItem(
|
292
|
+
page_no=pg_idx + 1,
|
293
|
+
bbox=BoundingBox(
|
294
|
+
t=0.0, b=0.0, l=0.0, r=0.0
|
295
|
+
), # FIXME: would be nice not to have to "fake" it
|
296
|
+
charspan=[0, 0],
|
297
|
+
)
|
298
|
+
]
|
299
|
+
conv_res.document.append_child_item(child=item)
|
300
|
+
|
301
|
+
return conv_res.document
|
302
|
+
|
303
|
+
def _turn_html_into_doc(self, conv_res):
|
304
|
+
def _extract_html_code(text):
|
305
|
+
"""
|
306
|
+
Extracts text from markdown code blocks (enclosed in triple backticks).
|
307
|
+
If no code blocks are found, returns the original text.
|
308
|
+
|
309
|
+
Args:
|
310
|
+
text (str): Input text that may contain markdown code blocks
|
311
|
+
|
312
|
+
Returns:
|
313
|
+
str: Extracted code if code blocks exist, otherwise original text
|
314
|
+
"""
|
315
|
+
# Regex pattern to match content between triple backticks
|
316
|
+
# This handles multiline content and optional language specifier
|
317
|
+
pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
|
318
|
+
|
319
|
+
# Search with DOTALL flag to match across multiple lines
|
320
|
+
mtch = re.search(pattern, text, re.DOTALL)
|
321
|
+
|
322
|
+
if mtch:
|
323
|
+
# Return only the content of the first capturing group
|
324
|
+
return mtch.group(1)
|
325
|
+
else:
|
326
|
+
# No code blocks found, return original text
|
327
|
+
return text
|
328
|
+
|
329
|
+
for pg_idx, page in enumerate(conv_res.pages):
|
330
|
+
page_no = pg_idx + 1 # FIXME: might be incorrect
|
331
|
+
|
332
|
+
predicted_text = ""
|
333
|
+
if page.predictions.vlm_response:
|
334
|
+
predicted_text = page.predictions.vlm_response.text + "\n\n"
|
335
|
+
|
336
|
+
predicted_text = _extract_html_code(text=predicted_text)
|
337
|
+
|
338
|
+
response_bytes = BytesIO(predicted_text.encode("utf8"))
|
339
|
+
out_doc = InputDocument(
|
340
|
+
path_or_stream=response_bytes,
|
341
|
+
filename=conv_res.input.file.name,
|
342
|
+
format=InputFormat.MD,
|
343
|
+
backend=HTMLDocumentBackend,
|
344
|
+
)
|
345
|
+
backend = HTMLDocumentBackend(
|
346
|
+
in_doc=out_doc,
|
347
|
+
path_or_stream=response_bytes,
|
348
|
+
)
|
349
|
+
page_doc = backend.convert()
|
350
|
+
|
351
|
+
if page.image is not None:
|
352
|
+
pg_width = page.image.width
|
353
|
+
pg_height = page.image.height
|
354
|
+
else:
|
355
|
+
pg_width = 1
|
356
|
+
pg_height = 1
|
357
|
+
|
358
|
+
conv_res.document.add_page(
|
359
|
+
page_no=page_no,
|
360
|
+
size=Size(width=pg_width, height=pg_height),
|
361
|
+
image=ImageRef.from_pil(image=page.image, dpi=72)
|
362
|
+
if page.image
|
363
|
+
else None,
|
364
|
+
)
|
365
|
+
|
366
|
+
for item, level in page_doc.iterate_items():
|
367
|
+
item.prov = [
|
368
|
+
ProvenanceItem(
|
369
|
+
page_no=pg_idx + 1,
|
370
|
+
bbox=BoundingBox(
|
371
|
+
t=0.0, b=0.0, l=0.0, r=0.0
|
372
|
+
), # FIXME: would be nice not to have to "fake" it
|
373
|
+
charspan=[0, 0],
|
374
|
+
)
|
375
|
+
]
|
376
|
+
conv_res.document.append_child_item(child=item)
|
377
|
+
|
378
|
+
return conv_res.document
|
212
379
|
|
213
380
|
@classmethod
|
214
381
|
def get_default_options(cls) -> VlmPipelineOptions:
|
@@ -1,13 +1,16 @@
|
|
1
1
|
import logging
|
2
|
+
from typing import List, Optional
|
2
3
|
|
3
4
|
import torch
|
4
5
|
|
5
|
-
from docling.datamodel.
|
6
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice
|
6
7
|
|
7
8
|
_log = logging.getLogger(__name__)
|
8
9
|
|
9
10
|
|
10
|
-
def decide_device(
|
11
|
+
def decide_device(
|
12
|
+
accelerator_device: str, supported_devices: Optional[List[AcceleratorDevice]] = None
|
13
|
+
) -> str:
|
11
14
|
r"""
|
12
15
|
Resolve the device based on the acceleration options and the available devices in the system.
|
13
16
|
|
@@ -20,6 +23,18 @@ def decide_device(accelerator_device: str) -> str:
|
|
20
23
|
has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
|
21
24
|
has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
|
22
25
|
|
26
|
+
if supported_devices is not None:
|
27
|
+
if has_cuda and AcceleratorDevice.CUDA not in supported_devices:
|
28
|
+
_log.info(
|
29
|
+
f"Removing CUDA from available devices because it is not in {supported_devices=}"
|
30
|
+
)
|
31
|
+
has_cuda = False
|
32
|
+
if has_mps and AcceleratorDevice.MPS not in supported_devices:
|
33
|
+
_log.info(
|
34
|
+
f"Removing MPS from available devices because it is not in {supported_devices=}"
|
35
|
+
)
|
36
|
+
has_mps = False
|
37
|
+
|
23
38
|
if accelerator_device == AcceleratorDevice.AUTO.value: # Handle 'auto'
|
24
39
|
if has_cuda:
|
25
40
|
device = "cuda:0"
|
@@ -4,18 +4,20 @@ from typing import Optional
|
|
4
4
|
|
5
5
|
from docling.datamodel.pipeline_options import (
|
6
6
|
granite_picture_description,
|
7
|
-
smoldocling_vlm_conversion_options,
|
8
|
-
smoldocling_vlm_mlx_conversion_options,
|
9
7
|
smolvlm_picture_description,
|
10
8
|
)
|
11
9
|
from docling.datamodel.settings import settings
|
10
|
+
from docling.datamodel.vlm_model_specs import (
|
11
|
+
SMOLDOCLING_MLX,
|
12
|
+
SMOLDOCLING_TRANSFORMERS,
|
13
|
+
)
|
12
14
|
from docling.models.code_formula_model import CodeFormulaModel
|
13
15
|
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
14
16
|
from docling.models.easyocr_model import EasyOcrModel
|
15
|
-
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
16
17
|
from docling.models.layout_model import LayoutModel
|
17
18
|
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
18
19
|
from docling.models.table_structure_model import TableStructureModel
|
20
|
+
from docling.models.utils.hf_model_download import download_hf_model
|
19
21
|
|
20
22
|
_log = logging.getLogger(__name__)
|
21
23
|
|
@@ -75,7 +77,7 @@ def download_models(
|
|
75
77
|
|
76
78
|
if with_smolvlm:
|
77
79
|
_log.info("Downloading SmolVlm model...")
|
78
|
-
|
80
|
+
download_hf_model(
|
79
81
|
repo_id=smolvlm_picture_description.repo_id,
|
80
82
|
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
|
81
83
|
force=force,
|
@@ -84,26 +86,25 @@ def download_models(
|
|
84
86
|
|
85
87
|
if with_smoldocling:
|
86
88
|
_log.info("Downloading SmolDocling model...")
|
87
|
-
|
88
|
-
repo_id=smoldocling_vlm_conversion_options.repo_id,
|
89
|
-
local_dir=output_dir / smoldocling_vlm_conversion_options.repo_cache_folder,
|
89
|
+
download_hf_model(
|
90
|
+
repo_id=SMOLDOCLING_TRANSFORMERS.repo_id,
|
91
|
+
local_dir=output_dir / SMOLDOCLING_TRANSFORMERS.repo_cache_folder,
|
90
92
|
force=force,
|
91
93
|
progress=progress,
|
92
94
|
)
|
93
95
|
|
94
96
|
if with_smoldocling_mlx:
|
95
97
|
_log.info("Downloading SmolDocling MLX model...")
|
96
|
-
|
97
|
-
repo_id=smoldocling_vlm_mlx_conversion_options.repo_id,
|
98
|
-
local_dir=output_dir
|
99
|
-
/ smoldocling_vlm_mlx_conversion_options.repo_cache_folder,
|
98
|
+
download_hf_model(
|
99
|
+
repo_id=SMOLDOCLING_MLX.repo_id,
|
100
|
+
local_dir=output_dir / SMOLDOCLING_MLX.repo_cache_folder,
|
100
101
|
force=force,
|
101
102
|
progress=progress,
|
102
103
|
)
|
103
104
|
|
104
105
|
if with_granite_vision:
|
105
106
|
_log.info("Downloading Granite Vision model...")
|
106
|
-
|
107
|
+
download_hf_model(
|
107
108
|
repo_id=granite_picture_description.repo_id,
|
108
109
|
local_dir=output_dir / granite_picture_description.repo_cache_folder,
|
109
110
|
force=force,
|
@@ -1,67 +1,68 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.34.0
|
3
|
+
Version: 2.36.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
|
-
|
6
|
-
License: MIT
|
5
|
+
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
|
+
License-Expression: MIT
|
7
|
+
Project-URL: homepage, https://github.com/docling-project/docling
|
8
|
+
Project-URL: repository, https://github.com/docling-project/docling
|
9
|
+
Project-URL: issues, https://github.com/docling-project/docling/issues
|
10
|
+
Project-URL: changelog, https://github.com/docling-project/docling/blob/main/CHANGELOG.md
|
7
11
|
Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
|
8
|
-
|
9
|
-
|
10
|
-
|
12
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
13
|
+
Classifier: Operating System :: POSIX :: Linux
|
14
|
+
Classifier: Operating System :: Microsoft :: Windows
|
11
15
|
Classifier: Development Status :: 5 - Production/Stable
|
12
16
|
Classifier: Intended Audience :: Developers
|
13
17
|
Classifier: Intended Audience :: Science/Research
|
14
|
-
Classifier:
|
15
|
-
Classifier: Operating System :: MacOS :: MacOS X
|
16
|
-
Classifier: Operating System :: POSIX :: Linux
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
17
19
|
Classifier: Programming Language :: Python :: 3
|
18
20
|
Classifier: Programming Language :: Python :: 3.9
|
19
21
|
Classifier: Programming Language :: Python :: 3.10
|
20
22
|
Classifier: Programming Language :: Python :: 3.11
|
21
23
|
Classifier: Programming Language :: Python :: 3.12
|
22
24
|
Classifier: Programming Language :: Python :: 3.13
|
23
|
-
|
24
|
-
|
25
|
-
|
25
|
+
Requires-Python: <4.0,>=3.9
|
26
|
+
Description-Content-Type: text/markdown
|
27
|
+
License-File: LICENSE
|
28
|
+
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
|
+
Requires-Dist: docling-core[chunking]<3.0.0,>=2.29.0
|
30
|
+
Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
|
31
|
+
Requires-Dist: docling-parse<5.0.0,>=4.0.0
|
32
|
+
Requires-Dist: filetype<2.0.0,>=1.2.0
|
33
|
+
Requires-Dist: pypdfium2<5.0.0,>=4.30.0
|
34
|
+
Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
|
35
|
+
Requires-Dist: huggingface_hub<1,>=0.23
|
36
|
+
Requires-Dist: requests<3.0.0,>=2.32.2
|
37
|
+
Requires-Dist: easyocr<2.0,>=1.7
|
38
|
+
Requires-Dist: certifi>=2024.7.4
|
39
|
+
Requires-Dist: rtree<2.0.0,>=1.3.0
|
40
|
+
Requires-Dist: typer<0.16.0,>=0.12.5
|
41
|
+
Requires-Dist: python-docx<2.0.0,>=1.1.2
|
42
|
+
Requires-Dist: python-pptx<2.0.0,>=1.0.2
|
43
|
+
Requires-Dist: beautifulsoup4<5.0.0,>=4.12.3
|
44
|
+
Requires-Dist: pandas<3.0.0,>=2.1.4
|
45
|
+
Requires-Dist: marko<3.0.0,>=2.1.2
|
46
|
+
Requires-Dist: openpyxl<4.0.0,>=3.1.5
|
47
|
+
Requires-Dist: lxml<6.0.0,>=4.0.0
|
48
|
+
Requires-Dist: pillow<12.0.0,>=10.0.0
|
49
|
+
Requires-Dist: tqdm<5.0.0,>=4.65.0
|
50
|
+
Requires-Dist: pluggy<2.0.0,>=1.0.0
|
51
|
+
Requires-Dist: pylatexenc<3.0,>=2.10
|
52
|
+
Requires-Dist: click<8.2.0
|
53
|
+
Requires-Dist: scipy<2.0.0,>=1.6.0
|
26
54
|
Provides-Extra: tesserocr
|
55
|
+
Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
|
56
|
+
Provides-Extra: ocrmac
|
57
|
+
Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrmac"
|
27
58
|
Provides-Extra: vlm
|
28
|
-
Requires-Dist:
|
29
|
-
Requires-Dist:
|
30
|
-
Requires-Dist:
|
31
|
-
|
32
|
-
Requires-Dist:
|
33
|
-
Requires-Dist:
|
34
|
-
|
35
|
-
Requires-Dist: easyocr (>=1.7,<2.0)
|
36
|
-
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
37
|
-
Requires-Dist: huggingface_hub (>=0.23,<1)
|
38
|
-
Requires-Dist: lxml (>=4.0.0,<6.0.0)
|
39
|
-
Requires-Dist: marko (>=2.1.2,<3.0.0)
|
40
|
-
Requires-Dist: ocrmac (>=1.0.0,<2.0.0) ; (sys_platform == "darwin") and (extra == "ocrmac")
|
41
|
-
Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (extra == "rapidocr")
|
42
|
-
Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
|
43
|
-
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
44
|
-
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
45
|
-
Requires-Dist: pillow (>=10.0.0,<12.0.0)
|
46
|
-
Requires-Dist: pluggy (>=1.0.0,<2.0.0)
|
47
|
-
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
48
|
-
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
49
|
-
Requires-Dist: pylatexenc (>=2.10,<3.0)
|
50
|
-
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
51
|
-
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
52
|
-
Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
|
53
|
-
Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
|
54
|
-
Requires-Dist: requests (>=2.32.2,<3.0.0)
|
55
|
-
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
56
|
-
Requires-Dist: scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"
|
57
|
-
Requires-Dist: scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"
|
58
|
-
Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
|
59
|
-
Requires-Dist: tqdm (>=4.65.0,<5.0.0)
|
60
|
-
Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
|
61
|
-
Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
|
62
|
-
Requires-Dist: typer (>=0.12.5,<0.16.0)
|
63
|
-
Project-URL: Repository, https://github.com/docling-project/docling
|
64
|
-
Description-Content-Type: text/markdown
|
59
|
+
Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
|
60
|
+
Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
|
61
|
+
Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
|
62
|
+
Provides-Extra: rapidocr
|
63
|
+
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
64
|
+
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
65
|
+
Dynamic: license-file
|
65
66
|
|
66
67
|
<p align="center">
|
67
68
|
<a href="https://github.com/docling-project/docling">
|
@@ -79,9 +80,8 @@ Description-Content-Type: text/markdown
|
|
79
80
|
[](https://docling-project.github.io/docling/)
|
80
81
|
[](https://pypi.org/project/docling/)
|
81
82
|
[](https://pypi.org/project/docling/)
|
82
|
-
[](https://pycqa.github.io/isort/)
|
83
|
+
[](https://github.com/astral-sh/uv)
|
84
|
+
[](https://github.com/astral-sh/ruff)
|
85
85
|
[](https://pydantic.dev)
|
86
86
|
[](https://github.com/pre-commit/pre-commit)
|
87
87
|
[](https://opensource.org/licenses/MIT)
|
@@ -101,7 +101,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
101
101
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
102
102
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
103
103
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
104
|
-
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
104
|
+
* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
105
105
|
* 💻 Simple and convenient CLI
|
106
106
|
|
107
107
|
### Coming soon
|
@@ -214,4 +214,3 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
|
|
214
214
|
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
215
215
|
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
216
216
|
[integrations]: https://docling-project.github.io/docling/integrations/
|
217
|
-
|