docling 2.51.0__py3-none-any.whl → 2.53.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/cli/main.py +44 -1
- docling/cli/models.py +4 -0
- docling/datamodel/pipeline_options.py +21 -12
- docling/datamodel/vlm_model_specs.py +30 -0
- docling/models/base_model.py +27 -2
- docling/models/easyocr_model.py +19 -9
- docling/models/picture_description_vlm_model.py +1 -1
- docling/models/rapid_ocr_model.py +40 -25
- docling/models/vlm_models_inline/hf_transformers_model.py +1 -1
- docling/models/vlm_models_inline/nuextract_transformers_model.py +1 -1
- docling/pipeline/asr_pipeline.py +1 -13
- docling/pipeline/base_extraction_pipeline.py +17 -3
- docling/pipeline/base_pipeline.py +75 -9
- docling/pipeline/extraction_vlm_pipeline.py +9 -16
- docling/pipeline/simple_pipeline.py +6 -6
- docling/pipeline/standard_pdf_pipeline.py +6 -55
- docling/pipeline/threaded_standard_pdf_pipeline.py +102 -62
- docling/pipeline/vlm_pipeline.py +3 -15
- docling/utils/model_downloader.py +22 -0
- {docling-2.51.0.dist-info → docling-2.53.0.dist-info}/METADATA +10 -6
- {docling-2.51.0.dist-info → docling-2.53.0.dist-info}/RECORD +25 -25
- {docling-2.51.0.dist-info → docling-2.53.0.dist-info}/WHEEL +0 -0
- {docling-2.51.0.dist-info → docling-2.53.0.dist-info}/entry_points.txt +0 -0
- {docling-2.51.0.dist-info → docling-2.53.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.51.0.dist-info → docling-2.53.0.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import inspect
|
|
2
2
|
import json
|
|
3
3
|
import logging
|
|
4
|
-
from pathlib import Path
|
|
5
4
|
from typing import Optional
|
|
6
5
|
|
|
7
6
|
from PIL.Image import Image
|
|
@@ -16,7 +15,10 @@ from docling.datamodel.extraction import (
|
|
|
16
15
|
ExtractionResult,
|
|
17
16
|
ExtractionTemplateType,
|
|
18
17
|
)
|
|
19
|
-
from docling.datamodel.pipeline_options import
|
|
18
|
+
from docling.datamodel.pipeline_options import (
|
|
19
|
+
PipelineOptions,
|
|
20
|
+
VlmExtractionPipelineOptions,
|
|
21
|
+
)
|
|
20
22
|
from docling.datamodel.settings import settings
|
|
21
23
|
from docling.models.vlm_models_inline.nuextract_transformers_model import (
|
|
22
24
|
NuExtractTransformersModel,
|
|
@@ -35,22 +37,10 @@ class ExtractionVlmPipeline(BaseExtractionPipeline):
|
|
|
35
37
|
self.accelerator_options = pipeline_options.accelerator_options
|
|
36
38
|
self.pipeline_options: VlmExtractionPipelineOptions
|
|
37
39
|
|
|
38
|
-
artifacts_path: Optional[Path] = None
|
|
39
|
-
if pipeline_options.artifacts_path is not None:
|
|
40
|
-
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
|
41
|
-
elif settings.artifacts_path is not None:
|
|
42
|
-
artifacts_path = Path(settings.artifacts_path).expanduser()
|
|
43
|
-
|
|
44
|
-
if artifacts_path is not None and not artifacts_path.is_dir():
|
|
45
|
-
raise RuntimeError(
|
|
46
|
-
f"The value of {artifacts_path=} is not valid. "
|
|
47
|
-
"When defined, it must point to a folder containing all models required by the pipeline."
|
|
48
|
-
)
|
|
49
|
-
|
|
50
40
|
# Create VLM model instance
|
|
51
41
|
self.vlm_model = NuExtractTransformersModel(
|
|
52
42
|
enabled=True,
|
|
53
|
-
artifacts_path=artifacts_path, # Will download automatically
|
|
43
|
+
artifacts_path=self.artifacts_path, # Will download automatically
|
|
54
44
|
accelerator_options=self.accelerator_options,
|
|
55
45
|
vlm_options=pipeline_options.vlm_options,
|
|
56
46
|
)
|
|
@@ -194,11 +184,14 @@ class ExtractionVlmPipeline(BaseExtractionPipeline):
|
|
|
194
184
|
class ExtractionTemplateFactory(ModelFactory[template]): # type: ignore
|
|
195
185
|
__use_examples__ = True # prefer Field(examples=...) when present
|
|
196
186
|
__use_defaults__ = True # use field defaults instead of random values
|
|
187
|
+
__check_model__ = (
|
|
188
|
+
True # setting the value to avoid deprecation warnings
|
|
189
|
+
)
|
|
197
190
|
|
|
198
191
|
return ExtractionTemplateFactory.build().model_dump_json(indent=2) # type: ignore
|
|
199
192
|
else:
|
|
200
193
|
raise ValueError(f"Unsupported template type: {type(template)}")
|
|
201
194
|
|
|
202
195
|
@classmethod
|
|
203
|
-
def get_default_options(cls) ->
|
|
196
|
+
def get_default_options(cls) -> PipelineOptions:
|
|
204
197
|
return VlmExtractionPipelineOptions()
|
|
@@ -6,21 +6,21 @@ from docling.backend.abstract_backend import (
|
|
|
6
6
|
)
|
|
7
7
|
from docling.datamodel.base_models import ConversionStatus
|
|
8
8
|
from docling.datamodel.document import ConversionResult
|
|
9
|
-
from docling.datamodel.pipeline_options import
|
|
10
|
-
from docling.pipeline.base_pipeline import BasePipeline
|
|
9
|
+
from docling.datamodel.pipeline_options import ConvertPipelineOptions
|
|
10
|
+
from docling.pipeline.base_pipeline import ConvertPipeline
|
|
11
11
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
|
12
12
|
|
|
13
13
|
_log = logging.getLogger(__name__)
|
|
14
14
|
|
|
15
15
|
|
|
16
|
-
class SimplePipeline(BasePipeline):
|
|
16
|
+
class SimplePipeline(ConvertPipeline):
|
|
17
17
|
"""SimpleModelPipeline.
|
|
18
18
|
|
|
19
19
|
This class is used at the moment for formats / backends
|
|
20
20
|
which produce straight DoclingDocument output.
|
|
21
21
|
"""
|
|
22
22
|
|
|
23
|
-
def __init__(self, pipeline_options:
|
|
23
|
+
def __init__(self, pipeline_options: ConvertPipelineOptions):
|
|
24
24
|
super().__init__(pipeline_options)
|
|
25
25
|
|
|
26
26
|
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
|
@@ -47,8 +47,8 @@ class SimplePipeline(BasePipeline):
|
|
|
47
47
|
return ConversionStatus.SUCCESS
|
|
48
48
|
|
|
49
49
|
@classmethod
|
|
50
|
-
def get_default_options(cls) ->
|
|
51
|
-
return
|
|
50
|
+
def get_default_options(cls) -> ConvertPipelineOptions:
|
|
51
|
+
return ConvertPipelineOptions()
|
|
52
52
|
|
|
53
53
|
@classmethod
|
|
54
54
|
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
|
@@ -15,18 +15,13 @@ from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|
|
15
15
|
from docling.datamodel.settings import settings
|
|
16
16
|
from docling.models.base_ocr_model import BaseOcrModel
|
|
17
17
|
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
|
|
18
|
-
from docling.models.
|
|
19
|
-
DocumentPictureClassifier,
|
|
20
|
-
DocumentPictureClassifierOptions,
|
|
21
|
-
)
|
|
22
|
-
from docling.models.factories import get_ocr_factory, get_picture_description_factory
|
|
18
|
+
from docling.models.factories import get_ocr_factory
|
|
23
19
|
from docling.models.layout_model import LayoutModel
|
|
24
20
|
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
|
25
21
|
from docling.models.page_preprocessing_model import (
|
|
26
22
|
PagePreprocessingModel,
|
|
27
23
|
PagePreprocessingOptions,
|
|
28
24
|
)
|
|
29
|
-
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
|
30
25
|
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
|
|
31
26
|
from docling.models.table_structure_model import TableStructureModel
|
|
32
27
|
from docling.pipeline.base_pipeline import PaginatedPipeline
|
|
@@ -41,18 +36,6 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
|
41
36
|
super().__init__(pipeline_options)
|
|
42
37
|
self.pipeline_options: PdfPipelineOptions
|
|
43
38
|
|
|
44
|
-
artifacts_path: Optional[Path] = None
|
|
45
|
-
if pipeline_options.artifacts_path is not None:
|
|
46
|
-
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
|
47
|
-
elif settings.artifacts_path is not None:
|
|
48
|
-
artifacts_path = Path(settings.artifacts_path).expanduser()
|
|
49
|
-
|
|
50
|
-
if artifacts_path is not None and not artifacts_path.is_dir():
|
|
51
|
-
raise RuntimeError(
|
|
52
|
-
f"The value of {artifacts_path=} is not valid. "
|
|
53
|
-
"When defined, it must point to a folder containing all models required by the pipeline."
|
|
54
|
-
)
|
|
55
|
-
|
|
56
39
|
with warnings.catch_warnings(): # deprecated generate_table_images
|
|
57
40
|
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
|
58
41
|
self.keep_images = (
|
|
@@ -63,7 +46,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
|
63
46
|
|
|
64
47
|
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
|
65
48
|
|
|
66
|
-
ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
|
|
49
|
+
ocr_model = self.get_ocr_model(artifacts_path=self.artifacts_path)
|
|
67
50
|
|
|
68
51
|
self.build_pipe = [
|
|
69
52
|
# Pre-processing
|
|
@@ -76,14 +59,14 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
|
76
59
|
ocr_model,
|
|
77
60
|
# Layout model
|
|
78
61
|
LayoutModel(
|
|
79
|
-
artifacts_path=artifacts_path,
|
|
62
|
+
artifacts_path=self.artifacts_path,
|
|
80
63
|
accelerator_options=pipeline_options.accelerator_options,
|
|
81
64
|
options=pipeline_options.layout_options,
|
|
82
65
|
),
|
|
83
66
|
# Table structure model
|
|
84
67
|
TableStructureModel(
|
|
85
68
|
enabled=pipeline_options.do_table_structure,
|
|
86
|
-
artifacts_path=artifacts_path,
|
|
69
|
+
artifacts_path=self.artifacts_path,
|
|
87
70
|
options=pipeline_options.table_structure_options,
|
|
88
71
|
accelerator_options=pipeline_options.accelerator_options,
|
|
89
72
|
),
|
|
@@ -91,37 +74,19 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
|
91
74
|
PageAssembleModel(options=PageAssembleOptions()),
|
|
92
75
|
]
|
|
93
76
|
|
|
94
|
-
# Picture description model
|
|
95
|
-
if (
|
|
96
|
-
picture_description_model := self.get_picture_description_model(
|
|
97
|
-
artifacts_path=artifacts_path
|
|
98
|
-
)
|
|
99
|
-
) is None:
|
|
100
|
-
raise RuntimeError(
|
|
101
|
-
f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
|
|
102
|
-
)
|
|
103
|
-
|
|
104
77
|
self.enrichment_pipe = [
|
|
105
78
|
# Code Formula Enrichment Model
|
|
106
79
|
CodeFormulaModel(
|
|
107
80
|
enabled=pipeline_options.do_code_enrichment
|
|
108
81
|
or pipeline_options.do_formula_enrichment,
|
|
109
|
-
artifacts_path=artifacts_path,
|
|
82
|
+
artifacts_path=self.artifacts_path,
|
|
110
83
|
options=CodeFormulaModelOptions(
|
|
111
84
|
do_code_enrichment=pipeline_options.do_code_enrichment,
|
|
112
85
|
do_formula_enrichment=pipeline_options.do_formula_enrichment,
|
|
113
86
|
),
|
|
114
87
|
accelerator_options=pipeline_options.accelerator_options,
|
|
115
88
|
),
|
|
116
|
-
|
|
117
|
-
DocumentPictureClassifier(
|
|
118
|
-
enabled=pipeline_options.do_picture_classification,
|
|
119
|
-
artifacts_path=artifacts_path,
|
|
120
|
-
options=DocumentPictureClassifierOptions(),
|
|
121
|
-
accelerator_options=pipeline_options.accelerator_options,
|
|
122
|
-
),
|
|
123
|
-
# Document Picture description
|
|
124
|
-
picture_description_model,
|
|
89
|
+
*self.enrichment_pipe,
|
|
125
90
|
]
|
|
126
91
|
|
|
127
92
|
if (
|
|
@@ -158,20 +123,6 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
|
158
123
|
accelerator_options=self.pipeline_options.accelerator_options,
|
|
159
124
|
)
|
|
160
125
|
|
|
161
|
-
def get_picture_description_model(
|
|
162
|
-
self, artifacts_path: Optional[Path] = None
|
|
163
|
-
) -> Optional[PictureDescriptionBaseModel]:
|
|
164
|
-
factory = get_picture_description_factory(
|
|
165
|
-
allow_external_plugins=self.pipeline_options.allow_external_plugins
|
|
166
|
-
)
|
|
167
|
-
return factory.create_instance(
|
|
168
|
-
options=self.pipeline_options.picture_description_options,
|
|
169
|
-
enabled=self.pipeline_options.do_picture_description,
|
|
170
|
-
enable_remote_services=self.pipeline_options.enable_remote_services,
|
|
171
|
-
artifacts_path=artifacts_path,
|
|
172
|
-
accelerator_options=self.pipeline_options.accelerator_options,
|
|
173
|
-
)
|
|
174
|
-
|
|
175
126
|
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
|
176
127
|
with TimeRecorder(conv_res, "page_init"):
|
|
177
128
|
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
|
|
@@ -20,10 +20,14 @@ import itertools
|
|
|
20
20
|
import logging
|
|
21
21
|
import threading
|
|
22
22
|
import time
|
|
23
|
+
import warnings
|
|
23
24
|
from collections import defaultdict, deque
|
|
24
25
|
from dataclasses import dataclass, field
|
|
25
26
|
from pathlib import Path
|
|
26
|
-
from typing import Any, Iterable, List, Optional, Sequence, Tuple
|
|
27
|
+
from typing import Any, Iterable, List, Optional, Sequence, Tuple, cast
|
|
28
|
+
|
|
29
|
+
import numpy as np
|
|
30
|
+
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
|
27
31
|
|
|
28
32
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
29
33
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
|
@@ -32,21 +36,16 @@ from docling.datamodel.document import ConversionResult
|
|
|
32
36
|
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
|
|
33
37
|
from docling.datamodel.settings import settings
|
|
34
38
|
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
|
|
35
|
-
from docling.models.
|
|
36
|
-
DocumentPictureClassifier,
|
|
37
|
-
DocumentPictureClassifierOptions,
|
|
38
|
-
)
|
|
39
|
-
from docling.models.factories import get_ocr_factory, get_picture_description_factory
|
|
39
|
+
from docling.models.factories import get_ocr_factory
|
|
40
40
|
from docling.models.layout_model import LayoutModel
|
|
41
41
|
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
|
42
42
|
from docling.models.page_preprocessing_model import (
|
|
43
43
|
PagePreprocessingModel,
|
|
44
44
|
PagePreprocessingOptions,
|
|
45
45
|
)
|
|
46
|
-
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
|
47
46
|
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
|
|
48
47
|
from docling.models.table_structure_model import TableStructureModel
|
|
49
|
-
from docling.pipeline.base_pipeline import BasePipeline
|
|
48
|
+
from docling.pipeline.base_pipeline import ConvertPipeline
|
|
50
49
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
|
51
50
|
from docling.utils.utils import chunkify
|
|
52
51
|
|
|
@@ -294,7 +293,7 @@ class RunContext:
|
|
|
294
293
|
# ──────────────────────────────────────────────────────────────────────────────
|
|
295
294
|
|
|
296
295
|
|
|
297
|
-
class ThreadedStandardPdfPipeline(BasePipeline):
|
|
296
|
+
class ThreadedStandardPdfPipeline(ConvertPipeline):
|
|
298
297
|
"""High-performance PDF pipeline with multi-threaded stages."""
|
|
299
298
|
|
|
300
299
|
def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
|
|
@@ -310,7 +309,7 @@ class ThreadedStandardPdfPipeline(BasePipeline):
|
|
|
310
309
|
# ────────────────────────────────────────────────────────────────────────
|
|
311
310
|
|
|
312
311
|
def _init_models(self) -> None:
|
|
313
|
-
art_path = self._resolve_artifacts_path()
|
|
312
|
+
art_path = self.artifacts_path
|
|
314
313
|
self.keep_images = (
|
|
315
314
|
self.pipeline_options.generate_page_images
|
|
316
315
|
or self.pipeline_options.generate_picture_images
|
|
@@ -337,32 +336,20 @@ class ThreadedStandardPdfPipeline(BasePipeline):
|
|
|
337
336
|
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
|
338
337
|
|
|
339
338
|
# --- optional enrichment ------------------------------------------------
|
|
340
|
-
self.enrichment_pipe = [
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
339
|
+
self.enrichment_pipe = [
|
|
340
|
+
# Code Formula Enrichment Model
|
|
341
|
+
CodeFormulaModel(
|
|
342
|
+
enabled=self.pipeline_options.do_code_enrichment
|
|
343
|
+
or self.pipeline_options.do_formula_enrichment,
|
|
344
|
+
artifacts_path=self.artifacts_path,
|
|
345
|
+
options=CodeFormulaModelOptions(
|
|
346
|
+
do_code_enrichment=self.pipeline_options.do_code_enrichment,
|
|
347
|
+
do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
|
|
348
|
+
),
|
|
349
|
+
accelerator_options=self.pipeline_options.accelerator_options,
|
|
348
350
|
),
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
if code_formula.enabled:
|
|
352
|
-
self.enrichment_pipe.append(code_formula)
|
|
353
|
-
|
|
354
|
-
picture_classifier = DocumentPictureClassifier(
|
|
355
|
-
enabled=self.pipeline_options.do_picture_classification,
|
|
356
|
-
artifacts_path=art_path,
|
|
357
|
-
options=DocumentPictureClassifierOptions(),
|
|
358
|
-
accelerator_options=self.pipeline_options.accelerator_options,
|
|
359
|
-
)
|
|
360
|
-
if picture_classifier.enabled:
|
|
361
|
-
self.enrichment_pipe.append(picture_classifier)
|
|
362
|
-
|
|
363
|
-
picture_descr = self._make_picture_description_model(art_path)
|
|
364
|
-
if picture_descr and picture_descr.enabled:
|
|
365
|
-
self.enrichment_pipe.append(picture_descr)
|
|
351
|
+
*self.enrichment_pipe,
|
|
352
|
+
]
|
|
366
353
|
|
|
367
354
|
self.keep_backend = any(
|
|
368
355
|
(
|
|
@@ -374,19 +361,6 @@ class ThreadedStandardPdfPipeline(BasePipeline):
|
|
|
374
361
|
)
|
|
375
362
|
|
|
376
363
|
# ---------------------------------------------------------------- helpers
|
|
377
|
-
def _resolve_artifacts_path(self) -> Optional[Path]:
|
|
378
|
-
if self.pipeline_options.artifacts_path:
|
|
379
|
-
p = Path(self.pipeline_options.artifacts_path).expanduser()
|
|
380
|
-
elif settings.artifacts_path:
|
|
381
|
-
p = Path(settings.artifacts_path).expanduser()
|
|
382
|
-
else:
|
|
383
|
-
return None
|
|
384
|
-
if not p.is_dir():
|
|
385
|
-
raise RuntimeError(
|
|
386
|
-
f"{p} does not exist or is not a directory containing the required models"
|
|
387
|
-
)
|
|
388
|
-
return p
|
|
389
|
-
|
|
390
364
|
def _make_ocr_model(self, art_path: Optional[Path]) -> Any:
|
|
391
365
|
factory = get_ocr_factory(
|
|
392
366
|
allow_external_plugins=self.pipeline_options.allow_external_plugins
|
|
@@ -398,20 +372,6 @@ class ThreadedStandardPdfPipeline(BasePipeline):
|
|
|
398
372
|
accelerator_options=self.pipeline_options.accelerator_options,
|
|
399
373
|
)
|
|
400
374
|
|
|
401
|
-
def _make_picture_description_model(
|
|
402
|
-
self, art_path: Optional[Path]
|
|
403
|
-
) -> Optional[PictureDescriptionBaseModel]:
|
|
404
|
-
factory = get_picture_description_factory(
|
|
405
|
-
allow_external_plugins=self.pipeline_options.allow_external_plugins
|
|
406
|
-
)
|
|
407
|
-
return factory.create_instance(
|
|
408
|
-
options=self.pipeline_options.picture_description_options,
|
|
409
|
-
enabled=self.pipeline_options.do_picture_description,
|
|
410
|
-
enable_remote_services=self.pipeline_options.enable_remote_services,
|
|
411
|
-
artifacts_path=art_path,
|
|
412
|
-
accelerator_options=self.pipeline_options.accelerator_options,
|
|
413
|
-
)
|
|
414
|
-
|
|
415
375
|
# ────────────────────────────────────────────────────────────────────────
|
|
416
376
|
# Build - thread pipeline
|
|
417
377
|
# ────────────────────────────────────────────────────────────────────────
|
|
@@ -585,6 +545,86 @@ class ThreadedStandardPdfPipeline(BasePipeline):
|
|
|
585
545
|
elements=elements, headers=headers, body=body
|
|
586
546
|
)
|
|
587
547
|
conv_res.document = self.reading_order_model(conv_res)
|
|
548
|
+
|
|
549
|
+
# Generate page images in the output
|
|
550
|
+
if self.pipeline_options.generate_page_images:
|
|
551
|
+
for page in conv_res.pages:
|
|
552
|
+
assert page.image is not None
|
|
553
|
+
page_no = page.page_no + 1
|
|
554
|
+
conv_res.document.pages[page_no].image = ImageRef.from_pil(
|
|
555
|
+
page.image, dpi=int(72 * self.pipeline_options.images_scale)
|
|
556
|
+
)
|
|
557
|
+
|
|
558
|
+
# Generate images of the requested element types
|
|
559
|
+
with warnings.catch_warnings(): # deprecated generate_table_images
|
|
560
|
+
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
|
561
|
+
if (
|
|
562
|
+
self.pipeline_options.generate_picture_images
|
|
563
|
+
or self.pipeline_options.generate_table_images
|
|
564
|
+
):
|
|
565
|
+
scale = self.pipeline_options.images_scale
|
|
566
|
+
for element, _level in conv_res.document.iterate_items():
|
|
567
|
+
if not isinstance(element, DocItem) or len(element.prov) == 0:
|
|
568
|
+
continue
|
|
569
|
+
if (
|
|
570
|
+
isinstance(element, PictureItem)
|
|
571
|
+
and self.pipeline_options.generate_picture_images
|
|
572
|
+
) or (
|
|
573
|
+
isinstance(element, TableItem)
|
|
574
|
+
and self.pipeline_options.generate_table_images
|
|
575
|
+
):
|
|
576
|
+
page_ix = element.prov[0].page_no - 1
|
|
577
|
+
page = next(
|
|
578
|
+
(p for p in conv_res.pages if p.page_no == page_ix),
|
|
579
|
+
cast("Page", None),
|
|
580
|
+
)
|
|
581
|
+
assert page is not None
|
|
582
|
+
assert page.size is not None
|
|
583
|
+
assert page.image is not None
|
|
584
|
+
|
|
585
|
+
crop_bbox = (
|
|
586
|
+
element.prov[0]
|
|
587
|
+
.bbox.scaled(scale=scale)
|
|
588
|
+
.to_top_left_origin(
|
|
589
|
+
page_height=page.size.height * scale
|
|
590
|
+
)
|
|
591
|
+
)
|
|
592
|
+
|
|
593
|
+
cropped_im = page.image.crop(crop_bbox.as_tuple())
|
|
594
|
+
element.image = ImageRef.from_pil(
|
|
595
|
+
cropped_im, dpi=int(72 * scale)
|
|
596
|
+
)
|
|
597
|
+
|
|
598
|
+
# Aggregate confidence values for document:
|
|
599
|
+
if len(conv_res.pages) > 0:
|
|
600
|
+
with warnings.catch_warnings():
|
|
601
|
+
warnings.filterwarnings(
|
|
602
|
+
"ignore",
|
|
603
|
+
category=RuntimeWarning,
|
|
604
|
+
message="Mean of empty slice|All-NaN slice encountered",
|
|
605
|
+
)
|
|
606
|
+
conv_res.confidence.layout_score = float(
|
|
607
|
+
np.nanmean(
|
|
608
|
+
[c.layout_score for c in conv_res.confidence.pages.values()]
|
|
609
|
+
)
|
|
610
|
+
)
|
|
611
|
+
conv_res.confidence.parse_score = float(
|
|
612
|
+
np.nanquantile(
|
|
613
|
+
[c.parse_score for c in conv_res.confidence.pages.values()],
|
|
614
|
+
q=0.1, # parse score should relate to worst 10% of pages.
|
|
615
|
+
)
|
|
616
|
+
)
|
|
617
|
+
conv_res.confidence.table_score = float(
|
|
618
|
+
np.nanmean(
|
|
619
|
+
[c.table_score for c in conv_res.confidence.pages.values()]
|
|
620
|
+
)
|
|
621
|
+
)
|
|
622
|
+
conv_res.confidence.ocr_score = float(
|
|
623
|
+
np.nanmean(
|
|
624
|
+
[c.ocr_score for c in conv_res.confidence.pages.values()]
|
|
625
|
+
)
|
|
626
|
+
)
|
|
627
|
+
|
|
588
628
|
return conv_res
|
|
589
629
|
|
|
590
630
|
# ---------------------------------------------------------------- misc
|
docling/pipeline/vlm_pipeline.py
CHANGED
|
@@ -54,18 +54,6 @@ class VlmPipeline(PaginatedPipeline):
|
|
|
54
54
|
|
|
55
55
|
self.pipeline_options: VlmPipelineOptions
|
|
56
56
|
|
|
57
|
-
artifacts_path: Optional[Path] = None
|
|
58
|
-
if pipeline_options.artifacts_path is not None:
|
|
59
|
-
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
|
60
|
-
elif settings.artifacts_path is not None:
|
|
61
|
-
artifacts_path = Path(settings.artifacts_path).expanduser()
|
|
62
|
-
|
|
63
|
-
if artifacts_path is not None and not artifacts_path.is_dir():
|
|
64
|
-
raise RuntimeError(
|
|
65
|
-
f"The value of {artifacts_path=} is not valid. "
|
|
66
|
-
"When defined, it must point to a folder containing all models required by the pipeline."
|
|
67
|
-
)
|
|
68
|
-
|
|
69
57
|
# force_backend_text = False - use text that is coming from VLM response
|
|
70
58
|
# force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling doctags
|
|
71
59
|
self.force_backend_text = (
|
|
@@ -89,7 +77,7 @@ class VlmPipeline(PaginatedPipeline):
|
|
|
89
77
|
self.build_pipe = [
|
|
90
78
|
HuggingFaceMlxModel(
|
|
91
79
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
|
92
|
-
artifacts_path=artifacts_path,
|
|
80
|
+
artifacts_path=self.artifacts_path,
|
|
93
81
|
accelerator_options=pipeline_options.accelerator_options,
|
|
94
82
|
vlm_options=vlm_options,
|
|
95
83
|
),
|
|
@@ -98,7 +86,7 @@ class VlmPipeline(PaginatedPipeline):
|
|
|
98
86
|
self.build_pipe = [
|
|
99
87
|
HuggingFaceTransformersVlmModel(
|
|
100
88
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
|
101
|
-
artifacts_path=artifacts_path,
|
|
89
|
+
artifacts_path=self.artifacts_path,
|
|
102
90
|
accelerator_options=pipeline_options.accelerator_options,
|
|
103
91
|
vlm_options=vlm_options,
|
|
104
92
|
),
|
|
@@ -109,7 +97,7 @@ class VlmPipeline(PaginatedPipeline):
|
|
|
109
97
|
self.build_pipe = [
|
|
110
98
|
VllmVlmModel(
|
|
111
99
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
|
112
|
-
artifacts_path=artifacts_path,
|
|
100
|
+
artifacts_path=self.artifacts_path,
|
|
113
101
|
accelerator_options=pipeline_options.accelerator_options,
|
|
114
102
|
vlm_options=vlm_options,
|
|
115
103
|
),
|
|
@@ -10,6 +10,8 @@ from docling.datamodel.pipeline_options import (
|
|
|
10
10
|
)
|
|
11
11
|
from docling.datamodel.settings import settings
|
|
12
12
|
from docling.datamodel.vlm_model_specs import (
|
|
13
|
+
GRANITEDOCLING_MLX,
|
|
14
|
+
GRANITEDOCLING_TRANSFORMERS,
|
|
13
15
|
SMOLDOCLING_MLX,
|
|
14
16
|
SMOLDOCLING_TRANSFORMERS,
|
|
15
17
|
)
|
|
@@ -34,6 +36,8 @@ def download_models(
|
|
|
34
36
|
with_code_formula: bool = True,
|
|
35
37
|
with_picture_classifier: bool = True,
|
|
36
38
|
with_smolvlm: bool = False,
|
|
39
|
+
with_granitedocling: bool = False,
|
|
40
|
+
with_granitedocling_mlx: bool = False,
|
|
37
41
|
with_smoldocling: bool = False,
|
|
38
42
|
with_smoldocling_mlx: bool = False,
|
|
39
43
|
with_granite_vision: bool = False,
|
|
@@ -86,6 +90,24 @@ def download_models(
|
|
|
86
90
|
progress=progress,
|
|
87
91
|
)
|
|
88
92
|
|
|
93
|
+
if with_granitedocling:
|
|
94
|
+
_log.info("Downloading GraniteDocling model...")
|
|
95
|
+
download_hf_model(
|
|
96
|
+
repo_id=GRANITEDOCLING_TRANSFORMERS.repo_id,
|
|
97
|
+
local_dir=output_dir / GRANITEDOCLING_TRANSFORMERS.repo_cache_folder,
|
|
98
|
+
force=force,
|
|
99
|
+
progress=progress,
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
if with_granitedocling_mlx:
|
|
103
|
+
_log.info("Downloading GraniteDocling MLX model...")
|
|
104
|
+
download_hf_model(
|
|
105
|
+
repo_id=GRANITEDOCLING_MLX.repo_id,
|
|
106
|
+
local_dir=output_dir / GRANITEDOCLING_MLX.repo_cache_folder,
|
|
107
|
+
force=force,
|
|
108
|
+
progress=progress,
|
|
109
|
+
)
|
|
110
|
+
|
|
89
111
|
if with_smoldocling:
|
|
90
112
|
_log.info("Downloading SmolDocling model...")
|
|
91
113
|
download_hf_model(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling
|
|
3
|
-
Version: 2.51.0
|
|
3
|
+
Version: 2.53.0
|
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
|
|
|
26
26
|
Description-Content-Type: text/markdown
|
|
27
27
|
License-File: LICENSE
|
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
|
29
|
-
Requires-Dist: docling-core[chunking]<3.0.0,>=2.
|
|
29
|
+
Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.0
|
|
30
30
|
Requires-Dist: docling-parse<5.0.0,>=4.4.0
|
|
31
31
|
Requires-Dist: docling-ibm-models<4,>=3.9.1
|
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
|
@@ -108,18 +108,22 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
|
108
108
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
|
109
109
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
|
110
110
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
|
111
|
-
* 👓 Support of several Visual Language Models ([
|
|
111
|
+
* 👓 Support of several Visual Language Models ([GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M))
|
|
112
112
|
* 🎙️ Audio support with Automatic Speech Recognition (ASR) models
|
|
113
|
+
* 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
|
|
113
114
|
* 💻 Simple and convenient CLI
|
|
114
115
|
|
|
115
116
|
### What's new
|
|
116
117
|
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
|
|
118
|
+
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
|
|
119
|
+
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
|
|
117
120
|
|
|
118
121
|
### Coming soon
|
|
119
122
|
|
|
120
123
|
* 📝 Metadata extraction, including title, authors, references & language
|
|
121
124
|
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
|
122
125
|
* 📝 Complex chemistry understanding (Molecular structures)
|
|
126
|
+
* 📝 Parsing of Web Video Text Tracks (WebVTT) files
|
|
123
127
|
|
|
124
128
|
## Installation
|
|
125
129
|
|
|
@@ -145,7 +149,7 @@ result = converter.convert(source)
|
|
|
145
149
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
|
146
150
|
```
|
|
147
151
|
|
|
148
|
-
More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
|
|
152
|
+
More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
|
|
149
153
|
the docs.
|
|
150
154
|
|
|
151
155
|
## CLI
|
|
@@ -156,9 +160,9 @@ Docling has a built-in CLI to run conversions.
|
|
|
156
160
|
docling https://arxiv.org/pdf/2206.01062
|
|
157
161
|
```
|
|
158
162
|
|
|
159
|
-
You can also use 🥚[
|
|
163
|
+
You can also use 🥚[GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M) and other VLMs via Docling CLI:
|
|
160
164
|
```bash
|
|
161
|
-
docling --pipeline vlm --vlm-model
|
|
165
|
+
docling --pipeline vlm --vlm-model granite_docling https://arxiv.org/pdf/2206.01062
|
|
162
166
|
```
|
|
163
167
|
This will use MLX acceleration on supported Apple Silicon hardware.
|
|
164
168
|
|