docling 2.11.0__tar.gz → 2.12.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.11.0 → docling-2.12.0}/PKG-INFO +2 -2
- {docling-2.11.0 → docling-2.12.0}/docling/cli/main.py +8 -0
- {docling-2.11.0 → docling-2.12.0}/docling/datamodel/pipeline_options.py +65 -4
- {docling-2.11.0 → docling-2.12.0}/docling/models/easyocr_model.py +35 -3
- {docling-2.11.0 → docling-2.12.0}/docling/models/layout_model.py +13 -3
- {docling-2.11.0 → docling-2.12.0}/docling/models/rapid_ocr_model.py +24 -45
- {docling-2.11.0 → docling-2.12.0}/docling/models/table_structure_model.py +24 -4
- {docling-2.11.0 → docling-2.12.0}/docling/pipeline/standard_pdf_pipeline.py +7 -3
- docling-2.12.0/docling/utils/accelerator_utils.py +42 -0
- {docling-2.11.0 → docling-2.12.0}/pyproject.toml +3 -3
- {docling-2.11.0 → docling-2.12.0}/LICENSE +0 -0
- {docling-2.11.0 → docling-2.12.0}/README.md +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/__init__.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/backend/__init__.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/backend/html_backend.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/backend/md_backend.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/chunking/__init__.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/cli/__init__.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/datamodel/document.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/datamodel/settings.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/document_converter.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/exceptions.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/models/__init__.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/models/base_model.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/models/ds_glm_model.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/py.typed +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/utils/__init__.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/utils/export.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/utils/layout_utils.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/utils/profiling.py +0 -0
- {docling-2.11.0 → docling-2.12.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.11.0
|
3
|
+
Version: 2.12.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -27,7 +27,7 @@ Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
|
27
27
|
Requires-Dist: certifi (>=2024.7.4)
|
28
28
|
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
29
29
|
Requires-Dist: docling-core[chunking] (>=2.9.0,<3.0.0)
|
30
|
-
Requires-Dist: docling-ibm-models (>=
|
30
|
+
Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
|
31
31
|
Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
|
32
32
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
33
33
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
@@ -26,6 +26,8 @@ from docling.datamodel.base_models import (
|
|
26
26
|
)
|
27
27
|
from docling.datamodel.document import ConversionResult
|
28
28
|
from docling.datamodel.pipeline_options import (
|
29
|
+
AcceleratorDevice,
|
30
|
+
AcceleratorOptions,
|
29
31
|
EasyOcrOptions,
|
30
32
|
OcrEngine,
|
31
33
|
OcrMacOptions,
|
@@ -257,6 +259,10 @@ def convert(
|
|
257
259
|
help="The timeout for processing each document, in seconds.",
|
258
260
|
),
|
259
261
|
] = None,
|
262
|
+
num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
|
263
|
+
device: Annotated[
|
264
|
+
AcceleratorDevice, typer.Option(..., help="Accelerator device")
|
265
|
+
] = AcceleratorDevice.AUTO,
|
260
266
|
):
|
261
267
|
if verbose == 0:
|
262
268
|
logging.basicConfig(level=logging.WARNING)
|
@@ -336,7 +342,9 @@ def convert(
|
|
336
342
|
if ocr_lang_list is not None:
|
337
343
|
ocr_options.lang = ocr_lang_list
|
338
344
|
|
345
|
+
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
339
346
|
pipeline_options = PdfPipelineOptions(
|
347
|
+
accelerator_options=accelerator_options,
|
340
348
|
do_ocr=ocr,
|
341
349
|
ocr_options=ocr_options,
|
342
350
|
do_table_structure=True,
|
@@ -1,8 +1,66 @@
|
|
1
|
+
import logging
|
2
|
+
import os
|
3
|
+
import warnings
|
1
4
|
from enum import Enum
|
2
5
|
from pathlib import Path
|
3
|
-
from typing import List, Literal, Optional, Union
|
6
|
+
from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union
|
4
7
|
|
5
|
-
from pydantic import BaseModel, ConfigDict, Field
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
9
|
+
from pydantic_settings import (
|
10
|
+
BaseSettings,
|
11
|
+
PydanticBaseSettingsSource,
|
12
|
+
SettingsConfigDict,
|
13
|
+
)
|
14
|
+
from typing_extensions import deprecated
|
15
|
+
|
16
|
+
_log = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
|
19
|
+
class AcceleratorDevice(str, Enum):
|
20
|
+
"""Devices to run model inference"""
|
21
|
+
|
22
|
+
AUTO = "auto"
|
23
|
+
CPU = "cpu"
|
24
|
+
CUDA = "cuda"
|
25
|
+
MPS = "mps"
|
26
|
+
|
27
|
+
|
28
|
+
class AcceleratorOptions(BaseSettings):
|
29
|
+
model_config = SettingsConfigDict(
|
30
|
+
env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
|
31
|
+
)
|
32
|
+
|
33
|
+
num_threads: int = 4
|
34
|
+
device: AcceleratorDevice = AcceleratorDevice.AUTO
|
35
|
+
|
36
|
+
@model_validator(mode="before")
|
37
|
+
@classmethod
|
38
|
+
def check_alternative_envvars(cls, data: Any) -> Any:
|
39
|
+
r"""
|
40
|
+
Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
|
41
|
+
The alternative envvar is used only if it is valid and the regular envvar is not set.
|
42
|
+
|
43
|
+
Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
|
44
|
+
the same functionality. In case the alias envvar is set and the user tries to override the
|
45
|
+
parameter in settings initialization, Pydantic treats the parameter provided in __init__()
|
46
|
+
as an extra input instead of simply overwriting the evvar value for that parameter.
|
47
|
+
"""
|
48
|
+
if isinstance(data, dict):
|
49
|
+
input_num_threads = data.get("num_threads")
|
50
|
+
|
51
|
+
# Check if to set the num_threads from the alternative envvar
|
52
|
+
if input_num_threads is None:
|
53
|
+
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
|
54
|
+
omp_num_threads = os.getenv("OMP_NUM_THREADS")
|
55
|
+
if docling_num_threads is None and omp_num_threads is not None:
|
56
|
+
try:
|
57
|
+
data["num_threads"] = int(omp_num_threads)
|
58
|
+
except ValueError:
|
59
|
+
_log.error(
|
60
|
+
"Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
|
61
|
+
omp_num_threads,
|
62
|
+
)
|
63
|
+
return data
|
6
64
|
|
7
65
|
|
8
66
|
class TableFormerMode(str, Enum):
|
@@ -78,9 +136,11 @@ class EasyOcrOptions(OcrOptions):
|
|
78
136
|
|
79
137
|
kind: Literal["easyocr"] = "easyocr"
|
80
138
|
lang: List[str] = ["fr", "de", "es", "en"]
|
81
|
-
|
139
|
+
|
140
|
+
use_gpu: Optional[bool] = None
|
141
|
+
|
82
142
|
model_storage_directory: Optional[str] = None
|
83
|
-
download_enabled: bool = True
|
143
|
+
download_enabled: bool = True
|
84
144
|
|
85
145
|
model_config = ConfigDict(
|
86
146
|
extra="forbid",
|
@@ -153,6 +213,7 @@ class PipelineOptions(BaseModel):
|
|
153
213
|
True # This default will be set to False on a future version of docling
|
154
214
|
)
|
155
215
|
document_timeout: Optional[float] = None
|
216
|
+
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
156
217
|
|
157
218
|
|
158
219
|
class PdfPipelineOptions(PipelineOptions):
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import logging
|
2
|
+
import warnings
|
2
3
|
from typing import Iterable
|
3
4
|
|
4
5
|
import numpy
|
@@ -7,16 +8,26 @@ from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
7
8
|
|
8
9
|
from docling.datamodel.base_models import Cell, OcrCell, Page
|
9
10
|
from docling.datamodel.document import ConversionResult
|
10
|
-
from docling.datamodel.pipeline_options import EasyOcrOptions
|
11
|
+
from docling.datamodel.pipeline_options import (
|
12
|
+
AcceleratorDevice,
|
13
|
+
AcceleratorOptions,
|
14
|
+
EasyOcrOptions,
|
15
|
+
)
|
11
16
|
from docling.datamodel.settings import settings
|
12
17
|
from docling.models.base_ocr_model import BaseOcrModel
|
18
|
+
from docling.utils.accelerator_utils import decide_device
|
13
19
|
from docling.utils.profiling import TimeRecorder
|
14
20
|
|
15
21
|
_log = logging.getLogger(__name__)
|
16
22
|
|
17
23
|
|
18
24
|
class EasyOcrModel(BaseOcrModel):
|
19
|
-
def __init__(
|
25
|
+
def __init__(
|
26
|
+
self,
|
27
|
+
enabled: bool,
|
28
|
+
options: EasyOcrOptions,
|
29
|
+
accelerator_options: AcceleratorOptions,
|
30
|
+
):
|
20
31
|
super().__init__(enabled=enabled, options=options)
|
21
32
|
self.options: EasyOcrOptions
|
22
33
|
|
@@ -31,11 +42,32 @@ class EasyOcrModel(BaseOcrModel):
|
|
31
42
|
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
32
43
|
)
|
33
44
|
|
45
|
+
if self.options.use_gpu is None:
|
46
|
+
device = decide_device(accelerator_options.device)
|
47
|
+
# Enable easyocr GPU if running on CUDA, MPS
|
48
|
+
use_gpu = any(
|
49
|
+
[
|
50
|
+
device.startswith(x)
|
51
|
+
for x in [
|
52
|
+
AcceleratorDevice.CUDA.value,
|
53
|
+
AcceleratorDevice.MPS.value,
|
54
|
+
]
|
55
|
+
]
|
56
|
+
)
|
57
|
+
else:
|
58
|
+
warnings.warn(
|
59
|
+
"Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
|
60
|
+
"When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
|
61
|
+
"to run EasyOCR. Otherwise, EasyOCR runs in CPU."
|
62
|
+
)
|
63
|
+
use_gpu = self.options.use_gpu
|
64
|
+
|
34
65
|
self.reader = easyocr.Reader(
|
35
66
|
lang_list=self.options.lang,
|
36
|
-
gpu=
|
67
|
+
gpu=use_gpu,
|
37
68
|
model_storage_directory=self.options.model_storage_directory,
|
38
69
|
download_enabled=self.options.download_enabled,
|
70
|
+
verbose=False,
|
39
71
|
)
|
40
72
|
|
41
73
|
def __call__(
|
@@ -9,6 +9,7 @@ from docling_core.types.doc import CoordOrigin, DocItemLabel
|
|
9
9
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
10
10
|
from PIL import ImageDraw
|
11
11
|
|
12
|
+
import docling.utils.layout_utils as lu
|
12
13
|
from docling.datamodel.base_models import (
|
13
14
|
BoundingBox,
|
14
15
|
Cell,
|
@@ -17,9 +18,10 @@ from docling.datamodel.base_models import (
|
|
17
18
|
Page,
|
18
19
|
)
|
19
20
|
from docling.datamodel.document import ConversionResult
|
21
|
+
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
|
20
22
|
from docling.datamodel.settings import settings
|
21
23
|
from docling.models.base_model import BasePageModel
|
22
|
-
from docling.utils import layout_utils as lu
|
24
|
+
from docling.utils.accelerator_utils import decide_device
|
23
25
|
from docling.utils.profiling import TimeRecorder
|
24
26
|
|
25
27
|
_log = logging.getLogger(__name__)
|
@@ -46,8 +48,16 @@ class LayoutModel(BasePageModel):
|
|
46
48
|
FIGURE_LABEL = DocItemLabel.PICTURE
|
47
49
|
FORMULA_LABEL = DocItemLabel.FORMULA
|
48
50
|
|
49
|
-
def __init__(self, artifacts_path: Path):
|
50
|
-
|
51
|
+
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
|
52
|
+
device = decide_device(accelerator_options.device)
|
53
|
+
|
54
|
+
self.layout_predictor = LayoutPredictor(
|
55
|
+
artifact_path=str(artifacts_path),
|
56
|
+
device=device,
|
57
|
+
num_threads=accelerator_options.num_threads,
|
58
|
+
base_threshold=0.6,
|
59
|
+
blacklist_classes={"Form", "Key-Value Region"},
|
60
|
+
)
|
51
61
|
|
52
62
|
def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
|
53
63
|
MIN_INTERSECTION = 0.2
|
@@ -6,16 +6,26 @@ from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
6
6
|
|
7
7
|
from docling.datamodel.base_models import OcrCell, Page
|
8
8
|
from docling.datamodel.document import ConversionResult
|
9
|
-
from docling.datamodel.pipeline_options import RapidOcrOptions
|
9
|
+
from docling.datamodel.pipeline_options import (
|
10
|
+
AcceleratorDevice,
|
11
|
+
AcceleratorOptions,
|
12
|
+
RapidOcrOptions,
|
13
|
+
)
|
10
14
|
from docling.datamodel.settings import settings
|
11
15
|
from docling.models.base_ocr_model import BaseOcrModel
|
16
|
+
from docling.utils.accelerator_utils import decide_device
|
12
17
|
from docling.utils.profiling import TimeRecorder
|
13
18
|
|
14
19
|
_log = logging.getLogger(__name__)
|
15
20
|
|
16
21
|
|
17
22
|
class RapidOcrModel(BaseOcrModel):
|
18
|
-
def __init__(
|
23
|
+
def __init__(
|
24
|
+
self,
|
25
|
+
enabled: bool,
|
26
|
+
options: RapidOcrOptions,
|
27
|
+
accelerator_options: AcceleratorOptions,
|
28
|
+
):
|
19
29
|
super().__init__(enabled=enabled, options=options)
|
20
30
|
self.options: RapidOcrOptions
|
21
31
|
|
@@ -30,52 +40,21 @@ class RapidOcrModel(BaseOcrModel):
|
|
30
40
|
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
31
41
|
)
|
32
42
|
|
33
|
-
#
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
det_use_dml = True
|
39
|
-
cls_use_dml = True
|
40
|
-
rec_use_dml = True
|
41
|
-
|
42
|
-
# # Same as Defaults in RapidOCR
|
43
|
-
# cls_use_cuda = False
|
44
|
-
# rec_use_cuda = False
|
45
|
-
# det_use_cuda = False
|
46
|
-
# det_use_dml = False
|
47
|
-
# cls_use_dml = False
|
48
|
-
# rec_use_dml = False
|
49
|
-
|
50
|
-
# # If we set everything to true onnx-runtime would automatically choose the fastest accelerator
|
51
|
-
# if self.options.device == self.options.Device.AUTO:
|
52
|
-
# cls_use_cuda = True
|
53
|
-
# rec_use_cuda = True
|
54
|
-
# det_use_cuda = True
|
55
|
-
# det_use_dml = True
|
56
|
-
# cls_use_dml = True
|
57
|
-
# rec_use_dml = True
|
58
|
-
|
59
|
-
# # If we set use_cuda to true onnx would use the cuda device available in runtime if no cuda device is available it would run on CPU.
|
60
|
-
# elif self.options.device == self.options.Device.CUDA:
|
61
|
-
# cls_use_cuda = True
|
62
|
-
# rec_use_cuda = True
|
63
|
-
# det_use_cuda = True
|
64
|
-
|
65
|
-
# # If we set use_dml to true onnx would use the dml device available in runtime if no dml device is available it would work on CPU.
|
66
|
-
# elif self.options.device == self.options.Device.DIRECTML:
|
67
|
-
# det_use_dml = True
|
68
|
-
# cls_use_dml = True
|
69
|
-
# rec_use_dml = True
|
43
|
+
# Decide the accelerator devices
|
44
|
+
device = decide_device(accelerator_options.device)
|
45
|
+
use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
|
46
|
+
use_dml = accelerator_options.device == AcceleratorDevice.AUTO
|
47
|
+
intra_op_num_threads = accelerator_options.num_threads
|
70
48
|
|
71
49
|
self.reader = RapidOCR(
|
72
50
|
text_score=self.options.text_score,
|
73
|
-
cls_use_cuda=
|
74
|
-
rec_use_cuda=
|
75
|
-
det_use_cuda=
|
76
|
-
det_use_dml=
|
77
|
-
cls_use_dml=
|
78
|
-
rec_use_dml=
|
51
|
+
cls_use_cuda=use_cuda,
|
52
|
+
rec_use_cuda=use_cuda,
|
53
|
+
det_use_cuda=use_cuda,
|
54
|
+
det_use_dml=use_dml,
|
55
|
+
cls_use_dml=use_dml,
|
56
|
+
rec_use_dml=use_dml,
|
57
|
+
intra_op_num_threads=intra_op_num_threads,
|
79
58
|
print_verbose=self.options.print_verbose,
|
80
59
|
det_model_path=self.options.det_model_path,
|
81
60
|
cls_model_path=self.options.cls_model_path,
|
@@ -9,15 +9,25 @@ from PIL import ImageDraw
|
|
9
9
|
|
10
10
|
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
11
11
|
from docling.datamodel.document import ConversionResult
|
12
|
-
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
|
12
|
+
from docling.datamodel.pipeline_options import (
|
13
|
+
AcceleratorDevice,
|
14
|
+
AcceleratorOptions,
|
15
|
+
TableFormerMode,
|
16
|
+
TableStructureOptions,
|
17
|
+
)
|
13
18
|
from docling.datamodel.settings import settings
|
14
19
|
from docling.models.base_model import BasePageModel
|
20
|
+
from docling.utils.accelerator_utils import decide_device
|
15
21
|
from docling.utils.profiling import TimeRecorder
|
16
22
|
|
17
23
|
|
18
24
|
class TableStructureModel(BasePageModel):
|
19
25
|
def __init__(
|
20
|
-
self,
|
26
|
+
self,
|
27
|
+
enabled: bool,
|
28
|
+
artifacts_path: Path,
|
29
|
+
options: TableStructureOptions,
|
30
|
+
accelerator_options: AcceleratorOptions,
|
21
31
|
):
|
22
32
|
self.options = options
|
23
33
|
self.do_cell_matching = self.options.do_cell_matching
|
@@ -26,16 +36,26 @@ class TableStructureModel(BasePageModel):
|
|
26
36
|
self.enabled = enabled
|
27
37
|
if self.enabled:
|
28
38
|
if self.mode == TableFormerMode.ACCURATE:
|
29
|
-
artifacts_path = artifacts_path / "
|
39
|
+
artifacts_path = artifacts_path / "accurate"
|
40
|
+
else:
|
41
|
+
artifacts_path = artifacts_path / "fast"
|
30
42
|
|
31
43
|
# Third Party
|
32
44
|
import docling_ibm_models.tableformer.common as c
|
33
45
|
|
46
|
+
device = decide_device(accelerator_options.device)
|
47
|
+
|
48
|
+
# Disable MPS here, until we know why it makes things slower.
|
49
|
+
if device == AcceleratorDevice.MPS.value:
|
50
|
+
device = AcceleratorDevice.CPU.value
|
51
|
+
|
34
52
|
self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
|
35
53
|
self.tm_config["model"]["save_dir"] = artifacts_path
|
36
54
|
self.tm_model_type = self.tm_config["model"]["type"]
|
37
55
|
|
38
|
-
self.tf_predictor = TFPredictor(
|
56
|
+
self.tf_predictor = TFPredictor(
|
57
|
+
self.tm_config, device, accelerator_options.num_threads
|
58
|
+
)
|
39
59
|
self.scale = 2.0 # Scale up table input images to 144 dpi
|
40
60
|
|
41
61
|
def draw_table_and_cells(
|
@@ -38,7 +38,7 @@ _log = logging.getLogger(__name__)
|
|
38
38
|
|
39
39
|
|
40
40
|
class StandardPdfPipeline(PaginatedPipeline):
|
41
|
-
_layout_model_path = "model_artifacts/layout
|
41
|
+
_layout_model_path = "model_artifacts/layout"
|
42
42
|
_table_model_path = "model_artifacts/tableformer"
|
43
43
|
|
44
44
|
def __init__(self, pipeline_options: PdfPipelineOptions):
|
@@ -75,7 +75,8 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
75
75
|
# Layout model
|
76
76
|
LayoutModel(
|
77
77
|
artifacts_path=self.artifacts_path
|
78
|
-
/ StandardPdfPipeline._layout_model_path
|
78
|
+
/ StandardPdfPipeline._layout_model_path,
|
79
|
+
accelerator_options=pipeline_options.accelerator_options,
|
79
80
|
),
|
80
81
|
# Table structure model
|
81
82
|
TableStructureModel(
|
@@ -83,6 +84,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
83
84
|
artifacts_path=self.artifacts_path
|
84
85
|
/ StandardPdfPipeline._table_model_path,
|
85
86
|
options=pipeline_options.table_structure_options,
|
87
|
+
accelerator_options=pipeline_options.accelerator_options,
|
86
88
|
),
|
87
89
|
# Page assemble
|
88
90
|
PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
|
@@ -104,7 +106,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
104
106
|
repo_id="ds4sd/docling-models",
|
105
107
|
force_download=force,
|
106
108
|
local_dir=local_dir,
|
107
|
-
revision="v2.0
|
109
|
+
revision="v2.1.0",
|
108
110
|
)
|
109
111
|
|
110
112
|
return Path(download_path)
|
@@ -114,6 +116,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
114
116
|
return EasyOcrModel(
|
115
117
|
enabled=self.pipeline_options.do_ocr,
|
116
118
|
options=self.pipeline_options.ocr_options,
|
119
|
+
accelerator_options=self.pipeline_options.accelerator_options,
|
117
120
|
)
|
118
121
|
elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
|
119
122
|
return TesseractOcrCliModel(
|
@@ -129,6 +132,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
129
132
|
return RapidOcrModel(
|
130
133
|
enabled=self.pipeline_options.do_ocr,
|
131
134
|
options=self.pipeline_options.ocr_options,
|
135
|
+
accelerator_options=self.pipeline_options.accelerator_options,
|
132
136
|
)
|
133
137
|
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
|
134
138
|
if "darwin" != sys.platform:
|
@@ -0,0 +1,42 @@
|
|
1
|
+
import logging
|
2
|
+
|
3
|
+
import torch
|
4
|
+
|
5
|
+
from docling.datamodel.pipeline_options import AcceleratorDevice
|
6
|
+
|
7
|
+
_log = logging.getLogger(__name__)
|
8
|
+
|
9
|
+
|
10
|
+
def decide_device(accelerator_device: AcceleratorDevice) -> str:
|
11
|
+
r"""
|
12
|
+
Resolve the device based on the acceleration options and the available devices in the system
|
13
|
+
Rules:
|
14
|
+
1. AUTO: Check for the best available device on the system.
|
15
|
+
2. User-defined: Check if the device actually exists, otherwise fall-back to CPU
|
16
|
+
"""
|
17
|
+
cuda_index = 0
|
18
|
+
device = "cpu"
|
19
|
+
|
20
|
+
has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
|
21
|
+
has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
|
22
|
+
|
23
|
+
if accelerator_device == AcceleratorDevice.AUTO:
|
24
|
+
if has_cuda:
|
25
|
+
device = f"cuda:{cuda_index}"
|
26
|
+
elif has_mps:
|
27
|
+
device = "mps"
|
28
|
+
|
29
|
+
else:
|
30
|
+
if accelerator_device == AcceleratorDevice.CUDA:
|
31
|
+
if has_cuda:
|
32
|
+
device = f"cuda:{cuda_index}"
|
33
|
+
else:
|
34
|
+
_log.warning("CUDA is not available in the system. Fall back to 'CPU'")
|
35
|
+
elif accelerator_device == AcceleratorDevice.MPS:
|
36
|
+
if has_mps:
|
37
|
+
device = "mps"
|
38
|
+
else:
|
39
|
+
_log.warning("MPS is not available in the system. Fall back to 'CPU'")
|
40
|
+
|
41
|
+
_log.info("Accelerator device: '%s'", device)
|
42
|
+
return device
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.11.0"
|
3
|
+
version = "2.12.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -27,8 +27,9 @@ packages = [{include = "docling"}]
|
|
27
27
|
python = "^3.9"
|
28
28
|
docling-core = { version = "^2.9.0", extras = ["chunking"] }
|
29
29
|
pydantic = "^2.0.0"
|
30
|
-
docling-ibm-models = "^
|
30
|
+
docling-ibm-models = "^3.1.0"
|
31
31
|
deepsearch-glm = "^1.0.0"
|
32
|
+
docling-parse = "^3.0.0"
|
32
33
|
filetype = "^1.2.0"
|
33
34
|
pypdfium2 = "^4.30.0"
|
34
35
|
pydantic-settings = "^2.3.0"
|
@@ -36,7 +37,6 @@ huggingface_hub = ">=0.23,<1"
|
|
36
37
|
requests = "^2.32.3"
|
37
38
|
easyocr = "^1.7"
|
38
39
|
tesserocr = { version = "^2.7.1", optional = true }
|
39
|
-
docling-parse = "^3.0.0"
|
40
40
|
certifi = ">=2024.7.4"
|
41
41
|
rtree = "^1.3.0"
|
42
42
|
scipy = "^1.6.0"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|