docling 2.10.0__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/cli/main.py +18 -16
- docling/datamodel/base_models.py +12 -12
- docling/datamodel/pipeline_options.py +87 -5
- docling/models/ds_glm_model.py +2 -5
- docling/models/easyocr_model.py +35 -3
- docling/models/layout_model.py +13 -3
- docling/models/rapid_ocr_model.py +42 -62
- docling/models/table_structure_model.py +24 -4
- docling/pipeline/base_pipeline.py +17 -3
- docling/pipeline/standard_pdf_pipeline.py +7 -3
- docling/utils/accelerator_utils.py +42 -0
- {docling-2.10.0.dist-info → docling-2.12.0.dist-info}/METADATA +2 -2
- {docling-2.10.0.dist-info → docling-2.12.0.dist-info}/RECORD +16 -15
- {docling-2.10.0.dist-info → docling-2.12.0.dist-info}/LICENSE +0 -0
- {docling-2.10.0.dist-info → docling-2.12.0.dist-info}/WHEEL +0 -0
- {docling-2.10.0.dist-info → docling-2.12.0.dist-info}/entry_points.txt +0 -0
docling/cli/main.py
CHANGED
@@ -26,9 +26,13 @@ from docling.datamodel.base_models import (
|
|
26
26
|
)
|
27
27
|
from docling.datamodel.document import ConversionResult
|
28
28
|
from docling.datamodel.pipeline_options import (
|
29
|
+
AcceleratorDevice,
|
30
|
+
AcceleratorOptions,
|
29
31
|
EasyOcrOptions,
|
32
|
+
OcrEngine,
|
30
33
|
OcrMacOptions,
|
31
34
|
OcrOptions,
|
35
|
+
PdfBackend,
|
32
36
|
PdfPipelineOptions,
|
33
37
|
RapidOcrOptions,
|
34
38
|
TableFormerMode,
|
@@ -68,22 +72,6 @@ def version_callback(value: bool):
|
|
68
72
|
raise typer.Exit()
|
69
73
|
|
70
74
|
|
71
|
-
# Define an enum for the backend options
|
72
|
-
class PdfBackend(str, Enum):
|
73
|
-
PYPDFIUM2 = "pypdfium2"
|
74
|
-
DLPARSE_V1 = "dlparse_v1"
|
75
|
-
DLPARSE_V2 = "dlparse_v2"
|
76
|
-
|
77
|
-
|
78
|
-
# Define an enum for the ocr engines
|
79
|
-
class OcrEngine(str, Enum):
|
80
|
-
EASYOCR = "easyocr"
|
81
|
-
TESSERACT_CLI = "tesseract_cli"
|
82
|
-
TESSERACT = "tesseract"
|
83
|
-
OCRMAC = "ocrmac"
|
84
|
-
RAPIDOCR = "rapidocr"
|
85
|
-
|
86
|
-
|
87
75
|
def export_documents(
|
88
76
|
conv_results: Iterable[ConversionResult],
|
89
77
|
output_dir: Path,
|
@@ -264,6 +252,17 @@ def convert(
|
|
264
252
|
help="Show version information.",
|
265
253
|
),
|
266
254
|
] = None,
|
255
|
+
document_timeout: Annotated[
|
256
|
+
Optional[float],
|
257
|
+
typer.Option(
|
258
|
+
...,
|
259
|
+
help="The timeout for processing each document, in seconds.",
|
260
|
+
),
|
261
|
+
] = None,
|
262
|
+
num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
|
263
|
+
device: Annotated[
|
264
|
+
AcceleratorDevice, typer.Option(..., help="Accelerator device")
|
265
|
+
] = AcceleratorDevice.AUTO,
|
267
266
|
):
|
268
267
|
if verbose == 0:
|
269
268
|
logging.basicConfig(level=logging.WARNING)
|
@@ -343,10 +342,13 @@ def convert(
|
|
343
342
|
if ocr_lang_list is not None:
|
344
343
|
ocr_options.lang = ocr_lang_list
|
345
344
|
|
345
|
+
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
346
346
|
pipeline_options = PdfPipelineOptions(
|
347
|
+
accelerator_options=accelerator_options,
|
347
348
|
do_ocr=ocr,
|
348
349
|
ocr_options=ocr_options,
|
349
350
|
do_table_structure=True,
|
351
|
+
document_timeout=document_timeout,
|
350
352
|
)
|
351
353
|
pipeline_options.table_structure_options.do_cell_matching = (
|
352
354
|
True # do_cell_matching
|
docling/datamodel/base_models.py
CHANGED
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
|
|
19
19
|
|
20
20
|
|
21
21
|
class ConversionStatus(str, Enum):
|
22
|
-
PENDING =
|
23
|
-
STARTED =
|
24
|
-
FAILURE =
|
25
|
-
SUCCESS =
|
26
|
-
PARTIAL_SUCCESS =
|
27
|
-
SKIPPED =
|
22
|
+
PENDING = "pending"
|
23
|
+
STARTED = "started"
|
24
|
+
FAILURE = "failure"
|
25
|
+
SUCCESS = "success"
|
26
|
+
PARTIAL_SUCCESS = "partial_success"
|
27
|
+
SKIPPED = "skipped"
|
28
28
|
|
29
29
|
|
30
30
|
class InputFormat(str, Enum):
|
@@ -89,15 +89,15 @@ MimeTypeToFormat = {
|
|
89
89
|
|
90
90
|
|
91
91
|
class DocInputType(str, Enum):
|
92
|
-
PATH =
|
93
|
-
STREAM =
|
92
|
+
PATH = "path"
|
93
|
+
STREAM = "stream"
|
94
94
|
|
95
95
|
|
96
96
|
class DoclingComponentType(str, Enum):
|
97
|
-
DOCUMENT_BACKEND =
|
98
|
-
MODEL =
|
99
|
-
DOC_ASSEMBLER =
|
100
|
-
USER_INPUT =
|
97
|
+
DOCUMENT_BACKEND = "document_backend"
|
98
|
+
MODEL = "model"
|
99
|
+
DOC_ASSEMBLER = "doc_assembler"
|
100
|
+
USER_INPUT = "user_input"
|
101
101
|
|
102
102
|
|
103
103
|
class ErrorItem(BaseModel):
|
@@ -1,8 +1,66 @@
|
|
1
|
+
import logging
|
2
|
+
import os
|
3
|
+
import warnings
|
1
4
|
from enum import Enum
|
2
5
|
from pathlib import Path
|
3
|
-
from typing import List, Literal, Optional, Union
|
6
|
+
from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union
|
4
7
|
|
5
|
-
from pydantic import BaseModel, ConfigDict, Field
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
9
|
+
from pydantic_settings import (
|
10
|
+
BaseSettings,
|
11
|
+
PydanticBaseSettingsSource,
|
12
|
+
SettingsConfigDict,
|
13
|
+
)
|
14
|
+
from typing_extensions import deprecated
|
15
|
+
|
16
|
+
_log = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
|
19
|
+
class AcceleratorDevice(str, Enum):
|
20
|
+
"""Devices to run model inference"""
|
21
|
+
|
22
|
+
AUTO = "auto"
|
23
|
+
CPU = "cpu"
|
24
|
+
CUDA = "cuda"
|
25
|
+
MPS = "mps"
|
26
|
+
|
27
|
+
|
28
|
+
class AcceleratorOptions(BaseSettings):
|
29
|
+
model_config = SettingsConfigDict(
|
30
|
+
env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
|
31
|
+
)
|
32
|
+
|
33
|
+
num_threads: int = 4
|
34
|
+
device: AcceleratorDevice = AcceleratorDevice.AUTO
|
35
|
+
|
36
|
+
@model_validator(mode="before")
|
37
|
+
@classmethod
|
38
|
+
def check_alternative_envvars(cls, data: Any) -> Any:
|
39
|
+
r"""
|
40
|
+
Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
|
41
|
+
The alternative envvar is used only if it is valid and the regular envvar is not set.
|
42
|
+
|
43
|
+
Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
|
44
|
+
the same functionality. In case the alias envvar is set and the user tries to override the
|
45
|
+
parameter in settings initialization, Pydantic treats the parameter provided in __init__()
|
46
|
+
as an extra input instead of simply overwriting the evvar value for that parameter.
|
47
|
+
"""
|
48
|
+
if isinstance(data, dict):
|
49
|
+
input_num_threads = data.get("num_threads")
|
50
|
+
|
51
|
+
# Check if to set the num_threads from the alternative envvar
|
52
|
+
if input_num_threads is None:
|
53
|
+
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
|
54
|
+
omp_num_threads = os.getenv("OMP_NUM_THREADS")
|
55
|
+
if docling_num_threads is None and omp_num_threads is not None:
|
56
|
+
try:
|
57
|
+
data["num_threads"] = int(omp_num_threads)
|
58
|
+
except ValueError:
|
59
|
+
_log.error(
|
60
|
+
"Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
|
61
|
+
omp_num_threads,
|
62
|
+
)
|
63
|
+
return data
|
6
64
|
|
7
65
|
|
8
66
|
class TableFormerMode(str, Enum):
|
@@ -78,9 +136,11 @@ class EasyOcrOptions(OcrOptions):
|
|
78
136
|
|
79
137
|
kind: Literal["easyocr"] = "easyocr"
|
80
138
|
lang: List[str] = ["fr", "de", "es", "en"]
|
81
|
-
|
139
|
+
|
140
|
+
use_gpu: Optional[bool] = None
|
141
|
+
|
82
142
|
model_storage_directory: Optional[str] = None
|
83
|
-
download_enabled: bool = True
|
143
|
+
download_enabled: bool = True
|
84
144
|
|
85
145
|
model_config = ConfigDict(
|
86
146
|
extra="forbid",
|
@@ -126,12 +186,34 @@ class OcrMacOptions(OcrOptions):
|
|
126
186
|
)
|
127
187
|
|
128
188
|
|
189
|
+
# Define an enum for the backend options
|
190
|
+
class PdfBackend(str, Enum):
|
191
|
+
"""Enum of valid PDF backends."""
|
192
|
+
|
193
|
+
PYPDFIUM2 = "pypdfium2"
|
194
|
+
DLPARSE_V1 = "dlparse_v1"
|
195
|
+
DLPARSE_V2 = "dlparse_v2"
|
196
|
+
|
197
|
+
|
198
|
+
# Define an enum for the ocr engines
|
199
|
+
class OcrEngine(str, Enum):
|
200
|
+
"""Enum of valid OCR engines."""
|
201
|
+
|
202
|
+
EASYOCR = "easyocr"
|
203
|
+
TESSERACT_CLI = "tesseract_cli"
|
204
|
+
TESSERACT = "tesseract"
|
205
|
+
OCRMAC = "ocrmac"
|
206
|
+
RAPIDOCR = "rapidocr"
|
207
|
+
|
208
|
+
|
129
209
|
class PipelineOptions(BaseModel):
|
130
210
|
"""Base pipeline options."""
|
131
211
|
|
132
212
|
create_legacy_output: bool = (
|
133
|
-
True # This
|
213
|
+
True # This default will be set to False on a future version of docling
|
134
214
|
)
|
215
|
+
document_timeout: Optional[float] = None
|
216
|
+
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
135
217
|
|
136
218
|
|
137
219
|
class PdfPipelineOptions(PipelineOptions):
|
docling/models/ds_glm_model.py
CHANGED
@@ -3,8 +3,7 @@ import random
|
|
3
3
|
from pathlib import Path
|
4
4
|
from typing import List, Union
|
5
5
|
|
6
|
-
from deepsearch_glm.
|
7
|
-
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
6
|
+
from deepsearch_glm.andromeda_nlp import nlp_model
|
8
7
|
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
|
9
8
|
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
10
9
|
from docling_core.types.legacy_doc.base import (
|
@@ -43,9 +42,7 @@ class GlmModel:
|
|
43
42
|
def __init__(self, options: GlmOptions):
|
44
43
|
self.options = options
|
45
44
|
|
46
|
-
|
47
|
-
load_pretrained_nlp_models()
|
48
|
-
self.model = init_nlp_model(model_names=self.options.model_names)
|
45
|
+
self.model = nlp_model(loglevel="error", text_ordering=True)
|
49
46
|
|
50
47
|
def _to_legacy_document(self, conv_res) -> DsDocument:
|
51
48
|
title = ""
|
docling/models/easyocr_model.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import logging
|
2
|
+
import warnings
|
2
3
|
from typing import Iterable
|
3
4
|
|
4
5
|
import numpy
|
@@ -7,16 +8,26 @@ from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
7
8
|
|
8
9
|
from docling.datamodel.base_models import Cell, OcrCell, Page
|
9
10
|
from docling.datamodel.document import ConversionResult
|
10
|
-
from docling.datamodel.pipeline_options import
|
11
|
+
from docling.datamodel.pipeline_options import (
|
12
|
+
AcceleratorDevice,
|
13
|
+
AcceleratorOptions,
|
14
|
+
EasyOcrOptions,
|
15
|
+
)
|
11
16
|
from docling.datamodel.settings import settings
|
12
17
|
from docling.models.base_ocr_model import BaseOcrModel
|
18
|
+
from docling.utils.accelerator_utils import decide_device
|
13
19
|
from docling.utils.profiling import TimeRecorder
|
14
20
|
|
15
21
|
_log = logging.getLogger(__name__)
|
16
22
|
|
17
23
|
|
18
24
|
class EasyOcrModel(BaseOcrModel):
|
19
|
-
def __init__(
|
25
|
+
def __init__(
|
26
|
+
self,
|
27
|
+
enabled: bool,
|
28
|
+
options: EasyOcrOptions,
|
29
|
+
accelerator_options: AcceleratorOptions,
|
30
|
+
):
|
20
31
|
super().__init__(enabled=enabled, options=options)
|
21
32
|
self.options: EasyOcrOptions
|
22
33
|
|
@@ -31,11 +42,32 @@ class EasyOcrModel(BaseOcrModel):
|
|
31
42
|
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
32
43
|
)
|
33
44
|
|
45
|
+
if self.options.use_gpu is None:
|
46
|
+
device = decide_device(accelerator_options.device)
|
47
|
+
# Enable easyocr GPU if running on CUDA, MPS
|
48
|
+
use_gpu = any(
|
49
|
+
[
|
50
|
+
device.startswith(x)
|
51
|
+
for x in [
|
52
|
+
AcceleratorDevice.CUDA.value,
|
53
|
+
AcceleratorDevice.MPS.value,
|
54
|
+
]
|
55
|
+
]
|
56
|
+
)
|
57
|
+
else:
|
58
|
+
warnings.warn(
|
59
|
+
"Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
|
60
|
+
"When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
|
61
|
+
"to run EasyOCR. Otherwise, EasyOCR runs in CPU."
|
62
|
+
)
|
63
|
+
use_gpu = self.options.use_gpu
|
64
|
+
|
34
65
|
self.reader = easyocr.Reader(
|
35
66
|
lang_list=self.options.lang,
|
36
|
-
gpu=
|
67
|
+
gpu=use_gpu,
|
37
68
|
model_storage_directory=self.options.model_storage_directory,
|
38
69
|
download_enabled=self.options.download_enabled,
|
70
|
+
verbose=False,
|
39
71
|
)
|
40
72
|
|
41
73
|
def __call__(
|
docling/models/layout_model.py
CHANGED
@@ -9,6 +9,7 @@ from docling_core.types.doc import CoordOrigin, DocItemLabel
|
|
9
9
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
10
10
|
from PIL import ImageDraw
|
11
11
|
|
12
|
+
import docling.utils.layout_utils as lu
|
12
13
|
from docling.datamodel.base_models import (
|
13
14
|
BoundingBox,
|
14
15
|
Cell,
|
@@ -17,9 +18,10 @@ from docling.datamodel.base_models import (
|
|
17
18
|
Page,
|
18
19
|
)
|
19
20
|
from docling.datamodel.document import ConversionResult
|
21
|
+
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
|
20
22
|
from docling.datamodel.settings import settings
|
21
23
|
from docling.models.base_model import BasePageModel
|
22
|
-
from docling.utils import
|
24
|
+
from docling.utils.accelerator_utils import decide_device
|
23
25
|
from docling.utils.profiling import TimeRecorder
|
24
26
|
|
25
27
|
_log = logging.getLogger(__name__)
|
@@ -46,8 +48,16 @@ class LayoutModel(BasePageModel):
|
|
46
48
|
FIGURE_LABEL = DocItemLabel.PICTURE
|
47
49
|
FORMULA_LABEL = DocItemLabel.FORMULA
|
48
50
|
|
49
|
-
def __init__(self, artifacts_path: Path):
|
50
|
-
|
51
|
+
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
|
52
|
+
device = decide_device(accelerator_options.device)
|
53
|
+
|
54
|
+
self.layout_predictor = LayoutPredictor(
|
55
|
+
artifact_path=str(artifacts_path),
|
56
|
+
device=device,
|
57
|
+
num_threads=accelerator_options.num_threads,
|
58
|
+
base_threshold=0.6,
|
59
|
+
blacklist_classes={"Form", "Key-Value Region"},
|
60
|
+
)
|
51
61
|
|
52
62
|
def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
|
53
63
|
MIN_INTERSECTION = 0.2
|
@@ -6,16 +6,26 @@ from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
6
6
|
|
7
7
|
from docling.datamodel.base_models import OcrCell, Page
|
8
8
|
from docling.datamodel.document import ConversionResult
|
9
|
-
from docling.datamodel.pipeline_options import
|
9
|
+
from docling.datamodel.pipeline_options import (
|
10
|
+
AcceleratorDevice,
|
11
|
+
AcceleratorOptions,
|
12
|
+
RapidOcrOptions,
|
13
|
+
)
|
10
14
|
from docling.datamodel.settings import settings
|
11
15
|
from docling.models.base_ocr_model import BaseOcrModel
|
16
|
+
from docling.utils.accelerator_utils import decide_device
|
12
17
|
from docling.utils.profiling import TimeRecorder
|
13
18
|
|
14
19
|
_log = logging.getLogger(__name__)
|
15
20
|
|
16
21
|
|
17
22
|
class RapidOcrModel(BaseOcrModel):
|
18
|
-
def __init__(
|
23
|
+
def __init__(
|
24
|
+
self,
|
25
|
+
enabled: bool,
|
26
|
+
options: RapidOcrOptions,
|
27
|
+
accelerator_options: AcceleratorOptions,
|
28
|
+
):
|
19
29
|
super().__init__(enabled=enabled, options=options)
|
20
30
|
self.options: RapidOcrOptions
|
21
31
|
|
@@ -30,52 +40,21 @@ class RapidOcrModel(BaseOcrModel):
|
|
30
40
|
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
31
41
|
)
|
32
42
|
|
33
|
-
#
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
det_use_dml = True
|
39
|
-
cls_use_dml = True
|
40
|
-
rec_use_dml = True
|
41
|
-
|
42
|
-
# # Same as Defaults in RapidOCR
|
43
|
-
# cls_use_cuda = False
|
44
|
-
# rec_use_cuda = False
|
45
|
-
# det_use_cuda = False
|
46
|
-
# det_use_dml = False
|
47
|
-
# cls_use_dml = False
|
48
|
-
# rec_use_dml = False
|
49
|
-
|
50
|
-
# # If we set everything to true onnx-runtime would automatically choose the fastest accelerator
|
51
|
-
# if self.options.device == self.options.Device.AUTO:
|
52
|
-
# cls_use_cuda = True
|
53
|
-
# rec_use_cuda = True
|
54
|
-
# det_use_cuda = True
|
55
|
-
# det_use_dml = True
|
56
|
-
# cls_use_dml = True
|
57
|
-
# rec_use_dml = True
|
58
|
-
|
59
|
-
# # If we set use_cuda to true onnx would use the cuda device available in runtime if no cuda device is available it would run on CPU.
|
60
|
-
# elif self.options.device == self.options.Device.CUDA:
|
61
|
-
# cls_use_cuda = True
|
62
|
-
# rec_use_cuda = True
|
63
|
-
# det_use_cuda = True
|
64
|
-
|
65
|
-
# # If we set use_dml to true onnx would use the dml device available in runtime if no dml device is available it would work on CPU.
|
66
|
-
# elif self.options.device == self.options.Device.DIRECTML:
|
67
|
-
# det_use_dml = True
|
68
|
-
# cls_use_dml = True
|
69
|
-
# rec_use_dml = True
|
43
|
+
# Decide the accelerator devices
|
44
|
+
device = decide_device(accelerator_options.device)
|
45
|
+
use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
|
46
|
+
use_dml = accelerator_options.device == AcceleratorDevice.AUTO
|
47
|
+
intra_op_num_threads = accelerator_options.num_threads
|
70
48
|
|
71
49
|
self.reader = RapidOCR(
|
72
50
|
text_score=self.options.text_score,
|
73
|
-
cls_use_cuda=
|
74
|
-
rec_use_cuda=
|
75
|
-
det_use_cuda=
|
76
|
-
det_use_dml=
|
77
|
-
cls_use_dml=
|
78
|
-
rec_use_dml=
|
51
|
+
cls_use_cuda=use_cuda,
|
52
|
+
rec_use_cuda=use_cuda,
|
53
|
+
det_use_cuda=use_cuda,
|
54
|
+
det_use_dml=use_dml,
|
55
|
+
cls_use_dml=use_dml,
|
56
|
+
rec_use_dml=use_dml,
|
57
|
+
intra_op_num_threads=intra_op_num_threads,
|
79
58
|
print_verbose=self.options.print_verbose,
|
80
59
|
det_model_path=self.options.det_model_path,
|
81
60
|
cls_model_path=self.options.cls_model_path,
|
@@ -118,24 +97,25 @@ class RapidOcrModel(BaseOcrModel):
|
|
118
97
|
del high_res_image
|
119
98
|
del im
|
120
99
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
(
|
129
|
-
|
130
|
-
|
131
|
-
|
100
|
+
if result is not None:
|
101
|
+
cells = [
|
102
|
+
OcrCell(
|
103
|
+
id=ix,
|
104
|
+
text=line[1],
|
105
|
+
confidence=line[2],
|
106
|
+
bbox=BoundingBox.from_tuple(
|
107
|
+
coord=(
|
108
|
+
(line[0][0][0] / self.scale) + ocr_rect.l,
|
109
|
+
(line[0][0][1] / self.scale) + ocr_rect.t,
|
110
|
+
(line[0][2][0] / self.scale) + ocr_rect.l,
|
111
|
+
(line[0][2][1] / self.scale) + ocr_rect.t,
|
112
|
+
),
|
113
|
+
origin=CoordOrigin.TOPLEFT,
|
132
114
|
),
|
133
|
-
|
134
|
-
)
|
135
|
-
|
136
|
-
|
137
|
-
]
|
138
|
-
all_ocr_cells.extend(cells)
|
115
|
+
)
|
116
|
+
for ix, line in enumerate(result)
|
117
|
+
]
|
118
|
+
all_ocr_cells.extend(cells)
|
139
119
|
|
140
120
|
# Post-process the cells
|
141
121
|
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
@@ -9,15 +9,25 @@ from PIL import ImageDraw
|
|
9
9
|
|
10
10
|
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
11
11
|
from docling.datamodel.document import ConversionResult
|
12
|
-
from docling.datamodel.pipeline_options import
|
12
|
+
from docling.datamodel.pipeline_options import (
|
13
|
+
AcceleratorDevice,
|
14
|
+
AcceleratorOptions,
|
15
|
+
TableFormerMode,
|
16
|
+
TableStructureOptions,
|
17
|
+
)
|
13
18
|
from docling.datamodel.settings import settings
|
14
19
|
from docling.models.base_model import BasePageModel
|
20
|
+
from docling.utils.accelerator_utils import decide_device
|
15
21
|
from docling.utils.profiling import TimeRecorder
|
16
22
|
|
17
23
|
|
18
24
|
class TableStructureModel(BasePageModel):
|
19
25
|
def __init__(
|
20
|
-
self,
|
26
|
+
self,
|
27
|
+
enabled: bool,
|
28
|
+
artifacts_path: Path,
|
29
|
+
options: TableStructureOptions,
|
30
|
+
accelerator_options: AcceleratorOptions,
|
21
31
|
):
|
22
32
|
self.options = options
|
23
33
|
self.do_cell_matching = self.options.do_cell_matching
|
@@ -26,16 +36,26 @@ class TableStructureModel(BasePageModel):
|
|
26
36
|
self.enabled = enabled
|
27
37
|
if self.enabled:
|
28
38
|
if self.mode == TableFormerMode.ACCURATE:
|
29
|
-
artifacts_path = artifacts_path / "
|
39
|
+
artifacts_path = artifacts_path / "accurate"
|
40
|
+
else:
|
41
|
+
artifacts_path = artifacts_path / "fast"
|
30
42
|
|
31
43
|
# Third Party
|
32
44
|
import docling_ibm_models.tableformer.common as c
|
33
45
|
|
46
|
+
device = decide_device(accelerator_options.device)
|
47
|
+
|
48
|
+
# Disable MPS here, until we know why it makes things slower.
|
49
|
+
if device == AcceleratorDevice.MPS.value:
|
50
|
+
device = AcceleratorDevice.CPU.value
|
51
|
+
|
34
52
|
self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
|
35
53
|
self.tm_config["model"]["save_dir"] = artifacts_path
|
36
54
|
self.tm_model_type = self.tm_config["model"]["type"]
|
37
55
|
|
38
|
-
self.tf_predictor = TFPredictor(
|
56
|
+
self.tf_predictor = TFPredictor(
|
57
|
+
self.tm_config, device, accelerator_options.num_threads
|
58
|
+
)
|
39
59
|
self.scale = 2.0 # Scale up table input images to 144 dpi
|
40
60
|
|
41
61
|
def draw_table_and_cells(
|
@@ -126,6 +126,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
126
126
|
# conv_res.status = ConversionStatus.FAILURE
|
127
127
|
# return conv_res
|
128
128
|
|
129
|
+
total_elapsed_time = 0.0
|
129
130
|
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
130
131
|
|
131
132
|
for i in range(0, conv_res.input.page_count):
|
@@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
136
137
|
for page_batch in chunkify(
|
137
138
|
conv_res.pages, settings.perf.page_batch_size
|
138
139
|
):
|
139
|
-
|
140
|
+
start_batch_time = time.monotonic()
|
140
141
|
|
141
142
|
# 1. Initialise the page resources
|
142
143
|
init_pages = map(
|
@@ -149,8 +150,21 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
149
150
|
for p in pipeline_pages: # Must exhaust!
|
150
151
|
pass
|
151
152
|
|
152
|
-
|
153
|
-
|
153
|
+
end_batch_time = time.monotonic()
|
154
|
+
total_elapsed_time += end_batch_time - start_batch_time
|
155
|
+
if (
|
156
|
+
self.pipeline_options.document_timeout is not None
|
157
|
+
and total_elapsed_time > self.pipeline_options.document_timeout
|
158
|
+
):
|
159
|
+
_log.warning(
|
160
|
+
f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
|
161
|
+
)
|
162
|
+
conv_res.status = ConversionStatus.PARTIAL_SUCCESS
|
163
|
+
break
|
164
|
+
|
165
|
+
_log.debug(
|
166
|
+
f"Finished converting page batch time={end_batch_time:.3f}"
|
167
|
+
)
|
154
168
|
|
155
169
|
except Exception as e:
|
156
170
|
conv_res.status = ConversionStatus.FAILURE
|
@@ -38,7 +38,7 @@ _log = logging.getLogger(__name__)
|
|
38
38
|
|
39
39
|
|
40
40
|
class StandardPdfPipeline(PaginatedPipeline):
|
41
|
-
_layout_model_path = "model_artifacts/layout
|
41
|
+
_layout_model_path = "model_artifacts/layout"
|
42
42
|
_table_model_path = "model_artifacts/tableformer"
|
43
43
|
|
44
44
|
def __init__(self, pipeline_options: PdfPipelineOptions):
|
@@ -75,7 +75,8 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
75
75
|
# Layout model
|
76
76
|
LayoutModel(
|
77
77
|
artifacts_path=self.artifacts_path
|
78
|
-
/ StandardPdfPipeline._layout_model_path
|
78
|
+
/ StandardPdfPipeline._layout_model_path,
|
79
|
+
accelerator_options=pipeline_options.accelerator_options,
|
79
80
|
),
|
80
81
|
# Table structure model
|
81
82
|
TableStructureModel(
|
@@ -83,6 +84,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
83
84
|
artifacts_path=self.artifacts_path
|
84
85
|
/ StandardPdfPipeline._table_model_path,
|
85
86
|
options=pipeline_options.table_structure_options,
|
87
|
+
accelerator_options=pipeline_options.accelerator_options,
|
86
88
|
),
|
87
89
|
# Page assemble
|
88
90
|
PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
|
@@ -104,7 +106,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
104
106
|
repo_id="ds4sd/docling-models",
|
105
107
|
force_download=force,
|
106
108
|
local_dir=local_dir,
|
107
|
-
revision="v2.0
|
109
|
+
revision="v2.1.0",
|
108
110
|
)
|
109
111
|
|
110
112
|
return Path(download_path)
|
@@ -114,6 +116,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
114
116
|
return EasyOcrModel(
|
115
117
|
enabled=self.pipeline_options.do_ocr,
|
116
118
|
options=self.pipeline_options.ocr_options,
|
119
|
+
accelerator_options=self.pipeline_options.accelerator_options,
|
117
120
|
)
|
118
121
|
elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
|
119
122
|
return TesseractOcrCliModel(
|
@@ -129,6 +132,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
129
132
|
return RapidOcrModel(
|
130
133
|
enabled=self.pipeline_options.do_ocr,
|
131
134
|
options=self.pipeline_options.ocr_options,
|
135
|
+
accelerator_options=self.pipeline_options.accelerator_options,
|
132
136
|
)
|
133
137
|
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
|
134
138
|
if "darwin" != sys.platform:
|
@@ -0,0 +1,42 @@
|
|
1
|
+
import logging
|
2
|
+
|
3
|
+
import torch
|
4
|
+
|
5
|
+
from docling.datamodel.pipeline_options import AcceleratorDevice
|
6
|
+
|
7
|
+
_log = logging.getLogger(__name__)
|
8
|
+
|
9
|
+
|
10
|
+
def decide_device(accelerator_device: AcceleratorDevice) -> str:
|
11
|
+
r"""
|
12
|
+
Resolve the device based on the acceleration options and the available devices in the system
|
13
|
+
Rules:
|
14
|
+
1. AUTO: Check for the best available device on the system.
|
15
|
+
2. User-defined: Check if the device actually exists, otherwise fall-back to CPU
|
16
|
+
"""
|
17
|
+
cuda_index = 0
|
18
|
+
device = "cpu"
|
19
|
+
|
20
|
+
has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
|
21
|
+
has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
|
22
|
+
|
23
|
+
if accelerator_device == AcceleratorDevice.AUTO:
|
24
|
+
if has_cuda:
|
25
|
+
device = f"cuda:{cuda_index}"
|
26
|
+
elif has_mps:
|
27
|
+
device = "mps"
|
28
|
+
|
29
|
+
else:
|
30
|
+
if accelerator_device == AcceleratorDevice.CUDA:
|
31
|
+
if has_cuda:
|
32
|
+
device = f"cuda:{cuda_index}"
|
33
|
+
else:
|
34
|
+
_log.warning("CUDA is not available in the system. Fall back to 'CPU'")
|
35
|
+
elif accelerator_device == AcceleratorDevice.MPS:
|
36
|
+
if has_mps:
|
37
|
+
device = "mps"
|
38
|
+
else:
|
39
|
+
_log.warning("MPS is not available in the system. Fall back to 'CPU'")
|
40
|
+
|
41
|
+
_log.info("Accelerator device: '%s'", device)
|
42
|
+
return device
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.12.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -27,7 +27,7 @@ Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
|
27
27
|
Requires-Dist: certifi (>=2024.7.4)
|
28
28
|
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
29
29
|
Requires-Dist: docling-core[chunking] (>=2.9.0,<3.0.0)
|
30
|
-
Requires-Dist: docling-ibm-models (>=
|
30
|
+
Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
|
31
31
|
Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
|
32
32
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
33
33
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
@@ -13,40 +13,41 @@ docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6
|
|
13
13
|
docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
|
14
14
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
15
15
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
-
docling/cli/main.py,sha256=
|
16
|
+
docling/cli/main.py,sha256=SdavhL0VTApK9JrKz0Pc1IYdnQhK-0OOaGT8zlTiN5c,15022
|
17
17
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
-
docling/datamodel/base_models.py,sha256=
|
18
|
+
docling/datamodel/base_models.py,sha256=vwy59eDrkzCSaay24RlUvx4zEyuaUukOdOhw3622u2I,5616
|
19
19
|
docling/datamodel/document.py,sha256=GNlTsgKgDqdqv2dfhpYmnqymxDQWWWC8HgE8uAta8V4,10265
|
20
|
-
docling/datamodel/pipeline_options.py,sha256=
|
20
|
+
docling/datamodel/pipeline_options.py,sha256=AJxnc3lHAlomkXcm-g68wylrKp1_2dttO1HQBNXleME,7649
|
21
21
|
docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
|
22
22
|
docling/document_converter.py,sha256=Iz5eerBWFPVJoXAMlXEivRQX2VLBiUkA07BL4NNbaEs,11583
|
23
23
|
docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
|
24
24
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
25
25
|
docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
|
26
26
|
docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
|
27
|
-
docling/models/ds_glm_model.py,sha256=
|
28
|
-
docling/models/easyocr_model.py,sha256=
|
29
|
-
docling/models/layout_model.py,sha256=
|
27
|
+
docling/models/ds_glm_model.py,sha256=YJkGxV46wh7G2Wr4vVzt9b8oewkUDPWpvI6AEaZDrs0,11872
|
28
|
+
docling/models/easyocr_model.py,sha256=q9GWMRte-D7sleSb5tnTReWsx4vOeqnMEaguxDFdFms,4856
|
29
|
+
docling/models/layout_model.py,sha256=v7EvFYFtFVMa-UeXCR644sk6mbX9EvEVG5jRoDli7II,14450
|
30
30
|
docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
|
31
31
|
docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th_eB-cLEk,7103
|
32
32
|
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
33
|
-
docling/models/rapid_ocr_model.py,sha256=
|
34
|
-
docling/models/table_structure_model.py,sha256
|
33
|
+
docling/models/rapid_ocr_model.py,sha256=LOIvczJs3_db2o8mtrKk-pIXgC-xqWqRLu2cjA3wvy4,4980
|
34
|
+
docling/models/table_structure_model.py,sha256=g5u42ptUEtqPfFATAEBtDDjkNcEIzIBhunoT8DpYra8,9010
|
35
35
|
docling/models/tesseract_ocr_cli_model.py,sha256=aKQBaty4cYu6zG_C5uy6Zm3eeRQo5fxIierbKixa2kc,6622
|
36
36
|
docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
|
37
37
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
38
|
-
docling/pipeline/base_pipeline.py,sha256=
|
38
|
+
docling/pipeline/base_pipeline.py,sha256=hVvtk5E4DVZdl_SyNs_pYRUjN9C8PABhpVaeN5Z_fAY,7885
|
39
39
|
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
40
|
-
docling/pipeline/standard_pdf_pipeline.py,sha256=
|
40
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=iXjVLy-9q82jrU_0AZTkbz3ccrqz4WiRLYD-epxG5BQ,9174
|
41
41
|
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
42
42
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
43
|
+
docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbSPev80,1367
|
43
44
|
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
44
45
|
docling/utils/glm_utils.py,sha256=H1O_tDiRksMgw45rY9LhK6GjcZSOq5IyoGurGjoo-Ac,11211
|
45
46
|
docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
|
46
47
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
47
48
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
48
|
-
docling-2.
|
49
|
-
docling-2.
|
50
|
-
docling-2.
|
51
|
-
docling-2.
|
52
|
-
docling-2.
|
49
|
+
docling-2.12.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
50
|
+
docling-2.12.0.dist-info/METADATA,sha256=tltJX40w5aC-5oNy7FrMxhRuJ42YFY2fUcUXu9vpo14,7731
|
51
|
+
docling-2.12.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
52
|
+
docling-2.12.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
53
|
+
docling-2.12.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|