docling 2.11.0__py3-none-any.whl → 2.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/uspto_backend.py +1888 -0
- docling/cli/main.py +8 -0
- docling/datamodel/base_models.py +18 -4
- docling/datamodel/document.py +77 -13
- docling/datamodel/pipeline_options.py +68 -4
- docling/datamodel/settings.py +1 -0
- docling/document_converter.py +11 -2
- docling/models/ds_glm_model.py +34 -4
- docling/models/easyocr_model.py +37 -3
- docling/models/layout_model.py +144 -280
- docling/models/page_assemble_model.py +11 -1
- docling/models/rapid_ocr_model.py +24 -45
- docling/models/table_structure_model.py +49 -33
- docling/pipeline/base_pipeline.py +3 -1
- docling/pipeline/standard_pdf_pipeline.py +7 -3
- docling/utils/accelerator_utils.py +42 -0
- docling/utils/glm_utils.py +11 -3
- docling/utils/layout_postprocessor.py +666 -0
- {docling-2.11.0.dist-info → docling-2.13.0.dist-info}/METADATA +3 -3
- {docling-2.11.0.dist-info → docling-2.13.0.dist-info}/RECORD +24 -21
- docling/utils/layout_utils.py +0 -812
- {docling-2.11.0.dist-info → docling-2.13.0.dist-info}/LICENSE +0 -0
- {docling-2.11.0.dist-info → docling-2.13.0.dist-info}/WHEEL +0 -0
- {docling-2.11.0.dist-info → docling-2.13.0.dist-info}/entry_points.txt +0 -0
@@ -9,15 +9,25 @@ from PIL import ImageDraw
|
|
9
9
|
|
10
10
|
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
11
11
|
from docling.datamodel.document import ConversionResult
|
12
|
-
from docling.datamodel.pipeline_options import
|
12
|
+
from docling.datamodel.pipeline_options import (
|
13
|
+
AcceleratorDevice,
|
14
|
+
AcceleratorOptions,
|
15
|
+
TableFormerMode,
|
16
|
+
TableStructureOptions,
|
17
|
+
)
|
13
18
|
from docling.datamodel.settings import settings
|
14
19
|
from docling.models.base_model import BasePageModel
|
20
|
+
from docling.utils.accelerator_utils import decide_device
|
15
21
|
from docling.utils.profiling import TimeRecorder
|
16
22
|
|
17
23
|
|
18
24
|
class TableStructureModel(BasePageModel):
|
19
25
|
def __init__(
|
20
|
-
self,
|
26
|
+
self,
|
27
|
+
enabled: bool,
|
28
|
+
artifacts_path: Path,
|
29
|
+
options: TableStructureOptions,
|
30
|
+
accelerator_options: AcceleratorOptions,
|
21
31
|
):
|
22
32
|
self.options = options
|
23
33
|
self.do_cell_matching = self.options.do_cell_matching
|
@@ -26,16 +36,26 @@ class TableStructureModel(BasePageModel):
|
|
26
36
|
self.enabled = enabled
|
27
37
|
if self.enabled:
|
28
38
|
if self.mode == TableFormerMode.ACCURATE:
|
29
|
-
artifacts_path = artifacts_path / "
|
39
|
+
artifacts_path = artifacts_path / "accurate"
|
40
|
+
else:
|
41
|
+
artifacts_path = artifacts_path / "fast"
|
30
42
|
|
31
43
|
# Third Party
|
32
44
|
import docling_ibm_models.tableformer.common as c
|
33
45
|
|
46
|
+
device = decide_device(accelerator_options.device)
|
47
|
+
|
48
|
+
# Disable MPS here, until we know why it makes things slower.
|
49
|
+
if device == AcceleratorDevice.MPS.value:
|
50
|
+
device = AcceleratorDevice.CPU.value
|
51
|
+
|
34
52
|
self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
|
35
53
|
self.tm_config["model"]["save_dir"] = artifacts_path
|
36
54
|
self.tm_model_type = self.tm_config["model"]["type"]
|
37
55
|
|
38
|
-
self.tf_predictor = TFPredictor(
|
56
|
+
self.tf_predictor = TFPredictor(
|
57
|
+
self.tm_config, device, accelerator_options.num_threads
|
58
|
+
)
|
39
59
|
self.scale = 2.0 # Scale up table input images to 144 dpi
|
40
60
|
|
41
61
|
def draw_table_and_cells(
|
@@ -56,6 +76,10 @@ class TableStructureModel(BasePageModel):
|
|
56
76
|
x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
|
57
77
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
58
78
|
|
79
|
+
for cell in table_element.cluster.cells:
|
80
|
+
x0, y0, x1, y1 = cell.bbox.as_tuple()
|
81
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
82
|
+
|
59
83
|
for tc in table_element.table_cells:
|
60
84
|
if tc.bbox is not None:
|
61
85
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
@@ -69,7 +93,6 @@ class TableStructureModel(BasePageModel):
|
|
69
93
|
text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
|
70
94
|
fill="black",
|
71
95
|
)
|
72
|
-
|
73
96
|
if show:
|
74
97
|
image.show()
|
75
98
|
else:
|
@@ -115,47 +138,40 @@ class TableStructureModel(BasePageModel):
|
|
115
138
|
],
|
116
139
|
)
|
117
140
|
for cluster in page.predictions.layout.clusters
|
118
|
-
if cluster.label
|
141
|
+
if cluster.label
|
142
|
+
in [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
|
119
143
|
]
|
120
144
|
if not len(in_tables):
|
121
145
|
yield page
|
122
146
|
continue
|
123
147
|
|
124
|
-
tokens = []
|
125
|
-
for c in page.cells:
|
126
|
-
for cluster, _ in in_tables:
|
127
|
-
if c.bbox.area() > 0:
|
128
|
-
if (
|
129
|
-
c.bbox.intersection_area_with(cluster.bbox)
|
130
|
-
/ c.bbox.area()
|
131
|
-
> 0.2
|
132
|
-
):
|
133
|
-
# Only allow non empty stings (spaces) into the cells of a table
|
134
|
-
if len(c.text.strip()) > 0:
|
135
|
-
new_cell = copy.deepcopy(c)
|
136
|
-
new_cell.bbox = new_cell.bbox.scaled(
|
137
|
-
scale=self.scale
|
138
|
-
)
|
139
|
-
|
140
|
-
tokens.append(new_cell.model_dump())
|
141
|
-
|
142
148
|
page_input = {
|
143
|
-
"tokens": tokens,
|
144
149
|
"width": page.size.width * self.scale,
|
145
150
|
"height": page.size.height * self.scale,
|
151
|
+
"image": numpy.asarray(page.get_image(scale=self.scale)),
|
146
152
|
}
|
147
|
-
page_input["image"] = numpy.asarray(
|
148
|
-
page.get_image(scale=self.scale)
|
149
|
-
)
|
150
153
|
|
151
154
|
table_clusters, table_bboxes = zip(*in_tables)
|
152
155
|
|
153
156
|
if len(table_bboxes):
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
+
for table_cluster, tbl_box in in_tables:
|
158
|
+
|
159
|
+
tokens = []
|
160
|
+
for c in table_cluster.cells:
|
161
|
+
# Only allow non empty stings (spaces) into the cells of a table
|
162
|
+
if len(c.text.strip()) > 0:
|
163
|
+
new_cell = copy.deepcopy(c)
|
164
|
+
new_cell.bbox = new_cell.bbox.scaled(
|
165
|
+
scale=self.scale
|
166
|
+
)
|
167
|
+
|
168
|
+
tokens.append(new_cell.model_dump())
|
169
|
+
page_input["tokens"] = tokens
|
157
170
|
|
158
|
-
|
171
|
+
tf_output = self.tf_predictor.multi_table_predict(
|
172
|
+
page_input, [tbl_box], do_matching=self.do_cell_matching
|
173
|
+
)
|
174
|
+
table_out = tf_output[0]
|
159
175
|
table_cells = []
|
160
176
|
for element in table_out["tf_responses"]:
|
161
177
|
|
@@ -188,7 +204,7 @@ class TableStructureModel(BasePageModel):
|
|
188
204
|
id=table_cluster.id,
|
189
205
|
page_no=page.page_no,
|
190
206
|
cluster=table_cluster,
|
191
|
-
label=
|
207
|
+
label=table_cluster.label,
|
192
208
|
)
|
193
209
|
|
194
210
|
page.predictions.tablestructure.table_map[
|
@@ -168,7 +168,9 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
168
168
|
|
169
169
|
except Exception as e:
|
170
170
|
conv_res.status = ConversionStatus.FAILURE
|
171
|
-
trace = "\n".join(
|
171
|
+
trace = "\n".join(
|
172
|
+
traceback.format_exception(type(e), e, e.__traceback__)
|
173
|
+
)
|
172
174
|
_log.warning(
|
173
175
|
f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
|
174
176
|
f"{trace}"
|
@@ -38,7 +38,7 @@ _log = logging.getLogger(__name__)
|
|
38
38
|
|
39
39
|
|
40
40
|
class StandardPdfPipeline(PaginatedPipeline):
|
41
|
-
_layout_model_path = "model_artifacts/layout
|
41
|
+
_layout_model_path = "model_artifacts/layout"
|
42
42
|
_table_model_path = "model_artifacts/tableformer"
|
43
43
|
|
44
44
|
def __init__(self, pipeline_options: PdfPipelineOptions):
|
@@ -75,7 +75,8 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
75
75
|
# Layout model
|
76
76
|
LayoutModel(
|
77
77
|
artifacts_path=self.artifacts_path
|
78
|
-
/ StandardPdfPipeline._layout_model_path
|
78
|
+
/ StandardPdfPipeline._layout_model_path,
|
79
|
+
accelerator_options=pipeline_options.accelerator_options,
|
79
80
|
),
|
80
81
|
# Table structure model
|
81
82
|
TableStructureModel(
|
@@ -83,6 +84,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
83
84
|
artifacts_path=self.artifacts_path
|
84
85
|
/ StandardPdfPipeline._table_model_path,
|
85
86
|
options=pipeline_options.table_structure_options,
|
87
|
+
accelerator_options=pipeline_options.accelerator_options,
|
86
88
|
),
|
87
89
|
# Page assemble
|
88
90
|
PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
|
@@ -104,7 +106,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
104
106
|
repo_id="ds4sd/docling-models",
|
105
107
|
force_download=force,
|
106
108
|
local_dir=local_dir,
|
107
|
-
revision="v2.0
|
109
|
+
revision="v2.1.0",
|
108
110
|
)
|
109
111
|
|
110
112
|
return Path(download_path)
|
@@ -114,6 +116,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
114
116
|
return EasyOcrModel(
|
115
117
|
enabled=self.pipeline_options.do_ocr,
|
116
118
|
options=self.pipeline_options.ocr_options,
|
119
|
+
accelerator_options=self.pipeline_options.accelerator_options,
|
117
120
|
)
|
118
121
|
elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
|
119
122
|
return TesseractOcrCliModel(
|
@@ -129,6 +132,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
129
132
|
return RapidOcrModel(
|
130
133
|
enabled=self.pipeline_options.do_ocr,
|
131
134
|
options=self.pipeline_options.ocr_options,
|
135
|
+
accelerator_options=self.pipeline_options.accelerator_options,
|
132
136
|
)
|
133
137
|
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
|
134
138
|
if "darwin" != sys.platform:
|
@@ -0,0 +1,42 @@
|
|
1
|
+
import logging
|
2
|
+
|
3
|
+
import torch
|
4
|
+
|
5
|
+
from docling.datamodel.pipeline_options import AcceleratorDevice
|
6
|
+
|
7
|
+
_log = logging.getLogger(__name__)
|
8
|
+
|
9
|
+
|
10
|
+
def decide_device(accelerator_device: AcceleratorDevice) -> str:
    """Pick the torch device string for the requested accelerator.

    Resolution rules:
      * ``AUTO``: use CUDA when available, otherwise MPS when available,
        otherwise CPU.
      * ``CUDA`` / ``MPS``: use the requested backend only when it is built
        and available; otherwise log a warning and fall back to CPU.
      * Any other value resolves to CPU.

    Returns the device as a string: ``"cpu"``, ``"mps"`` or ``"cuda:0"``.
    The chosen device is logged at INFO level before returning.
    """
    cuda_index = 0

    # Both backends are probed up front, regardless of which one was asked for.
    cuda_ok = torch.backends.cuda.is_built() and torch.cuda.is_available()
    mps_ok = torch.backends.mps.is_built() and torch.backends.mps.is_available()

    # CPU is the answer unless one of the branches below upgrades it.
    resolved = "cpu"

    if accelerator_device == AcceleratorDevice.AUTO:
        if cuda_ok:
            resolved = f"cuda:{cuda_index}"
        elif mps_ok:
            resolved = "mps"
    elif accelerator_device == AcceleratorDevice.CUDA:
        if not cuda_ok:
            _log.warning("CUDA is not available in the system. Fall back to 'CPU'")
        else:
            resolved = f"cuda:{cuda_index}"
    elif accelerator_device == AcceleratorDevice.MPS:
        if not mps_ok:
            _log.warning("MPS is not available in the system. Fall back to 'CPU'")
        else:
            resolved = "mps"

    _log.info("Accelerator device: '%s'", resolved)
    return resolved
|
docling/utils/glm_utils.py
CHANGED
@@ -169,6 +169,8 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
|
169
169
|
current_list = None
|
170
170
|
text = ""
|
171
171
|
caption_refs = []
|
172
|
+
item_label = DocItemLabel(pelem["name"])
|
173
|
+
|
172
174
|
for caption in obj["captions"]:
|
173
175
|
text += caption["text"]
|
174
176
|
|
@@ -254,12 +256,18 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
|
254
256
|
),
|
255
257
|
)
|
256
258
|
|
257
|
-
tbl = doc.add_table(data=tbl_data, prov=prov)
|
259
|
+
tbl = doc.add_table(data=tbl_data, prov=prov, label=item_label)
|
258
260
|
tbl.captions.extend(caption_refs)
|
259
261
|
|
260
|
-
elif ptype in [
|
262
|
+
elif ptype in [DocItemLabel.FORM.value, DocItemLabel.KEY_VALUE_REGION.value]:
|
261
263
|
label = DocItemLabel(ptype)
|
262
|
-
|
264
|
+
group_label = GroupLabel.UNSPECIFIED
|
265
|
+
if label == DocItemLabel.FORM:
|
266
|
+
group_label = GroupLabel.FORM_AREA
|
267
|
+
elif label == DocItemLabel.KEY_VALUE_REGION:
|
268
|
+
group_label = GroupLabel.KEY_VALUE_AREA
|
269
|
+
|
270
|
+
container_el = doc.add_group(label=group_label)
|
263
271
|
|
264
272
|
_add_child_elements(container_el, doc, obj, pelem)
|
265
273
|
|