docling 2.26.0__py3-none-any.whl → 2.27.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +1 -1
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +21 -13
- docling/backend/docling_parse_v2_backend.py +20 -12
- docling/backend/docling_parse_v4_backend.py +185 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +271 -0
- docling/backend/docx/latex/omml.py +453 -0
- docling/backend/html_backend.py +7 -7
- docling/backend/md_backend.py +1 -1
- docling/backend/msexcel_backend.py +2 -45
- docling/backend/mspowerpoint_backend.py +1 -1
- docling/backend/msword_backend.py +65 -3
- docling/backend/pdf_backend.py +7 -2
- docling/backend/pypdfium2_backend.py +52 -30
- docling/backend/xml/uspto_backend.py +1 -1
- docling/cli/main.py +60 -21
- docling/cli/models.py +1 -1
- docling/datamodel/base_models.py +8 -10
- docling/datamodel/pipeline_options.py +26 -30
- docling/document_converter.py +5 -5
- docling/models/base_model.py +9 -1
- docling/models/base_ocr_model.py +27 -16
- docling/models/easyocr_model.py +28 -13
- docling/models/factories/__init__.py +27 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/ocr_mac_model.py +39 -11
- docling/models/page_preprocessing_model.py +4 -0
- docling/models/picture_description_api_model.py +20 -3
- docling/models/picture_description_base_model.py +19 -3
- docling/models/picture_description_vlm_model.py +14 -2
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +28 -0
- docling/models/rapid_ocr_model.py +34 -13
- docling/models/table_structure_model.py +13 -4
- docling/models/tesseract_ocr_cli_model.py +40 -15
- docling/models/tesseract_ocr_model.py +37 -12
- docling/pipeline/standard_pdf_pipeline.py +25 -78
- docling/utils/export.py +8 -6
- docling/utils/layout_postprocessor.py +26 -23
- docling/utils/visualization.py +1 -1
- {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/METADATA +48 -19
- docling-2.27.0.dist-info/RECORD +83 -0
- {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/entry_points.txt +3 -0
- docling-2.26.0.dist-info/RECORD +0 -72
- {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/LICENSE +0 -0
- {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/WHEEL +0 -0
@@ -1,14 +1,17 @@
|
|
1
1
|
import logging
|
2
|
-
from
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Iterable, Optional, Type
|
3
4
|
|
4
5
|
import numpy
|
5
6
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
7
|
+
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
6
8
|
|
7
|
-
from docling.datamodel.base_models import
|
9
|
+
from docling.datamodel.base_models import Page
|
8
10
|
from docling.datamodel.document import ConversionResult
|
9
11
|
from docling.datamodel.pipeline_options import (
|
10
12
|
AcceleratorDevice,
|
11
13
|
AcceleratorOptions,
|
14
|
+
OcrOptions,
|
12
15
|
RapidOcrOptions,
|
13
16
|
)
|
14
17
|
from docling.datamodel.settings import settings
|
@@ -23,10 +26,16 @@ class RapidOcrModel(BaseOcrModel):
|
|
23
26
|
def __init__(
|
24
27
|
self,
|
25
28
|
enabled: bool,
|
29
|
+
artifacts_path: Optional[Path],
|
26
30
|
options: RapidOcrOptions,
|
27
31
|
accelerator_options: AcceleratorOptions,
|
28
32
|
):
|
29
|
-
super().__init__(
|
33
|
+
super().__init__(
|
34
|
+
enabled=enabled,
|
35
|
+
artifacts_path=artifacts_path,
|
36
|
+
options=options,
|
37
|
+
accelerator_options=accelerator_options,
|
38
|
+
)
|
30
39
|
self.options: RapidOcrOptions
|
31
40
|
|
32
41
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
@@ -100,18 +109,26 @@ class RapidOcrModel(BaseOcrModel):
|
|
100
109
|
|
101
110
|
if result is not None:
|
102
111
|
cells = [
|
103
|
-
|
104
|
-
|
112
|
+
TextCell(
|
113
|
+
index=ix,
|
105
114
|
text=line[1],
|
115
|
+
orig=line[1],
|
106
116
|
confidence=line[2],
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
(
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
117
|
+
from_ocr=True,
|
118
|
+
rect=BoundingRectangle.from_bounding_box(
|
119
|
+
BoundingBox.from_tuple(
|
120
|
+
coord=(
|
121
|
+
(line[0][0][0] / self.scale)
|
122
|
+
+ ocr_rect.l,
|
123
|
+
(line[0][0][1] / self.scale)
|
124
|
+
+ ocr_rect.t,
|
125
|
+
(line[0][2][0] / self.scale)
|
126
|
+
+ ocr_rect.l,
|
127
|
+
(line[0][2][1] / self.scale)
|
128
|
+
+ ocr_rect.t,
|
129
|
+
),
|
130
|
+
origin=CoordOrigin.TOPLEFT,
|
131
|
+
)
|
115
132
|
),
|
116
133
|
)
|
117
134
|
for ix, line in enumerate(result)
|
@@ -126,3 +143,7 @@ class RapidOcrModel(BaseOcrModel):
|
|
126
143
|
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
127
144
|
|
128
145
|
yield page
|
146
|
+
|
147
|
+
@classmethod
|
148
|
+
def get_options_type(cls) -> Type[OcrOptions]:
|
149
|
+
return RapidOcrOptions
|
@@ -5,6 +5,7 @@ from typing import Iterable, Optional, Union
|
|
5
5
|
|
6
6
|
import numpy
|
7
7
|
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
8
|
+
from docling_core.types.doc.page import BoundingRectangle
|
8
9
|
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
9
10
|
from PIL import ImageDraw
|
10
11
|
|
@@ -129,7 +130,7 @@ class TableStructureModel(BasePageModel):
|
|
129
130
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
130
131
|
|
131
132
|
for cell in table_element.cluster.cells:
|
132
|
-
x0, y0, x1, y1 = cell.
|
133
|
+
x0, y0, x1, y1 = cell.rect.to_bounding_box().as_tuple()
|
133
134
|
x0 *= scale_x
|
134
135
|
x1 *= scale_x
|
135
136
|
y0 *= scale_x
|
@@ -223,11 +224,19 @@ class TableStructureModel(BasePageModel):
|
|
223
224
|
# Only allow non empty stings (spaces) into the cells of a table
|
224
225
|
if len(c.text.strip()) > 0:
|
225
226
|
new_cell = copy.deepcopy(c)
|
226
|
-
new_cell.
|
227
|
-
|
227
|
+
new_cell.rect = BoundingRectangle.from_bounding_box(
|
228
|
+
new_cell.rect.to_bounding_box().scaled(
|
229
|
+
scale=self.scale
|
230
|
+
)
|
228
231
|
)
|
229
232
|
|
230
|
-
tokens.append(
|
233
|
+
tokens.append(
|
234
|
+
{
|
235
|
+
"id": new_cell.index,
|
236
|
+
"text": new_cell.text,
|
237
|
+
"bbox": new_cell.rect.to_bounding_box().model_dump(),
|
238
|
+
}
|
239
|
+
)
|
231
240
|
page_input["tokens"] = tokens
|
232
241
|
|
233
242
|
tf_output = self.tf_predictor.multi_table_predict(
|
@@ -3,15 +3,21 @@ import io
|
|
3
3
|
import logging
|
4
4
|
import os
|
5
5
|
import tempfile
|
6
|
+
from pathlib import Path
|
6
7
|
from subprocess import DEVNULL, PIPE, Popen
|
7
|
-
from typing import Iterable, List, Optional, Tuple
|
8
|
+
from typing import Iterable, List, Optional, Tuple, Type
|
8
9
|
|
9
10
|
import pandas as pd
|
10
11
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
12
|
+
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
11
13
|
|
12
|
-
from docling.datamodel.base_models import
|
14
|
+
from docling.datamodel.base_models import Page
|
13
15
|
from docling.datamodel.document import ConversionResult
|
14
|
-
from docling.datamodel.pipeline_options import
|
16
|
+
from docling.datamodel.pipeline_options import (
|
17
|
+
AcceleratorOptions,
|
18
|
+
OcrOptions,
|
19
|
+
TesseractCliOcrOptions,
|
20
|
+
)
|
15
21
|
from docling.datamodel.settings import settings
|
16
22
|
from docling.models.base_ocr_model import BaseOcrModel
|
17
23
|
from docling.utils.ocr_utils import map_tesseract_script
|
@@ -21,8 +27,19 @@ _log = logging.getLogger(__name__)
|
|
21
27
|
|
22
28
|
|
23
29
|
class TesseractOcrCliModel(BaseOcrModel):
|
24
|
-
def __init__(
|
25
|
-
|
30
|
+
def __init__(
|
31
|
+
self,
|
32
|
+
enabled: bool,
|
33
|
+
artifacts_path: Optional[Path],
|
34
|
+
options: TesseractCliOcrOptions,
|
35
|
+
accelerator_options: AcceleratorOptions,
|
36
|
+
):
|
37
|
+
super().__init__(
|
38
|
+
enabled=enabled,
|
39
|
+
artifacts_path=artifacts_path,
|
40
|
+
options=options,
|
41
|
+
accelerator_options=accelerator_options,
|
42
|
+
)
|
26
43
|
self.options: TesseractCliOcrOptions
|
27
44
|
|
28
45
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
@@ -228,18 +245,22 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
228
245
|
t = b + h
|
229
246
|
r = l + w
|
230
247
|
|
231
|
-
cell =
|
232
|
-
|
248
|
+
cell = TextCell(
|
249
|
+
index=ix,
|
233
250
|
text=text,
|
251
|
+
orig=text,
|
252
|
+
from_ocr=True,
|
234
253
|
confidence=conf / 100.0,
|
235
|
-
|
236
|
-
|
237
|
-
(
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
254
|
+
rect=BoundingRectangle.from_bounding_box(
|
255
|
+
BoundingBox.from_tuple(
|
256
|
+
coord=(
|
257
|
+
(l / self.scale) + ocr_rect.l,
|
258
|
+
(b / self.scale) + ocr_rect.t,
|
259
|
+
(r / self.scale) + ocr_rect.l,
|
260
|
+
(t / self.scale) + ocr_rect.t,
|
261
|
+
),
|
262
|
+
origin=CoordOrigin.TOPLEFT,
|
263
|
+
)
|
243
264
|
),
|
244
265
|
)
|
245
266
|
all_ocr_cells.append(cell)
|
@@ -252,3 +273,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
252
273
|
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
253
274
|
|
254
275
|
yield page
|
276
|
+
|
277
|
+
@classmethod
|
278
|
+
def get_options_type(cls) -> Type[OcrOptions]:
|
279
|
+
return TesseractCliOcrOptions
|
@@ -1,11 +1,17 @@
|
|
1
1
|
import logging
|
2
|
-
from
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Iterable, Optional, Type
|
3
4
|
|
4
5
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
6
|
+
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
5
7
|
|
6
|
-
from docling.datamodel.base_models import
|
8
|
+
from docling.datamodel.base_models import Page
|
7
9
|
from docling.datamodel.document import ConversionResult
|
8
|
-
from docling.datamodel.pipeline_options import
|
10
|
+
from docling.datamodel.pipeline_options import (
|
11
|
+
AcceleratorOptions,
|
12
|
+
OcrOptions,
|
13
|
+
TesseractOcrOptions,
|
14
|
+
)
|
9
15
|
from docling.datamodel.settings import settings
|
10
16
|
from docling.models.base_ocr_model import BaseOcrModel
|
11
17
|
from docling.utils.ocr_utils import map_tesseract_script
|
@@ -15,8 +21,19 @@ _log = logging.getLogger(__name__)
|
|
15
21
|
|
16
22
|
|
17
23
|
class TesseractOcrModel(BaseOcrModel):
|
18
|
-
def __init__(
|
19
|
-
|
24
|
+
def __init__(
|
25
|
+
self,
|
26
|
+
enabled: bool,
|
27
|
+
artifacts_path: Optional[Path],
|
28
|
+
options: TesseractOcrOptions,
|
29
|
+
accelerator_options: AcceleratorOptions,
|
30
|
+
):
|
31
|
+
super().__init__(
|
32
|
+
enabled=enabled,
|
33
|
+
artifacts_path=artifacts_path,
|
34
|
+
options=options,
|
35
|
+
accelerator_options=accelerator_options,
|
36
|
+
)
|
20
37
|
self.options: TesseractOcrOptions
|
21
38
|
|
22
39
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
@@ -31,14 +48,14 @@ class TesseractOcrModel(BaseOcrModel):
|
|
31
48
|
"Note that tesserocr might have to be manually compiled for working with "
|
32
49
|
"your Tesseract installation. The Docling documentation provides examples for it. "
|
33
50
|
"Alternatively, Docling has support for other OCR engines. See the documentation: "
|
34
|
-
"https://
|
51
|
+
"https://docling-project.github.io/docling/installation/"
|
35
52
|
)
|
36
53
|
missing_langs_errmsg = (
|
37
54
|
"tesserocr is not correctly configured. No language models have been detected. "
|
38
55
|
"Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
|
39
56
|
"You can find more information how to setup other OCR engines in Docling "
|
40
57
|
"documentation: "
|
41
|
-
"https://
|
58
|
+
"https://docling-project.github.io/docling/installation/"
|
42
59
|
)
|
43
60
|
|
44
61
|
try:
|
@@ -173,13 +190,17 @@ class TesseractOcrModel(BaseOcrModel):
|
|
173
190
|
top = (box["y"] + box["h"]) / self.scale
|
174
191
|
|
175
192
|
cells.append(
|
176
|
-
|
177
|
-
|
193
|
+
TextCell(
|
194
|
+
index=ix,
|
178
195
|
text=text,
|
196
|
+
orig=text,
|
197
|
+
from_ocr=True,
|
179
198
|
confidence=confidence,
|
180
|
-
|
181
|
-
|
182
|
-
|
199
|
+
rect=BoundingRectangle.from_bounding_box(
|
200
|
+
BoundingBox.from_tuple(
|
201
|
+
coord=(left, top, right, bottom),
|
202
|
+
origin=CoordOrigin.TOPLEFT,
|
203
|
+
),
|
183
204
|
),
|
184
205
|
)
|
185
206
|
)
|
@@ -195,3 +216,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
195
216
|
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
196
217
|
|
197
218
|
yield page
|
219
|
+
|
220
|
+
@classmethod
|
221
|
+
def get_options_type(cls) -> Type[OcrOptions]:
|
222
|
+
return TesseractOcrOptions
|
@@ -10,16 +10,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
10
10
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
11
11
|
from docling.datamodel.base_models import AssembledUnit, Page
|
12
12
|
from docling.datamodel.document import ConversionResult
|
13
|
-
from docling.datamodel.pipeline_options import
|
14
|
-
EasyOcrOptions,
|
15
|
-
OcrMacOptions,
|
16
|
-
PdfPipelineOptions,
|
17
|
-
PictureDescriptionApiOptions,
|
18
|
-
PictureDescriptionVlmOptions,
|
19
|
-
RapidOcrOptions,
|
20
|
-
TesseractCliOcrOptions,
|
21
|
-
TesseractOcrOptions,
|
22
|
-
)
|
13
|
+
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
23
14
|
from docling.datamodel.settings import settings
|
24
15
|
from docling.models.base_ocr_model import BaseOcrModel
|
25
16
|
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
|
@@ -27,22 +18,16 @@ from docling.models.document_picture_classifier import (
|
|
27
18
|
DocumentPictureClassifier,
|
28
19
|
DocumentPictureClassifierOptions,
|
29
20
|
)
|
30
|
-
from docling.models.
|
21
|
+
from docling.models.factories import get_ocr_factory, get_picture_description_factory
|
31
22
|
from docling.models.layout_model import LayoutModel
|
32
|
-
from docling.models.ocr_mac_model import OcrMacModel
|
33
23
|
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
34
24
|
from docling.models.page_preprocessing_model import (
|
35
25
|
PagePreprocessingModel,
|
36
26
|
PagePreprocessingOptions,
|
37
27
|
)
|
38
|
-
from docling.models.picture_description_api_model import PictureDescriptionApiModel
|
39
28
|
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
40
|
-
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
41
|
-
from docling.models.rapid_ocr_model import RapidOcrModel
|
42
29
|
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
|
43
30
|
from docling.models.table_structure_model import TableStructureModel
|
44
|
-
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
45
|
-
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
46
31
|
from docling.pipeline.base_pipeline import PaginatedPipeline
|
47
32
|
from docling.utils.model_downloader import download_models
|
48
33
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
@@ -78,16 +63,14 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
78
63
|
|
79
64
|
self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
|
80
65
|
|
81
|
-
|
82
|
-
raise RuntimeError(
|
83
|
-
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
84
|
-
)
|
66
|
+
ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
|
85
67
|
|
86
68
|
self.build_pipe = [
|
87
69
|
# Pre-processing
|
88
70
|
PagePreprocessingModel(
|
89
71
|
options=PagePreprocessingOptions(
|
90
|
-
images_scale=pipeline_options.images_scale
|
72
|
+
images_scale=pipeline_options.images_scale,
|
73
|
+
create_parsed_page=pipeline_options.generate_parsed_pages,
|
91
74
|
)
|
92
75
|
),
|
93
76
|
# OCR
|
@@ -163,66 +146,30 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
163
146
|
output_dir = download_models(output_dir=local_dir, force=force, progress=False)
|
164
147
|
return output_dir
|
165
148
|
|
166
|
-
def get_ocr_model(
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
|
177
|
-
return TesseractOcrCliModel(
|
178
|
-
enabled=self.pipeline_options.do_ocr,
|
179
|
-
options=self.pipeline_options.ocr_options,
|
180
|
-
)
|
181
|
-
elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions):
|
182
|
-
return TesseractOcrModel(
|
183
|
-
enabled=self.pipeline_options.do_ocr,
|
184
|
-
options=self.pipeline_options.ocr_options,
|
185
|
-
)
|
186
|
-
elif isinstance(self.pipeline_options.ocr_options, RapidOcrOptions):
|
187
|
-
return RapidOcrModel(
|
188
|
-
enabled=self.pipeline_options.do_ocr,
|
189
|
-
options=self.pipeline_options.ocr_options,
|
190
|
-
accelerator_options=self.pipeline_options.accelerator_options,
|
191
|
-
)
|
192
|
-
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
|
193
|
-
if "darwin" != sys.platform:
|
194
|
-
raise RuntimeError(
|
195
|
-
f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
|
196
|
-
)
|
197
|
-
return OcrMacModel(
|
198
|
-
enabled=self.pipeline_options.do_ocr,
|
199
|
-
options=self.pipeline_options.ocr_options,
|
200
|
-
)
|
201
|
-
return None
|
149
|
+
def get_ocr_model(self, artifacts_path: Optional[Path] = None) -> BaseOcrModel:
|
150
|
+
factory = get_ocr_factory(
|
151
|
+
allow_external_plugins=self.pipeline_options.allow_external_plugins
|
152
|
+
)
|
153
|
+
return factory.create_instance(
|
154
|
+
options=self.pipeline_options.ocr_options,
|
155
|
+
enabled=self.pipeline_options.do_ocr,
|
156
|
+
artifacts_path=artifacts_path,
|
157
|
+
accelerator_options=self.pipeline_options.accelerator_options,
|
158
|
+
)
|
202
159
|
|
203
160
|
def get_picture_description_model(
|
204
161
|
self, artifacts_path: Optional[Path] = None
|
205
162
|
) -> Optional[PictureDescriptionBaseModel]:
|
206
|
-
|
207
|
-
self.pipeline_options.
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
self.pipeline_options.picture_description_options,
|
217
|
-
PictureDescriptionVlmOptions,
|
218
|
-
):
|
219
|
-
return PictureDescriptionVlmModel(
|
220
|
-
enabled=self.pipeline_options.do_picture_description,
|
221
|
-
artifacts_path=artifacts_path,
|
222
|
-
options=self.pipeline_options.picture_description_options,
|
223
|
-
accelerator_options=self.pipeline_options.accelerator_options,
|
224
|
-
)
|
225
|
-
return None
|
163
|
+
factory = get_picture_description_factory(
|
164
|
+
allow_external_plugins=self.pipeline_options.allow_external_plugins
|
165
|
+
)
|
166
|
+
return factory.create_instance(
|
167
|
+
options=self.pipeline_options.picture_description_options,
|
168
|
+
enabled=self.pipeline_options.do_picture_description,
|
169
|
+
enable_remote_services=self.pipeline_options.enable_remote_services,
|
170
|
+
artifacts_path=artifacts_path,
|
171
|
+
accelerator_options=self.pipeline_options.accelerator_options,
|
172
|
+
)
|
226
173
|
|
227
174
|
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
228
175
|
with TimeRecorder(conv_res, "page_init"):
|
docling/utils/export.py
CHANGED
@@ -2,9 +2,9 @@ import logging
|
|
2
2
|
from typing import Any, Dict, Iterable, List, Tuple, Union
|
3
3
|
|
4
4
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
5
|
+
from docling_core.types.doc.page import TextCell
|
5
6
|
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
6
7
|
|
7
|
-
from docling.datamodel.base_models import OcrCell
|
8
8
|
from docling.datamodel.document import ConversionResult, Page
|
9
9
|
|
10
10
|
_log = logging.getLogger(__name__)
|
@@ -86,11 +86,13 @@ def generate_multimodal_pages(
|
|
86
86
|
if page.size is None:
|
87
87
|
return cells
|
88
88
|
for cell in page.cells:
|
89
|
-
new_bbox =
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
89
|
+
new_bbox = (
|
90
|
+
cell.rect.to_bounding_box()
|
91
|
+
.to_top_left_origin(page_height=page.size.height)
|
92
|
+
.normalized(page_size=page.size)
|
93
|
+
)
|
94
|
+
is_ocr = cell.from_ocr
|
95
|
+
ocr_confidence = cell.confidence
|
94
96
|
cells.append(
|
95
97
|
{
|
96
98
|
"text": cell.text,
|
@@ -5,9 +5,10 @@ from collections import defaultdict
|
|
5
5
|
from typing import Dict, List, Set, Tuple
|
6
6
|
|
7
7
|
from docling_core.types.doc import DocItemLabel, Size
|
8
|
+
from docling_core.types.doc.page import TextCell
|
8
9
|
from rtree import index
|
9
10
|
|
10
|
-
from docling.datamodel.base_models import BoundingBox,
|
11
|
+
from docling.datamodel.base_models import BoundingBox, Cluster
|
11
12
|
|
12
13
|
_log = logging.getLogger(__name__)
|
13
14
|
|
@@ -198,7 +199,7 @@ class LayoutPostprocessor:
|
|
198
199
|
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
199
200
|
}
|
200
201
|
|
201
|
-
def __init__(self, cells: List[
|
202
|
+
def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
|
202
203
|
"""Initialize processor with cells and clusters."""
|
203
204
|
"""Initialize processor with cells and spatial indices."""
|
204
205
|
self.cells = cells
|
@@ -218,7 +219,7 @@ class LayoutPostprocessor:
|
|
218
219
|
[c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
|
219
220
|
)
|
220
221
|
|
221
|
-
def postprocess(self) -> Tuple[List[Cluster], List[
|
222
|
+
def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
|
222
223
|
"""Main processing pipeline."""
|
223
224
|
self.regular_clusters = self._process_regular_clusters()
|
224
225
|
self.special_clusters = self._process_special_clusters()
|
@@ -271,15 +272,13 @@ class LayoutPostprocessor:
|
|
271
272
|
next_id = max((c.id for c in self.all_clusters), default=0) + 1
|
272
273
|
orphan_clusters = []
|
273
274
|
for i, cell in enumerate(unassigned):
|
274
|
-
conf =
|
275
|
-
if isinstance(cell, OcrCell):
|
276
|
-
conf = cell.confidence
|
275
|
+
conf = cell.confidence
|
277
276
|
|
278
277
|
orphan_clusters.append(
|
279
278
|
Cluster(
|
280
279
|
id=next_id + i,
|
281
280
|
label=DocItemLabel.TEXT,
|
282
|
-
bbox=cell.
|
281
|
+
bbox=cell.to_bounding_box(),
|
283
282
|
confidence=conf,
|
284
283
|
cells=[cell],
|
285
284
|
)
|
@@ -557,13 +556,13 @@ class LayoutPostprocessor:
|
|
557
556
|
|
558
557
|
return current_best if current_best else clusters[0]
|
559
558
|
|
560
|
-
def _deduplicate_cells(self, cells: List[
|
559
|
+
def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]:
|
561
560
|
"""Ensure each cell appears only once, maintaining order of first appearance."""
|
562
561
|
seen_ids = set()
|
563
562
|
unique_cells = []
|
564
563
|
for cell in cells:
|
565
|
-
if cell.
|
566
|
-
seen_ids.add(cell.
|
564
|
+
if cell.index not in seen_ids:
|
565
|
+
seen_ids.add(cell.index)
|
567
566
|
unique_cells.append(cell)
|
568
567
|
return unique_cells
|
569
568
|
|
@@ -582,11 +581,13 @@ class LayoutPostprocessor:
|
|
582
581
|
best_cluster = None
|
583
582
|
|
584
583
|
for cluster in clusters:
|
585
|
-
if cell.
|
584
|
+
if cell.rect.to_bounding_box().area() <= 0:
|
586
585
|
continue
|
587
586
|
|
588
|
-
overlap = cell.
|
589
|
-
|
587
|
+
overlap = cell.rect.to_bounding_box().intersection_area_with(
|
588
|
+
cluster.bbox
|
589
|
+
)
|
590
|
+
overlap_ratio = overlap / cell.rect.to_bounding_box().area()
|
590
591
|
|
591
592
|
if overlap_ratio > best_overlap:
|
592
593
|
best_overlap = overlap_ratio
|
@@ -601,11 +602,13 @@ class LayoutPostprocessor:
|
|
601
602
|
|
602
603
|
return clusters
|
603
604
|
|
604
|
-
def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[
|
605
|
+
def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]:
|
605
606
|
"""Find cells not assigned to any cluster."""
|
606
|
-
assigned = {cell.
|
607
|
+
assigned = {cell.index for cluster in clusters for cell in cluster.cells}
|
607
608
|
return [
|
608
|
-
cell
|
609
|
+
cell
|
610
|
+
for cell in self.cells
|
611
|
+
if cell.index not in assigned and cell.text.strip()
|
609
612
|
]
|
610
613
|
|
611
614
|
def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
|
@@ -615,10 +618,10 @@ class LayoutPostprocessor:
|
|
615
618
|
continue
|
616
619
|
|
617
620
|
cells_bbox = BoundingBox(
|
618
|
-
l=min(cell.
|
619
|
-
t=min(cell.
|
620
|
-
r=max(cell.
|
621
|
-
b=max(cell.
|
621
|
+
l=min(cell.rect.to_bounding_box().l for cell in cluster.cells),
|
622
|
+
t=min(cell.rect.to_bounding_box().t for cell in cluster.cells),
|
623
|
+
r=max(cell.rect.to_bounding_box().r for cell in cluster.cells),
|
624
|
+
b=max(cell.rect.to_bounding_box().b for cell in cluster.cells),
|
622
625
|
)
|
623
626
|
|
624
627
|
if cluster.label == DocItemLabel.TABLE:
|
@@ -634,9 +637,9 @@ class LayoutPostprocessor:
|
|
634
637
|
|
635
638
|
return clusters
|
636
639
|
|
637
|
-
def _sort_cells(self, cells: List[
|
640
|
+
def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]:
|
638
641
|
"""Sort cells in native reading order."""
|
639
|
-
return sorted(cells, key=lambda c: (c.
|
642
|
+
return sorted(cells, key=lambda c: (c.index))
|
640
643
|
|
641
644
|
def _sort_clusters(
|
642
645
|
self, clusters: List[Cluster], mode: str = "id"
|
@@ -647,7 +650,7 @@ class LayoutPostprocessor:
|
|
647
650
|
clusters,
|
648
651
|
key=lambda cluster: (
|
649
652
|
(
|
650
|
-
min(cell.
|
653
|
+
min(cell.index for cell in cluster.cells)
|
651
654
|
if cluster.cells
|
652
655
|
else sys.maxsize
|
653
656
|
),
|
docling/utils/visualization.py
CHANGED
@@ -25,7 +25,7 @@ def draw_clusters(
|
|
25
25
|
# Draw cells first (underneath)
|
26
26
|
cell_color = (0, 0, 0, 40) # Transparent black for cells
|
27
27
|
for tc in c.cells:
|
28
|
-
cx0, cy0, cx1, cy1 = tc.
|
28
|
+
cx0, cy0, cx1, cy1 = tc.rect.to_bounding_box().as_tuple()
|
29
29
|
cx0 *= scale_x
|
30
30
|
cx1 *= scale_x
|
31
31
|
cy0 *= scale_x
|