docling 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +4 -0
- docling/backend/docling_parse_backend.py +38 -4
- docling/backend/pypdfium2_backend.py +18 -2
- docling/datamodel/base_models.py +56 -17
- docling/datamodel/document.py +16 -1
- docling/document_converter.py +12 -8
- docling/models/base_ocr_model.py +124 -0
- docling/models/easyocr_model.py +39 -46
- docling/models/layout_model.py +3 -1
- docling/models/table_structure_model.py +4 -9
- docling/pipeline/base_model_pipeline.py +0 -1
- docling/pipeline/standard_model_pipeline.py +1 -3
- {docling-1.4.0.dist-info → docling-1.6.0.dist-info}/METADATA +13 -10
- docling-1.6.0.dist-info/RECORD +27 -0
- docling-1.4.0.dist-info/RECORD +0 -26
- {docling-1.4.0.dist-info → docling-1.6.0.dist-info}/LICENSE +0 -0
- {docling-1.4.0.dist-info → docling-1.6.0.dist-info}/WHEEL +0 -0
@@ -18,6 +18,10 @@ class PdfPageBackend(ABC):
|
|
18
18
|
def get_text_cells(self) -> Iterable["Cell"]:
|
19
19
|
pass
|
20
20
|
|
21
|
+
@abstractmethod
|
22
|
+
def get_bitmap_rects(self, scale: int = 1) -> Iterable["BoundingBox"]:
|
23
|
+
pass
|
24
|
+
|
21
25
|
@abstractmethod
|
22
26
|
def get_page_image(
|
23
27
|
self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
|
@@ -1,7 +1,9 @@
|
|
1
|
+
import logging
|
1
2
|
import random
|
3
|
+
import time
|
2
4
|
from io import BytesIO
|
3
5
|
from pathlib import Path
|
4
|
-
from typing import Iterable,
|
6
|
+
from typing import Iterable, Optional, Union
|
5
7
|
|
6
8
|
import pypdfium2 as pdfium
|
7
9
|
from docling_parse.docling_parse import pdf_parser
|
@@ -11,6 +13,8 @@ from pypdfium2 import PdfPage
|
|
11
13
|
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
12
14
|
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
13
15
|
|
16
|
+
_log = logging.getLogger(__name__)
|
17
|
+
|
14
18
|
|
15
19
|
class DoclingParsePageBackend(PdfPageBackend):
|
16
20
|
def __init__(self, page_obj: PdfPage, docling_page_obj):
|
@@ -39,7 +43,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
39
43
|
r=x1 * scale * page_size.width / parser_width,
|
40
44
|
t=y1 * scale * page_size.height / parser_height,
|
41
45
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
42
|
-
).to_top_left_origin(page_size.height * scale)
|
46
|
+
).to_top_left_origin(page_height=page_size.height * scale)
|
43
47
|
|
44
48
|
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
45
49
|
|
@@ -62,6 +66,12 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
62
66
|
for i in range(len(self._dpage["cells"])):
|
63
67
|
rect = self._dpage["cells"][i]["box"]["device"]
|
64
68
|
x0, y0, x1, y1 = rect
|
69
|
+
|
70
|
+
if x1 < x0:
|
71
|
+
x0, x1 = x1, x0
|
72
|
+
if y1 < y0:
|
73
|
+
y0, y1 = y1, y0
|
74
|
+
|
65
75
|
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
|
66
76
|
cells.append(
|
67
77
|
Cell(
|
@@ -80,7 +90,9 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
80
90
|
cell_counter += 1
|
81
91
|
|
82
92
|
def draw_clusters_and_cells():
|
83
|
-
image =
|
93
|
+
image = (
|
94
|
+
self.get_page_image()
|
95
|
+
) # make new image to avoid drawing on the saved ones
|
84
96
|
draw = ImageDraw.Draw(image)
|
85
97
|
for c in cells:
|
86
98
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
@@ -102,6 +114,20 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
102
114
|
|
103
115
|
return cells
|
104
116
|
|
117
|
+
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
|
118
|
+
AREA_THRESHOLD = 32 * 32
|
119
|
+
|
120
|
+
for i in range(len(self._dpage["images"])):
|
121
|
+
bitmap = self._dpage["images"][i]
|
122
|
+
cropbox = BoundingBox.from_tuple(
|
123
|
+
bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
|
124
|
+
).to_top_left_origin(self.get_size().height)
|
125
|
+
|
126
|
+
if cropbox.area() > AREA_THRESHOLD:
|
127
|
+
cropbox = cropbox.scaled(scale=scale)
|
128
|
+
|
129
|
+
yield cropbox
|
130
|
+
|
105
131
|
def get_page_image(
|
106
132
|
self, scale: int = 1, cropbox: Optional[BoundingBox] = None
|
107
133
|
) -> Image.Image:
|
@@ -151,15 +177,23 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
|
|
151
177
|
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
152
178
|
# Parsing cells with docling_parser call
|
153
179
|
parser = pdf_parser()
|
180
|
+
|
181
|
+
start_pb_time = time.time()
|
182
|
+
|
154
183
|
if isinstance(path_or_stream, BytesIO):
|
155
184
|
self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
|
156
185
|
else:
|
157
186
|
self._parser_doc = parser.find_cells(str(path_or_stream))
|
158
187
|
|
188
|
+
end_pb_time = time.time() - start_pb_time
|
189
|
+
_log.info(
|
190
|
+
f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
|
191
|
+
)
|
192
|
+
|
159
193
|
def page_count(self) -> int:
|
160
194
|
return len(self._parser_doc["pages"])
|
161
195
|
|
162
|
-
def load_page(self, page_no: int) ->
|
196
|
+
def load_page(self, page_no: int) -> DoclingParsePageBackend:
|
163
197
|
return DoclingParsePageBackend(
|
164
198
|
self._pdoc[page_no], self._parser_doc["pages"][page_no]
|
165
199
|
)
|
@@ -4,6 +4,7 @@ from pathlib import Path
|
|
4
4
|
from typing import Iterable, List, Optional, Union
|
5
5
|
|
6
6
|
import pypdfium2 as pdfium
|
7
|
+
import pypdfium2.raw as pdfium_c
|
7
8
|
from PIL import Image, ImageDraw
|
8
9
|
from pypdfium2 import PdfPage
|
9
10
|
|
@@ -17,6 +18,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
17
18
|
self._ppage = page_obj
|
18
19
|
self.text_page = None
|
19
20
|
|
21
|
+
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
|
22
|
+
AREA_THRESHOLD = 32 * 32
|
23
|
+
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
24
|
+
pos = obj.get_pos()
|
25
|
+
cropbox = BoundingBox.from_tuple(
|
26
|
+
pos, origin=CoordOrigin.BOTTOMLEFT
|
27
|
+
).to_top_left_origin(page_height=self.get_size().height)
|
28
|
+
|
29
|
+
if cropbox.area() > AREA_THRESHOLD:
|
30
|
+
cropbox = cropbox.scaled(scale=scale)
|
31
|
+
|
32
|
+
yield cropbox
|
33
|
+
|
20
34
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
21
35
|
if not self.text_page:
|
22
36
|
self.text_page = self._ppage.get_textpage()
|
@@ -134,7 +148,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
134
148
|
return merged_cells
|
135
149
|
|
136
150
|
def draw_clusters_and_cells():
|
137
|
-
image =
|
151
|
+
image = (
|
152
|
+
self.get_page_image()
|
153
|
+
) # make new image to avoid drawing on the saved ones
|
138
154
|
draw = ImageDraw.Draw(image)
|
139
155
|
for c in cells:
|
140
156
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
@@ -206,7 +222,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
|
206
222
|
def page_count(self) -> int:
|
207
223
|
return len(self._pdoc)
|
208
224
|
|
209
|
-
def load_page(self, page_no: int) ->
|
225
|
+
def load_page(self, page_no: int) -> PyPdfiumPageBackend:
|
210
226
|
return PyPdfiumPageBackend(self._pdoc[page_no])
|
211
227
|
|
212
228
|
def is_valid(self) -> bool:
|
docling/datamodel/base_models.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
import copy
|
2
|
+
import warnings
|
2
3
|
from enum import Enum, auto
|
3
4
|
from io import BytesIO
|
4
|
-
from typing import Any, Dict, List, Optional, Tuple, Union
|
5
|
+
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
|
5
6
|
|
6
7
|
from PIL.Image import Image
|
7
|
-
from pydantic import BaseModel, ConfigDict, model_validator
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
9
|
+
from typing_extensions import Self
|
8
10
|
|
9
11
|
from docling.backend.abstract_backend import PdfPageBackend
|
10
12
|
|
@@ -66,13 +68,21 @@ class BoundingBox(BaseModel):
|
|
66
68
|
@classmethod
|
67
69
|
def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
|
68
70
|
if origin == CoordOrigin.TOPLEFT:
|
69
|
-
|
70
|
-
|
71
|
-
|
71
|
+
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
|
72
|
+
if r < l:
|
73
|
+
l, r = r, l
|
74
|
+
if b < t:
|
75
|
+
b, t = t, b
|
76
|
+
|
77
|
+
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
72
78
|
elif origin == CoordOrigin.BOTTOMLEFT:
|
73
|
-
|
74
|
-
|
75
|
-
|
79
|
+
l, b, r, t = coord[0], coord[1], coord[2], coord[3]
|
80
|
+
if r < l:
|
81
|
+
l, r = r, l
|
82
|
+
if b > t:
|
83
|
+
b, t = t, b
|
84
|
+
|
85
|
+
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
76
86
|
|
77
87
|
def area(self) -> float:
|
78
88
|
return (self.r - self.l) * (self.b - self.t)
|
@@ -234,14 +244,30 @@ class Page(BaseModel):
|
|
234
244
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
235
245
|
|
236
246
|
page_no: int
|
237
|
-
page_hash: str = None
|
238
|
-
size: PageSize = None
|
239
|
-
image: Image = None
|
247
|
+
page_hash: Optional[str] = None
|
248
|
+
size: Optional[PageSize] = None
|
240
249
|
cells: List[Cell] = None
|
241
250
|
predictions: PagePredictions = PagePredictions()
|
242
|
-
assembled: AssembledUnit = None
|
251
|
+
assembled: Optional[AssembledUnit] = None
|
243
252
|
|
244
|
-
_backend: PdfPageBackend =
|
253
|
+
_backend: Optional[PdfPageBackend] = (
|
254
|
+
None # Internal PDF backend. By default it is cleared during assembling.
|
255
|
+
)
|
256
|
+
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
257
|
+
_image_cache: Dict[float, Image] = (
|
258
|
+
{}
|
259
|
+
) # Cache of images in different scales. By default it is cleared during assembling.
|
260
|
+
|
261
|
+
def get_image(self, scale: float = 1.0) -> Optional[Image]:
|
262
|
+
if self._backend is None:
|
263
|
+
return self._image_cache.get(scale, None)
|
264
|
+
if not scale in self._image_cache:
|
265
|
+
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
266
|
+
return self._image_cache[scale]
|
267
|
+
|
268
|
+
@property
|
269
|
+
def image(self) -> Optional[Image]:
|
270
|
+
return self.get_image(scale=self._default_image_scale)
|
245
271
|
|
246
272
|
|
247
273
|
class DocumentStream(BaseModel):
|
@@ -262,12 +288,25 @@ class TableStructureOptions(BaseModel):
|
|
262
288
|
|
263
289
|
class PipelineOptions(BaseModel):
|
264
290
|
do_table_structure: bool = True # True: perform table structure extraction
|
265
|
-
do_ocr: bool =
|
291
|
+
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
266
292
|
|
267
293
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
268
294
|
|
269
295
|
|
270
296
|
class AssembleOptions(BaseModel):
|
271
|
-
keep_page_images:
|
272
|
-
|
273
|
-
|
297
|
+
keep_page_images: Annotated[
|
298
|
+
bool,
|
299
|
+
Field(
|
300
|
+
deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
|
301
|
+
),
|
302
|
+
] = False # False: page images are removed in the assemble step
|
303
|
+
images_scale: Optional[float] = None # if set, the scale for generated images
|
304
|
+
|
305
|
+
@model_validator(mode="after")
|
306
|
+
def set_page_images_from_deprecated(self) -> Self:
|
307
|
+
with warnings.catch_warnings():
|
308
|
+
warnings.simplefilter("ignore", DeprecationWarning)
|
309
|
+
default_scale = 1.0
|
310
|
+
if self.keep_page_images and self.images_scale is None:
|
311
|
+
self.images_scale = default_scale
|
312
|
+
return self
|
docling/datamodel/document.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
import logging
|
2
2
|
from io import BytesIO
|
3
3
|
from pathlib import Path, PurePath
|
4
|
-
from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
|
4
|
+
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
5
5
|
|
6
6
|
from docling_core.types import BaseCell, BaseText
|
7
7
|
from docling_core.types import BoundingBox as DsBoundingBox
|
@@ -21,6 +21,7 @@ from docling.datamodel.base_models import (
|
|
21
21
|
DocumentStream,
|
22
22
|
FigureElement,
|
23
23
|
Page,
|
24
|
+
PageElement,
|
24
25
|
TableElement,
|
25
26
|
TextElement,
|
26
27
|
)
|
@@ -302,6 +303,20 @@ class ConvertedDocument(BaseModel):
|
|
302
303
|
else:
|
303
304
|
return ""
|
304
305
|
|
306
|
+
def render_element_images(
|
307
|
+
self, element_types: Tuple[PageElement] = (FigureElement,)
|
308
|
+
):
|
309
|
+
for element in self.assembled.elements:
|
310
|
+
if isinstance(element, element_types):
|
311
|
+
page_ix = element.page_no
|
312
|
+
scale = self.pages[page_ix]._default_image_scale
|
313
|
+
crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
|
314
|
+
page_height=self.pages[page_ix].size.height * scale
|
315
|
+
)
|
316
|
+
|
317
|
+
cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
|
318
|
+
yield element, cropped_im
|
319
|
+
|
305
320
|
|
306
321
|
class DocumentConversionInput(BaseModel):
|
307
322
|
|
docling/document_converter.py
CHANGED
@@ -35,8 +35,6 @@ _log = logging.getLogger(__name__)
|
|
35
35
|
|
36
36
|
|
37
37
|
class DocumentConverter:
|
38
|
-
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
|
39
|
-
_table_model_path = "model_artifacts/tableformer"
|
40
38
|
_default_download_filename = "file.pdf"
|
41
39
|
|
42
40
|
def __init__(
|
@@ -188,10 +186,8 @@ class DocumentConverter:
|
|
188
186
|
# Free up mem resources before moving on with next batch
|
189
187
|
|
190
188
|
# Remove page images (can be disabled)
|
191
|
-
if
|
192
|
-
assembled_page.
|
193
|
-
None # Comment this if you want to visualize page images
|
194
|
-
)
|
189
|
+
if self.assemble_options.images_scale is None:
|
190
|
+
assembled_page._image_cache = {}
|
195
191
|
|
196
192
|
# Unload backend
|
197
193
|
assembled_page._backend.unload()
|
@@ -231,7 +227,15 @@ class DocumentConverter:
|
|
231
227
|
|
232
228
|
# Generate the page image and store it in the page object
|
233
229
|
def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
|
234
|
-
|
230
|
+
# default scale
|
231
|
+
page.get_image(scale=1.0)
|
232
|
+
|
233
|
+
# user requested scales
|
234
|
+
if self.assemble_options.images_scale is not None:
|
235
|
+
page._default_image_scale = self.assemble_options.images_scale
|
236
|
+
page.get_image(
|
237
|
+
scale=self.assemble_options.images_scale
|
238
|
+
) # this will trigger storing the image in the internal cache
|
235
239
|
|
236
240
|
return page
|
237
241
|
|
@@ -247,7 +251,7 @@ class DocumentConverter:
|
|
247
251
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
248
252
|
image.show()
|
249
253
|
|
250
|
-
# draw_text_boxes(page.
|
254
|
+
# draw_text_boxes(page.get_image(scale=1.0), cells)
|
251
255
|
|
252
256
|
return page
|
253
257
|
|
@@ -0,0 +1,124 @@
|
|
1
|
+
import copy
|
2
|
+
import logging
|
3
|
+
from abc import abstractmethod
|
4
|
+
from typing import Iterable, List, Tuple
|
5
|
+
|
6
|
+
import numpy
|
7
|
+
import numpy as np
|
8
|
+
from PIL import Image, ImageDraw
|
9
|
+
from rtree import index
|
10
|
+
from scipy.ndimage import find_objects, label
|
11
|
+
|
12
|
+
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
13
|
+
|
14
|
+
_log = logging.getLogger(__name__)
|
15
|
+
|
16
|
+
|
17
|
+
class BaseOcrModel:
|
18
|
+
def __init__(self, config):
|
19
|
+
self.config = config
|
20
|
+
self.enabled = config["enabled"]
|
21
|
+
|
22
|
+
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
|
23
|
+
def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
|
24
|
+
BITMAP_COVERAGE_TRESHOLD = 0.75
|
25
|
+
|
26
|
+
def find_ocr_rects(size, bitmap_rects):
|
27
|
+
image = Image.new(
|
28
|
+
"1", (round(size.width), round(size.height))
|
29
|
+
) # '1' mode is binary
|
30
|
+
|
31
|
+
# Draw all bitmap rects into a binary image
|
32
|
+
draw = ImageDraw.Draw(image)
|
33
|
+
for rect in bitmap_rects:
|
34
|
+
x0, y0, x1, y1 = rect.as_tuple()
|
35
|
+
x0, y0, x1, y1 = round(x0), round(y0), round(x1), round(y1)
|
36
|
+
draw.rectangle([(x0, y0), (x1, y1)], fill=1)
|
37
|
+
|
38
|
+
np_image = np.array(image)
|
39
|
+
|
40
|
+
# Find the connected components
|
41
|
+
labeled_image, num_features = label(
|
42
|
+
np_image > 0
|
43
|
+
) # Label black (0 value) regions
|
44
|
+
|
45
|
+
# Find enclosing bounding boxes for each connected component.
|
46
|
+
slices = find_objects(labeled_image)
|
47
|
+
bounding_boxes = [
|
48
|
+
BoundingBox(
|
49
|
+
l=slc[1].start,
|
50
|
+
t=slc[0].start,
|
51
|
+
r=slc[1].stop - 1,
|
52
|
+
b=slc[0].stop - 1,
|
53
|
+
coord_origin=CoordOrigin.TOPLEFT,
|
54
|
+
)
|
55
|
+
for slc in slices
|
56
|
+
]
|
57
|
+
|
58
|
+
# Compute area fraction on page covered by bitmaps
|
59
|
+
area_frac = np.sum(np_image > 0) / (size.width * size.height)
|
60
|
+
|
61
|
+
return (area_frac, bounding_boxes) # fraction covered # boxes
|
62
|
+
|
63
|
+
bitmap_rects = page._backend.get_bitmap_rects()
|
64
|
+
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
|
65
|
+
|
66
|
+
# return full-page rectangle if sufficiently covered with bitmaps
|
67
|
+
if coverage > BITMAP_COVERAGE_TRESHOLD:
|
68
|
+
return [
|
69
|
+
BoundingBox(
|
70
|
+
l=0,
|
71
|
+
t=0,
|
72
|
+
r=page.size.width,
|
73
|
+
b=page.size.height,
|
74
|
+
coord_origin=CoordOrigin.TOPLEFT,
|
75
|
+
)
|
76
|
+
]
|
77
|
+
# return individual rectangles if the bitmap coverage is smaller
|
78
|
+
elif coverage < BITMAP_COVERAGE_TRESHOLD:
|
79
|
+
return ocr_rects
|
80
|
+
|
81
|
+
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
82
|
+
def filter_ocr_cells(self, ocr_cells, programmatic_cells):
|
83
|
+
# Create R-tree index for programmatic cells
|
84
|
+
p = index.Property()
|
85
|
+
p.dimension = 2
|
86
|
+
idx = index.Index(properties=p)
|
87
|
+
for i, cell in enumerate(programmatic_cells):
|
88
|
+
idx.insert(i, cell.bbox.as_tuple())
|
89
|
+
|
90
|
+
def is_overlapping_with_existing_cells(ocr_cell):
|
91
|
+
# Query the R-tree to get overlapping rectangles
|
92
|
+
possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
|
93
|
+
|
94
|
+
return (
|
95
|
+
len(possible_matches_index) > 0
|
96
|
+
) # this is a weak criterion but it works.
|
97
|
+
|
98
|
+
filtered_ocr_cells = [
|
99
|
+
rect for rect in ocr_cells if not is_overlapping_with_existing_cells(rect)
|
100
|
+
]
|
101
|
+
return filtered_ocr_cells
|
102
|
+
|
103
|
+
def draw_ocr_rects_and_cells(self, page, ocr_rects):
|
104
|
+
image = copy.deepcopy(page.image)
|
105
|
+
draw = ImageDraw.Draw(image, "RGBA")
|
106
|
+
|
107
|
+
# Draw OCR rectangles as yellow filled rect
|
108
|
+
for rect in ocr_rects:
|
109
|
+
x0, y0, x1, y1 = rect.as_tuple()
|
110
|
+
shade_color = (255, 255, 0, 40) # transparent yellow
|
111
|
+
draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
|
112
|
+
|
113
|
+
# Draw OCR and programmatic cells
|
114
|
+
for tc in page.cells:
|
115
|
+
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
116
|
+
color = "red"
|
117
|
+
if isinstance(tc, OcrCell):
|
118
|
+
color = "magenta"
|
119
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
|
120
|
+
image.show()
|
121
|
+
|
122
|
+
@abstractmethod
|
123
|
+
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
124
|
+
pass
|
docling/models/easyocr_model.py
CHANGED
@@ -1,20 +1,18 @@
|
|
1
|
-
import copy
|
2
1
|
import logging
|
3
|
-
import random
|
4
2
|
from typing import Iterable
|
5
3
|
|
6
4
|
import numpy
|
7
|
-
from PIL import ImageDraw
|
8
5
|
|
9
6
|
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
7
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
10
8
|
|
11
9
|
_log = logging.getLogger(__name__)
|
12
10
|
|
13
11
|
|
14
|
-
class EasyOcrModel:
|
12
|
+
class EasyOcrModel(BaseOcrModel):
|
15
13
|
def __init__(self, config):
|
16
|
-
|
17
|
-
|
14
|
+
super().__init__(config)
|
15
|
+
|
18
16
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
19
17
|
|
20
18
|
if self.enabled:
|
@@ -29,49 +27,44 @@ class EasyOcrModel:
|
|
29
27
|
return
|
30
28
|
|
31
29
|
for page in page_batch:
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
del im
|
39
|
-
|
40
|
-
cells = [
|
41
|
-
OcrCell(
|
42
|
-
id=ix,
|
43
|
-
text=line[1],
|
44
|
-
confidence=line[2],
|
45
|
-
bbox=BoundingBox.from_tuple(
|
46
|
-
coord=(
|
47
|
-
line[0][0][0] / self.scale,
|
48
|
-
line[0][0][1] / self.scale,
|
49
|
-
line[0][2][0] / self.scale,
|
50
|
-
line[0][2][1] / self.scale,
|
51
|
-
),
|
52
|
-
origin=CoordOrigin.TOPLEFT,
|
53
|
-
),
|
30
|
+
ocr_rects = self.get_ocr_rects(page)
|
31
|
+
|
32
|
+
all_ocr_cells = []
|
33
|
+
for ocr_rect in ocr_rects:
|
34
|
+
high_res_image = page._backend.get_page_image(
|
35
|
+
scale=self.scale, cropbox=ocr_rect
|
54
36
|
)
|
55
|
-
|
56
|
-
|
37
|
+
im = numpy.array(high_res_image)
|
38
|
+
result = self.reader.readtext(im)
|
39
|
+
|
40
|
+
del high_res_image
|
41
|
+
del im
|
42
|
+
|
43
|
+
cells = [
|
44
|
+
OcrCell(
|
45
|
+
id=ix,
|
46
|
+
text=line[1],
|
47
|
+
confidence=line[2],
|
48
|
+
bbox=BoundingBox.from_tuple(
|
49
|
+
coord=(
|
50
|
+
(line[0][0][0] / self.scale) + ocr_rect.l,
|
51
|
+
(line[0][0][1] / self.scale) + ocr_rect.t,
|
52
|
+
(line[0][2][0] / self.scale) + ocr_rect.l,
|
53
|
+
(line[0][2][1] / self.scale) + ocr_rect.t,
|
54
|
+
),
|
55
|
+
origin=CoordOrigin.TOPLEFT,
|
56
|
+
),
|
57
|
+
)
|
58
|
+
for ix, line in enumerate(result)
|
59
|
+
]
|
60
|
+
all_ocr_cells.extend(cells)
|
57
61
|
|
58
|
-
|
62
|
+
## Remove OCR cells which overlap with programmatic cells.
|
63
|
+
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
59
64
|
|
60
|
-
|
61
|
-
def draw_clusters_and_cells():
|
62
|
-
image = copy.deepcopy(page.image)
|
63
|
-
draw = ImageDraw.Draw(image)
|
64
|
-
|
65
|
-
cell_color = (
|
66
|
-
random.randint(30, 140),
|
67
|
-
random.randint(30, 140),
|
68
|
-
random.randint(30, 140),
|
69
|
-
)
|
70
|
-
for tc in cells:
|
71
|
-
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
72
|
-
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
73
|
-
image.show()
|
65
|
+
page.cells.extend(filtered_ocr_cells)
|
74
66
|
|
75
|
-
#
|
67
|
+
# DEBUG code:
|
68
|
+
# self.draw_ocr_rects_and_cells(page, ocr_rects)
|
76
69
|
|
77
70
|
yield page
|
docling/models/layout_model.py
CHANGED
@@ -267,7 +267,9 @@ class LayoutModel:
|
|
267
267
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
268
268
|
for page in page_batch:
|
269
269
|
clusters = []
|
270
|
-
for ix, pred_item in enumerate(
|
270
|
+
for ix, pred_item in enumerate(
|
271
|
+
self.layout_predictor.predict(page.get_image(scale=1.0))
|
272
|
+
):
|
271
273
|
cluster = Cluster(
|
272
274
|
id=ix,
|
273
275
|
label=pred_item["label"],
|
@@ -1,5 +1,4 @@
|
|
1
1
|
import copy
|
2
|
-
import random
|
3
2
|
from typing import Iterable, List
|
4
3
|
|
5
4
|
import numpy
|
@@ -34,7 +33,9 @@ class TableStructureModel:
|
|
34
33
|
self.scale = 2.0 # Scale up table input images to 144 dpi
|
35
34
|
|
36
35
|
def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
|
37
|
-
image =
|
36
|
+
image = (
|
37
|
+
page._backend.get_page_image()
|
38
|
+
) # make new image to avoid drawing on the saved ones
|
38
39
|
draw = ImageDraw.Draw(image)
|
39
40
|
|
40
41
|
for table_element in tbl_list:
|
@@ -94,13 +95,7 @@ class TableStructureModel:
|
|
94
95
|
"width": page.size.width * self.scale,
|
95
96
|
"height": page.size.height * self.scale,
|
96
97
|
}
|
97
|
-
|
98
|
-
if self.scale == 1.0:
|
99
|
-
page_input["image"] = numpy.asarray(page.image)
|
100
|
-
else: # render new page image on the fly at desired scale
|
101
|
-
page_input["image"] = numpy.asarray(
|
102
|
-
page._backend.get_page_image(scale=self.scale)
|
103
|
-
)
|
98
|
+
page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
|
104
99
|
|
105
100
|
table_clusters, table_bboxes = zip(*in_tables)
|
106
101
|
|
@@ -1,10 +1,8 @@
|
|
1
1
|
from pathlib import Path
|
2
|
-
from typing import Iterable
|
3
2
|
|
4
|
-
from docling.datamodel.base_models import
|
3
|
+
from docling.datamodel.base_models import PipelineOptions
|
5
4
|
from docling.models.easyocr_model import EasyOcrModel
|
6
5
|
from docling.models.layout_model import LayoutModel
|
7
|
-
from docling.models.page_assemble_model import PageAssembleModel
|
8
6
|
from docling.models.table_structure_model import TableStructureModel
|
9
7
|
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
10
8
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.
|
3
|
+
Version: 1.6.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -19,20 +19,20 @@ Classifier: Programming Language :: Python :: 3.10
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.11
|
20
20
|
Classifier: Programming Language :: Python :: 3.12
|
21
21
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
22
|
-
Provides-Extra: easyocr
|
23
22
|
Provides-Extra: ocr
|
24
23
|
Requires-Dist: certifi (>=2024.7.4)
|
25
24
|
Requires-Dist: deepsearch-glm (>=0.19.0,<1)
|
26
25
|
Requires-Dist: docling-core (>=1.1.2,<2.0.0)
|
27
26
|
Requires-Dist: docling-ibm-models (>=1.1.1,<2.0.0)
|
28
27
|
Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
|
29
|
-
Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "
|
28
|
+
Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "ocr"
|
30
29
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
31
30
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
32
31
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
33
32
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
34
33
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
35
34
|
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
35
|
+
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
36
36
|
Project-URL: Repository, https://github.com/DS4SD/docling
|
37
37
|
Description-Content-Type: text/markdown
|
38
38
|
|
@@ -44,6 +44,7 @@ Description-Content-Type: text/markdown
|
|
44
44
|
|
45
45
|
# Docling
|
46
46
|
|
47
|
+
[](https://arxiv.org/abs/2408.09869)
|
47
48
|
[](https://pypi.org/project/docling/)
|
48
49
|

|
49
50
|
[](https://python-poetry.org/)
|
@@ -172,13 +173,15 @@ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main
|
|
172
173
|
If you use Docling in your projects, please consider citing the following:
|
173
174
|
|
174
175
|
```bib
|
175
|
-
@
|
176
|
-
author = {Deep Search Team},
|
177
|
-
month = {
|
178
|
-
title = {{Docling}},
|
179
|
-
url
|
180
|
-
|
181
|
-
|
176
|
+
@techreport{Docling,
|
177
|
+
author = {Deep Search Team},
|
178
|
+
month = {8},
|
179
|
+
title = {{Docling Technical Report}},
|
180
|
+
url={https://arxiv.org/abs/2408.09869},
|
181
|
+
eprint={2408.09869},
|
182
|
+
doi = "10.48550/arXiv.2408.09869",
|
183
|
+
version = {1.0.0},
|
184
|
+
year = {2024}
|
182
185
|
}
|
183
186
|
```
|
184
187
|
|
@@ -0,0 +1,27 @@
|
|
1
|
+
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
docling/backend/abstract_backend.py,sha256=ZfEHaBPGM1cmqrhaEoU3MHhnHU11NhOnhtFEIbVMYDo,1221
|
4
|
+
docling/backend/docling_parse_backend.py,sha256=TN7Ln3Lkc8k0v6HzxA2iUGc8f2iqMw0I-3eryLQkpdw,6924
|
5
|
+
docling/backend/pypdfium2_backend.py,sha256=xUiIYgd7i22YDx4-W2hfPUaQFszW0gcT6pavG5qZ8LE,8062
|
6
|
+
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
+
docling/datamodel/base_models.py,sha256=5VHit5h7OleKnbhvy-sWDxQLizEdNrGUBrypyzwHyAE,8604
|
8
|
+
docling/datamodel/document.py,sha256=Dgi9pSwXCgIoR26MKiRDiVMyMaFKdvGSKq2Fm5Lef9M,13173
|
9
|
+
docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
|
10
|
+
docling/document_converter.py,sha256=UFSELvUSWsr8s0VByu4lNuzu7bn7zZauJTL3FTSLSBg,10371
|
11
|
+
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
|
+
docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
|
13
|
+
docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
|
14
|
+
docling/models/easyocr_model.py,sha256=ABIqALvtNNrDQ47fXaZ0lDFhOwKsYGUUlAPnIsFZgZA,2232
|
15
|
+
docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
|
16
|
+
docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
|
17
|
+
docling/models/table_structure_model.py,sha256=5jzTlpM-GdCSq4l0vD1W6aSPTJXeTcXEnNuPxnw-DlA,5437
|
18
|
+
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
|
+
docling/pipeline/base_model_pipeline.py,sha256=AC5NTR0xLy5JIZqsTINkKEHeCPqpyvJpuE_bcnZhyvI,529
|
20
|
+
docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjgvo_Fl2dfBVnRQs,1442
|
21
|
+
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
|
+
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
23
|
+
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
24
|
+
docling-1.6.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
|
25
|
+
docling-1.6.0.dist-info/METADATA,sha256=iMNzQ5wFtqHCTYat46cOq9JK0nhYKr1N6_PuEuah5D4,7227
|
26
|
+
docling-1.6.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
27
|
+
docling-1.6.0.dist-info/RECORD,,
|
docling-1.4.0.dist-info/RECORD
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7NtzKgo,1112
|
4
|
-
docling/backend/docling_parse_backend.py,sha256=bgsmnwDmroBwuOwkEUzlN9KMEIFJ1xUaCZW6rsr5G-c,5924
|
5
|
-
docling/backend/pypdfium2_backend.py,sha256=tv6JxyTkTdT2qr2ghsQgYA2zgpCDxKYSdHVBTAR7FSk,7411
|
6
|
-
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
docling/datamodel/base_models.py,sha256=irZLAHdsROOOwRbywKIA0mk3H8GrLwtGjOgTV6G0QoU,7004
|
8
|
-
docling/datamodel/document.py,sha256=lZHXINmPWvpzrV3PTilgJs1blqTMCnJdLEww_qfcqdE,12533
|
9
|
-
docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
|
10
|
-
docling/document_converter.py,sha256=dMucsq6M_nwPsC1ChogVwJgNDv8sJuFklQWWinDZaug,10246
|
11
|
-
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
|
-
docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
|
13
|
-
docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
|
14
|
-
docling/models/layout_model.py,sha256=3mOgNvCYPh99_oLxJy-ZaIqGOFgG5bcIQ0tTubW656Q,11204
|
15
|
-
docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
|
16
|
-
docling/models/table_structure_model.py,sha256=xUmfunZNYC30P0fRdESdztqy1FVlMzlhJjLBp-xcn4A,5638
|
17
|
-
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
-
docling/pipeline/base_model_pipeline.py,sha256=ozHdJak0yQAxQf7pQN_C480vI35A2e5KL5Qq1xSkq5c,560
|
19
|
-
docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFvpdUvyVw-gNmBlM,1541
|
20
|
-
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
|
-
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
22
|
-
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
23
|
-
docling-1.4.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
|
24
|
-
docling-1.4.0.dist-info/METADATA,sha256=Hu8pvrxpc0b1qzQvvzI_ijRAQWjOfcfNl4_1Zb7oyoc,7042
|
25
|
-
docling-1.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
26
|
-
docling-1.4.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|