docling 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_backend.py +171 -0
- docling/datamodel/document.py +5 -5
- {docling-1.1.1.dist-info → docling-1.2.0.dist-info}/METADATA +4 -3
- {docling-1.1.1.dist-info → docling-1.2.0.dist-info}/RECORD +6 -5
- {docling-1.1.1.dist-info → docling-1.2.0.dist-info}/LICENSE +0 -0
- {docling-1.1.1.dist-info → docling-1.2.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,171 @@
|
|
1
|
+
import random
|
2
|
+
from io import BytesIO
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Iterable, List, Optional, Union
|
5
|
+
|
6
|
+
import pypdfium2 as pdfium
|
7
|
+
from docling_parse.docling_parse import pdf_parser
|
8
|
+
from PIL import Image, ImageDraw
|
9
|
+
from pypdfium2 import PdfPage
|
10
|
+
|
11
|
+
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
12
|
+
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
13
|
+
|
14
|
+
|
15
|
+
class DoclingParsePageBackend(PdfPageBackend):
    """Page backend that pairs a pypdfium2 page with docling-parse text cells.

    The pypdfium2 page supplies the page size and the rendered image. The
    docling-parse page object is a mapping from which this class reads the
    ``"width"``, ``"height"`` and ``"cells"`` entries; each cell provides its
    box under ``["box"]["device"]`` and its text under
    ``["content"]["rnormalized"]``.
    """

    def __init__(self, page_obj: PdfPage, docling_page_obj):
        """Store the pypdfium2 page and the matching docling-parse page object."""
        super().__init__(page_obj)
        self._ppage = page_obj
        self._dpage = docling_page_obj
        self.text_page = None

    def _cell_bbox(self, cell, page_size: PageSize, scale: float = 1) -> BoundingBox:
        """Return the bounding box of a parser cell in top-left page coordinates.

        The cell's ``(x0, y0, x1, y1)`` device box is rescaled from the parser's
        page dimensions to the pypdfium2 page size (times ``scale``), interpreted
        with a bottom-left origin and then converted to a top-left origin.
        """
        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]
        x0, y0, x1, y1 = cell["box"]["device"]
        return BoundingBox(
            l=x0 * scale * page_size.width / parser_width,
            b=y0 * scale * page_size.height / parser_height,
            r=x1 * scale * page_size.width / parser_width,
            t=y1 * scale * page_size.height / parser_height,
            coord_origin=CoordOrigin.BOTTOMLEFT,
        ).to_top_left_origin(page_size.height * scale)

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text of all cells that lie mostly inside ``bbox``.

        A cell contributes when more than half of its own area intersects
        ``bbox``. Contributions are joined with single spaces in cell order.
        Cells with zero area are skipped, because their overlap fraction is
        undefined.
        """
        page_size = self.get_size()
        # FIX - Replace with param in get_text_in_rect across backends (optional)
        scale = 1

        text_piece = ""
        for cell in self._dpage["cells"]:
            cell_bbox = self._cell_bbox(cell, page_size, scale)

            cell_area = cell_bbox.area()
            if not cell_area:
                # Degenerate box: dividing by its area would raise
                # ZeroDivisionError, and it cannot overlap anything anyway.
                continue

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_area
            if overlap_frac > 0.5:
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += cell["content"]["rnormalized"]

        return text_piece

    def get_text_cells(self) -> Iterable[Cell]:
        """Return one ``Cell`` per parser cell, with ids numbered from 0.

        Each cell carries the parser's normalized text and a bounding box in
        top-left page coordinates.
        """
        page_size = self.get_size()

        cells = [
            Cell(
                id=cell_id,
                text=cell["content"]["rnormalized"],
                bbox=self._cell_bbox(cell, page_size),
            )
            for cell_id, cell in enumerate(self._dpage["cells"])
        ]

        # Debugging aid: call self._draw_clusters_and_cells(cells) here to
        # inspect the boxes visually (e.g. before/after a horizontal merge
        # step such as ``cells = merge_horizontal_cells(cells)``).

        return cells

    def _draw_clusters_and_cells(self, cells) -> None:
        """Debug helper: outline every cell on the page image and display it.

        Not called during normal operation. Each cell is outlined in a random
        dark colour.
        """
        image = self.get_page_image()
        draw = ImageDraw.Draw(image)
        for c in cells:
            x0, y0, x1, y1 = c.bbox.as_tuple()
            cell_color = (
                random.randint(30, 140),
                random.randint(30, 140),
                random.randint(30, 140),
            )
            draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
        image.show()

    def get_page_image(
        self, scale: int = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or a crop of it, as a PIL image.

        Args:
            scale: Output scale factor applied to the crop box dimensions.
            cropbox: Region to render, in top-left page coordinates. When
                omitted, the whole page is rendered.

        Returns:
            An image of size ``round(cropbox.width * scale)`` by
            ``round(cropbox.height * scale)``.
        """
        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            # Whole page requested: nothing to trim on any side.
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # Turn the crop rectangle into per-edge margins. ``l`` and ``b``
            # already measure from the left/bottom edges after the origin
            # change; ``r`` and ``t`` are flipped to measure from the
            # right/top edges.
            padbox = cropbox.to_bottom_left_origin(page_size.height)
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> PageSize:
        """Return the page width and height as reported by pypdfium2."""
        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop the references to the underlying page objects."""
        self._ppage = None
        self._dpage = None
        self.text_page = None
|
146
|
+
|
147
|
+
|
148
|
+
class DoclingParseDocumentBackend(PdfDocumentBackend):
    """Document backend that opens a PDF with pypdfium2 and docling-parse.

    pypdfium2 provides the page objects; docling-parse provides the parsed
    document whose ``"pages"`` list holds the per-page cell data.
    """

    def __init__(self, path_or_stream: Union[BytesIO, Path]):
        """Open the document and parse its text cells.

        Args:
            path_or_stream: A filesystem path to the PDF, or an in-memory
                ``BytesIO`` holding its bytes.
        """
        super().__init__(path_or_stream)
        self._pdoc = pdfium.PdfDocument(path_or_stream)

        # Local import: keeps this block self-contained without touching the
        # module's import section.
        import logging

        # Emit a debug log record instead of printing, so library consumers
        # keep control of their stdout.
        logging.getLogger(__name__).debug("PARSING WITH DOCLING PARSE")

        # Parsing cells with docling_parser call
        parser = pdf_parser()
        self._parser_doc = self._find_cells(parser, path_or_stream)

    @staticmethod
    def _find_cells(parser, path_or_stream):
        """Run ``parser.find_cells`` on a path or on an in-memory stream.

        ``find_cells`` is given a path string. For a ``BytesIO`` source,
        ``str()`` would only yield the object's repr, so the bytes are first
        written to a temporary PDF file and that file's path is passed.
        """
        if not isinstance(path_or_stream, BytesIO):
            return parser.find_cells(str(path_or_stream))

        import tempfile

        # NOTE(review): assumes find_cells returns a fully materialised result
        # that no longer needs the file once the call returns - confirm.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_pdf = Path(tmp_dir) / "document.pdf"
            # getvalue() leaves the stream position untouched.
            tmp_pdf.write_bytes(path_or_stream.getvalue())
            return parser.find_cells(str(tmp_pdf))

    def page_count(self) -> int:
        """Return the number of pages found by docling-parse."""
        return len(self._parser_doc["pages"])

    def load_page(self, page_no: int) -> DoclingParsePageBackend:
        """Return a page backend for the page at index ``page_no``.

        The same index is used for both the pypdfium2 document and the
        docling-parse ``"pages"`` list.
        """
        return DoclingParsePageBackend(
            self._pdoc[page_no], self._parser_doc["pages"][page_no]
        )

    def is_valid(self) -> bool:
        """Return True when the parsed document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Close the pypdfium2 document and drop the parsed data.

        Safe to call more than once: the document is only closed while a
        handle is still held.
        """
        if self._pdoc is not None:
            self._pdoc.close()
        self._pdoc = None
        self._parser_doc = None
|
docling/datamodel/document.py
CHANGED
@@ -125,7 +125,7 @@ class ConvertedDocument(BaseModel):
|
|
125
125
|
desc = DsDocumentDescription(logs=[])
|
126
126
|
|
127
127
|
page_hashes = [
|
128
|
-
PageReference(hash=p.page_hash, page=p.page_no, model="default")
|
128
|
+
PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
|
129
129
|
for p in self.pages
|
130
130
|
]
|
131
131
|
|
@@ -159,7 +159,7 @@ class ConvertedDocument(BaseModel):
|
|
159
159
|
prov=[
|
160
160
|
Prov(
|
161
161
|
bbox=target_bbox,
|
162
|
-
page=element.page_no,
|
162
|
+
page=element.page_no + 1,
|
163
163
|
span=[0, len(element.text)],
|
164
164
|
)
|
165
165
|
],
|
@@ -242,7 +242,7 @@ class ConvertedDocument(BaseModel):
|
|
242
242
|
prov=[
|
243
243
|
Prov(
|
244
244
|
bbox=target_bbox,
|
245
|
-
page=element.page_no,
|
245
|
+
page=element.page_no + 1,
|
246
246
|
span=[0, 0],
|
247
247
|
)
|
248
248
|
],
|
@@ -264,7 +264,7 @@ class ConvertedDocument(BaseModel):
|
|
264
264
|
prov=[
|
265
265
|
Prov(
|
266
266
|
bbox=target_bbox,
|
267
|
-
page=element.page_no,
|
267
|
+
page=element.page_no + 1,
|
268
268
|
span=[0, 0],
|
269
269
|
)
|
270
270
|
],
|
@@ -274,7 +274,7 @@ class ConvertedDocument(BaseModel):
|
|
274
274
|
)
|
275
275
|
|
276
276
|
page_dimensions = [
|
277
|
-
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
|
277
|
+
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
|
278
278
|
for p in self.pages
|
279
279
|
]
|
280
280
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.1.1
|
3
|
+
Version: 1.2.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -22,8 +22,9 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
22
|
Provides-Extra: easyocr
|
23
23
|
Provides-Extra: ocr
|
24
24
|
Requires-Dist: deepsearch-glm (>=0.19.0,<1)
|
25
|
-
Requires-Dist: docling-core (>=1.1.
|
25
|
+
Requires-Dist: docling-core (>=1.1.2,<2.0.0)
|
26
26
|
Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
|
27
|
+
Requires-Dist: docling-parse (>=0.0.1,<0.0.2)
|
27
28
|
Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
|
28
29
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
29
30
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
@@ -92,7 +93,7 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
|
|
92
93
|
|
93
94
|
### Convert a batch of documents
|
94
95
|
|
95
|
-
For an example of converting
|
96
|
+
For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
|
96
97
|
|
97
98
|
From a local repo clone, you can run it with:
|
98
99
|
|
@@ -1,10 +1,11 @@
|
|
1
1
|
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
3
|
docling/backend/abstract_backend.py,sha256=dINr8oTax9Fq31Y1AR0CGWNZtAHN5aqB_M7TAPkJNVQ,1122
|
4
|
+
docling/backend/docling_parse_backend.py,sha256=cupeYC1evzM31lXskH-mbXnZhw1_JHyUiJ-cpTmlrM4,5834
|
4
5
|
docling/backend/pypdfium2_backend.py,sha256=cIQGFkwzceN57PzmACt06CytRo0A_t-im6rW804RC3M,7421
|
5
6
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
7
|
docling/datamodel/base_models.py,sha256=k7gLFPnq3ArEMAFz6qUcp5qemlYzVhOmR9qtBTkAiX4,6862
|
7
|
-
docling/datamodel/document.py,sha256=
|
8
|
+
docling/datamodel/document.py,sha256=FG_ntDFRBWj-MhV52D0sC8XaZOwN3yryyXahsVHGnyI,12517
|
8
9
|
docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
|
9
10
|
docling/document_converter.py,sha256=I9vjTLCLahsMrcs9ozM3C5r_CtBN-9qHk7-ANma7fkc,9895
|
10
11
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -19,7 +20,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFv
|
|
19
20
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
21
|
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
21
22
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
22
|
-
docling-1.
|
23
|
-
docling-1.
|
24
|
-
docling-1.
|
25
|
-
docling-1.
|
23
|
+
docling-1.2.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
|
24
|
+
docling-1.2.0.dist-info/METADATA,sha256=9ZWFckdLpf45avuDgZgyzQK6J2oLCK0_oCW9T9Rx4iU,6802
|
25
|
+
docling-1.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
26
|
+
docling-1.2.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|