docling 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,171 @@
1
+ import random
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Iterable, List, Optional, Union
5
+
6
+ import pypdfium2 as pdfium
7
+ from docling_parse.docling_parse import pdf_parser
8
+ from PIL import Image, ImageDraw
9
+ from pypdfium2 import PdfPage
10
+
11
+ from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
12
+ from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
+
14
+
15
class DoclingParsePageBackend(PdfPageBackend):
    """Page backend pairing a pypdfium2 page with docling-parse cell data.

    The pypdfium2 page supplies the page size and rendered images; the
    docling-parse page dict supplies the text cells. Cell boxes are read from
    ``cell["box"]["device"]``, rescaled from the parser's page dimensions
    (``"width"`` / ``"height"`` of the page dict) to the pypdfium2 page size,
    and converted from a bottom-left to a top-left coordinate origin.
    """

    def __init__(self, page_obj: PdfPage, docling_page_obj):
        """Store both page representations.

        Args:
            page_obj: The pypdfium2 page used for sizing and rendering.
            docling_page_obj: The docling-parse page mapping; this class reads
                its ``"width"``, ``"height"`` and ``"cells"`` entries.
        """
        super().__init__(page_obj)
        self._ppage = page_obj
        self._dpage = docling_page_obj
        self.text_page = None

    @staticmethod
    def _cell_bbox(
        rect, page_size: PageSize, parser_width, parser_height, scale=1
    ) -> BoundingBox:
        """Map a parser cell box onto the page, in top-left-origin coordinates.

        ``rect`` is unpacked as ``(x0, y0, x1, y1)`` and interpreted with a
        bottom-left origin. The arithmetic order is kept as
        ``coord * scale * page_dim / parser_dim`` so results match the
        previously inlined expressions exactly.
        """
        x0, y0, x1, y1 = rect
        return BoundingBox(
            l=x0 * scale * page_size.width / parser_width,
            b=y0 * scale * page_size.height / parser_height,
            r=x1 * scale * page_size.width / parser_width,
            t=y1 * scale * page_size.height / parser_height,
            coord_origin=CoordOrigin.BOTTOMLEFT,
        ).to_top_left_origin(page_size.height * scale)

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text of all cells lying mostly inside ``bbox``.

        A cell contributes its ``content.rnormalized`` text when more than
        half of its own area intersects ``bbox``. Contributions are joined
        with single spaces, in the order the cells appear in the page dict.

        Args:
            bbox: Query rectangle; compared against cell boxes expressed in
                top-left-origin page coordinates.

        Returns:
            The concatenated text, or ``""`` when no cell qualifies.
        """
        text_piece = ""
        page_size = self.get_size()
        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        # TODO: replace with a `scale` parameter of get_text_in_rect across
        # backends (optional). Until then the scale is fixed at 1.
        scale = 1

        for cell in self._dpage["cells"]:
            cell_bbox = self._cell_bbox(
                cell["box"]["device"], page_size, parser_width, parser_height, scale
            )

            cell_area = cell_bbox.area()
            if cell_area == 0:
                # Degenerate (zero-area) cell: the overlap fraction is
                # undefined and the division below would raise
                # ZeroDivisionError, so the cell is skipped.
                continue

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_area
            if overlap_frac > 0.5:
                if text_piece:
                    text_piece += " "
                text_piece += cell["content"]["rnormalized"]

        return text_piece

    def get_text_cells(self) -> Iterable[Cell]:
        """Return every parser cell of the page as a ``Cell``.

        Each ``Cell`` gets a sequential ``id`` starting at 0, the cell's
        ``content.rnormalized`` text, and its box rescaled to the page size in
        top-left-origin coordinates.

        Returns:
            A list of ``Cell`` objects in the order given by the page dict.
        """
        page_size = self.get_size()
        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        return [
            Cell(
                id=cell_id,
                text=cell["content"]["rnormalized"],
                bbox=self._cell_bbox(
                    cell["box"]["device"], page_size, parser_width, parser_height
                ),
            )
            for cell_id, cell in enumerate(self._dpage["cells"])
        ]

    def get_page_image(
        self, scale: int = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or a cropped region of it, as a PIL image.

        Args:
            scale: Output scale factor; the returned image measures
                ``cropbox.width * scale`` by ``cropbox.height * scale``
                pixels (rounded).
            cropbox: Region to render. When omitted, the whole page is
                rendered.

        Returns:
            The rendered image.
        """
        page_size = self.get_size()

        if cropbox is None:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            # Full page: nothing is trimmed from any border.
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # Turn the crop rectangle into per-border margins: l and b keep
            # their distance from the left/bottom edge, r and t become the
            # distance from the right/top edge.
            # NOTE(review): this presumes as_tuple() yields the order that
            # pypdfium2's `crop` argument expects -- confirm.
            padbox = cropbox.to_bottom_left_origin(page_size.height)
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        # Render at 1.5x the requested scale, then resize down to the target
        # size to make the result sharper.
        rendered = self._ppage.render(
            scale=scale * 1.5,
            rotation=0,  # no additional rotation
            crop=padbox.as_tuple(),
        ).to_pil()
        target_size = (round(cropbox.width * scale), round(cropbox.height * scale))
        return rendered.resize(size=target_size)

    def get_size(self) -> PageSize:
        """Return the page size as reported by the pypdfium2 page."""
        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop the references to both page objects and the text page."""
        self._ppage = None
        self._dpage = None
        self.text_page = None
146
+
147
+
148
class DoclingParseDocumentBackend(PdfDocumentBackend):
    """Document backend that opens a PDF with pypdfium2 and docling-parse.

    pypdfium2 provides the page objects; docling-parse provides the per-page
    cell data, read from the ``"pages"`` entry of its ``find_cells`` result.
    """

    def __init__(self, path_or_stream: Union[BytesIO, Path]):
        """Open the document and parse its cells.

        Args:
            path_or_stream: Filesystem path of the PDF, or an in-memory
                ``BytesIO`` holding its bytes.
        """
        # Function-scope import keeps this block self-contained.
        import logging

        super().__init__(path_or_stream)
        self._pdoc = pdfium.PdfDocument(path_or_stream)

        # Library code should not write to stdout; report through logging.
        logging.getLogger(__name__).debug("Parsing with docling-parse")
        self._parser_doc = self._find_cells(pdf_parser(), path_or_stream)

    @staticmethod
    def _find_cells(parser, path_or_stream):
        """Run ``parser.find_cells`` on a path or on an in-memory stream.

        ``find_cells`` is called with a string path. Passing
        ``str(BytesIO(...))`` would hand it the object's repr rather than a
        usable path, so stream input is first spooled to a temporary file.
        """
        import os
        import tempfile

        if not isinstance(path_or_stream, BytesIO):
            return parser.find_cells(str(path_or_stream))

        # delete=False so the file can be reopened by name on every platform;
        # it is removed explicitly once parsing has finished.
        # NOTE(review): assumes find_cells returns fully materialized data
        # that no longer needs the file afterwards -- confirm.
        tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
        try:
            with tmp:
                # getvalue() is independent of the current stream position.
                tmp.write(path_or_stream.getvalue())
            return parser.find_cells(tmp.name)
        finally:
            os.unlink(tmp.name)

    def page_count(self) -> int:
        """Return the number of pages found by docling-parse."""
        return len(self._parser_doc["pages"])

    def load_page(self, page_no: int) -> PdfPageBackend:
        """Return a page backend for the page at index ``page_no``.

        The same index is used for the pypdfium2 document and for the
        docling-parse ``"pages"`` list.
        """
        return DoclingParsePageBackend(
            self._pdoc[page_no], self._parser_doc["pages"][page_no]
        )

    def is_valid(self) -> bool:
        """Return True when the parsed document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Close the pypdfium2 document and drop the parsed data.

        Safe to call more than once.
        """
        if self._pdoc is not None:
            self._pdoc.close()
            self._pdoc = None
        self._parser_doc = None
@@ -125,7 +125,7 @@ class ConvertedDocument(BaseModel):
125
125
  desc = DsDocumentDescription(logs=[])
126
126
 
127
127
  page_hashes = [
128
- PageReference(hash=p.page_hash, page=p.page_no, model="default")
128
+ PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
129
129
  for p in self.pages
130
130
  ]
131
131
 
@@ -159,7 +159,7 @@ class ConvertedDocument(BaseModel):
159
159
  prov=[
160
160
  Prov(
161
161
  bbox=target_bbox,
162
- page=element.page_no,
162
+ page=element.page_no + 1,
163
163
  span=[0, len(element.text)],
164
164
  )
165
165
  ],
@@ -242,7 +242,7 @@ class ConvertedDocument(BaseModel):
242
242
  prov=[
243
243
  Prov(
244
244
  bbox=target_bbox,
245
- page=element.page_no,
245
+ page=element.page_no + 1,
246
246
  span=[0, 0],
247
247
  )
248
248
  ],
@@ -264,7 +264,7 @@ class ConvertedDocument(BaseModel):
264
264
  prov=[
265
265
  Prov(
266
266
  bbox=target_bbox,
267
- page=element.page_no,
267
+ page=element.page_no + 1,
268
268
  span=[0, 0],
269
269
  )
270
270
  ],
@@ -274,7 +274,7 @@ class ConvertedDocument(BaseModel):
274
274
  )
275
275
 
276
276
  page_dimensions = [
277
- PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
277
+ PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
278
278
  for p in self.pages
279
279
  ]
280
280
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.1.1
3
+ Version: 1.2.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -22,8 +22,9 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Provides-Extra: easyocr
23
23
  Provides-Extra: ocr
24
24
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
25
- Requires-Dist: docling-core (>=1.1.0,<2.0.0)
25
+ Requires-Dist: docling-core (>=1.1.2,<2.0.0)
26
26
  Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
27
+ Requires-Dist: docling-parse (>=0.0.1,<0.0.2)
27
28
  Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
28
29
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
29
30
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -92,7 +93,7 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
92
93
 
93
94
  ### Convert a batch of documents
94
95
 
95
- For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
96
+ For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
96
97
 
97
98
  From a local repo clone, you can run it with:
98
99
 
@@ -1,10 +1,11 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  docling/backend/abstract_backend.py,sha256=dINr8oTax9Fq31Y1AR0CGWNZtAHN5aqB_M7TAPkJNVQ,1122
4
+ docling/backend/docling_parse_backend.py,sha256=cupeYC1evzM31lXskH-mbXnZhw1_JHyUiJ-cpTmlrM4,5834
4
5
  docling/backend/pypdfium2_backend.py,sha256=cIQGFkwzceN57PzmACt06CytRo0A_t-im6rW804RC3M,7421
5
6
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
7
  docling/datamodel/base_models.py,sha256=k7gLFPnq3ArEMAFz6qUcp5qemlYzVhOmR9qtBTkAiX4,6862
7
- docling/datamodel/document.py,sha256=7caefzaii6itMQgtXfA4SJhB1TAF32v1c8zRwbiU03s,12497
8
+ docling/datamodel/document.py,sha256=FG_ntDFRBWj-MhV52D0sC8XaZOwN3yryyXahsVHGnyI,12517
8
9
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
9
10
  docling/document_converter.py,sha256=I9vjTLCLahsMrcs9ozM3C5r_CtBN-9qHk7-ANma7fkc,9895
10
11
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -19,7 +20,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFv
19
20
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
21
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
21
22
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
22
- docling-1.1.1.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
23
- docling-1.1.1.dist-info/METADATA,sha256=hnIPHm49bjWcFKBSCJ-aPsqim6aqHkWZiMdhkQli9Lk,6759
24
- docling-1.1.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
25
- docling-1.1.1.dist-info/RECORD,,
23
+ docling-1.2.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
24
+ docling-1.2.0.dist-info/METADATA,sha256=9ZWFckdLpf45avuDgZgyzQK6J2oLCK0_oCW9T9Rx4iU,6802
25
+ docling-1.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
26
+ docling-1.2.0.dist-info/RECORD,,