docling 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,4 +1,6 @@
1
+ import logging
1
2
  import random
3
+ import time
2
4
  from io import BytesIO
3
5
  from pathlib import Path
4
6
  from typing import Iterable, List, Optional, Union
@@ -11,6 +13,8 @@ from pypdfium2 import PdfPage
11
13
  from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
12
14
  from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
15
 
16
+ _log = logging.getLogger(__name__)
17
+
14
18
 
15
19
  class DoclingParsePageBackend(PdfPageBackend):
16
20
  def __init__(self, page_obj: PdfPage, docling_page_obj):
@@ -80,7 +84,9 @@ class DoclingParsePageBackend(PdfPageBackend):
80
84
  cell_counter += 1
81
85
 
82
86
  def draw_clusters_and_cells():
83
- image = self.get_page_image()
87
+ image = (
88
+ self.get_page_image()
89
+ ) # make new image to avoid drawing on the saved ones
84
90
  draw = ImageDraw.Draw(image)
85
91
  for c in cells:
86
92
  x0, y0, x1, y1 = c.bbox.as_tuple()
@@ -150,10 +156,19 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
150
156
  super().__init__(path_or_stream)
151
157
  self._pdoc = pdfium.PdfDocument(path_or_stream)
152
158
  # Parsing cells with docling_parser call
153
- if isinstance(path_or_stream, BytesIO):
154
- raise NotImplemented("This backend does not support byte streams yet.")
155
159
  parser = pdf_parser()
156
- self._parser_doc = parser.find_cells(str(path_or_stream))
160
+
161
+ start_pb_time = time.time()
162
+
163
+ if isinstance(path_or_stream, BytesIO):
164
+ self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
165
+ else:
166
+ self._parser_doc = parser.find_cells(str(path_or_stream))
167
+
168
+ end_pb_time = time.time() - start_pb_time
169
+ _log.info(
170
+ f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
171
+ )
157
172
 
158
173
  def page_count(self) -> int:
159
174
  return len(self._parser_doc["pages"])
@@ -134,7 +134,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
134
134
  return merged_cells
135
135
 
136
136
  def draw_clusters_and_cells():
137
- image = self.get_page_image()
137
+ image = (
138
+ self.get_page_image()
139
+ ) # make new image to avoid drawing on the saved ones
138
140
  draw = ImageDraw.Draw(image)
139
141
  for c in cells:
140
142
  x0, y0, x1, y1 = c.bbox.as_tuple()
@@ -1,10 +1,12 @@
1
1
  import copy
2
+ import warnings
2
3
  from enum import Enum, auto
3
4
  from io import BytesIO
4
- from typing import Any, Dict, List, Optional, Tuple, Union
5
+ from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
5
6
 
6
7
  from PIL.Image import Image
7
- from pydantic import BaseModel, ConfigDict, model_validator
8
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
9
+ from typing_extensions import Self
8
10
 
9
11
  from docling.backend.abstract_backend import PdfPageBackend
10
12
 
@@ -234,14 +236,30 @@ class Page(BaseModel):
234
236
  model_config = ConfigDict(arbitrary_types_allowed=True)
235
237
 
236
238
  page_no: int
237
- page_hash: str = None
238
- size: PageSize = None
239
- image: Image = None
239
+ page_hash: Optional[str] = None
240
+ size: Optional[PageSize] = None
240
241
  cells: List[Cell] = None
241
242
  predictions: PagePredictions = PagePredictions()
242
- assembled: AssembledUnit = None
243
+ assembled: Optional[AssembledUnit] = None
243
244
 
244
- _backend: PdfPageBackend = None # Internal PDF backend
245
+ _backend: Optional[PdfPageBackend] = (
246
+ None # Internal PDF backend. By default it is cleared during assembling.
247
+ )
248
+ _default_image_scale: float = 1.0 # Default image scale for external usage.
249
+ _image_cache: Dict[float, Image] = (
250
+ {}
251
+ ) # Cache of images in different scales. By default it is cleared during assembling.
252
+
253
+ def get_image(self, scale: float = 1.0) -> Optional[Image]:
254
+ if self._backend is None:
255
+ return self._image_cache.get(scale, None)
256
+ if not scale in self._image_cache:
257
+ self._image_cache[scale] = self._backend.get_page_image(scale=scale)
258
+ return self._image_cache[scale]
259
+
260
+ @property
261
+ def image(self) -> Optional[Image]:
262
+ return self.get_image(scale=self._default_image_scale)
245
263
 
246
264
 
247
265
  class DocumentStream(BaseModel):
@@ -268,6 +286,19 @@ class PipelineOptions(BaseModel):
268
286
 
269
287
 
270
288
  class AssembleOptions(BaseModel):
271
- keep_page_images: bool = (
272
- False # False: page images are removed in the assemble step
273
- )
289
+ keep_page_images: Annotated[
290
+ bool,
291
+ Field(
292
+ deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
293
+ ),
294
+ ] = False # False: page images are removed in the assemble step
295
+ images_scale: Optional[float] = None # if set, the scale for generated images
296
+
297
+ @model_validator(mode="after")
298
+ def set_page_images_from_deprecated(self) -> Self:
299
+ with warnings.catch_warnings():
300
+ warnings.simplefilter("ignore", DeprecationWarning)
301
+ default_scale = 1.0
302
+ if self.keep_page_images and self.images_scale is None:
303
+ self.images_scale = default_scale
304
+ return self
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path, PurePath
4
- from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
4
+ from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
5
5
 
6
6
  from docling_core.types import BaseCell, BaseText
7
7
  from docling_core.types import BoundingBox as DsBoundingBox
@@ -14,13 +14,14 @@ from docling_core.types import TableCell
14
14
  from pydantic import BaseModel
15
15
 
16
16
  from docling.backend.abstract_backend import PdfDocumentBackend
17
- from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
17
+ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
18
18
  from docling.datamodel.base_models import (
19
19
  AssembledUnit,
20
20
  ConversionStatus,
21
21
  DocumentStream,
22
22
  FigureElement,
23
23
  Page,
24
+ PageElement,
24
25
  TableElement,
25
26
  TextElement,
26
27
  )
@@ -64,7 +65,7 @@ class InputDocument(BaseModel):
64
65
  path_or_stream: Union[BytesIO, Path],
65
66
  filename: Optional[str] = None,
66
67
  limits: Optional[DocumentLimits] = None,
67
- pdf_backend=PyPdfiumDocumentBackend,
68
+ pdf_backend=DoclingParseDocumentBackend,
68
69
  ):
69
70
  super().__init__()
70
71
 
@@ -302,13 +303,27 @@ class ConvertedDocument(BaseModel):
302
303
  else:
303
304
  return ""
304
305
 
306
+ def render_element_images(
307
+ self, element_types: Tuple[PageElement] = (FigureElement,)
308
+ ):
309
+ for element in self.assembled.elements:
310
+ if isinstance(element, element_types):
311
+ page_ix = element.page_no
312
+ scale = self.pages[page_ix]._default_image_scale
313
+ crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
314
+ page_height=self.pages[page_ix].size.height * scale
315
+ )
316
+
317
+ cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
318
+ yield element, cropped_im
319
+
305
320
 
306
321
  class DocumentConversionInput(BaseModel):
307
322
 
308
323
  _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
309
324
  limits: Optional[DocumentLimits] = DocumentLimits()
310
325
 
311
- DEFAULT_BACKEND: ClassVar = PyPdfiumDocumentBackend
326
+ DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
312
327
 
313
328
  def docs(
314
329
  self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
@@ -188,10 +188,8 @@ class DocumentConverter:
188
188
  # Free up mem resources before moving on with next batch
189
189
 
190
190
  # Remove page images (can be disabled)
191
- if not self.assemble_options.keep_page_images:
192
- assembled_page.image = (
193
- None # Comment this if you want to visualize page images
194
- )
191
+ if self.assemble_options.images_scale is None:
192
+ assembled_page._image_cache = {}
195
193
 
196
194
  # Unload backend
197
195
  assembled_page._backend.unload()
@@ -231,7 +229,15 @@ class DocumentConverter:
231
229
 
232
230
  # Generate the page image and store it in the page object
233
231
  def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
234
- page.image = page._backend.get_page_image()
232
+ # default scale
233
+ page.get_image(scale=1.0)
234
+
235
+ # user requested scales
236
+ if self.assemble_options.images_scale is not None:
237
+ page._default_image_scale = self.assemble_options.images_scale
238
+ page.get_image(
239
+ scale=self.assemble_options.images_scale
240
+ ) # this will trigger storing the image in the internal cache
235
241
 
236
242
  return page
237
243
 
@@ -247,7 +253,7 @@ class DocumentConverter:
247
253
  draw.rectangle([(x0, y0), (x1, y1)], outline="red")
248
254
  image.show()
249
255
 
250
- # draw_text_boxes(page.image, cells)
256
+ # draw_text_boxes(page.get_image(scale=1.0), cells)
251
257
 
252
258
  return page
253
259
 
@@ -30,7 +30,7 @@ class EasyOcrModel:
30
30
 
31
31
  for page in page_batch:
32
32
  # rects = page._fpage.
33
- high_res_image = page._backend.get_page_image(scale=self.scale)
33
+ high_res_image = page.get_image(scale=self.scale)
34
34
  im = numpy.array(high_res_image)
35
35
  result = self.reader.readtext(im)
36
36
 
@@ -69,6 +69,10 @@ class LayoutModel:
69
69
  "Key-Value Region": 0.45,
70
70
  }
71
71
 
72
+ CLASS_REMAPPINGS = {
73
+ "Document Index": "Table",
74
+ }
75
+
72
76
  _log.debug("================= Start postprocess function ====================")
73
77
  start_time = time.time()
74
78
  # Apply Confidence Threshold to cluster predictions
@@ -79,6 +83,10 @@ class LayoutModel:
79
83
  confidence = CLASS_THRESHOLDS[cluster.label]
80
84
  if cluster.confidence >= confidence:
81
85
  # annotation["created_by"] = "high_conf_pred"
86
+
87
+ # Remap class labels where needed.
88
+ if cluster.label in CLASS_REMAPPINGS.keys():
89
+ cluster.label = CLASS_REMAPPINGS[cluster.label]
82
90
  clusters_out.append(cluster)
83
91
 
84
92
  # map to dictionary clusters and cells, with bottom left origin
@@ -259,7 +267,9 @@ class LayoutModel:
259
267
  def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
260
268
  for page in page_batch:
261
269
  clusters = []
262
- for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)):
270
+ for ix, pred_item in enumerate(
271
+ self.layout_predictor.predict(page.get_image(scale=1.0))
272
+ ):
263
273
  cluster = Cluster(
264
274
  id=ix,
265
275
  label=pred_item["label"],
@@ -34,7 +34,9 @@ class TableStructureModel:
34
34
  self.scale = 2.0 # Scale up table input images to 144 dpi
35
35
 
36
36
  def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
37
- image = page._backend.get_page_image()
37
+ image = (
38
+ page._backend.get_page_image()
39
+ ) # make new image to avoid drawing on the saved ones
38
40
  draw = ImageDraw.Draw(image)
39
41
 
40
42
  for table_element in tbl_list:
@@ -94,13 +96,7 @@ class TableStructureModel:
94
96
  "width": page.size.width * self.scale,
95
97
  "height": page.size.height * self.scale,
96
98
  }
97
- # add image to page input.
98
- if self.scale == 1.0:
99
- page_input["image"] = numpy.asarray(page.image)
100
- else: # render new page image on the fly at desired scale
101
- page_input["image"] = numpy.asarray(
102
- page._backend.get_page_image(scale=self.scale)
103
- )
99
+ page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
104
100
 
105
101
  table_clusters, table_bboxes = zip(*in_tables)
106
102
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.3.0
3
+ Version: 1.5.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -24,8 +24,8 @@ Provides-Extra: ocr
24
24
  Requires-Dist: certifi (>=2024.7.4)
25
25
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
26
26
  Requires-Dist: docling-core (>=1.1.2,<2.0.0)
27
- Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
28
- Requires-Dist: docling-parse (>=0.0.1,<0.0.2)
27
+ Requires-Dist: docling-ibm-models (>=1.1.1,<2.0.0)
28
+ Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
29
29
  Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
30
30
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
31
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -44,6 +44,7 @@ Description-Content-Type: text/markdown
44
44
 
45
45
  # Docling
46
46
 
47
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
47
48
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
48
49
  ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
49
50
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
@@ -172,13 +173,15 @@ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main
172
173
  If you use Docling in your projects, please consider citing the following:
173
174
 
174
175
  ```bib
175
- @software{Docling,
176
- author = {Deep Search Team},
177
- month = {7},
178
- title = {{Docling}},
179
- url = {https://github.com/DS4SD/docling},
180
- version = {main},
181
- year = {2024}
176
+ @techreport{Docling,
177
+ author = {Deep Search Team},
178
+ month = {8},
179
+ title = {{Docling Technical Report}},
180
+ url={https://arxiv.org/abs/2408.09869},
181
+ eprint={2408.09869},
182
+ doi = "10.48550/arXiv.2408.09869",
183
+ version = {1.0.0},
184
+ year = {2024}
182
185
  }
183
186
  ```
184
187
 
@@ -1,26 +1,26 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7NtzKgo,1112
4
- docling/backend/docling_parse_backend.py,sha256=mGuJCpMVqyrZK-cXKRWrELPz0Wt1h6uydx4QwWI1rew,5912
5
- docling/backend/pypdfium2_backend.py,sha256=tv6JxyTkTdT2qr2ghsQgYA2zgpCDxKYSdHVBTAR7FSk,7411
4
+ docling/backend/docling_parse_backend.py,sha256=-bIjYJ-80R2SArAEw_lAyzgW5_BFEoX83n1oBMmUGF4,6284
5
+ docling/backend/pypdfium2_backend.py,sha256=3Qeeal8z6DunUe4S10Z2TXrdeucanCpa8evt6SQtpKQ,7496
6
6
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- docling/datamodel/base_models.py,sha256=irZLAHdsROOOwRbywKIA0mk3H8GrLwtGjOgTV6G0QoU,7004
8
- docling/datamodel/document.py,sha256=FG_ntDFRBWj-MhV52D0sC8XaZOwN3yryyXahsVHGnyI,12517
7
+ docling/datamodel/base_models.py,sha256=uOq0zjUS60aIkROREiypp3Jn1yqQTlWEf34jXTT43ls,8391
8
+ docling/datamodel/document.py,sha256=Dgi9pSwXCgIoR26MKiRDiVMyMaFKdvGSKq2Fm5Lef9M,13173
9
9
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
10
- docling/document_converter.py,sha256=dMucsq6M_nwPsC1ChogVwJgNDv8sJuFklQWWinDZaug,10246
10
+ docling/document_converter.py,sha256=r9z48VjL_hkq-rbAgyZ135njzUGBJ5AnhEH6-1zfyCA,10490
11
11
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
13
- docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
14
- docling/models/layout_model.py,sha256=4AfPFiu6pXc8wIQ1sQlEZnHRt7SnBmfzDdctiRveOWw,10944
13
+ docling/models/easyocr_model.py,sha256=Y-RWolIFE3By6gk8dnb2qFy7Cr9qcHs6eo65fWPT0Nc,2276
14
+ docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
15
15
  docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
16
- docling/models/table_structure_model.py,sha256=xUmfunZNYC30P0fRdESdztqy1FVlMzlhJjLBp-xcn4A,5638
16
+ docling/models/table_structure_model.py,sha256=lKsodvfZaGwxOHp-CbRW5nzCKZYMwf770h0Ka6Bdbgw,5451
17
17
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  docling/pipeline/base_model_pipeline.py,sha256=ozHdJak0yQAxQf7pQN_C480vI35A2e5KL5Qq1xSkq5c,560
19
19
  docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFvpdUvyVw-gNmBlM,1541
20
20
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
22
22
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
23
- docling-1.3.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
24
- docling-1.3.0.dist-info/METADATA,sha256=wi2DOn77z_BIMSLsrmzebYZUgpjHYWbNTOIVEY3A4-o,7042
25
- docling-1.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
26
- docling-1.3.0.dist-info/RECORD,,
23
+ docling-1.5.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
24
+ docling-1.5.0.dist-info/METADATA,sha256=jWcjsrdfYcpeYFCRQ1h5C1b8MyaKsJWyUhGheXQEGvY,7235
25
+ docling-1.5.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
26
+ docling-1.5.0.dist-info/RECORD,,