docling-core 2.3.1__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -551,6 +551,28 @@ class DocItem(
551
551
 
552
552
  return location
553
553
 
554
+ def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
555
+ """Returns the image of this DocItem.
556
+
557
+ The function returns None if this DocItem has no valid provenance or
558
+ if a valid image of the page containing this DocItem is not available
559
+ in doc.
560
+ """
561
+ if not len(self.prov):
562
+ return None
563
+
564
+ page = doc.pages.get(self.prov[0].page_no)
565
+ if page is None or page.size is None or page.image is None:
566
+ return None
567
+
568
+ page_image = page.image.pil_image
569
+ crop_bbox = (
570
+ self.prov[0]
571
+ .bbox.to_top_left_origin(page_height=page.size.height)
572
+ .scaled(scale=page_image.height / page.size.height)
573
+ )
574
+ return page_image.crop(crop_bbox.as_tuple())
575
+
554
576
 
555
577
  class TextItem(DocItem):
556
578
  """TextItem."""
@@ -633,6 +655,20 @@ class FloatingItem(DocItem):
633
655
  text += cap.resolve(doc).text
634
656
  return text
635
657
 
658
+ def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
659
+ """Returns the image corresponding to this FloatingItem.
660
+
661
+ This function returns the PIL image from self.image if one is available.
662
+ Otherwise, it uses DocItem.get_image to get an image of this FloatingItem.
663
+
664
+ In particular, when self.image is None, the function returns None if this
665
+ FloatingItem has no valid provenance or the doc does not contain a valid image
666
+ for the required page.
667
+ """
668
+ if self.image is not None:
669
+ return self.image.pil_image
670
+ return super().get_image(doc=doc)
671
+
636
672
 
637
673
  class PictureItem(FloatingItem):
638
674
  """PictureItem."""
@@ -1255,7 +1291,10 @@ class DoclingDocument(BaseModel):
1255
1291
  # If the child is a NodeItem, recursively traverse it
1256
1292
  if not isinstance(child, PictureItem) or traverse_pictures:
1257
1293
  yield from self.iterate_items(
1258
- child, _level=_level + 1, with_groups=with_groups
1294
+ child,
1295
+ _level=_level + 1,
1296
+ with_groups=with_groups,
1297
+ page_no=page_no,
1259
1298
  )
1260
1299
 
1261
1300
  def print_element_tree(self):
@@ -1281,11 +1320,12 @@ class DoclingDocument(BaseModel):
1281
1320
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
1282
1321
  indent: int = 4,
1283
1322
  text_width: int = -1,
1323
+ page_no: Optional[int] = None,
1284
1324
  ) -> str:
1285
1325
  r"""Serialize to Markdown.
1286
1326
 
1287
- Operates on a slice of the document's main_text as defined through arguments
1288
- main_text_start and main_text_stop; defaulting to the whole main_text.
1327
+ Operates on a slice of the document's body as defined through arguments
1328
+ from_element and to_element; defaulting to the whole document.
1289
1329
 
1290
1330
  :param delim: Delimiter to use when concatenating the various
1291
1331
  Markdown parts. Defaults to "\n\n".
@@ -1294,11 +1334,9 @@ class DoclingDocument(BaseModel):
1294
1334
  Defaults to 0.
1295
1335
  :type from_element: int
1296
1336
  :param to_element: Body slicing stop index
1297
- (exclusive). Defaults to None.
1298
- :type to_element: Optional[int]
1337
+ (exclusive). Defaults to maxint.
1338
+ :type to_element: int
1299
1339
  :param delim: str: (Default value = "\n\n")
1300
- :param from_element: int: (Default value = 0)
1301
- :param to_element: Optional[int]: (Default value = None)
1302
1340
  :param labels: set[DocItemLabel]
1303
1341
  :param "subtitle-level-1":
1304
1342
  :param "paragraph":
@@ -1306,7 +1344,6 @@ class DoclingDocument(BaseModel):
1306
1344
  :param "table":
1307
1345
  :param "Text":
1308
1346
  :param "text":
1309
- :param ]:
1310
1347
  :param strict_text: bool: (Default value = False)
1311
1348
  :param image_placeholder str: (Default value = "<!-- image -->")
1312
1349
  the placeholder to include to position images in the markdown.
@@ -1320,7 +1357,7 @@ class DoclingDocument(BaseModel):
1320
1357
  in_list = False # Track if we're currently processing list items
1321
1358
 
1322
1359
  for ix, (item, level) in enumerate(
1323
- self.iterate_items(self.body, with_groups=True)
1360
+ self.iterate_items(self.body, with_groups=True, page_no=page_no)
1324
1361
  ):
1325
1362
  # If we've moved to a lower level, we're exiting one or more groups
1326
1363
  if level < previous_level:
@@ -1331,7 +1368,7 @@ class DoclingDocument(BaseModel):
1331
1368
 
1332
1369
  previous_level = level # Update previous_level for next iteration
1333
1370
 
1334
- if ix < from_element and to_element <= ix:
1371
+ if ix < from_element or to_element <= ix:
1335
1372
  continue # skip as many items as you want
1336
1373
 
1337
1374
  # Handle newlines between different types of content
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.3.1
3
+ Version: 2.4.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -21,7 +21,7 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
21
21
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
22
22
  docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
23
23
  docling_core/types/doc/base.py,sha256=zvx631U_yQCcJam83hNdDanXEYnO3eN-CCw9vDr6S-I,4442
24
- docling_core/types/doc/document.py,sha256=XF43-v9oflV-E5r2k2quoKvq8qBp5mAB_VunshY9b10,56356
24
+ docling_core/types/doc/document.py,sha256=6KeHY4yl4Ry5nT6wacb8ujJ5LnyEZohXG5MAGhoPWGY,57771
25
25
  docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
26
26
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
27
27
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
@@ -49,8 +49,8 @@ docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6
49
49
  docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
50
50
  docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
51
51
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
52
- docling_core-2.3.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
53
- docling_core-2.3.1.dist-info/METADATA,sha256=mASC44D6AB2bIACFr2oGrsZHtHRzn5e1wjBJyy6ccns,5432
54
- docling_core-2.3.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
55
- docling_core-2.3.1.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
56
- docling_core-2.3.1.dist-info/RECORD,,
52
+ docling_core-2.4.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
53
+ docling_core-2.4.0.dist-info/METADATA,sha256=fXFVK6Ey5DC15uSYgMixUmGxH6hEM-Kx06tK7jvW2IA,5432
54
+ docling_core-2.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
55
+ docling_core-2.4.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
56
+ docling_core-2.4.0.dist-info/RECORD,,