docling-core 2.29.0__py3-none-any.whl → 2.30.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -370,6 +370,13 @@ class HTMLPictureSerializer(BasePictureSerializer):
370
370
  **kwargs: Any,
371
371
  ) -> SerializationResult:
372
372
  """Export picture to HTML format."""
373
+
374
+ def get_img_row(imgb64: str, ind: int) -> str:
375
+ row = '<tr><td style="border: 2px solid black; padding: 8px;">'
376
+ row += f'<img src="data:image/png;base64,{imgb64}" alt="image {ind}">'
377
+ row += "</td></tr>\n"
378
+ return row
379
+
373
380
  params = HTMLParams(**kwargs)
374
381
 
375
382
  res_parts: list[SerializationResult] = []
@@ -393,6 +400,22 @@ class HTMLPictureSerializer(BasePictureSerializer):
393
400
  and item.image.uri.scheme == "data"
394
401
  ):
395
402
  img_text = f'<img src="{item.image.uri}">'
403
+ elif len(item.prov) > 1: # more than 1 provenance
404
+
405
+ img_text = (
406
+ '<table style="border-collapse: collapse; width: 100%;">\n'
407
+ )
408
+ for ind, prov in enumerate(item.prov):
409
+ img = item.get_image(doc, prov_index=ind)
410
+
411
+ if img is not None:
412
+ imgb64 = item._image_to_base64(img)
413
+ img_text += get_img_row(imgb64=imgb64, ind=ind)
414
+ else:
415
+ _logger.warning("Could not get image")
416
+
417
+ img_text += "</table>\n"
418
+
396
419
  else:
397
420
  # get the item.image._pil or crop it out of the page-image
398
421
  img = item.get_image(doc)
@@ -400,6 +423,9 @@ class HTMLPictureSerializer(BasePictureSerializer):
400
423
  if img is not None:
401
424
  imgb64 = item._image_to_base64(img)
402
425
  img_text = f'<img src="data:image/png;base64,{imgb64}">'
426
+ else:
427
+ _logger.warning("Could not get image")
428
+
403
429
  elif params.image_mode == ImageRefMode.REFERENCED:
404
430
  if isinstance(item.image, ImageRef) and not (
405
431
  isinstance(item.image.uri, AnyUrl)
@@ -790,7 +790,9 @@ class DocItem(
790
790
 
791
791
  return location
792
792
 
793
- def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
793
+ def get_image(
794
+ self, doc: "DoclingDocument", prov_index: int = 0
795
+ ) -> Optional[PILImage.Image]:
794
796
  """Returns the image of this DocItem.
795
797
 
796
798
  The function returns None if this DocItem has no valid provenance or
@@ -800,7 +802,7 @@ class DocItem(
800
802
  if not len(self.prov):
801
803
  return None
802
804
 
803
- page = doc.pages.get(self.prov[0].page_no)
805
+ page = doc.pages.get(self.prov[prov_index].page_no)
804
806
  if page is None or page.size is None or page.image is None:
805
807
  return None
806
808
 
@@ -808,7 +810,7 @@ class DocItem(
808
810
  if not page_image:
809
811
  return None
810
812
  crop_bbox = (
811
- self.prov[0]
813
+ self.prov[prov_index]
812
814
  .bbox.to_top_left_origin(page_height=page.size.height)
813
815
  .scale_to_size(old_size=page.size, new_size=page.image.size)
814
816
  # .scaled(scale=page_image.height / page.size.height)
@@ -973,7 +975,9 @@ class FloatingItem(DocItem):
973
975
  text += cap.resolve(doc).text
974
976
  return text
975
977
 
976
- def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
978
+ def get_image(
979
+ self, doc: "DoclingDocument", prov_index: int = 0
980
+ ) -> Optional[PILImage.Image]:
977
981
  """Returns the image corresponding to this FloatingItem.
978
982
 
979
983
  This function returns the PIL image from self.image if one is available.
@@ -985,7 +989,7 @@ class FloatingItem(DocItem):
985
989
  """
986
990
  if self.image is not None:
987
991
  return self.image.pil_image
988
- return super().get_image(doc=doc)
992
+ return super().get_image(doc=doc, prov_index=prov_index)
989
993
 
990
994
 
991
995
  class CodeItem(FloatingItem, TextItem):
@@ -1073,7 +1077,7 @@ class PictureItem(FloatingItem):
1073
1077
  image_bytes = self.image._pil.tobytes()
1074
1078
 
1075
1079
  # Create a hash object (e.g., SHA-256)
1076
- hasher = hashlib.sha256()
1080
+ hasher = hashlib.sha256(usedforsecurity=False)
1077
1081
 
1078
1082
  # Feed the image bytes into the hash object
1079
1083
  hasher.update(image_bytes)
@@ -2657,16 +2661,25 @@ class DoclingDocument(BaseModel):
2657
2661
  if should_yield:
2658
2662
  yield root, my_stack
2659
2663
 
2660
- # Handle picture traversal - only traverse children if requested
2661
- if isinstance(root, PictureItem) and not traverse_pictures:
2662
- return
2663
-
2664
2664
  my_stack.append(-1)
2665
2665
 
2666
+ allowed_pic_refs: set[str] = (
2667
+ {r.cref for r in root.captions}
2668
+ if (root_is_picture := isinstance(root, PictureItem))
2669
+ else set()
2670
+ )
2671
+
2666
2672
  # Traverse children
2667
2673
  for child_ind, child_ref in enumerate(root.children):
2668
- my_stack[-1] = child_ind
2669
2674
  child = child_ref.resolve(self)
2675
+ if (
2676
+ root_is_picture
2677
+ and not traverse_pictures
2678
+ and isinstance(child, DocItem)
2679
+ and child.self_ref not in allowed_pic_refs
2680
+ ):
2681
+ continue
2682
+ my_stack[-1] = child_ind
2670
2683
 
2671
2684
  if isinstance(child, NodeItem):
2672
2685
  yield from self._iterate_items_with_stack(
@@ -3597,7 +3610,9 @@ class DoclingDocument(BaseModel):
3597
3610
  rf"{DocumentToken.UNORDERED_LIST.value}|"
3598
3611
  rf"{DocItemLabel.KEY_VALUE_REGION}|"
3599
3612
  rf"{DocumentToken.CHART.value}|"
3600
- rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
3613
+ rf"{DocumentToken.OTSL.value})>"
3614
+ rf"(?P<content>.*?)"
3615
+ rf"(?:(?P<closed></(?P=tag)>)|(?P<eof>$))"
3601
3616
  )
3602
3617
  pattern = re.compile(tag_pattern, re.DOTALL)
3603
3618
 
@@ -3607,6 +3622,10 @@ class DoclingDocument(BaseModel):
3607
3622
  tag_name = match.group("tag")
3608
3623
 
3609
3624
  bbox = extract_bounding_box(full_chunk) # Extracts first bbox
3625
+ if not match.group("closed"):
3626
+ # no closing tag; only the existence of the item is recovered
3627
+ full_chunk = f"<{tag_name}></{tag_name}>"
3628
+
3610
3629
  doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
3611
3630
 
3612
3631
  if tag_name == DocumentToken.OTSL.value:
@@ -78,6 +78,7 @@ class GroupLabel(str, Enum):
78
78
  KEY_VALUE_AREA = "key_value_area"
79
79
  COMMENT_SECTION = "comment_section"
80
80
  INLINE = "inline"
81
+ PICTURE_AREA = "picture_area"
81
82
 
82
83
  def __str__(self):
83
84
  """Get string value."""
@@ -472,8 +472,27 @@ class SegmentedPage(BaseModel):
472
472
  word_cells: List[TextCell] = []
473
473
  textline_cells: List[TextCell] = []
474
474
 
475
+ # These flags are set to differentiate if above lists of this SegmentedPage
476
+ # are empty (page had no content) or if they have not been computed (i.e. textline_cells may be present
477
+ # but word_cells are not)
478
+ has_chars: bool = False
479
+ has_words: bool = False
480
+ has_lines: bool = False
481
+
475
482
  image: Optional[ImageRef] = None
476
483
 
484
+ @model_validator(mode="after")
485
+ def validate_page(self) -> "SegmentedPage":
486
+ """Validate page."""
487
+ if len(self.textline_cells) > 0:
488
+ self.has_lines = True
489
+ if len(self.word_cells) > 0:
490
+ self.has_words = True
491
+ if len(self.char_cells) > 0:
492
+ self.has_chars = True
493
+
494
+ return self
495
+
477
496
  def iterate_cells(self, unit_type: TextCellUnit) -> Iterator[TextCell]:
478
497
  """Iterate through text cells of the specified unit type.
479
498
 
@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocu
47
47
 
48
48
 
49
49
  def _create_hash(string: str):
50
- hasher = hashlib.sha256()
50
+ hasher = hashlib.sha256(usedforsecurity=False)
51
51
  hasher.update(string.encode("utf-8"))
52
52
 
53
53
  return hasher.hexdigest()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.29.0
3
+ Version: 2.30.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://github.com/docling-project
6
6
  License: MIT
@@ -29,7 +29,7 @@ docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3
29
29
  docling_core/transforms/serializer/base.py,sha256=9bgpWA0oMmZNRc3yIuZVnu5bJ1glClBsswtVF1vYwMI,6046
30
30
  docling_core/transforms/serializer/common.py,sha256=xBwhsgDZbNWMpp6ExpyUWO8_NvHPfbPF1ak2z9API5M,17435
31
31
  docling_core/transforms/serializer/doctags.py,sha256=mEmRWVuebcG5pZcR1_HX146cyUk0_FjaLQtMXSgh9hs,17870
32
- docling_core/transforms/serializer/html.py,sha256=r37zA88ca0GAo1-I-il6ZMQA7OsT0gsjAYsqQFfH9o4,33064
32
+ docling_core/transforms/serializer/html.py,sha256=2zlV8B-xsXHfTM13sb2pRwtKaxLcNUlW0p0NCgbiatk,34088
33
33
  docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
34
34
  docling_core/transforms/serializer/markdown.py,sha256=YqThAYMsOWSg6nZnnmrUHZohn0QvfZzRqpLrB-Keev8,17873
35
35
  docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
@@ -40,9 +40,9 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
40
40
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
41
41
  docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
42
42
  docling_core/types/doc/base.py,sha256=sM3IyFXzVh2WT8IGh5nejXYh8sf39yBh8TBSlHeJ9CI,12611
43
- docling_core/types/doc/document.py,sha256=wCUKOGngiqITddTinlU-qGys3QdfGn29eoRUEeKrHOQ,140220
44
- docling_core/types/doc/labels.py,sha256=3QgteZZ4jKi0fideTuTnuriviJBwew-5RKE4pse7Ppk,5812
45
- docling_core/types/doc/page.py,sha256=MvQnrCLDQDd9STawMFNXVCI4oYKYcQ3ECXlAUBbSz9A,40406
43
+ docling_core/types/doc/document.py,sha256=eboNYL-QVnDNnw3vL7PPVdPosfs5oNfsrVofxmdBDHw,140884
44
+ docling_core/types/doc/labels.py,sha256=xyrQC89OfedBaBvulXtgIGgN3Sv0D4J5k0qSXnGw2hk,5846
45
+ docling_core/types/doc/page.py,sha256=1JMPwglaTITBvg959L_pcWPb-fXoDYGh-e_tGZMzVMQ,41060
46
46
  docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
47
47
  docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
48
48
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
@@ -70,11 +70,11 @@ docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,8
70
70
  docling_core/utils/file.py,sha256=GzX0pclvewwPoqHJSaVUuULzSJwJgkCUwgKgJ7G5ohQ,5628
71
71
  docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
72
72
  docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
73
- docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
73
+ docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
74
74
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
75
75
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
76
- docling_core-2.29.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
77
- docling_core-2.29.0.dist-info/METADATA,sha256=Lkbmi6367j_0fHabpIhLMgyv9qXBEI4fONRdMBvDEm4,5976
78
- docling_core-2.29.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
79
- docling_core-2.29.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
80
- docling_core-2.29.0.dist-info/RECORD,,
76
+ docling_core-2.30.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
77
+ docling_core-2.30.0.dist-info/METADATA,sha256=mzObLyfGYJ-ZLvh5X4plERiaEI_58l7fXs7zp0uqn_M,5976
78
+ docling_core-2.30.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
79
+ docling_core-2.30.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
80
+ docling_core-2.30.0.dist-info/RECORD,,