docling-core: 2.30.1 (py3-none-any.whl) → 2.31.0 (py3-none-any.whl)

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -169,7 +169,7 @@ class CommonParams(BaseModel):
169
169
 
170
170
  def merge_with_patch(self, patch: dict[str, Any]) -> Self:
171
171
  """Create an instance by merging the provided patch dict on top of self."""
172
- res = self.model_validate({**self.model_dump(), **patch})
172
+ res = self.model_copy(update=patch)
173
173
  return res
174
174
 
175
175
 
@@ -260,10 +260,10 @@ class DocSerializer(BaseModel, BaseDocSerializer):
260
260
  """Serialize a document out of its pages."""
261
261
  ...
262
262
 
263
- def _serialize_body(self) -> SerializationResult:
263
+ def _serialize_body(self, **kwargs) -> SerializationResult:
264
264
  """Serialize the document body."""
265
265
  subparts = self.get_parts()
266
- res = self.serialize_doc(parts=subparts)
266
+ res = self.serialize_doc(parts=subparts, **kwargs)
267
267
  return res
268
268
 
269
269
  @override
@@ -278,12 +278,12 @@ class DocSerializer(BaseModel, BaseDocSerializer):
278
278
  ) -> SerializationResult:
279
279
  """Serialize a given node."""
280
280
  my_visited: set[str] = visited if visited is not None else set()
281
- my_kwargs = self.params.merge_with_patch(patch=kwargs).model_dump()
281
+ my_kwargs = {**self.params.model_dump(), **kwargs}
282
282
  empty_res = create_ser_result()
283
283
  if item is None or item == self.doc.body:
284
284
  if self.doc.body.self_ref not in my_visited:
285
285
  my_visited.add(self.doc.body.self_ref)
286
- return self._serialize_body()
286
+ return self._serialize_body(**my_kwargs)
287
287
  else:
288
288
  return empty_res
289
289
 
@@ -16,6 +16,7 @@ from xml.etree.cElementTree import SubElement, tostring
16
16
  from xml.sax.saxutils import unescape
17
17
 
18
18
  import latex2mathml.converter
19
+ from PIL.Image import Image
19
20
  from pydantic import AnyUrl, BaseModel
20
21
  from typing_extensions import override
21
22
 
@@ -40,6 +41,7 @@ from docling_core.transforms.serializer.html_styles import (
40
41
  _get_css_for_single_column,
41
42
  _get_css_for_split_page,
42
43
  )
44
+ from docling_core.transforms.visualizer.base import BaseVisualizer
43
45
  from docling_core.types.doc.base import ImageRefMode
44
46
  from docling_core.types.doc.document import (
45
47
  CodeItem,
@@ -821,9 +823,22 @@ class HTMLDocSerializer(DocSerializer):
821
823
  def serialize_doc(
822
824
  self,
823
825
  parts: list[SerializationResult],
826
+ visualizer: Optional[BaseVisualizer] = None,
824
827
  **kwargs: Any,
825
828
  ) -> SerializationResult:
826
829
  """Serialize a document out of its pages."""
830
+
831
+ def _serialize_page_img(page_img: Image):
832
+ buffered = BytesIO()
833
+ page_img.save(buffered, format="PNG") # Save the image to the byte stream
834
+ img_bytes = buffered.getvalue() # Get the byte data
835
+
836
+ # Encode to Base64 and decode to string
837
+ img_base64 = base64.b64encode(img_bytes).decode("utf-8")
838
+ img_text = f'<img src="data:image/png;base64,{img_base64}">'
839
+
840
+ return f"<figure>{img_text}</figure>"
841
+
827
842
  # Create HTML structure
828
843
  html_parts = [
829
844
  "<!DOCTYPE html>",
@@ -853,19 +868,26 @@ class HTMLDocSerializer(DocSerializer):
853
868
  html_parts.append("<table>")
854
869
  html_parts.append("<tbody>")
855
870
 
871
+ vized_pages_dict: dict[Optional[int], Image] = {}
872
+ if visualizer:
873
+ vized_pages_dict = visualizer.get_visualization(doc=self.doc)
874
+
856
875
  for page_no, page in pages.items():
857
876
 
858
877
  if isinstance(page_no, int):
859
878
  if applicable_pages is not None and page_no not in applicable_pages:
860
879
  continue
861
880
  page_img = self.doc.pages[page_no].image
881
+ vized_page = vized_pages_dict.get(page_no)
862
882
 
863
883
  html_parts.append("<tr>")
864
884
 
865
885
  html_parts.append("<td>")
866
886
 
887
+ if vized_page:
888
+ html_parts.append(_serialize_page_img(page_img=vized_page))
867
889
  # short-cut: we already have the image in base64
868
- if (
890
+ elif (
869
891
  (page_img is not None)
870
892
  and isinstance(page_img, ImageRef)
871
893
  and isinstance(page_img.uri, AnyUrl)
@@ -875,18 +897,7 @@ class HTMLDocSerializer(DocSerializer):
875
897
  html_parts.append(f"<figure>{img_text}</figure>")
876
898
 
877
899
  elif (page_img is not None) and (page_img._pil is not None):
878
-
879
- buffered = BytesIO()
880
- page_img._pil.save(
881
- buffered, format="PNG"
882
- ) # Save the image to the byte stream
883
- img_bytes = buffered.getvalue() # Get the byte data
884
-
885
- # Encode to Base64 and decode to string
886
- img_base64 = base64.b64encode(img_bytes).decode("utf-8")
887
- img_text = f'<img src="data:image/png;base64,{img_base64}">'
888
-
889
- html_parts.append(f"<figure>{img_text}</figure>")
900
+ html_parts.append(_serialize_page_img(page_img=page_img._pil))
890
901
  else:
891
902
  html_parts.append("<figure>no page-image found</figure>")
892
903
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.30.1
3
+ Version: 2.31.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://github.com/docling-project
6
6
  License: MIT
@@ -27,9 +27,9 @@ docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=JQ-D3b5vTPQbvu4H
27
27
  docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
28
28
  docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
29
29
  docling_core/transforms/serializer/base.py,sha256=9bgpWA0oMmZNRc3yIuZVnu5bJ1glClBsswtVF1vYwMI,6046
30
- docling_core/transforms/serializer/common.py,sha256=xBwhsgDZbNWMpp6ExpyUWO8_NvHPfbPF1ak2z9API5M,17435
30
+ docling_core/transforms/serializer/common.py,sha256=TC1EwHIp9PYcI8jeTKeavUAPtounYmS0V1bfS_wDKm0,17427
31
31
  docling_core/transforms/serializer/doctags.py,sha256=mEmRWVuebcG5pZcR1_HX146cyUk0_FjaLQtMXSgh9hs,17870
32
- docling_core/transforms/serializer/html.py,sha256=2zlV8B-xsXHfTM13sb2pRwtKaxLcNUlW0p0NCgbiatk,34088
32
+ docling_core/transforms/serializer/html.py,sha256=Xq9CU5qZTDdwstizYqWNL_TFNDs9NHK_6JvvZk0TP98,34571
33
33
  docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
34
34
  docling_core/transforms/serializer/markdown.py,sha256=YqThAYMsOWSg6nZnnmrUHZohn0QvfZzRqpLrB-Keev8,17873
35
35
  docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
@@ -73,8 +73,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
73
73
  docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
74
74
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
75
75
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
76
- docling_core-2.30.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
77
- docling_core-2.30.1.dist-info/METADATA,sha256=qvcnzM33mlJCxj-5MIz4VjdfAGHk-xiYqbNPzsEx6GY,5976
78
- docling_core-2.30.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
79
- docling_core-2.30.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
80
- docling_core-2.30.1.dist-info/RECORD,,
76
+ docling_core-2.31.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
77
+ docling_core-2.31.0.dist-info/METADATA,sha256=OohNxPwKcbRVVKm_kpa3HRFhYh9ZMVyBIlf3apF9hm4,5976
78
+ docling_core-2.31.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
79
+ docling_core-2.31.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
80
+ docling_core-2.31.0.dist-info/RECORD,,