PyPI - docling-core - Versions diffs - 2.44.0__py3-none-any.whl → 2.44.2__py3-none-any.whl - Mend

docling-core 2.44.0__py3-none-any.whl → 2.44.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (8) hide show

docling_core/transforms/serializer/html.py CHANGED Viewed

@@ -130,11 +130,14 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
         doc_serializer: BaseDocSerializer,
         doc: DoclingDocument,
         is_inline_scope: bool = False,
+        visited: Optional[set[str]] = None,
         **kwargs: Any,
     ) -> SerializationResult:
         """Serializes the passed text item to HTML."""
         params = HTMLParams(**kwargs)
+        my_visited: set[str] = visited if visited is not None else set()
         res_parts: list[SerializationResult] = []
+        post_processed = False
         # Prepare the HTML based on item type
         if isinstance(item, TitleItem):
@@ -162,7 +165,28 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
         elif isinstance(item, ListItem):
             # List items are handled by list serializer
-            text_inner = self._prepare_content(item.text)
+            text_parts: list[str] = []
+            if item_text := self._prepare_content(item.text):
+                item_text = doc_serializer.post_process(
+                    text=item_text,
+                    formatting=item.formatting,
+                    hyperlink=item.hyperlink,
+                )
+                post_processed = True
+                text_parts.append(item_text)
+            nested_parts = [
+                r.text
+                for r in doc_serializer.get_parts(
+                    item=item,
+                    is_inline_scope=is_inline_scope,
+                    visited=my_visited,
+                    **kwargs,
+                )
+            ]
+            text_parts.extend(nested_parts)
+            text_inner = "\n".join(text_parts)
+            if nested_parts:
+                text_inner = f"\n{text_inner}\n"
             text = (
                 get_html_tag_with_text_direction(
                     html_tag="li",
@@ -185,11 +209,12 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
             text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)
         # Apply formatting and hyperlinks
-        text = doc_serializer.post_process(
-            text=text,
-            formatting=item.formatting,
-            hyperlink=item.hyperlink,
-        )
+        if not post_processed:
+            text = doc_serializer.post_process(
+                text=text,
+                formatting=item.formatting,
+                hyperlink=item.hyperlink,
+            )
         if text:
             text_res = create_ser_result(text=text, span_source=item)
@@ -703,7 +728,6 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
     ) -> SerializationResult:
         """Serializes a list to HTML."""
         my_visited: set[str] = visited if visited is not None else set()
-        params = HTMLParams(**kwargs)
         # Get all child parts
         parts = doc_serializer.get_parts(
             item=item,
@@ -713,72 +737,8 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
             **kwargs,
         )
-        # Append nested list to parent list item:
-        i = 0
-        while i < len(parts):
-            prt = parts[i]
-            if prt.text.startswith(("<ul>", "<ol>")):
-                for j in range(i - 1, -1, -1):
-                    if parts[j].text.startswith(("<li>", "<li ")) and parts[
-                        j
-                    ].text.endswith("</li>"):
-                        before, _, _ = parts[j].text.rpartition("</li>")
-                        parts[j].text = f"{before}\n{prt.text}\n</li>"
-                        break
-                if j > -1:
-                    parts.pop(i)
-            else:
-                i += 1
         # Add all child parts
-        text_res = "\n".join(
-            [
-                (
-                    p.text
-                    if (
-                        (
-                            p.text.startswith(("<li>", "<li "))
-                            and p.text.endswith("</li>")
-                        )
-                        or (
-                            p.text.startswith(("<ol>", "<ol "))
-                            and p.text.endswith("</ol>")
-                        )
-                        or (
-                            p.text.startswith(("<ul>", "<ul "))
-                            and p.text.endswith("</ul>")
-                        )
-                    )
-                    else (
-                        get_html_tag_with_text_direction(
-                            html_tag="li",
-                            text=p.text,
-                            attrs=(
-                                {
-                                    "style": f"list-style-type: '{grandparent_item.marker} ';"
-                                }
-                                if params.show_original_list_item_marker
-                                and grandparent_item.marker
-                                else {}
-                            ),
-                        )
-                        if p.spans
-                        and p.spans[0].item.parent
-                        and isinstance(
-                            (parent_item := p.spans[0].item.parent.resolve(doc)),
-                            InlineGroup,
-                        )
-                        and parent_item.parent
-                        and isinstance(
-                            (grandparent_item := parent_item.parent.resolve(doc)),
-                            ListItem,
-                        )
-                        else f"<li>{p.text}</li>"
-                    )
-                )
-                for p in parts
-            ]
-        )
+        text_res = "\n".join(p.text for p in parts if p.text)
         if text_res:
             tag = "ol" if item.first_item_is_enumerated(doc) else "ul"
             text_res = f"<{tag}>\n{text_res}\n</{tag}>"

docling_core/types/doc/document.py CHANGED Viewed

@@ -1373,11 +1373,12 @@ class PictureItem(FloatingItem):
         )  # Encode to Base64 and decode to string
         return img_base64
-    def _image_to_hexhash(self) -> Optional[str]:
+    @staticmethod
+    def _image_to_hexhash(img: Optional[PILImage.Image]) -> Optional[str]:
         """Hexash from the image."""
-        if self.image is not None and self.image._pil is not None:
+        if img is not None:
             # Convert the image to raw bytes
-            image_bytes = self.image._pil.tobytes()
+            image_bytes = img.tobytes()
             # Create a hash object (e.g., SHA-256)
             hasher = hashlib.sha256(usedforsecurity=False)
@@ -4116,16 +4117,10 @@ class DoclingDocument(BaseModel):
         if image_dir.is_dir():
             for item, level in result.iterate_items(page_no=page_no, with_groups=False):
                 if isinstance(item, PictureItem):
+                    img = item.get_image(doc=self)
+                    if img is not None:
-                    if (
-                        item.image is not None
-                        and isinstance(item.image.uri, AnyUrl)
-                        and item.image.uri.scheme == "data"
-                        and item.image.pil_image is not None
-                    ):
-                        img = item.image.pil_image
-                        hexhash = item._image_to_hexhash()
+                        hexhash = PictureItem._image_to_hexhash(img)
                         # loc_path = image_dir / f"image_{img_count:06}.png"
                         if hexhash is not None:
@@ -4140,6 +4135,11 @@ class DoclingDocument(BaseModel):
                             else:
                                 obj_path = loc_path
+                            if item.image is None:
+                                scale = img.size[0] / item.prov[0].bbox.width
+                                item.image = ImageRef.from_pil(
+                                    image=img, dpi=round(72 * scale)
+                                )
                             item.image.uri = Path(obj_path)
                         # if item.image._pil is not None:
@@ -4539,6 +4539,8 @@ class DoclingDocument(BaseModel):
             reference_path = None
         else:
             reference_path = filename.parent
+            artifacts_dir = reference_path / artifacts_dir
         return artifacts_dir, reference_path
     def _make_copy_with_refmode(

{docling_core-2.44.0.dist-info → docling_core-2.44.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling-core
-Version: 2.44.0
+Version: 2.44.2
 Summary: A python library to define and validate data types in Docling.
 Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>

{docling_core-2.44.0.dist-info → docling_core-2.44.2.dist-info}/RECORD RENAMED Viewed

@@ -30,7 +30,7 @@ docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3
 docling_core/transforms/serializer/base.py,sha256=TI8Epj7gyxdTet9j-Rs4o5U09gfACfAIVoirlschviM,7266
 docling_core/transforms/serializer/common.py,sha256=0TNEGoA_rJ-qkVYp-X8SMUr3jTrbf6TRzPzwufYh5JM,19114
 docling_core/transforms/serializer/doctags.py,sha256=TD0yAm1qSVy-GsE6svpUAI-Yqjcf2rrTZ3ac9YU3gbE,19858
-docling_core/transforms/serializer/html.py,sha256=JswD_tQiQgmOJ29erkn6qEJX0F2N7zaoxJ9TCCSQPLE,39635
+docling_core/transforms/serializer/html.py,sha256=KnSMjtNZlBMfkuhtgB8T70iQSTfG_E8FFDfVRRo9WNs,38087
 docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
 docling_core/transforms/serializer/markdown.py,sha256=VwonuAkuOPmQM7ibDIGvQBHOqhTcTJ_t187fLQQiNPo,23951
 docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
@@ -43,7 +43,7 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
 docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
 docling_core/types/doc/__init__.py,sha256=8hOhm5W9mArf3zwgfoMxDs1pHizhLFSAZlLu1tPBBRk,1641
 docling_core/types/doc/base.py,sha256=i98y4IF250adR-8BSS374K90fwfwG-vBfWh14tLC5Cs,15906
-docling_core/types/doc/document.py,sha256=TW24eLKvQOySgip-nhGNZQ1JSdaBIuf03lYqH7SnUB8,200894
+docling_core/types/doc/document.py,sha256=-cL4eGFRbQHgXAsCG8zALxAx-IoanvkqG5E1zvKOMxI,201012
 docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
 docling_core/types/doc/page.py,sha256=35h1xdtCM3-AaN8Dim9jDseZIiw-3GxpB-ofF-H2rQQ,41878
 docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
@@ -76,9 +76,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
 docling_core/utils/legacy.py,sha256=5lghO48OEcV9V51tRnH3YSKgLtdqhr-Q5C_OcJZ8TOs,24392
 docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
 docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
-docling_core-2.44.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
-docling_core-2.44.0.dist-info/METADATA,sha256=wCeKBVEkdB642lV8KJdldWmw3vFgxHtd_ttD2WMalJk,6453
-docling_core-2.44.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-docling_core-2.44.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
-docling_core-2.44.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
-docling_core-2.44.0.dist-info/RECORD,,
+docling_core-2.44.2.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
+docling_core-2.44.2.dist-info/METADATA,sha256=IZWVMKuPPpzd3ksiFXTPUu3FSw13zuwa5qyaLWlBEyY,6453
+docling_core-2.44.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+docling_core-2.44.2.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
+docling_core-2.44.2.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
+docling_core-2.44.2.dist-info/RECORD,,

{docling_core-2.44.0.dist-info → docling_core-2.44.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling_core-2.44.0.dist-info → docling_core-2.44.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{docling_core-2.44.0.dist-info → docling_core-2.44.2.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{docling_core-2.44.0.dist-info → docling_core-2.44.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

docling-core 2.44.0__py3-none-any.whl → 2.44.2__py3-none-any.whl

Potentially problematic release.

docling-core 2.44.0__py3-none-any.whl → 2.44.2__py3-none-any.whl