PyPI - docling-core - Versions diffs - 2.25.0__py3-none-any.whl → 2.26.1__py3-none-any.whl - Mend

docling-core 2.25.0__py3-none-any.whl → 2.26.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (18) hide show

docling_core/experimental/serializer/base.py +29 -3
docling_core/experimental/serializer/common.py +157 -71
docling_core/experimental/serializer/doctags.py +88 -54
docling_core/experimental/serializer/html.py +941 -0
docling_core/experimental/serializer/html_styles.py +212 -0
docling_core/experimental/serializer/markdown.py +105 -63
docling_core/transforms/chunker/base.py +8 -2
docling_core/transforms/chunker/hierarchical_chunker.py +130 -109
docling_core/transforms/chunker/hybrid_chunker.py +54 -12
docling_core/types/doc/document.py +702 -482
docling_core/types/doc/labels.py +2 -0
docling_core/types/doc/page.py +12 -17
docling_core/types/doc/tokens.py +3 -0
{docling_core-2.25.0.dist-info → docling_core-2.26.1.dist-info}/METADATA +1 -1
{docling_core-2.25.0.dist-info → docling_core-2.26.1.dist-info}/RECORD +18 -16
{docling_core-2.25.0.dist-info → docling_core-2.26.1.dist-info}/LICENSE +0 -0
{docling_core-2.25.0.dist-info → docling_core-2.26.1.dist-info}/WHEEL +0 -0
{docling_core-2.25.0.dist-info → docling_core-2.26.1.dist-info}/entry_points.txt +0 -0

docling_core/experimental/serializer/html_styles.py ADDED Viewed

@@ -0,0 +1,212 @@
+"""HTML styles for different export modes."""
+def _get_css_with_no_styling() -> str:
+    """Return an empty ``<style>`` element, i.e. a stylesheet with no rules."""
+    return "<style></style>"
+def _get_css_for_split_page() -> str:
+    """Return the CSS ``<style>`` block used for the split-page HTML document.
+
+    The stylesheet covers the ``html`` background and font, global ``img``,
+    ``table``, ``td`` and ``code`` rules, and the ``.page`` container together
+    with the figures, images, tables and captions nested inside it. The
+    returned string ends with a newline after the closing ``</style>`` tag.
+
+    NOTE(review): ``margin-top`` is declared twice in both caption rules and
+    ``word-wrap`` twice in the ``code`` rule; under the CSS cascade the later
+    declaration in each rule is the one that applies.
+    """
+    return """<style>
+    html {
+        background-color: #e1e1e1;
+        font-family: Arial, sans-serif;
+        line-height: 1.6;
+    }
+    img {
+        min-width: 500px;
+        max-width: 100%;
+    }
+    table {
+        border-collapse: collapse;
+        border: 0px solid #fff;
+        width: 100%;
+    }
+    td {
+        vertical-align: top;
+    }
+    .page {
+        background-color: white;
+        margin-top:15px;
+        padding: 30px;
+        border: 1px solid black;
+        width:100%;
+        max-width:1000px;
+        box-shadow: 0 0 10px rgba(0,0,0,0.5);
+    }
+    .page figure {
+        text-align: center;
+    }
+    .page img {
+        max-width: 900px;
+        min-width: auto;
+    }
+    .page table {
+        border-collapse: collapse;
+        margin: 1em 0;
+        width: 100%;
+    }
+    .page table td {
+        border: 1px solid #ddd;
+        padding: 8px;
+        text-align: left;
+    }
+    .page table th {
+        border: 1px solid #ddd;
+        padding: 8px;
+        text-align: left;
+        background-color: #f2f2f2;
+        font-weight: bold;
+    }
+    .page table caption {
+        color: #666;
+        font-style: italic;
+        margin-top: 0.5em;
+        padding: 8px;
+        margin-top: 5px;
+        margin-bottom: 5px;
+    }
+    .page figcaption {
+        color: #666;
+        font-style: italic;
+        margin-top: 0.5em;
+        padding: 8px;
+        margin-top: 5px;
+        margin-bottom: 5px;
+    }
+    code {
+        background-color: rgb(228, 228, 228);
+        border: 1px solid darkgray;
+        padding: 10px;
+        display: inline-block;
+        font-family: monospace;
+        max-width:980px;
+        word-wrap: normal;
+        white-space: pre-wrap;
+        word-wrap: break-word;
+        /*overflow-wrap: break-word;*/
+    }
+</style>
+"""
+def _get_css_for_single_column() -> str:
+    """Return the CSS ``<style>`` block for the single-column HTML document.
+
+    The stylesheet constrains ``body`` to a centered column (``max-width:
+    800px``) and styles headings, tables, figures and captions, images,
+    ``pre``/``code``, plus the classes ``.formula``, ``.formula-not-decoded``,
+    ``.page-break``, ``.key-value-region``, ``.form-container``, ``.form-item``
+    and ``.image-classification``. Unlike the split-page stylesheet, the
+    returned string has no trailing newline after ``</style>``.
+    """
+    return """<style>
+    html {
+        background-color: #f5f5f5;
+        font-family: Arial, sans-serif;
+        line-height: 1.6;
+    }
+    body {
+        max-width: 800px;
+        margin: 0 auto;
+        padding: 2rem;
+        background-color: white;
+        box-shadow: 0 0 10px rgba(0,0,0,0.1);
+    }
+    h1, h2, h3, h4, h5, h6 {
+        color: #333;
+        margin-top: 1.5em;
+        margin-bottom: 0.5em;
+    }
+    h1 {
+        font-size: 2em;
+        border-bottom: 1px solid #eee;
+        padding-bottom: 0.3em;
+    }
+    table {
+        border-collapse: collapse;
+        margin: 1em 0;
+        width: 100%;
+    }
+    th, td {
+        border: 1px solid #ddd;
+        padding: 8px;
+        text-align: left;
+    }
+    th {
+        background-color: #f2f2f2;
+        font-weight: bold;
+    }
+    figure {
+        margin: 1.5em 0;
+        text-align: center;
+    }
+    figcaption {
+        color: #666;
+        font-style: italic;
+        margin-top: 0.5em;
+    }
+    img {
+        max-width: 100%;
+        height: auto;
+    }
+    pre {
+        background-color: #f6f8fa;
+        border-radius: 3px;
+        padding: 1em;
+        overflow: auto;
+    }
+    code {
+        font-family: monospace;
+        background-color: #f6f8fa;
+        padding: 0.2em 0.4em;
+        border-radius: 3px;
+    }
+    pre code {
+        background-color: transparent;
+        padding: 0;
+    }
+    .formula {
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background-color: #f9f9f9;
+    }
+    .formula-not-decoded {
+        text-align: center;
+        padding: 0.5em;
+        margin: 1em 0;
+        background: repeating-linear-gradient(
+            45deg,
+            #f0f0f0,
+            #f0f0f0 10px,
+            #f9f9f9 10px,
+            #f9f9f9 20px
+        );
+    }
+    .page-break {
+        page-break-after: always;
+        border-top: 1px dashed #ccc;
+        margin: 2em 0;
+    }
+    .key-value-region {
+        background-color: #f9f9f9;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .key-value-region dt {
+        font-weight: bold;
+    }
+    .key-value-region dd {
+        margin-left: 1em;
+        margin-bottom: 0.5em;
+    }
+    .form-container {
+        border: 1px solid #ddd;
+        padding: 1em;
+        border-radius: 4px;
+        margin: 1em 0;
+    }
+    .form-item {
+        margin-bottom: 0.5em;
+    }
+    .image-classification {
+        font-size: 0.9em;
+        color: #666;
+        margin-top: 0.5em;
+    }
+</style>"""

docling_core/experimental/serializer/markdown.py CHANGED Viewed

@@ -26,7 +26,12 @@ from docling_core.experimental.serializer.base import (
     BaseTextSerializer,
     SerializationResult,
 )
-from docling_core.experimental.serializer.common import CommonParams, DocSerializer
+from docling_core.experimental.serializer.common import (
+    CommonParams,
+    DocSerializer,
+    _PageBreakSerResult,
+    create_ser_result,
+)
 from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.document import (
     CodeItem,
@@ -43,6 +48,7 @@ from docling_core.types.doc.document import (
     NodeItem,
     OrderedList,
     PictureItem,
+    PictureTabularChartData,
     SectionHeaderItem,
     TableItem,
     TextItem,
@@ -57,10 +63,12 @@ class MarkdownParams(CommonParams):
     layers: set[ContentLayer] = {ContentLayer.BODY}
     image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
     image_placeholder: str = "<!-- image -->"
+    enable_chart_tables: bool = True
     indent: int = 4
     wrap_width: Optional[PositiveInt] = None
     page_break_placeholder: Optional[str] = None  # e.g. "<!-- page break -->"
     escape_underscores: bool = True
+    escape_html: bool = True
 class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
@@ -78,46 +86,51 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
     ) -> SerializationResult:
         """Serializes the passed item."""
         params = MarkdownParams(**kwargs)
-        parts: list[str] = []
+        res_parts: list[SerializationResult] = []
         escape_html = True
         escape_underscores = True
         if isinstance(item, TitleItem):
-            text = f"# {item.text}"
+            text_part = f"# {item.text}"
         elif isinstance(item, SectionHeaderItem):
-            text = f"{(item.level + 1) * '#'} {item.text}"
+            text_part = f"{(item.level + 1) * '#'} {item.text}"
         elif isinstance(item, CodeItem):
-            text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
+            text_part = (
+                f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
+            )
             escape_html = False
             escape_underscores = False
         elif isinstance(item, FormulaItem):
             if item.text:
-                text = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
+                text_part = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
             elif item.orig:
-                text = "<!-- formula-not-decoded -->"
+                text_part = "<!-- formula-not-decoded -->"
             else:
-                text = ""
+                text_part = ""
             escape_html = False
             escape_underscores = False
         elif params.wrap_width:
-            text = textwrap.fill(item.text, width=params.wrap_width)
+            text_part = textwrap.fill(item.text, width=params.wrap_width)
         else:
-            text = item.text
-        parts.append(text)
+            text_part = item.text
+        if text_part:
+            text_res = create_ser_result(text=text_part, span_source=item)
+            res_parts.append(text_res)
         if isinstance(item, FloatingItem):
-            cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
-            if cap_text:
-                parts.append(cap_text)
+            cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
+            if cap_res.text:
+                res_parts.append(cap_res)
-        text_res = (" " if is_inline_scope else "\n\n").join(parts)
-        text_res = doc_serializer.post_process(
-            text=text_res,
+        text = (" " if is_inline_scope else "\n\n").join([r.text for r in res_parts])
+        text = doc_serializer.post_process(
+            text=text,
             escape_html=escape_html,
             escape_underscores=escape_underscores,
             formatting=item.formatting,
             hyperlink=item.hyperlink,
         )
-        return SerializationResult(text=text_res)
+        return create_ser_result(text=text, span_source=res_parts)
 class MarkdownTableSerializer(BaseTableSerializer):
@@ -133,14 +146,14 @@ class MarkdownTableSerializer(BaseTableSerializer):
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed item."""
-        text_parts: list[str] = []
+        res_parts: list[SerializationResult] = []
         cap_res = doc_serializer.serialize_captions(
             item=item,
             **kwargs,
         )
         if cap_res.text:
-            text_parts.append(cap_res.text)
+            res_parts.append(cap_res)
         if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
             rows = [
@@ -165,11 +178,11 @@ class MarkdownTableSerializer(BaseTableSerializer):
             else:
                 table_text = ""
             if table_text:
-                text_parts.append(table_text)
+                res_parts.append(create_ser_result(text=table_text, span_source=item))
-        text_res = "\n\n".join(text_parts)
+        text_res = "\n\n".join([r.text for r in res_parts])
-        return SerializationResult(text=text_res)
+        return create_ser_result(text=text_res, span_source=res_parts)
 class MarkdownPictureSerializer(BasePictureSerializer):
@@ -187,14 +200,14 @@ class MarkdownPictureSerializer(BasePictureSerializer):
         """Serializes the passed item."""
         params = MarkdownParams(**kwargs)
-        texts: list[str] = []
+        res_parts: list[SerializationResult] = []
         cap_res = doc_serializer.serialize_captions(
             item=item,
             **kwargs,
         )
         if cap_res.text:
-            texts.append(cap_res.text)
+            res_parts.append(cap_res)
         if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
             img_res = self._serialize_image_part(
@@ -204,11 +217,28 @@ class MarkdownPictureSerializer(BasePictureSerializer):
                 image_placeholder=params.image_placeholder,
             )
             if img_res.text:
-                texts.append(img_res.text)
-        text_res = "\n\n".join(texts)
+                res_parts.append(img_res)
+        if params.enable_chart_tables:
+            # Check if picture has attached PictureTabularChartData
+            tabular_chart_annotations = [
+                ann
+                for ann in item.annotations
+                if isinstance(ann, PictureTabularChartData)
+            ]
+            if len(tabular_chart_annotations) > 0:
+                temp_doc = DoclingDocument(name="temp")
+                temp_table = temp_doc.add_table(
+                    data=tabular_chart_annotations[0].chart_data
+                )
+                md_table_content = temp_table.export_to_markdown(temp_doc)
+                if len(md_table_content) > 0:
+                    res_parts.append(
+                        create_ser_result(text=md_table_content, span_source=item)
+                    )
+        text_res = "\n\n".join([r.text for r in res_parts])
-        return SerializationResult(text=text_res)
+        return create_ser_result(text=text_res, span_source=res_parts)
     def _serialize_image_part(
         self,
@@ -255,7 +285,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
         else:
             text_res = image_placeholder
-        return SerializationResult(text=text_res)
+        return create_ser_result(text=text_res, span_source=item)
 class MarkdownKeyValueSerializer(BaseKeyValueSerializer):
@@ -272,12 +302,13 @@ class MarkdownKeyValueSerializer(BaseKeyValueSerializer):
     ) -> SerializationResult:
         """Serializes the passed item."""
         # TODO add actual implementation
-        text_res = (
-            "<!-- missing-key-value-item -->"
-            if item.self_ref not in doc_serializer.get_excluded_refs()
-            else ""
-        )
-        return SerializationResult(text=text_res)
+        if item.self_ref not in doc_serializer.get_excluded_refs():
+            return create_ser_result(
+                text="<!-- missing-key-value-item -->",
+                span_source=item,
+            )
+        else:
+            return create_ser_result()
 class MarkdownFormSerializer(BaseFormSerializer):
@@ -294,12 +325,13 @@ class MarkdownFormSerializer(BaseFormSerializer):
     ) -> SerializationResult:
         """Serializes the passed item."""
         # TODO add actual implementation
-        text_res = (
-            "<!-- missing-form-item -->"
-            if item.self_ref not in doc_serializer.get_excluded_refs()
-            else ""
-        )
-        return SerializationResult(text=text_res)
+        if item.self_ref not in doc_serializer.get_excluded_refs():
+            return create_ser_result(
+                text="<!-- missing-form-item -->",
+                span_source=item,
+            )
+        else:
+            return create_ser_result()
 class MarkdownListSerializer(BaseModel, BaseListSerializer):
@@ -319,7 +351,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
     ) -> SerializationResult:
         """Serializes the passed item."""
         params = MarkdownParams(**kwargs)
-        my_visited = visited or set()
+        my_visited = visited if visited is not None else set()
         parts = doc_serializer.get_parts(
             item=item,
             list_level=list_level + 1,
@@ -332,6 +364,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
         for p in parts:
             if p.text and p.text[0] == " " and my_parts:
                 my_parts[-1].text = sep.join([my_parts[-1].text, p.text])  # update last
+                my_parts[-1].spans.extend(p.spans)
             else:
                 my_parts.append(p)
@@ -343,12 +376,16 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
                 (
                     c.text
                     if c.text and c.text[0] == " "
-                    else f"{indent_str}{f'{i + 1}.' if is_ol else '-'} {c.text}"
+                    else (
+                        f"{indent_str}"
+                        f"{'' if isinstance(c, _PageBreakSerResult) else (f'{i + 1}. ' if is_ol else '- ')}"  # noqa: E501
+                        f"{c.text}"
+                    )
                 )
                 for i, c in enumerate(my_parts)
             ]
         )
-        return SerializationResult(text=text_res)
+        return create_ser_result(text=text_res, span_source=my_parts)
 class MarkdownInlineSerializer(BaseInlineSerializer):
@@ -366,15 +403,16 @@ class MarkdownInlineSerializer(BaseInlineSerializer):
         **kwargs,
     ) -> SerializationResult:
         """Serializes the passed item."""
-        my_visited = visited or set()
+        my_visited = visited if visited is not None else set()
         parts = doc_serializer.get_parts(
             item=item,
             list_level=list_level,
             is_inline_scope=True,
             visited=my_visited,
+            **kwargs,
         )
         text_res = " ".join([p.text for p in parts if p.text])
-        return SerializationResult(text=text_res)
+        return create_ser_result(text=text_res, span_source=parts)
 class MarkdownFallbackSerializer(BaseFallbackSerializer):
@@ -391,10 +429,12 @@ class MarkdownFallbackSerializer(BaseFallbackSerializer):
     ) -> SerializationResult:
         """Serializes the passed item."""
         if isinstance(item, DocItem):
-            text_res = "<!-- missing-text -->"
+            return create_ser_result(
+                text="<!-- missing-text -->",
+                span_source=item,
+            )
         else:
-            text_res = ""  # TODO go with explicit None return type?
-        return SerializationResult(text=text_res)
+            return create_ser_result()
 class MarkdownDocSerializer(DocSerializer):
@@ -472,7 +512,7 @@ class MarkdownDocSerializer(DocSerializer):
         params = self.params.merge_with_patch(patch=kwargs)
         if escape_underscores and params.escape_underscores:
             res = self._escape_underscores(text)
-        if escape_html:
+        if escape_html and params.escape_html:
             res = html.escape(res, quote=False)
         res = super().post_process(
             text=res,
@@ -482,17 +522,19 @@ class MarkdownDocSerializer(DocSerializer):
         return res
     @override
-    def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
-        """Serialize a page out of its parts."""
-        text_res = "\n\n".join([p.text for p in parts])
-        return SerializationResult(text=text_res)
+    def serialize_doc(
+        self, *, parts: list[SerializationResult], **kwargs
+    ) -> SerializationResult:
+        """Serialize a document out of its parts.
+
+        Joins the text of every non-empty part with a blank line and, when
+        page breaks are required, replaces each page-break match reported by
+        ``_get_page_breaks`` with the configured placeholder.
+
+        Args:
+            parts: serialization results to assemble, in document order
+            **kwargs: accepted for interface compatibility; not used here
+
+        Returns:
+            SerializationResult: the joined text, with ``parts`` as span source
+        """
+        text_res = "\n\n".join([p.text for p in parts if p.text])
+        # Gate on requires_page_break() ("is not None") rather than on the
+        # truthiness of the placeholder: with an empty-string placeholder the
+        # serializer reports that page breaks are required, so the matches
+        # must still be replaced (by "") instead of being left in the text.
+        if self.requires_page_break():
+            page_sep = self.params.page_break_placeholder or ""
+            for full_match, _, _ in self._get_page_breaks(text=text_res):
+                text_res = text_res.replace(full_match, page_sep)
+        return create_ser_result(text=text_res, span_source=parts)
     @override
-    def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
-        """Serialize a document out of its pages."""
-        if self.params.page_break_placeholder is not None:
-            sep = f"\n\n{self.params.page_break_placeholder}\n\n"
-            text_res = sep.join([p.text for p in pages if p.text])
-            return SerializationResult(text=text_res)
-        else:
-            return self.serialize_page(parts=pages)
+    def requires_page_break(self):
+        """Whether to add page breaks: True iff a placeholder is set (not None)."""
+        return self.params.page_break_placeholder is not None

docling_core/transforms/chunker/base.py CHANGED Viewed

@@ -9,6 +9,7 @@ from abc import ABC, abstractmethod
 from typing import Any, ClassVar, Iterator
 from pydantic import BaseModel
+from typing_extensions import deprecated
 from docling_core.types.doc import DoclingDocument as DLDocument
@@ -65,8 +66,8 @@ class BaseChunker(BaseModel, ABC):
         """
         raise NotImplementedError()
-    def serialize(self, chunk: BaseChunk) -> str:
-        """Serialize the given chunk. This base implementation is embedding-targeted.
+    def contextualize(self, chunk: BaseChunk) -> str:
+        """Contextualize the given chunk. This implementation is embedding-targeted.
         Args:
             chunk: chunk to serialize
@@ -93,3 +94,8 @@ class BaseChunker(BaseModel, ABC):
         items.append(chunk.text)
         return self.delim.join(items)
+    @deprecated("Use contextualize() instead.")
+    def serialize(self, chunk: BaseChunk) -> str:
+        """Deprecated alias of contextualize(); delegates to it unchanged.
+
+        Args:
+            chunk: chunk to contextualize
+
+        Returns:
+            str: whatever contextualize() returns for the given chunk
+        """
+        return self.contextualize(chunk=chunk)

docling-core 2.25.0__py3-none-any.whl → 2.26.1__py3-none-any.whl

Potentially problematic release.

docling-core 2.25.0__py3-none-any.whl → 2.26.1__py3-none-any.whl