PyPI - docling-core - Versions diffs - 2.47.0__tar.gz → 2.48.0__tar.gz - Mend

docling-core 2.47.0.tar.gz → 2.48.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (110)

{docling_core-2.47.0 → docling_core-2.48.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling-core
-Version: 2.47.0
+Version: 2.48.0
 Summary: A python library to define and validate data types in Docling.
 Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>

{docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/chunker/hierarchical_chunker.py RENAMED Viewed

@@ -145,7 +145,7 @@ class TripletTableSerializer(BaseTableSerializer):
             parts.append(cap_res)
         if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
-            table_df = item.export_to_dataframe()
+            table_df = item.export_to_dataframe(doc)
             if table_df.shape[0] >= 1 and table_df.shape[1] >= 2:
                 # copy header as first row and shift all rows by one

{docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/serializer/common.py RENAMED Viewed

@@ -394,6 +394,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
                 item=item,
                 doc_serializer=self,
                 doc=self.doc,
+                visited=my_visited,
                 **my_kwargs,
             )
         return part

{docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/serializer/doctags.py RENAMED Viewed

@@ -32,6 +32,7 @@ from docling_core.types.doc.document import (
     DoclingDocument,
     FloatingItem,
     FormItem,
+    GroupItem,
     InlineGroup,
     KeyValueItem,
     ListGroup,
@@ -42,6 +43,7 @@ from docling_core.types.doc.document import (
     PictureMoleculeData,
     PictureTabularChartData,
     ProvenanceItem,
+    SectionHeaderItem,
     TableItem,
     TextItem,
 )
@@ -94,11 +96,11 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
         item: TextItem,
         doc_serializer: BaseDocSerializer,
         doc: DoclingDocument,
+        visited: Optional[set[str]] = None,
         **kwargs: Any,
     ) -> SerializationResult:
         """Serializes the passed item."""
-        from docling_core.types.doc.document import SectionHeaderItem
+        my_visited = visited if visited is not None else set()
         params = DocTagsParams(**kwargs)
         wrap_tag: Optional[str] = DocumentToken.create_token_name_from_doc_item_label(
             label=item.label,
@@ -116,12 +118,21 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
                 parts.append(location)
         if params.add_content:
-            text_part = item.text
-            text_part = doc_serializer.post_process(
-                text=text_part,
-                formatting=item.formatting,
-                hyperlink=item.hyperlink,
-            )
+            if (
+                item.text == ""
+                and len(item.children) == 1
+                and isinstance(
+                    (child_group := item.children[0].resolve(doc)), InlineGroup
+                )
+            ):
+                ser_res = doc_serializer.serialize(item=child_group, visited=my_visited)
+                text_part = ser_res.text
+            else:
+                text_part = doc_serializer.post_process(
+                    text=item.text,
+                    formatting=item.formatting,
+                    hyperlink=item.hyperlink,
+                )
             if isinstance(item, CodeItem):
                 language_token = DocumentToken.get_code_language_token(
@@ -506,7 +517,12 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
         **kwargs: Any,
     ) -> SerializationResult:
         """Serializes the passed item."""
-        return create_ser_result()
+        if isinstance(item, GroupItem):
+            parts = doc_serializer.get_parts(item=item, **kwargs)
+            text_res = "\n".join([p.text for p in parts if p.text])
+            return create_ser_result(text=text_res, span_source=parts)
+        else:
+            return create_ser_result()
 class DocTagsAnnotationSerializer(BaseAnnotationSerializer):

{docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/serializer/html.py RENAMED Viewed

@@ -55,6 +55,7 @@ from docling_core.types.doc.document import (
     FormItem,
     FormulaItem,
     GraphData,
+    GroupItem,
     ImageRef,
     InlineGroup,
     KeyValueItem,
@@ -139,21 +140,34 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
         res_parts: list[SerializationResult] = []
         post_processed = False
-        # Prepare the HTML based on item type
-        if isinstance(item, TitleItem):
-            text_inner = self._prepare_content(item.text)
-            text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
+        has_inline_repr = (
+            item.text == ""
+            and len(item.children) == 1
+            and isinstance((child_group := item.children[0].resolve(doc)), InlineGroup)
+        )
+        if has_inline_repr:
+            text = doc_serializer.serialize(item=child_group, visited=my_visited).text
+            post_processed = True
+        else:
+            text = item.text
+            if not isinstance(item, (CodeItem, FormulaItem)):
+                text = html.escape(text, quote=False)
+                text = text.replace("\n", "<br>")
-        elif isinstance(item, SectionHeaderItem):
-            section_level = min(item.level + 1, 6)
-            text_inner = self._prepare_content(item.text)
+        # Prepare the HTML based on item type
+        if isinstance(item, (TitleItem, SectionHeaderItem)):
+            section_level = (
+                min(item.level + 1, 6) if isinstance(item, SectionHeaderItem) else 1
+            )
             text = get_html_tag_with_text_direction(
-                html_tag=f"h{section_level}", text=text_inner
+                html_tag=f"h{section_level}", text=text
             )
         elif isinstance(item, FormulaItem):
             text = self._process_formula(
                 item=item,
+                text=text,
+                orig=item.orig,
                 doc=doc,
                 image_mode=params.image_mode,
                 formula_to_mathml=params.formula_to_mathml,
@@ -161,19 +175,26 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
             )
         elif isinstance(item, CodeItem):
-            text = self._process_code(item=item, is_inline_scope=is_inline_scope)
+            text = (
+                f"<code>{text}</code>"
+                if is_inline_scope
+                else f"<pre><code>{text}</code></pre>"
+            )
         elif isinstance(item, ListItem):
             # List items are handled by list serializer
             text_parts: list[str] = []
-            if item_text := self._prepare_content(item.text):
-                item_text = doc_serializer.post_process(
-                    text=item_text,
-                    formatting=item.formatting,
-                    hyperlink=item.hyperlink,
-                )
-                post_processed = True
-                text_parts.append(item_text)
+            if text:
+                if has_inline_repr:
+                    text = f"\n{text}\n"
+                else:
+                    text = doc_serializer.post_process(
+                        text=text,
+                        formatting=item.formatting,
+                        hyperlink=item.hyperlink,
+                    )
+                    post_processed = True
+                text_parts.append(text)
             nested_parts = [
                 r.text
                 for r in doc_serializer.get_parts(
@@ -184,29 +205,26 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
                 )
             ]
             text_parts.extend(nested_parts)
-            text_inner = "\n".join(text_parts)
+            text = "\n".join(text_parts)
             if nested_parts:
-                text_inner = f"\n{text_inner}\n"
+                text = f"\n{text}\n"
             text = (
                 get_html_tag_with_text_direction(
                     html_tag="li",
-                    text=text_inner,
+                    text=text,
                     attrs=(
                         {"style": f"list-style-type: '{item.marker} ';"}
                         if params.show_original_list_item_marker and item.marker
                         else {}
                     ),
                 )
-                if text_inner
+                if text
                 else ""
             )
-        elif is_inline_scope:
-            text = self._prepare_content(item.text)
-        else:
+        elif not is_inline_scope:
             # Regular text item
-            text_inner = self._prepare_content(item.text)
-            text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)
+            text = get_html_tag_with_text_direction(html_tag="p", text=text)
         # Apply formatting and hyperlinks
         if not post_processed:
@@ -227,66 +245,44 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
         return create_ser_result(text=text, span_source=res_parts)
-    def _prepare_content(
-        self, text: str, do_escape_html=True, do_replace_newline=True
-    ) -> str:
-        """Prepare text content for HTML inclusion."""
-        if do_escape_html:
-            text = html.escape(text, quote=False)
-        if do_replace_newline:
-            text = text.replace("\n", "<br>")
-        return text
-    def _process_code(
-        self,
-        item: CodeItem,
-        is_inline_scope: bool,
-    ) -> str:
-        code_text = self._prepare_content(
-            item.text, do_escape_html=False, do_replace_newline=False
-        )
-        if is_inline_scope:
-            text = f"<code>{code_text}</code>"
-        else:
-            text = f"<pre><code>{code_text}</code></pre>"
-        return text
     def _process_formula(
         self,
-        item: FormulaItem,
+        *,
+        item: DocItem,
+        text: str,
+        orig: str,
         doc: DoclingDocument,
         image_mode: ImageRefMode,
         formula_to_mathml: bool,
         is_inline_scope: bool,
     ) -> str:
         """Process a formula item to HTML/MathML."""
-        math_formula = self._prepare_content(
-            item.text, do_escape_html=False, do_replace_newline=False
-        )
         # If formula is empty, try to use an image fallback
-        if item.text == "" and item.orig != "":
-            img_fallback = self._get_formula_image_fallback(item, doc)
-            if (
-                image_mode == ImageRefMode.EMBEDDED
-                and len(item.prov) > 0
-                and img_fallback
-            ):
-                return img_fallback
+        if (
+            text == ""
+            and orig != ""
+            and len(item.prov) > 0
+            and image_mode == ImageRefMode.EMBEDDED
+            and (
+                img_fallback := self._get_formula_image_fallback(
+                    item=item, orig=orig, doc=doc
+                )
+            )
+        ):
+            return img_fallback
         # Try to generate MathML
-        if formula_to_mathml and math_formula:
+        elif formula_to_mathml and text:
             try:
                 # Set display mode based on context
                 display_mode = "inline" if is_inline_scope else "block"
                 mathml_element = latex2mathml.converter.convert_to_element(
-                    math_formula, display=display_mode
+                    text, display=display_mode
                 )
                 annotation = SubElement(
                     mathml_element, "annotation", dict(encoding="TeX")
                 )
-                annotation.text = math_formula
+                annotation.text = text
                 mathml = unescape(tostring(mathml_element, encoding="unicode"))
                 # Don't wrap in div for inline formulas
@@ -296,40 +292,40 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
                     return f"<div>{mathml}</div>"
             except Exception:
-                img_fallback = self._get_formula_image_fallback(item, doc)
+                img_fallback = self._get_formula_image_fallback(
+                    item=item, orig=orig, doc=doc
+                )
                 if (
                     image_mode == ImageRefMode.EMBEDDED
                     and len(item.prov) > 0
                     and img_fallback
                 ):
                     return img_fallback
-                elif math_formula:
-                    return f"<pre>{math_formula}</pre>"
+                elif text:
+                    return f"<pre>{text}</pre>"
                 else:
                     return "<pre>Formula not decoded</pre>"
         _logger.warning("Could not parse formula with MathML")
         # Fallback options if we got here
-        if math_formula and is_inline_scope:
-            return f"<code>{math_formula}</code>"
-        elif math_formula and (not is_inline_scope):
-            f"<pre>{math_formula}</pre>"
+        if text and is_inline_scope:
+            return f"<code>{text}</code>"
+        elif text and (not is_inline_scope):
+            f"<pre>{text}</pre>"
         elif is_inline_scope:
             return '<span class="formula-not-decoded">Formula not decoded</span>'
         return '<div class="formula-not-decoded">Formula not decoded</div>'
     def _get_formula_image_fallback(
-        self, item: TextItem, doc: DoclingDocument
+        self, *, item: DocItem, orig: str, doc: DoclingDocument
     ) -> Optional[str]:
         """Try to get an image fallback for a formula."""
         item_image = item.get_image(doc=doc)
         if item_image is not None:
             img_ref = ImageRef.from_pil(item_image, dpi=72)
-            return (
-                "<figure>" f'<img src="{img_ref.uri}" alt="{item.orig}" />' "</figure>"
-            )
+            return "<figure>" f'<img src="{img_ref.uri}" alt="{orig}" />' "</figure>"
         return None
@@ -792,21 +788,30 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
     """HTML-specific fallback serializer."""
     @override
-    def serialize(self, *, item: NodeItem, **kwargs: Any) -> SerializationResult:
+    def serialize(
+        self,
+        *,
+        item: NodeItem,
+        doc_serializer: "BaseDocSerializer",
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
         """Fallback serializer for items not handled by other serializers."""
-        if isinstance(item, DocItem):
+        if isinstance(item, GroupItem):
+            parts = doc_serializer.get_parts(item=item, **kwargs)
+            text_res = "\n".join([p.text for p in parts if p.text])
+            return create_ser_result(text=text_res, span_source=parts)
+        else:
             return create_ser_result(
                 text=f"<!-- Unhandled item type: {item.__class__.__name__} -->",
-                span_source=item,
+                span_source=item if isinstance(item, DocItem) else [],
             )
-        else:
-            # For group items, we don't generate any markup
-            return create_ser_result()
 class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
     """HTML-specific annotation serializer."""
+    @override
     def serialize(
         self,
         *,

{docling_core-2.47.0 → docling_core-2.48.0}/docling_core/transforms/serializer/markdown.py RENAMED Viewed

@@ -45,6 +45,7 @@ from docling_core.types.doc.document import (
     Formatting,
     FormItem,
     FormulaItem,
+    GroupItem,
     ImageRef,
     InlineGroup,
     KeyValueItem,
@@ -124,26 +125,24 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
         my_visited = visited if visited is not None else set()
         params = MarkdownParams(**kwargs)
         res_parts: list[SerializationResult] = []
-        text = item.text
         escape_html = True
         escape_underscores = True
-        processing_pending = True
-        if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
-            # case where processing/formatting should be applied first (in inner scope)
+        has_inline_repr = (
+            item.text == ""
+            and len(item.children) == 1
+            and isinstance((child_group := item.children[0].resolve(doc)), InlineGroup)
+        )
+        if has_inline_repr:
+            text = doc_serializer.serialize(item=child_group, visited=my_visited).text
             processing_pending = False
-            if (
-                text == ""
-                and len(item.children) == 1
-                and isinstance(
-                    (child_group := item.children[0].resolve(doc)), InlineGroup
-                )
-            ):
-                # case of inline within heading / list item
-                ser_res = doc_serializer.serialize(item=child_group)
-                text = ser_res.text
-                for span in ser_res.spans:
-                    my_visited.add(span.item.self_ref)
-            else:
+        else:
+            text = item.text
+            processing_pending = True
+        if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
+            if not has_inline_repr:
+                # case where processing/formatting should be applied first (in inner scope)
                 text = doc_serializer.post_process(
                     text=text,
                     escape_html=escape_html,
@@ -151,6 +150,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
                     formatting=item.formatting,
                     hyperlink=item.hyperlink,
                 )
+                processing_pending = False
             if isinstance(item, ListItem):
                 pieces: list[str] = []
@@ -600,13 +600,15 @@ class MarkdownFallbackSerializer(BaseFallbackSerializer):
         **kwargs: Any,
     ) -> SerializationResult:
         """Serializes the passed item."""
-        if isinstance(item, DocItem):
+        if isinstance(item, GroupItem):
+            parts = doc_serializer.get_parts(item=item, **kwargs)
+            text_res = "\n\n".join([p.text for p in parts if p.text])
+            return create_ser_result(text=text_res, span_source=parts)
+        else:
             return create_ser_result(
                 text="<!-- missing-text -->",
-                span_source=item,
+                span_source=item if isinstance(item, DocItem) else [],
             )
-        else:
-            return create_ser_result()
 class MarkdownDocSerializer(DocSerializer):

{docling_core-2.47.0 → docling_core-2.48.0}/docling_core/types/doc/document.py RENAMED Viewed

@@ -60,7 +60,7 @@ _logger = logging.getLogger(__name__)
 Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
-CURRENT_VERSION: Final = "1.6.0"
+CURRENT_VERSION: Final = "1.7.0"
 DEFAULT_EXPORT_LABELS = {
     DocItemLabel.TITLE,
@@ -310,6 +310,7 @@ class TableCell(BaseModel):
     column_header: bool = False
     row_header: bool = False
     row_section: bool = False
+    fillable: bool = False
     @model_validator(mode="before")
     @classmethod

{docling_core-2.47.0 → docling_core-2.48.0}/docling_core.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling-core
-Version: 2.47.0
+Version: 2.48.0
 Summary: A python library to define and validate data types in Docling.
 Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>

{docling_core-2.47.0 → docling_core-2.48.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "docling-core"
-version = "2.47.0"  # DO NOT EDIT, updated automatically
+version = "2.48.0"  # DO NOT EDIT, updated automatically
 description = "A python library to define and validate data types in Docling."
 license = "MIT"
 license-files = ["LICENSE"]

{docling_core-2.47.0 → docling_core-2.48.0}/test/test_docling_doc.py RENAMED Viewed

@@ -734,7 +734,7 @@ def _test_export_methods(
     for table in doc.tables:
         table.export_to_markdown()
         table.export_to_html(doc)
-        table.export_to_dataframe()
+        table.export_to_dataframe(doc)
         table.export_to_doctags(doc)
     # Test Images export ...
@@ -2102,7 +2102,7 @@ def _construct_rich_table_doc():
     table_item = doc.add_table(
         data=TableData(
-            num_rows=4,
+            num_rows=5,
             num_cols=2,
         ),
     )
@@ -2121,6 +2121,17 @@ def _construct_rich_table_doc():
     rich_item_3 = doc.add_table(
         data=TableData(num_rows=2, num_cols=3), parent=table_item
     )
+    rich_item_4 = doc.add_group(parent=table_item, label=GroupLabel.UNSPECIFIED)
+    doc.add_text(
+        parent=rich_item_4,
+        text="Some text in a generic group.",
+        label=DocItemLabel.TEXT,
+    )
+    doc.add_text(
+        parent=rich_item_4, text="More text in the group.", label=DocItemLabel.TEXT
+    )
     for i in range(rich_item_3.data.num_rows):
         for j in range(rich_item_3.data.num_cols):
             cell = TableCell(
@@ -2158,6 +2169,14 @@ def _construct_rich_table_doc():
                     end_col_offset_idx=j + 1,
                     ref=rich_item_3.get_ref(),
                 )
+            elif i == 4 and j == 0:
+                cell = RichTableCell(
+                    start_row_offset_idx=i,
+                    end_row_offset_idx=i + 1,
+                    start_col_offset_idx=j,
+                    end_col_offset_idx=j + 1,
+                    ref=rich_item_4.get_ref(),
+                )
             else:
                 cell = TableCell(
                     start_row_offset_idx=i,

{docling_core-2.47.0 → docling_core-2.48.0}/test/test_serialization.py RENAMED Viewed

@@ -85,6 +85,11 @@ def verify(exp_file: Path, actual: str):
         assert expected == actual
+# ===============================
+# Markdown tests
+# ===============================
 def test_md_cross_page_list_page_break():
     src = Path("./test/data/doc/activities.json")
     doc = DoclingDocument.load_from_json(src)
@@ -99,7 +104,7 @@ def test_md_cross_page_list_page_break():
         ),
     )
     actual = ser.serialize().text
-    verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
+    verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
 def test_md_cross_page_list_page_break_none():
@@ -170,20 +175,6 @@ def test_md_cross_page_list_page_break_p2():
     verify(exp_file=src.parent / f"{src.stem}_p2.gt.md", actual=actual)
-def test_html_charts():
-    src = Path("./test/data/doc/barchart.json")
-    doc = DoclingDocument.load_from_json(src)
-    ser = HTMLDocSerializer(
-        doc=doc,
-        params=HTMLParams(
-            image_mode=ImageRefMode.PLACEHOLDER,
-        ),
-    )
-    actual = ser.serialize().text
-    verify(exp_file=src.parent / f"{src.stem}.gt.html", actual=actual)
 def test_md_charts():
     src = Path("./test/data/doc/barchart.json")
     doc = DoclingDocument.load_from_json(src)
@@ -195,7 +186,7 @@ def test_md_charts():
         ),
     )
     actual = ser.serialize().text
-    verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
+    verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
 def test_md_inline_and_formatting():
@@ -209,51 +200,7 @@ def test_md_inline_and_formatting():
         ),
     )
     actual = ser.serialize().text
-    verify(exp_file=src.parent / f"{src.stem}.md", actual=actual)
-def test_html_cross_page_list_page_break():
-    src = Path("./test/data/doc/activities.json")
-    doc = DoclingDocument.load_from_json(src)
-    ser = HTMLDocSerializer(
-        doc=doc,
-        params=HTMLParams(
-            image_mode=ImageRefMode.PLACEHOLDER,
-        ),
-    )
-    actual = ser.serialize().text
-    verify(exp_file=src.parent / f"{src.stem}.gt.html", actual=actual)
-def test_html_cross_page_list_page_break_p1():
-    src = Path("./test/data/doc/activities.json")
-    doc = DoclingDocument.load_from_json(src)
-    ser = HTMLDocSerializer(
-        doc=doc,
-        params=HTMLParams(
-            image_mode=ImageRefMode.PLACEHOLDER,
-            pages={1},
-        ),
-    )
-    actual = ser.serialize().text
-    verify(exp_file=src.parent / f"{src.stem}_p1.gt.html", actual=actual)
-def test_html_cross_page_list_page_break_p2():
-    src = Path("./test/data/doc/activities.json")
-    doc = DoclingDocument.load_from_json(src)
-    ser = HTMLDocSerializer(
-        doc=doc,
-        params=HTMLParams(
-            image_mode=ImageRefMode.PLACEHOLDER,
-            pages={2},
-        ),
-    )
-    actual = ser.serialize().text
-    verify(exp_file=src.parent / f"{src.stem}_p2.gt.html", actual=actual)
+    verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
 def test_md_pb_placeholder_and_page_filter():
@@ -269,7 +216,7 @@ def test_md_pb_placeholder_and_page_filter():
         ),
     )
     actual = ser.serialize().text
-    verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
+    verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
 def test_md_list_item_markers():
@@ -358,7 +305,7 @@ def test_md_nested_lists():
     ser = MarkdownDocSerializer(doc=doc)
     actual = ser.serialize().text
-    verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
+    verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
 def test_md_rich_table():
@@ -370,6 +317,69 @@ def test_md_rich_table():
     verify(exp_file=exp_file, actual=actual)
+# ===============================
+# HTML tests
+# ===============================
+def test_html_charts():
+    src = Path("./test/data/doc/barchart.json")
+    doc = DoclingDocument.load_from_json(src)
+    ser = HTMLDocSerializer(
+        doc=doc,
+        params=HTMLParams(
+            image_mode=ImageRefMode.PLACEHOLDER,
+        ),
+    )
+    actual = ser.serialize().text
+    verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
+def test_html_cross_page_list_page_break():
+    src = Path("./test/data/doc/activities.json")
+    doc = DoclingDocument.load_from_json(src)
+    ser = HTMLDocSerializer(
+        doc=doc,
+        params=HTMLParams(
+            image_mode=ImageRefMode.PLACEHOLDER,
+        ),
+    )
+    actual = ser.serialize().text
+    verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
+def test_html_cross_page_list_page_break_p1():
+    src = Path("./test/data/doc/activities.json")
+    doc = DoclingDocument.load_from_json(src)
+    ser = HTMLDocSerializer(
+        doc=doc,
+        params=HTMLParams(
+            image_mode=ImageRefMode.PLACEHOLDER,
+            pages={1},
+        ),
+    )
+    actual = ser.serialize().text
+    verify(exp_file=src.parent / f"{src.stem}_p1.gt.html", actual=actual)
+def test_html_cross_page_list_page_break_p2():
+    src = Path("./test/data/doc/activities.json")
+    doc = DoclingDocument.load_from_json(src)
+    ser = HTMLDocSerializer(
+        doc=doc,
+        params=HTMLParams(
+            image_mode=ImageRefMode.PLACEHOLDER,
+            pages={2},
+        ),
+    )
+    actual = ser.serialize().text
+    verify(exp_file=src.parent / f"{src.stem}_p2.gt.html", actual=actual)
 def test_html_split_page():
     src = Path("./test/data/doc/2408.09869v3_enriched.json")
     doc = DoclingDocument.load_from_json(src)
@@ -506,7 +516,7 @@ def test_html_nested_lists():
     ser = HTMLDocSerializer(doc=doc)
     actual = ser.serialize().text
-    verify(exp_file=src.parent / f"{src.stem}.gt.html", actual=actual)
+    verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
 def test_html_rich_table():
@@ -518,13 +528,27 @@ def test_html_rich_table():
     verify(exp_file=exp_file, actual=actual)
+def test_html_inline_and_formatting():
+    src = Path("./test/data/doc/inline_and_formatting.yaml")
+    doc = DoclingDocument.load_from_yaml(src)
+    ser = HTMLDocSerializer(doc=doc)
+    actual = ser.serialize().text
+    verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
+# ===============================
+# DocTags tests
+# ===============================
 def test_doctags_inline_loc_tags():
     src = Path("./test/data/doc/2408.09869v3_enriched.json")
     doc = DoclingDocument.load_from_json(src)
     ser = DocTagsDocSerializer(doc=doc)
     actual = ser.serialize().text
-    verify(exp_file=src.parent / f"{src.stem}.out.dt", actual=actual)
+    verify(exp_file=src.with_suffix(".out.dt"), actual=actual)
 def test_doctags_rich_table():
@@ -535,3 +559,12 @@ def test_doctags_rich_table():
     ser = DocTagsDocSerializer(doc=doc)
     actual = ser.serialize().text
     verify(exp_file=exp_file, actual=actual)
+def test_doctags_inline_and_formatting():
+    src = Path("./test/data/doc/inline_and_formatting.yaml")
+    doc = DoclingDocument.load_from_yaml(src)
+    ser = DocTagsDocSerializer(doc=doc)
+    actual = ser.serialize().text
+    verify(exp_file=src.with_suffix(".gt.dt"), actual=actual)