PyPI - docling-core - Versions diffs - 2.21.1__tar.gz → 2.22.0__tar.gz - Mend

docling-core 2.21.1 (tar.gz) → 2.22.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (62)

{docling_core-2.21.1 → docling_core-2.22.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 2.21.1
+Version: 2.22.0
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT

{docling_core-2.21.1 → docling_core-2.22.0}/docling_core/transforms/chunker/hybrid_chunker.py RENAMED Viewed

@@ -73,7 +73,7 @@ class HybridChunker(BaseChunker):
             for t in text:
                 total += self._count_text_tokens(t)
             return total
-        return len(self._tokenizer.tokenize(text, max_length=None))
+        return len(self._tokenizer.tokenize(text))
     class _ChunkLengthInfo(BaseModel):
         total_len: int
@@ -82,7 +82,7 @@ class HybridChunker(BaseChunker):
     def _count_chunk_tokens(self, doc_chunk: DocChunk):
         ser_txt = self.serialize(chunk=doc_chunk)
-        return len(self._tokenizer.tokenize(text=ser_txt, max_length=None))
+        return len(self._tokenizer.tokenize(text=ser_txt))
     def _doc_chunk_length(self, doc_chunk: DocChunk):
         text_length = self._count_text_tokens(doc_chunk.text)

{docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/doc/document.py RENAMED Viewed

@@ -4,6 +4,7 @@ import base64
 import copy
 import hashlib
 import html
+import itertools
 import json
 import logging
 import mimetypes
@@ -37,7 +38,7 @@ from pydantic import (
     model_validator,
 )
 from tabulate import tabulate
-from typing_extensions import Annotated, Self
+from typing_extensions import Annotated, Self, deprecated
 from docling_core.search.package import VERSION_PATTERN
 from docling_core.types.base import _JSON_POINTER_REGEX
@@ -522,6 +523,49 @@ class ImageRef(BaseModel):
         )
+class DocTagsPage(BaseModel):
+    """DocTagsPage."""
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    tokens: str
+    image: Optional[PILImage.Image] = None
+class DocTagsDocument(BaseModel):
+    """DocTagsDocument."""
+    pages: List[DocTagsPage] = []
+    @classmethod
+    def from_doctags_and_image_pairs(
+        cls, doctags: List[Union[Path, str]], images: List[Union[Path, PILImage.Image]]
+    ):
+        """from_doctags_and_image_pairs."""
+        if len(doctags) != len(images):
+            raise ValueError("Number of page doctags must be equal to page images!")
+        doctags_doc = cls()
+        pages = []
+        for dt, img in zip(doctags, images):
+            if isinstance(dt, Path):
+                with dt.open("r") as fp:
+                    dt = fp.read()
+            elif isinstance(dt, str):
+                pass
+            if isinstance(img, Path):
+                img = PILImage.open(img)
+            elif isinstance(dt, PILImage.Image):
+                pass
+            page = DocTagsPage(tokens=dt, image=img)
+            pages.append(page)
+        doctags_doc.pages = pages
+        return doctags_doc
 class ProvenanceItem(BaseModel):
     """ProvenanceItem."""
@@ -800,7 +844,7 @@ class CodeItem(FloatingItem, TextItem):
         :param add_content: bool:  (Default value = True)
         """
-        body = f"<{self.label.value}{new_line}"
+        body = f"<{self.label.value}>{new_line}"
         if add_location:
             body += self.get_location_tokens(
@@ -813,7 +857,7 @@ class CodeItem(FloatingItem, TextItem):
         if add_content and self.text is not None:
             body += f"<_{self.code_language.value}_>{self.text}{new_line}"
-        body += f"</{self.label.value}\n"
+        body += f"</{self.label.value}>\n"
         return body
@@ -1003,6 +1047,20 @@ class PictureItem(FloatingItem):
             predicted_class = classifications[0].predicted_classes[0].class_name
             body += DocumentToken.get_picture_classification_token(predicted_class)
+        smiles_annotations = [
+            ann for ann in self.annotations if isinstance(ann, PictureMoleculeData)
+        ]
+        if len(smiles_annotations) > 0:
+            body += (
+                "<"
+                + DocumentToken.SMILES.value
+                + ">"
+                + smiles_annotations[0].smi
+                + "</"
+                + DocumentToken.SMILES.value
+                + ">"
+            )
         if add_caption and len(self.captions):
             text = self.caption_text(doc)
@@ -2487,7 +2545,6 @@ class DoclingDocument(BaseModel):
                         is_inline_scope=is_inline_scope,
                         visited=visited,
                     )
-                    # NOTE: assumes unordered (flag & marker currently in ListItem)
                     indent_str = list_level * indent * " "
                     is_ol = item.label == GroupLabel.ORDERED_LIST
                     text = "\n".join(
@@ -2501,7 +2558,12 @@ class DoclingDocument(BaseModel):
                             for i, c in enumerate(comps)
                         ]
                     )
-                    _ingest_text(text=text)
+                    _ingest_text(
+                        text=text,
+                        # special chars have already been escaped as needed
+                        do_escape_html=False,
+                        do_escape_underscores=False,
+                    )
                 elif item.label == GroupLabel.INLINE:
                     comps = self._get_markdown_components(
                         node=item,
@@ -2520,7 +2582,13 @@ class DoclingDocument(BaseModel):
                         is_inline_scope=True,
                         visited=visited,
                     )
-                    _ingest_text(" ".join(comps))
+                    text = " ".join(comps)
+                    _ingest_text(
+                        text=text,
+                        # special chars have already been escaped as needed
+                        do_escape_html=False,
+                        do_escape_underscores=False,
+                    )
                 else:
                     continue
@@ -2838,7 +2906,7 @@ class DoclingDocument(BaseModel):
                 # Building a math equation in MathML format
                 # ref https://www.w3.org/TR/wai-aria-1.1/#math
-                elif formula_to_mathml:
+                elif formula_to_mathml and len(math_formula) > 0:
                     try:
                         mathml_element = latex2mathml.converter.convert_to_element(
                             math_formula, display="block"
@@ -2860,7 +2928,7 @@ class DoclingDocument(BaseModel):
                             and img_fallback is not None
                         ):
                             text = img_fallback
-                        elif len(math_formula) > 0:
+                        else:
                             text = f"<pre>{math_formula}</pre>"
                 elif math_formula != "":
@@ -2926,7 +2994,378 @@ class DoclingDocument(BaseModel):
         return html_text
-    def save_as_document_tokens(
+    def load_from_doctags(  # noqa: C901
+        self,
+        doctag_document: DocTagsDocument,
+    ) -> "DoclingDocument":
+        r"""Load Docling document from lists of DocTags and Images."""
+        # Maps the recognized tag to a Docling label.
+        # Code items will be given DocItemLabel.CODE
+        tag_to_doclabel = {
+            "title": DocItemLabel.TITLE,
+            "document_index": DocItemLabel.DOCUMENT_INDEX,
+            "otsl": DocItemLabel.TABLE,
+            "section_header_level_1": DocItemLabel.SECTION_HEADER,
+            "checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
+            "checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
+            "text": DocItemLabel.TEXT,
+            "page_header": DocItemLabel.PAGE_HEADER,
+            "page_footer": DocItemLabel.PAGE_FOOTER,
+            "formula": DocItemLabel.FORMULA,
+            "caption": DocItemLabel.CAPTION,
+            "picture": DocItemLabel.PICTURE,
+            "list_item": DocItemLabel.LIST_ITEM,
+            "footnote": DocItemLabel.FOOTNOTE,
+            "code": DocItemLabel.CODE,
+        }
+        def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
+            """Extract <loc_...> coords from the chunk, normalized by / 500."""
+            coords = re.findall(r"<loc_(\d+)>", text_chunk)
+            if len(coords) == 4:
+                l, t, r, b = map(float, coords)
+                return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
+            return None
+        def extract_inner_text(text_chunk: str) -> str:
+            """Strip all <...> tags inside the chunk to get the raw text content."""
+            return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
+        def otsl_parse_texts(texts, tokens):
+            split_word = TableToken.OTSL_NL.value
+            split_row_tokens = [
+                list(y)
+                for x, y in itertools.groupby(tokens, lambda z: z == split_word)
+                if not x
+            ]
+            table_cells = []
+            r_idx = 0
+            c_idx = 0
+            def count_right(tokens, c_idx, r_idx, which_tokens):
+                span = 0
+                c_idx_iter = c_idx
+                while tokens[r_idx][c_idx_iter] in which_tokens:
+                    c_idx_iter += 1
+                    span += 1
+                    if c_idx_iter >= len(tokens[r_idx]):
+                        return span
+                return span
+            def count_down(tokens, c_idx, r_idx, which_tokens):
+                span = 0
+                r_idx_iter = r_idx
+                while tokens[r_idx_iter][c_idx] in which_tokens:
+                    r_idx_iter += 1
+                    span += 1
+                    if r_idx_iter >= len(tokens):
+                        return span
+                return span
+            for i, text in enumerate(texts):
+                cell_text = ""
+                if text in [
+                    TableToken.OTSL_FCEL.value,
+                    TableToken.OTSL_ECEL.value,
+                    TableToken.OTSL_CHED.value,
+                    TableToken.OTSL_RHED.value,
+                    TableToken.OTSL_SROW.value,
+                ]:
+                    row_span = 1
+                    col_span = 1
+                    right_offset = 1
+                    if text != TableToken.OTSL_ECEL.value:
+                        cell_text = texts[i + 1]
+                        right_offset = 2
+                    # Check next element(s) for lcel / ucel / xcel,
+                    # set properly row_span, col_span
+                    next_right_cell = ""
+                    if i + right_offset < len(texts):
+                        next_right_cell = texts[i + right_offset]
+                    next_bottom_cell = ""
+                    if r_idx + 1 < len(split_row_tokens):
+                        if c_idx < len(split_row_tokens[r_idx + 1]):
+                            next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
+                    if next_right_cell in [
+                        TableToken.OTSL_LCEL.value,
+                        TableToken.OTSL_XCEL.value,
+                    ]:
+                        # we have horisontal spanning cell or 2d spanning cell
+                        col_span += count_right(
+                            split_row_tokens,
+                            c_idx + 1,
+                            r_idx,
+                            [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
+                        )
+                    if next_bottom_cell in [
+                        TableToken.OTSL_UCEL.value,
+                        TableToken.OTSL_XCEL.value,
+                    ]:
+                        # we have a vertical spanning cell or 2d spanning cell
+                        row_span += count_down(
+                            split_row_tokens,
+                            c_idx,
+                            r_idx + 1,
+                            [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
+                        )
+                    table_cells.append(
+                        TableCell(
+                            text=cell_text.strip(),
+                            row_span=row_span,
+                            col_span=col_span,
+                            start_row_offset_idx=r_idx,
+                            end_row_offset_idx=r_idx + row_span,
+                            start_col_offset_idx=c_idx,
+                            end_col_offset_idx=c_idx + col_span,
+                        )
+                    )
+                if text in [
+                    TableToken.OTSL_FCEL.value,
+                    TableToken.OTSL_ECEL.value,
+                    TableToken.OTSL_CHED.value,
+                    TableToken.OTSL_RHED.value,
+                    TableToken.OTSL_SROW.value,
+                    TableToken.OTSL_LCEL.value,
+                    TableToken.OTSL_UCEL.value,
+                    TableToken.OTSL_XCEL.value,
+                ]:
+                    c_idx += 1
+                if text == TableToken.OTSL_NL.value:
+                    r_idx += 1
+                    c_idx = 0
+            return table_cells, split_row_tokens
+        def otsl_extract_tokens_and_text(s: str):
+            # Pattern to match anything enclosed by < >
+            # (including the angle brackets themselves)
+            pattern = r"(<[^>]+>)"
+            # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
+            tokens = re.findall(pattern, s)
+            # Remove any tokens that start with "<loc_"
+            tokens = [
+                token
+                for token in tokens
+                if not (
+                    token.startswith(rf"<{DocumentToken.LOC.value}")
+                    or token
+                    in [
+                        rf"<{DocumentToken.OTSL.value}>",
+                        rf"</{DocumentToken.OTSL.value}>",
+                    ]
+                )
+            ]
+            # Split the string by those tokens to get the in-between text
+            text_parts = re.split(pattern, s)
+            text_parts = [
+                token
+                for token in text_parts
+                if not (
+                    token.startswith(rf"<{DocumentToken.LOC.value}")
+                    or token
+                    in [
+                        rf"<{DocumentToken.OTSL.value}>",
+                        rf"</{DocumentToken.OTSL.value}>",
+                    ]
+                )
+            ]
+            # Remove any empty or purely whitespace strings from text_parts
+            text_parts = [part for part in text_parts if part.strip()]
+            return tokens, text_parts
+        def parse_table_content(otsl_content: str) -> TableData:
+            tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
+            table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
+            return TableData(
+                num_rows=len(split_row_tokens),
+                num_cols=(
+                    max(len(row) for row in split_row_tokens) if split_row_tokens else 0
+                ),
+                table_cells=table_cells,
+            )
+        # doc = DoclingDocument(name="Document")
+        for pg_idx, doctag_page in enumerate(doctag_document.pages):
+            page_doctags = doctag_page.tokens
+            image = doctag_page.image
+            page_no = pg_idx + 1
+            # bounding_boxes = []
+            if image is not None:
+                pg_width = image.width
+                pg_height = image.height
+            else:
+                pg_width = 1
+                pg_height = 1
+            """
+            1. Finds all <tag>...</tag>
+               blocks in the entire string (multi-line friendly)
+               in the order they appear.
+            2. For each chunk, extracts bounding box (if any) and inner text.
+            3. Adds the item to a DoclingDocument structure with the right label.
+            4. Tracks bounding boxes+color in a separate list for later visualization.
+            """
+            # Regex for root level recognized tags
+            tag_pattern = (
+                rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
+                rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
+                rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
+                rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
+                rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
+                rf"{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
+                rf"{DocItemLabel.SECTION_HEADER}_level_1|"
+                rf"{DocumentToken.ORDERED_LIST.value}|"
+                rf"{DocumentToken.UNORDERED_LIST.value}|"
+                rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
+            )
+            # DocumentToken.OTSL
+            pattern = re.compile(tag_pattern, re.DOTALL)
+            # Go through each match in order
+            for match in pattern.finditer(page_doctags):
+                full_chunk = match.group(0)
+                tag_name = match.group("tag")
+                bbox = extract_bounding_box(full_chunk) if image else None
+                doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
+                if tag_name == DocumentToken.OTSL.value:
+                    table_data = parse_table_content(full_chunk)
+                    bbox = extract_bounding_box(full_chunk) if image else None
+                    if bbox:
+                        prov = ProvenanceItem(
+                            bbox=bbox.resize_by_scale(pg_width, pg_height),
+                            charspan=(0, 0),
+                            page_no=page_no,
+                        )
+                        self.add_table(data=table_data, prov=prov)
+                    else:
+                        self.add_table(data=table_data)
+                elif tag_name == DocItemLabel.PICTURE:
+                    text_caption_content = extract_inner_text(full_chunk)
+                    if image:
+                        if bbox:
+                            im_width, im_height = image.size
+                            crop_box = (
+                                int(bbox.l * im_width),
+                                int(bbox.t * im_height),
+                                int(bbox.r * im_width),
+                                int(bbox.b * im_height),
+                            )
+                            cropped_image = image.crop(crop_box)
+                            pic = self.add_picture(
+                                parent=None,
+                                image=ImageRef.from_pil(image=cropped_image, dpi=72),
+                                prov=(
+                                    ProvenanceItem(
+                                        bbox=bbox.resize_by_scale(pg_width, pg_height),
+                                        charspan=(0, 0),
+                                        page_no=page_no,
+                                    )
+                                ),
+                            )
+                            # If there is a caption to an image, add it as well
+                            if len(text_caption_content) > 0:
+                                caption_item = self.add_text(
+                                    label=DocItemLabel.CAPTION,
+                                    text=text_caption_content,
+                                    parent=None,
+                                )
+                                pic.captions.append(caption_item.get_ref())
+                    else:
+                        if bbox:
+                            # In case we don't have access to an binary of an image
+                            self.add_picture(
+                                parent=None,
+                                prov=ProvenanceItem(
+                                    bbox=bbox, charspan=(0, 0), page_no=page_no
+                                ),
+                            )
+                            # If there is a caption to an image, add it as well
+                            if len(text_caption_content) > 0:
+                                caption_item = self.add_text(
+                                    label=DocItemLabel.CAPTION,
+                                    text=text_caption_content,
+                                    parent=None,
+                                )
+                                pic.captions.append(caption_item.get_ref())
+                elif tag_name in [
+                    DocumentToken.ORDERED_LIST.value,
+                    DocumentToken.UNORDERED_LIST.value,
+                ]:
+                    list_label = GroupLabel.LIST
+                    enum_marker = ""
+                    enum_value = 0
+                    if tag_name == DocumentToken.ORDERED_LIST.value:
+                        list_label = GroupLabel.ORDERED_LIST
+                    list_item_pattern = (
+                        rf"<(?P<tag>{DocItemLabel.LIST_ITEM})>.*?</(?P=tag)>"
+                    )
+                    li_pattern = re.compile(list_item_pattern, re.DOTALL)
+                    # Add list group:
+                    new_list = self.add_group(label=list_label, name="list")
+                    # Pricess list items
+                    for li_match in li_pattern.finditer(full_chunk):
+                        enum_value += 1
+                        if tag_name == DocumentToken.ORDERED_LIST.value:
+                            enum_marker = str(enum_value) + "."
+                        li_full_chunk = li_match.group(0)
+                        li_bbox = extract_bounding_box(li_full_chunk) if image else None
+                        text_content = extract_inner_text(li_full_chunk)
+                        # Add list item
+                        self.add_list_item(
+                            marker=enum_marker,
+                            enumerated=(tag_name == DocumentToken.ORDERED_LIST.value),
+                            parent=new_list,
+                            text=text_content,
+                            prov=(
+                                ProvenanceItem(
+                                    bbox=li_bbox.resize_by_scale(pg_width, pg_height),
+                                    charspan=(0, len(text_content)),
+                                    page_no=page_no,
+                                )
+                                if li_bbox
+                                else None
+                            ),
+                        )
+                else:
+                    # For everything else, treat as text
+                    text_content = extract_inner_text(full_chunk)
+                    self.add_text(
+                        label=doc_label,
+                        text=text_content,
+                        prov=(
+                            ProvenanceItem(
+                                bbox=bbox.resize_by_scale(pg_width, pg_height),
+                                charspan=(0, len(text_content)),
+                                page_no=page_no,
+                            )
+                            if bbox
+                            else None
+                        ),
+                    )
+        return self
+    @deprecated("Use save_as_doctags instead.")
+    def save_as_document_tokens(self, *args, **kwargs):
+        r"""Save the document content to a DocumentToken format."""
+        return self.save_as_doctags(*args, **kwargs)
+    def save_as_doctags(
         self,
         filename: Path,
         delim: str = "",
@@ -2942,7 +3381,7 @@ class DoclingDocument(BaseModel):
         add_table_cell_location: bool = False,
         add_table_cell_text: bool = True,
     ):
-        r"""Save the document content to a DocumentToken format."""
+        r"""Save the document content to DocTags format."""
         out = self.export_to_document_tokens(
             delim=delim,
             from_element=from_element,

{docling_core-2.21.1 → docling_core-2.22.0}/docling_core/types/doc/tokens.py RENAMED Viewed

@@ -50,6 +50,7 @@ class DocumentToken(Enum):
     UNORDERED_LIST = "unordered_list"
     LOC = "loc_"
     PAGE_BREAK = "page_break"
+    SMILES = "smiles"
     @classmethod
     def get_special_tokens(

{docling_core-2.21.1 → docling_core-2.22.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling-core"
-version = "2.21.1"
+version = "2.22.0"
 description = "A python library to define and validate data types in Docling."
 license = "MIT"
 authors = [