PyPI - docling-core - Versions diffs - 2.14.0__tar.gz → 2.15.0__tar.gz - Mend

docling-core 2.14.0.tar.gz → 2.15.0.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (62)

{docling_core-2.14.0 → docling_core-2.15.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 2.14.0
+Version: 2.15.0
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT

{docling_core-2.14.0 → docling_core-2.15.0}/docling_core/transforms/chunker/base.py RENAMED Viewed

@@ -51,7 +51,7 @@ class BaseChunker(BaseModel, ABC):
     delim: str = DFLT_DELIM
     @abstractmethod
-    def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:
+    def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
         """Chunk the provided document.
         Args:

{docling_core-2.14.0 → docling_core-2.15.0}/docling_core/transforms/chunker/hybrid_chunker.py RENAMED Viewed

@@ -6,7 +6,7 @@
 """Hybrid chunker implementation leveraging both doc structure & token awareness."""
 import warnings
-from typing import Iterable, Iterator, Optional, Union
+from typing import Any, Iterable, Iterator, Optional, Union
 from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator
 from typing_extensions import Self
@@ -65,13 +65,13 @@ class HybridChunker(BaseChunker):
             )
         return self
-    def _count_tokens(self, text: Optional[Union[str, list[str]]]):
+    def _count_text_tokens(self, text: Optional[Union[str, list[str]]]):
         if text is None:
             return 0
         elif isinstance(text, list):
             total = 0
             for t in text:
-                total += self._count_tokens(t)
+                total += self._count_text_tokens(t)
             return total
         return len(self._tokenizer.tokenize(text, max_length=None))
@@ -80,11 +80,13 @@ class HybridChunker(BaseChunker):
         text_len: int
         other_len: int
+    def _count_chunk_tokens(self, doc_chunk: DocChunk):
+        ser_txt = self.serialize(chunk=doc_chunk)
+        return len(self._tokenizer.tokenize(text=ser_txt, max_length=None))
     def _doc_chunk_length(self, doc_chunk: DocChunk):
-        text_length = self._count_tokens(doc_chunk.text)
-        headings_length = self._count_tokens(doc_chunk.meta.headings)
-        captions_length = self._count_tokens(doc_chunk.meta.captions)
-        total = text_length + headings_length + captions_length
+        text_length = self._count_text_tokens(doc_chunk.text)
+        total = self._count_chunk_tokens(doc_chunk=doc_chunk)
         return self._ChunkLengthInfo(
             total_len=total,
             text_len=text_length,
@@ -92,90 +94,69 @@ class HybridChunker(BaseChunker):
         )
     def _make_chunk_from_doc_items(
-        self, doc_chunk: DocChunk, window_text: str, window_start: int, window_end: int
+        self, doc_chunk: DocChunk, window_start: int, window_end: int
     ):
+        doc_items = doc_chunk.meta.doc_items[window_start : window_end + 1]
         meta = DocMeta(
-            doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],
+            doc_items=doc_items,
             headings=doc_chunk.meta.headings,
             captions=doc_chunk.meta.captions,
             origin=doc_chunk.meta.origin,
         )
+        window_text = (
+            doc_chunk.text
+            if len(doc_chunk.meta.doc_items) == 1
+            else self.delim.join(
+                [
+                    doc_item.text
+                    for doc_item in doc_items
+                    if isinstance(doc_item, TextItem)
+                ]
+            )
+        )
         new_chunk = DocChunk(text=window_text, meta=meta)
         return new_chunk
-    def _merge_text(self, t1, t2):
-        if t1 == "":
-            return t2
-        elif t2 == "":
-            return t1
-        else:
-            return f"{t1}{self.delim}{t2}"
     def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]:
-        if doc_chunk.meta.doc_items is None or len(doc_chunk.meta.doc_items) <= 1:
-            return [doc_chunk]
-        length = self._doc_chunk_length(doc_chunk)
-        if length.total_len <= self.max_tokens:
-            return [doc_chunk]
-        else:
-            chunks = []
-            window_start = 0
-            window_end = 0
-            window_text = ""
-            window_text_length = 0
-            other_length = length.other_len
-            num_items = len(doc_chunk.meta.doc_items)
-            while window_end < num_items:
-                doc_item = doc_chunk.meta.doc_items[window_end]
-                if isinstance(doc_item, TextItem):
-                    text = doc_item.text
-                else:
-                    raise RuntimeError("Non-TextItem split not implemented yet")
-                text_length = self._count_tokens(text)
-                if (
-                    text_length + window_text_length + other_length < self.max_tokens
-                    and window_end < num_items - 1
-                ):
+        chunks = []
+        window_start = 0
+        window_end = 0  # an inclusive index
+        num_items = len(doc_chunk.meta.doc_items)
+        while window_end < num_items:
+            new_chunk = self._make_chunk_from_doc_items(
+                doc_chunk=doc_chunk,
+                window_start=window_start,
+                window_end=window_end,
+            )
+            if self._count_chunk_tokens(doc_chunk=new_chunk) <= self.max_tokens:
+                if window_end < num_items - 1:
+                    window_end += 1
                     # Still room left to add more to this chunk AND still at least one
                     # item left
-                    window_end += 1
-                    window_text_length += text_length
-                    window_text = self._merge_text(window_text, text)
-                elif text_length + window_text_length + other_length < self.max_tokens:
+                    continue
+                else:
                     # All the items in the window fit into the chunk and there are no
                     # other items left
-                    window_text = self._merge_text(window_text, text)
-                    new_chunk = self._make_chunk_from_doc_items(
-                        doc_chunk, window_text, window_start, window_end
-                    )
-                    chunks.append(new_chunk)
-                    window_end = num_items
-                elif window_start == window_end:
-                    # Only one item in the window and it doesn't fit into the chunk. So
-                    # we'll just make it a chunk for now and it will get split in the
-                    # plain text splitter.
-                    window_text = self._merge_text(window_text, text)
-                    new_chunk = self._make_chunk_from_doc_items(
-                        doc_chunk, window_text, window_start, window_end
-                    )
-                    chunks.append(new_chunk)
-                    window_start = window_end + 1
-                    window_end = window_start
-                    window_text = ""
-                    window_text_length = 0
-                else:
-                    # Multiple items in the window but they don't fit into the chunk.
-                    # However, the existing items must have fit or we wouldn't have
-                    # gotten here. So we put everything but the last item into the chunk
-                    # and then start a new window INCLUDING the current window end.
-                    new_chunk = self._make_chunk_from_doc_items(
-                        doc_chunk, window_text, window_start, window_end - 1
-                    )
-                    chunks.append(new_chunk)
-                    window_start = window_end
-                    window_text = ""
-                    window_text_length = 0
-            return chunks
+                    window_end = num_items  # signalizing the last loop
+            elif window_start == window_end:
+                # Only one item in the window and it doesn't fit into the chunk. So
+                # we'll just make it a chunk for now and it will get split in the
+                # plain text splitter.
+                window_end += 1
+                window_start = window_end
+            else:
+                # Multiple items in the window but they don't fit into the chunk.
+                # However, the existing items must have fit or we wouldn't have
+                # gotten here. So we put everything but the last item into the chunk
+                # and then start a new window INCLUDING the current window end.
+                new_chunk = self._make_chunk_from_doc_items(
+                    doc_chunk=doc_chunk,
+                    window_start=window_start,
+                    window_end=window_end - 1,
+                )
+                window_start = window_end
+            chunks.append(new_chunk)
+        return chunks
     def _split_using_plain_text(
         self,
@@ -204,36 +185,38 @@ class HybridChunker(BaseChunker):
     def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
         output_chunks = []
         window_start = 0
-        window_end = 0
+        window_end = 0  # an inclusive index
         num_chunks = len(chunks)
         while window_end < num_chunks:
             chunk = chunks[window_end]
-            lengths = self._doc_chunk_length(chunk)
             headings_and_captions = (chunk.meta.headings, chunk.meta.captions)
             ready_to_append = False
             if window_start == window_end:
-                # starting a new block of chunks to potentially merge
                 current_headings_and_captions = headings_and_captions
-                window_text = chunk.text
-                window_other_length = lengths.other_len
-                window_text_length = lengths.text_len
-                window_items = chunk.meta.doc_items
                 window_end += 1
                 first_chunk_of_window = chunk
-            elif (
-                headings_and_captions == current_headings_and_captions
-                and window_text_length + window_other_length + lengths.text_len
-                <= self.max_tokens
-            ):
-                # there is room to include the new chunk so add it to the window and
-                # continue
-                window_text = self._merge_text(window_text, chunk.text)
-                window_text_length += lengths.text_len
-                window_items = window_items + chunk.meta.doc_items
-                window_end += 1
             else:
-                ready_to_append = True
+                chks = chunks[window_start : window_end + 1]
+                doc_items = [it for chk in chks for it in chk.meta.doc_items]
+                candidate = DocChunk(
+                    text=self.delim.join([chk.text for chk in chks]),
+                    meta=DocMeta(
+                        doc_items=doc_items,
+                        headings=current_headings_and_captions[0],
+                        captions=current_headings_and_captions[1],
+                        origin=chunk.meta.origin,
+                    ),
+                )
+                if (
+                    headings_and_captions == current_headings_and_captions
+                    and self._count_chunk_tokens(doc_chunk=candidate) <= self.max_tokens
+                ):
+                    # there is room to include the new chunk so add it to the window and
+                    # continue
+                    window_end += 1
+                    new_chunk = candidate
+                else:
+                    ready_to_append = True
             if ready_to_append or window_end == num_chunks:
                 # no more room OR the start of new metadata.  Either way, end the block
                 # and use the current window_end as the start of a new block
@@ -241,16 +224,6 @@ class HybridChunker(BaseChunker):
                     # just one chunk so use it as is
                     output_chunks.append(first_chunk_of_window)
                 else:
-                    new_meta = DocMeta(
-                        doc_items=window_items,
-                        headings=current_headings_and_captions[0],
-                        captions=current_headings_and_captions[1],
-                        origin=chunk.meta.origin,
-                    )
-                    new_chunk = DocChunk(
-                        text=window_text,
-                        meta=new_meta,
-                    )
                     output_chunks.append(new_chunk)
                 # no need to reset window_text, etc. because that will be reset in the
                 # next iteration in the if window_start == window_end block
@@ -258,7 +231,7 @@ class HybridChunker(BaseChunker):
         return output_chunks
-    def chunk(self, dl_doc: DoclingDocument, **kwargs) -> Iterator[BaseChunk]:
+    def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]:
         r"""Chunk the provided document.
         Args:

{docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/doc/__init__.py RENAMED Viewed

@@ -7,6 +7,7 @@
 from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
 from .document import (
+    CodeItem,
     DocItem,
     DoclingDocument,
     DocumentOrigin,

{docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/doc/base.py RENAMED Viewed

@@ -150,7 +150,7 @@ class BoundingBox(BaseModel):
         """
         if self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return self
+            return self.model_copy()
         elif self.coord_origin == CoordOrigin.TOPLEFT:
             return BoundingBox(
                 l=self.l,
@@ -167,7 +167,7 @@ class BoundingBox(BaseModel):
         """
         if self.coord_origin == CoordOrigin.TOPLEFT:
-            return self
+            return self.model_copy()
         elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
             return BoundingBox(
                 l=self.l,

{docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/doc/document.py RENAMED Viewed

@@ -36,7 +36,7 @@ from docling_core.search.package import VERSION_PATTERN
 from docling_core.types.base import _JSON_POINTER_REGEX
 from docling_core.types.doc import BoundingBox, Size
 from docling_core.types.doc.base import ImageRefMode
-from docling_core.types.doc.labels import DocItemLabel, GroupLabel
+from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
 from docling_core.types.doc.tokens import DocumentToken, TableToken
 from docling_core.types.doc.utils import relative_path
@@ -597,7 +597,6 @@ class TextItem(DocItem):
         DocItemLabel.CAPTION,
         DocItemLabel.CHECKBOX_SELECTED,
         DocItemLabel.CHECKBOX_UNSELECTED,
-        DocItemLabel.CODE,
         DocItemLabel.FOOTNOTE,
         DocItemLabel.FORMULA,
         DocItemLabel.PAGE_FOOTER,
@@ -656,6 +655,15 @@ class TextItem(DocItem):
         return body
+class CodeItem(TextItem):
+    """CodeItem."""
+    label: typing.Literal[DocItemLabel.CODE] = (
+        DocItemLabel.CODE  # type: ignore[assignment]
+    )
+    code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
 class SectionHeaderItem(TextItem):
     """SectionItem."""
@@ -1302,6 +1310,7 @@ ContentItem = Annotated[
         TextItem,
         SectionHeaderItem,
         ListItem,
+        CodeItem,
         PictureItem,
         TableItem,
         KeyValueItem,
@@ -1397,7 +1406,7 @@ class DoclingDocument(BaseModel):
     body: GroupItem = GroupItem(name="_root_", self_ref="#/body")  # List[RefItem] = []
     groups: List[GroupItem] = []
-    texts: List[Union[SectionHeaderItem, ListItem, TextItem]] = []
+    texts: List[Union[SectionHeaderItem, ListItem, TextItem, CodeItem]] = []
     pictures: List[PictureItem] = []
     tables: List[TableItem] = []
     key_value_items: List[KeyValueItem] = []
@@ -1643,6 +1652,46 @@ class DoclingDocument(BaseModel):
         return text_item
+    def add_code(
+        self,
+        text: str,
+        code_language: Optional[CodeLanguageLabel] = None,
+        orig: Optional[str] = None,
+        prov: Optional[ProvenanceItem] = None,
+        parent: Optional[NodeItem] = None,
+    ):
+        """add_code.
+        :param text: str:
+        :param code_language: Optional[str]: (Default value = None)
+        :param orig: Optional[str]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
+        """
+        if not parent:
+            parent = self.body
+        if not orig:
+            orig = text
+        text_index = len(self.texts)
+        cref = f"#/texts/{text_index}"
+        code_item = CodeItem(
+            text=text,
+            orig=orig,
+            self_ref=cref,
+            parent=parent.get_ref(),
+        )
+        if code_language:
+            code_item.code_language = code_language
+        if prov:
+            code_item.prov.append(prov)
+        self.texts.append(code_item)
+        parent.children.append(RefItem(cref=cref))
+        return code_item
     def add_heading(
         self,
         text: str,
@@ -2086,7 +2135,7 @@ class DoclingDocument(BaseModel):
                 text = f"{marker} {item.text}\n"
                 mdtexts.append(text.strip() + "\n")
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
+            elif isinstance(item, CodeItem) and item.label in labels:
                 in_list = False
                 text = f"```\n{item.text}\n```\n"
                 mdtexts.append(text)
@@ -2392,11 +2441,14 @@ class DoclingDocument(BaseModel):
                 text = f"<li>{item.text}</li>"
                 html_texts.append(text)
+            elif isinstance(item, CodeItem) and item.label in labels:
+                text = f"<pre><code>{item.text}</code></pre>"
+                html_texts.append(text.strip())
             elif isinstance(item, TextItem) and item.label in labels:
                 text = f"<p>{item.text}</p>"
                 html_texts.append(text.strip())
             elif isinstance(item, TableItem):
                 text = item.export_to_html(doc=self, add_caption=True)
@@ -2594,6 +2646,17 @@ class DoclingDocument(BaseModel):
                     add_content=add_content,
                     add_page_index=add_page_index,
                 )
+            elif isinstance(item, CodeItem) and (item.label in labels):
+                result += item.export_to_document_tokens(
+                    doc=self,
+                    new_line=delim,
+                    xsize=xsize,
+                    ysize=ysize,
+                    add_location=add_location,
+                    add_content=add_content,
+                    add_page_index=add_page_index,
+                )
             elif isinstance(item, TextItem) and (item.label in labels):

{docling_core-2.14.0 → docling_core-2.15.0}/docling_core/types/doc/labels.py RENAMED Viewed

@@ -138,3 +138,69 @@ class TableCellLabel(str, Enum):
     def __str__(self):
         """Get string value."""
         return str(self.value)
+class CodeLanguageLabel(str, Enum):
+    """CodeLanguageLabel."""
+    ADA = "Ada"
+    AWK = "Awk"
+    BASH = "Bash"
+    BC = "bc"
+    C = "C"
+    C_SHARP = "C#"
+    C_PLUS_PLUS = "C++"
+    CMAKE = "CMake"
+    COBOL = "COBOL"
+    CSS = "CSS"
+    CEYLON = "Ceylon"
+    CLOJURE = "Clojure"
+    CRYSTAL = "Crystal"
+    CUDA = "Cuda"
+    CYTHON = "Cython"
+    D = "D"
+    DART = "Dart"
+    DC = "dc"
+    DOCKERFILE = "Dockerfile"
+    ELIXIR = "Elixir"
+    ERLANG = "Erlang"
+    FORTRAN = "FORTRAN"
+    FORTH = "Forth"
+    GO = "Go"
+    HTML = "HTML"
+    HASKELL = "Haskell"
+    HAXE = "Haxe"
+    JAVA = "Java"
+    JAVASCRIPT = "JavaScript"
+    JULIA = "Julia"
+    KOTLIN = "Kotlin"
+    LISP = "Lisp"
+    LUA = "Lua"
+    MATLAB = "Matlab"
+    MOONSCRIPT = "MoonScript"
+    NIM = "Nim"
+    OCAML = "OCaml"
+    OBJECTIVEC = "ObjectiveC"
+    OCTAVE = "Octave"
+    PHP = "PHP"
+    PASCAL = "Pascal"
+    PERL = "Perl"
+    PROLOG = "Prolog"
+    PYTHON = "Python"
+    RACKET = "Racket"
+    RUBY = "Ruby"
+    RUST = "Rust"
+    SML = "SML"
+    SQL = "SQL"
+    SCALA = "Scala"
+    SCHEME = "Scheme"
+    SWIFT = "Swift"
+    TYPESCRIPT = "TypeScript"
+    UNKNOWN = "unknown"
+    VISUALBASIC = "VisualBasic"
+    XML = "XML"
+    YAML = "YAML"
+    def __str__(self):
+        """Get string value."""
+        return str(self.value)

{docling_core-2.14.0 → docling_core-2.15.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling-core"
-version = "2.14.0"
+version = "2.15.0"
 description = "A python library to define and validate data types in Docling."
 license = "MIT"
 authors = [
@@ -79,6 +79,15 @@ types-setuptools = "^70.3.0"
 python-semantic-release = "^7.32.2"
 pandas-stubs = "^2.1.4.231227"
+[tool.poetry.group.constraints]
+optional = true
+[tool.poetry.group.constraints.dependencies]
+numpy = [
+    { version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
+    { version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
+]
 [tool.setuptools.packages.find]
 where = ["docling_core/resources/schemas"]
@@ -127,6 +136,7 @@ module = [
     "jsonref.*",
     "jsonschema.*",
     "requests.*",
+    "semchunk.*",
     "tabulate.*",
     "transformers.*",
     "yaml.*",