PyPI - docling - Versions diffs - 2.8.3__py3-none-any.whl → 2.10.0__py3-none-any.whl - Mend

docling-2.8.3-py3-none-any.whl → docling-2.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling might be problematic. Click here for more details.

Files changed (18)

docling/backend/docling_parse_backend.py +1 -1
docling/backend/docling_parse_v2_backend.py +7 -5
docling/backend/msword_backend.py +43 -27
docling/chunking/__init__.py +12 -0
docling/cli/main.py +83 -28
docling/datamodel/base_models.py +1 -0
docling/datamodel/document.py +2 -253
docling/datamodel/pipeline_options.py +5 -1
docling/document_converter.py +5 -5
docling/models/ds_glm_model.py +2 -2
docling/pipeline/standard_pdf_pipeline.py +2 -0
docling/py.typed +1 -0
docling/utils/glm_utils.py +336 -0
{docling-2.8.3.dist-info → docling-2.10.0.dist-info}/METADATA +7 -7
{docling-2.8.3.dist-info → docling-2.10.0.dist-info}/RECORD +18 -15
{docling-2.8.3.dist-info → docling-2.10.0.dist-info}/LICENSE +0 -0
{docling-2.8.3.dist-info → docling-2.10.0.dist-info}/WHEEL +0 -0
{docling-2.8.3.dist-info → docling-2.10.0.dist-info}/entry_points.txt +0 -0

docling/backend/docling_parse_backend.py CHANGED Viewed

@@ -6,7 +6,7 @@ from typing import Iterable, List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_parse.docling_parse import pdf_parser_v1
+from docling_parse.pdf_parsers import pdf_parser_v1
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

docling/backend/docling_parse_v2_backend.py CHANGED Viewed

@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_parse.docling_parse import pdf_parser_v2
+from docling_parse.pdf_parsers import pdf_parser_v2
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
@@ -210,12 +210,14 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
         self.parser = pdf_parser_v2("fatal")
         success = False
-        if isinstance(path_or_stream, BytesIO):
+        if isinstance(self.path_or_stream, BytesIO):
             success = self.parser.load_document_from_bytesio(
-                self.document_hash, path_or_stream
+                self.document_hash, self.path_or_stream
+            )
+        elif isinstance(self.path_or_stream, Path):
+            success = self.parser.load_document(
+                self.document_hash, str(self.path_or_stream)
             )
-        elif isinstance(path_or_stream, Path):
-            success = self.parser.load_document(self.document_hash, str(path_or_stream))
         if not success:
             raise RuntimeError(

docling/backend/msword_backend.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import logging
+import re
 from io import BytesIO
 from pathlib import Path
 from typing import Set, Union
@@ -133,7 +134,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
         for element in body:
             tag_name = etree.QName(element).localname
             # Check for Inline Images (blip elements)
             namespaces = {
                 "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
@@ -153,6 +153,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 self.handle_pictures(element, docx_obj, drawing_blip, doc)
             # Check for Text
             elif tag_name in ["p"]:
+                # "tcPr", "sectPr"
                 self.handle_text_elements(element, docx_obj, doc)
             else:
                 _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
@@ -166,6 +167,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         except ValueError:
             return default
+    def split_text_and_number(self, input_string):
+        match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
+        if match:
+            parts = list(filter(None, match.groups()))
+            return parts
+        else:
+            return [input_string]
     def get_numId_and_ilvl(self, paragraph):
         # Access the XML element of the paragraph
         numPr = paragraph._element.find(
@@ -188,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def get_label_and_level(self, paragraph):
         if paragraph.style is None:
             return "Normal", None
-        label = paragraph.style.name
+        label = paragraph.style.style_id
         if label is None:
             return "Normal", None
         if ":" in label:
@@ -197,7 +206,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             if len(parts) == 2:
                 return parts[0], int(parts[1])
-        parts = label.split(" ")
+        parts = self.split_text_and_number(label)
         if "Heading" in label and len(parts) == 2:
             parts.sort()
@@ -219,14 +228,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         if paragraph.text is None:
             return
         text = paragraph.text.strip()
-        # if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
         # Common styles for bullet and numbered lists.
         # "List Bullet", "List Number", "List Paragraph"
         # Identify wether list is a numbered list or not
         # is_numbered = "List Bullet" not in paragraph.style.name
         is_numbered = False
-        p_style_name, p_level = self.get_label_and_level(paragraph)
+        p_style_id, p_level = self.get_label_and_level(paragraph)
         numid, ilevel = self.get_numId_and_ilvl(paragraph)
         if numid == 0:
@@ -238,14 +246,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 element,
                 docx_obj,
                 doc,
-                p_style_name,
+                p_style_id,
                 p_level,
                 numid,
                 ilevel,
                 text,
                 is_numbered,
             )
-            self.update_history(p_style_name, p_level, numid, ilevel)
+            self.update_history(p_style_id, p_level, numid, ilevel)
             return
         elif numid is None and self.prev_numid() is not None:  # Close list
             for key, val in self.parents.items():
@@ -253,23 +261,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     self.parents[key] = None
             self.level = self.level_at_new_list - 1
             self.level_at_new_list = None
-        if p_style_name in ["Title"]:
+        if p_style_id in ["Title"]:
             for key, val in self.parents.items():
                 self.parents[key] = None
             self.parents[0] = doc.add_text(
                 parent=None, label=DocItemLabel.TITLE, text=text
             )
-        elif "Heading" in p_style_name:
-            self.add_header(element, docx_obj, doc, p_style_name, p_level, text)
+        elif "Heading" in p_style_id:
+            self.add_header(element, docx_obj, doc, p_style_id, p_level, text)
-        elif p_style_name in [
+        elif p_style_id in [
             "Paragraph",
             "Normal",
             "Subtitle",
             "Author",
-            "Default Text",
-            "List Paragraph",
-            "List Bullet",
+            "DefaultText",
+            "ListParagraph",
+            "ListBullet",
             "Quote",
         ]:
             level = self.get_level()
@@ -285,15 +293,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
             )
-        self.update_history(p_style_name, p_level, numid, ilevel)
+        self.update_history(p_style_id, p_level, numid, ilevel)
         return
     def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
         level = self.get_level()
         if isinstance(curr_level, int):
             if curr_level > level:
                 # add invisible group
                 for i in range(level, curr_level):
                     self.parents[i] = doc.add_group(
@@ -301,9 +307,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                         label=GroupLabel.SECTION,
                         name=f"header-{i}",
                     )
             elif curr_level < level:
                 # remove the tail
                 for key, val in self.parents.items():
                     if key >= curr_level:
@@ -314,7 +318,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 text=text,
                 level=curr_level,
             )
         else:
             self.parents[self.level] = doc.add_heading(
                 parent=self.parents[self.level - 1],
@@ -328,7 +331,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         element,
         docx_obj,
         doc,
-        p_style_name,
+        p_style_id,
         p_level,
         numid,
         ilevel,
@@ -346,7 +349,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
             )
-            # TODO: Set marker and enumerated arguments if this is an enumeration element.
+            # Set marker and enumerated arguments if this is an enumeration element.
             self.listIter += 1
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
@@ -365,8 +368,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 self.level_at_new_list + self.prev_indent() + 1,
                 self.level_at_new_list + ilevel + 1,
             ):
-                # TODO: determine if this is an unordered list or an ordered list.
-                #  Set GroupLabel.ORDERED_LIST when it fits.
+                # Determine if this is an unordered list or an ordered list.
+                # Set GroupLabel.ORDERED_LIST when it fits.
                 self.listIter = 0
                 if is_numbered:
                     self.parents[i] = doc.add_group(
@@ -467,6 +470,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 row_span = get_rowspan(cell)
                 col_span = get_colspan(cell)
+                cell_text = cell.text
+                # In case cell doesn't return text via docx library:
+                if len(cell_text) == 0:
+                    cell_xml = cell._element
+                    texts = [""]
+                    for elem in cell_xml.iter():
+                        if elem.tag.endswith("t"):  # <w:t> tags that contain text
+                            if elem.text:
+                                texts.append(elem.text)
+                    # Join the collected text
+                    cell_text = " ".join(texts).strip()
                 # Find the next available column in the grid
                 while table_grid[row_idx][col_idx] is not None:
                     col_idx += 1
@@ -477,15 +493,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                         table_grid[row_idx + i][col_idx + j] = ""
                 cell = TableCell(
-                    text=cell.text,
+                    text=cell_text,
                     row_span=row_span,
                     col_span=col_span,
                     start_row_offset_idx=row_idx,
                     end_row_offset_idx=row_idx + row_span,
                     start_col_offset_idx=col_idx,
                     end_col_offset_idx=col_idx + col_span,
-                    col_header=False,  # col_header,
-                    row_header=False,  # ((not col_header) and html_cell.name=='th')
+                    col_header=False,
+                    row_header=False,
                 )
                 data.table_cells.append(cell)

docling/chunking/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
+from docling_core.transforms.chunker.hierarchical_chunker import (
+    DocChunk,
+    DocMeta,
+    HierarchicalChunker,
+)
+from docling_core.transforms.chunker.hybrid_chunker import HybridChunker

docling/cli/main.py CHANGED Viewed

@@ -10,7 +10,9 @@ from pathlib import Path
 from typing import Annotated, Dict, Iterable, List, Optional, Type
 import typer
+from docling_core.types.doc import ImageRefMode
 from docling_core.utils.file import resolve_source_to_path
+from pydantic import TypeAdapter, ValidationError
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -86,9 +88,11 @@ def export_documents(
     conv_results: Iterable[ConversionResult],
     output_dir: Path,
     export_json: bool,
+    export_html: bool,
     export_md: bool,
     export_txt: bool,
     export_doctags: bool,
+    image_export_mode: ImageRefMode,
 ):
     success_count = 0
@@ -99,33 +103,45 @@ def export_documents(
             success_count += 1
             doc_filename = conv_res.input.file.stem
-            # Export Deep Search document JSON format:
+            # Export JSON format:
             if export_json:
                 fname = output_dir / f"{doc_filename}.json"
-                with fname.open("w", encoding="utf8") as fp:
-                    _log.info(f"writing JSON output to {fname}")
-                    fp.write(json.dumps(conv_res.document.export_to_dict()))
+                _log.info(f"writing JSON output to {fname}")
+                conv_res.document.save_as_json(
+                    filename=fname, image_mode=image_export_mode
+                )
+            # Export HTML format:
+            if export_html:
+                fname = output_dir / f"{doc_filename}.html"
+                _log.info(f"writing HTML output to {fname}")
+                conv_res.document.save_as_html(
+                    filename=fname, image_mode=image_export_mode
+                )
             # Export Text format:
             if export_txt:
                 fname = output_dir / f"{doc_filename}.txt"
-                with fname.open("w", encoding="utf8") as fp:
-                    _log.info(f"writing Text output to {fname}")
-                    fp.write(conv_res.document.export_to_markdown(strict_text=True))
+                _log.info(f"writing TXT output to {fname}")
+                conv_res.document.save_as_markdown(
+                    filename=fname,
+                    strict_text=True,
+                    image_mode=ImageRefMode.PLACEHOLDER,
+                )
             # Export Markdown format:
             if export_md:
                 fname = output_dir / f"{doc_filename}.md"
-                with fname.open("w", encoding="utf8") as fp:
-                    _log.info(f"writing Markdown output to {fname}")
-                    fp.write(conv_res.document.export_to_markdown())
+                _log.info(f"writing Markdown output to {fname}")
+                conv_res.document.save_as_markdown(
+                    filename=fname, image_mode=image_export_mode
+                )
             # Export Document Tags format:
             if export_doctags:
                 fname = output_dir / f"{doc_filename}.doctags"
-                with fname.open("w", encoding="utf8") as fp:
-                    _log.info(f"writing Doc Tags output to {fname}")
-                    fp.write(conv_res.document.export_to_document_tokens())
+                _log.info(f"writing Doc Tags output to {fname}")
+                conv_res.document.save_as_document_tokens(filename=fname)
         else:
             _log.warning(f"Document {conv_res.input.file} failed to convert.")
@@ -160,6 +176,13 @@ def convert(
     to_formats: List[OutputFormat] = typer.Option(
         None, "--to", help="Specify output formats. Defaults to Markdown."
     ),
+    image_export_mode: Annotated[
+        ImageRefMode,
+        typer.Option(
+            ...,
+            help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
+        ),
+    ] = ImageRefMode.EMBEDDED,
     ocr: Annotated[
         bool,
         typer.Option(
@@ -185,7 +208,7 @@ def convert(
     ] = None,
     pdf_backend: Annotated[
         PdfBackend, typer.Option(..., help="The PDF backend to use.")
-    ] = PdfBackend.DLPARSE_V1,
+    ] = PdfBackend.DLPARSE_V2,
     table_mode: Annotated[
         TableFormerMode,
         typer.Option(..., help="The mode to use in the table structure model."),
@@ -260,24 +283,45 @@ def convert(
     with tempfile.TemporaryDirectory() as tempdir:
         input_doc_paths: List[Path] = []
         for src in input_sources:
-            source = resolve_source_to_path(source=src, workdir=Path(tempdir))
-            if not source.exists():
+            try:
+                # check if we can fetch some remote url
+                source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+                input_doc_paths.append(source)
+            except FileNotFoundError:
                 err_console.print(
-                    f"[red]Error: The input file {source} does not exist.[/red]"
+                    f"[red]Error: The input file {src} does not exist.[/red]"
                 )
                 raise typer.Abort()
-            elif source.is_dir():
-                for fmt in from_formats:
-                    for ext in FormatToExtensions[fmt]:
-                        input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
-                        input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
-            else:
-                input_doc_paths.append(source)
+            except IsADirectoryError:
+                # if the input matches to a file or a folder
+                try:
+                    local_path = TypeAdapter(Path).validate_python(src)
+                    if local_path.exists() and local_path.is_dir():
+                        for fmt in from_formats:
+                            for ext in FormatToExtensions[fmt]:
+                                input_doc_paths.extend(
+                                    list(local_path.glob(f"**/*.{ext}"))
+                                )
+                                input_doc_paths.extend(
+                                    list(local_path.glob(f"**/*.{ext.upper()}"))
+                                )
+                    elif local_path.exists():
+                        input_doc_paths.append(local_path)
+                    else:
+                        err_console.print(
+                            f"[red]Error: The input file {src} does not exist.[/red]"
+                        )
+                        raise typer.Abort()
+                except Exception as err:
+                    err_console.print(f"[red]Error: Cannot read the input {src}.[/red]")
+                    _log.info(err)  # will print more details if verbose is activated
+                    raise typer.Abort()
         if to_formats is None:
             to_formats = [OutputFormat.MARKDOWN]
         export_json = OutputFormat.JSON in to_formats
+        export_html = OutputFormat.HTML in to_formats
         export_md = OutputFormat.MARKDOWN in to_formats
         export_txt = OutputFormat.TEXT in to_formats
         export_doctags = OutputFormat.DOCTAGS in to_formats
@@ -309,6 +353,13 @@ def convert(
         )
         pipeline_options.table_structure_options.mode = table_mode
+        if image_export_mode != ImageRefMode.PLACEHOLDER:
+            pipeline_options.generate_page_images = True
+            pipeline_options.generate_picture_images = (
+                True  # FIXME: to be deprecated in verson 3
+            )
+            pipeline_options.images_scale = 2
         if artifacts_path is not None:
             pipeline_options.artifacts_path = artifacts_path
@@ -321,11 +372,13 @@ def convert(
         else:
             raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
+        pdf_format_option = PdfFormatOption(
+            pipeline_options=pipeline_options,
+            backend=backend,  # pdf_backend
+        )
         format_options: Dict[InputFormat, FormatOption] = {
-            InputFormat.PDF: PdfFormatOption(
-                pipeline_options=pipeline_options,
-                backend=backend,  # pdf_backend
-            )
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
         }
         doc_converter = DocumentConverter(
             allowed_formats=from_formats,
@@ -343,9 +396,11 @@ def convert(
             conv_results,
             output_dir=output,
             export_json=export_json,
+            export_html=export_html,
             export_md=export_md,
             export_txt=export_txt,
             export_doctags=export_doctags,
+            image_export_mode=image_export_mode,
         )
         end_time = time.time() - start_time

docling/datamodel/base_models.py CHANGED Viewed

@@ -41,6 +41,7 @@ class InputFormat(str, Enum):
 class OutputFormat(str, Enum):
     MARKDOWN = "md"
     JSON = "json"
+    HTML = "html"
     TEXT = "text"
     DOCTAGS = "doctags"

docling/datamodel/document.py CHANGED Viewed

@@ -33,6 +33,7 @@ from docling_core.types.legacy_doc.document import (
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from docling_core.utils.file import resolve_source_to_stream
+from docling_core.utils.legacy import docling_document_to_legacy
 from pydantic import BaseModel
 from typing_extensions import deprecated
@@ -189,259 +190,7 @@ class ConversionResult(BaseModel):
     @property
     @deprecated("Use document instead.")
     def legacy_document(self):
-        reverse_label_mapping = {
-            DocItemLabel.CAPTION.value: "Caption",
-            DocItemLabel.FOOTNOTE.value: "Footnote",
-            DocItemLabel.FORMULA.value: "Formula",
-            DocItemLabel.LIST_ITEM.value: "List-item",
-            DocItemLabel.PAGE_FOOTER.value: "Page-footer",
-            DocItemLabel.PAGE_HEADER.value: "Page-header",
-            DocItemLabel.PICTURE.value: "Picture",  # low threshold adjust to capture chemical structures for examples.
-            DocItemLabel.SECTION_HEADER.value: "Section-header",
-            DocItemLabel.TABLE.value: "Table",
-            DocItemLabel.TEXT.value: "Text",
-            DocItemLabel.TITLE.value: "Title",
-            DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
-            DocItemLabel.CODE.value: "Code",
-            DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
-            DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
-            DocItemLabel.FORM.value: "Form",
-            DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
-            DocItemLabel.PARAGRAPH.value: "paragraph",
-        }
-        title = ""
-        desc = DsDocumentDescription(logs=[])
-        page_hashes = [
-            PageReference(
-                hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
-                page=p.page_no,
-                model="default",
-            )
-            for p in self.document.pages.values()
-        ]
-        file_info = DsFileInfoObject(
-            filename=self.input.file.name,
-            document_hash=self.input.document_hash,
-            num_pages=self.input.page_count,
-            page_hashes=page_hashes,
-        )
-        main_text = []
-        tables = []
-        figures = []
-        equations = []
-        footnotes = []
-        page_headers = []
-        page_footers = []
-        embedded_captions = set()
-        for ix, (item, level) in enumerate(
-            self.document.iterate_items(self.document.body)
-        ):
-            if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
-                caption = item.caption_text(self.document)
-                if caption:
-                    embedded_captions.add(caption)
-        for item, level in self.document.iterate_items():
-            if isinstance(item, DocItem):
-                item_type = item.label
-                if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
-                    if isinstance(item, ListItem) and item.marker:
-                        text = f"{item.marker} {item.text}"
-                    else:
-                        text = item.text
-                    # Can be empty.
-                    prov = [
-                        Prov(
-                            bbox=p.bbox.as_tuple(),
-                            page=p.page_no,
-                            span=[0, len(item.text)],
-                        )
-                        for p in item.prov
-                    ]
-                    main_text.append(
-                        BaseText(
-                            text=text,
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            name=reverse_label_mapping[item.label],
-                            prov=prov,
-                        )
-                    )
-                    # skip captions of they are embedded in the actual
-                    # floating object
-                    if item_type == DocItemLabel.CAPTION and text in embedded_captions:
-                        continue
-                elif isinstance(item, TableItem) and item.data:
-                    index = len(tables)
-                    ref_str = f"#/tables/{index}"
-                    main_text.append(
-                        Ref(
-                            name=reverse_label_mapping[item.label],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            ref=ref_str,
-                        ),
-                    )
-                    # Initialise empty table data grid (only empty cells)
-                    table_data = [
-                        [
-                            TableCell(
-                                text="",
-                                # bbox=[0,0,0,0],
-                                spans=[[i, j]],
-                                obj_type="body",
-                            )
-                            for j in range(item.data.num_cols)
-                        ]
-                        for i in range(item.data.num_rows)
-                    ]
-                    # Overwrite cells in table data for which there is actual cell content.
-                    for cell in item.data.table_cells:
-                        for i in range(
-                            min(cell.start_row_offset_idx, item.data.num_rows),
-                            min(cell.end_row_offset_idx, item.data.num_rows),
-                        ):
-                            for j in range(
-                                min(cell.start_col_offset_idx, item.data.num_cols),
-                                min(cell.end_col_offset_idx, item.data.num_cols),
-                            ):
-                                celltype = "body"
-                                if cell.column_header:
-                                    celltype = "col_header"
-                                elif cell.row_header:
-                                    celltype = "row_header"
-                                elif cell.row_section:
-                                    celltype = "row_section"
-                                def make_spans(cell):
-                                    for rspan in range(
-                                        min(
-                                            cell.start_row_offset_idx,
-                                            item.data.num_rows,
-                                        ),
-                                        min(
-                                            cell.end_row_offset_idx, item.data.num_rows
-                                        ),
-                                    ):
-                                        for cspan in range(
-                                            min(
-                                                cell.start_col_offset_idx,
-                                                item.data.num_cols,
-                                            ),
-                                            min(
-                                                cell.end_col_offset_idx,
-                                                item.data.num_cols,
-                                            ),
-                                        ):
-                                            yield [rspan, cspan]
-                                spans = list(make_spans(cell))
-                                table_data[i][j] = GlmTableCell(
-                                    text=cell.text,
-                                    bbox=(
-                                        cell.bbox.as_tuple()
-                                        if cell.bbox is not None
-                                        else None
-                                    ),  # check if this is bottom-left
-                                    spans=spans,
-                                    obj_type=celltype,
-                                    col=j,
-                                    row=i,
-                                    row_header=cell.row_header,
-                                    row_section=cell.row_section,
-                                    col_header=cell.column_header,
-                                    row_span=[
-                                        cell.start_row_offset_idx,
-                                        cell.end_row_offset_idx,
-                                    ],
-                                    col_span=[
-                                        cell.start_col_offset_idx,
-                                        cell.end_col_offset_idx,
-                                    ],
-                                )
-                    # Compute the caption
-                    caption = item.caption_text(self.document)
-                    tables.append(
-                        DsSchemaTable(
-                            text=caption,
-                            num_cols=item.data.num_cols,
-                            num_rows=item.data.num_rows,
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            data=table_data,
-                            prov=[
-                                Prov(
-                                    bbox=p.bbox.as_tuple(),
-                                    page=p.page_no,
-                                    span=[0, 0],
-                                )
-                                for p in item.prov
-                            ],
-                        )
-                    )
-                elif isinstance(item, PictureItem):
-                    index = len(figures)
-                    ref_str = f"#/figures/{index}"
-                    main_text.append(
-                        Ref(
-                            name=reverse_label_mapping[item.label],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            ref=ref_str,
-                        ),
-                    )
-                    # Compute the caption
-                    caption = item.caption_text(self.document)
-                    figures.append(
-                        Figure(
-                            prov=[
-                                Prov(
-                                    bbox=p.bbox.as_tuple(),
-                                    page=p.page_no,
-                                    span=[0, len(caption)],
-                                )
-                                for p in item.prov
-                            ],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            text=caption,
-                            # data=[[]],
-                        )
-                    )
-        page_dimensions = [
-            PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
-            for p in self.document.pages.values()
-        ]
-        ds_doc = DsDocument(
-            name=title,
-            description=desc,
-            file_info=file_info,
-            main_text=main_text,
-            equations=equations,
-            footnotes=footnotes,
-            page_headers=page_headers,
-            page_footers=page_footers,
-            tables=tables,
-            figures=figures,
-            page_dimensions=page_dimensions,
-        )
-        return ds_doc
+        return docling_document_to_legacy(self.document)
 class _DummyBackend(AbstractDocumentBackend):

docling/datamodel/pipeline_options.py CHANGED Viewed

@@ -143,7 +143,11 @@ class PdfPipelineOptions(PipelineOptions):
     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: Union[
-        EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
+        EasyOcrOptions,
+        TesseractCliOcrOptions,
+        TesseractOcrOptions,
+        OcrMacOptions,
+        RapidOcrOptions,
     ] = Field(EasyOcrOptions(), discriminator="kind")
     images_scale: float = 1.0

docling/document_converter.py CHANGED Viewed

@@ -9,7 +9,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.asciidoc_backend import AsciiDocBackend
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
@@ -84,12 +84,12 @@ class HTMLFormatOption(FormatOption):
 class PdfFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
-    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
+    backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
 class ImageFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
-    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
+    backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
 def _get_default_option(format: InputFormat) -> FormatOption:
@@ -113,10 +113,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
             pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
         ),
         InputFormat.IMAGE: FormatOption(
-            pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),
         InputFormat.PDF: FormatOption(
-            pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),
     }
     if (options := format_to_default_options.get(format)) is not None:

docling/models/ds_glm_model.py CHANGED Viewed

@@ -4,7 +4,6 @@ from pathlib import Path
 from typing import List, Union
 from deepsearch_glm.nlp_utils import init_nlp_model
-from deepsearch_glm.utils.doc_utils import to_docling_document
 from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
 from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
 from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
@@ -29,6 +28,7 @@ from pydantic import BaseModel, ConfigDict
 from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
 from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
 from docling.datamodel.settings import settings
+from docling.utils.glm_utils import to_docling_document
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.utils.utils import create_hash
@@ -232,7 +232,7 @@ class GlmModel:
     def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
         with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
             ds_doc = self._to_legacy_document(conv_res)
-            ds_doc_dict = ds_doc.model_dump(by_alias=True)
+            ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
             glm_doc = self.model.apply_on_doc(ds_doc_dict)

docling/pipeline/standard_pdf_pipeline.py CHANGED Viewed

@@ -97,7 +97,9 @@ class StandardPdfPipeline(PaginatedPipeline):
         local_dir: Optional[Path] = None, force: bool = False
     ) -> Path:
         from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+        disable_progress_bars()
         download_path = snapshot_download(
             repo_id="ds4sd/docling-models",
             force_download=force,

docling/py.typed ADDED Viewed

@@ -0,0 +1 @@
+

docling/utils/glm_utils.py ADDED Viewed

@@ -0,0 +1,336 @@
+import re
+from pathlib import Path
+from typing import List
+import pandas as pd
+from docling_core.types.doc import (
+    BoundingBox,
+    CoordOrigin,
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    ProvenanceItem,
+    Size,
+    TableCell,
+    TableData,
+)
+def resolve_item(paths, obj):
+    """Find item in document from a reference path"""
+    if len(paths) == 0:
+        return obj
+    if paths[0] == "#":
+        return resolve_item(paths[1:], obj)
+    try:
+        key = int(paths[0])
+    except:
+        key = paths[0]
+    if len(paths) == 1:
+        if isinstance(key, str) and key in obj:
+            return obj[key]
+        elif isinstance(key, int) and key < len(obj):
+            return obj[key]
+        else:
+            return None
+    elif len(paths) > 1:
+        if isinstance(key, str) and key in obj:
+            return resolve_item(paths[1:], obj[key])
+        elif isinstance(key, int) and key < len(obj):
+            return resolve_item(paths[1:], obj[key])
+        else:
+            return None
+    else:
+        return None
+def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
+    unique_objects = []
+    seen_spans = set()
+    for sublist in grid:
+        for obj in sublist:
+            # Convert the spans list to a tuple of tuples for hashing
+            spans_tuple = tuple(tuple(span) for span in obj["spans"])
+            if spans_tuple not in seen_spans:
+                seen_spans.add(spans_tuple)
+                unique_objects.append(obj)
+    return unique_objects
+def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
+    origin = DocumentOrigin(
+        mimetype="application/pdf",
+        filename=doc_glm["file-info"]["filename"],
+        binary_hash=doc_glm["file-info"]["document-hash"],
+    )
+    doc_name = Path(origin.filename).stem
+    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)
+    for page_dim in doc_glm["page-dimensions"]:
+        page_no = int(page_dim["page"])
+        size = Size(width=page_dim["width"], height=page_dim["height"])
+        doc.add_page(page_no=page_no, size=size)
+    if "properties" in doc_glm:
+        props = pd.DataFrame(
+            doc_glm["properties"]["data"], columns=doc_glm["properties"]["headers"]
+        )
+    else:
+        props = pd.DataFrame()
+    current_list = None
+    for ix, pelem in enumerate(doc_glm["page-elements"]):
+        ptype = pelem["type"]
+        span_i = pelem["span"][0]
+        span_j = pelem["span"][1]
+        if "iref" not in pelem:
+            # print(json.dumps(pelem, indent=2))
+            continue
+        iref = pelem["iref"]
+        if re.match("#/figures/(\\d+)/captions/(.+)", iref):
+            # print(f"skip {iref}")
+            continue
+        if re.match("#/tables/(\\d+)/captions/(.+)", iref):
+            # print(f"skip {iref}")
+            continue
+        path = iref.split("/")
+        obj = resolve_item(path, doc_glm)
+        if obj is None:
+            current_list = None
+            print(f"warning: undefined {path}")
+            continue
+        if ptype == "figure":
+            current_list = None
+            text = ""
+            caption_refs = []
+            for caption in obj["captions"]:
+                text += caption["text"]
+                for nprov in caption["prov"]:
+                    npaths = nprov["$ref"].split("/")
+                    nelem = resolve_item(npaths, doc_glm)
+                    if nelem is None:
+                        # print(f"warning: undefined caption {npaths}")
+                        continue
+                    span_i = nelem["span"][0]
+                    span_j = nelem["span"][1]
+                    cap_text = caption["text"][span_i:span_j]
+                    # doc_glm["page-elements"].remove(nelem)
+                    prov = ProvenanceItem(
+                        page_no=nelem["page"],
+                        charspan=tuple(nelem["span"]),
+                        bbox=BoundingBox.from_tuple(
+                            nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                        ),
+                    )
+                    caption_obj = doc.add_text(
+                        label=DocItemLabel.CAPTION, text=cap_text, prov=prov
+                    )
+                    caption_refs.append(caption_obj.get_ref())
+            prov = ProvenanceItem(
+                page_no=pelem["page"],
+                charspan=(0, len(text)),
+                bbox=BoundingBox.from_tuple(
+                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                ),
+            )
+            pic = doc.add_picture(prov=prov)
+            pic.captions.extend(caption_refs)
+            _add_child_elements(pic, doc, obj, pelem)
+        elif ptype == "table":
+            current_list = None
+            text = ""
+            caption_refs = []
+            for caption in obj["captions"]:
+                text += caption["text"]
+                for nprov in caption["prov"]:
+                    npaths = nprov["$ref"].split("/")
+                    nelem = resolve_item(npaths, doc_glm)
+                    if nelem is None:
+                        # print(f"warning: undefined caption {npaths}")
+                        continue
+                    span_i = nelem["span"][0]
+                    span_j = nelem["span"][1]
+                    cap_text = caption["text"][span_i:span_j]
+                    # doc_glm["page-elements"].remove(nelem)
+                    prov = ProvenanceItem(
+                        page_no=nelem["page"],
+                        charspan=tuple(nelem["span"]),
+                        bbox=BoundingBox.from_tuple(
+                            nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                        ),
+                    )
+                    caption_obj = doc.add_text(
+                        label=DocItemLabel.CAPTION, text=cap_text, prov=prov
+                    )
+                    caption_refs.append(caption_obj.get_ref())
+            table_cells_glm = _flatten_table_grid(obj["data"])
+            table_cells = []
+            for tbl_cell_glm in table_cells_glm:
+                if tbl_cell_glm["bbox"] is not None:
+                    bbox = BoundingBox.from_tuple(
+                        tbl_cell_glm["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                    )
+                else:
+                    bbox = None
+                is_col_header = False
+                is_row_header = False
+                is_row_section = False
+                if tbl_cell_glm["type"] == "col_header":
+                    is_col_header = True
+                elif tbl_cell_glm["type"] == "row_header":
+                    is_row_header = True
+                elif tbl_cell_glm["type"] == "row_section":
+                    is_row_section = True
+                table_cells.append(
+                    TableCell(
+                        row_span=tbl_cell_glm["row-span"][1]
+                        - tbl_cell_glm["row-span"][0],
+                        col_span=tbl_cell_glm["col-span"][1]
+                        - tbl_cell_glm["col-span"][0],
+                        start_row_offset_idx=tbl_cell_glm["row-span"][0],
+                        end_row_offset_idx=tbl_cell_glm["row-span"][1],
+                        start_col_offset_idx=tbl_cell_glm["col-span"][0],
+                        end_col_offset_idx=tbl_cell_glm["col-span"][1],
+                        text=tbl_cell_glm["text"],
+                        bbox=bbox,
+                        column_header=is_col_header,
+                        row_header=is_row_header,
+                        row_section=is_row_section,
+                    )
+                )
+            tbl_data = TableData(
+                num_rows=obj.get("#-rows", 0),
+                num_cols=obj.get("#-cols", 0),
+                table_cells=table_cells,
+            )
+            prov = ProvenanceItem(
+                page_no=pelem["page"],
+                charspan=(0, 0),
+                bbox=BoundingBox.from_tuple(
+                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                ),
+            )
+            tbl = doc.add_table(data=tbl_data, prov=prov)
+            tbl.captions.extend(caption_refs)
+        elif ptype in ["form", "key_value_region"]:
+            label = DocItemLabel(ptype)
+            container_el = doc.add_group(label=GroupLabel.UNSPECIFIED, name=label)
+            _add_child_elements(container_el, doc, obj, pelem)
+        elif "text" in obj:
+            text = obj["text"][span_i:span_j]
+            type_label = pelem["type"]
+            name_label = pelem["name"]
+            if update_name_label and len(props) > 0 and type_label == "paragraph":
+                prop = props[
+                    (props["type"] == "semantic") & (props["subj_path"] == iref)
+                ]
+                if len(prop) == 1 and prop.iloc[0]["confidence"] > 0.85:
+                    name_label = prop.iloc[0]["label"]
+            prov = ProvenanceItem(
+                page_no=pelem["page"],
+                charspan=(0, len(text)),
+                bbox=BoundingBox.from_tuple(
+                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                ),
+            )
+            label = DocItemLabel(name_label)
+            if label == DocItemLabel.LIST_ITEM:
+                if current_list is None:
+                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")
+                # TODO: Infer if this is a numbered or a bullet list item
+                doc.add_list_item(
+                    text=text, enumerated=False, prov=prov, parent=current_list
+                )
+            elif label == DocItemLabel.SECTION_HEADER:
+                current_list = None
+                doc.add_heading(text=text, prov=prov)
+            else:
+                current_list = None
+                doc.add_text(label=DocItemLabel(name_label), text=text, prov=prov)
+    return doc
+def _add_child_elements(container_el, doc, obj, pelem):
+    payload = obj.get("payload")
+    if payload is not None:
+        children = payload.get("children", [])
+        for child in children:
+            c_label = DocItemLabel(child["label"])
+            c_bbox = BoundingBox.model_validate(child["bbox"]).to_bottom_left_origin(
+                doc.pages[pelem["page"]].size.height
+            )
+            c_text = " ".join(
+                [
+                    cell["text"].replace("\x02", "-").strip()
+                    for cell in child["cells"]
+                    if len(cell["text"].strip()) > 0
+                ]
+            )
+            c_prov = ProvenanceItem(
+                page_no=pelem["page"], charspan=(0, len(c_text)), bbox=c_bbox
+            )
+            if c_label == DocItemLabel.LIST_ITEM:
+                # TODO: Infer if this is a numbered or a bullet list item
+                doc.add_list_item(parent=container_el, text=c_text, prov=c_prov)
+            elif c_label == DocItemLabel.SECTION_HEADER:
+                doc.add_heading(parent=container_el, text=c_text, prov=c_prov)
+            else:
+                doc.add_text(
+                    parent=container_el, label=c_label, text=c_text, prov=c_prov
+                )

{docling-2.8.3.dist-info → docling-2.10.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.8.3
+Version: 2.10.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -25,10 +25,10 @@ Provides-Extra: rapidocr
 Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
-Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
-Requires-Dist: docling-core (>=2.6.1,<3.0.0)
+Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
+Requires-Dist: docling-core[chunking] (>=2.9.0,<3.0.0)
 Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
-Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
+Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -39,7 +39,7 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
 Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
 Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
 Requires-Dist: pandas (>=2.1.4,<3.0.0)
-Requires-Dist: pydantic (>=2.0.0,<2.10)
+Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
 Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
 Requires-Dist: python-docx (>=1.1.2,<2.0.0)
@@ -59,7 +59,7 @@ Description-Content-Type: text/markdown
   </a>
 </p>
-# 🦆 Docling
+# Docling
 <p align="center">
   <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
@@ -81,7 +81,7 @@ Docling parses documents and exports them to the desired format with ease and sp
 ## Features
-* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
+* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
 * 📑 Advanced PDF document understanding including page layout, reading order & table structures
 * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
 * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications

{docling-2.8.3.dist-info → docling-2.10.0.dist-info}/RECORD RENAMED Viewed

@@ -2,28 +2,29 @@ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
 docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
-docling/backend/docling_parse_backend.py,sha256=csWy6ZGxDuZfNr0YTrUU40DXqelN_TJksWIYoXxZMjU,7633
-docling/backend/docling_parse_v2_backend.py,sha256=gUr9_fwHbkj238oYQPJ9AxpjFL2jGvhjBlBQPblmSAg,8589
+docling/backend/docling_parse_backend.py,sha256=_jY5f5-KGI3hi5pcZAY6e7tPLocSi5JUWrxraDVszqI,7631
+docling/backend/docling_parse_v2_backend.py,sha256=1TDUdMIp3fEjCWBNjusUHiCUmH1g6yZQ-b13scofP0Y,8637
 docling/backend/html_backend.py,sha256=qbu1W8xoTGnXMuZPRPLq68hDbCEj6ygnpxP5gYaodAQ,15593
 docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
 docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
 docling/backend/mspowerpoint_backend.py,sha256=QD0NaatTO8U9CIFoiipkq3X5HxLZaaahH8nlrQ6ecDA,15710
-docling/backend/msword_backend.py,sha256=VFHPr-gCak7w3NJToc5Cs-JaTb4Vm3a1JnnRIfJO3TI,18526
+docling/backend/msword_backend.py,sha256=K1D_h0ulLA6KQsPe62327cDVkQqV1f7EetCHo66wCKw,19233
 docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
 docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
+docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=R9ao2zCv1GZQIATOqg9b64O7AOUCWLwjJ-2FIpW8m0I,12236
+docling/cli/main.py,sha256=bLk1RG0jwM4dn6G5qa5Q-S4_N3agKnoE28pTfbpV4-k,14713
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/datamodel/base_models.py,sha256=mJ4h2haE0cOYz_eLd7QlRKU1y7u4yccMGk0tiZNICkQ,5542
-docling/datamodel/document.py,sha256=Y0NEFphwz44VxIaRaDRhtmw6rifzSC7MqyaDBzaR0lM,20902
-docling/datamodel/pipeline_options.py,sha256=K65nEZ52aRfF8hWIzl0zVvRQj-3XVwoBbxTacGS6jEg,4960
+docling/datamodel/base_models.py,sha256=627IB8HZdXGmHNfsX4Qhf7kKSxx2btPjS7z8hitvhyE,5560
+docling/datamodel/document.py,sha256=GNlTsgKgDqdqv2dfhpYmnqymxDQWWWC8HgE8uAta8V4,10265
+docling/datamodel/pipeline_options.py,sha256=zQxLVioyBrldI4V9phQma1kTTgjmFQ6d3gVj2xq51gw,5010
 docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
-docling/document_converter.py,sha256=bsXGQCUrbL2LmaqaaEmlkfSANl2XwBBx8HDLwFrqhFY,11570
+docling/document_converter.py,sha256=Iz5eerBWFPVJoXAMlXEivRQX2VLBiUkA07BL4NNbaEs,11583
 docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
 docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
-docling/models/ds_glm_model.py,sha256=hBRCx6oFGhxBbKEJlRSWVndDwFtB5IpeLOowFAVqFM0,12033
+docling/models/ds_glm_model.py,sha256=3UpFu3Oavw9p0GItx2S9R7bPDdjY2NvpUQQDSVMctys,12045
 docling/models/easyocr_model.py,sha256=c2m4x9dZpSc-cMgeEdFBRVBlB78uMGlYD8Q_2gzRuMU,3734
 docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
 docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
@@ -36,14 +37,16 @@ docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUs
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
 docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
-docling/pipeline/standard_pdf_pipeline.py,sha256=7sbkh9EwXlhSfJSgf-WyjB5jdJ1El7Pn4siSssTJpq8,8789
+docling/pipeline/standard_pdf_pipeline.py,sha256=B1q8xt3Dfecpi8s8DrcfPzdATh8TYgL43FDzzcS4vEA,8885
+docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
+docling/utils/glm_utils.py,sha256=H1O_tDiRksMgw45rY9LhK6GjcZSOq5IyoGurGjoo-Ac,11211
 docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-2.8.3.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.8.3.dist-info/METADATA,sha256=TKraAUApw0vLlToJ37cBQPNyJwoPmdWMIn73hYwq4Y8,7682
-docling-2.8.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-2.8.3.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
-docling-2.8.3.dist-info/RECORD,,
+docling-2.10.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.10.0.dist-info/METADATA,sha256=YVI-dBKxqAxrLATigzeXPZvwDZUhLSl_doltc-HenQ4,7731
+docling-2.10.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-2.10.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
+docling-2.10.0.dist-info/RECORD,,

{docling-2.8.3.dist-info → docling-2.10.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling-2.8.3.dist-info → docling-2.10.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling-2.8.3.dist-info → docling-2.10.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

docling 2.8.3__py3-none-any.whl → 2.10.0__py3-none-any.whl

Potentially problematic release.

docling 2.8.3__py3-none-any.whl → 2.10.0__py3-none-any.whl