PyPI - docling - Versions diffs - 1.9.0__tar.gz → 1.11.0__tar.gz - Mend

docling 1.9.0.tar.gz → 1.11.0.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

{docling-1.9.0 → docling-1.11.0}/LICENSE RENAMED Viewed

@@ -1,6 +1,6 @@
 MIT License
-Copyright (c) [year] [fullname]
+Copyright (c) 2024 International Business Machines
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

{docling-1.9.0 → docling-1.11.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.9.0
+Version: 1.11.0
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -20,14 +20,14 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Dist: certifi (>=2024.7.4)
-Requires-Dist: deepsearch-glm (>=0.19.1,<0.20.0)
-Requires-Dist: docling-core (>=1.1.3,<2.0.0)
-Requires-Dist: docling-ibm-models (>=1.1.3,<2.0.0)
-Requires-Dist: docling-parse (>=1.1.3,<2.0.0)
+Requires-Dist: deepsearch-glm (>=0.21.0,<0.22.0)
+Requires-Dist: docling-core (>=1.2.0,<2.0.0)
+Requires-Dist: docling-ibm-models (>=1.1.7,<2.0.0)
+Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
-Requires-Dist: pyarrow (>=17.0.0,<18.0.0)
+Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
 Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
@@ -63,7 +63,7 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
 * 📝 Extracts metadata from the document, such as title, authors, references and language
 * 🔍 Optionally applies OCR (use with scanned PDFs)
-Doing RAG or Q/A? Also consider [Quackling](https://github.com/DS4SD/quackling) to get the most out of your documents.
+For RAG, check out [Quackling](https://github.com/DS4SD/quackling) to get the most out of your docs, be it using LlamaIndex, LangChain or your pipeline.
 ## Installation
@@ -183,6 +183,10 @@ results = doc_converter.convert(conv_input)
 You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
+## Technical report
+For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
 ## Contributing
 Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
@@ -196,10 +200,10 @@ If you use Docling in your projects, please consider citing the following:
 @techreport{Docling,
   author = {Deep Search Team},
   month = {8},
-  title = {{Docling Technical Report}},
-  url={https://arxiv.org/abs/2408.09869},
-  eprint={2408.09869},
-  doi = "10.48550/arXiv.2408.09869",
+  title = {Docling Technical Report},
+  url = {https://arxiv.org/abs/2408.09869},
+  eprint = {2408.09869},
+  doi = {10.48550/arXiv.2408.09869},
   version = {1.0.0},
   year = {2024}
 }

{docling-1.9.0 → docling-1.11.0}/README.md RENAMED Viewed

@@ -24,7 +24,7 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
 * 📝 Extracts metadata from the document, such as title, authors, references and language
 * 🔍 Optionally applies OCR (use with scanned PDFs)
-Doing RAG or Q/A? Also consider [Quackling](https://github.com/DS4SD/quackling) to get the most out of your documents.
+For RAG, check out [Quackling](https://github.com/DS4SD/quackling) to get the most out of your docs, be it using LlamaIndex, LangChain or your pipeline.
 ## Installation
@@ -144,6 +144,10 @@ results = doc_converter.convert(conv_input)
 You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
+## Technical report
+For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
 ## Contributing
 Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
@@ -157,10 +161,10 @@ If you use Docling in your projects, please consider citing the following:
 @techreport{Docling,
   author = {Deep Search Team},
   month = {8},
-  title = {{Docling Technical Report}},
-  url={https://arxiv.org/abs/2408.09869},
-  eprint={2408.09869},
-  doi = "10.48550/arXiv.2408.09869",
+  title = {Docling Technical Report},
+  url = {https://arxiv.org/abs/2408.09869},
+  eprint = {2408.09869},
+  doi = {10.48550/arXiv.2408.09869},
   version = {1.0.0},
   year = {2024}
 }

{docling-1.9.0 → docling-1.11.0}/docling/datamodel/document.py RENAMED Viewed

@@ -11,6 +11,7 @@ from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
 from docling_core.types import TableCell
+from docling_core.types.doc.base import Figure
 from pydantic import BaseModel
 from typing_extensions import deprecated
@@ -279,7 +280,7 @@ class ConvertedDocument(BaseModel):
                     ),
                 )
                 figures.append(
-                    BaseCell(
+                    Figure(
                         prov=[
                             Prov(
                                 bbox=target_bbox,
@@ -312,8 +313,76 @@ class ConvertedDocument(BaseModel):
     def render_as_dict(self):
         return self.output.model_dump(by_alias=True, exclude_none=True)
-    def render_as_markdown(self):
-        return self.output.export_to_markdown()
+    def render_as_markdown(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+        ],
+        strict_text: bool = False,
+    ):
+        return self.output.export_to_markdown(
+            delim=delim,
+            main_text_start=main_text_start,
+            main_text_stop=main_text_stop,
+            main_text_labels=main_text_labels,
+            strict_text=strict_text,
+        )
+    def render_as_text(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+        ],
+    ):
+        return self.output.export_to_markdown(
+            delim=delim,
+            main_text_start=main_text_start,
+            main_text_stop=main_text_stop,
+            main_text_labels=main_text_labels,
+            strict_text=True,
+        )
+    def render_as_doctags(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+            "figure",
+        ],
+        page_tagging: bool = True,
+        location_tagging: bool = True,
+        location_dimensions: Tuple[int, int] = (100, 100),
+        add_new_line: bool = True,
+    ) -> str:
+        return self.output.export_to_document_tokens(
+            delim=delim,
+            main_text_start=main_text_start,
+            main_text_stop=main_text_stop,
+            main_text_labels=main_text_labels,
+            page_tagging=page_tagging,
+            location_tagging=location_tagging,
+            location_dimensions=location_dimensions,
+            add_new_line=add_new_line,
+        )
     def render_element_images(
         self, element_types: Tuple[PageElement] = (FigureElement,)

{docling-1.9.0 → docling-1.11.0}/docling/models/ds_glm_model.py RENAMED Viewed

@@ -2,7 +2,7 @@ import copy
 import random
 from deepsearch_glm.nlp_utils import init_nlp_model
-from deepsearch_glm.utils.ds_utils import to_legacy_document_format
+from deepsearch_glm.utils.doc_utils import to_legacy_document_format
 from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
 from docling_core.types import BaseText
 from docling_core.types import Document as DsDocument

{docling-1.9.0 → docling-1.11.0}/docling/utils/export.py RENAMED Viewed

@@ -163,8 +163,12 @@ def generate_multimodal_pages(
         content_md = doc.export_to_markdown(
             main_text_start=start_ix, main_text_stop=end_ix
         )
+        # No page-tagging since we only do 1 page at the time
+        content_dt = doc.export_to_document_tokens(
+            main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
+        )
-        return content_text, content_md, page_cells, page_segments, page
+        return content_text, content_md, content_dt, page_cells, page_segments, page
     for ix, orig_item in enumerate(doc.main_text):

{docling-1.9.0 → docling-1.11.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "1.9.0"  # DO NOT EDIT, updated automatically
+version = "1.11.0"  # DO NOT EDIT, updated automatically
 description = "Docling PDF conversion package"
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@@ -23,20 +23,20 @@ packages = [{include = "docling"}]
 [tool.poetry.dependencies]
 python = "^3.10"
 pydantic = "^2.0.0"
-docling-core = "^1.1.3"
-docling-ibm-models = "^1.1.3"
-deepsearch-glm = "^0.19.1"
+docling-core = "^1.2.0"
+docling-ibm-models = "^1.1.7"
+deepsearch-glm = "^0.21.0"
 filetype = "^1.2.0"
 pypdfium2 = "^4.30.0"
 pydantic-settings = "^2.3.0"
 huggingface_hub = ">=0.23,<1"
 requests = "^2.32.3"
 easyocr = "^1.7"
-docling-parse = "^1.1.3"
+docling-parse = "^1.2.0"
 certifi = ">=2024.7.4"
 rtree = "^1.3.0"
 scipy = "^1.14.1"
-pyarrow = "^17.0.0"
+pyarrow = "^16.1.0"
 [tool.poetry.group.dev.dependencies]
 black = {extras = ["jupyter"], version = "^24.4.2"}