PyPI - docling - Versions diffs - 1.15.0__py3-none-any.whl → 1.16.0__py3-none-any.whl - Mend

docling 1.15.0__py3-none-any.whl → 1.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

docling/cli/main.py +2 -1
docling/datamodel/base_models.py +4 -16
docling/datamodel/document.py +1 -1
docling/datamodel/pipeline_options.py +25 -0
docling/document_converter.py +1 -1
docling/models/table_structure_model.py +8 -1
docling/pipeline/base_model_pipeline.py +2 -1
docling/pipeline/standard_model_pipeline.py +2 -1
{docling-1.15.0.dist-info → docling-1.16.0.dist-info}/METADATA +17 -9
{docling-1.15.0.dist-info → docling-1.16.0.dist-info}/RECORD +13 -12
{docling-1.15.0.dist-info → docling-1.16.0.dist-info}/LICENSE +0 -0
{docling-1.15.0.dist-info → docling-1.16.0.dist-info}/WHEEL +0 -0
{docling-1.15.0.dist-info → docling-1.16.0.dist-info}/entry_points.txt +0 -0

docling/cli/main.py CHANGED Viewed

@@ -12,8 +12,9 @@ from docling_core.utils.file import resolve_file_source
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.document_converter import DocumentConverter
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")

docling/datamodel/base_models.py CHANGED Viewed

@@ -9,6 +9,10 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
 from typing_extensions import Self
 from docling.backend.abstract_backend import PdfPageBackend
+from docling.datamodel.pipeline_options import (  # Must be imported here for backward compatibility.
+    PipelineOptions,
+    TableStructureOptions,
+)
 class ConversionStatus(str, Enum):
@@ -298,22 +302,6 @@ class DocumentStream(BaseModel):
     stream: BytesIO
-class TableStructureOptions(BaseModel):
-    do_cell_matching: bool = (
-        True
-        # True:  Matches predictions back to PDF cells. Can break table output if PDF cells
-        #        are merged across table columns.
-        # False: Let table structure model define the text cells, ignore PDF cells.
-    )
-class PipelineOptions(BaseModel):
-    do_table_structure: bool = True  # True: perform table structure extraction
-    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
-    table_structure_options: TableStructureOptions = TableStructureOptions()
 class AssembleOptions(BaseModel):
     keep_page_images: Annotated[
         bool,

docling/datamodel/document.py CHANGED Viewed

@@ -4,13 +4,13 @@ from pathlib import Path, PurePath
 from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
 from docling_core.types import BaseCell, BaseText
-from docling_core.types import BoundingBox as DsBoundingBox
 from docling_core.types import Document as DsDocument
 from docling_core.types import DocumentDescription as DsDocumentDescription
 from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
 from docling_core.types import TableCell
+from docling_core.types.doc.base import BoundingBox as DsBoundingBox
 from docling_core.types.doc.base import Figure
 from pydantic import BaseModel
 from typing_extensions import deprecated

docling/datamodel/pipeline_options.py ADDED Viewed

@@ -0,0 +1,25 @@
+from enum import Enum, auto
+from pydantic import BaseModel
+class TableFormerMode(str, Enum):
+    FAST = auto()
+    ACCURATE = auto()
+class TableStructureOptions(BaseModel):
+    do_cell_matching: bool = (
+        True
+        # True:  Matches predictions back to PDF cells. Can break table output if PDF cells
+        #        are merged across table columns.
+        # False: Let table structure model define the text cells, ignore PDF cells.
+    )
+    mode: TableFormerMode = TableFormerMode.FAST
+class PipelineOptions(BaseModel):
+    do_table_structure: bool = True  # True: perform table structure extraction
+    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
+    table_structure_options: TableStructureOptions = TableStructureOptions()

docling/document_converter.py CHANGED Viewed

@@ -18,13 +18,13 @@ from docling.datamodel.base_models import (
     DoclingComponentType,
     ErrorItem,
     Page,
-    PipelineOptions,
 )
 from docling.datamodel.document import (
     ConversionResult,
     DocumentConversionInput,
     InputDocument,
 )
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
 from docling.models.ds_glm_model import GlmModel
 from docling.models.page_assemble_model import PageAssembleModel

docling/models/table_structure_model.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import copy
+from pathlib import Path
 from typing import Iterable, List
 import numpy
@@ -12,16 +13,22 @@ from docling.datamodel.base_models import (
     TableElement,
     TableStructurePrediction,
 )
+from docling.datamodel.pipeline_options import TableFormerMode
 class TableStructureModel:
     def __init__(self, config):
         self.config = config
         self.do_cell_matching = config["do_cell_matching"]
+        self.mode = config["mode"]
         self.enabled = config["enabled"]
         if self.enabled:
-            artifacts_path = config["artifacts_path"]
+            artifacts_path: Path = config["artifacts_path"]
+            if self.mode == TableFormerMode.ACCURATE:
+                artifacts_path = artifacts_path / "fat"
             # Third Party
             import docling_ibm_models.tableformer.common as c

docling/pipeline/base_model_pipeline.py CHANGED Viewed

@@ -1,7 +1,8 @@
 from pathlib import Path
 from typing import Callable, Iterable, List
-from docling.datamodel.base_models import Page, PipelineOptions
+from docling.datamodel.base_models import Page
+from docling.datamodel.pipeline_options import PipelineOptions
 class BaseModelPipeline:

docling/pipeline/standard_model_pipeline.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from pathlib import Path
-from docling.datamodel.base_models import PipelineOptions
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
 from docling.models.table_structure_model import TableStructureModel
@@ -32,6 +32,7 @@ class StandardModelPipeline(BaseModelPipeline):
                     "artifacts_path": artifacts_path
                     / StandardModelPipeline._table_model_path,
                     "enabled": pipeline_options.do_table_structure,
+                    "mode": pipeline_options.table_structure_options.mode,
                     "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
                 }
             ),

{docling-1.15.0.dist-info → docling-1.16.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.15.0
+Version: 1.16.0
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -19,7 +19,6 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Provides-Extra: examples
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=0.21.1,<0.22.0)
 Requires-Dist: docling-core (>=1.6.2,<2.0.0)
@@ -28,17 +27,10 @@ Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
-Requires-Dist: langchain-huggingface (>=0.0.3,<0.0.4) ; extra == "examples"
-Requires-Dist: langchain-milvus (>=0.1.4,<0.2.0) ; extra == "examples"
-Requires-Dist: langchain-text-splitters (>=0.2.4,<0.3.0) ; extra == "examples"
-Requires-Dist: llama-index-embeddings-huggingface (>=0.3.1,<0.4.0) ; extra == "examples"
-Requires-Dist: llama-index-llms-huggingface-api (>=0.2.0,<0.3.0) ; extra == "examples"
-Requires-Dist: llama-index-vector-stores-milvus (>=0.2.1,<0.3.0) ; extra == "examples"
 Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
 Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
-Requires-Dist: python-dotenv (>=1.0.1,<2.0.0) ; extra == "examples"
 Requires-Dist: requests (>=2.32.3,<3.0.0)
 Requires-Dist: rtree (>=1.3.0,<2.0.0)
 Requires-Dist: scipy (>=1.14.1,<2.0.0)
@@ -211,6 +203,8 @@ This can improve output quality if you find that multiple columns in extracted t
 ```python
+from docling.datamodel.pipeline_options import PipelineOptions
 pipeline_options = PipelineOptions(do_table_structure=True)
 pipeline_options.table_structure_options.do_cell_matching = False  # uses text cells predicted from table structure model
@@ -220,6 +214,20 @@ doc_converter = DocumentConverter(
 )
 ```
+Since docling 1.16.0: You can control which TableFormer mode you want to use. Choose between `TableFormerMode.FAST` (default) and `TableFormerMode.ACCURATE` (better, but slower) to receive better quality with difficult table structures.
+```python
+from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
+pipeline_options = PipelineOptions(do_table_structure=True)
+pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE  # use more accurate TableFormer model
+doc_converter = DocumentConverter(
+    artifacts_path=artifacts_path,
+    pipeline_options=pipeline_options,
+)
+```
 ### Impose limits on the document size
 You can limit the file size and number of pages which should be allowed to process per document:

{docling-1.15.0.dist-info → docling-1.16.0.dist-info}/RECORD RENAMED Viewed

@@ -4,28 +4,29 @@ docling/backend/abstract_backend.py,sha256=clJtGxLedpLriEhpx7oyxjmlwMLPorkv-1tdf
 docling/backend/docling_parse_backend.py,sha256=RUWWZbx2cUotZeeTkc-Lbg2k8MVFXFxaDjM4sPfaFZE,7475
 docling/backend/pypdfium2_backend.py,sha256=bIIImVM73wmcVcKMqjl4JF8CD-Qj2W5rZbI4G7clU4s,8877
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=gJBxgZIGza0UBUAPP8pVFp_Ma3rzB9CCw-w3Bs5wieE,7121
+docling/cli/main.py,sha256=dgzaRJib5jlDbV1JfIRRYRSCT-ZiJkRwsx7KjlSVbUU,7167
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/datamodel/base_models.py,sha256=tE2Sxoe3e_fBZjq3GDo2NCughDMU5xDeAfkQgT72TRI,9168
-docling/datamodel/document.py,sha256=hzWObTCtPPU7tvMr5FRKAT-7JGK4lGoOJuAHyULYuxc,16186
+docling/datamodel/base_models.py,sha256=b2FXRhfHrJiBGr5OdlSJ1ssUzvU2rVjelfY5_0YoK8w,8732
+docling/datamodel/document.py,sha256=8iHylLaty-ZVYGzYmP61dZPtduzEyiRM79Tx3gS9UEU,16195
+docling/datamodel/pipeline_options.py,sha256=SuJzaLHTR_-ZZz8KB0wwYRxULTcnfQESiilOGCY0fpc,773
 docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
-docling/document_converter.py,sha256=5OiNafoaVcQhZ8ATF69xRp2KyFyKeSMhmwEFUoCzP-k,10980
+docling/document_converter.py,sha256=bk345OKkHbBngc3QS6BevT8068yEt1cS-PtAH3k_tCo,11022
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
 docling/models/ds_glm_model.py,sha256=VXGmj8cW0WKMz1He4tp1lZhXHkS8Z39U1G-ujkc7deU,3368
 docling/models/easyocr_model.py,sha256=ABIqALvtNNrDQ47fXaZ0lDFhOwKsYGUUlAPnIsFZgZA,2232
 docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
 docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
-docling/models/table_structure_model.py,sha256=0wOeiRoma6et7FtoJZw2SA3wBd9-R9ivp5uvXBQqeM4,5768
+docling/models/table_structure_model.py,sha256=iHJjWdKCpTcH3l_ElMWnC5pt6tkUpIuByed304Fdq9w,6009
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/pipeline/base_model_pipeline.py,sha256=H5XoADpsJEZls8BI3FnppR2ubltkQwf_er4Qr74rdQ8,561
-docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjgvo_Fl2dfBVnRQs,1442
+docling/pipeline/base_model_pipeline.py,sha256=rrMog3EuiR5Gx9OWtfMj24rQvHCrWkxZ3g9OIr7LPSQ,607
+docling/pipeline/standard_model_pipeline.py,sha256=9HdUq9TjNOsE9ixA_MpWNmyQUaTg7SS-aVE0zP7Ujys,1522
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/export.py,sha256=bKLdbeUcR-rQsGPV1IqJkCHKMCv7X2QOHyxmjNuH3HE,4655
 docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-1.15.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-1.15.0.dist-info/METADATA,sha256=rPvnvD2kQvVibj_Iwf3U6LGjxDaX1Bm8p9dXBuNWPcY,13208
-docling-1.15.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-1.15.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
-docling-1.15.0.dist-info/RECORD,,
+docling-1.16.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-1.16.0.dist-info/METADATA,sha256=AESUBe1Ea2pC0vJvvNwLW7l_z26k3iriJiicLFU3U7E,13308
+docling-1.16.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-1.16.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
+docling-1.16.0.dist-info/RECORD,,

{docling-1.15.0.dist-info → docling-1.16.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling-1.15.0.dist-info → docling-1.16.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling-1.15.0.dist-info → docling-1.16.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

docling 1.15.0__py3-none-any.whl → 1.16.0__py3-none-any.whl

docling 1.15.0__py3-none-any.whl → 1.16.0__py3-none-any.whl