docling 1.15.0__py3-none-any.whl → 1.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/cli/main.py CHANGED
@@ -12,8 +12,9 @@ from docling_core.utils.file import resolve_file_source
12
12
 
13
13
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
14
14
  from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
15
- from docling.datamodel.base_models import ConversionStatus, PipelineOptions
15
+ from docling.datamodel.base_models import ConversionStatus
16
16
  from docling.datamodel.document import ConversionResult, DocumentConversionInput
17
+ from docling.datamodel.pipeline_options import PipelineOptions
17
18
  from docling.document_converter import DocumentConverter
18
19
 
19
20
  warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -9,6 +9,10 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
9
9
  from typing_extensions import Self
10
10
 
11
11
  from docling.backend.abstract_backend import PdfPageBackend
12
+ from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
13
+ PipelineOptions,
14
+ TableStructureOptions,
15
+ )
12
16
 
13
17
 
14
18
  class ConversionStatus(str, Enum):
@@ -298,22 +302,6 @@ class DocumentStream(BaseModel):
298
302
  stream: BytesIO
299
303
 
300
304
 
301
- class TableStructureOptions(BaseModel):
302
- do_cell_matching: bool = (
303
- True
304
- # True: Matches predictions back to PDF cells. Can break table output if PDF cells
305
- # are merged across table columns.
306
- # False: Let table structure model define the text cells, ignore PDF cells.
307
- )
308
-
309
-
310
- class PipelineOptions(BaseModel):
311
- do_table_structure: bool = True # True: perform table structure extraction
312
- do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
313
-
314
- table_structure_options: TableStructureOptions = TableStructureOptions()
315
-
316
-
317
305
  class AssembleOptions(BaseModel):
318
306
  keep_page_images: Annotated[
319
307
  bool,
@@ -4,13 +4,13 @@ from pathlib import Path, PurePath
4
4
  from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
5
5
 
6
6
  from docling_core.types import BaseCell, BaseText
7
- from docling_core.types import BoundingBox as DsBoundingBox
8
7
  from docling_core.types import Document as DsDocument
9
8
  from docling_core.types import DocumentDescription as DsDocumentDescription
10
9
  from docling_core.types import FileInfoObject as DsFileInfoObject
11
10
  from docling_core.types import PageDimensions, PageReference, Prov, Ref
12
11
  from docling_core.types import Table as DsSchemaTable
13
12
  from docling_core.types import TableCell
13
+ from docling_core.types.doc.base import BoundingBox as DsBoundingBox
14
14
  from docling_core.types.doc.base import Figure
15
15
  from pydantic import BaseModel
16
16
  from typing_extensions import deprecated
@@ -0,0 +1,25 @@
1
+ from enum import Enum, auto
2
+
3
+ from pydantic import BaseModel
4
+
5
+
6
+ class TableFormerMode(str, Enum):
7
+ FAST = auto()
8
+ ACCURATE = auto()
9
+
10
+
11
+ class TableStructureOptions(BaseModel):
12
+ do_cell_matching: bool = (
13
+ True
14
+ # True: Matches predictions back to PDF cells. Can break table output if PDF cells
15
+ # are merged across table columns.
16
+ # False: Let table structure model define the text cells, ignore PDF cells.
17
+ )
18
+ mode: TableFormerMode = TableFormerMode.FAST
19
+
20
+
21
+ class PipelineOptions(BaseModel):
22
+ do_table_structure: bool = True # True: perform table structure extraction
23
+ do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
24
+
25
+ table_structure_options: TableStructureOptions = TableStructureOptions()
@@ -18,13 +18,13 @@ from docling.datamodel.base_models import (
18
18
  DoclingComponentType,
19
19
  ErrorItem,
20
20
  Page,
21
- PipelineOptions,
22
21
  )
23
22
  from docling.datamodel.document import (
24
23
  ConversionResult,
25
24
  DocumentConversionInput,
26
25
  InputDocument,
27
26
  )
27
+ from docling.datamodel.pipeline_options import PipelineOptions
28
28
  from docling.datamodel.settings import settings
29
29
  from docling.models.ds_glm_model import GlmModel
30
30
  from docling.models.page_assemble_model import PageAssembleModel
@@ -1,4 +1,5 @@
1
1
  import copy
2
+ from pathlib import Path
2
3
  from typing import Iterable, List
3
4
 
4
5
  import numpy
@@ -12,16 +13,22 @@ from docling.datamodel.base_models import (
12
13
  TableElement,
13
14
  TableStructurePrediction,
14
15
  )
16
+ from docling.datamodel.pipeline_options import TableFormerMode
15
17
 
16
18
 
17
19
  class TableStructureModel:
18
20
  def __init__(self, config):
19
21
  self.config = config
20
22
  self.do_cell_matching = config["do_cell_matching"]
23
+ self.mode = config["mode"]
21
24
 
22
25
  self.enabled = config["enabled"]
23
26
  if self.enabled:
24
- artifacts_path = config["artifacts_path"]
27
+ artifacts_path: Path = config["artifacts_path"]
28
+
29
+ if self.mode == TableFormerMode.ACCURATE:
30
+ artifacts_path = artifacts_path / "fat"
31
+
25
32
  # Third Party
26
33
  import docling_ibm_models.tableformer.common as c
27
34
 
@@ -1,7 +1,8 @@
1
1
  from pathlib import Path
2
2
  from typing import Callable, Iterable, List
3
3
 
4
- from docling.datamodel.base_models import Page, PipelineOptions
4
+ from docling.datamodel.base_models import Page
5
+ from docling.datamodel.pipeline_options import PipelineOptions
5
6
 
6
7
 
7
8
  class BaseModelPipeline:
@@ -1,6 +1,6 @@
1
1
  from pathlib import Path
2
2
 
3
- from docling.datamodel.base_models import PipelineOptions
3
+ from docling.datamodel.pipeline_options import PipelineOptions
4
4
  from docling.models.easyocr_model import EasyOcrModel
5
5
  from docling.models.layout_model import LayoutModel
6
6
  from docling.models.table_structure_model import TableStructureModel
@@ -32,6 +32,7 @@ class StandardModelPipeline(BaseModelPipeline):
32
32
  "artifacts_path": artifacts_path
33
33
  / StandardModelPipeline._table_model_path,
34
34
  "enabled": pipeline_options.do_table_structure,
35
+ "mode": pipeline_options.table_structure_options.mode,
35
36
  "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
36
37
  }
37
38
  ),
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.15.0
3
+ Version: 1.16.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -19,7 +19,6 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3.11
20
20
  Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
- Provides-Extra: examples
23
22
  Requires-Dist: certifi (>=2024.7.4)
24
23
  Requires-Dist: deepsearch-glm (>=0.21.1,<0.22.0)
25
24
  Requires-Dist: docling-core (>=1.6.2,<2.0.0)
@@ -28,17 +27,10 @@ Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
28
27
  Requires-Dist: easyocr (>=1.7,<2.0)
29
28
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
30
29
  Requires-Dist: huggingface_hub (>=0.23,<1)
31
- Requires-Dist: langchain-huggingface (>=0.0.3,<0.0.4) ; extra == "examples"
32
- Requires-Dist: langchain-milvus (>=0.1.4,<0.2.0) ; extra == "examples"
33
- Requires-Dist: langchain-text-splitters (>=0.2.4,<0.3.0) ; extra == "examples"
34
- Requires-Dist: llama-index-embeddings-huggingface (>=0.3.1,<0.4.0) ; extra == "examples"
35
- Requires-Dist: llama-index-llms-huggingface-api (>=0.2.0,<0.3.0) ; extra == "examples"
36
- Requires-Dist: llama-index-vector-stores-milvus (>=0.2.1,<0.3.0) ; extra == "examples"
37
30
  Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
38
31
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
39
32
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
40
33
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
41
- Requires-Dist: python-dotenv (>=1.0.1,<2.0.0) ; extra == "examples"
42
34
  Requires-Dist: requests (>=2.32.3,<3.0.0)
43
35
  Requires-Dist: rtree (>=1.3.0,<2.0.0)
44
36
  Requires-Dist: scipy (>=1.14.1,<2.0.0)
@@ -211,6 +203,8 @@ This can improve output quality if you find that multiple columns in extracted t
211
203
 
212
204
 
213
205
  ```python
206
+ from docling.datamodel.pipeline_options import PipelineOptions
207
+
214
208
  pipeline_options = PipelineOptions(do_table_structure=True)
215
209
  pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
216
210
 
@@ -220,6 +214,20 @@ doc_converter = DocumentConverter(
220
214
  )
221
215
  ```
222
216
 
217
+ Since docling 1.16.0, you can choose which TableFormer mode to use: `TableFormerMode.FAST` (the default) or `TableFormerMode.ACCURATE`, which is slower but produces better results on difficult table structures.
218
+
219
+ ```python
220
+ from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
221
+
222
+ pipeline_options = PipelineOptions(do_table_structure=True)
223
+ pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model
224
+
225
+ doc_converter = DocumentConverter(
226
+ artifacts_path=artifacts_path,
227
+ pipeline_options=pipeline_options,
228
+ )
229
+ ```
230
+
223
231
  ### Impose limits on the document size
224
232
 
225
233
  You can limit the file size and number of pages which should be allowed to process per document:
@@ -4,28 +4,29 @@ docling/backend/abstract_backend.py,sha256=clJtGxLedpLriEhpx7oyxjmlwMLPorkv-1tdf
4
4
  docling/backend/docling_parse_backend.py,sha256=RUWWZbx2cUotZeeTkc-Lbg2k8MVFXFxaDjM4sPfaFZE,7475
5
5
  docling/backend/pypdfium2_backend.py,sha256=bIIImVM73wmcVcKMqjl4JF8CD-Qj2W5rZbI4G7clU4s,8877
6
6
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- docling/cli/main.py,sha256=gJBxgZIGza0UBUAPP8pVFp_Ma3rzB9CCw-w3Bs5wieE,7121
7
+ docling/cli/main.py,sha256=dgzaRJib5jlDbV1JfIRRYRSCT-ZiJkRwsx7KjlSVbUU,7167
8
8
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- docling/datamodel/base_models.py,sha256=tE2Sxoe3e_fBZjq3GDo2NCughDMU5xDeAfkQgT72TRI,9168
10
- docling/datamodel/document.py,sha256=hzWObTCtPPU7tvMr5FRKAT-7JGK4lGoOJuAHyULYuxc,16186
9
+ docling/datamodel/base_models.py,sha256=b2FXRhfHrJiBGr5OdlSJ1ssUzvU2rVjelfY5_0YoK8w,8732
10
+ docling/datamodel/document.py,sha256=8iHylLaty-ZVYGzYmP61dZPtduzEyiRM79Tx3gS9UEU,16195
11
+ docling/datamodel/pipeline_options.py,sha256=SuJzaLHTR_-ZZz8KB0wwYRxULTcnfQESiilOGCY0fpc,773
11
12
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
12
- docling/document_converter.py,sha256=5OiNafoaVcQhZ8ATF69xRp2KyFyKeSMhmwEFUoCzP-k,10980
13
+ docling/document_converter.py,sha256=bk345OKkHbBngc3QS6BevT8068yEt1cS-PtAH3k_tCo,11022
13
14
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
15
  docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
15
16
  docling/models/ds_glm_model.py,sha256=VXGmj8cW0WKMz1He4tp1lZhXHkS8Z39U1G-ujkc7deU,3368
16
17
  docling/models/easyocr_model.py,sha256=ABIqALvtNNrDQ47fXaZ0lDFhOwKsYGUUlAPnIsFZgZA,2232
17
18
  docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
18
19
  docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
19
- docling/models/table_structure_model.py,sha256=0wOeiRoma6et7FtoJZw2SA3wBd9-R9ivp5uvXBQqeM4,5768
20
+ docling/models/table_structure_model.py,sha256=iHJjWdKCpTcH3l_ElMWnC5pt6tkUpIuByed304Fdq9w,6009
20
21
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
- docling/pipeline/base_model_pipeline.py,sha256=H5XoADpsJEZls8BI3FnppR2ubltkQwf_er4Qr74rdQ8,561
22
- docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjgvo_Fl2dfBVnRQs,1442
22
+ docling/pipeline/base_model_pipeline.py,sha256=rrMog3EuiR5Gx9OWtfMj24rQvHCrWkxZ3g9OIr7LPSQ,607
23
+ docling/pipeline/standard_model_pipeline.py,sha256=9HdUq9TjNOsE9ixA_MpWNmyQUaTg7SS-aVE0zP7Ujys,1522
23
24
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
25
  docling/utils/export.py,sha256=bKLdbeUcR-rQsGPV1IqJkCHKMCv7X2QOHyxmjNuH3HE,4655
25
26
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
26
27
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
27
- docling-1.15.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
28
- docling-1.15.0.dist-info/METADATA,sha256=rPvnvD2kQvVibj_Iwf3U6LGjxDaX1Bm8p9dXBuNWPcY,13208
29
- docling-1.15.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
30
- docling-1.15.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
31
- docling-1.15.0.dist-info/RECORD,,
28
+ docling-1.16.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
29
+ docling-1.16.0.dist-info/METADATA,sha256=AESUBe1Ea2pC0vJvvNwLW7l_z26k3iriJiicLFU3U7E,13308
30
+ docling-1.16.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
31
+ docling-1.16.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
32
+ docling-1.16.0.dist-info/RECORD,,