docling 1.15.0__py3-none-any.whl → 1.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/cli/main.py +2 -1
- docling/datamodel/base_models.py +4 -16
- docling/datamodel/document.py +1 -1
- docling/datamodel/pipeline_options.py +25 -0
- docling/document_converter.py +1 -1
- docling/models/table_structure_model.py +8 -1
- docling/pipeline/base_model_pipeline.py +2 -1
- docling/pipeline/standard_model_pipeline.py +2 -1
- {docling-1.15.0.dist-info → docling-1.16.0.dist-info}/METADATA +17 -9
- {docling-1.15.0.dist-info → docling-1.16.0.dist-info}/RECORD +13 -12
- {docling-1.15.0.dist-info → docling-1.16.0.dist-info}/LICENSE +0 -0
- {docling-1.15.0.dist-info → docling-1.16.0.dist-info}/WHEEL +0 -0
- {docling-1.15.0.dist-info → docling-1.16.0.dist-info}/entry_points.txt +0 -0
docling/cli/main.py
CHANGED
@@ -12,8 +12,9 @@ from docling_core.utils.file import resolve_file_source
|
|
12
12
|
|
13
13
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
14
14
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
15
|
-
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
15
|
+
from docling.datamodel.base_models import ConversionStatus
|
16
16
|
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
17
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
17
18
|
from docling.document_converter import DocumentConverter
|
18
19
|
|
19
20
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
docling/datamodel/base_models.py
CHANGED
@@ -9,6 +9,10 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
9
9
|
from typing_extensions import Self
|
10
10
|
|
11
11
|
from docling.backend.abstract_backend import PdfPageBackend
|
12
|
+
from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
|
13
|
+
PipelineOptions,
|
14
|
+
TableStructureOptions,
|
15
|
+
)
|
12
16
|
|
13
17
|
|
14
18
|
class ConversionStatus(str, Enum):
|
@@ -298,22 +302,6 @@ class DocumentStream(BaseModel):
|
|
298
302
|
stream: BytesIO
|
299
303
|
|
300
304
|
|
301
|
-
class TableStructureOptions(BaseModel):
|
302
|
-
do_cell_matching: bool = (
|
303
|
-
True
|
304
|
-
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
305
|
-
# are merged across table columns.
|
306
|
-
# False: Let table structure model define the text cells, ignore PDF cells.
|
307
|
-
)
|
308
|
-
|
309
|
-
|
310
|
-
class PipelineOptions(BaseModel):
|
311
|
-
do_table_structure: bool = True # True: perform table structure extraction
|
312
|
-
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
313
|
-
|
314
|
-
table_structure_options: TableStructureOptions = TableStructureOptions()
|
315
|
-
|
316
|
-
|
317
305
|
class AssembleOptions(BaseModel):
|
318
306
|
keep_page_images: Annotated[
|
319
307
|
bool,
|
docling/datamodel/document.py
CHANGED
@@ -4,13 +4,13 @@ from pathlib import Path, PurePath
|
|
4
4
|
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
5
5
|
|
6
6
|
from docling_core.types import BaseCell, BaseText
|
7
|
-
from docling_core.types import BoundingBox as DsBoundingBox
|
8
7
|
from docling_core.types import Document as DsDocument
|
9
8
|
from docling_core.types import DocumentDescription as DsDocumentDescription
|
10
9
|
from docling_core.types import FileInfoObject as DsFileInfoObject
|
11
10
|
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
12
11
|
from docling_core.types import Table as DsSchemaTable
|
13
12
|
from docling_core.types import TableCell
|
13
|
+
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
|
14
14
|
from docling_core.types.doc.base import Figure
|
15
15
|
from pydantic import BaseModel
|
16
16
|
from typing_extensions import deprecated
|
docling/datamodel/pipeline_options.py
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
from enum import Enum, auto
|
2
|
+
|
3
|
+
from pydantic import BaseModel
|
4
|
+
|
5
|
+
|
6
|
+
class TableFormerMode(str, Enum):
|
7
|
+
FAST = auto()
|
8
|
+
ACCURATE = auto()
|
9
|
+
|
10
|
+
|
11
|
+
class TableStructureOptions(BaseModel):
|
12
|
+
do_cell_matching: bool = (
|
13
|
+
True
|
14
|
+
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
15
|
+
# are merged across table columns.
|
16
|
+
# False: Let table structure model define the text cells, ignore PDF cells.
|
17
|
+
)
|
18
|
+
mode: TableFormerMode = TableFormerMode.FAST
|
19
|
+
|
20
|
+
|
21
|
+
class PipelineOptions(BaseModel):
|
22
|
+
do_table_structure: bool = True # True: perform table structure extraction
|
23
|
+
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
24
|
+
|
25
|
+
table_structure_options: TableStructureOptions = TableStructureOptions()
|
docling/document_converter.py
CHANGED
@@ -18,13 +18,13 @@ from docling.datamodel.base_models import (
|
|
18
18
|
DoclingComponentType,
|
19
19
|
ErrorItem,
|
20
20
|
Page,
|
21
|
-
PipelineOptions,
|
22
21
|
)
|
23
22
|
from docling.datamodel.document import (
|
24
23
|
ConversionResult,
|
25
24
|
DocumentConversionInput,
|
26
25
|
InputDocument,
|
27
26
|
)
|
27
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
28
28
|
from docling.datamodel.settings import settings
|
29
29
|
from docling.models.ds_glm_model import GlmModel
|
30
30
|
from docling.models.page_assemble_model import PageAssembleModel
|
docling/models/table_structure_model.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import copy
|
2
|
+
from pathlib import Path
|
2
3
|
from typing import Iterable, List
|
3
4
|
|
4
5
|
import numpy
|
@@ -12,16 +13,22 @@ from docling.datamodel.base_models import (
|
|
12
13
|
TableElement,
|
13
14
|
TableStructurePrediction,
|
14
15
|
)
|
16
|
+
from docling.datamodel.pipeline_options import TableFormerMode
|
15
17
|
|
16
18
|
|
17
19
|
class TableStructureModel:
|
18
20
|
def __init__(self, config):
|
19
21
|
self.config = config
|
20
22
|
self.do_cell_matching = config["do_cell_matching"]
|
23
|
+
self.mode = config["mode"]
|
21
24
|
|
22
25
|
self.enabled = config["enabled"]
|
23
26
|
if self.enabled:
|
24
|
-
artifacts_path = config["artifacts_path"]
|
27
|
+
artifacts_path: Path = config["artifacts_path"]
|
28
|
+
|
29
|
+
if self.mode == TableFormerMode.ACCURATE:
|
30
|
+
artifacts_path = artifacts_path / "fat"
|
31
|
+
|
25
32
|
# Third Party
|
26
33
|
import docling_ibm_models.tableformer.common as c
|
27
34
|
|
docling/pipeline/base_model_pipeline.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
from typing import Callable, Iterable, List
|
3
3
|
|
4
|
-
from docling.datamodel.base_models import Page, PipelineOptions
|
4
|
+
from docling.datamodel.base_models import Page
|
5
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
5
6
|
|
6
7
|
|
7
8
|
class BaseModelPipeline:
|
docling/pipeline/standard_model_pipeline.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
|
3
|
-
from docling.datamodel.base_models import PipelineOptions
|
3
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
4
4
|
from docling.models.easyocr_model import EasyOcrModel
|
5
5
|
from docling.models.layout_model import LayoutModel
|
6
6
|
from docling.models.table_structure_model import TableStructureModel
|
@@ -32,6 +32,7 @@ class StandardModelPipeline(BaseModelPipeline):
|
|
32
32
|
"artifacts_path": artifacts_path
|
33
33
|
/ StandardModelPipeline._table_model_path,
|
34
34
|
"enabled": pipeline_options.do_table_structure,
|
35
|
+
"mode": pipeline_options.table_structure_options.mode,
|
35
36
|
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
|
36
37
|
}
|
37
38
|
),
|
{docling-1.15.0.dist-info → docling-1.16.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.15.0
|
3
|
+
Version: 1.16.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -19,7 +19,6 @@ Classifier: Programming Language :: Python :: 3.10
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.11
|
20
20
|
Classifier: Programming Language :: Python :: 3.12
|
21
21
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
22
|
-
Provides-Extra: examples
|
23
22
|
Requires-Dist: certifi (>=2024.7.4)
|
24
23
|
Requires-Dist: deepsearch-glm (>=0.21.1,<0.22.0)
|
25
24
|
Requires-Dist: docling-core (>=1.6.2,<2.0.0)
|
@@ -28,17 +27,10 @@ Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
|
|
28
27
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
29
28
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
30
29
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
31
|
-
Requires-Dist: langchain-huggingface (>=0.0.3,<0.0.4) ; extra == "examples"
|
32
|
-
Requires-Dist: langchain-milvus (>=0.1.4,<0.2.0) ; extra == "examples"
|
33
|
-
Requires-Dist: langchain-text-splitters (>=0.2.4,<0.3.0) ; extra == "examples"
|
34
|
-
Requires-Dist: llama-index-embeddings-huggingface (>=0.3.1,<0.4.0) ; extra == "examples"
|
35
|
-
Requires-Dist: llama-index-llms-huggingface-api (>=0.2.0,<0.3.0) ; extra == "examples"
|
36
|
-
Requires-Dist: llama-index-vector-stores-milvus (>=0.2.1,<0.3.0) ; extra == "examples"
|
37
30
|
Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
|
38
31
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
39
32
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
40
33
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
41
|
-
Requires-Dist: python-dotenv (>=1.0.1,<2.0.0) ; extra == "examples"
|
42
34
|
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
43
35
|
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
44
36
|
Requires-Dist: scipy (>=1.14.1,<2.0.0)
|
@@ -211,6 +203,8 @@ This can improve output quality if you find that multiple columns in extracted t
|
|
211
203
|
|
212
204
|
|
213
205
|
```python
|
206
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
207
|
+
|
214
208
|
pipeline_options = PipelineOptions(do_table_structure=True)
|
215
209
|
pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
|
216
210
|
|
@@ -220,6 +214,20 @@ doc_converter = DocumentConverter(
|
|
220
214
|
)
|
221
215
|
```
|
222
216
|
|
217
|
+
Since docling 1.16.0: You can control which TableFormer mode you want to use. Choose between `TableFormerMode.FAST` (default) and `TableFormerMode.ACCURATE` (better, but slower) to receive better quality with difficult table structures.
|
218
|
+
|
219
|
+
```python
|
220
|
+
from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
|
221
|
+
|
222
|
+
pipeline_options = PipelineOptions(do_table_structure=True)
|
223
|
+
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model
|
224
|
+
|
225
|
+
doc_converter = DocumentConverter(
|
226
|
+
artifacts_path=artifacts_path,
|
227
|
+
pipeline_options=pipeline_options,
|
228
|
+
)
|
229
|
+
```
|
230
|
+
|
223
231
|
### Impose limits on the document size
|
224
232
|
|
225
233
|
You can limit the file size and number of pages which should be allowed to process per document:
|
{docling-1.15.0.dist-info → docling-1.16.0.dist-info}/RECORD
CHANGED
@@ -4,28 +4,29 @@ docling/backend/abstract_backend.py,sha256=clJtGxLedpLriEhpx7oyxjmlwMLPorkv-1tdf
|
|
4
4
|
docling/backend/docling_parse_backend.py,sha256=RUWWZbx2cUotZeeTkc-Lbg2k8MVFXFxaDjM4sPfaFZE,7475
|
5
5
|
docling/backend/pypdfium2_backend.py,sha256=bIIImVM73wmcVcKMqjl4JF8CD-Qj2W5rZbI4G7clU4s,8877
|
6
6
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
docling/cli/main.py,sha256=
|
7
|
+
docling/cli/main.py,sha256=dgzaRJib5jlDbV1JfIRRYRSCT-ZiJkRwsx7KjlSVbUU,7167
|
8
8
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
-
docling/datamodel/base_models.py,sha256=
|
10
|
-
docling/datamodel/document.py,sha256=
|
9
|
+
docling/datamodel/base_models.py,sha256=b2FXRhfHrJiBGr5OdlSJ1ssUzvU2rVjelfY5_0YoK8w,8732
|
10
|
+
docling/datamodel/document.py,sha256=8iHylLaty-ZVYGzYmP61dZPtduzEyiRM79Tx3gS9UEU,16195
|
11
|
+
docling/datamodel/pipeline_options.py,sha256=SuJzaLHTR_-ZZz8KB0wwYRxULTcnfQESiilOGCY0fpc,773
|
11
12
|
docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
|
12
|
-
docling/document_converter.py,sha256=
|
13
|
+
docling/document_converter.py,sha256=bk345OKkHbBngc3QS6BevT8068yEt1cS-PtAH3k_tCo,11022
|
13
14
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
15
|
docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
|
15
16
|
docling/models/ds_glm_model.py,sha256=VXGmj8cW0WKMz1He4tp1lZhXHkS8Z39U1G-ujkc7deU,3368
|
16
17
|
docling/models/easyocr_model.py,sha256=ABIqALvtNNrDQ47fXaZ0lDFhOwKsYGUUlAPnIsFZgZA,2232
|
17
18
|
docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
|
18
19
|
docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
|
19
|
-
docling/models/table_structure_model.py,sha256=
|
20
|
+
docling/models/table_structure_model.py,sha256=iHJjWdKCpTcH3l_ElMWnC5pt6tkUpIuByed304Fdq9w,6009
|
20
21
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
|
-
docling/pipeline/base_model_pipeline.py,sha256=
|
22
|
-
docling/pipeline/standard_model_pipeline.py,sha256=
|
22
|
+
docling/pipeline/base_model_pipeline.py,sha256=rrMog3EuiR5Gx9OWtfMj24rQvHCrWkxZ3g9OIr7LPSQ,607
|
23
|
+
docling/pipeline/standard_model_pipeline.py,sha256=9HdUq9TjNOsE9ixA_MpWNmyQUaTg7SS-aVE0zP7Ujys,1522
|
23
24
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
24
25
|
docling/utils/export.py,sha256=bKLdbeUcR-rQsGPV1IqJkCHKMCv7X2QOHyxmjNuH3HE,4655
|
25
26
|
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
26
27
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
27
|
-
docling-1.15.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
28
|
-
docling-1.
|
29
|
-
docling-1.15.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
30
|
-
docling-1.15.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
31
|
-
docling-1.15.0.dist-info/RECORD,,
|
28
|
+
docling-1.16.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
29
|
+
docling-1.16.0.dist-info/METADATA,sha256=AESUBe1Ea2pC0vJvvNwLW7l_z26k3iriJiicLFU3U7E,13308
|
30
|
+
docling-1.16.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
31
|
+
docling-1.16.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
32
|
+
docling-1.16.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|