docling 1.14.0__tar.gz → 1.16.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-1.14.0 → docling-1.16.0}/PKG-INFO +18 -10
- {docling-1.14.0 → docling-1.16.0}/README.md +16 -0
- {docling-1.14.0 → docling-1.16.0}/docling/cli/main.py +2 -1
- {docling-1.14.0 → docling-1.16.0}/docling/datamodel/base_models.py +4 -16
- {docling-1.14.0 → docling-1.16.0}/docling/datamodel/document.py +4 -1
- docling-1.16.0/docling/datamodel/pipeline_options.py +25 -0
- {docling-1.14.0 → docling-1.16.0}/docling/document_converter.py +1 -1
- {docling-1.14.0 → docling-1.16.0}/docling/models/table_structure_model.py +8 -1
- {docling-1.14.0 → docling-1.16.0}/docling/pipeline/base_model_pipeline.py +2 -1
- {docling-1.14.0 → docling-1.16.0}/docling/pipeline/standard_model_pipeline.py +2 -1
- {docling-1.14.0 → docling-1.16.0}/pyproject.toml +23 -39
- {docling-1.14.0 → docling-1.16.0}/LICENSE +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/__init__.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/backend/__init__.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/backend/abstract_backend.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/cli/__init__.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/datamodel/__init__.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/datamodel/settings.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/models/__init__.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/models/base_ocr_model.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/models/ds_glm_model.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/models/easyocr_model.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/models/layout_model.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/models/page_assemble_model.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/pipeline/__init__.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/utils/__init__.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/utils/export.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/utils/layout_utils.py +0 -0
- {docling-1.14.0 → docling-1.16.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.14.0
|
3
|
+
Version: 1.16.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -19,26 +19,18 @@ Classifier: Programming Language :: Python :: 3.10
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.11
|
20
20
|
Classifier: Programming Language :: Python :: 3.12
|
21
21
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
22
|
-
Provides-Extra: examples
|
23
22
|
Requires-Dist: certifi (>=2024.7.4)
|
24
23
|
Requires-Dist: deepsearch-glm (>=0.21.1,<0.22.0)
|
25
|
-
Requires-Dist: docling-core (>=1.
|
24
|
+
Requires-Dist: docling-core (>=1.6.2,<2.0.0)
|
26
25
|
Requires-Dist: docling-ibm-models (>=1.2.0,<2.0.0)
|
27
26
|
Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
|
28
27
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
29
28
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
30
29
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
31
|
-
Requires-Dist: langchain-huggingface (>=0.0.3,<0.0.4) ; extra == "examples"
|
32
|
-
Requires-Dist: langchain-milvus (>=0.1.4,<0.2.0) ; extra == "examples"
|
33
|
-
Requires-Dist: langchain-text-splitters (>=0.2.4,<0.3.0) ; extra == "examples"
|
34
|
-
Requires-Dist: llama-index-embeddings-huggingface (>=0.3.1,<0.4.0) ; extra == "examples"
|
35
|
-
Requires-Dist: llama-index-llms-huggingface-api (>=0.2.0,<0.3.0) ; extra == "examples"
|
36
|
-
Requires-Dist: llama-index-vector-stores-milvus (>=0.2.1,<0.3.0) ; extra == "examples"
|
37
30
|
Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
|
38
31
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
39
32
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
40
33
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
41
|
-
Requires-Dist: python-dotenv (>=1.0.1,<2.0.0) ; extra == "examples"
|
42
34
|
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
43
35
|
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
44
36
|
Requires-Dist: scipy (>=1.14.1,<2.0.0)
|
@@ -211,6 +203,8 @@ This can improve output quality if you find that multiple columns in extracted t
|
|
211
203
|
|
212
204
|
|
213
205
|
```python
|
206
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
207
|
+
|
214
208
|
pipeline_options = PipelineOptions(do_table_structure=True)
|
215
209
|
pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
|
216
210
|
|
@@ -220,6 +214,20 @@ doc_converter = DocumentConverter(
|
|
220
214
|
)
|
221
215
|
```
|
222
216
|
|
217
|
+
Since docling 1.16.0: You can control which TableFormer mode you want to use. Choose between `TableFormerMode.FAST` (default) and `TableFormerMode.ACCURATE` (better, but slower) to receive better quality with difficult table structures.
|
218
|
+
|
219
|
+
```python
|
220
|
+
from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
|
221
|
+
|
222
|
+
pipeline_options = PipelineOptions(do_table_structure=True)
|
223
|
+
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model
|
224
|
+
|
225
|
+
doc_converter = DocumentConverter(
|
226
|
+
artifacts_path=artifacts_path,
|
227
|
+
pipeline_options=pipeline_options,
|
228
|
+
)
|
229
|
+
```
|
230
|
+
|
223
231
|
### Impose limits on the document size
|
224
232
|
|
225
233
|
You can limit the file size and number of pages which should be allowed to process per document:
|
@@ -159,6 +159,8 @@ This can improve output quality if you find that multiple columns in extracted t
|
|
159
159
|
|
160
160
|
|
161
161
|
```python
|
162
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
163
|
+
|
162
164
|
pipeline_options = PipelineOptions(do_table_structure=True)
|
163
165
|
pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
|
164
166
|
|
@@ -168,6 +170,20 @@ doc_converter = DocumentConverter(
|
|
168
170
|
)
|
169
171
|
```
|
170
172
|
|
173
|
+
Since docling 1.16.0: You can control which TableFormer mode you want to use. Choose between `TableFormerMode.FAST` (default) and `TableFormerMode.ACCURATE` (better, but slower) to receive better quality with difficult table structures.
|
174
|
+
|
175
|
+
```python
|
176
|
+
from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
|
177
|
+
|
178
|
+
pipeline_options = PipelineOptions(do_table_structure=True)
|
179
|
+
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model
|
180
|
+
|
181
|
+
doc_converter = DocumentConverter(
|
182
|
+
artifacts_path=artifacts_path,
|
183
|
+
pipeline_options=pipeline_options,
|
184
|
+
)
|
185
|
+
```
|
186
|
+
|
171
187
|
### Impose limits on the document size
|
172
188
|
|
173
189
|
You can limit the file size and number of pages which should be allowed to process per document:
|
@@ -12,8 +12,9 @@ from docling_core.utils.file import resolve_file_source
|
|
12
12
|
|
13
13
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
14
14
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
15
|
-
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
15
|
+
from docling.datamodel.base_models import ConversionStatus
|
16
16
|
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
17
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
17
18
|
from docling.document_converter import DocumentConverter
|
18
19
|
|
19
20
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
@@ -9,6 +9,10 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
9
9
|
from typing_extensions import Self
|
10
10
|
|
11
11
|
from docling.backend.abstract_backend import PdfPageBackend
|
12
|
+
from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
|
13
|
+
PipelineOptions,
|
14
|
+
TableStructureOptions,
|
15
|
+
)
|
12
16
|
|
13
17
|
|
14
18
|
class ConversionStatus(str, Enum):
|
@@ -298,22 +302,6 @@ class DocumentStream(BaseModel):
|
|
298
302
|
stream: BytesIO
|
299
303
|
|
300
304
|
|
301
|
-
class TableStructureOptions(BaseModel):
|
302
|
-
do_cell_matching: bool = (
|
303
|
-
True
|
304
|
-
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
305
|
-
# are merged across table columns.
|
306
|
-
# False: Let table structure model define the text cells, ignore PDF cells.
|
307
|
-
)
|
308
|
-
|
309
|
-
|
310
|
-
class PipelineOptions(BaseModel):
|
311
|
-
do_table_structure: bool = True # True: perform table structure extraction
|
312
|
-
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
313
|
-
|
314
|
-
table_structure_options: TableStructureOptions = TableStructureOptions()
|
315
|
-
|
316
|
-
|
317
305
|
class AssembleOptions(BaseModel):
|
318
306
|
keep_page_images: Annotated[
|
319
307
|
bool,
|
@@ -4,13 +4,13 @@ from pathlib import Path, PurePath
|
|
4
4
|
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
5
5
|
|
6
6
|
from docling_core.types import BaseCell, BaseText
|
7
|
-
from docling_core.types import BoundingBox as DsBoundingBox
|
8
7
|
from docling_core.types import Document as DsDocument
|
9
8
|
from docling_core.types import DocumentDescription as DsDocumentDescription
|
10
9
|
from docling_core.types import FileInfoObject as DsFileInfoObject
|
11
10
|
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
12
11
|
from docling_core.types import Table as DsSchemaTable
|
13
12
|
from docling_core.types import TableCell
|
13
|
+
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
|
14
14
|
from docling_core.types.doc.base import Figure
|
15
15
|
from pydantic import BaseModel
|
16
16
|
from typing_extensions import deprecated
|
@@ -324,8 +324,10 @@ class ConvertedDocument(BaseModel):
|
|
324
324
|
"paragraph",
|
325
325
|
"caption",
|
326
326
|
"table",
|
327
|
+
"figure",
|
327
328
|
],
|
328
329
|
strict_text: bool = False,
|
330
|
+
image_placeholder: str = "<!-- image -->",
|
329
331
|
):
|
330
332
|
return self.output.export_to_markdown(
|
331
333
|
delim=delim,
|
@@ -333,6 +335,7 @@ class ConvertedDocument(BaseModel):
|
|
333
335
|
main_text_stop=main_text_stop,
|
334
336
|
main_text_labels=main_text_labels,
|
335
337
|
strict_text=strict_text,
|
338
|
+
image_placeholder=image_placeholder,
|
336
339
|
)
|
337
340
|
|
338
341
|
def render_as_text(
|
@@ -0,0 +1,25 @@
|
|
1
|
+
from enum import Enum, auto
|
2
|
+
|
3
|
+
from pydantic import BaseModel
|
4
|
+
|
5
|
+
|
6
|
+
class TableFormerMode(str, Enum):
|
7
|
+
FAST = auto()
|
8
|
+
ACCURATE = auto()
|
9
|
+
|
10
|
+
|
11
|
+
class TableStructureOptions(BaseModel):
|
12
|
+
do_cell_matching: bool = (
|
13
|
+
True
|
14
|
+
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
15
|
+
# are merged across table columns.
|
16
|
+
# False: Let table structure model define the text cells, ignore PDF cells.
|
17
|
+
)
|
18
|
+
mode: TableFormerMode = TableFormerMode.FAST
|
19
|
+
|
20
|
+
|
21
|
+
class PipelineOptions(BaseModel):
|
22
|
+
do_table_structure: bool = True # True: perform table structure extraction
|
23
|
+
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
24
|
+
|
25
|
+
table_structure_options: TableStructureOptions = TableStructureOptions()
|
@@ -18,13 +18,13 @@ from docling.datamodel.base_models import (
|
|
18
18
|
DoclingComponentType,
|
19
19
|
ErrorItem,
|
20
20
|
Page,
|
21
|
-
PipelineOptions,
|
22
21
|
)
|
23
22
|
from docling.datamodel.document import (
|
24
23
|
ConversionResult,
|
25
24
|
DocumentConversionInput,
|
26
25
|
InputDocument,
|
27
26
|
)
|
27
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
28
28
|
from docling.datamodel.settings import settings
|
29
29
|
from docling.models.ds_glm_model import GlmModel
|
30
30
|
from docling.models.page_assemble_model import PageAssembleModel
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import copy
|
2
|
+
from pathlib import Path
|
2
3
|
from typing import Iterable, List
|
3
4
|
|
4
5
|
import numpy
|
@@ -12,16 +13,22 @@ from docling.datamodel.base_models import (
|
|
12
13
|
TableElement,
|
13
14
|
TableStructurePrediction,
|
14
15
|
)
|
16
|
+
from docling.datamodel.pipeline_options import TableFormerMode
|
15
17
|
|
16
18
|
|
17
19
|
class TableStructureModel:
|
18
20
|
def __init__(self, config):
|
19
21
|
self.config = config
|
20
22
|
self.do_cell_matching = config["do_cell_matching"]
|
23
|
+
self.mode = config["mode"]
|
21
24
|
|
22
25
|
self.enabled = config["enabled"]
|
23
26
|
if self.enabled:
|
24
|
-
artifacts_path = config["artifacts_path"]
|
27
|
+
artifacts_path: Path = config["artifacts_path"]
|
28
|
+
|
29
|
+
if self.mode == TableFormerMode.ACCURATE:
|
30
|
+
artifacts_path = artifacts_path / "fat"
|
31
|
+
|
25
32
|
# Third Party
|
26
33
|
import docling_ibm_models.tableformer.common as c
|
27
34
|
|
@@ -1,7 +1,8 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
from typing import Callable, Iterable, List
|
3
3
|
|
4
|
-
from docling.datamodel.base_models import Page, PipelineOptions
|
4
|
+
from docling.datamodel.base_models import Page
|
5
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
5
6
|
|
6
7
|
|
7
8
|
class BaseModelPipeline:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
|
3
|
-
from docling.datamodel.base_models import PipelineOptions
|
3
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
4
4
|
from docling.models.easyocr_model import EasyOcrModel
|
5
5
|
from docling.models.layout_model import LayoutModel
|
6
6
|
from docling.models.table_structure_model import TableStructureModel
|
@@ -32,6 +32,7 @@ class StandardModelPipeline(BaseModelPipeline):
|
|
32
32
|
"artifacts_path": artifacts_path
|
33
33
|
/ StandardModelPipeline._table_model_path,
|
34
34
|
"enabled": pipeline_options.do_table_structure,
|
35
|
+
"mode": pipeline_options.table_structure_options.mode,
|
35
36
|
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
|
36
37
|
}
|
37
38
|
),
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "1.14.0" # DO NOT EDIT, updated automatically
|
3
|
+
version = "1.16.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "Docling PDF conversion package"
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -21,9 +21,23 @@ keywords= ["docling", "convert", "document", "pdf", "layout model", "segmentatio
|
|
21
21
|
packages = [{include = "docling"}]
|
22
22
|
|
23
23
|
[tool.poetry.dependencies]
|
24
|
+
##############
|
25
|
+
# constraints:
|
26
|
+
##############
|
27
|
+
torch = [
|
28
|
+
{version = "^2.2.2", optional = true, markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'"},
|
29
|
+
{version = "~2.2.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"}
|
30
|
+
]
|
31
|
+
torchvision = [
|
32
|
+
{version = "^0", optional = true, markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'"},
|
33
|
+
{version = "~0.17.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"}
|
34
|
+
]
|
35
|
+
######################
|
36
|
+
# actual dependencies:
|
37
|
+
######################
|
24
38
|
python = "^3.10"
|
25
39
|
pydantic = "^2.0.0"
|
26
|
-
docling-core = "^1.
|
40
|
+
docling-core = "^1.6.2"
|
27
41
|
docling-ibm-models = "^1.2.0"
|
28
42
|
deepsearch-glm = "^0.21.1"
|
29
43
|
filetype = "^1.2.0"
|
@@ -37,29 +51,6 @@ certifi = ">=2024.7.4"
|
|
37
51
|
rtree = "^1.3.0"
|
38
52
|
scipy = "^1.14.1"
|
39
53
|
pyarrow = "^16.1.0"
|
40
|
-
|
41
|
-
#########
|
42
|
-
# extras:
|
43
|
-
#########
|
44
|
-
python-dotenv = { version = "^1.0.1", optional = true }
|
45
|
-
llama-index-embeddings-huggingface = { version = "^0.3.1", optional = true }
|
46
|
-
llama-index-llms-huggingface-api = { version = "^0.2.0", optional = true }
|
47
|
-
llama-index-vector-stores-milvus = { version = "^0.2.1", optional = true }
|
48
|
-
langchain-huggingface = { version = "^0.0.3", optional = true}
|
49
|
-
langchain-milvus = { version = "^0.1.4", optional = true }
|
50
|
-
langchain-text-splitters = { version = "^0.2.4", optional = true }
|
51
|
-
|
52
|
-
##############
|
53
|
-
# constraints:
|
54
|
-
##############
|
55
|
-
torch = [
|
56
|
-
{version = "^2.2.2", optional = true, markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'"},
|
57
|
-
{version = "~2.2.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"}
|
58
|
-
]
|
59
|
-
torchvision = [
|
60
|
-
{version = "^0", optional = true, markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'"},
|
61
|
-
{version = "~0.17.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"}
|
62
|
-
]
|
63
54
|
typer = "^0.12.5"
|
64
55
|
|
65
56
|
[tool.poetry.group.dev.dependencies]
|
@@ -82,20 +73,13 @@ nbqa = "^1.9.0"
|
|
82
73
|
|
83
74
|
[tool.poetry.group.examples.dependencies]
|
84
75
|
datasets = "^2.21.0"
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
"llama-index-vector-stores-milvus",
|
93
|
-
# LangChain examples:
|
94
|
-
"langchain-huggingface",
|
95
|
-
"langchain-milvus",
|
96
|
-
"langchain-text-splitters",
|
97
|
-
]
|
98
|
-
|
76
|
+
python-dotenv = "^1.0.1"
|
77
|
+
llama-index-embeddings-huggingface = "^0.3.1"
|
78
|
+
llama-index-llms-huggingface-api = "^0.2.0"
|
79
|
+
llama-index-vector-stores-milvus = "^0.2.1"
|
80
|
+
langchain-huggingface = "^0.0.3"
|
81
|
+
langchain-milvus = "^0.1.4"
|
82
|
+
langchain-text-splitters = "^0.2.4"
|
99
83
|
|
100
84
|
[tool.poetry.scripts]
|
101
85
|
docling = "docling.cli.main:app"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|