docling 1.2.0__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-1.2.0 → docling-1.3.0}/PKG-INFO +8 -3
- {docling-1.2.0 → docling-1.3.0}/README.md +6 -2
- {docling-1.2.0 → docling-1.3.0}/docling/backend/abstract_backend.py +1 -1
- {docling-1.2.0 → docling-1.3.0}/docling/backend/docling_parse_backend.py +3 -2
- {docling-1.2.0 → docling-1.3.0}/docling/backend/pypdfium2_backend.py +1 -1
- {docling-1.2.0 → docling-1.3.0}/docling/datamodel/base_models.py +6 -0
- {docling-1.2.0 → docling-1.3.0}/docling/document_converter.py +13 -4
- {docling-1.2.0 → docling-1.3.0}/pyproject.toml +2 -1
- {docling-1.2.0 → docling-1.3.0}/LICENSE +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/__init__.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/backend/__init__.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/datamodel/__init__.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/datamodel/document.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/datamodel/settings.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/models/__init__.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/models/ds_glm_model.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/models/easyocr_model.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/models/layout_model.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/models/page_assemble_model.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/models/table_structure_model.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/pipeline/__init__.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/pipeline/base_model_pipeline.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/pipeline/standard_model_pipeline.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/utils/__init__.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/utils/layout_utils.py +0 -0
- {docling-1.2.0 → docling-1.3.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.2.0
|
3
|
+
Version: 1.3.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -21,6 +21,7 @@ Classifier: Programming Language :: Python :: 3.12
|
|
21
21
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
22
22
|
Provides-Extra: easyocr
|
23
23
|
Provides-Extra: ocr
|
24
|
+
Requires-Dist: certifi (>=2024.7.4)
|
24
25
|
Requires-Dist: deepsearch-glm (>=0.19.0,<1)
|
25
26
|
Requires-Dist: docling-core (>=1.1.2,<2.0.0)
|
26
27
|
Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
|
@@ -93,17 +94,21 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
|
|
93
94
|
|
94
95
|
### Convert a batch of documents
|
95
96
|
|
96
|
-
For an example of batch-converting documents, see [
|
97
|
+
For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
|
97
98
|
|
98
99
|
From a local repo clone, you can run it with:
|
99
100
|
|
100
101
|
```
|
101
|
-
python examples/
|
102
|
+
python examples/batch_convert.py
|
102
103
|
```
|
103
104
|
The output of the above command will be written to `./scratch`.
|
104
105
|
|
105
106
|
### Adjust pipeline features
|
106
107
|
|
108
|
+
The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
|
109
|
+
one can adjust the conversion pipeline and features.
|
110
|
+
|
111
|
+
|
107
112
|
#### Control pipeline options
|
108
113
|
|
109
114
|
You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
|
@@ -56,17 +56,21 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
|
|
56
56
|
|
57
57
|
### Convert a batch of documents
|
58
58
|
|
59
|
-
For an example of batch-converting documents, see [
|
59
|
+
For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
|
60
60
|
|
61
61
|
From a local repo clone, you can run it with:
|
62
62
|
|
63
63
|
```
|
64
|
-
python examples/
|
64
|
+
python examples/batch_convert.py
|
65
65
|
```
|
66
66
|
The output of the above command will be written to `./scratch`.
|
67
67
|
|
68
68
|
### Adjust pipeline features
|
69
69
|
|
70
|
+
The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
|
71
|
+
one can adjust the conversion pipeline and features.
|
72
|
+
|
73
|
+
|
70
74
|
#### Control pipeline options
|
71
75
|
|
72
76
|
You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
|
@@ -146,11 +146,12 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
146
146
|
|
147
147
|
|
148
148
|
class DoclingParseDocumentBackend(PdfDocumentBackend):
|
149
|
-
def __init__(self, path_or_stream:
|
149
|
+
def __init__(self, path_or_stream: Union[BytesIO, Path]):
|
150
150
|
super().__init__(path_or_stream)
|
151
151
|
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
152
152
|
# Parsing cells with docling_parser call
|
153
|
-
|
153
|
+
if isinstance(path_or_stream, BytesIO):
|
154
|
+
raise NotImplemented("This backend does not support byte streams yet.")
|
154
155
|
parser = pdf_parser()
|
155
156
|
self._parser_doc = parser.find_cells(str(path_or_stream))
|
156
157
|
|
@@ -199,7 +199,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
199
199
|
|
200
200
|
|
201
201
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
202
|
-
def __init__(self, path_or_stream:
|
202
|
+
def __init__(self, path_or_stream: Union[BytesIO, Path]):
|
203
203
|
super().__init__(path_or_stream)
|
204
204
|
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
205
205
|
|
@@ -265,3 +265,9 @@ class PipelineOptions(BaseModel):
|
|
265
265
|
do_ocr: bool = False # True: perform OCR, replace programmatic PDF text
|
266
266
|
|
267
267
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
268
|
+
|
269
|
+
|
270
|
+
class AssembleOptions(BaseModel):
|
271
|
+
keep_page_images: bool = (
|
272
|
+
False # False: page images are removed in the assemble step
|
273
|
+
)
|
@@ -14,6 +14,7 @@ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
|
14
14
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
15
15
|
from docling.datamodel.base_models import (
|
16
16
|
AssembledUnit,
|
17
|
+
AssembleOptions,
|
17
18
|
ConversionStatus,
|
18
19
|
Page,
|
19
20
|
PipelineOptions,
|
@@ -44,6 +45,7 @@ class DocumentConverter:
|
|
44
45
|
pipeline_options: PipelineOptions = PipelineOptions(),
|
45
46
|
pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
|
46
47
|
pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
|
48
|
+
assemble_options: AssembleOptions = AssembleOptions(),
|
47
49
|
):
|
48
50
|
if not artifacts_path:
|
49
51
|
artifacts_path = self.download_models_hf()
|
@@ -57,6 +59,7 @@ class DocumentConverter:
|
|
57
59
|
self.page_assemble_model = PageAssembleModel(config={})
|
58
60
|
self.glm_model = GlmModel(config={})
|
59
61
|
self.pdf_backend = pdf_backend
|
62
|
+
self.assemble_options = assemble_options
|
60
63
|
|
61
64
|
@staticmethod
|
62
65
|
def download_models_hf(
|
@@ -174,17 +177,23 @@ class DocumentConverter:
|
|
174
177
|
pages_with_images,
|
175
178
|
)
|
176
179
|
|
180
|
+
# 4. Run pipeline stages
|
177
181
|
pipeline_pages = self.model_pipeline.apply(pages_with_cells)
|
178
182
|
|
179
|
-
#
|
183
|
+
# 5. Assemble page elements (per page)
|
180
184
|
assembled_pages = self.page_assemble_model(pipeline_pages)
|
181
185
|
|
182
186
|
# exhaust assembled_pages
|
183
187
|
for assembled_page in assembled_pages:
|
184
188
|
# Free up mem resources before moving on with next batch
|
185
|
-
|
186
|
-
|
187
|
-
|
189
|
+
|
190
|
+
# Remove page images (can be disabled)
|
191
|
+
if not self.assemble_options.keep_page_images:
|
192
|
+
assembled_page.image = (
|
193
|
+
None # Comment this if you want to visualize page images
|
194
|
+
)
|
195
|
+
|
196
|
+
# Unload backend
|
188
197
|
assembled_page._backend.unload()
|
189
198
|
|
190
199
|
all_assembled_pages.append(assembled_page)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "1.2.0"
|
3
|
+
version = "1.3.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "Docling PDF conversion package"
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -33,6 +33,7 @@ huggingface_hub = ">=0.23,<1"
|
|
33
33
|
requests = "^2.32.3"
|
34
34
|
easyocr = { version = "^1.7", optional = true }
|
35
35
|
docling-parse = "^0.0.1"
|
36
|
+
certifi = ">=2024.7.4"
|
36
37
|
|
37
38
|
[tool.poetry.group.dev.dependencies]
|
38
39
|
black = {extras = ["jupyter"], version = "^24.4.2"}
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|