docling 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/datamodel/base_models.py +6 -0
- docling/document_converter.py +13 -4
- {docling-1.2.1.dist-info → docling-1.3.0.dist-info}/METADATA +1 -1
- {docling-1.2.1.dist-info → docling-1.3.0.dist-info}/RECORD +6 -6
- {docling-1.2.1.dist-info → docling-1.3.0.dist-info}/LICENSE +0 -0
- {docling-1.2.1.dist-info → docling-1.3.0.dist-info}/WHEEL +0 -0
docling/datamodel/base_models.py
CHANGED
@@ -265,3 +265,9 @@ class PipelineOptions(BaseModel):
|
|
265
265
|
do_ocr: bool = False # True: perform OCR, replace programmatic PDF text
|
266
266
|
|
267
267
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
268
|
+
|
269
|
+
|
270
|
+
class AssembleOptions(BaseModel):
|
271
|
+
keep_page_images: bool = (
|
272
|
+
False # False: page images are removed in the assemble step
|
273
|
+
)
|
docling/document_converter.py
CHANGED
@@ -14,6 +14,7 @@ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
|
14
14
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
15
15
|
from docling.datamodel.base_models import (
|
16
16
|
AssembledUnit,
|
17
|
+
AssembleOptions,
|
17
18
|
ConversionStatus,
|
18
19
|
Page,
|
19
20
|
PipelineOptions,
|
@@ -44,6 +45,7 @@ class DocumentConverter:
|
|
44
45
|
pipeline_options: PipelineOptions = PipelineOptions(),
|
45
46
|
pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
|
46
47
|
pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
|
48
|
+
assemble_options: AssembleOptions = AssembleOptions(),
|
47
49
|
):
|
48
50
|
if not artifacts_path:
|
49
51
|
artifacts_path = self.download_models_hf()
|
@@ -57,6 +59,7 @@ class DocumentConverter:
|
|
57
59
|
self.page_assemble_model = PageAssembleModel(config={})
|
58
60
|
self.glm_model = GlmModel(config={})
|
59
61
|
self.pdf_backend = pdf_backend
|
62
|
+
self.assemble_options = assemble_options
|
60
63
|
|
61
64
|
@staticmethod
|
62
65
|
def download_models_hf(
|
@@ -174,17 +177,23 @@ class DocumentConverter:
|
|
174
177
|
pages_with_images,
|
175
178
|
)
|
176
179
|
|
180
|
+
# 4. Run pipeline stages
|
177
181
|
pipeline_pages = self.model_pipeline.apply(pages_with_cells)
|
178
182
|
|
179
|
-
#
|
183
|
+
# 5. Assemble page elements (per page)
|
180
184
|
assembled_pages = self.page_assemble_model(pipeline_pages)
|
181
185
|
|
182
186
|
# exhaust assembled_pages
|
183
187
|
for assembled_page in assembled_pages:
|
184
188
|
# Free up mem resources before moving on with next batch
|
185
|
-
|
186
|
-
|
187
|
-
|
189
|
+
|
190
|
+
# Remove page images (can be disabled)
|
191
|
+
if not self.assemble_options.keep_page_images:
|
192
|
+
assembled_page.image = (
|
193
|
+
None # Comment this if you want to visualize page images
|
194
|
+
)
|
195
|
+
|
196
|
+
# Unload backend
|
188
197
|
assembled_page._backend.unload()
|
189
198
|
|
190
199
|
all_assembled_pages.append(assembled_page)
|
@@ -4,10 +4,10 @@ docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7
|
|
4
4
|
docling/backend/docling_parse_backend.py,sha256=mGuJCpMVqyrZK-cXKRWrELPz0Wt1h6uydx4QwWI1rew,5912
|
5
5
|
docling/backend/pypdfium2_backend.py,sha256=tv6JxyTkTdT2qr2ghsQgYA2zgpCDxKYSdHVBTAR7FSk,7411
|
6
6
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
docling/datamodel/base_models.py,sha256=
|
7
|
+
docling/datamodel/base_models.py,sha256=irZLAHdsROOOwRbywKIA0mk3H8GrLwtGjOgTV6G0QoU,7004
|
8
8
|
docling/datamodel/document.py,sha256=FG_ntDFRBWj-MhV52D0sC8XaZOwN3yryyXahsVHGnyI,12517
|
9
9
|
docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
|
10
|
-
docling/document_converter.py,sha256=
|
10
|
+
docling/document_converter.py,sha256=dMucsq6M_nwPsC1ChogVwJgNDv8sJuFklQWWinDZaug,10246
|
11
11
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
|
13
13
|
docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
|
@@ -20,7 +20,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFv
|
|
20
20
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
21
|
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
22
22
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
23
|
-
docling-1.
|
24
|
-
docling-1.
|
25
|
-
docling-1.
|
26
|
-
docling-1.
|
23
|
+
docling-1.3.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
|
24
|
+
docling-1.3.0.dist-info/METADATA,sha256=wi2DOn77z_BIMSLsrmzebYZUgpjHYWbNTOIVEY3A4-o,7042
|
25
|
+
docling-1.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
26
|
+
docling-1.3.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|