docling 1.2.1__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {docling-1.2.1 → docling-1.3.0}/PKG-INFO +1 -1
  2. {docling-1.2.1 → docling-1.3.0}/docling/datamodel/base_models.py +6 -0
  3. {docling-1.2.1 → docling-1.3.0}/docling/document_converter.py +13 -4
  4. {docling-1.2.1 → docling-1.3.0}/pyproject.toml +1 -1
  5. {docling-1.2.1 → docling-1.3.0}/LICENSE +0 -0
  6. {docling-1.2.1 → docling-1.3.0}/README.md +0 -0
  7. {docling-1.2.1 → docling-1.3.0}/docling/__init__.py +0 -0
  8. {docling-1.2.1 → docling-1.3.0}/docling/backend/__init__.py +0 -0
  9. {docling-1.2.1 → docling-1.3.0}/docling/backend/abstract_backend.py +0 -0
  10. {docling-1.2.1 → docling-1.3.0}/docling/backend/docling_parse_backend.py +0 -0
  11. {docling-1.2.1 → docling-1.3.0}/docling/backend/pypdfium2_backend.py +0 -0
  12. {docling-1.2.1 → docling-1.3.0}/docling/datamodel/__init__.py +0 -0
  13. {docling-1.2.1 → docling-1.3.0}/docling/datamodel/document.py +0 -0
  14. {docling-1.2.1 → docling-1.3.0}/docling/datamodel/settings.py +0 -0
  15. {docling-1.2.1 → docling-1.3.0}/docling/models/__init__.py +0 -0
  16. {docling-1.2.1 → docling-1.3.0}/docling/models/ds_glm_model.py +0 -0
  17. {docling-1.2.1 → docling-1.3.0}/docling/models/easyocr_model.py +0 -0
  18. {docling-1.2.1 → docling-1.3.0}/docling/models/layout_model.py +0 -0
  19. {docling-1.2.1 → docling-1.3.0}/docling/models/page_assemble_model.py +0 -0
  20. {docling-1.2.1 → docling-1.3.0}/docling/models/table_structure_model.py +0 -0
  21. {docling-1.2.1 → docling-1.3.0}/docling/pipeline/__init__.py +0 -0
  22. {docling-1.2.1 → docling-1.3.0}/docling/pipeline/base_model_pipeline.py +0 -0
  23. {docling-1.2.1 → docling-1.3.0}/docling/pipeline/standard_model_pipeline.py +0 -0
  24. {docling-1.2.1 → docling-1.3.0}/docling/utils/__init__.py +0 -0
  25. {docling-1.2.1 → docling-1.3.0}/docling/utils/layout_utils.py +0 -0
  26. {docling-1.2.1 → docling-1.3.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.2.1
3
+ Version: 1.3.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -265,3 +265,9 @@ class PipelineOptions(BaseModel):
265
265
  do_ocr: bool = False # True: perform OCR, replace programmatic PDF text
266
266
 
267
267
  table_structure_options: TableStructureOptions = TableStructureOptions()
268
+
269
+
270
+ class AssembleOptions(BaseModel):
271
+ keep_page_images: bool = (
272
+ False # False: page images are removed in the assemble step
273
+ )
@@ -14,6 +14,7 @@ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
14
14
  from docling.backend.abstract_backend import PdfDocumentBackend
15
15
  from docling.datamodel.base_models import (
16
16
  AssembledUnit,
17
+ AssembleOptions,
17
18
  ConversionStatus,
18
19
  Page,
19
20
  PipelineOptions,
@@ -44,6 +45,7 @@ class DocumentConverter:
44
45
  pipeline_options: PipelineOptions = PipelineOptions(),
45
46
  pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
46
47
  pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
48
+ assemble_options: AssembleOptions = AssembleOptions(),
47
49
  ):
48
50
  if not artifacts_path:
49
51
  artifacts_path = self.download_models_hf()
@@ -57,6 +59,7 @@ class DocumentConverter:
57
59
  self.page_assemble_model = PageAssembleModel(config={})
58
60
  self.glm_model = GlmModel(config={})
59
61
  self.pdf_backend = pdf_backend
62
+ self.assemble_options = assemble_options
60
63
 
61
64
  @staticmethod
62
65
  def download_models_hf(
@@ -174,17 +177,23 @@ class DocumentConverter:
174
177
  pages_with_images,
175
178
  )
176
179
 
180
+ # 4. Run pipeline stages
177
181
  pipeline_pages = self.model_pipeline.apply(pages_with_cells)
178
182
 
179
- # 7. Assemble page elements (per page)
183
+ # 5. Assemble page elements (per page)
180
184
  assembled_pages = self.page_assemble_model(pipeline_pages)
181
185
 
182
186
  # exhaust assembled_pages
183
187
  for assembled_page in assembled_pages:
184
188
  # Free up mem resources before moving on with next batch
185
- assembled_page.image = (
186
- None # Comment this if you want to visualize page images
187
- )
189
+
190
+ # Remove page images (can be disabled)
191
+ if not self.assemble_options.keep_page_images:
192
+ assembled_page.image = (
193
+ None # Comment this if you want to visualize page images
194
+ )
195
+
196
+ # Unload backend
188
197
  assembled_page._backend.unload()
189
198
 
190
199
  all_assembled_pages.append(assembled_page)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "1.2.1" # DO NOT EDIT, updated automatically
3
+ version = "1.3.0" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
File without changes
File without changes
File without changes
File without changes