docling 1.2.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -35,7 +35,7 @@ class PdfPageBackend(ABC):
35
35
 
36
36
  class PdfDocumentBackend(ABC):
37
37
  @abstractmethod
38
- def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
38
+ def __init__(self, path_or_stream: Union[BytesIO, Path]):
39
39
  pass
40
40
 
41
41
  @abstractmethod
@@ -146,11 +146,12 @@ class DoclingParsePageBackend(PdfPageBackend):
146
146
 
147
147
 
148
148
  class DoclingParseDocumentBackend(PdfDocumentBackend):
149
- def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
149
+ def __init__(self, path_or_stream: Union[BytesIO, Path]):
150
150
  super().__init__(path_or_stream)
151
151
  self._pdoc = pdfium.PdfDocument(path_or_stream)
152
152
  # Parsing cells with docling_parser call
153
- print("PARSING WITH DOCLING PARSE")
153
+ if isinstance(path_or_stream, BytesIO):
154
+ raise NotImplementedError("This backend does not support byte streams yet.")
154
155
  parser = pdf_parser()
155
156
  self._parser_doc = parser.find_cells(str(path_or_stream))
156
157
 
@@ -199,7 +199,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
199
199
 
200
200
 
201
201
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
202
- def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
202
+ def __init__(self, path_or_stream: Union[BytesIO, Path]):
203
203
  super().__init__(path_or_stream)
204
204
  self._pdoc = pdfium.PdfDocument(path_or_stream)
205
205
 
@@ -265,3 +265,9 @@ class PipelineOptions(BaseModel):
265
265
  do_ocr: bool = False # True: perform OCR, replace programmatic PDF text
266
266
 
267
267
  table_structure_options: TableStructureOptions = TableStructureOptions()
268
+
269
+
270
+ class AssembleOptions(BaseModel):
271
+ keep_page_images: bool = (
272
+ False # False: page images are removed in the assemble step
273
+ )
@@ -14,6 +14,7 @@ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
14
14
  from docling.backend.abstract_backend import PdfDocumentBackend
15
15
  from docling.datamodel.base_models import (
16
16
  AssembledUnit,
17
+ AssembleOptions,
17
18
  ConversionStatus,
18
19
  Page,
19
20
  PipelineOptions,
@@ -44,6 +45,7 @@ class DocumentConverter:
44
45
  pipeline_options: PipelineOptions = PipelineOptions(),
45
46
  pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
46
47
  pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
48
+ assemble_options: AssembleOptions = AssembleOptions(),
47
49
  ):
48
50
  if not artifacts_path:
49
51
  artifacts_path = self.download_models_hf()
@@ -57,6 +59,7 @@ class DocumentConverter:
57
59
  self.page_assemble_model = PageAssembleModel(config={})
58
60
  self.glm_model = GlmModel(config={})
59
61
  self.pdf_backend = pdf_backend
62
+ self.assemble_options = assemble_options
60
63
 
61
64
  @staticmethod
62
65
  def download_models_hf(
@@ -174,17 +177,23 @@ class DocumentConverter:
174
177
  pages_with_images,
175
178
  )
176
179
 
180
+ # 4. Run pipeline stages
177
181
  pipeline_pages = self.model_pipeline.apply(pages_with_cells)
178
182
 
179
- # 7. Assemble page elements (per page)
183
+ # 5. Assemble page elements (per page)
180
184
  assembled_pages = self.page_assemble_model(pipeline_pages)
181
185
 
182
186
  # exhaust assembled_pages
183
187
  for assembled_page in assembled_pages:
184
188
  # Free up mem resources before moving on with next batch
185
- assembled_page.image = (
186
- None # Comment this if you want to visualize page images
187
- )
189
+
190
+ # Remove page images (can be disabled)
191
+ if not self.assemble_options.keep_page_images:
192
+ assembled_page.image = (
193
+ None # Set assemble_options.keep_page_images=True to keep page images
194
+ )
195
+
196
+ # Unload backend
188
197
  assembled_page._backend.unload()
189
198
 
190
199
  all_assembled_pages.append(assembled_page)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.2.0
3
+ Version: 1.3.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -21,6 +21,7 @@ Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Provides-Extra: easyocr
23
23
  Provides-Extra: ocr
24
+ Requires-Dist: certifi (>=2024.7.4)
24
25
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
25
26
  Requires-Dist: docling-core (>=1.1.2,<2.0.0)
26
27
  Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
@@ -93,17 +94,21 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
93
94
 
94
95
  ### Convert a batch of documents
95
96
 
96
- For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
97
+ For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
97
98
 
98
99
  From a local repo clone, you can run it with:
99
100
 
100
101
  ```
101
- python examples/convert.py
102
+ python examples/batch_convert.py
102
103
  ```
103
104
  The output of the above command will be written to `./scratch`.
104
105
 
105
106
  ### Adjust pipeline features
106
107
 
108
+ The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
109
+ one can adjust the conversion pipeline and features.
110
+
111
+
107
112
  #### Control pipeline options
108
113
 
109
114
  You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
@@ -1,13 +1,13 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- docling/backend/abstract_backend.py,sha256=dINr8oTax9Fq31Y1AR0CGWNZtAHN5aqB_M7TAPkJNVQ,1122
4
- docling/backend/docling_parse_backend.py,sha256=cupeYC1evzM31lXskH-mbXnZhw1_JHyUiJ-cpTmlrM4,5834
5
- docling/backend/pypdfium2_backend.py,sha256=cIQGFkwzceN57PzmACt06CytRo0A_t-im6rW804RC3M,7421
3
+ docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7NtzKgo,1112
4
+ docling/backend/docling_parse_backend.py,sha256=mGuJCpMVqyrZK-cXKRWrELPz0Wt1h6uydx4QwWI1rew,5912
5
+ docling/backend/pypdfium2_backend.py,sha256=tv6JxyTkTdT2qr2ghsQgYA2zgpCDxKYSdHVBTAR7FSk,7411
6
6
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- docling/datamodel/base_models.py,sha256=k7gLFPnq3ArEMAFz6qUcp5qemlYzVhOmR9qtBTkAiX4,6862
7
+ docling/datamodel/base_models.py,sha256=irZLAHdsROOOwRbywKIA0mk3H8GrLwtGjOgTV6G0QoU,7004
8
8
  docling/datamodel/document.py,sha256=FG_ntDFRBWj-MhV52D0sC8XaZOwN3yryyXahsVHGnyI,12517
9
9
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
10
- docling/document_converter.py,sha256=I9vjTLCLahsMrcs9ozM3C5r_CtBN-9qHk7-ANma7fkc,9895
10
+ docling/document_converter.py,sha256=dMucsq6M_nwPsC1ChogVwJgNDv8sJuFklQWWinDZaug,10246
11
11
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
13
13
  docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
@@ -20,7 +20,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFv
20
20
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
22
22
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
23
- docling-1.2.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
24
- docling-1.2.0.dist-info/METADATA,sha256=9ZWFckdLpf45avuDgZgyzQK6J2oLCK0_oCW9T9Rx4iU,6802
25
- docling-1.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
26
- docling-1.2.0.dist-info/RECORD,,
23
+ docling-1.3.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
24
+ docling-1.3.0.dist-info/METADATA,sha256=wi2DOn77z_BIMSLsrmzebYZUgpjHYWbNTOIVEY3A4-o,7042
25
+ docling-1.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
26
+ docling-1.3.0.dist-info/RECORD,,