docling 1.2.1__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -150,10 +150,11 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
150
150
  super().__init__(path_or_stream)
151
151
  self._pdoc = pdfium.PdfDocument(path_or_stream)
152
152
  # Parsing cells with docling_parser call
153
- if isinstance(path_or_stream, BytesIO):
154
- raise NotImplemented("This backend does not support byte streams yet.")
155
153
  parser = pdf_parser()
156
- self._parser_doc = parser.find_cells(str(path_or_stream))
154
+ if isinstance(path_or_stream, BytesIO):
155
+ self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
156
+ else:
157
+ self._parser_doc = parser.find_cells(str(path_or_stream))
157
158
 
158
159
  def page_count(self) -> int:
159
160
  return len(self._parser_doc["pages"])
@@ -265,3 +265,9 @@ class PipelineOptions(BaseModel):
265
265
  do_ocr: bool = False # True: perform OCR, replace programmatic PDF text
266
266
 
267
267
  table_structure_options: TableStructureOptions = TableStructureOptions()
268
+
269
+
270
+ class AssembleOptions(BaseModel):
271
+ keep_page_images: bool = (
272
+ False # False: page images are removed in the assemble step
273
+ )
@@ -14,7 +14,7 @@ from docling_core.types import TableCell
14
14
  from pydantic import BaseModel
15
15
 
16
16
  from docling.backend.abstract_backend import PdfDocumentBackend
17
- from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
17
+ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
18
18
  from docling.datamodel.base_models import (
19
19
  AssembledUnit,
20
20
  ConversionStatus,
@@ -64,7 +64,7 @@ class InputDocument(BaseModel):
64
64
  path_or_stream: Union[BytesIO, Path],
65
65
  filename: Optional[str] = None,
66
66
  limits: Optional[DocumentLimits] = None,
67
- pdf_backend=PyPdfiumDocumentBackend,
67
+ pdf_backend=DoclingParseDocumentBackend,
68
68
  ):
69
69
  super().__init__()
70
70
 
@@ -308,7 +308,7 @@ class DocumentConversionInput(BaseModel):
308
308
  _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
309
309
  limits: Optional[DocumentLimits] = DocumentLimits()
310
310
 
311
- DEFAULT_BACKEND: ClassVar = PyPdfiumDocumentBackend
311
+ DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
312
312
 
313
313
  def docs(
314
314
  self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
@@ -14,6 +14,7 @@ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
14
14
  from docling.backend.abstract_backend import PdfDocumentBackend
15
15
  from docling.datamodel.base_models import (
16
16
  AssembledUnit,
17
+ AssembleOptions,
17
18
  ConversionStatus,
18
19
  Page,
19
20
  PipelineOptions,
@@ -44,6 +45,7 @@ class DocumentConverter:
44
45
  pipeline_options: PipelineOptions = PipelineOptions(),
45
46
  pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
46
47
  pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
48
+ assemble_options: AssembleOptions = AssembleOptions(),
47
49
  ):
48
50
  if not artifacts_path:
49
51
  artifacts_path = self.download_models_hf()
@@ -57,6 +59,7 @@ class DocumentConverter:
57
59
  self.page_assemble_model = PageAssembleModel(config={})
58
60
  self.glm_model = GlmModel(config={})
59
61
  self.pdf_backend = pdf_backend
62
+ self.assemble_options = assemble_options
60
63
 
61
64
  @staticmethod
62
65
  def download_models_hf(
@@ -174,17 +177,23 @@ class DocumentConverter:
174
177
  pages_with_images,
175
178
  )
176
179
 
180
+ # 4. Run pipeline stages
177
181
  pipeline_pages = self.model_pipeline.apply(pages_with_cells)
178
182
 
179
- # 7. Assemble page elements (per page)
183
+ # 5. Assemble page elements (per page)
180
184
  assembled_pages = self.page_assemble_model(pipeline_pages)
181
185
 
182
186
  # exhaust assembled_pages
183
187
  for assembled_page in assembled_pages:
184
188
  # Free up mem resources before moving on with next batch
185
- assembled_page.image = (
186
- None # Comment this if you want to visualize page images
187
- )
189
+
190
+ # Remove page images (can be disabled)
191
+ if not self.assemble_options.keep_page_images:
192
+ assembled_page.image = (
193
+ None # Comment this if you want to visualize page images
194
+ )
195
+
196
+ # Unload backend
188
197
  assembled_page._backend.unload()
189
198
 
190
199
  all_assembled_pages.append(assembled_page)
@@ -69,6 +69,10 @@ class LayoutModel:
69
69
  "Key-Value Region": 0.45,
70
70
  }
71
71
 
72
+ CLASS_REMAPPINGS = {
73
+ "Document Index": "Table",
74
+ }
75
+
72
76
  _log.debug("================= Start postprocess function ====================")
73
77
  start_time = time.time()
74
78
  # Apply Confidence Threshold to cluster predictions
@@ -79,6 +83,10 @@ class LayoutModel:
79
83
  confidence = CLASS_THRESHOLDS[cluster.label]
80
84
  if cluster.confidence >= confidence:
81
85
  # annotation["created_by"] = "high_conf_pred"
86
+
87
+ # Remap class labels where needed.
88
+ if cluster.label in CLASS_REMAPPINGS.keys():
89
+ cluster.label = CLASS_REMAPPINGS[cluster.label]
82
90
  clusters_out.append(cluster)
83
91
 
84
92
  # map to dictionary clusters and cells, with bottom left origin
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.2.1
3
+ Version: 1.4.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -24,8 +24,8 @@ Provides-Extra: ocr
24
24
  Requires-Dist: certifi (>=2024.7.4)
25
25
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
26
26
  Requires-Dist: docling-core (>=1.1.2,<2.0.0)
27
- Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
28
- Requires-Dist: docling-parse (>=0.0.1,<0.0.2)
27
+ Requires-Dist: docling-ibm-models (>=1.1.1,<2.0.0)
28
+ Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
29
29
  Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
30
30
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
31
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -1,17 +1,17 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7NtzKgo,1112
4
- docling/backend/docling_parse_backend.py,sha256=mGuJCpMVqyrZK-cXKRWrELPz0Wt1h6uydx4QwWI1rew,5912
4
+ docling/backend/docling_parse_backend.py,sha256=bgsmnwDmroBwuOwkEUzlN9KMEIFJ1xUaCZW6rsr5G-c,5924
5
5
  docling/backend/pypdfium2_backend.py,sha256=tv6JxyTkTdT2qr2ghsQgYA2zgpCDxKYSdHVBTAR7FSk,7411
6
6
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- docling/datamodel/base_models.py,sha256=k7gLFPnq3ArEMAFz6qUcp5qemlYzVhOmR9qtBTkAiX4,6862
8
- docling/datamodel/document.py,sha256=FG_ntDFRBWj-MhV52D0sC8XaZOwN3yryyXahsVHGnyI,12517
7
+ docling/datamodel/base_models.py,sha256=irZLAHdsROOOwRbywKIA0mk3H8GrLwtGjOgTV6G0QoU,7004
8
+ docling/datamodel/document.py,sha256=lZHXINmPWvpzrV3PTilgJs1blqTMCnJdLEww_qfcqdE,12533
9
9
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
10
- docling/document_converter.py,sha256=I9vjTLCLahsMrcs9ozM3C5r_CtBN-9qHk7-ANma7fkc,9895
10
+ docling/document_converter.py,sha256=dMucsq6M_nwPsC1ChogVwJgNDv8sJuFklQWWinDZaug,10246
11
11
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
13
13
  docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
14
- docling/models/layout_model.py,sha256=4AfPFiu6pXc8wIQ1sQlEZnHRt7SnBmfzDdctiRveOWw,10944
14
+ docling/models/layout_model.py,sha256=3mOgNvCYPh99_oLxJy-ZaIqGOFgG5bcIQ0tTubW656Q,11204
15
15
  docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
16
16
  docling/models/table_structure_model.py,sha256=xUmfunZNYC30P0fRdESdztqy1FVlMzlhJjLBp-xcn4A,5638
17
17
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -20,7 +20,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFv
20
20
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
22
22
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
23
- docling-1.2.1.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
24
- docling-1.2.1.dist-info/METADATA,sha256=EwclgZsLCgm5qOT5na5QRxSwSKYZCIfSeotTlwMRyYk,7042
25
- docling-1.2.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
26
- docling-1.2.1.dist-info/RECORD,,
23
+ docling-1.4.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
24
+ docling-1.4.0.dist-info/METADATA,sha256=Hu8pvrxpc0b1qzQvvzI_ijRAQWjOfcfNl4_1Zb7oyoc,7042
25
+ docling-1.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
26
+ docling-1.4.0.dist-info/RECORD,,