docling 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -150,10 +150,11 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
150
150
  super().__init__(path_or_stream)
151
151
  self._pdoc = pdfium.PdfDocument(path_or_stream)
152
152
  # Parsing cells with docling_parser call
153
- if isinstance(path_or_stream, BytesIO):
154
- raise NotImplemented("This backend does not support byte streams yet.")
155
153
  parser = pdf_parser()
156
- self._parser_doc = parser.find_cells(str(path_or_stream))
154
+ if isinstance(path_or_stream, BytesIO):
155
+ self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
156
+ else:
157
+ self._parser_doc = parser.find_cells(str(path_or_stream))
157
158
 
158
159
  def page_count(self) -> int:
159
160
  return len(self._parser_doc["pages"])
@@ -14,7 +14,7 @@ from docling_core.types import TableCell
14
14
  from pydantic import BaseModel
15
15
 
16
16
  from docling.backend.abstract_backend import PdfDocumentBackend
17
- from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
17
+ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
18
18
  from docling.datamodel.base_models import (
19
19
  AssembledUnit,
20
20
  ConversionStatus,
@@ -64,7 +64,7 @@ class InputDocument(BaseModel):
64
64
  path_or_stream: Union[BytesIO, Path],
65
65
  filename: Optional[str] = None,
66
66
  limits: Optional[DocumentLimits] = None,
67
- pdf_backend=PyPdfiumDocumentBackend,
67
+ pdf_backend=DoclingParseDocumentBackend,
68
68
  ):
69
69
  super().__init__()
70
70
 
@@ -308,7 +308,7 @@ class DocumentConversionInput(BaseModel):
308
308
  _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
309
309
  limits: Optional[DocumentLimits] = DocumentLimits()
310
310
 
311
- DEFAULT_BACKEND: ClassVar = PyPdfiumDocumentBackend
311
+ DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
312
312
 
313
313
  def docs(
314
314
  self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
@@ -69,6 +69,10 @@ class LayoutModel:
69
69
  "Key-Value Region": 0.45,
70
70
  }
71
71
 
72
+ CLASS_REMAPPINGS = {
73
+ "Document Index": "Table",
74
+ }
75
+
72
76
  _log.debug("================= Start postprocess function ====================")
73
77
  start_time = time.time()
74
78
  # Apply Confidence Threshold to cluster predictions
@@ -79,6 +83,10 @@ class LayoutModel:
79
83
  confidence = CLASS_THRESHOLDS[cluster.label]
80
84
  if cluster.confidence >= confidence:
81
85
  # annotation["created_by"] = "high_conf_pred"
86
+
87
+ # Remap class labels where needed.
88
+ if cluster.label in CLASS_REMAPPINGS.keys():
89
+ cluster.label = CLASS_REMAPPINGS[cluster.label]
82
90
  clusters_out.append(cluster)
83
91
 
84
92
  # map to dictionary clusters and cells, with bottom left origin
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.3.0
3
+ Version: 1.4.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -24,8 +24,8 @@ Provides-Extra: ocr
24
24
  Requires-Dist: certifi (>=2024.7.4)
25
25
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
26
26
  Requires-Dist: docling-core (>=1.1.2,<2.0.0)
27
- Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
28
- Requires-Dist: docling-parse (>=0.0.1,<0.0.2)
27
+ Requires-Dist: docling-ibm-models (>=1.1.1,<2.0.0)
28
+ Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
29
29
  Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
30
30
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
31
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -1,17 +1,17 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7NtzKgo,1112
4
- docling/backend/docling_parse_backend.py,sha256=mGuJCpMVqyrZK-cXKRWrELPz0Wt1h6uydx4QwWI1rew,5912
4
+ docling/backend/docling_parse_backend.py,sha256=bgsmnwDmroBwuOwkEUzlN9KMEIFJ1xUaCZW6rsr5G-c,5924
5
5
  docling/backend/pypdfium2_backend.py,sha256=tv6JxyTkTdT2qr2ghsQgYA2zgpCDxKYSdHVBTAR7FSk,7411
6
6
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  docling/datamodel/base_models.py,sha256=irZLAHdsROOOwRbywKIA0mk3H8GrLwtGjOgTV6G0QoU,7004
8
- docling/datamodel/document.py,sha256=FG_ntDFRBWj-MhV52D0sC8XaZOwN3yryyXahsVHGnyI,12517
8
+ docling/datamodel/document.py,sha256=lZHXINmPWvpzrV3PTilgJs1blqTMCnJdLEww_qfcqdE,12533
9
9
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
10
10
  docling/document_converter.py,sha256=dMucsq6M_nwPsC1ChogVwJgNDv8sJuFklQWWinDZaug,10246
11
11
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
13
13
  docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
14
- docling/models/layout_model.py,sha256=4AfPFiu6pXc8wIQ1sQlEZnHRt7SnBmfzDdctiRveOWw,10944
14
+ docling/models/layout_model.py,sha256=3mOgNvCYPh99_oLxJy-ZaIqGOFgG5bcIQ0tTubW656Q,11204
15
15
  docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
16
16
  docling/models/table_structure_model.py,sha256=xUmfunZNYC30P0fRdESdztqy1FVlMzlhJjLBp-xcn4A,5638
17
17
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -20,7 +20,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFv
20
20
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
22
22
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
23
- docling-1.3.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
24
- docling-1.3.0.dist-info/METADATA,sha256=wi2DOn77z_BIMSLsrmzebYZUgpjHYWbNTOIVEY3A4-o,7042
25
- docling-1.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
26
- docling-1.3.0.dist-info/RECORD,,
23
+ docling-1.4.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
24
+ docling-1.4.0.dist-info/METADATA,sha256=Hu8pvrxpc0b1qzQvvzI_ijRAQWjOfcfNl4_1Zb7oyoc,7042
25
+ docling-1.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
26
+ docling-1.4.0.dist-info/RECORD,,