docling 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_backend.py +4 -3
- docling/datamodel/document.py +3 -3
- docling/models/layout_model.py +8 -0
- {docling-1.3.0.dist-info → docling-1.4.0.dist-info}/METADATA +3 -3
- {docling-1.3.0.dist-info → docling-1.4.0.dist-info}/RECORD +7 -7
- {docling-1.3.0.dist-info → docling-1.4.0.dist-info}/LICENSE +0 -0
- {docling-1.3.0.dist-info → docling-1.4.0.dist-info}/WHEEL +0 -0
@@ -150,10 +150,11 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
|
|
150
150
|
super().__init__(path_or_stream)
|
151
151
|
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
152
152
|
# Parsing cells with docling_parser call
|
153
|
-
if isinstance(path_or_stream, BytesIO):
|
154
|
-
raise NotImplemented("This backend does not support byte streams yet.")
|
155
153
|
parser = pdf_parser()
|
156
|
-
|
154
|
+
if isinstance(path_or_stream, BytesIO):
|
155
|
+
self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
|
156
|
+
else:
|
157
|
+
self._parser_doc = parser.find_cells(str(path_or_stream))
|
157
158
|
|
158
159
|
def page_count(self) -> int:
|
159
160
|
return len(self._parser_doc["pages"])
|
docling/datamodel/document.py
CHANGED
@@ -14,7 +14,7 @@ from docling_core.types import TableCell
|
|
14
14
|
from pydantic import BaseModel
|
15
15
|
|
16
16
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
17
|
-
from docling.backend.
|
17
|
+
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
18
18
|
from docling.datamodel.base_models import (
|
19
19
|
AssembledUnit,
|
20
20
|
ConversionStatus,
|
@@ -64,7 +64,7 @@ class InputDocument(BaseModel):
|
|
64
64
|
path_or_stream: Union[BytesIO, Path],
|
65
65
|
filename: Optional[str] = None,
|
66
66
|
limits: Optional[DocumentLimits] = None,
|
67
|
-
pdf_backend=
|
67
|
+
pdf_backend=DoclingParseDocumentBackend,
|
68
68
|
):
|
69
69
|
super().__init__()
|
70
70
|
|
@@ -308,7 +308,7 @@ class DocumentConversionInput(BaseModel):
|
|
308
308
|
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
|
309
309
|
limits: Optional[DocumentLimits] = DocumentLimits()
|
310
310
|
|
311
|
-
DEFAULT_BACKEND: ClassVar =
|
311
|
+
DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
|
312
312
|
|
313
313
|
def docs(
|
314
314
|
self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
|
docling/models/layout_model.py
CHANGED
@@ -69,6 +69,10 @@ class LayoutModel:
|
|
69
69
|
"Key-Value Region": 0.45,
|
70
70
|
}
|
71
71
|
|
72
|
+
CLASS_REMAPPINGS = {
|
73
|
+
"Document Index": "Table",
|
74
|
+
}
|
75
|
+
|
72
76
|
_log.debug("================= Start postprocess function ====================")
|
73
77
|
start_time = time.time()
|
74
78
|
# Apply Confidence Threshold to cluster predictions
|
@@ -79,6 +83,10 @@ class LayoutModel:
|
|
79
83
|
confidence = CLASS_THRESHOLDS[cluster.label]
|
80
84
|
if cluster.confidence >= confidence:
|
81
85
|
# annotation["created_by"] = "high_conf_pred"
|
86
|
+
|
87
|
+
# Remap class labels where needed.
|
88
|
+
if cluster.label in CLASS_REMAPPINGS.keys():
|
89
|
+
cluster.label = CLASS_REMAPPINGS[cluster.label]
|
82
90
|
clusters_out.append(cluster)
|
83
91
|
|
84
92
|
# map to dictionary clusters and cells, with bottom left origin
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.3.0
|
3
|
+
Version: 1.4.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -24,8 +24,8 @@ Provides-Extra: ocr
|
|
24
24
|
Requires-Dist: certifi (>=2024.7.4)
|
25
25
|
Requires-Dist: deepsearch-glm (>=0.19.0,<1)
|
26
26
|
Requires-Dist: docling-core (>=1.1.2,<2.0.0)
|
27
|
-
Requires-Dist: docling-ibm-models (>=1.1.
|
28
|
-
Requires-Dist: docling-parse (>=0.0
|
27
|
+
Requires-Dist: docling-ibm-models (>=1.1.1,<2.0.0)
|
28
|
+
Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
|
29
29
|
Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
|
30
30
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
31
31
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
@@ -1,17 +1,17 @@
|
|
1
1
|
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
3
|
docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7NtzKgo,1112
|
4
|
-
docling/backend/docling_parse_backend.py,sha256=
|
4
|
+
docling/backend/docling_parse_backend.py,sha256=bgsmnwDmroBwuOwkEUzlN9KMEIFJ1xUaCZW6rsr5G-c,5924
|
5
5
|
docling/backend/pypdfium2_backend.py,sha256=tv6JxyTkTdT2qr2ghsQgYA2zgpCDxKYSdHVBTAR7FSk,7411
|
6
6
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
7
|
docling/datamodel/base_models.py,sha256=irZLAHdsROOOwRbywKIA0mk3H8GrLwtGjOgTV6G0QoU,7004
|
8
|
-
docling/datamodel/document.py,sha256=
|
8
|
+
docling/datamodel/document.py,sha256=lZHXINmPWvpzrV3PTilgJs1blqTMCnJdLEww_qfcqdE,12533
|
9
9
|
docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
|
10
10
|
docling/document_converter.py,sha256=dMucsq6M_nwPsC1ChogVwJgNDv8sJuFklQWWinDZaug,10246
|
11
11
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
|
13
13
|
docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
|
14
|
-
docling/models/layout_model.py,sha256=
|
14
|
+
docling/models/layout_model.py,sha256=3mOgNvCYPh99_oLxJy-ZaIqGOFgG5bcIQ0tTubW656Q,11204
|
15
15
|
docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
|
16
16
|
docling/models/table_structure_model.py,sha256=xUmfunZNYC30P0fRdESdztqy1FVlMzlhJjLBp-xcn4A,5638
|
17
17
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -20,7 +20,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFv
|
|
20
20
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
21
|
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
22
22
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
23
|
-
docling-1.3.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
|
24
|
-
docling-1.3.0.dist-info/METADATA,sha256=
|
25
|
-
docling-1.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
26
|
-
docling-1.3.0.dist-info/RECORD,,
|
23
|
+
docling-1.4.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
|
24
|
+
docling-1.4.0.dist-info/METADATA,sha256=Hu8pvrxpc0b1qzQvvzI_ijRAQWjOfcfNl4_1Zb7oyoc,7042
|
25
|
+
docling-1.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
26
|
+
docling-1.4.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|