docling-1.2.1-py3-none-any.whl → docling-1.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_backend.py +4 -3
- docling/datamodel/base_models.py +6 -0
- docling/datamodel/document.py +3 -3
- docling/document_converter.py +13 -4
- docling/models/layout_model.py +8 -0
- {docling-1.2.1.dist-info → docling-1.4.0.dist-info}/METADATA +3 -3
- {docling-1.2.1.dist-info → docling-1.4.0.dist-info}/RECORD +9 -9
- {docling-1.2.1.dist-info → docling-1.4.0.dist-info}/LICENSE +0 -0
- {docling-1.2.1.dist-info → docling-1.4.0.dist-info}/WHEEL +0 -0
@@ -150,10 +150,11 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
|
|
150
150
|
super().__init__(path_or_stream)
|
151
151
|
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
152
152
|
# Parsing cells with docling_parser call
|
153
|
-
if isinstance(path_or_stream, BytesIO):
|
154
|
-
raise NotImplemented("This backend does not support byte streams yet.")
|
155
153
|
parser = pdf_parser()
|
156
|
-
|
154
|
+
if isinstance(path_or_stream, BytesIO):
|
155
|
+
self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
|
156
|
+
else:
|
157
|
+
self._parser_doc = parser.find_cells(str(path_or_stream))
|
157
158
|
|
158
159
|
def page_count(self) -> int:
|
159
160
|
return len(self._parser_doc["pages"])
|
docling/datamodel/base_models.py
CHANGED
@@ -265,3 +265,9 @@ class PipelineOptions(BaseModel):
|
|
265
265
|
do_ocr: bool = False # True: perform OCR, replace programmatic PDF text
|
266
266
|
|
267
267
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
268
|
+
|
269
|
+
|
270
|
+
class AssembleOptions(BaseModel):
|
271
|
+
keep_page_images: bool = (
|
272
|
+
False # False: page images are removed in the assemble step
|
273
|
+
)
|
docling/datamodel/document.py
CHANGED
@@ -14,7 +14,7 @@ from docling_core.types import TableCell
|
|
14
14
|
from pydantic import BaseModel
|
15
15
|
|
16
16
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
17
|
-
from docling.backend.
|
17
|
+
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
18
18
|
from docling.datamodel.base_models import (
|
19
19
|
AssembledUnit,
|
20
20
|
ConversionStatus,
|
@@ -64,7 +64,7 @@ class InputDocument(BaseModel):
|
|
64
64
|
path_or_stream: Union[BytesIO, Path],
|
65
65
|
filename: Optional[str] = None,
|
66
66
|
limits: Optional[DocumentLimits] = None,
|
67
|
-
pdf_backend=
|
67
|
+
pdf_backend=DoclingParseDocumentBackend,
|
68
68
|
):
|
69
69
|
super().__init__()
|
70
70
|
|
@@ -308,7 +308,7 @@ class DocumentConversionInput(BaseModel):
|
|
308
308
|
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
|
309
309
|
limits: Optional[DocumentLimits] = DocumentLimits()
|
310
310
|
|
311
|
-
DEFAULT_BACKEND: ClassVar =
|
311
|
+
DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
|
312
312
|
|
313
313
|
def docs(
|
314
314
|
self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
|
docling/document_converter.py
CHANGED
@@ -14,6 +14,7 @@ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
|
14
14
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
15
15
|
from docling.datamodel.base_models import (
|
16
16
|
AssembledUnit,
|
17
|
+
AssembleOptions,
|
17
18
|
ConversionStatus,
|
18
19
|
Page,
|
19
20
|
PipelineOptions,
|
@@ -44,6 +45,7 @@ class DocumentConverter:
|
|
44
45
|
pipeline_options: PipelineOptions = PipelineOptions(),
|
45
46
|
pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
|
46
47
|
pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
|
48
|
+
assemble_options: AssembleOptions = AssembleOptions(),
|
47
49
|
):
|
48
50
|
if not artifacts_path:
|
49
51
|
artifacts_path = self.download_models_hf()
|
@@ -57,6 +59,7 @@ class DocumentConverter:
|
|
57
59
|
self.page_assemble_model = PageAssembleModel(config={})
|
58
60
|
self.glm_model = GlmModel(config={})
|
59
61
|
self.pdf_backend = pdf_backend
|
62
|
+
self.assemble_options = assemble_options
|
60
63
|
|
61
64
|
@staticmethod
|
62
65
|
def download_models_hf(
|
@@ -174,17 +177,23 @@ class DocumentConverter:
|
|
174
177
|
pages_with_images,
|
175
178
|
)
|
176
179
|
|
180
|
+
# 4. Run pipeline stages
|
177
181
|
pipeline_pages = self.model_pipeline.apply(pages_with_cells)
|
178
182
|
|
179
|
-
#
|
183
|
+
# 5. Assemble page elements (per page)
|
180
184
|
assembled_pages = self.page_assemble_model(pipeline_pages)
|
181
185
|
|
182
186
|
# exhaust assembled_pages
|
183
187
|
for assembled_page in assembled_pages:
|
184
188
|
# Free up mem resources before moving on with next batch
|
185
|
-
|
186
|
-
|
187
|
-
|
189
|
+
|
190
|
+
# Remove page images (can be disabled)
|
191
|
+
if not self.assemble_options.keep_page_images:
|
192
|
+
assembled_page.image = (
|
193
|
+
None # Comment this if you want to visualize page images
|
194
|
+
)
|
195
|
+
|
196
|
+
# Unload backend
|
188
197
|
assembled_page._backend.unload()
|
189
198
|
|
190
199
|
all_assembled_pages.append(assembled_page)
|
docling/models/layout_model.py
CHANGED
@@ -69,6 +69,10 @@ class LayoutModel:
|
|
69
69
|
"Key-Value Region": 0.45,
|
70
70
|
}
|
71
71
|
|
72
|
+
CLASS_REMAPPINGS = {
|
73
|
+
"Document Index": "Table",
|
74
|
+
}
|
75
|
+
|
72
76
|
_log.debug("================= Start postprocess function ====================")
|
73
77
|
start_time = time.time()
|
74
78
|
# Apply Confidence Threshold to cluster predictions
|
@@ -79,6 +83,10 @@ class LayoutModel:
|
|
79
83
|
confidence = CLASS_THRESHOLDS[cluster.label]
|
80
84
|
if cluster.confidence >= confidence:
|
81
85
|
# annotation["created_by"] = "high_conf_pred"
|
86
|
+
|
87
|
+
# Remap class labels where needed.
|
88
|
+
if cluster.label in CLASS_REMAPPINGS.keys():
|
89
|
+
cluster.label = CLASS_REMAPPINGS[cluster.label]
|
82
90
|
clusters_out.append(cluster)
|
83
91
|
|
84
92
|
# map to dictionary clusters and cells, with bottom left origin
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.2.1
|
3
|
+
Version: 1.4.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -24,8 +24,8 @@ Provides-Extra: ocr
|
|
24
24
|
Requires-Dist: certifi (>=2024.7.4)
|
25
25
|
Requires-Dist: deepsearch-glm (>=0.19.0,<1)
|
26
26
|
Requires-Dist: docling-core (>=1.1.2,<2.0.0)
|
27
|
-
Requires-Dist: docling-ibm-models (>=1.1.
|
28
|
-
Requires-Dist: docling-parse (>=0.0
|
27
|
+
Requires-Dist: docling-ibm-models (>=1.1.1,<2.0.0)
|
28
|
+
Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
|
29
29
|
Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
|
30
30
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
31
31
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
@@ -1,17 +1,17 @@
|
|
1
1
|
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
3
|
docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7NtzKgo,1112
|
4
|
-
docling/backend/docling_parse_backend.py,sha256=
|
4
|
+
docling/backend/docling_parse_backend.py,sha256=bgsmnwDmroBwuOwkEUzlN9KMEIFJ1xUaCZW6rsr5G-c,5924
|
5
5
|
docling/backend/pypdfium2_backend.py,sha256=tv6JxyTkTdT2qr2ghsQgYA2zgpCDxKYSdHVBTAR7FSk,7411
|
6
6
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
docling/datamodel/base_models.py,sha256=
|
8
|
-
docling/datamodel/document.py,sha256=
|
7
|
+
docling/datamodel/base_models.py,sha256=irZLAHdsROOOwRbywKIA0mk3H8GrLwtGjOgTV6G0QoU,7004
|
8
|
+
docling/datamodel/document.py,sha256=lZHXINmPWvpzrV3PTilgJs1blqTMCnJdLEww_qfcqdE,12533
|
9
9
|
docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
|
10
|
-
docling/document_converter.py,sha256=
|
10
|
+
docling/document_converter.py,sha256=dMucsq6M_nwPsC1ChogVwJgNDv8sJuFklQWWinDZaug,10246
|
11
11
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
|
13
13
|
docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
|
14
|
-
docling/models/layout_model.py,sha256=
|
14
|
+
docling/models/layout_model.py,sha256=3mOgNvCYPh99_oLxJy-ZaIqGOFgG5bcIQ0tTubW656Q,11204
|
15
15
|
docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
|
16
16
|
docling/models/table_structure_model.py,sha256=xUmfunZNYC30P0fRdESdztqy1FVlMzlhJjLBp-xcn4A,5638
|
17
17
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -20,7 +20,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFv
|
|
20
20
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
21
|
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
22
22
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
23
|
-
docling-1.2.1.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
|
24
|
-
docling-1.
|
25
|
-
docling-1.2.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
26
|
-
docling-1.2.1.dist-info/RECORD,,
|
23
|
+
docling-1.4.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
|
24
|
+
docling-1.4.0.dist-info/METADATA,sha256=Hu8pvrxpc0b1qzQvvzI_ijRAQWjOfcfNl4_1Zb7oyoc,7042
|
25
|
+
docling-1.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
26
|
+
docling-1.4.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|