docling 1.6.3__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +7 -3
- docling/backend/docling_parse_backend.py +22 -19
- docling/backend/pypdfium2_backend.py +3 -2
- docling/datamodel/document.py +6 -2
- docling/document_converter.py +2 -0
- {docling-1.6.3.dist-info → docling-1.7.0.dist-info}/METADATA +2 -2
- {docling-1.6.3.dist-info → docling-1.7.0.dist-info}/RECORD +9 -9
- {docling-1.6.3.dist-info → docling-1.7.0.dist-info}/LICENSE +0 -0
- {docling-1.6.3.dist-info → docling-1.7.0.dist-info}/WHEEL +0 -0
docling/backend/abstract_backend.py
CHANGED
@@ -39,8 +39,9 @@ class PdfPageBackend(ABC):
|
|
39
39
|
|
40
40
|
class PdfDocumentBackend(ABC):
|
41
41
|
@abstractmethod
|
42
|
-
def __init__(self, path_or_stream: Union[BytesIO, Path]):
|
43
|
-
|
42
|
+
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
43
|
+
self.path_or_stream = path_or_stream
|
44
|
+
self.document_hash = document_hash
|
44
45
|
|
45
46
|
@abstractmethod
|
46
47
|
def load_page(self, page_no: int) -> PdfPageBackend:
|
@@ -56,4 +57,7 @@ class PdfDocumentBackend(ABC):
|
|
56
57
|
|
57
58
|
@abstractmethod
|
58
59
|
def unload(self):
|
59
|
-
|
60
|
+
if isinstance(self.path_or_stream, BytesIO):
|
61
|
+
self.path_or_stream.close()
|
62
|
+
|
63
|
+
self.path_or_stream = None
|
docling/backend/docling_parse_backend.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
import logging
|
2
2
|
import random
|
3
|
-
import time
|
4
3
|
from io import BytesIO
|
5
4
|
from pathlib import Path
|
6
5
|
from typing import Iterable, Optional, Union
|
@@ -17,11 +16,14 @@ _log = logging.getLogger(__name__)
|
|
17
16
|
|
18
17
|
|
19
18
|
class DoclingParsePageBackend(PdfPageBackend):
|
20
|
-
def __init__(
|
19
|
+
def __init__(
|
20
|
+
self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
|
21
|
+
):
|
21
22
|
super().__init__(page_obj)
|
22
23
|
self._ppage = page_obj
|
23
|
-
|
24
|
-
|
24
|
+
|
25
|
+
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
|
26
|
+
self._dpage = parsed_page["pages"][0]
|
25
27
|
|
26
28
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
27
29
|
# Find intersecting cells on the page
|
@@ -168,38 +170,39 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
168
170
|
def unload(self):
|
169
171
|
self._ppage = None
|
170
172
|
self._dpage = None
|
171
|
-
self.text_page = None
|
172
173
|
|
173
174
|
|
174
175
|
class DoclingParseDocumentBackend(PdfDocumentBackend):
|
175
|
-
def __init__(self, path_or_stream: Union[BytesIO, Path]):
|
176
|
-
super().__init__(path_or_stream)
|
177
|
-
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
178
|
-
# Parsing cells with docling_parser call
|
179
|
-
parser = pdf_parser()
|
176
|
+
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
177
|
+
super().__init__(path_or_stream, document_hash)
|
180
178
|
|
181
|
-
|
179
|
+
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
180
|
+
self.parser = pdf_parser()
|
182
181
|
|
182
|
+
success = False
|
183
183
|
if isinstance(path_or_stream, BytesIO):
|
184
|
-
|
185
|
-
|
186
|
-
|
184
|
+
success = self.parser.load_document_from_bytesio(
|
185
|
+
document_hash, path_or_stream
|
186
|
+
)
|
187
|
+
elif isinstance(path_or_stream, Path):
|
188
|
+
success = self.parser.load_document(document_hash, str(path_or_stream))
|
187
189
|
|
188
|
-
|
189
|
-
|
190
|
+
if not success:
|
191
|
+
raise RuntimeError("docling-parse could not load this document.")
|
190
192
|
|
191
193
|
def page_count(self) -> int:
|
192
|
-
return len(self.
|
194
|
+
return len(self._pdoc) # To be replaced with docling-parse API
|
193
195
|
|
194
196
|
def load_page(self, page_no: int) -> DoclingParsePageBackend:
|
195
197
|
return DoclingParsePageBackend(
|
196
|
-
self.
|
198
|
+
self.parser, self.document_hash, page_no, self._pdoc[page_no]
|
197
199
|
)
|
198
200
|
|
199
201
|
def is_valid(self) -> bool:
|
200
202
|
return self.page_count() > 0
|
201
203
|
|
202
204
|
def unload(self):
|
205
|
+
super().unload()
|
206
|
+
self.parser.unload_document(self.document_hash)
|
203
207
|
self._pdoc.close()
|
204
208
|
self._pdoc = None
|
205
|
-
self._parser_doc = None
|
docling/backend/pypdfium2_backend.py
CHANGED
@@ -215,8 +215,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
215
215
|
|
216
216
|
|
217
217
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
218
|
-
def __init__(self, path_or_stream: Union[BytesIO, Path]):
|
219
|
-
super().__init__(path_or_stream)
|
218
|
+
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
219
|
+
super().__init__(path_or_stream, document_hash)
|
220
220
|
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
221
221
|
|
222
222
|
def page_count(self) -> int:
|
@@ -229,5 +229,6 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
|
229
229
|
return self.page_count() > 0
|
230
230
|
|
231
231
|
def unload(self):
|
232
|
+
super().unload()
|
232
233
|
self._pdoc.close()
|
233
234
|
self._pdoc = None
|
docling/datamodel/document.py
CHANGED
@@ -79,7 +79,9 @@ class InputDocument(BaseModel):
|
|
79
79
|
self.valid = False
|
80
80
|
else:
|
81
81
|
self.document_hash = create_file_hash(path_or_stream)
|
82
|
-
self._backend = pdf_backend(
|
82
|
+
self._backend = pdf_backend(
|
83
|
+
path_or_stream=path_or_stream, document_hash=self.document_hash
|
84
|
+
)
|
83
85
|
|
84
86
|
elif isinstance(path_or_stream, BytesIO):
|
85
87
|
self.file = PurePath(filename)
|
@@ -89,7 +91,9 @@ class InputDocument(BaseModel):
|
|
89
91
|
self.valid = False
|
90
92
|
else:
|
91
93
|
self.document_hash = create_file_hash(path_or_stream)
|
92
|
-
self._backend = pdf_backend(
|
94
|
+
self._backend = pdf_backend(
|
95
|
+
path_or_stream=path_or_stream, document_hash=self.document_hash
|
96
|
+
)
|
93
97
|
|
94
98
|
if self.document_hash and self._backend.page_count() > 0:
|
95
99
|
self.page_count = self._backend.page_count()
|
docling/document_converter.py
CHANGED
@@ -141,6 +141,8 @@ class DocumentConverter:
|
|
141
141
|
start_doc_time = time.time()
|
142
142
|
converted_doc = ConvertedDocument(input=in_doc)
|
143
143
|
|
144
|
+
_log.info(f"Processing document {in_doc.file.name}")
|
145
|
+
|
144
146
|
if not in_doc.valid:
|
145
147
|
converted_doc.status = ConversionStatus.FAILURE
|
146
148
|
return converted_doc
|
{docling-1.6.3.dist-info → docling-1.7.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.6.3
|
3
|
+
Version: 1.7.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -23,7 +23,7 @@ Requires-Dist: certifi (>=2024.7.4)
|
|
23
23
|
Requires-Dist: deepsearch-glm (>=0.19.0,<1)
|
24
24
|
Requires-Dist: docling-core (>=1.1.2,<2.0.0)
|
25
25
|
Requires-Dist: docling-ibm-models (>=1.1.2,<2.0.0)
|
26
|
-
Requires-Dist: docling-parse (>=0.2.0
|
26
|
+
Requires-Dist: docling-parse (>=1.0.0,<2.0.0)
|
27
27
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
28
28
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
29
29
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
{docling-1.6.3.dist-info → docling-1.7.0.dist-info}/RECORD
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
docling/backend/abstract_backend.py,sha256=
|
4
|
-
docling/backend/docling_parse_backend.py,sha256=
|
5
|
-
docling/backend/pypdfium2_backend.py,sha256=
|
3
|
+
docling/backend/abstract_backend.py,sha256=wvrywm1pPt79L5Dt_da5QGmf9dDzjBGj1rSaUQxqI8s,1432
|
4
|
+
docling/backend/docling_parse_backend.py,sha256=E4JFpWBamiU8vRmHvgMZLU7lOxtUMq85TV5hMrGvWJI,7070
|
5
|
+
docling/backend/pypdfium2_backend.py,sha256=84AnFah8Ztk-j8_9MTHalPU3a9fClrEz7A_rfzWDkFc,8122
|
6
6
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
7
|
docling/datamodel/base_models.py,sha256=5VHit5h7OleKnbhvy-sWDxQLizEdNrGUBrypyzwHyAE,8604
|
8
|
-
docling/datamodel/document.py,sha256=
|
8
|
+
docling/datamodel/document.py,sha256=J97KeT8fJRKijUorDky-xA2FoOGBXOjrReYjdeo8NK4,13333
|
9
9
|
docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
|
10
|
-
docling/document_converter.py,sha256=
|
10
|
+
docling/document_converter.py,sha256=Tx4BwtOxpwizmXgJl2nK6I-6m0V3fygHwYFomJTH2Ns,10433
|
11
11
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
|
13
13
|
docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
|
@@ -21,7 +21,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjg
|
|
21
21
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
22
|
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
23
23
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
24
|
-
docling-1.
|
25
|
-
docling-1.
|
26
|
-
docling-1.
|
27
|
-
docling-1.
|
24
|
+
docling-1.7.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
|
25
|
+
docling-1.7.0.dist-info/METADATA,sha256=yqKqPH3w2IxOhmdqHy0nJcYKDrJAufx6uF1Ti44pRs4,7229
|
26
|
+
docling-1.7.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
27
|
+
docling-1.7.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|