docling 1.6.3__py3-none-any.whl → 1.7.1__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +7 -3
- docling/backend/docling_parse_backend.py +35 -19
- docling/backend/pypdfium2_backend.py +3 -2
- docling/datamodel/document.py +6 -2
- docling/document_converter.py +2 -0
- {docling-1.6.3.dist-info → docling-1.7.1.dist-info}/METADATA +2 -2
- {docling-1.6.3.dist-info → docling-1.7.1.dist-info}/RECORD +9 -9
- {docling-1.6.3.dist-info → docling-1.7.1.dist-info}/LICENSE +0 -0
- {docling-1.6.3.dist-info → docling-1.7.1.dist-info}/WHEEL +0 -0
@@ -39,8 +39,9 @@ class PdfPageBackend(ABC):
|
|
39
39
|
|
40
40
|
class PdfDocumentBackend(ABC):
|
41
41
|
@abstractmethod
|
42
|
-
def __init__(self, path_or_stream: Union[BytesIO, Path]):
|
43
|
-
|
42
|
+
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
43
|
+
self.path_or_stream = path_or_stream
|
44
|
+
self.document_hash = document_hash
|
44
45
|
|
45
46
|
@abstractmethod
|
46
47
|
def load_page(self, page_no: int) -> PdfPageBackend:
|
@@ -56,4 +57,7 @@ class PdfDocumentBackend(ABC):
|
|
56
57
|
|
57
58
|
@abstractmethod
|
58
59
|
def unload(self):
|
59
|
-
|
60
|
+
if isinstance(self.path_or_stream, BytesIO):
|
61
|
+
self.path_or_stream.close()
|
62
|
+
|
63
|
+
self.path_or_stream = None
|
@@ -1,6 +1,5 @@
|
|
1
1
|
import logging
|
2
2
|
import random
|
3
|
-
import time
|
4
3
|
from io import BytesIO
|
5
4
|
from pathlib import Path
|
6
5
|
from typing import Iterable, Optional, Union
|
@@ -17,13 +16,26 @@ _log = logging.getLogger(__name__)
|
|
17
16
|
|
18
17
|
|
19
18
|
class DoclingParsePageBackend(PdfPageBackend):
|
20
|
-
def __init__(
|
19
|
+
def __init__(
|
20
|
+
self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
|
21
|
+
):
|
21
22
|
super().__init__(page_obj)
|
22
23
|
self._ppage = page_obj
|
23
|
-
|
24
|
-
|
24
|
+
|
25
|
+
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
|
26
|
+
|
27
|
+
self._dpage = None
|
28
|
+
self.broken_page = "pages" not in parsed_page
|
29
|
+
if not self.broken_page:
|
30
|
+
self._dpage = parsed_page["pages"][0]
|
31
|
+
else:
|
32
|
+
raise RuntimeError(
|
33
|
+
f"Page {page_no} of document {document_hash} could not be parsed."
|
34
|
+
)
|
25
35
|
|
26
36
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
37
|
+
if self.broken_page:
|
38
|
+
return ""
|
27
39
|
# Find intersecting cells on the page
|
28
40
|
text_piece = ""
|
29
41
|
page_size = self.get_size()
|
@@ -58,6 +70,9 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
58
70
|
cells = []
|
59
71
|
cell_counter = 0
|
60
72
|
|
73
|
+
if self.broken_page:
|
74
|
+
return cells
|
75
|
+
|
61
76
|
page_size = self.get_size()
|
62
77
|
|
63
78
|
parser_width = self._dpage["width"]
|
@@ -168,38 +183,39 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
168
183
|
def unload(self):
|
169
184
|
self._ppage = None
|
170
185
|
self._dpage = None
|
171
|
-
self.text_page = None
|
172
186
|
|
173
187
|
|
174
188
|
class DoclingParseDocumentBackend(PdfDocumentBackend):
|
175
|
-
def __init__(self, path_or_stream: Union[BytesIO, Path]):
|
176
|
-
super().__init__(path_or_stream)
|
177
|
-
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
178
|
-
# Parsing cells with docling_parser call
|
179
|
-
parser = pdf_parser()
|
189
|
+
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
190
|
+
super().__init__(path_or_stream, document_hash)
|
180
191
|
|
181
|
-
|
192
|
+
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
193
|
+
self.parser = pdf_parser()
|
182
194
|
|
195
|
+
success = False
|
183
196
|
if isinstance(path_or_stream, BytesIO):
|
184
|
-
|
185
|
-
|
186
|
-
|
197
|
+
success = self.parser.load_document_from_bytesio(
|
198
|
+
document_hash, path_or_stream
|
199
|
+
)
|
200
|
+
elif isinstance(path_or_stream, Path):
|
201
|
+
success = self.parser.load_document(document_hash, str(path_or_stream))
|
187
202
|
|
188
|
-
|
189
|
-
|
203
|
+
if not success:
|
204
|
+
raise RuntimeError("docling-parse could not load this document.")
|
190
205
|
|
191
206
|
def page_count(self) -> int:
|
192
|
-
return len(self.
|
207
|
+
return len(self._pdoc) # To be replaced with docling-parse API
|
193
208
|
|
194
209
|
def load_page(self, page_no: int) -> DoclingParsePageBackend:
|
195
210
|
return DoclingParsePageBackend(
|
196
|
-
self.
|
211
|
+
self.parser, self.document_hash, page_no, self._pdoc[page_no]
|
197
212
|
)
|
198
213
|
|
199
214
|
def is_valid(self) -> bool:
|
200
215
|
return self.page_count() > 0
|
201
216
|
|
202
217
|
def unload(self):
|
218
|
+
super().unload()
|
219
|
+
self.parser.unload_document(self.document_hash)
|
203
220
|
self._pdoc.close()
|
204
221
|
self._pdoc = None
|
205
|
-
self._parser_doc = None
|
@@ -215,8 +215,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
215
215
|
|
216
216
|
|
217
217
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
218
|
-
def __init__(self, path_or_stream: Union[BytesIO, Path]):
|
219
|
-
super().__init__(path_or_stream)
|
218
|
+
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
219
|
+
super().__init__(path_or_stream, document_hash)
|
220
220
|
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
221
221
|
|
222
222
|
def page_count(self) -> int:
|
@@ -229,5 +229,6 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
|
229
229
|
return self.page_count() > 0
|
230
230
|
|
231
231
|
def unload(self):
|
232
|
+
super().unload()
|
232
233
|
self._pdoc.close()
|
233
234
|
self._pdoc = None
|
docling/datamodel/document.py
CHANGED
@@ -79,7 +79,9 @@ class InputDocument(BaseModel):
|
|
79
79
|
self.valid = False
|
80
80
|
else:
|
81
81
|
self.document_hash = create_file_hash(path_or_stream)
|
82
|
-
self._backend = pdf_backend(
|
82
|
+
self._backend = pdf_backend(
|
83
|
+
path_or_stream=path_or_stream, document_hash=self.document_hash
|
84
|
+
)
|
83
85
|
|
84
86
|
elif isinstance(path_or_stream, BytesIO):
|
85
87
|
self.file = PurePath(filename)
|
@@ -89,7 +91,9 @@ class InputDocument(BaseModel):
|
|
89
91
|
self.valid = False
|
90
92
|
else:
|
91
93
|
self.document_hash = create_file_hash(path_or_stream)
|
92
|
-
self._backend = pdf_backend(
|
94
|
+
self._backend = pdf_backend(
|
95
|
+
path_or_stream=path_or_stream, document_hash=self.document_hash
|
96
|
+
)
|
93
97
|
|
94
98
|
if self.document_hash and self._backend.page_count() > 0:
|
95
99
|
self.page_count = self._backend.page_count()
|
docling/document_converter.py
CHANGED
@@ -141,6 +141,8 @@ class DocumentConverter:
|
|
141
141
|
start_doc_time = time.time()
|
142
142
|
converted_doc = ConvertedDocument(input=in_doc)
|
143
143
|
|
144
|
+
_log.info(f"Processing document {in_doc.file.name}")
|
145
|
+
|
144
146
|
if not in_doc.valid:
|
145
147
|
converted_doc.status = ConversionStatus.FAILURE
|
146
148
|
return converted_doc
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.6.3
|
3
|
+
Version: 1.7.1
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -23,7 +23,7 @@ Requires-Dist: certifi (>=2024.7.4)
|
|
23
23
|
Requires-Dist: deepsearch-glm (>=0.19.0,<1)
|
24
24
|
Requires-Dist: docling-core (>=1.1.2,<2.0.0)
|
25
25
|
Requires-Dist: docling-ibm-models (>=1.1.2,<2.0.0)
|
26
|
-
Requires-Dist: docling-parse (>=
|
26
|
+
Requires-Dist: docling-parse (>=1.1.1,<2.0.0)
|
27
27
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
28
28
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
29
29
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
@@ -1,13 +1,13 @@
|
|
1
1
|
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
docling/backend/abstract_backend.py,sha256=
|
4
|
-
docling/backend/docling_parse_backend.py,sha256=
|
5
|
-
docling/backend/pypdfium2_backend.py,sha256=
|
3
|
+
docling/backend/abstract_backend.py,sha256=wvrywm1pPt79L5Dt_da5QGmf9dDzjBGj1rSaUQxqI8s,1432
|
4
|
+
docling/backend/docling_parse_backend.py,sha256=hXyF2VPPdLs7APWEXTlfz0wI86rUGYa67Q73zgTB-Ug,7438
|
5
|
+
docling/backend/pypdfium2_backend.py,sha256=84AnFah8Ztk-j8_9MTHalPU3a9fClrEz7A_rfzWDkFc,8122
|
6
6
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
7
|
docling/datamodel/base_models.py,sha256=5VHit5h7OleKnbhvy-sWDxQLizEdNrGUBrypyzwHyAE,8604
|
8
|
-
docling/datamodel/document.py,sha256=
|
8
|
+
docling/datamodel/document.py,sha256=J97KeT8fJRKijUorDky-xA2FoOGBXOjrReYjdeo8NK4,13333
|
9
9
|
docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
|
10
|
-
docling/document_converter.py,sha256=
|
10
|
+
docling/document_converter.py,sha256=Tx4BwtOxpwizmXgJl2nK6I-6m0V3fygHwYFomJTH2Ns,10433
|
11
11
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
|
13
13
|
docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
|
@@ -21,7 +21,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjg
|
|
21
21
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
22
|
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
23
23
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
24
|
-
docling-1.
|
25
|
-
docling-1.
|
26
|
-
docling-1.
|
27
|
-
docling-1.
|
24
|
+
docling-1.7.1.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
|
25
|
+
docling-1.7.1.dist-info/METADATA,sha256=ADdVabYgc4VEIGKhM-tI6XBU_CG9tzKl_au69TZ9LbY,7229
|
26
|
+
docling-1.7.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
27
|
+
docling-1.7.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|