docling 1.7.1__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +4 -2
- docling/backend/docling_parse_backend.py +12 -9
- docling/backend/pypdfium2_backend.py +26 -5
- docling/datamodel/base_models.py +13 -1
- docling/datamodel/document.py +2 -1
- docling/document_converter.py +22 -4
- {docling-1.7.1.dist-info → docling-1.8.0.dist-info}/METADATA +1 -1
- {docling-1.7.1.dist-info → docling-1.8.0.dist-info}/RECORD +10 -10
- {docling-1.7.1.dist-info → docling-1.8.0.dist-info}/LICENSE +0 -0
- {docling-1.7.1.dist-info → docling-1.8.0.dist-info}/WHEEL +0 -0
@@ -7,8 +7,6 @@ from PIL import Image
|
|
7
7
|
|
8
8
|
|
9
9
|
class PdfPageBackend(ABC):
|
10
|
-
def __init__(self, page_obj: Any) -> object:
|
11
|
-
pass
|
12
10
|
|
13
11
|
@abstractmethod
|
14
12
|
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
|
@@ -32,6 +30,10 @@ class PdfPageBackend(ABC):
|
|
32
30
|
def get_size(self) -> "PageSize":
|
33
31
|
pass
|
34
32
|
|
33
|
+
@abstractmethod
|
34
|
+
def is_valid(self) -> bool:
|
35
|
+
pass
|
36
|
+
|
35
37
|
@abstractmethod
|
36
38
|
def unload(self):
|
37
39
|
pass
|
@@ -19,22 +19,23 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
19
19
|
def __init__(
|
20
20
|
self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
|
21
21
|
):
|
22
|
-
super().__init__(page_obj)
|
23
22
|
self._ppage = page_obj
|
24
|
-
|
25
23
|
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
|
26
24
|
|
27
25
|
self._dpage = None
|
28
|
-
self.
|
29
|
-
if
|
26
|
+
self.valid = "pages" in parsed_page
|
27
|
+
if self.valid:
|
30
28
|
self._dpage = parsed_page["pages"][0]
|
31
29
|
else:
|
32
|
-
|
33
|
-
f"
|
30
|
+
_log.info(
|
31
|
+
f"An error occured when loading page {page_no} of document {document_hash}."
|
34
32
|
)
|
35
33
|
|
34
|
+
def is_valid(self) -> bool:
|
35
|
+
return self.valid
|
36
|
+
|
36
37
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
37
|
-
if self.
|
38
|
+
if not self.valid:
|
38
39
|
return ""
|
39
40
|
# Find intersecting cells on the page
|
40
41
|
text_piece = ""
|
@@ -70,7 +71,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
70
71
|
cells = []
|
71
72
|
cell_counter = 0
|
72
73
|
|
73
|
-
if self.
|
74
|
+
if not self.valid:
|
74
75
|
return cells
|
75
76
|
|
76
77
|
page_size = self.get_size()
|
@@ -201,7 +202,9 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
|
|
201
202
|
success = self.parser.load_document(document_hash, str(path_or_stream))
|
202
203
|
|
203
204
|
if not success:
|
204
|
-
raise RuntimeError(
|
205
|
+
raise RuntimeError(
|
206
|
+
f"docling-parse could not load document {document_hash}."
|
207
|
+
)
|
205
208
|
|
206
209
|
def page_count(self) -> int:
|
207
210
|
return len(self._pdoc) # To be replaced with docling-parse API
|
@@ -1,3 +1,4 @@
|
|
1
|
+
import logging
|
1
2
|
import random
|
2
3
|
from io import BytesIO
|
3
4
|
from pathlib import Path
|
@@ -7,17 +8,32 @@ import pypdfium2 as pdfium
|
|
7
8
|
import pypdfium2.raw as pdfium_c
|
8
9
|
from PIL import Image, ImageDraw
|
9
10
|
from pypdfium2 import PdfPage
|
11
|
+
from pypdfium2._helpers.misc import PdfiumError
|
10
12
|
|
11
13
|
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
12
14
|
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
13
15
|
|
16
|
+
_log = logging.getLogger(__name__)
|
17
|
+
|
14
18
|
|
15
19
|
class PyPdfiumPageBackend(PdfPageBackend):
|
16
|
-
def __init__(
|
17
|
-
|
18
|
-
|
20
|
+
def __init__(
|
21
|
+
self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
|
22
|
+
):
|
23
|
+
self.valid = True # No better way to tell from pypdfium.
|
24
|
+
try:
|
25
|
+
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
26
|
+
except PdfiumError as e:
|
27
|
+
_log.info(
|
28
|
+
f"An exception occured when loading page {page_no} of document {document_hash}.",
|
29
|
+
exc_info=True,
|
30
|
+
)
|
31
|
+
self.valid = False
|
19
32
|
self.text_page = None
|
20
33
|
|
34
|
+
def is_valid(self) -> bool:
|
35
|
+
return self.valid
|
36
|
+
|
21
37
|
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
|
22
38
|
AREA_THRESHOLD = 32 * 32
|
23
39
|
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
@@ -217,13 +233,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
217
233
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
218
234
|
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
219
235
|
super().__init__(path_or_stream, document_hash)
|
220
|
-
|
236
|
+
try:
|
237
|
+
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
238
|
+
except PdfiumError as e:
|
239
|
+
raise RuntimeError(
|
240
|
+
f"pypdfium could not load document {document_hash}"
|
241
|
+
) from e
|
221
242
|
|
222
243
|
def page_count(self) -> int:
|
223
244
|
return len(self._pdoc)
|
224
245
|
|
225
246
|
def load_page(self, page_no: int) -> PyPdfiumPageBackend:
|
226
|
-
return PyPdfiumPageBackend(self._pdoc
|
247
|
+
return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
|
227
248
|
|
228
249
|
def is_valid(self) -> bool:
|
229
250
|
return self.page_count() > 0
|
docling/datamodel/base_models.py
CHANGED
@@ -16,7 +16,7 @@ class ConversionStatus(str, Enum):
|
|
16
16
|
STARTED = auto()
|
17
17
|
FAILURE = auto()
|
18
18
|
SUCCESS = auto()
|
19
|
-
|
19
|
+
PARTIAL_SUCCESS = auto()
|
20
20
|
|
21
21
|
|
22
22
|
class DocInputType(str, Enum):
|
@@ -29,6 +29,18 @@ class CoordOrigin(str, Enum):
|
|
29
29
|
BOTTOMLEFT = auto()
|
30
30
|
|
31
31
|
|
32
|
+
class DoclingComponentType(str, Enum):
|
33
|
+
PDF_BACKEND = auto()
|
34
|
+
MODEL = auto()
|
35
|
+
DOC_ASSEMBLER = auto()
|
36
|
+
|
37
|
+
|
38
|
+
class ErrorItem(BaseModel):
|
39
|
+
component_type: DoclingComponentType
|
40
|
+
module_name: str
|
41
|
+
error_message: str
|
42
|
+
|
43
|
+
|
32
44
|
class PageSize(BaseModel):
|
33
45
|
width: float = 0.0
|
34
46
|
height: float = 0.0
|
docling/datamodel/document.py
CHANGED
@@ -19,6 +19,7 @@ from docling.datamodel.base_models import (
|
|
19
19
|
AssembledUnit,
|
20
20
|
ConversionStatus,
|
21
21
|
DocumentStream,
|
22
|
+
ErrorItem,
|
22
23
|
FigureElement,
|
23
24
|
Page,
|
24
25
|
PageElement,
|
@@ -118,7 +119,7 @@ class ConvertedDocument(BaseModel):
|
|
118
119
|
input: InputDocument
|
119
120
|
|
120
121
|
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
121
|
-
errors: List[
|
122
|
+
errors: List[ErrorItem] = [] # structure to keep errors
|
122
123
|
|
123
124
|
pages: List[Page] = []
|
124
125
|
assembled: Optional[AssembledUnit] = None
|
docling/document_converter.py
CHANGED
@@ -16,6 +16,8 @@ from docling.datamodel.base_models import (
|
|
16
16
|
AssembledUnit,
|
17
17
|
AssembleOptions,
|
18
18
|
ConversionStatus,
|
19
|
+
DoclingComponentType,
|
20
|
+
ErrorItem,
|
19
21
|
Page,
|
20
22
|
PipelineOptions,
|
21
23
|
)
|
@@ -157,7 +159,6 @@ class DocumentConverter:
|
|
157
159
|
for page_batch in chunkify(
|
158
160
|
converted_doc.pages, settings.perf.page_batch_size
|
159
161
|
):
|
160
|
-
|
161
162
|
start_pb_time = time.time()
|
162
163
|
# Pipeline
|
163
164
|
|
@@ -205,12 +206,27 @@ class DocumentConverter:
|
|
205
206
|
converted_doc.pages = all_assembled_pages
|
206
207
|
self.assemble_doc(converted_doc)
|
207
208
|
|
208
|
-
|
209
|
+
status = ConversionStatus.SUCCESS
|
210
|
+
for page in converted_doc.pages:
|
211
|
+
if not page._backend.is_valid():
|
212
|
+
converted_doc.errors.append(
|
213
|
+
ErrorItem(
|
214
|
+
component_type=DoclingComponentType.PDF_BACKEND,
|
215
|
+
module_name=type(page._backend).__name__,
|
216
|
+
error_message=f"Page {page.page_no} failed to parse.",
|
217
|
+
)
|
218
|
+
)
|
219
|
+
status = ConversionStatus.PARTIAL_SUCCESS
|
220
|
+
|
221
|
+
converted_doc.status = status
|
209
222
|
|
210
223
|
except Exception as e:
|
211
224
|
converted_doc.status = ConversionStatus.FAILURE
|
212
225
|
trace = "\n".join(traceback.format_exception(e))
|
213
|
-
_log.info(
|
226
|
+
_log.info(
|
227
|
+
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
|
228
|
+
f"{trace}"
|
229
|
+
)
|
214
230
|
|
215
231
|
end_doc_time = time.time() - start_doc_time
|
216
232
|
_log.info(
|
@@ -230,7 +246,9 @@ class DocumentConverter:
|
|
230
246
|
# Generate the page image and store it in the page object
|
231
247
|
def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
|
232
248
|
# default scale
|
233
|
-
page.get_image(
|
249
|
+
page.get_image(
|
250
|
+
scale=1.0
|
251
|
+
) # puts the page image on the image cache at default scale
|
234
252
|
|
235
253
|
# user requested scales
|
236
254
|
if self.assemble_options.images_scale is not None:
|
@@ -1,13 +1,13 @@
|
|
1
1
|
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
docling/backend/abstract_backend.py,sha256=
|
4
|
-
docling/backend/docling_parse_backend.py,sha256=
|
5
|
-
docling/backend/pypdfium2_backend.py,sha256=
|
3
|
+
docling/backend/abstract_backend.py,sha256=xfNNiZKksPPa9KAiA-fHD86flg0It4n_29ccpm8fFiY,1436
|
4
|
+
docling/backend/docling_parse_backend.py,sha256=r3aJwsWR7qG47ElhOa9iQJJQauHMt950FfCsf6fhlP4,7480
|
5
|
+
docling/backend/pypdfium2_backend.py,sha256=FggVFitmyMMmLar6vk6XQsavGOPQx95TD14opWYRMAY,8837
|
6
6
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
docling/datamodel/base_models.py,sha256=
|
8
|
-
docling/datamodel/document.py,sha256=
|
7
|
+
docling/datamodel/base_models.py,sha256=DHpdLvEPpBaQsZ9gFmAKRiXdC2HwoApE9ufCw7GEhx4,8827
|
8
|
+
docling/datamodel/document.py,sha256=0KT_G6VGbbLoT8BNiMjzNOWF-c_USQEOP0Bb_rgGMUo,13353
|
9
9
|
docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
|
10
|
-
docling/document_converter.py,sha256=
|
10
|
+
docling/document_converter.py,sha256=pfHrJfvUKAAFNY8MeECwjvschrRQbV6qbDM72x7aw60,11187
|
11
11
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
|
13
13
|
docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
|
@@ -21,7 +21,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjg
|
|
21
21
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
22
|
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
23
23
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
24
|
-
docling-1.
|
25
|
-
docling-1.
|
26
|
-
docling-1.
|
27
|
-
docling-1.
|
24
|
+
docling-1.8.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
|
25
|
+
docling-1.8.0.dist-info/METADATA,sha256=Z9S-zY0YuOT_xPHbbjWzdOCNQcnEiEpaZ7jSApZpUF0,7229
|
26
|
+
docling-1.8.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
27
|
+
docling-1.8.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|