docling 1.7.1__tar.gz → 1.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-1.7.1 → docling-1.8.1}/PKG-INFO +1 -1
- {docling-1.7.1 → docling-1.8.1}/docling/backend/abstract_backend.py +4 -2
- {docling-1.7.1 → docling-1.8.1}/docling/backend/docling_parse_backend.py +12 -9
- {docling-1.7.1 → docling-1.8.1}/docling/backend/pypdfium2_backend.py +26 -5
- {docling-1.7.1 → docling-1.8.1}/docling/datamodel/base_models.py +13 -1
- {docling-1.7.1 → docling-1.8.1}/docling/datamodel/document.py +2 -1
- {docling-1.7.1 → docling-1.8.1}/docling/document_converter.py +25 -8
- {docling-1.7.1 → docling-1.8.1}/pyproject.toml +1 -1
- {docling-1.7.1 → docling-1.8.1}/LICENSE +0 -0
- {docling-1.7.1 → docling-1.8.1}/README.md +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/__init__.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/backend/__init__.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/datamodel/__init__.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/datamodel/settings.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/models/__init__.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/models/base_ocr_model.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/models/ds_glm_model.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/models/easyocr_model.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/models/layout_model.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/models/page_assemble_model.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/models/table_structure_model.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/pipeline/__init__.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/pipeline/base_model_pipeline.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/pipeline/standard_model_pipeline.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/utils/__init__.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/utils/layout_utils.py +0 -0
- {docling-1.7.1 → docling-1.8.1}/docling/utils/utils.py +0 -0
@@ -7,8 +7,6 @@ from PIL import Image
|
|
7
7
|
|
8
8
|
|
9
9
|
class PdfPageBackend(ABC):
|
10
|
-
def __init__(self, page_obj: Any) -> object:
|
11
|
-
pass
|
12
10
|
|
13
11
|
@abstractmethod
|
14
12
|
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
|
@@ -32,6 +30,10 @@ class PdfPageBackend(ABC):
|
|
32
30
|
def get_size(self) -> "PageSize":
|
33
31
|
pass
|
34
32
|
|
33
|
+
@abstractmethod
|
34
|
+
def is_valid(self) -> bool:
|
35
|
+
pass
|
36
|
+
|
35
37
|
@abstractmethod
|
36
38
|
def unload(self):
|
37
39
|
pass
|
@@ -19,22 +19,23 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
19
19
|
def __init__(
|
20
20
|
self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
|
21
21
|
):
|
22
|
-
super().__init__(page_obj)
|
23
22
|
self._ppage = page_obj
|
24
|
-
|
25
23
|
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
|
26
24
|
|
27
25
|
self._dpage = None
|
28
|
-
self.
|
29
|
-
if
|
26
|
+
self.valid = "pages" in parsed_page
|
27
|
+
if self.valid:
|
30
28
|
self._dpage = parsed_page["pages"][0]
|
31
29
|
else:
|
32
|
-
|
33
|
-
f"
|
30
|
+
_log.info(
|
31
|
+
f"An error occured when loading page {page_no} of document {document_hash}."
|
34
32
|
)
|
35
33
|
|
34
|
+
def is_valid(self) -> bool:
|
35
|
+
return self.valid
|
36
|
+
|
36
37
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
37
|
-
if self.
|
38
|
+
if not self.valid:
|
38
39
|
return ""
|
39
40
|
# Find intersecting cells on the page
|
40
41
|
text_piece = ""
|
@@ -70,7 +71,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
70
71
|
cells = []
|
71
72
|
cell_counter = 0
|
72
73
|
|
73
|
-
if self.
|
74
|
+
if not self.valid:
|
74
75
|
return cells
|
75
76
|
|
76
77
|
page_size = self.get_size()
|
@@ -201,7 +202,9 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
|
|
201
202
|
success = self.parser.load_document(document_hash, str(path_or_stream))
|
202
203
|
|
203
204
|
if not success:
|
204
|
-
raise RuntimeError(
|
205
|
+
raise RuntimeError(
|
206
|
+
f"docling-parse could not load document {document_hash}."
|
207
|
+
)
|
205
208
|
|
206
209
|
def page_count(self) -> int:
|
207
210
|
return len(self._pdoc) # To be replaced with docling-parse API
|
@@ -1,3 +1,4 @@
|
|
1
|
+
import logging
|
1
2
|
import random
|
2
3
|
from io import BytesIO
|
3
4
|
from pathlib import Path
|
@@ -7,17 +8,32 @@ import pypdfium2 as pdfium
|
|
7
8
|
import pypdfium2.raw as pdfium_c
|
8
9
|
from PIL import Image, ImageDraw
|
9
10
|
from pypdfium2 import PdfPage
|
11
|
+
from pypdfium2._helpers.misc import PdfiumError
|
10
12
|
|
11
13
|
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
12
14
|
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
13
15
|
|
16
|
+
_log = logging.getLogger(__name__)
|
17
|
+
|
14
18
|
|
15
19
|
class PyPdfiumPageBackend(PdfPageBackend):
|
16
|
-
def __init__(
|
17
|
-
|
18
|
-
|
20
|
+
def __init__(
|
21
|
+
self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
|
22
|
+
):
|
23
|
+
self.valid = True # No better way to tell from pypdfium.
|
24
|
+
try:
|
25
|
+
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
26
|
+
except PdfiumError as e:
|
27
|
+
_log.info(
|
28
|
+
f"An exception occured when loading page {page_no} of document {document_hash}.",
|
29
|
+
exc_info=True,
|
30
|
+
)
|
31
|
+
self.valid = False
|
19
32
|
self.text_page = None
|
20
33
|
|
34
|
+
def is_valid(self) -> bool:
|
35
|
+
return self.valid
|
36
|
+
|
21
37
|
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
|
22
38
|
AREA_THRESHOLD = 32 * 32
|
23
39
|
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
@@ -217,13 +233,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
217
233
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
218
234
|
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
219
235
|
super().__init__(path_or_stream, document_hash)
|
220
|
-
|
236
|
+
try:
|
237
|
+
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
238
|
+
except PdfiumError as e:
|
239
|
+
raise RuntimeError(
|
240
|
+
f"pypdfium could not load document {document_hash}"
|
241
|
+
) from e
|
221
242
|
|
222
243
|
def page_count(self) -> int:
|
223
244
|
return len(self._pdoc)
|
224
245
|
|
225
246
|
def load_page(self, page_no: int) -> PyPdfiumPageBackend:
|
226
|
-
return PyPdfiumPageBackend(self._pdoc
|
247
|
+
return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
|
227
248
|
|
228
249
|
def is_valid(self) -> bool:
|
229
250
|
return self.page_count() > 0
|
@@ -16,7 +16,7 @@ class ConversionStatus(str, Enum):
|
|
16
16
|
STARTED = auto()
|
17
17
|
FAILURE = auto()
|
18
18
|
SUCCESS = auto()
|
19
|
-
|
19
|
+
PARTIAL_SUCCESS = auto()
|
20
20
|
|
21
21
|
|
22
22
|
class DocInputType(str, Enum):
|
@@ -29,6 +29,18 @@ class CoordOrigin(str, Enum):
|
|
29
29
|
BOTTOMLEFT = auto()
|
30
30
|
|
31
31
|
|
32
|
+
class DoclingComponentType(str, Enum):
|
33
|
+
PDF_BACKEND = auto()
|
34
|
+
MODEL = auto()
|
35
|
+
DOC_ASSEMBLER = auto()
|
36
|
+
|
37
|
+
|
38
|
+
class ErrorItem(BaseModel):
|
39
|
+
component_type: DoclingComponentType
|
40
|
+
module_name: str
|
41
|
+
error_message: str
|
42
|
+
|
43
|
+
|
32
44
|
class PageSize(BaseModel):
|
33
45
|
width: float = 0.0
|
34
46
|
height: float = 0.0
|
@@ -19,6 +19,7 @@ from docling.datamodel.base_models import (
|
|
19
19
|
AssembledUnit,
|
20
20
|
ConversionStatus,
|
21
21
|
DocumentStream,
|
22
|
+
ErrorItem,
|
22
23
|
FigureElement,
|
23
24
|
Page,
|
24
25
|
PageElement,
|
@@ -118,7 +119,7 @@ class ConvertedDocument(BaseModel):
|
|
118
119
|
input: InputDocument
|
119
120
|
|
120
121
|
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
121
|
-
errors: List[
|
122
|
+
errors: List[ErrorItem] = [] # structure to keep errors
|
122
123
|
|
123
124
|
pages: List[Page] = []
|
124
125
|
assembled: Optional[AssembledUnit] = None
|
@@ -16,6 +16,8 @@ from docling.datamodel.base_models import (
|
|
16
16
|
AssembledUnit,
|
17
17
|
AssembleOptions,
|
18
18
|
ConversionStatus,
|
19
|
+
DoclingComponentType,
|
20
|
+
ErrorItem,
|
19
21
|
Page,
|
20
22
|
PipelineOptions,
|
21
23
|
)
|
@@ -86,7 +88,7 @@ class DocumentConverter:
|
|
86
88
|
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
|
87
89
|
yield from map(self.process_document, input_batch)
|
88
90
|
|
89
|
-
def convert_single(self, source: Path | AnyHttpUrl | str) ->
|
91
|
+
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConvertedDocument:
|
90
92
|
"""Convert a single document.
|
91
93
|
|
92
94
|
Args:
|
@@ -131,11 +133,10 @@ class DocumentConverter:
|
|
131
133
|
converted_doc: ConvertedDocument = next(converted_docs_iter)
|
132
134
|
if converted_doc.status not in {
|
133
135
|
ConversionStatus.SUCCESS,
|
134
|
-
ConversionStatus.
|
136
|
+
ConversionStatus.PARTIAL_SUCCESS,
|
135
137
|
}:
|
136
138
|
raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
|
137
|
-
|
138
|
-
return doc
|
139
|
+
return converted_doc
|
139
140
|
|
140
141
|
def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
|
141
142
|
start_doc_time = time.time()
|
@@ -157,7 +158,6 @@ class DocumentConverter:
|
|
157
158
|
for page_batch in chunkify(
|
158
159
|
converted_doc.pages, settings.perf.page_batch_size
|
159
160
|
):
|
160
|
-
|
161
161
|
start_pb_time = time.time()
|
162
162
|
# Pipeline
|
163
163
|
|
@@ -205,12 +205,27 @@ class DocumentConverter:
|
|
205
205
|
converted_doc.pages = all_assembled_pages
|
206
206
|
self.assemble_doc(converted_doc)
|
207
207
|
|
208
|
-
|
208
|
+
status = ConversionStatus.SUCCESS
|
209
|
+
for page in converted_doc.pages:
|
210
|
+
if not page._backend.is_valid():
|
211
|
+
converted_doc.errors.append(
|
212
|
+
ErrorItem(
|
213
|
+
component_type=DoclingComponentType.PDF_BACKEND,
|
214
|
+
module_name=type(page._backend).__name__,
|
215
|
+
error_message=f"Page {page.page_no} failed to parse.",
|
216
|
+
)
|
217
|
+
)
|
218
|
+
status = ConversionStatus.PARTIAL_SUCCESS
|
219
|
+
|
220
|
+
converted_doc.status = status
|
209
221
|
|
210
222
|
except Exception as e:
|
211
223
|
converted_doc.status = ConversionStatus.FAILURE
|
212
224
|
trace = "\n".join(traceback.format_exception(e))
|
213
|
-
_log.info(
|
225
|
+
_log.info(
|
226
|
+
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
|
227
|
+
f"{trace}"
|
228
|
+
)
|
214
229
|
|
215
230
|
end_doc_time = time.time() - start_doc_time
|
216
231
|
_log.info(
|
@@ -230,7 +245,9 @@ class DocumentConverter:
|
|
230
245
|
# Generate the page image and store it in the page object
|
231
246
|
def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
|
232
247
|
# default scale
|
233
|
-
page.get_image(
|
248
|
+
page.get_image(
|
249
|
+
scale=1.0
|
250
|
+
) # puts the page image on the image cache at default scale
|
234
251
|
|
235
252
|
# user requested scales
|
236
253
|
if self.assemble_options.images_scale is not None:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "1.
|
3
|
+
version = "1.8.1" # DO NOT EDIT, updated automatically
|
4
4
|
description = "Docling PDF conversion package"
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|