docling 1.7.0__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -7,8 +7,6 @@ from PIL import Image
7
7
 
8
8
 
9
9
  class PdfPageBackend(ABC):
10
- def __init__(self, page_obj: Any) -> object:
11
- pass
12
10
 
13
11
  @abstractmethod
14
12
  def get_text_in_rect(self, bbox: "BoundingBox") -> str:
@@ -32,6 +30,10 @@ class PdfPageBackend(ABC):
32
30
  def get_size(self) -> "PageSize":
33
31
  pass
34
32
 
33
+ @abstractmethod
34
+ def is_valid(self) -> bool:
35
+ pass
36
+
35
37
  @abstractmethod
36
38
  def unload(self):
37
39
  pass
@@ -19,13 +19,24 @@ class DoclingParsePageBackend(PdfPageBackend):
19
19
  def __init__(
20
20
  self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
21
21
  ):
22
- super().__init__(page_obj)
23
22
  self._ppage = page_obj
24
-
25
23
  parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
26
- self._dpage = parsed_page["pages"][0]
24
+
25
+ self._dpage = None
26
+ self.valid = "pages" in parsed_page
27
+ if self.valid:
28
+ self._dpage = parsed_page["pages"][0]
29
+ else:
30
+ _log.info(
31
+ f"An error occured when loading page {page_no} of document {document_hash}."
32
+ )
33
+
34
+ def is_valid(self) -> bool:
35
+ return self.valid
27
36
 
28
37
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
38
+ if not self.valid:
39
+ return ""
29
40
  # Find intersecting cells on the page
30
41
  text_piece = ""
31
42
  page_size = self.get_size()
@@ -60,6 +71,9 @@ class DoclingParsePageBackend(PdfPageBackend):
60
71
  cells = []
61
72
  cell_counter = 0
62
73
 
74
+ if not self.valid:
75
+ return cells
76
+
63
77
  page_size = self.get_size()
64
78
 
65
79
  parser_width = self._dpage["width"]
@@ -188,7 +202,9 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
188
202
  success = self.parser.load_document(document_hash, str(path_or_stream))
189
203
 
190
204
  if not success:
191
- raise RuntimeError("docling-parse could not load this document.")
205
+ raise RuntimeError(
206
+ f"docling-parse could not load document {document_hash}."
207
+ )
192
208
 
193
209
  def page_count(self) -> int:
194
210
  return len(self._pdoc) # To be replaced with docling-parse API
@@ -1,3 +1,4 @@
1
+ import logging
1
2
  import random
2
3
  from io import BytesIO
3
4
  from pathlib import Path
@@ -7,17 +8,32 @@ import pypdfium2 as pdfium
7
8
  import pypdfium2.raw as pdfium_c
8
9
  from PIL import Image, ImageDraw
9
10
  from pypdfium2 import PdfPage
11
+ from pypdfium2._helpers.misc import PdfiumError
10
12
 
11
13
  from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
12
14
  from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
15
 
16
+ _log = logging.getLogger(__name__)
17
+
14
18
 
15
19
  class PyPdfiumPageBackend(PdfPageBackend):
16
- def __init__(self, page_obj: PdfPage):
17
- super().__init__(page_obj)
18
- self._ppage = page_obj
20
+ def __init__(
21
+ self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
22
+ ):
23
+ self.valid = True # No better way to tell from pypdfium.
24
+ try:
25
+ self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
26
+ except PdfiumError as e:
27
+ _log.info(
28
+ f"An exception occured when loading page {page_no} of document {document_hash}.",
29
+ exc_info=True,
30
+ )
31
+ self.valid = False
19
32
  self.text_page = None
20
33
 
34
+ def is_valid(self) -> bool:
35
+ return self.valid
36
+
21
37
  def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
22
38
  AREA_THRESHOLD = 32 * 32
23
39
  for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
@@ -217,13 +233,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
217
233
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
218
234
  def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
219
235
  super().__init__(path_or_stream, document_hash)
220
- self._pdoc = pdfium.PdfDocument(path_or_stream)
236
+ try:
237
+ self._pdoc = pdfium.PdfDocument(path_or_stream)
238
+ except PdfiumError as e:
239
+ raise RuntimeError(
240
+ f"pypdfium could not load document {document_hash}"
241
+ ) from e
221
242
 
222
243
  def page_count(self) -> int:
223
244
  return len(self._pdoc)
224
245
 
225
246
  def load_page(self, page_no: int) -> PyPdfiumPageBackend:
226
- return PyPdfiumPageBackend(self._pdoc[page_no])
247
+ return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
227
248
 
228
249
  def is_valid(self) -> bool:
229
250
  return self.page_count() > 0
@@ -16,7 +16,7 @@ class ConversionStatus(str, Enum):
16
16
  STARTED = auto()
17
17
  FAILURE = auto()
18
18
  SUCCESS = auto()
19
- SUCCESS_WITH_ERRORS = auto()
19
+ PARTIAL_SUCCESS = auto()
20
20
 
21
21
 
22
22
  class DocInputType(str, Enum):
@@ -29,6 +29,18 @@ class CoordOrigin(str, Enum):
29
29
  BOTTOMLEFT = auto()
30
30
 
31
31
 
32
+ class DoclingComponentType(str, Enum):
33
+ PDF_BACKEND = auto()
34
+ MODEL = auto()
35
+ DOC_ASSEMBLER = auto()
36
+
37
+
38
+ class ErrorItem(BaseModel):
39
+ component_type: DoclingComponentType
40
+ module_name: str
41
+ error_message: str
42
+
43
+
32
44
  class PageSize(BaseModel):
33
45
  width: float = 0.0
34
46
  height: float = 0.0
@@ -19,6 +19,7 @@ from docling.datamodel.base_models import (
19
19
  AssembledUnit,
20
20
  ConversionStatus,
21
21
  DocumentStream,
22
+ ErrorItem,
22
23
  FigureElement,
23
24
  Page,
24
25
  PageElement,
@@ -118,7 +119,7 @@ class ConvertedDocument(BaseModel):
118
119
  input: InputDocument
119
120
 
120
121
  status: ConversionStatus = ConversionStatus.PENDING # failure, success
121
- errors: List[Dict] = [] # structure to keep errors
122
+ errors: List[ErrorItem] = [] # structure to keep errors
122
123
 
123
124
  pages: List[Page] = []
124
125
  assembled: Optional[AssembledUnit] = None
@@ -16,6 +16,8 @@ from docling.datamodel.base_models import (
16
16
  AssembledUnit,
17
17
  AssembleOptions,
18
18
  ConversionStatus,
19
+ DoclingComponentType,
20
+ ErrorItem,
19
21
  Page,
20
22
  PipelineOptions,
21
23
  )
@@ -157,7 +159,6 @@ class DocumentConverter:
157
159
  for page_batch in chunkify(
158
160
  converted_doc.pages, settings.perf.page_batch_size
159
161
  ):
160
-
161
162
  start_pb_time = time.time()
162
163
  # Pipeline
163
164
 
@@ -205,12 +206,27 @@ class DocumentConverter:
205
206
  converted_doc.pages = all_assembled_pages
206
207
  self.assemble_doc(converted_doc)
207
208
 
208
- converted_doc.status = ConversionStatus.SUCCESS
209
+ status = ConversionStatus.SUCCESS
210
+ for page in converted_doc.pages:
211
+ if not page._backend.is_valid():
212
+ converted_doc.errors.append(
213
+ ErrorItem(
214
+ component_type=DoclingComponentType.PDF_BACKEND,
215
+ module_name=type(page._backend).__name__,
216
+ error_message=f"Page {page.page_no} failed to parse.",
217
+ )
218
+ )
219
+ status = ConversionStatus.PARTIAL_SUCCESS
220
+
221
+ converted_doc.status = status
209
222
 
210
223
  except Exception as e:
211
224
  converted_doc.status = ConversionStatus.FAILURE
212
225
  trace = "\n".join(traceback.format_exception(e))
213
- _log.info(f"Encountered an error during conversion: {trace}")
226
+ _log.info(
227
+ f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
228
+ f"{trace}"
229
+ )
214
230
 
215
231
  end_doc_time = time.time() - start_doc_time
216
232
  _log.info(
@@ -230,7 +246,9 @@ class DocumentConverter:
230
246
  # Generate the page image and store it in the page object
231
247
  def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
232
248
  # default scale
233
- page.get_image(scale=1.0)
249
+ page.get_image(
250
+ scale=1.0
251
+ ) # puts the page image on the image cache at default scale
234
252
 
235
253
  # user requested scales
236
254
  if self.assemble_options.images_scale is not None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.7.0
3
+ Version: 1.8.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -23,7 +23,7 @@ Requires-Dist: certifi (>=2024.7.4)
23
23
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
24
24
  Requires-Dist: docling-core (>=1.1.2,<2.0.0)
25
25
  Requires-Dist: docling-ibm-models (>=1.1.2,<2.0.0)
26
- Requires-Dist: docling-parse (>=1.0.0,<2.0.0)
26
+ Requires-Dist: docling-parse (>=1.1.1,<2.0.0)
27
27
  Requires-Dist: easyocr (>=1.7,<2.0)
28
28
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
29
29
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -1,13 +1,13 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- docling/backend/abstract_backend.py,sha256=wvrywm1pPt79L5Dt_da5QGmf9dDzjBGj1rSaUQxqI8s,1432
4
- docling/backend/docling_parse_backend.py,sha256=E4JFpWBamiU8vRmHvgMZLU7lOxtUMq85TV5hMrGvWJI,7070
5
- docling/backend/pypdfium2_backend.py,sha256=84AnFah8Ztk-j8_9MTHalPU3a9fClrEz7A_rfzWDkFc,8122
3
+ docling/backend/abstract_backend.py,sha256=xfNNiZKksPPa9KAiA-fHD86flg0It4n_29ccpm8fFiY,1436
4
+ docling/backend/docling_parse_backend.py,sha256=r3aJwsWR7qG47ElhOa9iQJJQauHMt950FfCsf6fhlP4,7480
5
+ docling/backend/pypdfium2_backend.py,sha256=FggVFitmyMMmLar6vk6XQsavGOPQx95TD14opWYRMAY,8837
6
6
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- docling/datamodel/base_models.py,sha256=5VHit5h7OleKnbhvy-sWDxQLizEdNrGUBrypyzwHyAE,8604
8
- docling/datamodel/document.py,sha256=J97KeT8fJRKijUorDky-xA2FoOGBXOjrReYjdeo8NK4,13333
7
+ docling/datamodel/base_models.py,sha256=DHpdLvEPpBaQsZ9gFmAKRiXdC2HwoApE9ufCw7GEhx4,8827
8
+ docling/datamodel/document.py,sha256=0KT_G6VGbbLoT8BNiMjzNOWF-c_USQEOP0Bb_rgGMUo,13353
9
9
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
10
- docling/document_converter.py,sha256=Tx4BwtOxpwizmXgJl2nK6I-6m0V3fygHwYFomJTH2Ns,10433
10
+ docling/document_converter.py,sha256=pfHrJfvUKAAFNY8MeECwjvschrRQbV6qbDM72x7aw60,11187
11
11
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
13
13
  docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
@@ -21,7 +21,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjg
21
21
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
23
23
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
24
- docling-1.7.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
25
- docling-1.7.0.dist-info/METADATA,sha256=yqKqPH3w2IxOhmdqHy0nJcYKDrJAufx6uF1Ti44pRs4,7229
26
- docling-1.7.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
27
- docling-1.7.0.dist-info/RECORD,,
24
+ docling-1.8.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
25
+ docling-1.8.0.dist-info/METADATA,sha256=Z9S-zY0YuOT_xPHbbjWzdOCNQcnEiEpaZ7jSApZpUF0,7229
26
+ docling-1.8.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
27
+ docling-1.8.0.dist-info/RECORD,,