docling 1.7.1__tar.gz → 1.8.1__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {docling-1.7.1 → docling-1.8.1}/PKG-INFO +1 -1
  2. {docling-1.7.1 → docling-1.8.1}/docling/backend/abstract_backend.py +4 -2
  3. {docling-1.7.1 → docling-1.8.1}/docling/backend/docling_parse_backend.py +12 -9
  4. {docling-1.7.1 → docling-1.8.1}/docling/backend/pypdfium2_backend.py +26 -5
  5. {docling-1.7.1 → docling-1.8.1}/docling/datamodel/base_models.py +13 -1
  6. {docling-1.7.1 → docling-1.8.1}/docling/datamodel/document.py +2 -1
  7. {docling-1.7.1 → docling-1.8.1}/docling/document_converter.py +25 -8
  8. {docling-1.7.1 → docling-1.8.1}/pyproject.toml +1 -1
  9. {docling-1.7.1 → docling-1.8.1}/LICENSE +0 -0
  10. {docling-1.7.1 → docling-1.8.1}/README.md +0 -0
  11. {docling-1.7.1 → docling-1.8.1}/docling/__init__.py +0 -0
  12. {docling-1.7.1 → docling-1.8.1}/docling/backend/__init__.py +0 -0
  13. {docling-1.7.1 → docling-1.8.1}/docling/datamodel/__init__.py +0 -0
  14. {docling-1.7.1 → docling-1.8.1}/docling/datamodel/settings.py +0 -0
  15. {docling-1.7.1 → docling-1.8.1}/docling/models/__init__.py +0 -0
  16. {docling-1.7.1 → docling-1.8.1}/docling/models/base_ocr_model.py +0 -0
  17. {docling-1.7.1 → docling-1.8.1}/docling/models/ds_glm_model.py +0 -0
  18. {docling-1.7.1 → docling-1.8.1}/docling/models/easyocr_model.py +0 -0
  19. {docling-1.7.1 → docling-1.8.1}/docling/models/layout_model.py +0 -0
  20. {docling-1.7.1 → docling-1.8.1}/docling/models/page_assemble_model.py +0 -0
  21. {docling-1.7.1 → docling-1.8.1}/docling/models/table_structure_model.py +0 -0
  22. {docling-1.7.1 → docling-1.8.1}/docling/pipeline/__init__.py +0 -0
  23. {docling-1.7.1 → docling-1.8.1}/docling/pipeline/base_model_pipeline.py +0 -0
  24. {docling-1.7.1 → docling-1.8.1}/docling/pipeline/standard_model_pipeline.py +0 -0
  25. {docling-1.7.1 → docling-1.8.1}/docling/utils/__init__.py +0 -0
  26. {docling-1.7.1 → docling-1.8.1}/docling/utils/layout_utils.py +0 -0
  27. {docling-1.7.1 → docling-1.8.1}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.7.1
3
+ Version: 1.8.1
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -7,8 +7,6 @@ from PIL import Image
7
7
 
8
8
 
9
9
  class PdfPageBackend(ABC):
10
- def __init__(self, page_obj: Any) -> object:
11
- pass
12
10
 
13
11
  @abstractmethod
14
12
  def get_text_in_rect(self, bbox: "BoundingBox") -> str:
@@ -32,6 +30,10 @@ class PdfPageBackend(ABC):
32
30
  def get_size(self) -> "PageSize":
33
31
  pass
34
32
 
33
+ @abstractmethod
34
+ def is_valid(self) -> bool:
35
+ pass
36
+
35
37
  @abstractmethod
36
38
  def unload(self):
37
39
  pass
@@ -19,22 +19,23 @@ class DoclingParsePageBackend(PdfPageBackend):
19
19
  def __init__(
20
20
  self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
21
21
  ):
22
- super().__init__(page_obj)
23
22
  self._ppage = page_obj
24
-
25
23
  parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
26
24
 
27
25
  self._dpage = None
28
- self.broken_page = "pages" not in parsed_page
29
- if not self.broken_page:
26
+ self.valid = "pages" in parsed_page
27
+ if self.valid:
30
28
  self._dpage = parsed_page["pages"][0]
31
29
  else:
32
- raise RuntimeError(
33
- f"Page {page_no} of document {document_hash} could not be parsed."
30
+ _log.info(
31
+ f"An error occured when loading page {page_no} of document {document_hash}."
34
32
  )
35
33
 
34
+ def is_valid(self) -> bool:
35
+ return self.valid
36
+
36
37
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
37
- if self.broken_page:
38
+ if not self.valid:
38
39
  return ""
39
40
  # Find intersecting cells on the page
40
41
  text_piece = ""
@@ -70,7 +71,7 @@ class DoclingParsePageBackend(PdfPageBackend):
70
71
  cells = []
71
72
  cell_counter = 0
72
73
 
73
- if self.broken_page:
74
+ if not self.valid:
74
75
  return cells
75
76
 
76
77
  page_size = self.get_size()
@@ -201,7 +202,9 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
201
202
  success = self.parser.load_document(document_hash, str(path_or_stream))
202
203
 
203
204
  if not success:
204
- raise RuntimeError("docling-parse could not load this document.")
205
+ raise RuntimeError(
206
+ f"docling-parse could not load document {document_hash}."
207
+ )
205
208
 
206
209
  def page_count(self) -> int:
207
210
  return len(self._pdoc) # To be replaced with docling-parse API
@@ -1,3 +1,4 @@
1
+ import logging
1
2
  import random
2
3
  from io import BytesIO
3
4
  from pathlib import Path
@@ -7,17 +8,32 @@ import pypdfium2 as pdfium
7
8
  import pypdfium2.raw as pdfium_c
8
9
  from PIL import Image, ImageDraw
9
10
  from pypdfium2 import PdfPage
11
+ from pypdfium2._helpers.misc import PdfiumError
10
12
 
11
13
  from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
12
14
  from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
15
 
16
+ _log = logging.getLogger(__name__)
17
+
14
18
 
15
19
  class PyPdfiumPageBackend(PdfPageBackend):
16
- def __init__(self, page_obj: PdfPage):
17
- super().__init__(page_obj)
18
- self._ppage = page_obj
20
+ def __init__(
21
+ self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
22
+ ):
23
+ self.valid = True # No better way to tell from pypdfium.
24
+ try:
25
+ self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
26
+ except PdfiumError as e:
27
+ _log.info(
28
+ f"An exception occured when loading page {page_no} of document {document_hash}.",
29
+ exc_info=True,
30
+ )
31
+ self.valid = False
19
32
  self.text_page = None
20
33
 
34
+ def is_valid(self) -> bool:
35
+ return self.valid
36
+
21
37
  def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
22
38
  AREA_THRESHOLD = 32 * 32
23
39
  for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
@@ -217,13 +233,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
217
233
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
218
234
  def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
219
235
  super().__init__(path_or_stream, document_hash)
220
- self._pdoc = pdfium.PdfDocument(path_or_stream)
236
+ try:
237
+ self._pdoc = pdfium.PdfDocument(path_or_stream)
238
+ except PdfiumError as e:
239
+ raise RuntimeError(
240
+ f"pypdfium could not load document {document_hash}"
241
+ ) from e
221
242
 
222
243
  def page_count(self) -> int:
223
244
  return len(self._pdoc)
224
245
 
225
246
  def load_page(self, page_no: int) -> PyPdfiumPageBackend:
226
- return PyPdfiumPageBackend(self._pdoc[page_no])
247
+ return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
227
248
 
228
249
  def is_valid(self) -> bool:
229
250
  return self.page_count() > 0
@@ -16,7 +16,7 @@ class ConversionStatus(str, Enum):
16
16
  STARTED = auto()
17
17
  FAILURE = auto()
18
18
  SUCCESS = auto()
19
- SUCCESS_WITH_ERRORS = auto()
19
+ PARTIAL_SUCCESS = auto()
20
20
 
21
21
 
22
22
  class DocInputType(str, Enum):
@@ -29,6 +29,18 @@ class CoordOrigin(str, Enum):
29
29
  BOTTOMLEFT = auto()
30
30
 
31
31
 
32
+ class DoclingComponentType(str, Enum):
33
+ PDF_BACKEND = auto()
34
+ MODEL = auto()
35
+ DOC_ASSEMBLER = auto()
36
+
37
+
38
+ class ErrorItem(BaseModel):
39
+ component_type: DoclingComponentType
40
+ module_name: str
41
+ error_message: str
42
+
43
+
32
44
  class PageSize(BaseModel):
33
45
  width: float = 0.0
34
46
  height: float = 0.0
@@ -19,6 +19,7 @@ from docling.datamodel.base_models import (
19
19
  AssembledUnit,
20
20
  ConversionStatus,
21
21
  DocumentStream,
22
+ ErrorItem,
22
23
  FigureElement,
23
24
  Page,
24
25
  PageElement,
@@ -118,7 +119,7 @@ class ConvertedDocument(BaseModel):
118
119
  input: InputDocument
119
120
 
120
121
  status: ConversionStatus = ConversionStatus.PENDING # failure, success
121
- errors: List[Dict] = [] # structure to keep errors
122
+ errors: List[ErrorItem] = [] # structure to keep errors
122
123
 
123
124
  pages: List[Page] = []
124
125
  assembled: Optional[AssembledUnit] = None
@@ -16,6 +16,8 @@ from docling.datamodel.base_models import (
16
16
  AssembledUnit,
17
17
  AssembleOptions,
18
18
  ConversionStatus,
19
+ DoclingComponentType,
20
+ ErrorItem,
19
21
  Page,
20
22
  PipelineOptions,
21
23
  )
@@ -86,7 +88,7 @@ class DocumentConverter:
86
88
  # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
87
89
  yield from map(self.process_document, input_batch)
88
90
 
89
- def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
91
+ def convert_single(self, source: Path | AnyHttpUrl | str) -> ConvertedDocument:
90
92
  """Convert a single document.
91
93
 
92
94
  Args:
@@ -131,11 +133,10 @@ class DocumentConverter:
131
133
  converted_doc: ConvertedDocument = next(converted_docs_iter)
132
134
  if converted_doc.status not in {
133
135
  ConversionStatus.SUCCESS,
134
- ConversionStatus.SUCCESS_WITH_ERRORS,
136
+ ConversionStatus.PARTIAL_SUCCESS,
135
137
  }:
136
138
  raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
137
- doc = converted_doc.to_ds_document()
138
- return doc
139
+ return converted_doc
139
140
 
140
141
  def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
141
142
  start_doc_time = time.time()
@@ -157,7 +158,6 @@ class DocumentConverter:
157
158
  for page_batch in chunkify(
158
159
  converted_doc.pages, settings.perf.page_batch_size
159
160
  ):
160
-
161
161
  start_pb_time = time.time()
162
162
  # Pipeline
163
163
 
@@ -205,12 +205,27 @@ class DocumentConverter:
205
205
  converted_doc.pages = all_assembled_pages
206
206
  self.assemble_doc(converted_doc)
207
207
 
208
- converted_doc.status = ConversionStatus.SUCCESS
208
+ status = ConversionStatus.SUCCESS
209
+ for page in converted_doc.pages:
210
+ if not page._backend.is_valid():
211
+ converted_doc.errors.append(
212
+ ErrorItem(
213
+ component_type=DoclingComponentType.PDF_BACKEND,
214
+ module_name=type(page._backend).__name__,
215
+ error_message=f"Page {page.page_no} failed to parse.",
216
+ )
217
+ )
218
+ status = ConversionStatus.PARTIAL_SUCCESS
219
+
220
+ converted_doc.status = status
209
221
 
210
222
  except Exception as e:
211
223
  converted_doc.status = ConversionStatus.FAILURE
212
224
  trace = "\n".join(traceback.format_exception(e))
213
- _log.info(f"Encountered an error during conversion: {trace}")
225
+ _log.info(
226
+ f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
227
+ f"{trace}"
228
+ )
214
229
 
215
230
  end_doc_time = time.time() - start_doc_time
216
231
  _log.info(
@@ -230,7 +245,9 @@ class DocumentConverter:
230
245
  # Generate the page image and store it in the page object
231
246
  def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
232
247
  # default scale
233
- page.get_image(scale=1.0)
248
+ page.get_image(
249
+ scale=1.0
250
+ ) # puts the page image on the image cache at default scale
234
251
 
235
252
  # user requested scales
236
253
  if self.assemble_options.images_scale is not None:
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "1.7.1" # DO NOT EDIT, updated automatically
3
+ version = "1.8.1" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
File without changes
File without changes
File without changes
File without changes