docling 1.6.3__py3-none-any.whl → 1.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -39,8 +39,9 @@ class PdfPageBackend(ABC):
39
39
 
40
40
  class PdfDocumentBackend(ABC):
41
41
  @abstractmethod
42
- def __init__(self, path_or_stream: Union[BytesIO, Path]):
43
- pass
42
+ def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
43
+ self.path_or_stream = path_or_stream
44
+ self.document_hash = document_hash
44
45
 
45
46
  @abstractmethod
46
47
  def load_page(self, page_no: int) -> PdfPageBackend:
@@ -56,4 +57,7 @@ class PdfDocumentBackend(ABC):
56
57
 
57
58
  @abstractmethod
58
59
  def unload(self):
59
- pass
60
+ if isinstance(self.path_or_stream, BytesIO):
61
+ self.path_or_stream.close()
62
+
63
+ self.path_or_stream = None
@@ -1,6 +1,5 @@
1
1
  import logging
2
2
  import random
3
- import time
4
3
  from io import BytesIO
5
4
  from pathlib import Path
6
5
  from typing import Iterable, Optional, Union
@@ -17,13 +16,26 @@ _log = logging.getLogger(__name__)
17
16
 
18
17
 
19
18
  class DoclingParsePageBackend(PdfPageBackend):
20
- def __init__(self, page_obj: PdfPage, docling_page_obj):
19
+ def __init__(
20
+ self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
21
+ ):
21
22
  super().__init__(page_obj)
22
23
  self._ppage = page_obj
23
- self._dpage = docling_page_obj
24
- self.text_page = None
24
+
25
+ parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
26
+
27
+ self._dpage = None
28
+ self.broken_page = "pages" not in parsed_page
29
+ if not self.broken_page:
30
+ self._dpage = parsed_page["pages"][0]
31
+ else:
32
+ raise RuntimeError(
33
+ f"Page {page_no} of document {document_hash} could not be parsed."
34
+ )
25
35
 
26
36
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
37
+ if self.broken_page:
38
+ return ""
27
39
  # Find intersecting cells on the page
28
40
  text_piece = ""
29
41
  page_size = self.get_size()
@@ -58,6 +70,9 @@ class DoclingParsePageBackend(PdfPageBackend):
58
70
  cells = []
59
71
  cell_counter = 0
60
72
 
73
+ if self.broken_page:
74
+ return cells
75
+
61
76
  page_size = self.get_size()
62
77
 
63
78
  parser_width = self._dpage["width"]
@@ -168,38 +183,39 @@ class DoclingParsePageBackend(PdfPageBackend):
168
183
  def unload(self):
169
184
  self._ppage = None
170
185
  self._dpage = None
171
- self.text_page = None
172
186
 
173
187
 
174
188
  class DoclingParseDocumentBackend(PdfDocumentBackend):
175
- def __init__(self, path_or_stream: Union[BytesIO, Path]):
176
- super().__init__(path_or_stream)
177
- self._pdoc = pdfium.PdfDocument(path_or_stream)
178
- # Parsing cells with docling_parser call
179
- parser = pdf_parser()
189
+ def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
190
+ super().__init__(path_or_stream, document_hash)
180
191
 
181
- start_pb_time = time.time()
192
+ self._pdoc = pdfium.PdfDocument(path_or_stream)
193
+ self.parser = pdf_parser()
182
194
 
195
+ success = False
183
196
  if isinstance(path_or_stream, BytesIO):
184
- self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
185
- else:
186
- self._parser_doc = parser.find_cells(str(path_or_stream))
197
+ success = self.parser.load_document_from_bytesio(
198
+ document_hash, path_or_stream
199
+ )
200
+ elif isinstance(path_or_stream, Path):
201
+ success = self.parser.load_document(document_hash, str(path_or_stream))
187
202
 
188
- end_pb_time = time.time() - start_pb_time
189
- _log.info(f"Time to parse with docling-parse: time={end_pb_time:.3f}")
203
+ if not success:
204
+ raise RuntimeError("docling-parse could not load this document.")
190
205
 
191
206
  def page_count(self) -> int:
192
- return len(self._parser_doc["pages"])
207
+ return len(self._pdoc) # To be replaced with docling-parse API
193
208
 
194
209
  def load_page(self, page_no: int) -> DoclingParsePageBackend:
195
210
  return DoclingParsePageBackend(
196
- self._pdoc[page_no], self._parser_doc["pages"][page_no]
211
+ self.parser, self.document_hash, page_no, self._pdoc[page_no]
197
212
  )
198
213
 
199
214
  def is_valid(self) -> bool:
200
215
  return self.page_count() > 0
201
216
 
202
217
  def unload(self):
218
+ super().unload()
219
+ self.parser.unload_document(self.document_hash)
203
220
  self._pdoc.close()
204
221
  self._pdoc = None
205
- self._parser_doc = None
@@ -215,8 +215,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
215
215
 
216
216
 
217
217
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
218
- def __init__(self, path_or_stream: Union[BytesIO, Path]):
219
- super().__init__(path_or_stream)
218
+ def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
219
+ super().__init__(path_or_stream, document_hash)
220
220
  self._pdoc = pdfium.PdfDocument(path_or_stream)
221
221
 
222
222
  def page_count(self) -> int:
@@ -229,5 +229,6 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
229
229
  return self.page_count() > 0
230
230
 
231
231
  def unload(self):
232
+ super().unload()
232
233
  self._pdoc.close()
233
234
  self._pdoc = None
@@ -79,7 +79,9 @@ class InputDocument(BaseModel):
79
79
  self.valid = False
80
80
  else:
81
81
  self.document_hash = create_file_hash(path_or_stream)
82
- self._backend = pdf_backend(path_or_stream=path_or_stream)
82
+ self._backend = pdf_backend(
83
+ path_or_stream=path_or_stream, document_hash=self.document_hash
84
+ )
83
85
 
84
86
  elif isinstance(path_or_stream, BytesIO):
85
87
  self.file = PurePath(filename)
@@ -89,7 +91,9 @@ class InputDocument(BaseModel):
89
91
  self.valid = False
90
92
  else:
91
93
  self.document_hash = create_file_hash(path_or_stream)
92
- self._backend = pdf_backend(path_or_stream=path_or_stream)
94
+ self._backend = pdf_backend(
95
+ path_or_stream=path_or_stream, document_hash=self.document_hash
96
+ )
93
97
 
94
98
  if self.document_hash and self._backend.page_count() > 0:
95
99
  self.page_count = self._backend.page_count()
@@ -141,6 +141,8 @@ class DocumentConverter:
141
141
  start_doc_time = time.time()
142
142
  converted_doc = ConvertedDocument(input=in_doc)
143
143
 
144
+ _log.info(f"Processing document {in_doc.file.name}")
145
+
144
146
  if not in_doc.valid:
145
147
  converted_doc.status = ConversionStatus.FAILURE
146
148
  return converted_doc
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.6.3
3
+ Version: 1.7.1
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -23,7 +23,7 @@ Requires-Dist: certifi (>=2024.7.4)
23
23
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
24
24
  Requires-Dist: docling-core (>=1.1.2,<2.0.0)
25
25
  Requires-Dist: docling-ibm-models (>=1.1.2,<2.0.0)
26
- Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
26
+ Requires-Dist: docling-parse (>=1.1.1,<2.0.0)
27
27
  Requires-Dist: easyocr (>=1.7,<2.0)
28
28
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
29
29
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -1,13 +1,13 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- docling/backend/abstract_backend.py,sha256=ZfEHaBPGM1cmqrhaEoU3MHhnHU11NhOnhtFEIbVMYDo,1221
4
- docling/backend/docling_parse_backend.py,sha256=ELDJeC0bHYWEtkMcvcPxTMIbTBLO1N9VLeqsardlXg4,6880
5
- docling/backend/pypdfium2_backend.py,sha256=xUiIYgd7i22YDx4-W2hfPUaQFszW0gcT6pavG5qZ8LE,8062
3
+ docling/backend/abstract_backend.py,sha256=wvrywm1pPt79L5Dt_da5QGmf9dDzjBGj1rSaUQxqI8s,1432
4
+ docling/backend/docling_parse_backend.py,sha256=hXyF2VPPdLs7APWEXTlfz0wI86rUGYa67Q73zgTB-Ug,7438
5
+ docling/backend/pypdfium2_backend.py,sha256=84AnFah8Ztk-j8_9MTHalPU3a9fClrEz7A_rfzWDkFc,8122
6
6
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  docling/datamodel/base_models.py,sha256=5VHit5h7OleKnbhvy-sWDxQLizEdNrGUBrypyzwHyAE,8604
8
- docling/datamodel/document.py,sha256=Dgi9pSwXCgIoR26MKiRDiVMyMaFKdvGSKq2Fm5Lef9M,13173
8
+ docling/datamodel/document.py,sha256=J97KeT8fJRKijUorDky-xA2FoOGBXOjrReYjdeo8NK4,13333
9
9
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
10
- docling/document_converter.py,sha256=UFSELvUSWsr8s0VByu4lNuzu7bn7zZauJTL3FTSLSBg,10371
10
+ docling/document_converter.py,sha256=Tx4BwtOxpwizmXgJl2nK6I-6m0V3fygHwYFomJTH2Ns,10433
11
11
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
13
13
  docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
@@ -21,7 +21,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjg
21
21
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
23
23
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
24
- docling-1.6.3.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
25
- docling-1.6.3.dist-info/METADATA,sha256=DeRKK5TVCv9rp3eQfZkXfZXwKLi4df2l10qXKcm3ISQ,7229
26
- docling-1.6.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
27
- docling-1.6.3.dist-info/RECORD,,
24
+ docling-1.7.1.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
25
+ docling-1.7.1.dist-info/METADATA,sha256=ADdVabYgc4VEIGKhM-tI6XBU_CG9tzKl_au69TZ9LbY,7229
26
+ docling-1.7.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
27
+ docling-1.7.1.dist-info/RECORD,,