docling 1.6.3__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -39,8 +39,9 @@ class PdfPageBackend(ABC):
39
39
 
40
40
  class PdfDocumentBackend(ABC):
41
41
  @abstractmethod
42
- def __init__(self, path_or_stream: Union[BytesIO, Path]):
43
- pass
42
+ def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
43
+ self.path_or_stream = path_or_stream
44
+ self.document_hash = document_hash
44
45
 
45
46
  @abstractmethod
46
47
  def load_page(self, page_no: int) -> PdfPageBackend:
@@ -56,4 +57,7 @@ class PdfDocumentBackend(ABC):
56
57
 
57
58
  @abstractmethod
58
59
  def unload(self):
59
- pass
60
+ if isinstance(self.path_or_stream, BytesIO):
61
+ self.path_or_stream.close()
62
+
63
+ self.path_or_stream = None
@@ -1,6 +1,5 @@
1
1
  import logging
2
2
  import random
3
- import time
4
3
  from io import BytesIO
5
4
  from pathlib import Path
6
5
  from typing import Iterable, Optional, Union
@@ -17,11 +16,14 @@ _log = logging.getLogger(__name__)
17
16
 
18
17
 
19
18
  class DoclingParsePageBackend(PdfPageBackend):
20
- def __init__(self, page_obj: PdfPage, docling_page_obj):
19
+ def __init__(
20
+ self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
21
+ ):
21
22
  super().__init__(page_obj)
22
23
  self._ppage = page_obj
23
- self._dpage = docling_page_obj
24
- self.text_page = None
24
+
25
+ parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
26
+ self._dpage = parsed_page["pages"][0]
25
27
 
26
28
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
27
29
  # Find intersecting cells on the page
@@ -168,38 +170,39 @@ class DoclingParsePageBackend(PdfPageBackend):
168
170
  def unload(self):
169
171
  self._ppage = None
170
172
  self._dpage = None
171
- self.text_page = None
172
173
 
173
174
 
174
175
  class DoclingParseDocumentBackend(PdfDocumentBackend):
175
- def __init__(self, path_or_stream: Union[BytesIO, Path]):
176
- super().__init__(path_or_stream)
177
- self._pdoc = pdfium.PdfDocument(path_or_stream)
178
- # Parsing cells with docling_parser call
179
- parser = pdf_parser()
176
+ def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
177
+ super().__init__(path_or_stream, document_hash)
180
178
 
181
- start_pb_time = time.time()
179
+ self._pdoc = pdfium.PdfDocument(path_or_stream)
180
+ self.parser = pdf_parser()
182
181
 
182
+ success = False
183
183
  if isinstance(path_or_stream, BytesIO):
184
- self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
185
- else:
186
- self._parser_doc = parser.find_cells(str(path_or_stream))
184
+ success = self.parser.load_document_from_bytesio(
185
+ document_hash, path_or_stream
186
+ )
187
+ elif isinstance(path_or_stream, Path):
188
+ success = self.parser.load_document(document_hash, str(path_or_stream))
187
189
 
188
- end_pb_time = time.time() - start_pb_time
189
- _log.info(f"Time to parse with docling-parse: time={end_pb_time:.3f}")
190
+ if not success:
191
+ raise RuntimeError("docling-parse could not load this document.")
190
192
 
191
193
  def page_count(self) -> int:
192
- return len(self._parser_doc["pages"])
194
+ return len(self._pdoc) # To be replaced with docling-parse API
193
195
 
194
196
  def load_page(self, page_no: int) -> DoclingParsePageBackend:
195
197
  return DoclingParsePageBackend(
196
- self._pdoc[page_no], self._parser_doc["pages"][page_no]
198
+ self.parser, self.document_hash, page_no, self._pdoc[page_no]
197
199
  )
198
200
 
199
201
  def is_valid(self) -> bool:
200
202
  return self.page_count() > 0
201
203
 
202
204
  def unload(self):
205
+ super().unload()
206
+ self.parser.unload_document(self.document_hash)
203
207
  self._pdoc.close()
204
208
  self._pdoc = None
205
- self._parser_doc = None
@@ -215,8 +215,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
215
215
 
216
216
 
217
217
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
218
- def __init__(self, path_or_stream: Union[BytesIO, Path]):
219
- super().__init__(path_or_stream)
218
+ def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
219
+ super().__init__(path_or_stream, document_hash)
220
220
  self._pdoc = pdfium.PdfDocument(path_or_stream)
221
221
 
222
222
  def page_count(self) -> int:
@@ -229,5 +229,6 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
229
229
  return self.page_count() > 0
230
230
 
231
231
  def unload(self):
232
+ super().unload()
232
233
  self._pdoc.close()
233
234
  self._pdoc = None
@@ -79,7 +79,9 @@ class InputDocument(BaseModel):
79
79
  self.valid = False
80
80
  else:
81
81
  self.document_hash = create_file_hash(path_or_stream)
82
- self._backend = pdf_backend(path_or_stream=path_or_stream)
82
+ self._backend = pdf_backend(
83
+ path_or_stream=path_or_stream, document_hash=self.document_hash
84
+ )
83
85
 
84
86
  elif isinstance(path_or_stream, BytesIO):
85
87
  self.file = PurePath(filename)
@@ -89,7 +91,9 @@ class InputDocument(BaseModel):
89
91
  self.valid = False
90
92
  else:
91
93
  self.document_hash = create_file_hash(path_or_stream)
92
- self._backend = pdf_backend(path_or_stream=path_or_stream)
94
+ self._backend = pdf_backend(
95
+ path_or_stream=path_or_stream, document_hash=self.document_hash
96
+ )
93
97
 
94
98
  if self.document_hash and self._backend.page_count() > 0:
95
99
  self.page_count = self._backend.page_count()
@@ -141,6 +141,8 @@ class DocumentConverter:
141
141
  start_doc_time = time.time()
142
142
  converted_doc = ConvertedDocument(input=in_doc)
143
143
 
144
+ _log.info(f"Processing document {in_doc.file.name}")
145
+
144
146
  if not in_doc.valid:
145
147
  converted_doc.status = ConversionStatus.FAILURE
146
148
  return converted_doc
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.6.3
3
+ Version: 1.7.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -23,7 +23,7 @@ Requires-Dist: certifi (>=2024.7.4)
23
23
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
24
24
  Requires-Dist: docling-core (>=1.1.2,<2.0.0)
25
25
  Requires-Dist: docling-ibm-models (>=1.1.2,<2.0.0)
26
- Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
26
+ Requires-Dist: docling-parse (>=1.0.0,<2.0.0)
27
27
  Requires-Dist: easyocr (>=1.7,<2.0)
28
28
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
29
29
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -1,13 +1,13 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- docling/backend/abstract_backend.py,sha256=ZfEHaBPGM1cmqrhaEoU3MHhnHU11NhOnhtFEIbVMYDo,1221
4
- docling/backend/docling_parse_backend.py,sha256=ELDJeC0bHYWEtkMcvcPxTMIbTBLO1N9VLeqsardlXg4,6880
5
- docling/backend/pypdfium2_backend.py,sha256=xUiIYgd7i22YDx4-W2hfPUaQFszW0gcT6pavG5qZ8LE,8062
3
+ docling/backend/abstract_backend.py,sha256=wvrywm1pPt79L5Dt_da5QGmf9dDzjBGj1rSaUQxqI8s,1432
4
+ docling/backend/docling_parse_backend.py,sha256=E4JFpWBamiU8vRmHvgMZLU7lOxtUMq85TV5hMrGvWJI,7070
5
+ docling/backend/pypdfium2_backend.py,sha256=84AnFah8Ztk-j8_9MTHalPU3a9fClrEz7A_rfzWDkFc,8122
6
6
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  docling/datamodel/base_models.py,sha256=5VHit5h7OleKnbhvy-sWDxQLizEdNrGUBrypyzwHyAE,8604
8
- docling/datamodel/document.py,sha256=Dgi9pSwXCgIoR26MKiRDiVMyMaFKdvGSKq2Fm5Lef9M,13173
8
+ docling/datamodel/document.py,sha256=J97KeT8fJRKijUorDky-xA2FoOGBXOjrReYjdeo8NK4,13333
9
9
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
10
- docling/document_converter.py,sha256=UFSELvUSWsr8s0VByu4lNuzu7bn7zZauJTL3FTSLSBg,10371
10
+ docling/document_converter.py,sha256=Tx4BwtOxpwizmXgJl2nK6I-6m0V3fygHwYFomJTH2Ns,10433
11
11
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
13
13
  docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
@@ -21,7 +21,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjg
21
21
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
23
23
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
24
- docling-1.6.3.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
25
- docling-1.6.3.dist-info/METADATA,sha256=DeRKK5TVCv9rp3eQfZkXfZXwKLi4df2l10qXKcm3ISQ,7229
26
- docling-1.6.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
27
- docling-1.6.3.dist-info/RECORD,,
24
+ docling-1.7.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
25
+ docling-1.7.0.dist-info/METADATA,sha256=yqKqPH3w2IxOhmdqHy0nJcYKDrJAufx6uF1Ti44pRs4,7229
26
+ docling-1.7.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
27
+ docling-1.7.0.dist-info/RECORD,,