docling 1.6.2__tar.gz → 1.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27):
  1. {docling-1.6.2 → docling-1.7.0}/PKG-INFO +2 -2
  2. {docling-1.6.2 → docling-1.7.0}/docling/backend/abstract_backend.py +7 -3
  3. {docling-1.6.2 → docling-1.7.0}/docling/backend/docling_parse_backend.py +22 -21
  4. {docling-1.6.2 → docling-1.7.0}/docling/backend/pypdfium2_backend.py +3 -2
  5. {docling-1.6.2 → docling-1.7.0}/docling/datamodel/document.py +6 -2
  6. {docling-1.6.2 → docling-1.7.0}/docling/document_converter.py +2 -0
  7. {docling-1.6.2 → docling-1.7.0}/pyproject.toml +2 -2
  8. {docling-1.6.2 → docling-1.7.0}/LICENSE +0 -0
  9. {docling-1.6.2 → docling-1.7.0}/README.md +0 -0
  10. {docling-1.6.2 → docling-1.7.0}/docling/__init__.py +0 -0
  11. {docling-1.6.2 → docling-1.7.0}/docling/backend/__init__.py +0 -0
  12. {docling-1.6.2 → docling-1.7.0}/docling/datamodel/__init__.py +0 -0
  13. {docling-1.6.2 → docling-1.7.0}/docling/datamodel/base_models.py +0 -0
  14. {docling-1.6.2 → docling-1.7.0}/docling/datamodel/settings.py +0 -0
  15. {docling-1.6.2 → docling-1.7.0}/docling/models/__init__.py +0 -0
  16. {docling-1.6.2 → docling-1.7.0}/docling/models/base_ocr_model.py +0 -0
  17. {docling-1.6.2 → docling-1.7.0}/docling/models/ds_glm_model.py +0 -0
  18. {docling-1.6.2 → docling-1.7.0}/docling/models/easyocr_model.py +0 -0
  19. {docling-1.6.2 → docling-1.7.0}/docling/models/layout_model.py +0 -0
  20. {docling-1.6.2 → docling-1.7.0}/docling/models/page_assemble_model.py +0 -0
  21. {docling-1.6.2 → docling-1.7.0}/docling/models/table_structure_model.py +0 -0
  22. {docling-1.6.2 → docling-1.7.0}/docling/pipeline/__init__.py +0 -0
  23. {docling-1.6.2 → docling-1.7.0}/docling/pipeline/base_model_pipeline.py +0 -0
  24. {docling-1.6.2 → docling-1.7.0}/docling/pipeline/standard_model_pipeline.py +0 -0
  25. {docling-1.6.2 → docling-1.7.0}/docling/utils/__init__.py +0 -0
  26. {docling-1.6.2 → docling-1.7.0}/docling/utils/layout_utils.py +0 -0
  27. {docling-1.6.2 → docling-1.7.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.6.2
3
+ Version: 1.7.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -23,7 +23,7 @@ Requires-Dist: certifi (>=2024.7.4)
23
23
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
24
24
  Requires-Dist: docling-core (>=1.1.2,<2.0.0)
25
25
  Requires-Dist: docling-ibm-models (>=1.1.2,<2.0.0)
26
- Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
26
+ Requires-Dist: docling-parse (>=1.0.0,<2.0.0)
27
27
  Requires-Dist: easyocr (>=1.7,<2.0)
28
28
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
29
29
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -39,8 +39,9 @@ class PdfPageBackend(ABC):
39
39
 
40
40
  class PdfDocumentBackend(ABC):
41
41
  @abstractmethod
42
- def __init__(self, path_or_stream: Union[BytesIO, Path]):
43
- pass
42
+ def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
43
+ self.path_or_stream = path_or_stream
44
+ self.document_hash = document_hash
44
45
 
45
46
  @abstractmethod
46
47
  def load_page(self, page_no: int) -> PdfPageBackend:
@@ -56,4 +57,7 @@ class PdfDocumentBackend(ABC):
56
57
 
57
58
  @abstractmethod
58
59
  def unload(self):
59
- pass
60
+ if isinstance(self.path_or_stream, BytesIO):
61
+ self.path_or_stream.close()
62
+
63
+ self.path_or_stream = None
@@ -1,6 +1,5 @@
1
1
  import logging
2
2
  import random
3
- import time
4
3
  from io import BytesIO
5
4
  from pathlib import Path
6
5
  from typing import Iterable, Optional, Union
@@ -17,11 +16,14 @@ _log = logging.getLogger(__name__)
17
16
 
18
17
 
19
18
  class DoclingParsePageBackend(PdfPageBackend):
20
- def __init__(self, page_obj: PdfPage, docling_page_obj):
19
+ def __init__(
20
+ self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
21
+ ):
21
22
  super().__init__(page_obj)
22
23
  self._ppage = page_obj
23
- self._dpage = docling_page_obj
24
- self.text_page = None
24
+
25
+ parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
26
+ self._dpage = parsed_page["pages"][0]
25
27
 
26
28
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
27
29
  # Find intersecting cells on the page
@@ -168,40 +170,39 @@ class DoclingParsePageBackend(PdfPageBackend):
168
170
  def unload(self):
169
171
  self._ppage = None
170
172
  self._dpage = None
171
- self.text_page = None
172
173
 
173
174
 
174
175
  class DoclingParseDocumentBackend(PdfDocumentBackend):
175
- def __init__(self, path_or_stream: Union[BytesIO, Path]):
176
- super().__init__(path_or_stream)
177
- self._pdoc = pdfium.PdfDocument(path_or_stream)
178
- # Parsing cells with docling_parser call
179
- parser = pdf_parser()
176
+ def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
177
+ super().__init__(path_or_stream, document_hash)
180
178
 
181
- start_pb_time = time.time()
179
+ self._pdoc = pdfium.PdfDocument(path_or_stream)
180
+ self.parser = pdf_parser()
182
181
 
182
+ success = False
183
183
  if isinstance(path_or_stream, BytesIO):
184
- self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
185
- else:
186
- self._parser_doc = parser.find_cells(str(path_or_stream))
184
+ success = self.parser.load_document_from_bytesio(
185
+ document_hash, path_or_stream
186
+ )
187
+ elif isinstance(path_or_stream, Path):
188
+ success = self.parser.load_document(document_hash, str(path_or_stream))
187
189
 
188
- end_pb_time = time.time() - start_pb_time
189
- _log.info(
190
- f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
191
- )
190
+ if not success:
191
+ raise RuntimeError("docling-parse could not load this document.")
192
192
 
193
193
  def page_count(self) -> int:
194
- return len(self._parser_doc["pages"])
194
+ return len(self._pdoc) # To be replaced with docling-parse API
195
195
 
196
196
  def load_page(self, page_no: int) -> DoclingParsePageBackend:
197
197
  return DoclingParsePageBackend(
198
- self._pdoc[page_no], self._parser_doc["pages"][page_no]
198
+ self.parser, self.document_hash, page_no, self._pdoc[page_no]
199
199
  )
200
200
 
201
201
  def is_valid(self) -> bool:
202
202
  return self.page_count() > 0
203
203
 
204
204
  def unload(self):
205
+ super().unload()
206
+ self.parser.unload_document(self.document_hash)
205
207
  self._pdoc.close()
206
208
  self._pdoc = None
207
- self._parser_doc = None
@@ -215,8 +215,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
215
215
 
216
216
 
217
217
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
218
- def __init__(self, path_or_stream: Union[BytesIO, Path]):
219
- super().__init__(path_or_stream)
218
+ def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
219
+ super().__init__(path_or_stream, document_hash)
220
220
  self._pdoc = pdfium.PdfDocument(path_or_stream)
221
221
 
222
222
  def page_count(self) -> int:
@@ -229,5 +229,6 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
229
229
  return self.page_count() > 0
230
230
 
231
231
  def unload(self):
232
+ super().unload()
232
233
  self._pdoc.close()
233
234
  self._pdoc = None
@@ -79,7 +79,9 @@ class InputDocument(BaseModel):
79
79
  self.valid = False
80
80
  else:
81
81
  self.document_hash = create_file_hash(path_or_stream)
82
- self._backend = pdf_backend(path_or_stream=path_or_stream)
82
+ self._backend = pdf_backend(
83
+ path_or_stream=path_or_stream, document_hash=self.document_hash
84
+ )
83
85
 
84
86
  elif isinstance(path_or_stream, BytesIO):
85
87
  self.file = PurePath(filename)
@@ -89,7 +91,9 @@ class InputDocument(BaseModel):
89
91
  self.valid = False
90
92
  else:
91
93
  self.document_hash = create_file_hash(path_or_stream)
92
- self._backend = pdf_backend(path_or_stream=path_or_stream)
94
+ self._backend = pdf_backend(
95
+ path_or_stream=path_or_stream, document_hash=self.document_hash
96
+ )
93
97
 
94
98
  if self.document_hash and self._backend.page_count() > 0:
95
99
  self.page_count = self._backend.page_count()
@@ -141,6 +141,8 @@ class DocumentConverter:
141
141
  start_doc_time = time.time()
142
142
  converted_doc = ConvertedDocument(input=in_doc)
143
143
 
144
+ _log.info(f"Processing document {in_doc.file.name}")
145
+
144
146
  if not in_doc.valid:
145
147
  converted_doc.status = ConversionStatus.FAILURE
146
148
  return converted_doc
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "1.6.2" # DO NOT EDIT, updated automatically
3
+ version = "1.7.0" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -32,7 +32,7 @@ pydantic-settings = "^2.3.0"
32
32
  huggingface_hub = ">=0.23,<1"
33
33
  requests = "^2.32.3"
34
34
  easyocr = "^1.7"
35
- docling-parse = "^0.2.0"
35
+ docling-parse = "^1.0.0"
36
36
  certifi = ">=2024.7.4"
37
37
  rtree = "^1.3.0"
38
38
  scipy = "^1.14.1"
File without changes
File without changes
File without changes
File without changes