docling 1.19.0__tar.gz → 1.20.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. {docling-1.19.0 → docling-1.20.0}/PKG-INFO +12 -12
  2. {docling-1.19.0 → docling-1.20.0}/README.md +8 -9
  3. docling-1.20.0/docling/backend/docling_parse_v2_backend.py +236 -0
  4. {docling-1.19.0 → docling-1.20.0}/docling/models/tesseract_ocr_cli_model.py +2 -2
  5. {docling-1.19.0 → docling-1.20.0}/pyproject.toml +8 -4
  6. {docling-1.19.0 → docling-1.20.0}/LICENSE +0 -0
  7. {docling-1.19.0 → docling-1.20.0}/docling/__init__.py +0 -0
  8. {docling-1.19.0 → docling-1.20.0}/docling/backend/__init__.py +0 -0
  9. {docling-1.19.0 → docling-1.20.0}/docling/backend/abstract_backend.py +0 -0
  10. {docling-1.19.0 → docling-1.20.0}/docling/backend/docling_parse_backend.py +0 -0
  11. {docling-1.19.0 → docling-1.20.0}/docling/backend/pypdfium2_backend.py +0 -0
  12. {docling-1.19.0 → docling-1.20.0}/docling/cli/__init__.py +0 -0
  13. {docling-1.19.0 → docling-1.20.0}/docling/cli/main.py +0 -0
  14. {docling-1.19.0 → docling-1.20.0}/docling/datamodel/__init__.py +0 -0
  15. {docling-1.19.0 → docling-1.20.0}/docling/datamodel/base_models.py +0 -0
  16. {docling-1.19.0 → docling-1.20.0}/docling/datamodel/document.py +0 -0
  17. {docling-1.19.0 → docling-1.20.0}/docling/datamodel/pipeline_options.py +0 -0
  18. {docling-1.19.0 → docling-1.20.0}/docling/datamodel/settings.py +0 -0
  19. {docling-1.19.0 → docling-1.20.0}/docling/document_converter.py +0 -0
  20. {docling-1.19.0 → docling-1.20.0}/docling/models/__init__.py +0 -0
  21. {docling-1.19.0 → docling-1.20.0}/docling/models/base_ocr_model.py +0 -0
  22. {docling-1.19.0 → docling-1.20.0}/docling/models/ds_glm_model.py +0 -0
  23. {docling-1.19.0 → docling-1.20.0}/docling/models/easyocr_model.py +0 -0
  24. {docling-1.19.0 → docling-1.20.0}/docling/models/layout_model.py +0 -0
  25. {docling-1.19.0 → docling-1.20.0}/docling/models/page_assemble_model.py +0 -0
  26. {docling-1.19.0 → docling-1.20.0}/docling/models/table_structure_model.py +0 -0
  27. {docling-1.19.0 → docling-1.20.0}/docling/models/tesseract_ocr_model.py +0 -0
  28. {docling-1.19.0 → docling-1.20.0}/docling/pipeline/__init__.py +0 -0
  29. {docling-1.19.0 → docling-1.20.0}/docling/pipeline/base_model_pipeline.py +0 -0
  30. {docling-1.19.0 → docling-1.20.0}/docling/pipeline/standard_model_pipeline.py +0 -0
  31. {docling-1.19.0 → docling-1.20.0}/docling/utils/__init__.py +0 -0
  32. {docling-1.19.0 → docling-1.20.0}/docling/utils/export.py +0 -0
  33. {docling-1.19.0 → docling-1.20.0}/docling/utils/layout_utils.py +0 -0
  34. {docling-1.19.0 → docling-1.20.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.19.0
3
+ Version: 1.20.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -22,12 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Provides-Extra: tesserocr
23
23
  Requires-Dist: certifi (>=2024.7.4)
24
24
  Requires-Dist: deepsearch-glm (>=0.22.0,<0.23.0)
25
- Requires-Dist: docling-core (>=1.6.2,<2.0.0)
25
+ Requires-Dist: docling-core (>=1.7.1,<2.0.0)
26
26
  Requires-Dist: docling-ibm-models (>=2.0.0,<3.0.0)
27
- Requires-Dist: docling-parse (>=1.4.1,<2.0.0)
27
+ Requires-Dist: docling-parse (>=1.6.0,<2.0.0)
28
28
  Requires-Dist: easyocr (>=1.7,<2.0)
29
29
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
30
30
  Requires-Dist: huggingface_hub (>=0.23,<1)
31
+ Requires-Dist: pandas (>=2.1.4,<3.0.0)
31
32
  Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
32
33
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
33
34
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
@@ -335,15 +336,14 @@ from docling_core.transforms.chunker import HierarchicalChunker
335
336
 
336
337
  doc = DocumentConverter().convert_single("https://arxiv.org/pdf/2206.01062").output
337
338
  chunks = list(HierarchicalChunker().chunk(doc))
338
- # > [
339
- # > ChunkWithMetadata(
340
- # > path='$.main-text[0]',
341
- # > text='DocLayNet: A Large Human-Annotated Dataset [...]',
342
- # > page=1,
343
- # > bbox=[107.30, 672.38, 505.19, 709.08]
344
- # > ),
345
- # > [...]
346
- # > ]
339
+ print(chunks[0])
340
+ # ChunkWithMetadata(
341
+ # path='#/main-text/1',
342
+ # text='DocLayNet: A Large Human-Annotated Dataset [...]',
343
+ # page=1,
344
+ # bbox=[107.30, 672.38, 505.19, 709.08],
345
+ # [...]
346
+ # )
347
347
  ```
348
348
 
349
349
 
@@ -289,15 +289,14 @@ from docling_core.transforms.chunker import HierarchicalChunker
289
289
 
290
290
  doc = DocumentConverter().convert_single("https://arxiv.org/pdf/2206.01062").output
291
291
  chunks = list(HierarchicalChunker().chunk(doc))
292
- # > [
293
- # > ChunkWithMetadata(
294
- # > path='$.main-text[0]',
295
- # > text='DocLayNet: A Large Human-Annotated Dataset [...]',
296
- # > page=1,
297
- # > bbox=[107.30, 672.38, 505.19, 709.08]
298
- # > ),
299
- # > [...]
300
- # > ]
292
+ print(chunks[0])
293
+ # ChunkWithMetadata(
294
+ # path='#/main-text/1',
295
+ # text='DocLayNet: A Large Human-Annotated Dataset [...]',
296
+ # page=1,
297
+ # bbox=[107.30, 672.38, 505.19, 709.08],
298
+ # [...]
299
+ # )
301
300
  ```
302
301
 
303
302
 
@@ -0,0 +1,236 @@
1
+ import logging
2
+ import random
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+ from typing import Iterable, List, Optional, Union
6
+
7
+ import pypdfium2 as pdfium
8
+ from docling_parse.docling_parse import pdf_parser_v2
9
+ from PIL import Image, ImageDraw
10
+ from pypdfium2 import PdfPage
11
+
12
+ from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
13
+ from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
14
+
15
+ _log = logging.getLogger(__name__)
16
+
17
+
18
class DoclingParseV2PageBackend(PdfPageBackend):
    """Single-page backend backed by the docling-parse v2 parser.

    Text cells and image rectangles come from the parser's "sanitized" page
    dictionary; rasterization and page dimensions come from the pypdfium2
    page handle.
    """

    def __init__(
        self, parser: pdf_parser_v2, document_hash: str, page_no: int, page_obj: PdfPage
    ):
        """Parse one page of an already-loaded document.

        :param parser: shared ``pdf_parser_v2`` with the document loaded under
            ``document_hash``.
        :param document_hash: key identifying the document inside the parser.
        :param page_no: zero-based page index.
        :param page_obj: pypdfium2 page handle used for rendering/size queries.
        """
        self._ppage = page_obj
        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)

        self.valid = "pages" in parsed_page
        if self.valid:
            # NOTE(review): indexing with page_no assumes the parser returns the
            # whole page list here; if it returns only the requested page this
            # should be [0] — confirm against the docling-parse v2 API.
            self._dpage = parsed_page["pages"][page_no]
        else:
            _log.info(
                f"An error occurred when loading page {page_no} of document {document_hash}."
            )

    def is_valid(self) -> bool:
        """True if the parser produced usable content for this page."""
        return self.valid

    def _header_indices(self, header: List[str]):
        """Resolve the column positions once, instead of calling
        ``list.index()`` for every cell inside the loops below."""
        return (
            header.index("x0"),
            header.index("y0"),
            header.index("x1"),
            header.index("y1"),
            header.index("text"),
        )

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the space-joined text of all cells overlapping *bbox*.

        A cell contributes when more than half of its own area intersects
        *bbox* (expected in top-left-origin page coordinates).
        """
        if not self.valid:
            return ""

        page_size = self.get_size()
        sanitized = self._dpage["sanitized"]
        parser_width = sanitized["dimension"]["width"]
        parser_height = sanitized["dimension"]["height"]

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        cells_data = sanitized["cells"]["data"]
        ix0, iy0, ix1, iy1, itext = self._header_indices(sanitized["cells"]["header"])

        pieces: List[str] = []
        for cell_data in cells_data:
            # Rescale from parser coordinates to pdfium page coordinates.
            cell_bbox = BoundingBox(
                l=cell_data[ix0] * scale * page_size.width / parser_width,
                b=cell_data[iy0] * scale * page_size.height / parser_height,
                r=cell_data[ix1] * scale * page_size.width / parser_width,
                t=cell_data[iy1] * scale * page_size.height / parser_height,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height=page_size.height * scale)

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
            if overlap_frac > 0.5:
                pieces.append(cell_data[itext])

        return " ".join(pieces)

    def get_text_cells(self) -> Iterable[Cell]:
        """Return all sanitized text cells, rescaled from parser coordinates
        to pdfium page coordinates (top-left origin)."""
        cells: List[Cell] = []
        if not self.valid:
            return cells

        page_size = self.get_size()
        sanitized = self._dpage["sanitized"]
        parser_width = sanitized["dimension"]["width"]
        parser_height = sanitized["dimension"]["height"]

        cells_data = sanitized["cells"]["data"]
        ix0, iy0, ix1, iy1, itext = self._header_indices(sanitized["cells"]["header"])

        for cell_id, cell_data in enumerate(cells_data):
            x0 = cell_data[ix0]
            y0 = cell_data[iy0]
            x1 = cell_data[ix1]
            y1 = cell_data[iy1]

            # Normalize inverted rectangles so l <= r and b <= t hold.
            if x1 < x0:
                x0, x1 = x1, x0
            if y1 < y0:
                y0, y1 = y1, y0

            cells.append(
                Cell(
                    id=cell_id,
                    text=cell_data[itext],
                    bbox=BoundingBox(
                        l=x0 * page_size.width / parser_width,
                        b=y0 * page_size.height / parser_height,
                        r=x1 * page_size.width / parser_width,
                        t=y1 * page_size.height / parser_height,
                        coord_origin=CoordOrigin.BOTTOMLEFT,
                    ).to_top_left_origin(page_size.height),
                )
            )

        # Debug helper: overlay the cell rectangles on the rendered page.
        def draw_clusters_and_cells():
            image = (
                self.get_page_image()
            )  # make new image to avoid drawing on the saved ones
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = c.bbox.as_tuple()
                cell_color = (
                    random.randint(30, 140),
                    random.randint(30, 140),
                    random.randint(30, 140),
                )
                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
            image.show()

        # draw_clusters_and_cells()

        return cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield bounding boxes (top-left origin, scaled by *scale*) of
        embedded images whose area exceeds 32x32; smaller bitmaps are skipped.
        """
        AREA_THRESHOLD = 32 * 32

        if not self.valid:
            # Consistent with the other accessors: a failed parse never set
            # self._dpage, so there is nothing to yield.
            return

        images = self._dpage["sanitized"]["images"]["data"]
        images_header = self._dpage["sanitized"]["images"]["header"]
        # Resolve column positions once, outside the per-row loop.
        ix0 = images_header.index("x0")
        iy0 = images_header.index("y0")
        ix1 = images_header.index("x1")
        iy1 = images_header.index("y1")

        for row in images:
            cropbox = BoundingBox.from_tuple(
                (row[ix0], row[iy0], row[ix1], row[iy1]),
                origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(self.get_size().height)

            if cropbox.area() > AREA_THRESHOLD:
                yield cropbox.scaled(scale=scale)

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page (or the *cropbox* region) to a PIL image.

        Rendering happens at 1.5x the requested scale and the result is then
        resized down, which yields a sharper image.
        """
        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # pdfium expects crop margins measured from each page edge.
            padbox = cropbox.to_bottom_left_origin(page_size.height)
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> PageSize:
        """Page dimensions as reported by pdfium."""
        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop references to the parsed page dict and the pdfium page."""
        self._ppage = None
        self._dpage = None
199
+
200
+
201
class DoclingParseV2DocumentBackend(PdfDocumentBackend):
    """Document backend pairing docling-parse v2 (content extraction) with
    pypdfium2 (page handles and rendering)."""

    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        """Load the document into both pypdfium2 and the v2 parser.

        :param path_or_stream: PDF source, either an in-memory stream or a path.
        :param document_hash: key under which the parser registers the document.
        :raises RuntimeError: if docling-parse fails to load the document (or
            ``path_or_stream`` is neither a ``BytesIO`` nor a ``Path``).
        """
        super().__init__(path_or_stream, document_hash)

        self._pdoc = pdfium.PdfDocument(path_or_stream)
        self.parser = pdf_parser_v2("fatal")  # log only fatal parser errors

        success = False
        if isinstance(path_or_stream, BytesIO):
            success = self.parser.load_document_from_bytesio(
                document_hash, path_or_stream
            )
        elif isinstance(path_or_stream, Path):
            success = self.parser.load_document(document_hash, str(path_or_stream))

        if not success:
            raise RuntimeError(
                f"docling-parse could not load document {document_hash}."
            )

    def page_count(self) -> int:
        """Number of pages in the document."""
        return len(self._pdoc)  # To be replaced with docling-parse API

    def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
        """Parse and wrap page *page_no* (zero-based)."""
        return DoclingParseV2PageBackend(
            self.parser, self.document_hash, page_no, self._pdoc[page_no]
        )

    def is_valid(self) -> bool:
        """A document with at least one page is considered valid."""
        return self.page_count() > 0

    def unload(self):
        """Release parser and pdfium resources.

        Guarded so that a second call does not crash on the already-cleared
        pdfium handle.
        """
        super().unload()
        self.parser.unload_document(self.document_hash)
        if self._pdoc is not None:
            self._pdoc.close()
            self._pdoc = None
@@ -1,7 +1,7 @@
1
1
  import io
2
2
  import logging
3
3
  import tempfile
4
- from subprocess import PIPE, Popen
4
+ from subprocess import DEVNULL, PIPE, Popen
5
5
  from typing import Iterable, Tuple
6
6
 
7
7
  import pandas as pd
@@ -81,7 +81,7 @@ class TesseractOcrCliModel(BaseOcrModel):
81
81
  cmd += [ifilename, "stdout", "tsv"]
82
82
  _log.info("command: {}".format(" ".join(cmd)))
83
83
 
84
- proc = Popen(cmd, stdout=PIPE)
84
+ proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
85
85
  output, _ = proc.communicate()
86
86
 
87
87
  # _log.info(output)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "1.19.0" # DO NOT EDIT, updated automatically
3
+ version = "1.20.0" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -37,7 +37,7 @@ torchvision = [
37
37
  ######################
38
38
  python = "^3.10"
39
39
  pydantic = "^2.0.0"
40
- docling-core = "^1.6.2"
40
+ docling-core = "^1.7.1"
41
41
  docling-ibm-models = "^2.0.0"
42
42
  deepsearch-glm = "^0.22.0"
43
43
  filetype = "^1.2.0"
@@ -47,12 +47,13 @@ huggingface_hub = ">=0.23,<1"
47
47
  requests = "^2.32.3"
48
48
  easyocr = "^1.7"
49
49
  tesserocr = { version = "^2.7.1", optional = true }
50
- docling-parse = "^1.4.1"
50
+ docling-parse = "^1.6.0"
51
51
  certifi = ">=2024.7.4"
52
52
  rtree = "^1.3.0"
53
53
  scipy = "^1.14.1"
54
54
  pyarrow = "^16.1.0"
55
55
  typer = "^0.12.5"
56
+ pandas = "^2.1.4"
56
57
 
57
58
  [tool.poetry.group.dev.dependencies]
58
59
  black = {extras = ["jupyter"], version = "^24.4.2"}
@@ -67,7 +68,7 @@ pytest-xdist = "^3.3.1"
67
68
  types-requests = "^2.31.0.2"
68
69
  flake8-pyproject = "^1.2.3"
69
70
  pylint = "^2.17.5"
70
- pandas-stubs = "^2.2.2.240909"
71
+ pandas-stubs = "^2.1.4.231227"
71
72
  ipykernel = "^6.29.5"
72
73
  ipywidgets = "^8.1.5"
73
74
  nbqa = "^1.9.0"
@@ -75,6 +76,9 @@ nbqa = "^1.9.0"
75
76
  [tool.poetry.group.examples.dependencies]
76
77
  datasets = "^2.21.0"
77
78
  python-dotenv = "^1.0.1"
79
+ llama-index-readers-docling = "^0.1.0"
80
+ llama-index-node-parser-docling = "^0.1.0"
81
+ llama-index-readers-file = "^0.2.2"
78
82
  llama-index-embeddings-huggingface = "^0.3.1"
79
83
  llama-index-llms-huggingface-api = "^0.2.0"
80
84
  llama-index-vector-stores-milvus = "^0.2.1"
File without changes
File without changes
File without changes