docling 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,6 +18,10 @@ class PdfPageBackend(ABC):
18
18
  def get_text_cells(self) -> Iterable["Cell"]:
19
19
  pass
20
20
 
21
@abstractmethod
def get_bitmap_rects(self, scale: int = 1) -> Iterable["BoundingBox"]:
    """Return the bounding boxes of the bitmap images found on this page.

    Args:
        scale: Factor passed by callers to scale the reported boxes;
            concrete backends decide how it is applied.

    Returns:
        An iterable of ``BoundingBox`` objects, one per reported bitmap.
    """
    # Abstract hook: concrete page backends supply the implementation.
    return None
24
+
21
25
  @abstractmethod
22
26
  def get_page_image(
23
27
  self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
@@ -1,7 +1,9 @@
1
+ import logging
1
2
  import random
3
+ import time
2
4
  from io import BytesIO
3
5
  from pathlib import Path
4
- from typing import Iterable, List, Optional, Union
6
+ from typing import Iterable, Optional, Union
5
7
 
6
8
  import pypdfium2 as pdfium
7
9
  from docling_parse.docling_parse import pdf_parser
@@ -11,6 +13,8 @@ from pypdfium2 import PdfPage
11
13
  from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
12
14
  from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
15
 
16
+ _log = logging.getLogger(__name__)
17
+
14
18
 
15
19
  class DoclingParsePageBackend(PdfPageBackend):
16
20
  def __init__(self, page_obj: PdfPage, docling_page_obj):
@@ -39,7 +43,7 @@ class DoclingParsePageBackend(PdfPageBackend):
39
43
  r=x1 * scale * page_size.width / parser_width,
40
44
  t=y1 * scale * page_size.height / parser_height,
41
45
  coord_origin=CoordOrigin.BOTTOMLEFT,
42
- ).to_top_left_origin(page_size.height * scale)
46
+ ).to_top_left_origin(page_height=page_size.height * scale)
43
47
 
44
48
  overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
45
49
 
@@ -62,6 +66,12 @@ class DoclingParsePageBackend(PdfPageBackend):
62
66
  for i in range(len(self._dpage["cells"])):
63
67
  rect = self._dpage["cells"][i]["box"]["device"]
64
68
  x0, y0, x1, y1 = rect
69
+
70
+ if x1 < x0:
71
+ x0, x1 = x1, x0
72
+ if y1 < y0:
73
+ y0, y1 = y1, y0
74
+
65
75
  text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
66
76
  cells.append(
67
77
  Cell(
@@ -80,7 +90,9 @@ class DoclingParsePageBackend(PdfPageBackend):
80
90
  cell_counter += 1
81
91
 
82
92
  def draw_clusters_and_cells():
83
- image = self.get_page_image()
93
+ image = (
94
+ self.get_page_image()
95
+ ) # make new image to avoid drawing on the saved ones
84
96
  draw = ImageDraw.Draw(image)
85
97
  for c in cells:
86
98
  x0, y0, x1, y1 = c.bbox.as_tuple()
@@ -102,6 +114,20 @@ class DoclingParsePageBackend(PdfPageBackend):
102
114
 
103
115
  return cells
104
116
 
117
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
    """Yield bounding boxes of the bitmap images docling-parse found on this page.

    Each entry of ``self._dpage["images"]`` carries a ``"box"`` tuple given
    with a bottom-left origin. The box is converted to a top-left origin,
    filtered by area, and scaled.

    Args:
        scale: Factor applied to every yielded box via ``BoundingBox.scaled``.

    Yields:
        Top-left-origin boxes whose unscaled area is larger than 32 x 32.
    """
    AREA_THRESHOLD = 32 * 32

    # The page height does not change while iterating; look it up once.
    page_height = self.get_size().height

    for bitmap in self._dpage["images"]:
        cropbox = BoundingBox.from_tuple(
            bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
        ).to_top_left_origin(page_height=page_height)

        # The threshold is checked before scaling, i.e. in page units.
        if cropbox.area() > AREA_THRESHOLD:
            yield cropbox.scaled(scale=scale)
130
+
105
131
  def get_page_image(
106
132
  self, scale: int = 1, cropbox: Optional[BoundingBox] = None
107
133
  ) -> Image.Image:
@@ -151,15 +177,23 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
151
177
  self._pdoc = pdfium.PdfDocument(path_or_stream)
152
178
  # Parsing cells with docling_parser call
153
179
  parser = pdf_parser()
180
+
181
+ start_pb_time = time.time()
182
+
154
183
  if isinstance(path_or_stream, BytesIO):
155
184
  self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
156
185
  else:
157
186
  self._parser_doc = parser.find_cells(str(path_or_stream))
158
187
 
188
+ end_pb_time = time.time() - start_pb_time
189
+ _log.info(
190
+ f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
191
+ )
192
+
159
193
  def page_count(self) -> int:
160
194
  return len(self._parser_doc["pages"])
161
195
 
162
def load_page(self, page_no: int) -> DoclingParsePageBackend:
    """Create the page backend for the page at index ``page_no``.

    The backend is given both the pypdfium2 page object and the matching
    docling-parse page entry.
    """
    pdfium_page = self._pdoc[page_no]
    parsed_page = self._parser_doc["pages"][page_no]
    return DoclingParsePageBackend(pdfium_page, parsed_page)
@@ -4,6 +4,7 @@ from pathlib import Path
4
4
  from typing import Iterable, List, Optional, Union
5
5
 
6
6
  import pypdfium2 as pdfium
7
+ import pypdfium2.raw as pdfium_c
7
8
  from PIL import Image, ImageDraw
8
9
  from pypdfium2 import PdfPage
9
10
 
@@ -17,6 +18,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
17
18
  self._ppage = page_obj
18
19
  self.text_page = None
19
20
 
21
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
    """Yield bounding boxes of the image objects on this pypdfium2 page.

    Every page object of type ``FPDF_PAGEOBJ_IMAGE`` is queried for its
    position, which is interpreted with a bottom-left origin, converted to
    a top-left origin, filtered by area, and scaled.

    Args:
        scale: Factor applied to every yielded box via ``BoundingBox.scaled``.

    Yields:
        Top-left-origin boxes whose unscaled area is larger than 32 x 32.
    """
    AREA_THRESHOLD = 32 * 32

    # The page height does not change while iterating; look it up once.
    page_height = self.get_size().height

    for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
        cropbox = BoundingBox.from_tuple(
            obj.get_pos(), origin=CoordOrigin.BOTTOMLEFT
        ).to_top_left_origin(page_height=page_height)

        # The threshold is checked before scaling, i.e. in page units.
        if cropbox.area() > AREA_THRESHOLD:
            yield cropbox.scaled(scale=scale)
33
+
20
34
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
21
35
  if not self.text_page:
22
36
  self.text_page = self._ppage.get_textpage()
@@ -134,7 +148,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
134
148
  return merged_cells
135
149
 
136
150
  def draw_clusters_and_cells():
137
- image = self.get_page_image()
151
+ image = (
152
+ self.get_page_image()
153
+ ) # make new image to avoid drawing on the saved ones
138
154
  draw = ImageDraw.Draw(image)
139
155
  for c in cells:
140
156
  x0, y0, x1, y1 = c.bbox.as_tuple()
@@ -206,7 +222,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
206
222
  def page_count(self) -> int:
207
223
  return len(self._pdoc)
208
224
 
209
def load_page(self, page_no: int) -> PyPdfiumPageBackend:
    """Create the page backend wrapping the pypdfium2 page at ``page_no``."""
    pdfium_page = self._pdoc[page_no]
    return PyPdfiumPageBackend(pdfium_page)
211
227
 
212
228
  def is_valid(self) -> bool:
@@ -1,10 +1,12 @@
1
1
  import copy
2
+ import warnings
2
3
  from enum import Enum, auto
3
4
  from io import BytesIO
4
- from typing import Any, Dict, List, Optional, Tuple, Union
5
+ from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
5
6
 
6
7
  from PIL.Image import Image
7
- from pydantic import BaseModel, ConfigDict, model_validator
8
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
9
+ from typing_extensions import Self
8
10
 
9
11
  from docling.backend.abstract_backend import PdfPageBackend
10
12
 
@@ -66,13 +68,21 @@ class BoundingBox(BaseModel):
66
68
  @classmethod
67
69
  def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
68
70
  if origin == CoordOrigin.TOPLEFT:
69
- return BoundingBox(
70
- l=coord[0], t=coord[1], r=coord[2], b=coord[3], coord_origin=origin
71
- )
71
+ l, t, r, b = coord[0], coord[1], coord[2], coord[3]
72
+ if r < l:
73
+ l, r = r, l
74
+ if b < t:
75
+ b, t = t, b
76
+
77
+ return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
72
78
  elif origin == CoordOrigin.BOTTOMLEFT:
73
- return BoundingBox(
74
- l=coord[0], b=coord[1], r=coord[2], t=coord[3], coord_origin=origin
75
- )
79
+ l, b, r, t = coord[0], coord[1], coord[2], coord[3]
80
+ if r < l:
81
+ l, r = r, l
82
+ if b > t:
83
+ b, t = t, b
84
+
85
+ return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
76
86
 
77
87
  def area(self) -> float:
78
88
  return (self.r - self.l) * (self.b - self.t)
@@ -234,14 +244,30 @@ class Page(BaseModel):
234
244
  model_config = ConfigDict(arbitrary_types_allowed=True)
235
245
 
236
246
  page_no: int
237
- page_hash: str = None
238
- size: PageSize = None
239
- image: Image = None
247
+ page_hash: Optional[str] = None
248
+ size: Optional[PageSize] = None
240
249
  cells: List[Cell] = None
241
250
  predictions: PagePredictions = PagePredictions()
242
- assembled: AssembledUnit = None
251
+ assembled: Optional[AssembledUnit] = None
243
252
 
244
- _backend: PdfPageBackend = None # Internal PDF backend
253
+ _backend: Optional[PdfPageBackend] = (
254
+ None # Internal PDF backend. By default it is cleared during assembling.
255
+ )
256
+ _default_image_scale: float = 1.0 # Default image scale for external usage.
257
+ _image_cache: Dict[float, Image] = (
258
+ {}
259
+ ) # Cache of images in different scales. By default it is cleared during assembling.
260
+
261
def get_image(self, scale: float = 1.0) -> Optional[Image]:
    """Return the page image at ``scale``, rendering it at most once per scale.

    Args:
        scale: Scale factor forwarded to the backend's ``get_page_image``.

    Returns:
        The cached or freshly rendered image. When the backend is no longer
        attached, only a previously cached image can be returned, otherwise
        ``None``.
    """
    if self._backend is None:
        # Nothing can be rendered any more; fall back to what was cached.
        return self._image_cache.get(scale)
    if scale not in self._image_cache:
        self._image_cache[scale] = self._backend.get_page_image(scale=scale)
    return self._image_cache[scale]
267
+
268
@property
def image(self) -> Optional[Image]:
    """Page image at the page's default scale (``_default_image_scale``)."""
    default_scale = self._default_image_scale
    return self.get_image(scale=default_scale)
245
271
 
246
272
 
247
273
  class DocumentStream(BaseModel):
@@ -262,12 +288,25 @@ class TableStructureOptions(BaseModel):
262
288
 
263
289
  class PipelineOptions(BaseModel):
264
290
  do_table_structure: bool = True # True: perform table structure extraction
265
- do_ocr: bool = False # True: perform OCR, replace programmatic PDF text
291
+ do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
266
292
 
267
293
  table_structure_options: TableStructureOptions = TableStructureOptions()
268
294
 
269
295
 
270
296
  class AssembleOptions(BaseModel):
271
- keep_page_images: bool = (
272
- False # False: page images are removed in the assemble step
273
- )
297
+ keep_page_images: Annotated[
298
+ bool,
299
+ Field(
300
+ deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
301
+ ),
302
+ ] = False # False: page images are removed in the assemble step
303
+ images_scale: Optional[float] = None # if set, the scale for generated images
304
+
305
+ @model_validator(mode="after")
306
+ def set_page_images_from_deprecated(self) -> Self:
307
+ with warnings.catch_warnings():
308
+ warnings.simplefilter("ignore", DeprecationWarning)
309
+ default_scale = 1.0
310
+ if self.keep_page_images and self.images_scale is None:
311
+ self.images_scale = default_scale
312
+ return self
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path, PurePath
4
- from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
4
+ from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
5
5
 
6
6
  from docling_core.types import BaseCell, BaseText
7
7
  from docling_core.types import BoundingBox as DsBoundingBox
@@ -21,6 +21,7 @@ from docling.datamodel.base_models import (
21
21
  DocumentStream,
22
22
  FigureElement,
23
23
  Page,
24
+ PageElement,
24
25
  TableElement,
25
26
  TextElement,
26
27
  )
@@ -302,6 +303,20 @@ class ConvertedDocument(BaseModel):
302
303
  else:
303
304
  return ""
304
305
 
306
def render_element_images(
    self, element_types: Tuple[Type[PageElement], ...] = (FigureElement,)
):
    """Yield ``(element, image)`` pairs for assembled elements of the given types.

    For each matching element, its cluster bounding box is scaled to the
    page's default image scale, converted to a top-left origin, and used
    to crop the page image.

    Args:
        element_types: Element classes to render; passed to ``isinstance``.

    Yields:
        Tuples of the element and the cropped image region.

    Note:
        The page image must still be available; ``.crop`` is called on it
        without a ``None`` check.
    """
    for element in self.assembled.elements:
        if not isinstance(element, element_types):
            continue

        # Look the page up once instead of re-indexing for every attribute.
        page = self.pages[element.page_no]
        scale = page._default_image_scale
        crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
            page_height=page.size.height * scale
        )

        cropped_im = page.image.crop(crop_bbox.as_tuple())
        yield element, cropped_im
319
+
305
320
 
306
321
  class DocumentConversionInput(BaseModel):
307
322
 
@@ -35,8 +35,6 @@ _log = logging.getLogger(__name__)
35
35
 
36
36
 
37
37
  class DocumentConverter:
38
- _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
39
- _table_model_path = "model_artifacts/tableformer"
40
38
  _default_download_filename = "file.pdf"
41
39
 
42
40
  def __init__(
@@ -188,10 +186,8 @@ class DocumentConverter:
188
186
  # Free up mem resources before moving on with next batch
189
187
 
190
188
  # Remove page images (can be disabled)
191
- if not self.assemble_options.keep_page_images:
192
- assembled_page.image = (
193
- None # Comment this if you want to visualize page images
194
- )
189
+ if self.assemble_options.images_scale is None:
190
+ assembled_page._image_cache = {}
195
191
 
196
192
  # Unload backend
197
193
  assembled_page._backend.unload()
@@ -231,7 +227,15 @@ class DocumentConverter:
231
227
 
232
228
  # Generate the page image and store it in the page object
233
229
  def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
234
- page.image = page._backend.get_page_image()
230
+ # default scale
231
+ page.get_image(scale=1.0)
232
+
233
+ # user requested scales
234
+ if self.assemble_options.images_scale is not None:
235
+ page._default_image_scale = self.assemble_options.images_scale
236
+ page.get_image(
237
+ scale=self.assemble_options.images_scale
238
+ ) # this will trigger storing the image in the internal cache
235
239
 
236
240
  return page
237
241
 
@@ -247,7 +251,7 @@ class DocumentConverter:
247
251
  draw.rectangle([(x0, y0), (x1, y1)], outline="red")
248
252
  image.show()
249
253
 
250
- # draw_text_boxes(page.image, cells)
254
+ # draw_text_boxes(page.get_image(scale=1.0), cells)
251
255
 
252
256
  return page
253
257
 
@@ -0,0 +1,124 @@
1
+ import copy
2
+ import logging
3
+ from abc import abstractmethod
4
+ from typing import Iterable, List, Tuple
5
+
6
+ import numpy
7
+ import numpy as np
8
+ from PIL import Image, ImageDraw
9
+ from rtree import index
10
+ from scipy.ndimage import find_objects, label
11
+
12
+ from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
13
+
14
+ _log = logging.getLogger(__name__)
15
+
16
+
17
class BaseOcrModel:
    """Shared helpers for OCR models.

    Provides selection of the page regions worth running OCR on, filtering
    of OCR cells against programmatic cells, and a debug visualization.
    Subclasses implement ``__call__``. The class does not derive from
    ``abc.ABC``, so the ``@abstractmethod`` marker on ``__call__`` is not
    enforced at instantiation time.
    """

    def __init__(self, config):
        """Keep the configuration and read its ``"enabled"`` entry.

        Args:
            config: Subscriptable options object; ``config["enabled"]`` is
                read immediately.
        """
        self.config = config
        self.enabled = config["enabled"]

    def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
        """Compute the rectangles of ``page`` that should be sent to OCR.

        The bitmap rectangles reported by the page backend are rasterized
        into a binary mask. If the mask covers more than 75% of the page, one
        full-page rectangle is returned; otherwise the enclosing boxes of the
        connected bitmap regions are returned.

        Args:
            page: Page with an attached backend and a known ``size``.

        Returns:
            Top-left-origin boxes in page coordinates (possibly empty).
        """
        BITMAP_COVERAGE_THRESHOLD = 0.75

        def find_ocr_rects(size, bitmap_rects):
            # '1' mode is binary: one bit per pixel, initially all zero.
            image = Image.new("1", (round(size.width), round(size.height)))

            # Draw all bitmap rects into the binary image.
            draw = ImageDraw.Draw(image)
            for rect in bitmap_rects:
                x0, y0, x1, y1 = (round(v) for v in rect.as_tuple())
                draw.rectangle([(x0, y0), (x1, y1)], fill=1)

            np_image = np.array(image)
            covered = np_image > 0

            # Label connected groups of covered (non-zero) pixels.
            labeled_image, _num_features = label(covered)

            # Enclosing box of each connected component. Slice stops are
            # exclusive, hence the "- 1" for the right/bottom edges.
            bounding_boxes = [
                BoundingBox(
                    l=slc[1].start,
                    t=slc[0].start,
                    r=slc[1].stop - 1,
                    b=slc[0].stop - 1,
                    coord_origin=CoordOrigin.TOPLEFT,
                )
                for slc in find_objects(labeled_image)
            ]

            # Fraction of the page area covered by bitmaps.
            area_frac = np.sum(covered) / (size.width * size.height)

            return area_frac, bounding_boxes

        bitmap_rects = page._backend.get_bitmap_rects()
        coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)

        # Full-page rectangle if the page is sufficiently covered with bitmaps.
        if coverage > BITMAP_COVERAGE_THRESHOLD:
            return [
                BoundingBox(
                    l=0,
                    t=0,
                    r=page.size.width,
                    b=page.size.height,
                    coord_origin=CoordOrigin.TOPLEFT,
                )
            ]

        # Otherwise (including coverage exactly at the threshold) return the
        # individual rectangles, so callers always receive a list.
        return ocr_rects

    def filter_ocr_cells(self, ocr_cells, programmatic_cells):
        """Drop every OCR cell that intersects an existing programmatic cell.

        Args:
            ocr_cells: Cells produced by OCR; each exposes ``bbox.as_tuple()``.
            programmatic_cells: Cells extracted from the document itself.

        Returns:
            The OCR cells whose box intersects no programmatic cell box.
        """
        # Build a 2-D R-tree index over the programmatic cell boxes.
        props = index.Property()
        props.dimension = 2
        idx = index.Index(properties=props)
        for i, cell in enumerate(programmatic_cells):
            idx.insert(i, cell.bbox.as_tuple())

        def is_overlapping_with_existing_cells(ocr_cell):
            # Any hit in the R-tree counts; this is a weak criterion but it works.
            matches = list(idx.intersection(ocr_cell.bbox.as_tuple()))
            return len(matches) > 0

        return [
            cell for cell in ocr_cells if not is_overlapping_with_existing_cells(cell)
        ]

    def draw_ocr_rects_and_cells(self, page, ocr_rects):
        """Debug helper: show the page with OCR regions and cells drawn on it.

        OCR rectangles are shaded in transparent yellow; cells are outlined
        in magenta when they are ``OcrCell`` instances and in red otherwise.
        """
        # Work on a copy so the cached page image is not drawn on.
        image = copy.deepcopy(page.image)
        draw = ImageDraw.Draw(image, "RGBA")

        # Draw OCR rectangles as yellow filled rects.
        shade_color = (255, 255, 0, 40)  # transparent yellow
        for rect in ocr_rects:
            x0, y0, x1, y1 = rect.as_tuple()
            draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)

        # Draw OCR and programmatic cells.
        for tc in page.cells:
            x0, y0, x1, y1 = tc.bbox.as_tuple()
            color = "magenta" if isinstance(tc, OcrCell) else "red"
            draw.rectangle([(x0, y0), (x1, y1)], outline=color)
        image.show()

    @abstractmethod
    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Run OCR over a batch of pages; implemented by subclasses."""
        pass
@@ -1,20 +1,18 @@
1
- import copy
2
1
  import logging
3
- import random
4
2
  from typing import Iterable
5
3
 
6
4
  import numpy
7
- from PIL import ImageDraw
8
5
 
9
6
  from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
7
+ from docling.models.base_ocr_model import BaseOcrModel
10
8
 
11
9
  _log = logging.getLogger(__name__)
12
10
 
13
11
 
14
- class EasyOcrModel:
12
+ class EasyOcrModel(BaseOcrModel):
15
13
  def __init__(self, config):
16
- self.config = config
17
- self.enabled = config["enabled"]
14
+ super().__init__(config)
15
+
18
16
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
19
17
 
20
18
  if self.enabled:
@@ -29,49 +27,44 @@ class EasyOcrModel:
29
27
  return
30
28
 
31
29
  for page in page_batch:
32
- # rects = page._fpage.
33
- high_res_image = page._backend.get_page_image(scale=self.scale)
34
- im = numpy.array(high_res_image)
35
- result = self.reader.readtext(im)
36
-
37
- del high_res_image
38
- del im
39
-
40
- cells = [
41
- OcrCell(
42
- id=ix,
43
- text=line[1],
44
- confidence=line[2],
45
- bbox=BoundingBox.from_tuple(
46
- coord=(
47
- line[0][0][0] / self.scale,
48
- line[0][0][1] / self.scale,
49
- line[0][2][0] / self.scale,
50
- line[0][2][1] / self.scale,
51
- ),
52
- origin=CoordOrigin.TOPLEFT,
53
- ),
30
+ ocr_rects = self.get_ocr_rects(page)
31
+
32
+ all_ocr_cells = []
33
+ for ocr_rect in ocr_rects:
34
+ high_res_image = page._backend.get_page_image(
35
+ scale=self.scale, cropbox=ocr_rect
54
36
  )
55
- for ix, line in enumerate(result)
56
- ]
37
+ im = numpy.array(high_res_image)
38
+ result = self.reader.readtext(im)
39
+
40
+ del high_res_image
41
+ del im
42
+
43
+ cells = [
44
+ OcrCell(
45
+ id=ix,
46
+ text=line[1],
47
+ confidence=line[2],
48
+ bbox=BoundingBox.from_tuple(
49
+ coord=(
50
+ (line[0][0][0] / self.scale) + ocr_rect.l,
51
+ (line[0][0][1] / self.scale) + ocr_rect.t,
52
+ (line[0][2][0] / self.scale) + ocr_rect.l,
53
+ (line[0][2][1] / self.scale) + ocr_rect.t,
54
+ ),
55
+ origin=CoordOrigin.TOPLEFT,
56
+ ),
57
+ )
58
+ for ix, line in enumerate(result)
59
+ ]
60
+ all_ocr_cells.extend(cells)
57
61
 
58
- page.cells = cells # For now, just overwrites all digital cells.
62
+ ## Remove OCR cells which overlap with programmatic cells.
63
+ filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
59
64
 
60
- # DEBUG code:
61
- def draw_clusters_and_cells():
62
- image = copy.deepcopy(page.image)
63
- draw = ImageDraw.Draw(image)
64
-
65
- cell_color = (
66
- random.randint(30, 140),
67
- random.randint(30, 140),
68
- random.randint(30, 140),
69
- )
70
- for tc in cells:
71
- x0, y0, x1, y1 = tc.bbox.as_tuple()
72
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
73
- image.show()
65
+ page.cells.extend(filtered_ocr_cells)
74
66
 
75
- # draw_clusters_and_cells()
67
+ # DEBUG code:
68
+ # self.draw_ocr_rects_and_cells(page, ocr_rects)
76
69
 
77
70
  yield page
@@ -267,7 +267,9 @@ class LayoutModel:
267
267
  def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
268
268
  for page in page_batch:
269
269
  clusters = []
270
- for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)):
270
+ for ix, pred_item in enumerate(
271
+ self.layout_predictor.predict(page.get_image(scale=1.0))
272
+ ):
271
273
  cluster = Cluster(
272
274
  id=ix,
273
275
  label=pred_item["label"],
@@ -1,5 +1,4 @@
1
1
  import copy
2
- import random
3
2
  from typing import Iterable, List
4
3
 
5
4
  import numpy
@@ -34,7 +33,9 @@ class TableStructureModel:
34
33
  self.scale = 2.0 # Scale up table input images to 144 dpi
35
34
 
36
35
  def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
37
- image = page._backend.get_page_image()
36
+ image = (
37
+ page._backend.get_page_image()
38
+ ) # make new image to avoid drawing on the saved ones
38
39
  draw = ImageDraw.Draw(image)
39
40
 
40
41
  for table_element in tbl_list:
@@ -94,13 +95,7 @@ class TableStructureModel:
94
95
  "width": page.size.width * self.scale,
95
96
  "height": page.size.height * self.scale,
96
97
  }
97
- # add image to page input.
98
- if self.scale == 1.0:
99
- page_input["image"] = numpy.asarray(page.image)
100
- else: # render new page image on the fly at desired scale
101
- page_input["image"] = numpy.asarray(
102
- page._backend.get_page_image(scale=self.scale)
103
- )
98
+ page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
104
99
 
105
100
  table_clusters, table_bboxes = zip(*in_tables)
106
101
 
@@ -1,4 +1,3 @@
1
- from abc import abstractmethod
2
1
  from pathlib import Path
3
2
  from typing import Iterable
4
3
 
@@ -1,10 +1,8 @@
1
1
  from pathlib import Path
2
- from typing import Iterable
3
2
 
4
- from docling.datamodel.base_models import Page, PipelineOptions
3
+ from docling.datamodel.base_models import PipelineOptions
5
4
  from docling.models.easyocr_model import EasyOcrModel
6
5
  from docling.models.layout_model import LayoutModel
7
- from docling.models.page_assemble_model import PageAssembleModel
8
6
  from docling.models.table_structure_model import TableStructureModel
9
7
  from docling.pipeline.base_model_pipeline import BaseModelPipeline
10
8
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.4.0
3
+ Version: 1.6.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -19,20 +19,20 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3.11
20
20
  Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
- Provides-Extra: easyocr
23
22
  Provides-Extra: ocr
24
23
  Requires-Dist: certifi (>=2024.7.4)
25
24
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
26
25
  Requires-Dist: docling-core (>=1.1.2,<2.0.0)
27
26
  Requires-Dist: docling-ibm-models (>=1.1.1,<2.0.0)
28
27
  Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
29
- Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
28
+ Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "ocr"
30
29
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
30
  Requires-Dist: huggingface_hub (>=0.23,<1)
32
31
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
33
32
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
34
33
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
35
34
  Requires-Dist: requests (>=2.32.3,<3.0.0)
35
+ Requires-Dist: rtree (>=1.3.0,<2.0.0)
36
36
  Project-URL: Repository, https://github.com/DS4SD/docling
37
37
  Description-Content-Type: text/markdown
38
38
 
@@ -44,6 +44,7 @@ Description-Content-Type: text/markdown
44
44
 
45
45
  # Docling
46
46
 
47
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
47
48
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
48
49
  ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
49
50
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
@@ -172,13 +173,15 @@ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main
172
173
  If you use Docling in your projects, please consider citing the following:
173
174
 
174
175
  ```bib
175
- @software{Docling,
176
- author = {Deep Search Team},
177
- month = {7},
178
- title = {{Docling}},
179
- url = {https://github.com/DS4SD/docling},
180
- version = {main},
181
- year = {2024}
176
+ @techreport{Docling,
177
+ author = {Deep Search Team},
178
+ month = {8},
179
+ title = {{Docling Technical Report}},
180
+ url={https://arxiv.org/abs/2408.09869},
181
+ eprint={2408.09869},
182
+ doi = "10.48550/arXiv.2408.09869",
183
+ version = {1.0.0},
184
+ year = {2024}
182
185
  }
183
186
  ```
184
187
 
@@ -0,0 +1,27 @@
1
+ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ docling/backend/abstract_backend.py,sha256=ZfEHaBPGM1cmqrhaEoU3MHhnHU11NhOnhtFEIbVMYDo,1221
4
+ docling/backend/docling_parse_backend.py,sha256=TN7Ln3Lkc8k0v6HzxA2iUGc8f2iqMw0I-3eryLQkpdw,6924
5
+ docling/backend/pypdfium2_backend.py,sha256=xUiIYgd7i22YDx4-W2hfPUaQFszW0gcT6pavG5qZ8LE,8062
6
+ docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ docling/datamodel/base_models.py,sha256=5VHit5h7OleKnbhvy-sWDxQLizEdNrGUBrypyzwHyAE,8604
8
+ docling/datamodel/document.py,sha256=Dgi9pSwXCgIoR26MKiRDiVMyMaFKdvGSKq2Fm5Lef9M,13173
9
+ docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
10
+ docling/document_converter.py,sha256=UFSELvUSWsr8s0VByu4lNuzu7bn7zZauJTL3FTSLSBg,10371
11
+ docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
13
+ docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
14
+ docling/models/easyocr_model.py,sha256=ABIqALvtNNrDQ47fXaZ0lDFhOwKsYGUUlAPnIsFZgZA,2232
15
+ docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
16
+ docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
17
+ docling/models/table_structure_model.py,sha256=5jzTlpM-GdCSq4l0vD1W6aSPTJXeTcXEnNuPxnw-DlA,5437
18
+ docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
+ docling/pipeline/base_model_pipeline.py,sha256=AC5NTR0xLy5JIZqsTINkKEHeCPqpyvJpuE_bcnZhyvI,529
20
+ docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjgvo_Fl2dfBVnRQs,1442
21
+ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
23
+ docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
24
+ docling-1.6.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
25
+ docling-1.6.0.dist-info/METADATA,sha256=iMNzQ5wFtqHCTYat46cOq9JK0nhYKr1N6_PuEuah5D4,7227
26
+ docling-1.6.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
27
+ docling-1.6.0.dist-info/RECORD,,
@@ -1,26 +0,0 @@
1
- docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7NtzKgo,1112
4
- docling/backend/docling_parse_backend.py,sha256=bgsmnwDmroBwuOwkEUzlN9KMEIFJ1xUaCZW6rsr5G-c,5924
5
- docling/backend/pypdfium2_backend.py,sha256=tv6JxyTkTdT2qr2ghsQgYA2zgpCDxKYSdHVBTAR7FSk,7411
6
- docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- docling/datamodel/base_models.py,sha256=irZLAHdsROOOwRbywKIA0mk3H8GrLwtGjOgTV6G0QoU,7004
8
- docling/datamodel/document.py,sha256=lZHXINmPWvpzrV3PTilgJs1blqTMCnJdLEww_qfcqdE,12533
9
- docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
10
- docling/document_converter.py,sha256=dMucsq6M_nwPsC1ChogVwJgNDv8sJuFklQWWinDZaug,10246
11
- docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
- docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
13
- docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
14
- docling/models/layout_model.py,sha256=3mOgNvCYPh99_oLxJy-ZaIqGOFgG5bcIQ0tTubW656Q,11204
15
- docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
16
- docling/models/table_structure_model.py,sha256=xUmfunZNYC30P0fRdESdztqy1FVlMzlhJjLBp-xcn4A,5638
17
- docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- docling/pipeline/base_model_pipeline.py,sha256=ozHdJak0yQAxQf7pQN_C480vI35A2e5KL5Qq1xSkq5c,560
19
- docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFvpdUvyVw-gNmBlM,1541
20
- docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
- docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
22
- docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
23
- docling-1.4.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
24
- docling-1.4.0.dist-info/METADATA,sha256=Hu8pvrxpc0b1qzQvvzI_ijRAQWjOfcfNl4_1Zb7oyoc,7042
25
- docling-1.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
26
- docling-1.4.0.dist-info/RECORD,,