docling 1.1.2__tar.gz → 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. {docling-1.1.2 → docling-1.5.0}/PKG-INFO +20 -11
  2. {docling-1.1.2 → docling-1.5.0}/README.md +16 -9
  3. {docling-1.1.2 → docling-1.5.0}/docling/backend/abstract_backend.py +1 -1
  4. docling-1.5.0/docling/backend/docling_parse_backend.py +187 -0
  5. {docling-1.1.2 → docling-1.5.0}/docling/backend/pypdfium2_backend.py +4 -2
  6. {docling-1.1.2 → docling-1.5.0}/docling/datamodel/base_models.py +44 -7
  7. {docling-1.1.2 → docling-1.5.0}/docling/datamodel/document.py +19 -4
  8. {docling-1.1.2 → docling-1.5.0}/docling/document_converter.py +21 -6
  9. {docling-1.1.2 → docling-1.5.0}/docling/models/easyocr_model.py +1 -1
  10. {docling-1.1.2 → docling-1.5.0}/docling/models/layout_model.py +11 -1
  11. {docling-1.1.2 → docling-1.5.0}/docling/models/table_structure_model.py +4 -8
  12. {docling-1.1.2 → docling-1.5.0}/pyproject.toml +4 -2
  13. {docling-1.1.2 → docling-1.5.0}/LICENSE +0 -0
  14. {docling-1.1.2 → docling-1.5.0}/docling/__init__.py +0 -0
  15. {docling-1.1.2 → docling-1.5.0}/docling/backend/__init__.py +0 -0
  16. {docling-1.1.2 → docling-1.5.0}/docling/datamodel/__init__.py +0 -0
  17. {docling-1.1.2 → docling-1.5.0}/docling/datamodel/settings.py +0 -0
  18. {docling-1.1.2 → docling-1.5.0}/docling/models/__init__.py +0 -0
  19. {docling-1.1.2 → docling-1.5.0}/docling/models/ds_glm_model.py +0 -0
  20. {docling-1.1.2 → docling-1.5.0}/docling/models/page_assemble_model.py +0 -0
  21. {docling-1.1.2 → docling-1.5.0}/docling/pipeline/__init__.py +0 -0
  22. {docling-1.1.2 → docling-1.5.0}/docling/pipeline/base_model_pipeline.py +0 -0
  23. {docling-1.1.2 → docling-1.5.0}/docling/pipeline/standard_model_pipeline.py +0 -0
  24. {docling-1.1.2 → docling-1.5.0}/docling/utils/__init__.py +0 -0
  25. {docling-1.1.2 → docling-1.5.0}/docling/utils/layout_utils.py +0 -0
  26. {docling-1.1.2 → docling-1.5.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.1.2
3
+ Version: 1.5.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -21,9 +21,11 @@ Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Provides-Extra: easyocr
23
23
  Provides-Extra: ocr
24
+ Requires-Dist: certifi (>=2024.7.4)
24
25
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
25
26
  Requires-Dist: docling-core (>=1.1.2,<2.0.0)
26
- Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
27
+ Requires-Dist: docling-ibm-models (>=1.1.1,<2.0.0)
28
+ Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
27
29
  Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
28
30
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
29
31
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -42,6 +44,7 @@ Description-Content-Type: text/markdown
42
44
 
43
45
  # Docling
44
46
 
47
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
45
48
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
46
49
  ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
47
50
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
@@ -92,17 +95,21 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
92
95
 
93
96
  ### Convert a batch of documents
94
97
 
95
- For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
98
+ For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
96
99
 
97
100
  From a local repo clone, you can run it with:
98
101
 
99
102
  ```
100
- python examples/convert.py
103
+ python examples/batch_convert.py
101
104
  ```
102
105
  The output of the above command will be written to `./scratch`.
103
106
 
104
107
  ### Adjust pipeline features
105
108
 
109
+ The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
110
+ one can adjust the conversion pipeline and features.
111
+
112
+
106
113
  #### Control pipeline options
107
114
 
108
115
  You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
@@ -166,13 +173,15 @@ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main
166
173
  If you use Docling in your projects, please consider citing the following:
167
174
 
168
175
  ```bib
169
- @software{Docling,
170
- author = {Deep Search Team},
171
- month = {7},
172
- title = {{Docling}},
173
- url = {https://github.com/DS4SD/docling},
174
- version = {main},
175
- year = {2024}
176
+ @techreport{Docling,
177
+ author = {Deep Search Team},
178
+ month = {8},
179
+ title = {{Docling Technical Report}},
180
+ url={https://arxiv.org/abs/2408.09869},
181
+ eprint={2408.09869},
182
+ doi = "10.48550/arXiv.2408.09869",
183
+ version = {1.0.0},
184
+ year = {2024}
176
185
  }
177
186
  ```
178
187
 
@@ -6,6 +6,7 @@
6
6
 
7
7
  # Docling
8
8
 
9
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
9
10
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
10
11
  ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
11
12
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
@@ -56,17 +57,21 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
56
57
 
57
58
  ### Convert a batch of documents
58
59
 
59
- For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
60
+ For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
60
61
 
61
62
  From a local repo clone, you can run it with:
62
63
 
63
64
  ```
64
- python examples/convert.py
65
+ python examples/batch_convert.py
65
66
  ```
66
67
  The output of the above command will be written to `./scratch`.
67
68
 
68
69
  ### Adjust pipeline features
69
70
 
71
+ The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
72
+ one can adjust the conversion pipeline and features.
73
+
74
+
70
75
  #### Control pipeline options
71
76
 
72
77
  You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
@@ -130,13 +135,15 @@ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main
130
135
  If you use Docling in your projects, please consider citing the following:
131
136
 
132
137
  ```bib
133
- @software{Docling,
134
- author = {Deep Search Team},
135
- month = {7},
136
- title = {{Docling}},
137
- url = {https://github.com/DS4SD/docling},
138
- version = {main},
139
- year = {2024}
138
+ @techreport{Docling,
139
+ author = {Deep Search Team},
140
+ month = {8},
141
+ title = {{Docling Technical Report}},
142
+ url={https://arxiv.org/abs/2408.09869},
143
+ eprint={2408.09869},
144
+ doi = "10.48550/arXiv.2408.09869",
145
+ version = {1.0.0},
146
+ year = {2024}
140
147
  }
141
148
  ```
142
149
 
@@ -35,7 +35,7 @@ class PdfPageBackend(ABC):
35
35
 
36
36
  class PdfDocumentBackend(ABC):
37
37
  @abstractmethod
38
- def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
38
+ def __init__(self, path_or_stream: Union[BytesIO, Path]):
39
39
  pass
40
40
 
41
41
  @abstractmethod
@@ -0,0 +1,187 @@
1
+ import logging
2
+ import random
3
+ import time
4
+ from io import BytesIO
5
+ from pathlib import Path
6
+ from typing import Iterable, List, Optional, Union
7
+
8
+ import pypdfium2 as pdfium
9
+ from docling_parse.docling_parse import pdf_parser
10
+ from PIL import Image, ImageDraw
11
+ from pypdfium2 import PdfPage
12
+
13
+ from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
14
+ from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
15
+
16
+ _log = logging.getLogger(__name__)
17
+
18
+
19
class DoclingParsePageBackend(PdfPageBackend):
    """Page backend that renders with pypdfium2 and reads text from docling-parse.

    ``page_obj`` is the pypdfium2 page used for sizing and rendering;
    ``docling_page_obj`` is the per-page mapping produced by docling-parse.
    This class reads its ``"width"``, ``"height"`` and ``"cells"`` entries,
    and for every cell the ``["box"]["device"]`` rectangle and the
    ``["content"]["rnormalized"]`` text.
    """

    def __init__(self, page_obj: PdfPage, docling_page_obj):
        super().__init__(page_obj)
        self._ppage = page_obj
        self._dpage = docling_page_obj
        self.text_page = None

    def _cell_bbox(self, rect, page_size: PageSize) -> BoundingBox:
        """Map a parser rectangle ``(x0, y0, x1, y1)`` to a top-left-origin box.

        The parser reports coordinates relative to its own page width/height,
        so they are rescaled to the pypdfium2 page size before the origin is
        flipped from bottom-left to top-left.
        """
        x0, y0, x1, y1 = rect
        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]
        return BoundingBox(
            l=x0 * page_size.width / parser_width,
            b=y0 * page_size.height / parser_height,
            r=x1 * page_size.width / parser_width,
            t=y1 * page_size.height / parser_height,
            coord_origin=CoordOrigin.BOTTOMLEFT,
        ).to_top_left_origin(page_size.height)

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the space-joined text of all cells mostly covered by ``bbox``.

        A cell contributes when more than half of its own area intersects
        ``bbox``. Cells are visited in parser order.
        """
        # FIX - add a scale param to get_text_in_rect across backends (optional)
        page_size = self.get_size()
        text_piece = ""

        for cell in self._dpage["cells"]:
            cell_bbox = self._cell_bbox(cell["box"]["device"], page_size)

            # Degenerate (zero-area) cells cannot be "mostly covered" and would
            # otherwise trigger a ZeroDivisionError in the ratio below.
            cell_area = cell_bbox.area()
            if cell_area <= 0:
                continue

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_area
            if overlap_frac > 0.5:
                if text_piece:
                    text_piece += " "
                text_piece += cell["content"]["rnormalized"]

        return text_piece

    def get_text_cells(self) -> Iterable[Cell]:
        """Return one ``Cell`` per parser cell, with ids numbered from 0."""
        page_size = self.get_size()
        return [
            Cell(
                id=ix,
                text=cell["content"]["rnormalized"],
                bbox=self._cell_bbox(cell["box"]["device"], page_size),
            )
            for ix, cell in enumerate(self._dpage["cells"])
        ]

    def _debug_show_cells(self, cells: Iterable[Cell]) -> None:
        """Debug aid: draw each cell's box in a random colour and show the page.

        Not called by the pipeline; invoke manually when inspecting cells.
        """
        # Render a fresh image so nothing is drawn on cached page images.
        image = self.get_page_image()
        draw = ImageDraw.Draw(image)
        for c in cells:
            x0, y0, x1, y1 = c.bbox.as_tuple()
            cell_color = (
                random.randint(30, 140),
                random.randint(30, 140),
                random.randint(30, 140),
            )
            draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
        image.show()

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page (or ``cropbox``, given in top-left coordinates).

        The result is ``cropbox.width * scale`` by ``cropbox.height * scale``
        pixels (rounded).
        """
        page_size = self.get_size()

        if not cropbox:
            # No crop requested: cover the full page and pad nothing.
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # The renderer's ``crop`` takes margins to cut from each edge, so
            # the right/top coordinates are converted to distances from the
            # right/top page edges.
            padbox = cropbox.to_bottom_left_origin(page_size.height)
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        # Render at 1.5x the requested scale and downsample for a sharper image.
        rendered = self._ppage.render(
            scale=scale * 1.5,
            rotation=0,  # no additional rotation
            crop=padbox.as_tuple(),
        ).to_pil()
        target_size = (round(cropbox.width * scale), round(cropbox.height * scale))
        return rendered.resize(size=target_size)

    def get_size(self) -> PageSize:
        """Return the page size as reported by pypdfium2."""
        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop references to the underlying page objects."""
        self._ppage = None
        self._dpage = None
        self.text_page = None
152
+
153
+
154
class DoclingParseDocumentBackend(PdfDocumentBackend):
    """Document backend that pairs a pypdfium2 document with docling-parse output.

    The PDF is opened with pypdfium2 and, in the constructor, parsed once with
    docling-parse; pages are then served from both results.
    """

    def __init__(self, path_or_stream: Union[BytesIO, Path]):
        super().__init__(path_or_stream)
        self._pdoc = pdfium.PdfDocument(path_or_stream)

        # Parse the text cells of the whole document with docling-parse.
        parser = pdf_parser()
        start_pb_time = time.time()

        if isinstance(path_or_stream, BytesIO):
            self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
        else:
            self._parser_doc = parser.find_cells(str(path_or_stream))

        elapsed = time.time() - start_pb_time
        # A plain BytesIO has no ``name`` attribute; fall back to a placeholder
        # so logging never breaks the conversion of in-memory documents.
        source_name = getattr(path_or_stream, "name", "<stream>")
        _log.info(
            "Time to parse %s with docling-parse: time=%.3f", source_name, elapsed
        )

    def page_count(self) -> int:
        """Return the number of pages reported by docling-parse."""
        return len(self._parser_doc["pages"])

    def load_page(self, page_no: int) -> "DoclingParsePageBackend":
        """Return a page backend for the page at index ``page_no``.

        The same index is used for the pypdfium2 document and for the parser's
        ``"pages"`` list.
        """
        return DoclingParsePageBackend(
            self._pdoc[page_no], self._parser_doc["pages"][page_no]
        )

    def is_valid(self) -> bool:
        """Return True when the parser found at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Close the pypdfium2 document and drop parsed data; safe to repeat."""
        if self._pdoc is not None:
            self._pdoc.close()
            self._pdoc = None
        self._parser_doc = None
@@ -134,7 +134,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
134
134
  return merged_cells
135
135
 
136
136
  def draw_clusters_and_cells():
137
- image = self.get_page_image()
137
+ image = (
138
+ self.get_page_image()
139
+ ) # make new image to avoid drawing on the saved ones
138
140
  draw = ImageDraw.Draw(image)
139
141
  for c in cells:
140
142
  x0, y0, x1, y1 = c.bbox.as_tuple()
@@ -199,7 +201,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
199
201
 
200
202
 
201
203
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
202
- def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
204
+ def __init__(self, path_or_stream: Union[BytesIO, Path]):
203
205
  super().__init__(path_or_stream)
204
206
  self._pdoc = pdfium.PdfDocument(path_or_stream)
205
207
 
@@ -1,10 +1,12 @@
1
1
  import copy
2
+ import warnings
2
3
  from enum import Enum, auto
3
4
  from io import BytesIO
4
- from typing import Any, Dict, List, Optional, Tuple, Union
5
+ from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
5
6
 
6
7
  from PIL.Image import Image
7
- from pydantic import BaseModel, ConfigDict, model_validator
8
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
9
+ from typing_extensions import Self
8
10
 
9
11
  from docling.backend.abstract_backend import PdfPageBackend
10
12
 
@@ -234,14 +236,30 @@ class Page(BaseModel):
234
236
  model_config = ConfigDict(arbitrary_types_allowed=True)
235
237
 
236
238
  page_no: int
237
- page_hash: str = None
238
- size: PageSize = None
239
- image: Image = None
239
+ page_hash: Optional[str] = None
240
+ size: Optional[PageSize] = None
240
241
  cells: List[Cell] = None
241
242
  predictions: PagePredictions = PagePredictions()
242
- assembled: AssembledUnit = None
243
+ assembled: Optional[AssembledUnit] = None
243
244
 
244
- _backend: PdfPageBackend = None # Internal PDF backend
245
+ _backend: Optional[PdfPageBackend] = (
246
+ None # Internal PDF backend. By default it is cleared during assembling.
247
+ )
248
+ _default_image_scale: float = 1.0 # Default image scale for external usage.
249
+ _image_cache: Dict[float, Image] = (
250
+ {}
251
+ ) # Cache of images in different scales. By default it is cleared during assembling.
252
+
253
def get_image(self, scale: float = 1.0) -> Optional[Image]:
    """Return the page image at ``scale``, rendering and caching it if needed.

    While a backend is attached, a missing scale is rendered through the
    backend and stored in the cache. Without a backend only previously
    cached images can be returned, otherwise ``None``.
    """
    cache = self._image_cache
    backend = self._backend
    if backend is not None and scale not in cache:
        cache[scale] = backend.get_page_image(scale=scale)
    return cache.get(scale)
259
+
260
@property
def image(self) -> Optional[Image]:
    """Page image at the page's default scale (see ``_default_image_scale``)."""
    default_scale = self._default_image_scale
    return self.get_image(scale=default_scale)
245
263
 
246
264
 
247
265
  class DocumentStream(BaseModel):
@@ -265,3 +283,22 @@ class PipelineOptions(BaseModel):
265
283
  do_ocr: bool = False # True: perform OCR, replace programmatic PDF text
266
284
 
267
285
  table_structure_options: TableStructureOptions = TableStructureOptions()
286
+
287
+
288
class AssembleOptions(BaseModel):
    """Options for what is kept on the pages after the assemble step."""

    keep_page_images: Annotated[
        bool,
        Field(
            deprecated="`keep_page_images` is deprecated, set the value of `images_scale` instead"
        ),
    ] = False  # False: page images are removed in the assemble step
    images_scale: Optional[float] = None  # if set, the scale for generated images

    @model_validator(mode="after")
    def set_page_images_from_deprecated(self) -> Self:
        """Translate the deprecated ``keep_page_images`` flag into ``images_scale``.

        When the flag is set and no explicit scale was given, the scale
        defaults to 1.0. An explicit ``images_scale`` is never overridden.
        """
        # The deprecated field is read here on purpose; DeprecationWarnings
        # are silenced for the duration of this internal access.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            default_scale = 1.0
            if self.keep_page_images and self.images_scale is None:
                self.images_scale = default_scale
        return self
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path, PurePath
4
- from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
4
+ from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
5
5
 
6
6
  from docling_core.types import BaseCell, BaseText
7
7
  from docling_core.types import BoundingBox as DsBoundingBox
@@ -14,13 +14,14 @@ from docling_core.types import TableCell
14
14
  from pydantic import BaseModel
15
15
 
16
16
  from docling.backend.abstract_backend import PdfDocumentBackend
17
- from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
17
+ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
18
18
  from docling.datamodel.base_models import (
19
19
  AssembledUnit,
20
20
  ConversionStatus,
21
21
  DocumentStream,
22
22
  FigureElement,
23
23
  Page,
24
+ PageElement,
24
25
  TableElement,
25
26
  TextElement,
26
27
  )
@@ -64,7 +65,7 @@ class InputDocument(BaseModel):
64
65
  path_or_stream: Union[BytesIO, Path],
65
66
  filename: Optional[str] = None,
66
67
  limits: Optional[DocumentLimits] = None,
67
- pdf_backend=PyPdfiumDocumentBackend,
68
+ pdf_backend=DoclingParseDocumentBackend,
68
69
  ):
69
70
  super().__init__()
70
71
 
@@ -302,13 +303,27 @@ class ConvertedDocument(BaseModel):
302
303
  else:
303
304
  return ""
304
305
 
306
def render_element_images(
    self, element_types: Tuple[Type[PageElement], ...] = (FigureElement,)
):
    """Yield ``(element, image)`` pairs for assembled elements of given types.

    Each image is cropped from the page image at the page's default image
    scale, using the element's cluster bounding box converted to top-left
    origin.

    Args:
        element_types: Tuple of element classes to render; it is passed
            directly to ``isinstance``. Defaults to figures only.

    NOTE(review): this reads ``page.image``, which returns ``None`` when the
    backend is unloaded and nothing is cached at that scale, so it presumably
    requires page images to have been kept during conversion. Confirm against
    the converter settings.
    """
    for element in self.assembled.elements:
        if not isinstance(element, element_types):
            continue

        page = self.pages[element.page_no]
        scale = page._default_image_scale
        crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
            page_height=page.size.height * scale
        )

        yield element, page.image.crop(crop_bbox.as_tuple())
319
+
305
320
 
306
321
  class DocumentConversionInput(BaseModel):
307
322
 
308
323
  _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
309
324
  limits: Optional[DocumentLimits] = DocumentLimits()
310
325
 
311
- DEFAULT_BACKEND: ClassVar = PyPdfiumDocumentBackend
326
+ DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
312
327
 
313
328
  def docs(
314
329
  self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
@@ -14,6 +14,7 @@ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
14
14
  from docling.backend.abstract_backend import PdfDocumentBackend
15
15
  from docling.datamodel.base_models import (
16
16
  AssembledUnit,
17
+ AssembleOptions,
17
18
  ConversionStatus,
18
19
  Page,
19
20
  PipelineOptions,
@@ -44,6 +45,7 @@ class DocumentConverter:
44
45
  pipeline_options: PipelineOptions = PipelineOptions(),
45
46
  pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
46
47
  pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
48
+ assemble_options: AssembleOptions = AssembleOptions(),
47
49
  ):
48
50
  if not artifacts_path:
49
51
  artifacts_path = self.download_models_hf()
@@ -57,6 +59,7 @@ class DocumentConverter:
57
59
  self.page_assemble_model = PageAssembleModel(config={})
58
60
  self.glm_model = GlmModel(config={})
59
61
  self.pdf_backend = pdf_backend
62
+ self.assemble_options = assemble_options
60
63
 
61
64
  @staticmethod
62
65
  def download_models_hf(
@@ -174,17 +177,21 @@ class DocumentConverter:
174
177
  pages_with_images,
175
178
  )
176
179
 
180
+ # 4. Run pipeline stages
177
181
  pipeline_pages = self.model_pipeline.apply(pages_with_cells)
178
182
 
179
- # 7. Assemble page elements (per page)
183
+ # 5. Assemble page elements (per page)
180
184
  assembled_pages = self.page_assemble_model(pipeline_pages)
181
185
 
182
186
  # exhaust assembled_pages
183
187
  for assembled_page in assembled_pages:
184
188
  # Free up mem resources before moving on with next batch
185
- assembled_page.image = (
186
- None # Comment this if you want to visualize page images
187
- )
189
+
190
+ # Remove page images (can be disabled)
191
+ if self.assemble_options.images_scale is None:
192
+ assembled_page._image_cache = {}
193
+
194
+ # Unload backend
188
195
  assembled_page._backend.unload()
189
196
 
190
197
  all_assembled_pages.append(assembled_page)
@@ -222,7 +229,15 @@ class DocumentConverter:
222
229
 
223
230
  # Generate the page image and store it in the page object
224
231
  def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
225
- page.image = page._backend.get_page_image()
232
+ # default scale
233
+ page.get_image(scale=1.0)
234
+
235
+ # user requested scales
236
+ if self.assemble_options.images_scale is not None:
237
+ page._default_image_scale = self.assemble_options.images_scale
238
+ page.get_image(
239
+ scale=self.assemble_options.images_scale
240
+ ) # this will trigger storing the image in the internal cache
226
241
 
227
242
  return page
228
243
 
@@ -238,7 +253,7 @@ class DocumentConverter:
238
253
  draw.rectangle([(x0, y0), (x1, y1)], outline="red")
239
254
  image.show()
240
255
 
241
- # draw_text_boxes(page.image, cells)
256
+ # draw_text_boxes(page.get_image(scale=1.0), cells)
242
257
 
243
258
  return page
244
259
 
@@ -30,7 +30,7 @@ class EasyOcrModel:
30
30
 
31
31
  for page in page_batch:
32
32
  # rects = page._fpage.
33
- high_res_image = page._backend.get_page_image(scale=self.scale)
33
+ high_res_image = page.get_image(scale=self.scale)
34
34
  im = numpy.array(high_res_image)
35
35
  result = self.reader.readtext(im)
36
36
 
@@ -69,6 +69,10 @@ class LayoutModel:
69
69
  "Key-Value Region": 0.45,
70
70
  }
71
71
 
72
+ CLASS_REMAPPINGS = {
73
+ "Document Index": "Table",
74
+ }
75
+
72
76
  _log.debug("================= Start postprocess function ====================")
73
77
  start_time = time.time()
74
78
  # Apply Confidence Threshold to cluster predictions
@@ -79,6 +83,10 @@ class LayoutModel:
79
83
  confidence = CLASS_THRESHOLDS[cluster.label]
80
84
  if cluster.confidence >= confidence:
81
85
  # annotation["created_by"] = "high_conf_pred"
86
+
87
+ # Remap class labels where needed.
88
+ if cluster.label in CLASS_REMAPPINGS.keys():
89
+ cluster.label = CLASS_REMAPPINGS[cluster.label]
82
90
  clusters_out.append(cluster)
83
91
 
84
92
  # map to dictionary clusters and cells, with bottom left origin
@@ -259,7 +267,9 @@ class LayoutModel:
259
267
  def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
260
268
  for page in page_batch:
261
269
  clusters = []
262
- for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)):
270
+ for ix, pred_item in enumerate(
271
+ self.layout_predictor.predict(page.get_image(scale=1.0))
272
+ ):
263
273
  cluster = Cluster(
264
274
  id=ix,
265
275
  label=pred_item["label"],
@@ -34,7 +34,9 @@ class TableStructureModel:
34
34
  self.scale = 2.0 # Scale up table input images to 144 dpi
35
35
 
36
36
  def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
37
- image = page._backend.get_page_image()
37
+ image = (
38
+ page._backend.get_page_image()
39
+ ) # make new image to avoid drawing on the saved ones
38
40
  draw = ImageDraw.Draw(image)
39
41
 
40
42
  for table_element in tbl_list:
@@ -94,13 +96,7 @@ class TableStructureModel:
94
96
  "width": page.size.width * self.scale,
95
97
  "height": page.size.height * self.scale,
96
98
  }
97
- # add image to page input.
98
- if self.scale == 1.0:
99
- page_input["image"] = numpy.asarray(page.image)
100
- else: # render new page image on the fly at desired scale
101
- page_input["image"] = numpy.asarray(
102
- page._backend.get_page_image(scale=self.scale)
103
- )
99
+ page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
104
100
 
105
101
  table_clusters, table_bboxes = zip(*in_tables)
106
102
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "1.1.2" # DO NOT EDIT, updated automatically
3
+ version = "1.5.0" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -24,7 +24,7 @@ packages = [{include = "docling"}]
24
24
  python = "^3.10"
25
25
  pydantic = "^2.0.0"
26
26
  docling-core = "^1.1.2"
27
- docling-ibm-models = "^1.1.0"
27
+ docling-ibm-models = "^1.1.1"
28
28
  deepsearch-glm = ">=0.19.0,<1"
29
29
  filetype = "^1.2.0"
30
30
  pypdfium2 = "^4.30.0"
@@ -32,6 +32,8 @@ pydantic-settings = "^2.3.0"
32
32
  huggingface_hub = ">=0.23,<1"
33
33
  requests = "^2.32.3"
34
34
  easyocr = { version = "^1.7", optional = true }
35
+ docling-parse = "^0.2.0"
36
+ certifi = ">=2024.7.4"
35
37
 
36
38
  [tool.poetry.group.dev.dependencies]
37
39
  black = {extras = ["jupyter"], version = "^24.4.2"}
File without changes
File without changes
File without changes