docling 1.1.2__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. {docling-1.1.2 → docling-1.2.1}/PKG-INFO +9 -3
  2. {docling-1.1.2 → docling-1.2.1}/README.md +6 -2
  3. {docling-1.1.2 → docling-1.2.1}/docling/backend/abstract_backend.py +1 -1
  4. docling-1.2.1/docling/backend/docling_parse_backend.py +172 -0
  5. {docling-1.1.2 → docling-1.2.1}/docling/backend/pypdfium2_backend.py +1 -1
  6. {docling-1.1.2 → docling-1.2.1}/pyproject.toml +3 -1
  7. {docling-1.1.2 → docling-1.2.1}/LICENSE +0 -0
  8. {docling-1.1.2 → docling-1.2.1}/docling/__init__.py +0 -0
  9. {docling-1.1.2 → docling-1.2.1}/docling/backend/__init__.py +0 -0
  10. {docling-1.1.2 → docling-1.2.1}/docling/datamodel/__init__.py +0 -0
  11. {docling-1.1.2 → docling-1.2.1}/docling/datamodel/base_models.py +0 -0
  12. {docling-1.1.2 → docling-1.2.1}/docling/datamodel/document.py +0 -0
  13. {docling-1.1.2 → docling-1.2.1}/docling/datamodel/settings.py +0 -0
  14. {docling-1.1.2 → docling-1.2.1}/docling/document_converter.py +0 -0
  15. {docling-1.1.2 → docling-1.2.1}/docling/models/__init__.py +0 -0
  16. {docling-1.1.2 → docling-1.2.1}/docling/models/ds_glm_model.py +0 -0
  17. {docling-1.1.2 → docling-1.2.1}/docling/models/easyocr_model.py +0 -0
  18. {docling-1.1.2 → docling-1.2.1}/docling/models/layout_model.py +0 -0
  19. {docling-1.1.2 → docling-1.2.1}/docling/models/page_assemble_model.py +0 -0
  20. {docling-1.1.2 → docling-1.2.1}/docling/models/table_structure_model.py +0 -0
  21. {docling-1.1.2 → docling-1.2.1}/docling/pipeline/__init__.py +0 -0
  22. {docling-1.1.2 → docling-1.2.1}/docling/pipeline/base_model_pipeline.py +0 -0
  23. {docling-1.1.2 → docling-1.2.1}/docling/pipeline/standard_model_pipeline.py +0 -0
  24. {docling-1.1.2 → docling-1.2.1}/docling/utils/__init__.py +0 -0
  25. {docling-1.1.2 → docling-1.2.1}/docling/utils/layout_utils.py +0 -0
  26. {docling-1.1.2 → docling-1.2.1}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.1.2
3
+ Version: 1.2.1
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -21,9 +21,11 @@ Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Provides-Extra: easyocr
23
23
  Provides-Extra: ocr
24
+ Requires-Dist: certifi (>=2024.7.4)
24
25
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
25
26
  Requires-Dist: docling-core (>=1.1.2,<2.0.0)
26
27
  Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
28
+ Requires-Dist: docling-parse (>=0.0.1,<0.0.2)
27
29
  Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
28
30
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
29
31
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -92,17 +94,21 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
92
94
 
93
95
  ### Convert a batch of documents
94
96
 
95
- For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
97
+ For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
96
98
 
97
99
  From a local repo clone, you can run it with:
98
100
 
99
101
  ```
100
- python examples/convert.py
102
+ python examples/batch_convert.py
101
103
  ```
102
104
  The output of the above command will be written to `./scratch`.
103
105
 
104
106
  ### Adjust pipeline features
105
107
 
108
+ The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
109
+ one can adjust the conversion pipeline and features.
110
+
111
+
106
112
  #### Control pipeline options
107
113
 
108
114
  You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
@@ -56,17 +56,21 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
56
56
 
57
57
  ### Convert a batch of documents
58
58
 
59
- For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
59
+ For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
60
60
 
61
61
  From a local repo clone, you can run it with:
62
62
 
63
63
  ```
64
- python examples/convert.py
64
+ python examples/batch_convert.py
65
65
  ```
66
66
  The output of the above command will be written to `./scratch`.
67
67
 
68
68
  ### Adjust pipeline features
69
69
 
70
+ The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
71
+ one can adjust the conversion pipeline and features.
72
+
73
+
70
74
  #### Control pipeline options
71
75
 
72
76
  You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
@@ -35,7 +35,7 @@ class PdfPageBackend(ABC):
35
35
 
36
36
  class PdfDocumentBackend(ABC):
37
37
  @abstractmethod
38
- def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
38
+ def __init__(self, path_or_stream: Union[BytesIO, Path]):
39
39
  pass
40
40
 
41
41
  @abstractmethod
@@ -0,0 +1,172 @@
1
+ import random
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Iterable, List, Optional, Union
5
+
6
+ import pypdfium2 as pdfium
7
+ from docling_parse.docling_parse import pdf_parser
8
+ from PIL import Image, ImageDraw
9
+ from pypdfium2 import PdfPage
10
+
11
+ from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
12
+ from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
+
14
+
15
class DoclingParsePageBackend(PdfPageBackend):
    """Page backend pairing a pypdfium2 page with docling-parse cell data.

    The pypdfium2 page (``page_obj``) is used for the page size and for
    rendering; the docling-parse page dict (``docling_page_obj``) supplies the
    text cells. The dict is read through the keys ``"width"``, ``"height"``
    and ``"cells"``; every cell is read through ``["box"]["device"]``
    (unpacked as ``x0, y0, x1, y1``) and ``["content"]["rnormalized"]``.
    """

    def __init__(self, page_obj: PdfPage, docling_page_obj):
        super().__init__(page_obj)
        self._ppage = page_obj
        self._dpage = docling_page_obj
        self.text_page = None

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text of all cells that lie mostly inside ``bbox``.

        Each parser cell is rescaled from the parser's page dimensions to the
        pypdfium2 page size and converted to a top-left origin before being
        compared with ``bbox``. A cell contributes its text when more than
        half of its own area intersects ``bbox``. Contributions are separated
        by a single space, in the order the cells appear in the parser output.

        Cells with zero area are skipped, because the overlap fraction is
        undefined for them (the division would raise ``ZeroDivisionError``).
        """
        text_piece = ""
        page_size = self.get_size()
        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        for cell in self._dpage["cells"]:
            x0, y0, x1, y1 = cell["box"]["device"]
            # The device box is interpreted as bottom-left-origin coordinates
            # in the parser's page space; map it onto the pdfium page size.
            cell_bbox = BoundingBox(
                l=x0 * scale * page_size.width / parser_width,
                b=y0 * scale * page_size.height / parser_height,
                r=x1 * scale * page_size.width / parser_width,
                t=y1 * scale * page_size.height / parser_height,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_size.height * scale)

            cell_area = cell_bbox.area()
            if cell_area == 0:
                # Degenerate cell: no area to overlap with, nothing to add.
                continue

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_area

            if overlap_frac > 0.5:
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += cell["content"]["rnormalized"]

        return text_piece

    def get_text_cells(self) -> Iterable[Cell]:
        """Return every parser cell of the page as a ``Cell``.

        Cell ids are assigned sequentially from 0 in parser order. Bounding
        boxes are rescaled from the parser's page dimensions to the pypdfium2
        page size and expressed with a top-left origin.
        """
        page_size = self.get_size()

        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        cells = []
        for cell_id, cell in enumerate(self._dpage["cells"]):
            x0, y0, x1, y1 = cell["box"]["device"]
            cells.append(
                Cell(
                    id=cell_id,
                    text=cell["content"]["rnormalized"],
                    bbox=BoundingBox(
                        l=x0 * page_size.width / parser_width,
                        b=y0 * page_size.height / parser_height,
                        r=x1 * page_size.width / parser_width,
                        t=y1 * page_size.height / parser_height,
                        coord_origin=CoordOrigin.BOTTOMLEFT,
                    ).to_top_left_origin(page_size.height),
                )
            )

        # Debugging aid: call self._show_cells(cells) here to inspect the boxes.
        return cells

    def _show_cells(self, cells: Iterable[Cell]) -> None:
        """Debug helper: draw each cell's box on the page image and show it."""
        image = self.get_page_image()
        draw = ImageDraw.Draw(image)
        for c in cells:
            x0, y0, x1, y1 = c.bbox.as_tuple()
            # A random dark-ish colour per cell keeps neighbours distinguishable.
            cell_color = (
                random.randint(30, 140),
                random.randint(30, 140),
                random.randint(30, 140),
            )
            draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
        image.show()

    def get_page_image(
        self, scale: int = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or the ``cropbox`` region of it, as a PIL image.

        The page is rendered at 1.5x the requested ``scale`` and then resized
        down to ``cropbox`` size times ``scale`` to make the result sharper.
        Without a ``cropbox`` the whole page is rendered.
        """
        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # The renderer's ``crop`` argument receives per-edge amounts, so
            # the right/top values are turned into distances from those edges.
            # NOTE(review): this mutates the object returned by
            # to_bottom_left_origin(); confirm it never returns ``cropbox``
            # itself, otherwise the caller's box is modified.
            padbox = cropbox.to_bottom_left_origin(page_size.height)
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> PageSize:
        """Return the page width and height as reported by pypdfium2."""
        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop the references to the page objects held by this backend."""
        self._ppage = None
        self._dpage = None
        self.text_page = None
146
+
147
+
148
class DoclingParseDocumentBackend(PdfDocumentBackend):
    """Document backend that reads text cells with docling-parse.

    pypdfium2 opens the document for page access, while
    ``pdf_parser().find_cells`` is run once on the file path to obtain the
    per-page cell data. Only filesystem paths are supported.
    """

    def __init__(self, path_or_stream: Union[BytesIO, Path]):
        """Open the document at ``path_or_stream`` and parse its cells.

        Raises:
            NotImplementedError: if ``path_or_stream`` is a ``BytesIO``; the
                parser is invoked with a path string only.
        """
        super().__init__(path_or_stream)
        # Reject streams before opening anything, so no pdfium document is
        # left open when the input is refused.
        if isinstance(path_or_stream, BytesIO):
            raise NotImplementedError(
                "This backend does not support byte streams yet."
            )
        self._pdoc = pdfium.PdfDocument(path_or_stream)
        # Parsing cells with docling_parser call
        parser = pdf_parser()
        self._parser_doc = parser.find_cells(str(path_or_stream))

    def page_count(self) -> int:
        """Return the number of pages found in the parser output."""
        return len(self._parser_doc["pages"])

    def load_page(self, page_no: int) -> PdfPageBackend:
        """Return a page backend for the page at index ``page_no``.

        The same index is used for both the pypdfium2 document and the
        parser's ``"pages"`` list.
        """
        return DoclingParsePageBackend(
            self._pdoc[page_no], self._parser_doc["pages"][page_no]
        )

    def is_valid(self) -> bool:
        """Return True when the parsed document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Close the pypdfium2 document and drop the parsed data.

        Calling this more than once is safe.
        """
        if self._pdoc is not None:
            self._pdoc.close()
        self._pdoc = None
        self._parser_doc = None
@@ -199,7 +199,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
199
199
 
200
200
 
201
201
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
202
- def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
202
+ def __init__(self, path_or_stream: Union[BytesIO, Path]):
203
203
  super().__init__(path_or_stream)
204
204
  self._pdoc = pdfium.PdfDocument(path_or_stream)
205
205
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "1.1.2" # DO NOT EDIT, updated automatically
3
+ version = "1.2.1" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -32,6 +32,8 @@ pydantic-settings = "^2.3.0"
32
32
  huggingface_hub = ">=0.23,<1"
33
33
  requests = "^2.32.3"
34
34
  easyocr = { version = "^1.7", optional = true }
35
+ docling-parse = "^0.0.1"
36
+ certifi = ">=2024.7.4"
35
37
 
36
38
  [tool.poetry.group.dev.dependencies]
37
39
  black = {extras = ["jupyter"], version = "^24.4.2"}
File without changes
File without changes
File without changes