docling 2.15.0__py3-none-any.whl → 2.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_backend.py +1 -1
- docling/backend/docling_parse_v2_backend.py +1 -1
- docling/backend/pypdfium2_backend.py +1 -1
- docling/datamodel/pipeline_options.py +1 -1
- docling/models/base_ocr_model.py +15 -12
- {docling-2.15.0.dist-info → docling-2.15.1.dist-info}/METADATA +3 -4
- {docling-2.15.0.dist-info → docling-2.15.1.dist-info}/RECORD +10 -10
- {docling-2.15.0.dist-info → docling-2.15.1.dist-info}/LICENSE +0 -0
- {docling-2.15.0.dist-info → docling-2.15.1.dist-info}/WHEEL +0 -0
- {docling-2.15.0.dist-info → docling-2.15.1.dist-info}/entry_points.txt +0 -0
@@ -132,7 +132,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
132
132
|
return cells
|
133
133
|
|
134
134
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
135
|
-
AREA_THRESHOLD = 32 * 32
|
135
|
+
AREA_THRESHOLD = 0 # 32 * 32
|
136
136
|
|
137
137
|
for i in range(len(self._dpage["images"])):
|
138
138
|
bitmap = self._dpage["images"][i]
|
@@ -140,7 +140,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
140
140
|
return cells
|
141
141
|
|
142
142
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
143
|
-
AREA_THRESHOLD = 32 * 32
|
143
|
+
AREA_THRESHOLD = 0 # 32 * 32
|
144
144
|
|
145
145
|
images = self._dpage["sanitized"]["images"]["data"]
|
146
146
|
images_header = self._dpage["sanitized"]["images"]["header"]
|
@@ -39,7 +39,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
39
39
|
return self.valid
|
40
40
|
|
41
41
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
42
|
-
AREA_THRESHOLD = 32 * 32
|
42
|
+
AREA_THRESHOLD = 0 # 32 * 32
|
43
43
|
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
44
44
|
pos = obj.get_pos()
|
45
45
|
cropbox = BoundingBox.from_tuple(
|
@@ -139,7 +139,7 @@ class EasyOcrOptions(OcrOptions):
|
|
139
139
|
|
140
140
|
use_gpu: Optional[bool] = None
|
141
141
|
|
142
|
-
confidence_threshold: float = 0.
|
142
|
+
confidence_threshold: float = 0.5
|
143
143
|
|
144
144
|
model_storage_directory: Optional[str] = None
|
145
145
|
recog_network: Optional[str] = "standard"
|
docling/models/base_ocr_model.py
CHANGED
@@ -8,7 +8,7 @@ import numpy as np
|
|
8
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
9
|
from PIL import Image, ImageDraw
|
10
10
|
from rtree import index
|
11
|
-
from scipy.ndimage import find_objects, label
|
11
|
+
from scipy.ndimage import binary_dilation, find_objects, label
|
12
12
|
|
13
13
|
from docling.datamodel.base_models import Cell, OcrCell, Page
|
14
14
|
from docling.datamodel.document import ConversionResult
|
@@ -43,6 +43,12 @@ class BaseOcrModel(BasePageModel):
|
|
43
43
|
|
44
44
|
np_image = np.array(image)
|
45
45
|
|
46
|
+
# Dilate the image by 10 pixels to merge nearby bitmap rectangles
|
47
|
+
structure = np.ones(
|
48
|
+
(20, 20)
|
49
|
+
) # Create a 20x20 structure element (10 pixels in all directions)
|
50
|
+
np_image = binary_dilation(np_image > 0, structure=structure)
|
51
|
+
|
46
52
|
# Find the connected components
|
47
53
|
labeled_image, num_features = label(
|
48
54
|
np_image > 0
|
@@ -72,7 +78,7 @@ class BaseOcrModel(BasePageModel):
|
|
72
78
|
bitmap_rects = []
|
73
79
|
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
|
74
80
|
|
75
|
-
# return full-page rectangle if
|
81
|
+
# return full-page rectangle if page is dominantly covered with bitmaps
|
76
82
|
if self.options.force_full_page_ocr or coverage > max(
|
77
83
|
BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
|
78
84
|
):
|
@@ -85,17 +91,11 @@ class BaseOcrModel(BasePageModel):
|
|
85
91
|
coord_origin=CoordOrigin.TOPLEFT,
|
86
92
|
)
|
87
93
|
]
|
88
|
-
# return individual rectangles if the bitmap coverage is
|
89
|
-
|
90
|
-
|
91
|
-
# skip OCR if the bitmap area on the page is smaller than the options threshold
|
92
|
-
ocr_rects = [
|
93
|
-
rect
|
94
|
-
for rect in ocr_rects
|
95
|
-
if rect.area() / (page.size.width * page.size.height)
|
96
|
-
> self.options.bitmap_area_threshold
|
97
|
-
]
|
94
|
+
# return individual rectangles if the bitmap coverage is above the threshold
|
95
|
+
elif coverage > self.options.bitmap_area_threshold:
|
98
96
|
return ocr_rects
|
97
|
+
else: # overall coverage of bitmaps is too low, drop all bitmap rectangles.
|
98
|
+
return []
|
99
99
|
|
100
100
|
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
101
101
|
def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
|
@@ -162,6 +162,9 @@ class BaseOcrModel(BasePageModel):
|
|
162
162
|
x0 *= scale_x
|
163
163
|
x1 *= scale_x
|
164
164
|
|
165
|
+
if y1 <= y0:
|
166
|
+
y1, y0 = y0, y1
|
167
|
+
|
165
168
|
color = "gray"
|
166
169
|
if isinstance(tc, OcrCell):
|
167
170
|
color = "magenta"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.15.0
|
3
|
+
Version: 2.15.1
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -45,7 +45,7 @@ Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
|
45
45
|
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
46
46
|
Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
|
47
47
|
Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
|
48
|
-
Requires-Dist: requests (>=2.32.
|
48
|
+
Requires-Dist: requests (>=2.32.2,<3.0.0)
|
49
49
|
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
50
50
|
Requires-Dist: scipy (>=1.6.0,<2.0.0)
|
51
51
|
Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
|
@@ -84,7 +84,7 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|
84
84
|
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
|
85
85
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
86
86
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
87
|
-
* 🤖
|
87
|
+
* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
88
88
|
* 🔍 OCR support for scanned PDFs
|
89
89
|
* 💻 Simple and convenient CLI
|
90
90
|
|
@@ -94,7 +94,6 @@ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty
|
|
94
94
|
|
95
95
|
* ♾️ Equation & code extraction
|
96
96
|
* 📝 Metadata extraction, including title, authors, references & language
|
97
|
-
* 🦜🔗 Native LangChain extension
|
98
97
|
|
99
98
|
## Installation
|
100
99
|
|
@@ -2,15 +2,15 @@ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
3
|
docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
|
4
4
|
docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
|
5
|
-
docling/backend/docling_parse_backend.py,sha256=
|
6
|
-
docling/backend/docling_parse_v2_backend.py,sha256
|
5
|
+
docling/backend/docling_parse_backend.py,sha256=cJLkuOmfCtshRrwsv7WWayRNeMQASZv76v3nUHucqgM,7636
|
6
|
+
docling/backend/docling_parse_v2_backend.py,sha256=-lLsorxhK_Awrql_zXPen2LX0Gt9UvcDLMcmXf7_LKc,8642
|
7
7
|
docling/backend/html_backend.py,sha256=O8qXaw7MzOIdaxbBcjHieM9Ce4GEdtBj9YW0vpJspuA,15560
|
8
8
|
docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
|
9
9
|
docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
|
10
10
|
docling/backend/mspowerpoint_backend.py,sha256=kOGawhcn0BFq4M_C6kW0mY8vMIB24_6R6q6GaszbSt0,15957
|
11
11
|
docling/backend/msword_backend.py,sha256=K1D_h0ulLA6KQsPe62327cDVkQqV1f7EetCHo66wCKw,19233
|
12
12
|
docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
|
13
|
-
docling/backend/pypdfium2_backend.py,sha256=
|
13
|
+
docling/backend/pypdfium2_backend.py,sha256=Exb3NBp3x2YSLoNfmXq4NefShgooJXsxTXrJ4JbTzcc,9001
|
14
14
|
docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
15
|
docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-FfC9iSKk,20447
|
16
16
|
docling/backend/xml/uspto_backend.py,sha256=2YsnB-WRARIAaHPL6gxHePP24GQGi-Up2_K8ZapD3k4,70974
|
@@ -20,13 +20,13 @@ docling/cli/main.py,sha256=NR7NEt8Sf3FE9D7sHpEmABM9mFMTMO5w0VPwYIIvVsk,15481
|
|
20
20
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
21
|
docling/datamodel/base_models.py,sha256=50Jf5zk9c4-zmnOzZLoPBnHQhTX0_OFQzIkKgnKK1o4,6229
|
22
22
|
docling/datamodel/document.py,sha256=OHM6bm0a-62xnAZ8DFlMHzATmbgNcfMxQoQO2udaW5Q,13071
|
23
|
-
docling/datamodel/pipeline_options.py,sha256=
|
23
|
+
docling/datamodel/pipeline_options.py,sha256=wKFzw8sAim6emQGsjuS12n7FfpMo8HVNoMOPhkXTkVo,7734
|
24
24
|
docling/datamodel/settings.py,sha256=Sw0rN_f8rdLV1eNvVeKiyET2Oe6oz9jtW3lJzniW9Do,1302
|
25
25
|
docling/document_converter.py,sha256=_pk0sHuPXJ14NEutatf5bK2VyNiU5cvYsVbh1HIgrIw,12431
|
26
26
|
docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
|
27
27
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
28
28
|
docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
|
29
|
-
docling/models/base_ocr_model.py,sha256=
|
29
|
+
docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
|
30
30
|
docling/models/ds_glm_model.py,sha256=CkhsP0cEWwm4wb1g3cLFriVGpVtELiUK3REDMkPwAMw,13028
|
31
31
|
docling/models/easyocr_model.py,sha256=Kakb20ioBxDmNsIqoGvSSs_vbqAWN3QQNHYtEi-eErg,4990
|
32
32
|
docling/models/layout_model.py,sha256=Xo8sclRTOO_V8Cr4RwuxB67vSWKF0LZ5nJRYU1WI--k,9063
|
@@ -49,8 +49,8 @@ docling/utils/glm_utils.py,sha256=IB19wToGath97gD3jAA3G_rQSptnZKhQCWLvPUCnkww,11
|
|
49
49
|
docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
|
50
50
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
51
51
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
52
|
-
docling-2.15.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
53
|
-
docling-2.15.
|
54
|
-
docling-2.15.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
55
|
-
docling-2.15.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
56
|
-
docling-2.15.0.dist-info/RECORD,,
|
52
|
+
docling-2.15.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
53
|
+
docling-2.15.1.dist-info/METADATA,sha256=6WRzA633us43nw7RHwhX_jwizh2JSpGWxNh0pJq2ZYs,7739
|
54
|
+
docling-2.15.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
55
|
+
docling-2.15.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
56
|
+
docling-2.15.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|