docling 1.5.0__tar.gz → 1.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-1.5.0 → docling-1.6.1}/PKG-INFO +5 -4
- {docling-1.5.0 → docling-1.6.1}/docling/backend/abstract_backend.py +4 -0
- {docling-1.5.0 → docling-1.6.1}/docling/backend/docling_parse_backend.py +23 -3
- {docling-1.5.0 → docling-1.6.1}/docling/backend/pypdfium2_backend.py +15 -1
- {docling-1.5.0 → docling-1.6.1}/docling/datamodel/base_models.py +15 -7
- {docling-1.5.0 → docling-1.6.1}/docling/document_converter.py +0 -2
- docling-1.6.1/docling/models/base_ocr_model.py +124 -0
- docling-1.6.1/docling/models/easyocr_model.py +70 -0
- {docling-1.5.0 → docling-1.6.1}/docling/models/table_structure_model.py +0 -1
- {docling-1.5.0 → docling-1.6.1}/docling/pipeline/base_model_pipeline.py +0 -1
- {docling-1.5.0 → docling-1.6.1}/docling/pipeline/standard_model_pipeline.py +1 -3
- {docling-1.5.0 → docling-1.6.1}/pyproject.toml +5 -4
- docling-1.5.0/docling/models/easyocr_model.py +0 -77
- {docling-1.5.0 → docling-1.6.1}/LICENSE +0 -0
- {docling-1.5.0 → docling-1.6.1}/README.md +0 -0
- {docling-1.5.0 → docling-1.6.1}/docling/__init__.py +0 -0
- {docling-1.5.0 → docling-1.6.1}/docling/backend/__init__.py +0 -0
- {docling-1.5.0 → docling-1.6.1}/docling/datamodel/__init__.py +0 -0
- {docling-1.5.0 → docling-1.6.1}/docling/datamodel/document.py +0 -0
- {docling-1.5.0 → docling-1.6.1}/docling/datamodel/settings.py +0 -0
- {docling-1.5.0 → docling-1.6.1}/docling/models/__init__.py +0 -0
- {docling-1.5.0 → docling-1.6.1}/docling/models/ds_glm_model.py +0 -0
- {docling-1.5.0 → docling-1.6.1}/docling/models/layout_model.py +0 -0
- {docling-1.5.0 → docling-1.6.1}/docling/models/page_assemble_model.py +0 -0
- {docling-1.5.0 → docling-1.6.1}/docling/pipeline/__init__.py +0 -0
- {docling-1.5.0 → docling-1.6.1}/docling/utils/__init__.py +0 -0
- {docling-1.5.0 → docling-1.6.1}/docling/utils/layout_utils.py +0 -0
- {docling-1.5.0 → docling-1.6.1}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.5.0
|
3
|
+
Version: 1.6.1
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -19,20 +19,21 @@ Classifier: Programming Language :: Python :: 3.10
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.11
|
20
20
|
Classifier: Programming Language :: Python :: 3.12
|
21
21
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
22
|
-
Provides-Extra: easyocr
|
23
22
|
Provides-Extra: ocr
|
24
23
|
Requires-Dist: certifi (>=2024.7.4)
|
25
24
|
Requires-Dist: deepsearch-glm (>=0.19.0,<1)
|
26
25
|
Requires-Dist: docling-core (>=1.1.2,<2.0.0)
|
27
|
-
Requires-Dist: docling-ibm-models (>=1.1.
|
26
|
+
Requires-Dist: docling-ibm-models (>=1.1.2,<2.0.0)
|
28
27
|
Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
|
29
|
-
Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "
|
28
|
+
Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "ocr"
|
30
29
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
31
30
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
32
31
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
33
32
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
34
33
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
35
34
|
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
35
|
+
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
36
|
+
Requires-Dist: scipy (>=1.14.1,<2.0.0)
|
36
37
|
Project-URL: Repository, https://github.com/DS4SD/docling
|
37
38
|
Description-Content-Type: text/markdown
|
38
39
|
|
@@ -18,6 +18,10 @@ class PdfPageBackend(ABC):
|
|
18
18
|
def get_text_cells(self) -> Iterable["Cell"]:
|
19
19
|
pass
|
20
20
|
|
21
|
+
@abstractmethod
|
22
|
+
def get_bitmap_rects(self, scale: int = 1) -> Iterable["BoundingBox"]:
|
23
|
+
pass
|
24
|
+
|
21
25
|
@abstractmethod
|
22
26
|
def get_page_image(
|
23
27
|
self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
|
@@ -3,7 +3,7 @@ import random
|
|
3
3
|
import time
|
4
4
|
from io import BytesIO
|
5
5
|
from pathlib import Path
|
6
|
-
from typing import Iterable,
|
6
|
+
from typing import Iterable, Optional, Union
|
7
7
|
|
8
8
|
import pypdfium2 as pdfium
|
9
9
|
from docling_parse.docling_parse import pdf_parser
|
@@ -43,7 +43,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
43
43
|
r=x1 * scale * page_size.width / parser_width,
|
44
44
|
t=y1 * scale * page_size.height / parser_height,
|
45
45
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
46
|
-
).to_top_left_origin(page_size.height * scale)
|
46
|
+
).to_top_left_origin(page_height=page_size.height * scale)
|
47
47
|
|
48
48
|
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
49
49
|
|
@@ -66,6 +66,12 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
66
66
|
for i in range(len(self._dpage["cells"])):
|
67
67
|
rect = self._dpage["cells"][i]["box"]["device"]
|
68
68
|
x0, y0, x1, y1 = rect
|
69
|
+
|
70
|
+
if x1 < x0:
|
71
|
+
x0, x1 = x1, x0
|
72
|
+
if y1 < y0:
|
73
|
+
y0, y1 = y1, y0
|
74
|
+
|
69
75
|
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
|
70
76
|
cells.append(
|
71
77
|
Cell(
|
@@ -108,6 +114,20 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
108
114
|
|
109
115
|
return cells
|
110
116
|
|
117
|
+
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
|
118
|
+
AREA_THRESHOLD = 32 * 32
|
119
|
+
|
120
|
+
for i in range(len(self._dpage["images"])):
|
121
|
+
bitmap = self._dpage["images"][i]
|
122
|
+
cropbox = BoundingBox.from_tuple(
|
123
|
+
bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
|
124
|
+
).to_top_left_origin(self.get_size().height)
|
125
|
+
|
126
|
+
if cropbox.area() > AREA_THRESHOLD:
|
127
|
+
cropbox = cropbox.scaled(scale=scale)
|
128
|
+
|
129
|
+
yield cropbox
|
130
|
+
|
111
131
|
def get_page_image(
|
112
132
|
self, scale: int = 1, cropbox: Optional[BoundingBox] = None
|
113
133
|
) -> Image.Image:
|
@@ -173,7 +193,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
|
|
173
193
|
def page_count(self) -> int:
|
174
194
|
return len(self._parser_doc["pages"])
|
175
195
|
|
176
|
-
def load_page(self, page_no: int) ->
|
196
|
+
def load_page(self, page_no: int) -> DoclingParsePageBackend:
|
177
197
|
return DoclingParsePageBackend(
|
178
198
|
self._pdoc[page_no], self._parser_doc["pages"][page_no]
|
179
199
|
)
|
@@ -4,6 +4,7 @@ from pathlib import Path
|
|
4
4
|
from typing import Iterable, List, Optional, Union
|
5
5
|
|
6
6
|
import pypdfium2 as pdfium
|
7
|
+
import pypdfium2.raw as pdfium_c
|
7
8
|
from PIL import Image, ImageDraw
|
8
9
|
from pypdfium2 import PdfPage
|
9
10
|
|
@@ -17,6 +18,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
17
18
|
self._ppage = page_obj
|
18
19
|
self.text_page = None
|
19
20
|
|
21
|
+
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
|
22
|
+
AREA_THRESHOLD = 32 * 32
|
23
|
+
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
24
|
+
pos = obj.get_pos()
|
25
|
+
cropbox = BoundingBox.from_tuple(
|
26
|
+
pos, origin=CoordOrigin.BOTTOMLEFT
|
27
|
+
).to_top_left_origin(page_height=self.get_size().height)
|
28
|
+
|
29
|
+
if cropbox.area() > AREA_THRESHOLD:
|
30
|
+
cropbox = cropbox.scaled(scale=scale)
|
31
|
+
|
32
|
+
yield cropbox
|
33
|
+
|
20
34
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
21
35
|
if not self.text_page:
|
22
36
|
self.text_page = self._ppage.get_textpage()
|
@@ -208,7 +222,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
|
208
222
|
def page_count(self) -> int:
|
209
223
|
return len(self._pdoc)
|
210
224
|
|
211
|
-
def load_page(self, page_no: int) ->
|
225
|
+
def load_page(self, page_no: int) -> PyPdfiumPageBackend:
|
212
226
|
return PyPdfiumPageBackend(self._pdoc[page_no])
|
213
227
|
|
214
228
|
def is_valid(self) -> bool:
|
@@ -68,13 +68,21 @@ class BoundingBox(BaseModel):
|
|
68
68
|
@classmethod
|
69
69
|
def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
|
70
70
|
if origin == CoordOrigin.TOPLEFT:
|
71
|
-
|
72
|
-
|
73
|
-
|
71
|
+
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
|
72
|
+
if r < l:
|
73
|
+
l, r = r, l
|
74
|
+
if b < t:
|
75
|
+
b, t = t, b
|
76
|
+
|
77
|
+
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
74
78
|
elif origin == CoordOrigin.BOTTOMLEFT:
|
75
|
-
|
76
|
-
|
77
|
-
|
79
|
+
l, b, r, t = coord[0], coord[1], coord[2], coord[3]
|
80
|
+
if r < l:
|
81
|
+
l, r = r, l
|
82
|
+
if b > t:
|
83
|
+
b, t = t, b
|
84
|
+
|
85
|
+
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
78
86
|
|
79
87
|
def area(self) -> float:
|
80
88
|
return (self.r - self.l) * (self.b - self.t)
|
@@ -280,7 +288,7 @@ class TableStructureOptions(BaseModel):
|
|
280
288
|
|
281
289
|
class PipelineOptions(BaseModel):
|
282
290
|
do_table_structure: bool = True # True: perform table structure extraction
|
283
|
-
do_ocr: bool =
|
291
|
+
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
284
292
|
|
285
293
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
286
294
|
|
@@ -0,0 +1,124 @@
|
|
1
|
+
import copy
|
2
|
+
import logging
|
3
|
+
from abc import abstractmethod
|
4
|
+
from typing import Iterable, List, Tuple
|
5
|
+
|
6
|
+
import numpy
|
7
|
+
import numpy as np
|
8
|
+
from PIL import Image, ImageDraw
|
9
|
+
from rtree import index
|
10
|
+
from scipy.ndimage import find_objects, label
|
11
|
+
|
12
|
+
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
13
|
+
|
14
|
+
_log = logging.getLogger(__name__)
|
15
|
+
|
16
|
+
|
17
|
+
class BaseOcrModel:
|
18
|
+
def __init__(self, config):
|
19
|
+
self.config = config
|
20
|
+
self.enabled = config["enabled"]
|
21
|
+
|
22
|
+
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
|
23
|
+
def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
|
24
|
+
BITMAP_COVERAGE_TRESHOLD = 0.75
|
25
|
+
|
26
|
+
def find_ocr_rects(size, bitmap_rects):
|
27
|
+
image = Image.new(
|
28
|
+
"1", (round(size.width), round(size.height))
|
29
|
+
) # '1' mode is binary
|
30
|
+
|
31
|
+
# Draw all bitmap rects into a binary image
|
32
|
+
draw = ImageDraw.Draw(image)
|
33
|
+
for rect in bitmap_rects:
|
34
|
+
x0, y0, x1, y1 = rect.as_tuple()
|
35
|
+
x0, y0, x1, y1 = round(x0), round(y0), round(x1), round(y1)
|
36
|
+
draw.rectangle([(x0, y0), (x1, y1)], fill=1)
|
37
|
+
|
38
|
+
np_image = np.array(image)
|
39
|
+
|
40
|
+
# Find the connected components
|
41
|
+
labeled_image, num_features = label(
|
42
|
+
np_image > 0
|
43
|
+
) # Label black (0 value) regions
|
44
|
+
|
45
|
+
# Find enclosing bounding boxes for each connected component.
|
46
|
+
slices = find_objects(labeled_image)
|
47
|
+
bounding_boxes = [
|
48
|
+
BoundingBox(
|
49
|
+
l=slc[1].start,
|
50
|
+
t=slc[0].start,
|
51
|
+
r=slc[1].stop - 1,
|
52
|
+
b=slc[0].stop - 1,
|
53
|
+
coord_origin=CoordOrigin.TOPLEFT,
|
54
|
+
)
|
55
|
+
for slc in slices
|
56
|
+
]
|
57
|
+
|
58
|
+
# Compute area fraction on page covered by bitmaps
|
59
|
+
area_frac = np.sum(np_image > 0) / (size.width * size.height)
|
60
|
+
|
61
|
+
return (area_frac, bounding_boxes) # fraction covered # boxes
|
62
|
+
|
63
|
+
bitmap_rects = page._backend.get_bitmap_rects()
|
64
|
+
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
|
65
|
+
|
66
|
+
# return full-page rectangle if sufficiently covered with bitmaps
|
67
|
+
if coverage > BITMAP_COVERAGE_TRESHOLD:
|
68
|
+
return [
|
69
|
+
BoundingBox(
|
70
|
+
l=0,
|
71
|
+
t=0,
|
72
|
+
r=page.size.width,
|
73
|
+
b=page.size.height,
|
74
|
+
coord_origin=CoordOrigin.TOPLEFT,
|
75
|
+
)
|
76
|
+
]
|
77
|
+
# return individual rectangles if the bitmap coverage is smaller
|
78
|
+
elif coverage < BITMAP_COVERAGE_TRESHOLD:
|
79
|
+
return ocr_rects
|
80
|
+
|
81
|
+
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
82
|
+
def filter_ocr_cells(self, ocr_cells, programmatic_cells):
|
83
|
+
# Create R-tree index for programmatic cells
|
84
|
+
p = index.Property()
|
85
|
+
p.dimension = 2
|
86
|
+
idx = index.Index(properties=p)
|
87
|
+
for i, cell in enumerate(programmatic_cells):
|
88
|
+
idx.insert(i, cell.bbox.as_tuple())
|
89
|
+
|
90
|
+
def is_overlapping_with_existing_cells(ocr_cell):
|
91
|
+
# Query the R-tree to get overlapping rectangles
|
92
|
+
possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
|
93
|
+
|
94
|
+
return (
|
95
|
+
len(possible_matches_index) > 0
|
96
|
+
) # this is a weak criterion but it works.
|
97
|
+
|
98
|
+
filtered_ocr_cells = [
|
99
|
+
rect for rect in ocr_cells if not is_overlapping_with_existing_cells(rect)
|
100
|
+
]
|
101
|
+
return filtered_ocr_cells
|
102
|
+
|
103
|
+
def draw_ocr_rects_and_cells(self, page, ocr_rects):
|
104
|
+
image = copy.deepcopy(page.image)
|
105
|
+
draw = ImageDraw.Draw(image, "RGBA")
|
106
|
+
|
107
|
+
# Draw OCR rectangles as yellow filled rect
|
108
|
+
for rect in ocr_rects:
|
109
|
+
x0, y0, x1, y1 = rect.as_tuple()
|
110
|
+
shade_color = (255, 255, 0, 40) # transparent yellow
|
111
|
+
draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
|
112
|
+
|
113
|
+
# Draw OCR and programmatic cells
|
114
|
+
for tc in page.cells:
|
115
|
+
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
116
|
+
color = "red"
|
117
|
+
if isinstance(tc, OcrCell):
|
118
|
+
color = "magenta"
|
119
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
|
120
|
+
image.show()
|
121
|
+
|
122
|
+
@abstractmethod
|
123
|
+
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
124
|
+
pass
|
@@ -0,0 +1,70 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Iterable
|
3
|
+
|
4
|
+
import numpy
|
5
|
+
|
6
|
+
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
7
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
8
|
+
|
9
|
+
_log = logging.getLogger(__name__)
|
10
|
+
|
11
|
+
|
12
|
+
class EasyOcrModel(BaseOcrModel):
|
13
|
+
def __init__(self, config):
|
14
|
+
super().__init__(config)
|
15
|
+
|
16
|
+
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
17
|
+
|
18
|
+
if self.enabled:
|
19
|
+
import easyocr
|
20
|
+
|
21
|
+
self.reader = easyocr.Reader(config["lang"])
|
22
|
+
|
23
|
+
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
24
|
+
|
25
|
+
if not self.enabled:
|
26
|
+
yield from page_batch
|
27
|
+
return
|
28
|
+
|
29
|
+
for page in page_batch:
|
30
|
+
ocr_rects = self.get_ocr_rects(page)
|
31
|
+
|
32
|
+
all_ocr_cells = []
|
33
|
+
for ocr_rect in ocr_rects:
|
34
|
+
high_res_image = page._backend.get_page_image(
|
35
|
+
scale=self.scale, cropbox=ocr_rect
|
36
|
+
)
|
37
|
+
im = numpy.array(high_res_image)
|
38
|
+
result = self.reader.readtext(im)
|
39
|
+
|
40
|
+
del high_res_image
|
41
|
+
del im
|
42
|
+
|
43
|
+
cells = [
|
44
|
+
OcrCell(
|
45
|
+
id=ix,
|
46
|
+
text=line[1],
|
47
|
+
confidence=line[2],
|
48
|
+
bbox=BoundingBox.from_tuple(
|
49
|
+
coord=(
|
50
|
+
(line[0][0][0] / self.scale) + ocr_rect.l,
|
51
|
+
(line[0][0][1] / self.scale) + ocr_rect.t,
|
52
|
+
(line[0][2][0] / self.scale) + ocr_rect.l,
|
53
|
+
(line[0][2][1] / self.scale) + ocr_rect.t,
|
54
|
+
),
|
55
|
+
origin=CoordOrigin.TOPLEFT,
|
56
|
+
),
|
57
|
+
)
|
58
|
+
for ix, line in enumerate(result)
|
59
|
+
]
|
60
|
+
all_ocr_cells.extend(cells)
|
61
|
+
|
62
|
+
## Remove OCR cells which overlap with programmatic cells.
|
63
|
+
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
64
|
+
|
65
|
+
page.cells.extend(filtered_ocr_cells)
|
66
|
+
|
67
|
+
# DEBUG code:
|
68
|
+
# self.draw_ocr_rects_and_cells(page, ocr_rects)
|
69
|
+
|
70
|
+
yield page
|
@@ -1,10 +1,8 @@
|
|
1
1
|
from pathlib import Path
|
2
|
-
from typing import Iterable
|
3
2
|
|
4
|
-
from docling.datamodel.base_models import
|
3
|
+
from docling.datamodel.base_models import PipelineOptions
|
5
4
|
from docling.models.easyocr_model import EasyOcrModel
|
6
5
|
from docling.models.layout_model import LayoutModel
|
7
|
-
from docling.models.page_assemble_model import PageAssembleModel
|
8
6
|
from docling.models.table_structure_model import TableStructureModel
|
9
7
|
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
10
8
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "1.5.0"
|
3
|
+
version = "1.6.1" # DO NOT EDIT, updated automatically
|
4
4
|
description = "Docling PDF conversion package"
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -24,16 +24,18 @@ packages = [{include = "docling"}]
|
|
24
24
|
python = "^3.10"
|
25
25
|
pydantic = "^2.0.0"
|
26
26
|
docling-core = "^1.1.2"
|
27
|
-
docling-ibm-models = "^1.1.
|
27
|
+
docling-ibm-models = "^1.1.2"
|
28
28
|
deepsearch-glm = ">=0.19.0,<1"
|
29
29
|
filetype = "^1.2.0"
|
30
30
|
pypdfium2 = "^4.30.0"
|
31
31
|
pydantic-settings = "^2.3.0"
|
32
32
|
huggingface_hub = ">=0.23,<1"
|
33
33
|
requests = "^2.32.3"
|
34
|
-
easyocr = { version = "^1.7"
|
34
|
+
easyocr = { version = "^1.7"}
|
35
35
|
docling-parse = "^0.2.0"
|
36
36
|
certifi = ">=2024.7.4"
|
37
|
+
rtree = "^1.3.0"
|
38
|
+
scipy = "^1.14.1"
|
37
39
|
|
38
40
|
[tool.poetry.group.dev.dependencies]
|
39
41
|
black = {extras = ["jupyter"], version = "^24.4.2"}
|
@@ -50,7 +52,6 @@ flake8-pyproject = "^1.2.3"
|
|
50
52
|
pylint = "^2.17.5"
|
51
53
|
|
52
54
|
[tool.poetry.extras]
|
53
|
-
easyocr = ["easyocr"]
|
54
55
|
ocr = ["easyocr"]
|
55
56
|
|
56
57
|
[build-system]
|
@@ -1,77 +0,0 @@
|
|
1
|
-
import copy
|
2
|
-
import logging
|
3
|
-
import random
|
4
|
-
from typing import Iterable
|
5
|
-
|
6
|
-
import numpy
|
7
|
-
from PIL import ImageDraw
|
8
|
-
|
9
|
-
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
10
|
-
|
11
|
-
_log = logging.getLogger(__name__)
|
12
|
-
|
13
|
-
|
14
|
-
class EasyOcrModel:
|
15
|
-
def __init__(self, config):
|
16
|
-
self.config = config
|
17
|
-
self.enabled = config["enabled"]
|
18
|
-
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
19
|
-
|
20
|
-
if self.enabled:
|
21
|
-
import easyocr
|
22
|
-
|
23
|
-
self.reader = easyocr.Reader(config["lang"])
|
24
|
-
|
25
|
-
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
26
|
-
|
27
|
-
if not self.enabled:
|
28
|
-
yield from page_batch
|
29
|
-
return
|
30
|
-
|
31
|
-
for page in page_batch:
|
32
|
-
# rects = page._fpage.
|
33
|
-
high_res_image = page.get_image(scale=self.scale)
|
34
|
-
im = numpy.array(high_res_image)
|
35
|
-
result = self.reader.readtext(im)
|
36
|
-
|
37
|
-
del high_res_image
|
38
|
-
del im
|
39
|
-
|
40
|
-
cells = [
|
41
|
-
OcrCell(
|
42
|
-
id=ix,
|
43
|
-
text=line[1],
|
44
|
-
confidence=line[2],
|
45
|
-
bbox=BoundingBox.from_tuple(
|
46
|
-
coord=(
|
47
|
-
line[0][0][0] / self.scale,
|
48
|
-
line[0][0][1] / self.scale,
|
49
|
-
line[0][2][0] / self.scale,
|
50
|
-
line[0][2][1] / self.scale,
|
51
|
-
),
|
52
|
-
origin=CoordOrigin.TOPLEFT,
|
53
|
-
),
|
54
|
-
)
|
55
|
-
for ix, line in enumerate(result)
|
56
|
-
]
|
57
|
-
|
58
|
-
page.cells = cells # For now, just overwrites all digital cells.
|
59
|
-
|
60
|
-
# DEBUG code:
|
61
|
-
def draw_clusters_and_cells():
|
62
|
-
image = copy.deepcopy(page.image)
|
63
|
-
draw = ImageDraw.Draw(image)
|
64
|
-
|
65
|
-
cell_color = (
|
66
|
-
random.randint(30, 140),
|
67
|
-
random.randint(30, 140),
|
68
|
-
random.randint(30, 140),
|
69
|
-
)
|
70
|
-
for tc in cells:
|
71
|
-
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
72
|
-
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
73
|
-
image.show()
|
74
|
-
|
75
|
-
# draw_clusters_and_cells()
|
76
|
-
|
77
|
-
yield page
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|