python-doctr 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/__init__.py +1 -1
- doctr/contrib/__init__.py +0 -0
- doctr/contrib/artefacts.py +131 -0
- doctr/contrib/base.py +105 -0
- doctr/datasets/cord.py +10 -1
- doctr/datasets/datasets/pytorch.py +2 -2
- doctr/datasets/funsd.py +11 -1
- doctr/datasets/generator/base.py +6 -5
- doctr/datasets/ic03.py +11 -1
- doctr/datasets/ic13.py +10 -1
- doctr/datasets/iiit5k.py +26 -16
- doctr/datasets/imgur5k.py +11 -2
- doctr/datasets/loader.py +1 -6
- doctr/datasets/sroie.py +11 -1
- doctr/datasets/svhn.py +11 -1
- doctr/datasets/svt.py +11 -1
- doctr/datasets/synthtext.py +11 -1
- doctr/datasets/utils.py +9 -3
- doctr/datasets/vocabs.py +15 -4
- doctr/datasets/wildreceipt.py +12 -1
- doctr/file_utils.py +45 -12
- doctr/io/elements.py +52 -10
- doctr/io/html.py +2 -2
- doctr/io/image/pytorch.py +6 -8
- doctr/io/image/tensorflow.py +1 -1
- doctr/io/pdf.py +5 -2
- doctr/io/reader.py +6 -0
- doctr/models/__init__.py +0 -1
- doctr/models/_utils.py +57 -20
- doctr/models/builder.py +73 -15
- doctr/models/classification/magc_resnet/tensorflow.py +13 -6
- doctr/models/classification/mobilenet/pytorch.py +47 -9
- doctr/models/classification/mobilenet/tensorflow.py +51 -14
- doctr/models/classification/predictor/pytorch.py +28 -17
- doctr/models/classification/predictor/tensorflow.py +26 -16
- doctr/models/classification/resnet/tensorflow.py +21 -8
- doctr/models/classification/textnet/pytorch.py +3 -3
- doctr/models/classification/textnet/tensorflow.py +11 -5
- doctr/models/classification/vgg/tensorflow.py +9 -3
- doctr/models/classification/vit/tensorflow.py +10 -4
- doctr/models/classification/zoo.py +55 -19
- doctr/models/detection/_utils/__init__.py +1 -0
- doctr/models/detection/_utils/base.py +66 -0
- doctr/models/detection/differentiable_binarization/base.py +4 -3
- doctr/models/detection/differentiable_binarization/pytorch.py +2 -2
- doctr/models/detection/differentiable_binarization/tensorflow.py +34 -12
- doctr/models/detection/fast/base.py +6 -5
- doctr/models/detection/fast/pytorch.py +4 -4
- doctr/models/detection/fast/tensorflow.py +15 -12
- doctr/models/detection/linknet/base.py +4 -3
- doctr/models/detection/linknet/tensorflow.py +23 -11
- doctr/models/detection/predictor/pytorch.py +15 -1
- doctr/models/detection/predictor/tensorflow.py +17 -3
- doctr/models/detection/zoo.py +7 -2
- doctr/models/factory/hub.py +8 -18
- doctr/models/kie_predictor/base.py +13 -3
- doctr/models/kie_predictor/pytorch.py +45 -20
- doctr/models/kie_predictor/tensorflow.py +44 -17
- doctr/models/modules/layers/pytorch.py +2 -3
- doctr/models/modules/layers/tensorflow.py +6 -8
- doctr/models/modules/transformer/pytorch.py +2 -2
- doctr/models/modules/transformer/tensorflow.py +0 -2
- doctr/models/modules/vision_transformer/pytorch.py +1 -1
- doctr/models/modules/vision_transformer/tensorflow.py +1 -1
- doctr/models/predictor/base.py +97 -58
- doctr/models/predictor/pytorch.py +35 -20
- doctr/models/predictor/tensorflow.py +35 -18
- doctr/models/preprocessor/pytorch.py +4 -4
- doctr/models/preprocessor/tensorflow.py +3 -2
- doctr/models/recognition/crnn/tensorflow.py +8 -6
- doctr/models/recognition/master/pytorch.py +2 -2
- doctr/models/recognition/master/tensorflow.py +9 -4
- doctr/models/recognition/parseq/pytorch.py +4 -3
- doctr/models/recognition/parseq/tensorflow.py +14 -11
- doctr/models/recognition/sar/pytorch.py +7 -6
- doctr/models/recognition/sar/tensorflow.py +10 -12
- doctr/models/recognition/vitstr/pytorch.py +1 -1
- doctr/models/recognition/vitstr/tensorflow.py +9 -4
- doctr/models/recognition/zoo.py +1 -1
- doctr/models/utils/pytorch.py +1 -1
- doctr/models/utils/tensorflow.py +15 -15
- doctr/models/zoo.py +2 -2
- doctr/py.typed +0 -0
- doctr/transforms/functional/base.py +1 -1
- doctr/transforms/functional/pytorch.py +5 -5
- doctr/transforms/modules/base.py +37 -15
- doctr/transforms/modules/pytorch.py +73 -14
- doctr/transforms/modules/tensorflow.py +78 -19
- doctr/utils/fonts.py +7 -5
- doctr/utils/geometry.py +141 -31
- doctr/utils/metrics.py +34 -175
- doctr/utils/reconstitution.py +212 -0
- doctr/utils/visualization.py +5 -118
- doctr/version.py +1 -1
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/METADATA +85 -81
- python_doctr-0.10.0.dist-info/RECORD +173 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/WHEEL +1 -1
- doctr/models/artefacts/__init__.py +0 -2
- doctr/models/artefacts/barcode.py +0 -74
- doctr/models/artefacts/face.py +0 -63
- doctr/models/obj_detection/__init__.py +0 -1
- doctr/models/obj_detection/faster_rcnn/__init__.py +0 -4
- doctr/models/obj_detection/faster_rcnn/pytorch.py +0 -81
- python_doctr-0.8.1.dist-info/RECORD +0 -173
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/LICENSE +0 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/zip-safe +0 -0
doctr/utils/reconstitution.py
ADDED
@@ -0,0 +1,212 @@
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+import logging
+from typing import Any, Dict, Optional
+
+import numpy as np
+from anyascii import anyascii
+from PIL import Image, ImageDraw
+
+from .fonts import get_font
+
+__all__ = ["synthesize_page", "synthesize_kie_page"]
+
+
+# Global variable to avoid multiple warnings
+ROTATION_WARNING = False
+
+
+def _warn_rotation(entry: Dict[str, Any]) -> None:  # pragma: no cover
+    global ROTATION_WARNING
+    if not ROTATION_WARNING and len(entry["geometry"]) == 4:
+        logging.warning("Polygons with larger rotations will lead to inaccurate rendering")
+        ROTATION_WARNING = True
+
+
+def _synthesize(
+    response: Image.Image,
+    entry: Dict[str, Any],
+    w: int,
+    h: int,
+    draw_proba: bool = False,
+    font_family: Optional[str] = None,
+    smoothing_factor: float = 0.75,
+    min_font_size: int = 6,
+    max_font_size: int = 50,
+) -> Image.Image:
+    if len(entry["geometry"]) == 2:
+        (xmin, ymin), (xmax, ymax) = entry["geometry"]
+        polygon = [(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)]
+    else:
+        polygon = entry["geometry"]
+
+    # Calculate the bounding box of the word
+    x_coords, y_coords = zip(*polygon)
+    xmin, ymin, xmax, ymax = (
+        int(round(w * min(x_coords))),
+        int(round(h * min(y_coords))),
+        int(round(w * max(x_coords))),
+        int(round(h * max(y_coords))),
+    )
+    word_width = xmax - xmin
+    word_height = ymax - ymin
+
+    # If lines are provided instead of words, concatenate the word entries
+    if "words" in entry:
+        word_text = " ".join(word["value"] for word in entry["words"])
+    else:
+        word_text = entry["value"]
+    # Find the optimal font size
+    try:
+        font_size = min(word_height, max_font_size)
+        font = get_font(font_family, font_size)
+        text_width, text_height = font.getbbox(word_text)[2:4]
+
+        while (text_width > word_width or text_height > word_height) and font_size > min_font_size:
+            font_size = max(int(font_size * smoothing_factor), min_font_size)
+            font = get_font(font_family, font_size)
+            text_width, text_height = font.getbbox(word_text)[2:4]
+    except ValueError:
+        font = get_font(font_family, min_font_size)
+
+    # Create a mask for the word
+    mask = Image.new("L", (w, h), 0)
+    ImageDraw.Draw(mask).polygon([(int(round(w * x)), int(round(h * y))) for x, y in polygon], fill=255)
+
+    # Draw the word text
+    d = ImageDraw.Draw(response)
+    try:
+        try:
+            d.text((xmin, ymin), word_text, font=font, fill=(0, 0, 0), anchor="lt")
+        except UnicodeEncodeError:
+            d.text((xmin, ymin), anyascii(word_text), font=font, fill=(0, 0, 0), anchor="lt")
+    # Catch generic exceptions to avoid crashing the whole rendering
+    except Exception:  # pragma: no cover
+        logging.warning(f"Could not render word: {word_text}")
+
+    if draw_proba:
+        confidence = (
+            entry["confidence"]
+            if "confidence" in entry
+            else sum(w["confidence"] for w in entry["words"]) / len(entry["words"])
+        )
+        p = int(255 * confidence)
+        color = (255 - p, 0, p)  # Red to blue gradient based on probability
+        d.rectangle([(xmin, ymin), (xmax, ymax)], outline=color, width=2)
+
+        prob_font = get_font(font_family, 20)
+        prob_text = f"{confidence:.2f}"
+        prob_text_width, prob_text_height = prob_font.getbbox(prob_text)[2:4]
+
+        # Position the probability slightly above the bounding box
+        prob_x_offset = (word_width - prob_text_width) // 2
+        prob_y_offset = ymin - prob_text_height - 2
+        prob_y_offset = max(0, prob_y_offset)
+
+        d.text((xmin + prob_x_offset, prob_y_offset), prob_text, font=prob_font, fill=color, anchor="lt")
+
+    return response
+
+
+def synthesize_page(
+    page: Dict[str, Any],
+    draw_proba: bool = False,
+    font_family: Optional[str] = None,
+    smoothing_factor: float = 0.95,
+    min_font_size: int = 8,
+    max_font_size: int = 50,
+) -> np.ndarray:
+    """Draw a the content of the element page (OCR response) on a blank page.
+
+    Args:
+    ----
+        page: exported Page object to represent
+        draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
+        font_family: family of the font
+        smoothing_factor: factor to smooth the font size
+        min_font_size: minimum font size
+        max_font_size: maximum font size
+
+    Returns:
+    -------
+        the synthesized page
+    """
+    # Draw template
+    h, w = page["dimensions"]
+    response = Image.new("RGB", (w, h), color=(255, 255, 255))
+
+    for block in page["blocks"]:
+        # If lines are provided use these to get better rendering results
+        if len(block["lines"]) > 1:
+            for line in block["lines"]:
+                _warn_rotation(block)  # pragma: no cover
+                response = _synthesize(
+                    response=response,
+                    entry=line,
+                    w=w,
+                    h=h,
+                    draw_proba=draw_proba,
+                    font_family=font_family,
+                    smoothing_factor=smoothing_factor,
+                    min_font_size=min_font_size,
+                    max_font_size=max_font_size,
+                )
+        # Otherwise, draw each word
+        else:
+            for line in block["lines"]:
+                _warn_rotation(block)  # pragma: no cover
+                for word in line["words"]:
+                    response = _synthesize(
+                        response=response,
+                        entry=word,
+                        w=w,
+                        h=h,
+                        draw_proba=draw_proba,
+                        font_family=font_family,
+                        smoothing_factor=smoothing_factor,
+                        min_font_size=min_font_size,
+                        max_font_size=max_font_size,
+                    )
+
+    return np.array(response, dtype=np.uint8)
+
+
+def synthesize_kie_page(
+    page: Dict[str, Any],
+    draw_proba: bool = False,
+    font_family: Optional[str] = None,
+) -> np.ndarray:
+    """Draw a the content of the element page (OCR response) on a blank page.
+
+    Args:
+    ----
+        page: exported Page object to represent
+        draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
+        font_family: family of the font
+        smoothing_factor: factor to smooth the font size
+        min_font_size: minimum font size
+        max_font_size: maximum font size
+
+    Returns:
+    -------
+        the synthesized page
+    """
+    # Draw template
+    h, w = page["dimensions"]
+    response = Image.new("RGB", (w, h), color=(255, 255, 255))
+
+    # Draw each word
+    for predictions in page["predictions"].values():
+        for prediction in predictions:
+            _warn_rotation(prediction)  # pragma: no cover
+            response = _synthesize(
+                response=response,
+                entry=prediction,
+                w=w,
+                h=h,
+                draw_proba=draw_proba,
+                font_family=font_family,
+            )
+    return np.array(response, dtype=np.uint8)
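For orientation, here is a minimal usage sketch of the `synthesize_page` helper added above. The `page` dict is hand-built to match the fields the function reads in this diff (`dimensions`, `blocks`, `lines`, `words` with relative geometries and confidences); the values themselves are invented for illustration, and a docTR 0.10.0 install with one of its deep-learning backends is assumed.

```python
# Hedged sketch of doctr.utils.reconstitution.synthesize_page (new in 0.10.0).
# The page structure mirrors what the function above consumes; values are made up.
import numpy as np

from doctr.utils.reconstitution import synthesize_page

page = {
    "dimensions": (200, 600),  # (h, w) of the page to render, in pixels
    "blocks": [
        {
            "geometry": ((0.05, 0.2), (0.60, 0.5)),  # read by _warn_rotation
            "lines": [
                {
                    "geometry": ((0.05, 0.2), (0.60, 0.5)),
                    "words": [
                        {"value": "Hello", "confidence": 0.98, "geometry": ((0.05, 0.2), (0.30, 0.5))},
                        {"value": "world", "confidence": 0.91, "geometry": ((0.32, 0.2), (0.60, 0.5))},
                    ],
                }
            ],
        }
    ],
}

out = synthesize_page(page, draw_proba=False)
# A white RGB canvas with the words re-drawn at their relative positions
assert out.shape == (200, 600, 3) and out.dtype == np.uint8
```

In practice such a dict would come from an OCR predictor's exported result rather than being written by hand.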
doctr/utils/visualization.py
CHANGED
@@ -9,16 +9,12 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 import cv2
 import matplotlib.patches as patches
 import matplotlib.pyplot as plt
-import mplcursors
 import numpy as np
 from matplotlib.figure import Figure
-from PIL import Image, ImageDraw
-from unidecode import unidecode
 
 from .common_types import BoundingBox, Polygon4P
-from .fonts import get_font
 
-__all__ = ["visualize_page", "
+__all__ = ["visualize_page", "visualize_kie_page", "draw_boxes"]
 
 
 def rect_patch(
@@ -281,6 +277,8 @@ def visualize_page(
                     artists.append(rect)
 
     if interactive:
+        import mplcursors
+
         # Create mlp Cursor to hover patches in artists
         mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))
     fig.tight_layout(pad=0.0)
@@ -288,63 +286,6 @@ def visualize_page(
     return fig
 
 
-def synthesize_page(
-    page: Dict[str, Any],
-    draw_proba: bool = False,
-    font_family: Optional[str] = None,
-) -> np.ndarray:
-    """Draw a the content of the element page (OCR response) on a blank page.
-
-    Args:
-    ----
-        page: exported Page object to represent
-        draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
-        font_size: size of the font, default font = 13
-        font_family: family of the font
-
-    Returns:
-    -------
-        the synthesized page
-    """
-    # Draw template
-    h, w = page["dimensions"]
-    response = 255 * np.ones((h, w, 3), dtype=np.int32)
-
-    # Draw each word
-    for block in page["blocks"]:
-        for line in block["lines"]:
-            for word in line["words"]:
-                # Get aboslute word geometry
-                (xmin, ymin), (xmax, ymax) = word["geometry"]
-                xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
-                ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
-
-                # White drawing context adapted to font size, 0.75 factor to convert pts --> pix
-                font = get_font(font_family, int(0.75 * (ymax - ymin)))
-                img = Image.new("RGB", (xmax - xmin, ymax - ymin), color=(255, 255, 255))
-                d = ImageDraw.Draw(img)
-                # Draw in black the value of the word
-                try:
-                    d.text((0, 0), word["value"], font=font, fill=(0, 0, 0))
-                except UnicodeEncodeError:
-                    # When character cannot be encoded, use its unidecode version
-                    d.text((0, 0), unidecode(word["value"]), font=font, fill=(0, 0, 0))
-
-                # Colorize if draw_proba
-                if draw_proba:
-                    p = int(255 * word["confidence"])
-                    mask = np.where(np.array(img) == 0, 1, 0)
-                    proba: np.ndarray = np.array([255 - p, 0, p])
-                    color = mask * proba[np.newaxis, np.newaxis, :]
-                    white_mask = 255 * (1 - mask)
-                    img = color + white_mask
-
-                # Write to response page
-                response[ymin:ymax, xmin:xmax, :] = np.array(img)
-
-    return response
-
-
 def visualize_kie_page(
     page: Dict[str, Any],
     image: np.ndarray,
@@ -413,6 +354,8 @@ def visualize_kie_page(
                     artists.append(rect)
 
     if interactive:
+        import mplcursors
+
         # Create mlp Cursor to hover patches in artists
         mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))
     fig.tight_layout(pad=0.0)
@@ -420,62 +363,6 @@ def visualize_kie_page(
     return fig
 
 
-def synthesize_kie_page(
-    page: Dict[str, Any],
-    draw_proba: bool = False,
-    font_family: Optional[str] = None,
-) -> np.ndarray:
-    """Draw a the content of the element page (OCR response) on a blank page.
-
-    Args:
-    ----
-        page: exported Page object to represent
-        draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
-        font_size: size of the font, default font = 13
-        font_family: family of the font
-
-    Returns:
-    -------
-        the synthesized page
-    """
-    # Draw template
-    h, w = page["dimensions"]
-    response = 255 * np.ones((h, w, 3), dtype=np.int32)
-
-    # Draw each word
-    for predictions in page["predictions"].values():
-        for prediction in predictions:
-            # Get aboslute word geometry
-            (xmin, ymin), (xmax, ymax) = prediction["geometry"]
-            xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
-            ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
-
-            # White drawing context adapted to font size, 0.75 factor to convert pts --> pix
-            font = get_font(font_family, int(0.75 * (ymax - ymin)))
-            img = Image.new("RGB", (xmax - xmin, ymax - ymin), color=(255, 255, 255))
-            d = ImageDraw.Draw(img)
-            # Draw in black the value of the word
-            try:
-                d.text((0, 0), prediction["value"], font=font, fill=(0, 0, 0))
-            except UnicodeEncodeError:
-                # When character cannot be encoded, use its unidecode version
-                d.text((0, 0), unidecode(prediction["value"]), font=font, fill=(0, 0, 0))
-
-            # Colorize if draw_proba
-            if draw_proba:
-                p = int(255 * prediction["confidence"])
-                mask = np.where(np.array(img) == 0, 1, 0)
-                proba: np.ndarray = np.array([255 - p, 0, p])
-                color = mask * proba[np.newaxis, np.newaxis, :]
-                white_mask = 255 * (1 - mask)
-                img = color + white_mask
-
-            # Write to response page
-            response[ymin:ymax, xmin:xmax, :] = np.array(img)
-
-    return response
-
-
 def draw_boxes(boxes: np.ndarray, image: np.ndarray, color: Optional[Tuple[int, int, int]] = None, **kwargs) -> None:
     """Draw an array of relative straight boxes on an image
 
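As the removed blocks above show, `synthesize_page` and `synthesize_kie_page` no longer live in `doctr.utils.visualization`; they were rewritten in the new `doctr.utils.reconstitution` module, while `visualize_page`, `visualize_kie_page` and `draw_boxes` stay put (and `mplcursors` is now imported lazily, only when `interactive=True`). A hedged migration sketch for downstream imports, assuming only the module paths shown in this diff:

```python
# 0.8.1 (old import path, removed in this diff):
# from doctr.utils.visualization import synthesize_page, synthesize_kie_page

# 0.10.0 (new module added in this diff):
from doctr.utils.reconstitution import synthesize_kie_page, synthesize_page
from doctr.utils.visualization import draw_boxes, visualize_kie_page, visualize_page
```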
doctr/version.py
CHANGED
@@ -1 +1 @@
-__version__ = 'v0.
+__version__ = 'v0.10.0'
{python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: python-doctr
-Version: 0.
+Version: 0.10.0
 Summary: Document Text Recognition (docTR): deep Learning for high-performance OCR on documents.
 Author-email: Mindee <contact@mindee.com>
 Maintainer: François-Guillaume Fernandez, Charles Gaillard, Olivier Dulcy, Felix Dittrich
@@ -219,87 +219,93 @@ Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Natural Language :: English
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Requires-Python: <4,>=3.
+Requires-Python: <4,>=3.9.0
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-
-Requires-Dist:
-Requires-Dist: unidecode >=1.0.0
-Requires-Dist: tqdm >=4.30.0
+Requires-Dist: numpy<3.0.0,>=1.16.0
+Requires-Dist: scipy<2.0.0,>=1.4.0
+Requires-Dist: h5py<4.0.0,>=3.1.0
+Requires-Dist: opencv-python<5.0.0,>=4.5.0
+Requires-Dist: pypdfium2<5.0.0,>=4.11.0
+Requires-Dist: pyclipper<2.0.0,>=1.2.0
+Requires-Dist: shapely<3.0.0,>=1.6.0
+Requires-Dist: langdetect<2.0.0,>=1.0.9
+Requires-Dist: rapidfuzz<4.0.0,>=3.0.0
+Requires-Dist: huggingface-hub<1.0.0,>=0.20.0
+Requires-Dist: Pillow>=9.2.0
+Requires-Dist: defusedxml>=0.7.0
+Requires-Dist: anyascii>=0.3.2
+Requires-Dist: tqdm>=4.30.0
+Provides-Extra: contrib
+Requires-Dist: onnxruntime>=1.11.0; extra == "contrib"
 Provides-Extra: dev
-Requires-Dist: tensorflow
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: sphinx-
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: tensorflow<3.0.0,>=2.15.0; extra == "dev"
+Requires-Dist: tf-keras<3.0.0,>=2.15.0; extra == "dev"
+Requires-Dist: tf2onnx<2.0.0,>=1.16.0; extra == "dev"
+Requires-Dist: torch<3.0.0,>=2.0.0; extra == "dev"
+Requires-Dist: torchvision>=0.15.0; extra == "dev"
+Requires-Dist: onnx<3.0.0,>=1.12.0; extra == "dev"
+Requires-Dist: weasyprint>=55.0; extra == "dev"
+Requires-Dist: matplotlib>=3.1.0; extra == "dev"
+Requires-Dist: mplcursors>=0.3; extra == "dev"
+Requires-Dist: pytest>=5.3.2; extra == "dev"
+Requires-Dist: coverage[toml]>=4.5.4; extra == "dev"
+Requires-Dist: onnxruntime>=1.11.0; extra == "dev"
+Requires-Dist: requests>=2.20.0; extra == "dev"
+Requires-Dist: psutil>=5.9.5; extra == "dev"
+Requires-Dist: ruff>=0.3.0; extra == "dev"
+Requires-Dist: mypy>=1.0; extra == "dev"
+Requires-Dist: pre-commit>=3.0.0; extra == "dev"
+Requires-Dist: sphinx!=3.5.0,>=3.0.0; extra == "dev"
+Requires-Dist: sphinxemoji>=0.1.8; extra == "dev"
+Requires-Dist: sphinx-copybutton>=0.3.1; extra == "dev"
+Requires-Dist: docutils<0.22; extra == "dev"
+Requires-Dist: recommonmark>=0.7.1; extra == "dev"
+Requires-Dist: sphinx-markdown-tables>=0.0.15; extra == "dev"
+Requires-Dist: sphinx-tabs>=3.3.0; extra == "dev"
+Requires-Dist: furo>=2022.3.4; extra == "dev"
 Provides-Extra: docs
-Requires-Dist: sphinx
-Requires-Dist: sphinxemoji
-Requires-Dist: sphinx-copybutton
-Requires-Dist: docutils
-Requires-Dist: recommonmark
-Requires-Dist: sphinx-markdown-tables
-Requires-Dist: sphinx-tabs
-Requires-Dist: furo
+Requires-Dist: sphinx!=3.5.0,>=3.0.0; extra == "docs"
+Requires-Dist: sphinxemoji>=0.1.8; extra == "docs"
+Requires-Dist: sphinx-copybutton>=0.3.1; extra == "docs"
+Requires-Dist: docutils<0.22; extra == "docs"
+Requires-Dist: recommonmark>=0.7.1; extra == "docs"
+Requires-Dist: sphinx-markdown-tables>=0.0.15; extra == "docs"
+Requires-Dist: sphinx-tabs>=3.3.0; extra == "docs"
+Requires-Dist: furo>=2022.3.4; extra == "docs"
+Provides-Extra: html
+Requires-Dist: weasyprint>=55.0; extra == "html"
 Provides-Extra: quality
-Requires-Dist: ruff
-Requires-Dist: mypy
-Requires-Dist: pre-commit
+Requires-Dist: ruff>=0.1.5; extra == "quality"
+Requires-Dist: mypy>=0.812; extra == "quality"
+Requires-Dist: pre-commit>=2.17.0; extra == "quality"
 Provides-Extra: testing
-Requires-Dist: pytest
-Requires-Dist: coverage[toml]
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: psutil >=5.9.5 ; extra == 'testing'
+Requires-Dist: pytest>=5.3.2; extra == "testing"
+Requires-Dist: coverage[toml]>=4.5.4; extra == "testing"
+Requires-Dist: onnxruntime>=1.11.0; extra == "testing"
+Requires-Dist: requests>=2.20.0; extra == "testing"
+Requires-Dist: psutil>=5.9.5; extra == "testing"
 Provides-Extra: tf
-Requires-Dist: tensorflow
-Requires-Dist:
+Requires-Dist: tensorflow<3.0.0,>=2.15.0; extra == "tf"
+Requires-Dist: tf-keras<3.0.0,>=2.15.0; extra == "tf"
+Requires-Dist: tf2onnx<2.0.0,>=1.16.0; extra == "tf"
 Provides-Extra: torch
-Requires-Dist: torch
-Requires-Dist: torchvision
-Requires-Dist: onnx
+Requires-Dist: torch<3.0.0,>=2.0.0; extra == "torch"
+Requires-Dist: torchvision>=0.15.0; extra == "torch"
+Requires-Dist: onnx<3.0.0,>=1.12.0; extra == "torch"
+Provides-Extra: viz
+Requires-Dist: matplotlib>=3.1.0; extra == "viz"
+Requires-Dist: mplcursors>=0.3; extra == "viz"
 
 <p align="center">
   <img src="https://github.com/mindee/doctr/raw/main/docs/images/Logo_doctr.gif" width="40%">
 </p>
 
-[](https://slack.mindee.com) [](LICENSE)  [](https://github.com/mindee/doctr/pkgs/container/doctr) [](https://codecov.io/gh/mindee/doctr) [](https://www.codefactor.io/repository/github/mindee/doctr) [](https://app.codacy.com/gh/mindee/doctr?utm_source=github.com&utm_medium=referral&utm_content=mindee/doctr&utm_campaign=Badge_Grade) [](https://mindee.github.io/doctr)
+[](https://slack.mindee.com) [](LICENSE)  [](https://github.com/mindee/doctr/pkgs/container/doctr) [](https://codecov.io/gh/mindee/doctr) [](https://www.codefactor.io/repository/github/mindee/doctr) [](https://app.codacy.com/gh/mindee/doctr?utm_source=github.com&utm_medium=referral&utm_content=mindee/doctr&utm_campaign=Badge_Grade) [](https://mindee.github.io/doctr) [](https://pypi.org/project/python-doctr/) [](https://huggingface.co/spaces/mindee/doctr) [](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb)
 
 
 **Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch**
@@ -334,7 +340,7 @@ from doctr.io import DocumentFile
 pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
 # Image
 single_img_doc = DocumentFile.from_images("path/to/your/img.jpg")
-# Webpage
+# Webpage (requires `weasyprint` to be installed)
 webpage_doc = DocumentFile.from_url("https://www.yoursite.com")
 # Multiple page images
 multi_img_doc = DocumentFile.from_images(["path/to/page1.jpg", "path/to/page2.jpg"])
@@ -372,6 +378,7 @@ If both options are set to False, the predictor will always fit and return rotat
 To interpret your model's predictions, you can visualize them interactively as follows:
 
 ```python
+# Display the result (requires matplotlib & mplcursors to be installed)
 result.show()
 ```
 
@@ -431,17 +438,7 @@ The KIE predictor results per page are in a dictionary format with each key repr
 
 ### Prerequisites
 
-Python 3.
-
-Since we use [weasyprint](https://weasyprint.org/), you will need extra dependencies if you are not running Linux.
-
-For MacOS users, you can install them as follows:
-
-```shell
-brew install cairo pango gdk-pixbuf libffi
-```
-
-For Windows users, those dependencies are included in GTK. You can find the latest installer over [here](https://github.com/tschoonj/GTK-for-Windows-Runtime-Environment-Installer/releases).
+Python 3.9 (or higher) and [pip](https://pip.pypa.io/en/stable/) are required to install docTR.
 
 ### Latest release
 
@@ -460,12 +457,14 @@ We try to keep framework-specific dependencies to a minimum. You can install fra
 pip install "python-doctr[tf]"
 # for PyTorch
 pip install "python-doctr[torch]"
+# optional dependencies for visualization, html, and contrib modules can be installed as follows:
+pip install "python-doctr[torch,viz,html,contib]"
 ```
 
 For MacBooks with M1 chip, you will need some additional packages or specific versions:
 
 - TensorFlow 2: [metal plugin](https://developer.apple.com/metal/tensorflow-plugin/)
-- PyTorch: [version >=
+- PyTorch: [version >= 2.0.0](https://pytorch.org/get-started/locally/#start-locally)
 
 ### Developer mode
 
@@ -647,9 +646,14 @@ Your API should now be running locally on your port 8002. Access your automatica
 
 ```python
 import requests
+
+params = {"det_arch": "db_resnet50", "reco_arch": "crnn_vgg16_bn"}
+
 with open('/path/to/your/doc.jpg', 'rb') as f:
-
-
+    files = [  # application/pdf, image/jpeg, image/png supported
+        ("files", ("doc.jpg", f.read(), "image/jpeg")),
+    ]
+print(requests.post("http://localhost:8080/ocr", params=params, files=files).json())
 ```
 
 ### Example notebooks