python-doctr 0.10.0__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/contrib/__init__.py +1 -0
- doctr/contrib/artefacts.py +7 -9
- doctr/contrib/base.py +8 -17
- doctr/datasets/__init__.py +1 -0
- doctr/datasets/coco_text.py +139 -0
- doctr/datasets/cord.py +10 -8
- doctr/datasets/datasets/__init__.py +4 -4
- doctr/datasets/datasets/base.py +16 -16
- doctr/datasets/datasets/pytorch.py +12 -12
- doctr/datasets/datasets/tensorflow.py +10 -10
- doctr/datasets/detection.py +6 -9
- doctr/datasets/doc_artefacts.py +3 -4
- doctr/datasets/funsd.py +9 -8
- doctr/datasets/generator/__init__.py +4 -4
- doctr/datasets/generator/base.py +16 -17
- doctr/datasets/generator/pytorch.py +1 -3
- doctr/datasets/generator/tensorflow.py +1 -3
- doctr/datasets/ic03.py +5 -6
- doctr/datasets/ic13.py +6 -6
- doctr/datasets/iiit5k.py +10 -6
- doctr/datasets/iiithws.py +4 -5
- doctr/datasets/imgur5k.py +15 -7
- doctr/datasets/loader.py +4 -7
- doctr/datasets/mjsynth.py +6 -5
- doctr/datasets/ocr.py +3 -4
- doctr/datasets/orientation.py +3 -4
- doctr/datasets/recognition.py +4 -5
- doctr/datasets/sroie.py +6 -5
- doctr/datasets/svhn.py +7 -6
- doctr/datasets/svt.py +6 -7
- doctr/datasets/synthtext.py +19 -7
- doctr/datasets/utils.py +41 -35
- doctr/datasets/vocabs.py +1107 -49
- doctr/datasets/wildreceipt.py +14 -10
- doctr/file_utils.py +11 -7
- doctr/io/elements.py +96 -82
- doctr/io/html.py +1 -3
- doctr/io/image/__init__.py +3 -3
- doctr/io/image/base.py +2 -5
- doctr/io/image/pytorch.py +3 -12
- doctr/io/image/tensorflow.py +2 -11
- doctr/io/pdf.py +5 -7
- doctr/io/reader.py +5 -11
- doctr/models/_utils.py +15 -23
- doctr/models/builder.py +30 -48
- doctr/models/classification/__init__.py +1 -0
- doctr/models/classification/magc_resnet/__init__.py +3 -3
- doctr/models/classification/magc_resnet/pytorch.py +11 -15
- doctr/models/classification/magc_resnet/tensorflow.py +11 -14
- doctr/models/classification/mobilenet/__init__.py +3 -3
- doctr/models/classification/mobilenet/pytorch.py +20 -18
- doctr/models/classification/mobilenet/tensorflow.py +19 -23
- doctr/models/classification/predictor/__init__.py +4 -4
- doctr/models/classification/predictor/pytorch.py +7 -9
- doctr/models/classification/predictor/tensorflow.py +6 -8
- doctr/models/classification/resnet/__init__.py +4 -4
- doctr/models/classification/resnet/pytorch.py +47 -34
- doctr/models/classification/resnet/tensorflow.py +45 -35
- doctr/models/classification/textnet/__init__.py +3 -3
- doctr/models/classification/textnet/pytorch.py +20 -18
- doctr/models/classification/textnet/tensorflow.py +19 -17
- doctr/models/classification/vgg/__init__.py +3 -3
- doctr/models/classification/vgg/pytorch.py +21 -8
- doctr/models/classification/vgg/tensorflow.py +20 -14
- doctr/models/classification/vip/__init__.py +4 -0
- doctr/models/classification/vip/layers/__init__.py +4 -0
- doctr/models/classification/vip/layers/pytorch.py +615 -0
- doctr/models/classification/vip/pytorch.py +505 -0
- doctr/models/classification/vit/__init__.py +3 -3
- doctr/models/classification/vit/pytorch.py +18 -15
- doctr/models/classification/vit/tensorflow.py +15 -12
- doctr/models/classification/zoo.py +23 -14
- doctr/models/core.py +3 -3
- doctr/models/detection/_utils/__init__.py +4 -4
- doctr/models/detection/_utils/base.py +4 -7
- doctr/models/detection/_utils/pytorch.py +1 -5
- doctr/models/detection/_utils/tensorflow.py +1 -5
- doctr/models/detection/core.py +2 -8
- doctr/models/detection/differentiable_binarization/__init__.py +4 -4
- doctr/models/detection/differentiable_binarization/base.py +10 -21
- doctr/models/detection/differentiable_binarization/pytorch.py +37 -31
- doctr/models/detection/differentiable_binarization/tensorflow.py +26 -29
- doctr/models/detection/fast/__init__.py +4 -4
- doctr/models/detection/fast/base.py +8 -17
- doctr/models/detection/fast/pytorch.py +37 -35
- doctr/models/detection/fast/tensorflow.py +24 -28
- doctr/models/detection/linknet/__init__.py +4 -4
- doctr/models/detection/linknet/base.py +8 -18
- doctr/models/detection/linknet/pytorch.py +34 -28
- doctr/models/detection/linknet/tensorflow.py +24 -25
- doctr/models/detection/predictor/__init__.py +5 -5
- doctr/models/detection/predictor/pytorch.py +6 -7
- doctr/models/detection/predictor/tensorflow.py +5 -6
- doctr/models/detection/zoo.py +27 -7
- doctr/models/factory/hub.py +6 -10
- doctr/models/kie_predictor/__init__.py +5 -5
- doctr/models/kie_predictor/base.py +4 -5
- doctr/models/kie_predictor/pytorch.py +19 -20
- doctr/models/kie_predictor/tensorflow.py +14 -15
- doctr/models/modules/layers/__init__.py +3 -3
- doctr/models/modules/layers/pytorch.py +55 -10
- doctr/models/modules/layers/tensorflow.py +5 -7
- doctr/models/modules/transformer/__init__.py +3 -3
- doctr/models/modules/transformer/pytorch.py +12 -13
- doctr/models/modules/transformer/tensorflow.py +9 -10
- doctr/models/modules/vision_transformer/__init__.py +3 -3
- doctr/models/modules/vision_transformer/pytorch.py +2 -3
- doctr/models/modules/vision_transformer/tensorflow.py +3 -3
- doctr/models/predictor/__init__.py +5 -5
- doctr/models/predictor/base.py +28 -29
- doctr/models/predictor/pytorch.py +13 -14
- doctr/models/predictor/tensorflow.py +9 -10
- doctr/models/preprocessor/__init__.py +4 -4
- doctr/models/preprocessor/pytorch.py +13 -17
- doctr/models/preprocessor/tensorflow.py +10 -14
- doctr/models/recognition/__init__.py +1 -0
- doctr/models/recognition/core.py +3 -7
- doctr/models/recognition/crnn/__init__.py +4 -4
- doctr/models/recognition/crnn/pytorch.py +30 -29
- doctr/models/recognition/crnn/tensorflow.py +21 -24
- doctr/models/recognition/master/__init__.py +3 -3
- doctr/models/recognition/master/base.py +3 -7
- doctr/models/recognition/master/pytorch.py +32 -25
- doctr/models/recognition/master/tensorflow.py +22 -25
- doctr/models/recognition/parseq/__init__.py +3 -3
- doctr/models/recognition/parseq/base.py +3 -7
- doctr/models/recognition/parseq/pytorch.py +47 -29
- doctr/models/recognition/parseq/tensorflow.py +29 -27
- doctr/models/recognition/predictor/__init__.py +5 -5
- doctr/models/recognition/predictor/_utils.py +111 -52
- doctr/models/recognition/predictor/pytorch.py +9 -9
- doctr/models/recognition/predictor/tensorflow.py +8 -9
- doctr/models/recognition/sar/__init__.py +4 -4
- doctr/models/recognition/sar/pytorch.py +30 -22
- doctr/models/recognition/sar/tensorflow.py +22 -24
- doctr/models/recognition/utils.py +57 -53
- doctr/models/recognition/viptr/__init__.py +4 -0
- doctr/models/recognition/viptr/pytorch.py +277 -0
- doctr/models/recognition/vitstr/__init__.py +4 -4
- doctr/models/recognition/vitstr/base.py +3 -7
- doctr/models/recognition/vitstr/pytorch.py +28 -21
- doctr/models/recognition/vitstr/tensorflow.py +22 -23
- doctr/models/recognition/zoo.py +27 -11
- doctr/models/utils/__init__.py +4 -4
- doctr/models/utils/pytorch.py +41 -34
- doctr/models/utils/tensorflow.py +31 -23
- doctr/models/zoo.py +1 -5
- doctr/transforms/functional/__init__.py +3 -3
- doctr/transforms/functional/base.py +4 -11
- doctr/transforms/functional/pytorch.py +20 -28
- doctr/transforms/functional/tensorflow.py +10 -22
- doctr/transforms/modules/__init__.py +4 -4
- doctr/transforms/modules/base.py +48 -55
- doctr/transforms/modules/pytorch.py +58 -22
- doctr/transforms/modules/tensorflow.py +18 -32
- doctr/utils/common_types.py +8 -9
- doctr/utils/data.py +9 -13
- doctr/utils/fonts.py +2 -7
- doctr/utils/geometry.py +17 -48
- doctr/utils/metrics.py +17 -37
- doctr/utils/multithreading.py +4 -6
- doctr/utils/reconstitution.py +9 -13
- doctr/utils/repr.py +2 -3
- doctr/utils/visualization.py +16 -29
- doctr/version.py +1 -1
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/METADATA +70 -52
- python_doctr-0.12.0.dist-info/RECORD +180 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/WHEEL +1 -1
- python_doctr-0.10.0.dist-info/RECORD +0 -173
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info/licenses}/LICENSE +0 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/zip-safe +0 -0
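Most of the files above carry the same mechanical change, visible in the representative diffs below: deprecated `typing` aliases are swapped for built-in generics (PEP 585) and `X | None` unions (PEP 604), copyright headers are bumped to 2025, and the numpydoc-style `----`/`-------` underlines under `Args:`/`Returns:` are dropped from docstrings. An illustrative before/after sketch of the annotation change (function names here are made up, not from the package):

# Style up to 0.10.0: typing aliases
from typing import Dict, List, Optional, Tuple

def legacy(boxes: List[Tuple[int, int]], meta: Optional[Dict[str, float]] = None) -> Optional[List[int]]:
    return None

# Style from 0.12.0: built-in generics and unions, which require Python >= 3.10
# (or postponed evaluation of annotations) since they are evaluated at def time
def modern(boxes: list[tuple[int, int]], meta: dict[str, float] | None = None) -> list[int] | None:
    return None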
doctr/io/image/pytorch.py
CHANGED
@@ -1,10 +1,9 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from io import BytesIO
-from typing import Tuple
 
 import numpy as np
 import torch
@@ -20,12 +19,10 @@ def tensor_from_pil(pil_img: Image.Image, dtype: torch.dtype = torch.float32) -> torch.Tensor:
     """Convert a PIL Image to a PyTorch tensor
 
     Args:
-    ----
         pil_img: a PIL image
         dtype: the output tensor data type
 
     Returns:
-    -------
         decoded image as tensor
     """
     if dtype == torch.float32:
@@ -40,12 +37,10 @@ def read_img_as_tensor(img_path: AbstractPath, dtype: torch.dtype = torch.float32) -> torch.Tensor:
     """Read an image file as a PyTorch tensor
 
     Args:
-    ----
         img_path: location of the image file
         dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
 
     Returns:
-    -------
         decoded image as a tensor
     """
     if dtype not in (torch.uint8, torch.float16, torch.float32):
@@ -59,12 +54,10 @@ def decode_img_as_tensor(img_content: bytes, dtype: torch.dtype = torch.float32) -> torch.Tensor:
     """Read a byte stream as a PyTorch tensor
 
     Args:
-    ----
         img_content: bytes of a decoded image
         dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
 
     Returns:
-    -------
         decoded image as a tensor
     """
     if dtype not in (torch.uint8, torch.float16, torch.float32):
@@ -78,12 +71,10 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: torch.dtype = torch.float32) -> torch.Tensor:
     """Read an image file as a PyTorch tensor
 
     Args:
-    ----
         npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
         dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
 
     Returns:
-    -------
         same image as a tensor of shape (C, H, W)
     """
     if dtype not in (torch.uint8, torch.float16, torch.float32):
@@ -102,6 +93,6 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: torch.dtype = torch.float32) -> torch.Tensor:
     return img
 
 
-def get_img_shape(img: torch.Tensor) -> Tuple[int, int]:
+def get_img_shape(img: torch.Tensor) -> tuple[int, int]:
     """Get the shape of an image"""
-    return img.shape[-2:]
+    return img.shape[-2:]
doctr/io/image/tensorflow.py
CHANGED
@@ -1,9 +1,8 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Tuple
 
 import numpy as np
 import tensorflow as tf
@@ -19,12 +18,10 @@ def tensor_from_pil(pil_img: Image.Image, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
     """Convert a PIL Image to a TensorFlow tensor
 
     Args:
-    ----
         pil_img: a PIL image
         dtype: the output tensor data type
 
     Returns:
-    -------
         decoded image as tensor
     """
     npy_img = img_to_array(pil_img)
@@ -36,12 +33,10 @@ def read_img_as_tensor(img_path: AbstractPath, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
     """Read an image file as a TensorFlow tensor
 
     Args:
-    ----
         img_path: location of the image file
         dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
 
     Returns:
-    -------
         decoded image as a tensor
     """
     if dtype not in (tf.uint8, tf.float16, tf.float32):
@@ -61,12 +56,10 @@ def decode_img_as_tensor(img_content: bytes, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
     """Read a byte stream as a TensorFlow tensor
 
     Args:
-    ----
         img_content: bytes of a decoded image
         dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
 
     Returns:
-    -------
         decoded image as a tensor
     """
     if dtype not in (tf.uint8, tf.float16, tf.float32):
@@ -85,12 +78,10 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
     """Read an image file as a TensorFlow tensor
 
     Args:
-    ----
         npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
         dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
 
     Returns:
-    -------
         same image as a tensor of shape (H, W, C)
     """
     if dtype not in (tf.uint8, tf.float16, tf.float32):
@@ -105,6 +96,6 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
     return img
 
 
-def get_img_shape(img: tf.Tensor) -> Tuple[int, int]:
+def get_img_shape(img: tf.Tensor) -> tuple[int, int]:
     """Get the shape of an image"""
     return img.shape[:2]
doctr/io/pdf.py
CHANGED
@@ -1,9 +1,9 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any, List, Optional
+from typing import Any
 
 import numpy as np
 import pypdfium2 as pdfium
@@ -15,18 +15,17 @@ __all__ = ["read_pdf"]
 
 def read_pdf(
     file: AbstractFile,
-    scale: float = 2,
+    scale: int = 2,
     rgb_mode: bool = True,
-    password: Optional[str] = None,
+    password: str | None = None,
     **kwargs: Any,
-) -> List[np.ndarray]:
+) -> list[np.ndarray]:
     """Read a PDF file and convert it into an image in numpy format
 
     >>> from doctr.io import read_pdf
     >>> doc = read_pdf("path/to/your/doc.pdf")
 
     Args:
-    ----
         file: the path to the PDF file
         scale: rendering scale (1 corresponds to 72dpi)
         rgb_mode: if True, the output will be RGB, otherwise BGR
@@ -34,7 +33,6 @@ def read_pdf(
         **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`
 
     Returns:
-    -------
         the list of pages decoded as numpy ndarray of shape H x W x C
     """
     # Rasterise pages to numpy ndarrays with pypdfium2
doctr/io/reader.py
CHANGED
@@ -1,10 +1,10 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
+from collections.abc import Sequence
 from pathlib import Path
-from typing import List, Sequence, Union
 
 import numpy as np
 
@@ -22,37 +22,33 @@ class DocumentFile:
     """Read a document from multiple extensions"""
 
     @classmethod
-    def from_pdf(cls, file: AbstractFile, **kwargs) -> List[np.ndarray]:
+    def from_pdf(cls, file: AbstractFile, **kwargs) -> list[np.ndarray]:
         """Read a PDF file
 
         >>> from doctr.io import DocumentFile
         >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
 
         Args:
-        ----
             file: the path to the PDF file or a binary stream
             **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`
 
         Returns:
-        -------
             the list of pages decoded as numpy ndarray of shape H x W x 3
         """
         return read_pdf(file, **kwargs)
 
     @classmethod
-    def from_url(cls, url: str, **kwargs) -> List[np.ndarray]:
+    def from_url(cls, url: str, **kwargs) -> list[np.ndarray]:
         """Interpret a web page as a PDF document
 
         >>> from doctr.io import DocumentFile
         >>> doc = DocumentFile.from_url("https://www.yoursite.com")
 
         Args:
-        ----
             url: the URL of the target web page
             **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`
 
         Returns:
-        -------
             the list of pages decoded as numpy ndarray of shape H x W x 3
         """
         requires_package(
@@ -64,19 +60,17 @@ class DocumentFile:
         return cls.from_pdf(pdf_stream, **kwargs)
 
     @classmethod
-    def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]:
+    def from_images(cls, files: Sequence[AbstractFile] | AbstractFile, **kwargs) -> list[np.ndarray]:
         """Read an image file (or a collection of image files) and convert it into an image in numpy format
 
         >>> from doctr.io import DocumentFile
         >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
 
         Args:
-        ----
             files: the path to the image file or a binary stream, or a collection of those
             **kwargs: additional parameters to :meth:`doctr.io.image.read_img_as_numpy`
 
         Returns:
-        -------
             the list of pages decoded as numpy ndarray of shape H x W x 3
         """
         if isinstance(files, (str, Path, bytes)):
doctr/models/_utils.py
CHANGED
@@ -1,11 +1,11 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from math import floor
 from statistics import median_low
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any
 
 import cv2
 import numpy as np
@@ -20,11 +20,9 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float:
     """Get the maximum shape ratio of a contour.
 
     Args:
-    ----
         contour: the contour from cv2.findContour
 
     Returns:
-    -------
         the maximum shape ratio
     """
     _, (w, h), _ = cv2.minAreaRect(contour)
@@ -33,7 +31,7 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float:
 
 def estimate_orientation(
     img: np.ndarray,
-    general_page_orientation: Optional[Tuple[int, float]] = None,
+    general_page_orientation: tuple[int, float] | None = None,
     n_ct: int = 70,
     ratio_threshold_for_lines: float = 3,
     min_confidence: float = 0.2,
@@ -43,7 +41,6 @@ def estimate_orientation(
     lines of the document and the assumption that they should be horizontal.
 
     Args:
-    ----
         img: the img or bitmap to analyze (H, W, C)
         general_page_orientation: the general orientation of the page (angle [0, 90, 180, 270 (-90)], confidence)
             estimated by a model
@@ -53,7 +50,6 @@
         lower_area: the minimum area of a contour to be considered
 
     Returns:
-    -------
         the estimated angle of the page (clockwise, negative for left side rotation, positive for right side rotation)
     """
     assert len(img.shape) == 3 and img.shape[-1] in [1, 3], f"Image shape {img.shape} not supported"
@@ -64,13 +60,13 @@
         gray_img = cv2.medianBlur(gray_img, 5)
         thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
     else:
-        thresh = img.astype(np.uint8)
+        thresh = img.astype(np.uint8)
 
     page_orientation, orientation_confidence = general_page_orientation or (None, 0.0)
     if page_orientation and orientation_confidence >= min_confidence:
         # We rotate the image to the general orientation which improves the detection
         # No expand needed bitmap is already padded
-        thresh = rotate_image(thresh, -page_orientation)
+        thresh = rotate_image(thresh, -page_orientation)
     else:  # That's only required if we do not work on the detection models bin map
         # try to merge words in lines
         (h, w) = img.shape[:2]
@@ -91,7 +87,7 @@
 
     angles = []
     for contour in contours[:n_ct]:
-        _, (w, h), angle = cv2.minAreaRect(contour)
+        _, (w, h), angle = cv2.minAreaRect(contour)
         if w / h > ratio_threshold_for_lines:  # select only contours with ratio like lines
             angles.append(angle)
         elif w / h < 1 / ratio_threshold_for_lines:  # if lines are vertical, substract 90 degree
@@ -119,9 +115,9 @@
 
 
 def rectify_crops(
-    crops: List[np.ndarray],
-    orientations: List[int],
-) -> List[np.ndarray]:
+    crops: list[np.ndarray],
+    orientations: list[int],
+) -> list[np.ndarray]:
     """Rotate each crop of the list according to the predicted orientation:
     0: already straight, no rotation
     1: 90 ccw, rotate 3 times ccw
@@ -139,8 +135,8 @@
 
 def rectify_loc_preds(
     page_loc_preds: np.ndarray,
-    orientations: List[int],
-) -> Optional[np.ndarray]:
+    orientations: list[int],
+) -> np.ndarray | None:
     """Orient the quadrangle (Polygon4P) according to the predicted orientation,
     so that the points are in this order: top L, top R, bot R, bot L if the crop is readable
     """
@@ -157,16 +153,14 @@
     )
 
 
-def get_language(text: str) -> Tuple[str, float]:
+def get_language(text: str) -> tuple[str, float]:
     """Get languages of a text using langdetect model.
     Get the language with the highest probability or no language if only a few words or a low probability
 
     Args:
-    ----
         text (str): text
 
     Returns:
-    -------
         The detected language in ISO 639 code and confidence score
     """
     try:
@@ -179,16 +173,14 @@ def get_language(text: str) -> Tuple[str, float]:
 
 
 def invert_data_structure(
-    x: Union[List[Dict[str, Any]], Dict[str, List[Any]]],
-) -> Union[List[Dict[str, Any]], Dict[str, List[Any]]]:
-    """Invert a List of Dict of elements to a Dict of List of elements and the other way around
+    x: list[dict[str, Any]] | dict[str, list[Any]],
+) -> list[dict[str, Any]] | dict[str, list[Any]]:
+    """Invert a list of dict of elements to a dict of list of elements and the other way around
 
     Args:
-    ----
         x: a list of dictionaries with the same keys or a dictionary of lists of the same length
 
     Returns:
-    -------
         dictionary of list when x is a list of dictionaries or a list of dictionaries when x is dictionary of lists
     """
     if isinstance(x, dict):
doctr/models/builder.py
CHANGED
@@ -1,10 +1,10 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any
 
 import numpy as np
 from scipy.cluster.hierarchy import fclusterdata
@@ -20,7 +20,6 @@ class DocumentBuilder(NestedObject):
     """Implements a document builder
 
     Args:
-    ----
         resolve_lines: whether words should be automatically grouped into lines
         resolve_blocks: whether lines should be automatically grouped into blocks
         paragraph_break: relative length of the minimum space separating paragraphs
@@ -41,15 +40,13 @@ class DocumentBuilder(NestedObject):
         self.export_as_straight_boxes = export_as_straight_boxes
 
     @staticmethod
-    def _sort_boxes(boxes: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    def _sort_boxes(boxes: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
         """Sort bounding boxes from top to bottom, left to right
 
         Args:
-        ----
            boxes: bounding boxes of shape (N, 4) or (N, 4, 2) (in case of rotated bbox)
 
        Returns:
-        -------
            tuple: indices of ordered boxes of shape (N,), boxes
                If straight boxes are passed tpo the function, boxes are unchanged
                else: boxes returned are straight boxes fitted to the straightened rotated boxes
@@ -65,16 +62,14 @@ class DocumentBuilder(NestedObject):
         boxes = np.concatenate((boxes.min(1), boxes.max(1)), -1)
         return (boxes[:, 0] + 2 * boxes[:, 3] / np.median(boxes[:, 3] - boxes[:, 1])).argsort(), boxes
 
-    def _resolve_sub_lines(self, boxes: np.ndarray, word_idcs: List[int]) -> List[List[int]]:
+    def _resolve_sub_lines(self, boxes: np.ndarray, word_idcs: list[int]) -> list[list[int]]:
         """Split a line in sub_lines
 
         Args:
-        ----
             boxes: bounding boxes of shape (N, 4)
             word_idcs: list of indexes for the words of the line
 
         Returns:
-        -------
             A list of (sub-)lines computed from the original line (words)
         """
         lines = []
@@ -105,15 +100,13 @@ class DocumentBuilder(NestedObject):
 
         return lines
 
-    def _resolve_lines(self, boxes: np.ndarray) -> List[List[int]]:
+    def _resolve_lines(self, boxes: np.ndarray) -> list[list[int]]:
         """Order boxes to group them in lines
 
         Args:
-        ----
             boxes: bounding boxes of shape (N, 4) or (N, 4, 2) in case of rotated bbox
 
         Returns:
-        -------
             nested list of box indices
         """
         # Sort boxes, and straighten the boxes if they are rotated
@@ -153,16 +146,14 @@ class DocumentBuilder(NestedObject):
         return lines
 
     @staticmethod
-    def _resolve_blocks(boxes: np.ndarray, lines: List[List[int]]) -> List[List[List[int]]]:
+    def _resolve_blocks(boxes: np.ndarray, lines: list[list[int]]) -> list[list[list[int]]]:
         """Order lines to group them in blocks
 
         Args:
-        ----
             boxes: bounding boxes of shape (N, 4) or (N, 4, 2)
             lines: list of lines, each line is a list of idx
 
         Returns:
-        -------
             nested list of box indices
         """
         # Resolve enclosing boxes of lines
@@ -207,7 +198,7 @@ class DocumentBuilder(NestedObject):
         # Compute clusters
         clusters = fclusterdata(box_features, t=0.1, depth=4, criterion="distance", metric="euclidean")
 
-        _blocks: Dict[int, List[int]] = {}
+        _blocks: dict[int, list[int]] = {}
         # Form clusters
         for line_idx, cluster_idx in enumerate(clusters):
             if cluster_idx in _blocks.keys():
@@ -224,13 +215,12 @@
         self,
         boxes: np.ndarray,
         objectness_scores: np.ndarray,
-        word_preds: List[Tuple[str, float]],
-        crop_orientations: List[Dict[str, Any]],
-    ) -> List[Block]:
+        word_preds: list[tuple[str, float]],
+        crop_orientations: list[dict[str, Any]],
+    ) -> list[Block]:
         """Gather independent words in structured blocks
 
         Args:
-        ----
             boxes: bounding boxes of all detected words of the page, of shape (N, 4) or (N, 4, 2)
             objectness_scores: objectness scores of all detected words of the page, of shape N
             word_preds: list of all detected words of the page, of shape N
@@ -238,7 +228,6 @@ class DocumentBuilder(NestedObject):
             the general orientation (orientations + confidences) of the crops
 
         Returns:
-        -------
             list of block elements
         """
         if boxes.shape[0] != len(word_preds):
@@ -295,19 +284,18 @@ class DocumentBuilder(NestedObject):
 
     def __call__(
         self,
-        pages: List[np.ndarray],
-        boxes: List[np.ndarray],
-        objectness_scores: List[np.ndarray],
-        text_preds: List[List[Tuple[str, float]]],
-        page_shapes: List[Tuple[int, int]],
-        crop_orientations: List[Dict[str, Any]],
-        orientations: Optional[List[Dict[str, Any]]] = None,
-        languages: Optional[List[Dict[str, Any]]] = None,
+        pages: list[np.ndarray],
+        boxes: list[np.ndarray],
+        objectness_scores: list[np.ndarray],
+        text_preds: list[list[tuple[str, float]]],
+        page_shapes: list[tuple[int, int]],
+        crop_orientations: list[dict[str, Any]],
+        orientations: list[dict[str, Any]] | None = None,
+        languages: list[dict[str, Any]] | None = None,
     ) -> Document:
         """Re-arrange detected words into structured blocks
 
         Args:
-        ----
             pages: list of N elements, where each element represents the page image
             boxes: list of N elements, where each element represents the localization predictions, of shape (*, 4)
                 or (*, 4, 2) for all words for a given page
@@ -322,7 +310,6 @@ class DocumentBuilder(NestedObject):
             where each element is a dictionary containing the language (language + confidence)
 
         Returns:
-        -------
             document object
         """
         if len(boxes) != len(text_preds) != len(crop_orientations) != len(objectness_scores) or len(boxes) != len(
@@ -374,7 +361,6 @@ class KIEDocumentBuilder(DocumentBuilder):
     """Implements a KIE document builder
 
     Args:
-    ----
         resolve_lines: whether words should be automatically grouped into lines
         resolve_blocks: whether lines should be automatically grouped into blocks
         paragraph_break: relative length of the minimum space separating paragraphs
@@ -384,19 +370,18 @@ class KIEDocumentBuilder(DocumentBuilder):
 
     def __call__(  # type: ignore[override]
         self,
-        pages: List[np.ndarray],
-        boxes: List[Dict[str, np.ndarray]],
-        objectness_scores: List[Dict[str, np.ndarray]],
-        text_preds: List[Dict[str, List[Tuple[str, float]]]],
-        page_shapes: List[Tuple[int, int]],
-        crop_orientations: List[Dict[str, List[Dict[str, Any]]]],
-        orientations: Optional[List[Dict[str, Any]]] = None,
-        languages: Optional[List[Dict[str, Any]]] = None,
+        pages: list[np.ndarray],
+        boxes: list[dict[str, np.ndarray]],
+        objectness_scores: list[dict[str, np.ndarray]],
+        text_preds: list[dict[str, list[tuple[str, float]]]],
+        page_shapes: list[tuple[int, int]],
+        crop_orientations: list[dict[str, list[dict[str, Any]]]],
+        orientations: list[dict[str, Any]] | None = None,
+        languages: list[dict[str, Any]] | None = None,
     ) -> KIEDocument:
         """Re-arrange detected words into structured predictions
 
         Args:
-        ----
             pages: list of N elements, where each element represents the page image
             boxes: list of N dictionaries, where each element represents the localization predictions for a class,
                 of shape (*, 5) or (*, 6) for all predictions
@@ -411,7 +396,6 @@ class KIEDocumentBuilder(DocumentBuilder):
             where each element is a dictionary containing the language (language + confidence)
 
         Returns:
-        -------
             document object
         """
         if len(boxes) != len(text_preds) != len(crop_orientations) != len(objectness_scores) or len(boxes) != len(
@@ -425,7 +409,7 @@ class KIEDocumentBuilder(DocumentBuilder):
         if self.export_as_straight_boxes and len(boxes) > 0:
             # If boxes are already straight OK, else fit a bounding rect
             if next(iter(boxes[0].values())).ndim == 3:
-                straight_boxes: List[Dict[str, np.ndarray]] = []
+                straight_boxes: list[dict[str, np.ndarray]] = []
                 # Iterate over pages
                 for p_boxes in boxes:
                     # Iterate over boxes of the pages
@@ -471,20 +455,18 @@
         self,
         boxes: np.ndarray,
         objectness_scores: np.ndarray,
-        word_preds: List[Tuple[str, float]],
-        crop_orientations: List[Dict[str, Any]],
-    ) -> List[Prediction]:
+        word_preds: list[tuple[str, float]],
+        crop_orientations: list[dict[str, Any]],
+    ) -> list[Prediction]:
         """Gather independent words in structured blocks
 
         Args:
-        ----
             boxes: bounding boxes of all detected words of the page, of shape (N, 4) or (N, 4, 2)
             objectness_scores: objectness scores of all detected words of the page
             word_preds: list of all detected words of the page, of shape N
             crop_orientations: list of orientations for each word crop
 
         Returns:
-        -------
             list of block elements
         """
         if boxes.shape[0] != len(word_preds):
@@ -1,6 +1,6 @@
 from doctr.file_utils import is_tf_available, is_torch_available
 
-if is_tf_available():
+if is_torch_available():
+    from .pytorch import *
+elif is_tf_available():
     from .tensorflow import *
-elif is_torch_available():
-    from .pytorch import *  # type: ignore[assignment]