deepdoctection-0.42.0-py3-none-any.whl → deepdoctection-0.43-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoctection/__init__.py +2 -1
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +904 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +157 -106
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +196 -113
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +25 -17
- deepdoctection/utils/env_info.py +85 -36
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -62
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.dist-info/METADATA +376 -0
- deepdoctection-0.43.dist-info/RECORD +149 -0
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/WHEEL +1 -1
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.0.dist-info/METADATA +0 -431
- deepdoctection-0.42.0.dist-info/RECORD +0 -148
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
deepdoctection/extern/pdftext.py
CHANGED

@@ -48,18 +48,23 @@ def _to_detect_result(word: dict[str, str], class_name: ObjectTypes) -> Detectio
 
 class PdfPlumberTextDetector(PdfMiner):
     """
-    Text miner based on the pdfminer.six engine. To convert pdfminers result, especially group character to get word
-    level results we use pdfplumber
+    Text miner based on the `pdfminer.six` engine. To convert `pdfminers` result, especially group character to get word
+    level results we use `pdfplumber`.
 
+    Example:
+        ```python
         pdf_plumber = PdfPlumberTextDetector()
         df = SerializerPdfDoc.load("path/to/document.pdf")
         df.reset_state()
 
         for dp in df:
             detection_results = pdf_plumber.predict(dp["pdf_bytes"])
+        ```
 
     To use it in a more integrated way:
 
+    Example:
+        ```python
         pdf_plumber = PdfPlumberTextDetector()
         text_extract = TextExtractionService(pdf_plumber)
 

@@ -70,7 +75,7 @@ class PdfPlumberTextDetector(PdfMiner):
 
         for dp in df:
             ...
-
+        ```
     """
 
     def __init__(self, x_tolerance: int = 3, y_tolerance: int = 3) -> None:

@@ -83,10 +88,13 @@ class PdfPlumberTextDetector(PdfMiner):
 
     def predict(self, pdf_bytes: bytes) -> list[DetectionResult]:
         """
-        Call pdfminer.six and returns detected text as
+        Call `pdfminer.six` and returns detected text as `DetectionResult`
+
+        Args:
+            pdf_bytes: bytes of a single pdf page
 
-        :
-
+        Returns:
+            A list of `DetectionResult`
         """
 
         with save_tmp_file(pdf_bytes, "pdf_") as (tmp_name, _):

@@ -104,8 +112,12 @@ class PdfPlumberTextDetector(PdfMiner):
     def get_width_height(self, pdf_bytes: bytes) -> tuple[float, float]:
         """
         Get the width and height of the full page
-
-        :
+
+        Args:
+            pdf_bytes: `pdf_bytes` generating the pdf
+
+        Returns:
+            `(width,height)`
         """
 
         if self._pdf_bytes == pdf_bytes and self._page is not None:

@@ -126,15 +138,20 @@ class Pdfmium2TextDetector(PdfMiner):
     """
     Text miner based on the pypdfium2 engine. It will return text on text line level and not on word level
 
+    Example:
+        ```python
         pdfmium2 = Pdfmium2TextDetector()
         df = SerializerPdfDoc.load("path/to/document.pdf")
         df.reset_state()
 
         for dp in df:
             detection_results = pdfmium2.predict(dp["pdf_bytes"])
+        ```
 
     To use it in a more integrated way:
 
+    Example:
+        ```python
         pdfmium2 = Pdfmium2TextDetector()
         text_extract = TextExtractionService(pdfmium2)
 

@@ -144,6 +161,7 @@ class Pdfmium2TextDetector(PdfMiner):
         df.reset_state()
         for dp in df:
             ...
+        ```
 
     """
 

@@ -157,8 +175,11 @@ class Pdfmium2TextDetector(PdfMiner):
         """
         Call pypdfium2 and returns detected text as detection results
 
-        :
-
+        Args:
+            pdf_bytes: bytes of a single pdf page
+
+        Returns:
+            A list of `DetectionResult`
         """
 
         pdf = PdfDocument(pdf_bytes)

@@ -188,8 +209,12 @@ class Pdfmium2TextDetector(PdfMiner):
     def get_width_height(self, pdf_bytes: bytes) -> tuple[float, float]:
         """
         Get the width and height of the full page
-
-        :
+
+        Args:
+            pdf_bytes: `pdf_bytes` generating the pdf
+
+        Returns:
+            `(width,height)`
         """
 
         if self._pdf_bytes == pdf_bytes and self._page is not None:
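Taken together, the new docstrings document the intended call pattern. A minimal sketch of that pattern; the import locations are assumptions based on this release's file layout, not verified against the published API:

```python
# Sketch only: mirrors the docstring examples added above.
from deepdoctection.dataflow import SerializerPdfDoc  # assumed re-export location
from deepdoctection.extern.pdftext import PdfPlumberTextDetector

pdf_plumber = PdfPlumberTextDetector(x_tolerance=3, y_tolerance=3)  # defaults per the shown __init__
df = SerializerPdfDoc.load("path/to/document.pdf")  # yields one datapoint per page
df.reset_state()

for dp in df:
    # predict() takes the bytes of a single pdf page and returns
    # word-level DetectionResults; get_width_height() returns the page size.
    detection_results = pdf_plumber.predict(dp["pdf_bytes"])
    width, height = pdf_plumber.get_width_height(dp["pdf_bytes"])
```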
deepdoctection/extern/pt/nms.py
CHANGED

@@ -30,7 +30,21 @@ with try_import() as import_guard:
 # Copy & paste from https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/nms.py
 def batched_nms(boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float) -> torch.Tensor:
     """
-    Same as torchvision.ops.boxes.batched_nms
+    Same as `torchvision.ops.boxes.batched_nms`, but with `float()`.
+
+    Args:
+        boxes: A `torch.Tensor` of shape (N, 4) containing bounding boxes.
+        scores: A `torch.Tensor` of shape (N,) containing scores for each box.
+        idxs: A `torch.Tensor` of shape (N,) containing the class indices for each box.
+        iou_threshold: A float representing the IoU threshold for suppression.
+
+    Returns:
+        A `torch.Tensor` containing the indices of the boxes to keep.
+
+    Note:
+        `Fp16` does not have enough range for batched NMS, so `float()` is used.
+        Torchvision already has a strategy to decide whether to use coordinate trick or for loop to implement
+        `batched_nms`.
     """
     assert boxes.shape[-1] == 4
     # Note: Torchvision already has a strategy (https://github.com/pytorch/vision/issues/1311)

deepdoctection/extern/pt/ptutils.py
CHANGED

@@ -33,19 +33,23 @@ with try_import() as import_guard:
 
 def get_torch_device(device: Optional[Union[str, torch.device]] = None) -> torch.device:
     """
-
+    Select a device on which to load a model. The selection follows a cascade of priorities:
 
-
-
-    unless something else is specified by CUDA_VISIBLE_DEVICES:
+    If a device string is provided, it is used. If the environment variable `USE_CUDA` is set, a GPU is used.
+    If more GPUs are available, it will use all of them unless something else is specified by `CUDA_VISIBLE_DEVICES`.
 
-
+    See: <https://stackoverflow.com/questions/54216920/how-to-use-multiple-gpus-in-pytorch>
 
-
-    - Otherwise, the CPU is used.
+    If an MPS device is available, it is used. Otherwise, the CPU is used.
 
-    :
-
+    Args:
+        device: Device either as string or torch.device.
+
+    Returns:
+        torch.device: The selected device.
+
+    Note:
+        The function checks the environment variables `USE_CUDA` and `USE_MPS` to determine device preference.
     """
     if device is not None:
        if isinstance(device, torch.device):
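The rewritten `get_torch_device` docstring spells out a selection cascade. A short sketch of that cascade; the module path and the `USE_CUDA` value format are assumptions drawn from this release's layout and the docstring text:

```python
# Sketch of the documented device-selection cascade. Requires torch.
import os

from deepdoctection.extern.pt.ptutils import get_torch_device  # assumed path

device = get_torch_device("cuda")  # 1. an explicit string or torch.device wins

os.environ["USE_CUDA"] = "True"    # 2. else the USE_CUDA env var selects GPU(s); value format assumed
device = get_torch_device()

del os.environ["USE_CUDA"]         # 3. else MPS if available, otherwise CPU
device = get_torch_device()
```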
deepdoctection/extern/tessocr.py
CHANGED

@@ -16,7 +16,7 @@
 # limitations under the License.
 
 """
-Tesseract OCR engine
+Tesseract OCR engine
 """
 from __future__ import annotations
 

@@ -115,7 +115,8 @@ def _run_tesseract(tesseract_args: list[str]) -> None:
 
 def get_tesseract_version() -> Version:
     """
-    Returns
+    Returns:
+        Version of the installed tesseract engine.
     """
     try:
         output = subprocess.check_output(

@@ -142,10 +143,12 @@ def get_tesseract_version() -> Version:
 
 def image_to_angle(image: PixelValues) -> Mapping[str, str]:
     """
-    Generating a tmp file and running
+    Generating a tmp file and running Tesseract to get the orientation of the image.
 
-
-
+    Args:
+        image: Image an `np.array`
+    Returns:
+        A dict with keys 'Orientation in degrees' and 'Orientation confidence'.
     """
     with save_tmp_file(image, "tess_") as (tmp_name, input_file_name):
         _run_tesseract(_input_to_cli_str("osd", "--psm 0", 0, input_file_name, tmp_name))

@@ -159,7 +162,7 @@ def image_to_angle(image: PixelValues) -> Mapping[str, str]:
 
 def image_to_dict(image: PixelValues, lang: str, config: str) -> dict[str, list[Union[str, int, float]]]:
     """
-    This is more or less pytesseract.image_to_data with a dict as returned value.
+    This is more or less `pytesseract.image_to_data` with a dict as returned value.
     What happens under the hood is:
 
     - saving an image file

@@ -167,13 +170,17 @@ def image_to_dict(image: PixelValues, lang: str, config: str) -> dict[str, list[
     - saving a temp .tsv file with predicted results
     - reading the .tsv file and returning the results as dict.
 
-
+    Note:
+        Requires Tesseract or 3.05 or higher
 
-    :
-
-
-
-
+    Args:
+        image: Image in np.array.
+        lang: String of language
+        config: string of configs
+
+    Returns:
+        Dictionary with keys `left`, `top`, `width`, `height` (bounding box coords), `conf` (confidence), `text`
+        (captured text), `block_num` (block number) and `lin_num` (line number).
     """
 
     with save_tmp_file(image, "tess_") as (tmp_name, input_file_name):

@@ -213,10 +220,14 @@ def image_to_dict(image: PixelValues, lang: str, config: str) -> dict[str, list[
 
 def tesseract_line_to_detectresult(detect_result_list: list[DetectionResult]) -> list[DetectionResult]:
     """
-    Generating text line DetectionResult based on Tesseract word grouping. It generates line bounding boxes from
+    Generating text line `DetectionResult`s based on Tesseract word grouping. It generates line bounding boxes from
     word bounding boxes.
-
-    :
+
+    Args:
+        detect_result_list: A list of `DetectionResult`s
+
+    Returns:
+        An extended list of `DetectionResult`s
     """
 
     line_detect_result: list[DetectionResult] = []

@@ -247,15 +258,18 @@ def tesseract_line_to_detectresult(detect_result_list: list[DetectionResult]) ->
 
 def predict_text(np_img: PixelValues, supported_languages: str, text_lines: bool, config: str) -> list[DetectionResult]:
     """
-    Calls
-
-    :
-
-
-
-
-
-
+    Calls Tesseract directly with some given configs. Requires Tesseract to be installed.
+
+    Args:
+        np_img: Image in `np.array`.
+        supported_languages: To improve OCR extraction quality it is helpful to pre-select the language of the
+                             detected text, if this in known in advance. Combinations are possible, e.g. `deu`,
+                             `fr+eng`.
+        text_lines: If `True`, it will return `DetectionResult`s of text lines as well.
+        config: The config parameter passing to Tesseract. Consult also <https://guides.nyu.edu/tesseract/usage>
+
+    Returns:
+        A list of Tesseract extractions wrapped in `DetectionResult`
     """
 
     results = image_to_dict(np_img, supported_languages, config)

@@ -290,31 +304,37 @@ def predict_rotation(np_img: PixelValues) -> Mapping[str, str]:
     """
     Predicts the rotation of an image using the Tesseract OCR engine.
 
-    :
-
+    Args:
+        np_img: numpy array of the image
+
+    Returns:
+        A dictionary with keys 'Orientation in degrees' and 'Orientation confidence'
     """
     return image_to_angle(np_img)
 
 
 class TesseractOcrDetector(ObjectDetector):
     """
-    Text object detector based on Tesseracts OCR engine.
+    Text object detector based on Tesseracts OCR engine.
 
-
-
+    Note:
+        Tesseract has to be installed separately. <https://tesseract-ocr.github.io/>
 
-
-
-    All configuration options that are available via pytesseract can be given via the configuration. The best overview
-    can be found at https://pypi.org/project/pytesseract/.
+    All configuration options that are available via pytesseract can be added to the configuration file:
+    <https://pypi.org/project/pytesseract/.>
 
+    Example:
+        ```python
         tesseract_config_path = ModelCatalog.get_full_path_configs("dd/conf_tesseract.yaml")
         ocr_detector = TesseractOcrDetector(tesseract_config_path)
 
         detection_result = ocr_detector.predict(bgr_image_as_np_array)
+        ```
 
     To use it within a pipeline
 
+    Example:
+        ```python
         tesseract_config_path = ModelCatalog.get_full_path_configs("dd/conf_tesseract.yaml")
         ocr_detector = TesseractOcrDetector(tesseract_config_path)
 

@@ -325,6 +345,7 @@ class TesseractOcrDetector(ObjectDetector):
 
         for dp in df:
             ...
+        ```
     """
 
     def __init__(

@@ -333,11 +354,12 @@ class TesseractOcrDetector(ObjectDetector):
         config_overwrite: Optional[list[str]] = None,
     ):
         """
-        Set up the configuration which is stored in a yaml
+        Set up the configuration which is stored in a `.yaml` file, that need to be passed through.
 
-        :
-
-
+        Args:
+            path_yaml: The path to the yaml config
+            config_overwrite: Overwrite config parameters defined by the yaml file with new values.
+                              E.g. `["oem=14"]`
         """
         self.name = self.get_name()
         self.model_id = self.get_model_id()

@@ -362,8 +384,11 @@ class TesseractOcrDetector(ObjectDetector):
         """
         Transfer of a numpy array and call of pytesseract. Return of the detection results.
 
-        :
-
+        Args:
+            np_img: image as `np.array`
+
+        Returns:
+            A list of `DetectionResult`
         """
 
         return predict_text(

@@ -386,7 +411,10 @@ class TesseractOcrDetector(ObjectDetector):
     def set_language(self, language: ObjectTypes) -> None:
         """
         Pass a language to change the model selection. For runtime language selection.
-
+
+        Args:
+            language: One of the following: `fre`,`dut`,`chi`,`cze`,`per`,`gre`,`mac`,`rum`,`arm`,
+                      `geo`,`war`,`glg`,`slv`,`alb`,`nn`.
         """
         self.config.LANGUAGES = _LANG_CODE_TO_TESS_LANG_CODE.get(language, language.value)
 

@@ -398,13 +426,11 @@ class TesseractOcrDetector(ObjectDetector):
 
 class TesseractRotationTransformer(ImageTransformer):
     """
-    The `TesseractRotationTransformer`
-
-    base class and implements methods for predicting and applying rotation transformations to images.
+    The `TesseractRotationTransformer` is designed to handle image rotations.. It inherits from the `ImageTransformer`
+    base class and implements methods for predicting and applying rotation transformations.
 
     The `predict` method determines the angle of the rotated image. It can only handle angles that are multiples of 90
-    degrees.
-    This method uses the Tesseract OCR engine to predict the rotation angle of an image.
+    degrees. This method uses the Tesseract OCR engine to predict the rotation angle of an image.
 
     The `transform` method applies the predicted rotation to the image, effectively rotating the image backwards.
     This method uses either the Pillow library or OpenCV for the rotation operation, depending on the configuration.

@@ -412,10 +438,12 @@ class TesseractRotationTransformer(ImageTransformer):
     This class can be particularly useful in OCR tasks where the orientation of the text in the image matters.
     The class also provides methods for cloning itself and for getting the requirements of the Tesseract OCR system.
 
-
-
-
-
+    Example:
+        ```python
+        transformer = TesseractRotationTransformer()
+        detection_result = transformer.predict(np_img)
+        rotated_image = transformer.transform(np_img, detection_result)
+        ```
     """
 
     def __init__(self) -> None:

@@ -428,9 +456,12 @@ class TesseractRotationTransformer(ImageTransformer):
         Applies the predicted rotation to the image, effectively rotating the image backwards.
         This method uses either the Pillow library or OpenCV for the rotation operation, depending on the configuration.
 
-        :
-
-
+        Args:
+            np_img: The input image as a numpy array.
+            specification: A `DetectionResult` object containing the predicted rotation angle.
+
+        Returns:
+            The rotated image as a numpy array.
         """
         return viz_handler.rotate_image(np_img, specification.angle)  # type: ignore
 

@@ -439,8 +470,10 @@ class TesseractRotationTransformer(ImageTransformer):
         Determines the angle of the rotated image. It can only handle angles that are multiples of 90 degrees.
         This method uses the Tesseract OCR engine to predict the rotation angle of an image.
 
-        :
-
+        Args:
+            np_img: The input image as a numpy array.
+        Returns:
+            A `DetectionResult` object containing the predicted rotation angle and confidence.
         """
         output_dict = predict_rotation(np_img)
         return DetectionResult(
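A minimal sketch following the updated `TesseractOcrDetector` docstrings; it assumes a local Tesseract install, that `ModelCatalog` lives in `deepdoctection.extern.model` as the file list suggests, and that `dd/conf_tesseract.yaml` ships with this release:

```python
# Sketch only: mirrors the docstring example added above.
import cv2  # any loader producing a BGR np.array works here

from deepdoctection.extern.model import ModelCatalog  # assumed import path
from deepdoctection.extern.tessocr import TesseractOcrDetector

tesseract_config_path = ModelCatalog.get_full_path_configs("dd/conf_tesseract.yaml")
# config_overwrite uses the documented "key=value" form, e.g. ["oem=14"]
ocr_detector = TesseractOcrDetector(tesseract_config_path, config_overwrite=["oem=14"])

bgr_image_as_np_array = cv2.imread("path/to/page.png")
detection_results = ocr_detector.predict(bgr_image_as_np_array)
```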
deepdoctection/extern/texocr.py
CHANGED

@@ -60,15 +60,18 @@ def _textract_to_detectresult(response: JsonDict, width: int, height: int, text_
     return all_results
 
 
-def predict_text(np_img: PixelValues, client, text_lines: bool) -> list[DetectionResult]:  # type: ignore
+def predict_text(np_img: PixelValues, client: boto3.client, text_lines: bool) -> list[DetectionResult]:  # type: ignore
     """
     Calls AWS Textract client (`detect_document_text`) and returns plain OCR results.
     AWS account required.
 
-    :
-
-
-
+    Args:
+        np_img: Image in `np.array`.
+        client: botocore textract client
+        text_lines: If `True`, it will return `DetectionResult`s of Text lines as well.
+
+    Returns:
+        A list of `DetectionResult`
     """
 
     width, height = np_img.shape[1], np_img.shape[0]

@@ -95,16 +98,23 @@ def predict_text(np_img: PixelValues, client, text_lines: bool) -> list[Detectio
 class TextractOcrDetector(ObjectDetector):
     """
     Text object detector based on AWS Textract OCR engine. Note that an AWS account as well as some additional
-    installations are required, i.e AWS CLI and boto3
-
+    installations are required, i.e `AWS CLI` and `boto3`.
+    Note:
+        The service is not free of charge. Additional information can be found at:
+        <https://docs.aws.amazon.com/textract/?id=docs_gateway> .
+
+    The detector only calls the base `OCR` engine and does not return additional Textract document analysis features.
 
-
+    Example:
 
+        ```python
         textract_predictor = TextractOcrDetector()
         detection_result = textract_predictor.predict(bgr_image_as_np_array)
+        ```
 
-
+    or
 
+        ```python
         textract_predictor = TextractOcrDetector()
         text_extract = TextExtractionService(textract_predictor)
 

@@ -113,13 +123,15 @@ class TextractOcrDetector(ObjectDetector):
 
         for dp in df:
             ...
+        ```
 
     """
 
     def __init__(self, text_lines: bool = False, **credentials_kwargs: str) -> None:
         """
-        :
-
+        Args:
+            text_lines: If `True`, it will return `DetectionResult`s of Text lines as well.
+            credentials_kwargs: `aws_access_key_id`, `aws_secret_access_key` or `aws_session_token`
         """
         self.name = "textract"
         self.model_id = self.get_model_id()

@@ -133,10 +145,13 @@ class TextractOcrDetector(ObjectDetector):
 
     def predict(self, np_img: PixelValues) -> list[DetectionResult]:
         """
-        Transfer of a
+        Transfer of a `np.array` and call textract `client`. Return of the `DetectionResult`s.
+
+        Args:
+            np_img: image as `np.array`
 
-        :
-
+        Returns:
+            A list of `DetectionResult`s
         """
 
         return predict_text(np_img, self.client, self.text_lines)
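A sketch per the updated `TextractOcrDetector` docstrings. An AWS account is required and calls are billed; the credential keyword names below are the ones the new `__init__` docstring documents, with placeholder values:

```python
# Sketch only: assumes boto3 and the AWS CLI are installed and the import path
# matches this release's layout (deepdoctection/extern/texocr.py).
import cv2

from deepdoctection.extern.texocr import TextractOcrDetector

textract_predictor = TextractOcrDetector(
    text_lines=True,                  # also return line-level DetectionResults
    aws_access_key_id="YOUR_KEY_ID",  # placeholder credentials
    aws_secret_access_key="YOUR_SECRET",
)
bgr_image_as_np_array = cv2.imread("path/to/page.png")
detection_results = textract_predictor.predict(bgr_image_as_np_array)
```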
deepdoctection/extern/tp/tfutils.py
CHANGED

@@ -37,7 +37,15 @@ with try_import() as tf_import_guard:
 
 def is_tfv2() -> bool:
     """
-    Returns whether
+    Returns whether TensorFlow is operating in V2 mode.
+
+    Returns:
+        Whether TensorFlow is operating in V2 mode.
+
+    Example:
+        ```python
+        is_tfv2()
+        ```
     """
     try:
         from tensorflow.python import tf2  # pylint: disable=C0415

@@ -49,7 +57,15 @@ def is_tfv2() -> bool:
 
 def disable_tfv2() -> bool:
     """
-
+    Disables TensorFlow V2 mode.
+
+    Returns:
+        Whether TensorFlow V2 mode was disabled.
+
+    Example:
+        ```python
+        disable_tfv2()
+        ```
     """
 
     tfv1 = tf.compat.v1

@@ -62,20 +78,32 @@ def disable_tfv2() -> bool:
 
 def disable_tp_layer_logging() -> None:
     """
-    Disables
+    Disables tensorpack layer logging, if not already set.
+
+    Example:
+        ```python
+        disable_tp_layer_logging()
+        ```
     """
     disable_layer_logging()
 
 
 def get_tf_device(device: Optional[Union[str, tf.device]] = None) -> tf.device:
     """
-
+    Selects a device on which to load a model. The selection follows a cascade of priorities:
+
+    - If a `device` string is provided, it is used. If the string is "cuda" or "GPU", the first GPU is used.
+    - If the environment variable `USE_CUDA` is set, a GPU is used. If more GPUs are available it will use the first
+      one.
+
+    Args:
+        device: Device string.
 
-
-
+    Returns:
+        TensorFlow device.
 
-    :
-
+    Raises:
+        EnvironmentError: If `USE_CUDA` is set but no GPU device is found, or if no CPU device is found.
     """
     if device is not None:
         if isinstance(device, ContextManager):
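A sketch tying the documented TF helpers together. The import path is assumed from this release's layout, and using the returned `tf.device` as a context manager is an assumption based on the documented return type and the `ContextManager` check visible above:

```python
# Sketch only: requires TensorFlow; paths and usage assumed, not verified.
from deepdoctection.extern.tp.tfutils import disable_tfv2, get_tf_device, is_tfv2

if is_tfv2():
    disable_tfv2()  # tensorpack-based models expect V1 graph mode

# get_tf_device follows the documented cascade: an explicit string first
# ("cuda" or "GPU" picks the first GPU), then the USE_CUDA env var, then CPU.
with get_tf_device("GPU"):
    ...  # build or load the model inside the selected device context
```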
|