onnxtr 0.2.0__tar.gz → 0.3.0__tar.gz
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- {onnxtr-0.2.0 → onnxtr-0.3.0}/PKG-INFO +60 -21
- {onnxtr-0.2.0 → onnxtr-0.3.0}/README.md +58 -19
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/io/elements.py +17 -4
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/io/pdf.py +6 -3
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/__init__.py +1 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/_utils.py +57 -20
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/builder.py +24 -9
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/classification/models/mobilenet.py +12 -5
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/classification/zoo.py +18 -6
- onnxtr-0.3.0/onnxtr/models/detection/_utils/__init__.py +1 -0
- onnxtr-0.3.0/onnxtr/models/detection/_utils/base.py +66 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/detection/models/differentiable_binarization.py +27 -12
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/detection/models/fast.py +30 -9
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/detection/models/linknet.py +24 -9
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/detection/postprocessor/base.py +4 -3
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/detection/predictor/base.py +15 -1
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/detection/zoo.py +12 -3
- onnxtr-0.3.0/onnxtr/models/engine.py +116 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/predictor/base.py +65 -42
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/predictor/predictor.py +22 -15
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/recognition/models/crnn.py +24 -9
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/recognition/models/master.py +14 -5
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/recognition/models/parseq.py +14 -5
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/recognition/models/sar.py +12 -5
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/recognition/models/vitstr.py +18 -7
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/recognition/zoo.py +9 -6
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/zoo.py +16 -0
- onnxtr-0.3.0/onnxtr/py.typed +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/utils/geometry.py +33 -12
- onnxtr-0.3.0/onnxtr/version.py +1 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr.egg-info/PKG-INFO +60 -21
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr.egg-info/SOURCES.txt +3 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr.egg-info/requires.txt +1 -1
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr.egg-info/top_level.txt +0 -1
- {onnxtr-0.2.0 → onnxtr-0.3.0}/pyproject.toml +5 -2
- {onnxtr-0.2.0 → onnxtr-0.3.0}/setup.py +1 -1
- onnxtr-0.2.0/onnxtr/models/engine.py +0 -50
- onnxtr-0.2.0/onnxtr/version.py +0 -1
- {onnxtr-0.2.0 → onnxtr-0.3.0}/LICENSE +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/contrib/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/contrib/artefacts.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/contrib/base.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/file_utils.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/io/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/io/html.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/io/image.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/io/reader.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/classification/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/classification/models/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/classification/predictor/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/classification/predictor/base.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/detection/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/detection/core.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/detection/models/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/detection/postprocessor/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/detection/predictor/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/predictor/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/preprocessor/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/preprocessor/base.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/recognition/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/recognition/core.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/recognition/models/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/recognition/predictor/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/recognition/predictor/_utils.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/recognition/predictor/base.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/recognition/utils.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/transforms/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/transforms/base.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/utils/__init__.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/utils/common_types.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/utils/data.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/utils/fonts.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/utils/multithreading.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/utils/reconstitution.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/utils/repr.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/utils/visualization.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/utils/vocabs.py +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr.egg-info/dependency_links.txt +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr.egg-info/zip-safe +0 -0
- {onnxtr-0.2.0 → onnxtr-0.3.0}/setup.cfg +0 -0
{onnxtr-0.2.0 → onnxtr-0.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: onnxtr
-Version: 0.2.0
+Version: 0.3.0
 Summary: Onnx Text Recognition (OnnxTR): docTR Onnx-Wrapper for high-performance OCR on documents.
 Author-email: Felix Dittrich <felixdittrich92@gmail.com>
 Maintainer: Felix Dittrich

@@ -228,7 +228,7 @@ License-File: LICENSE
 Requires-Dist: numpy<2.0.0,>=1.16.0
 Requires-Dist: scipy<2.0.0,>=1.4.0
 Requires-Dist: opencv-python<5.0.0,>=4.5.0
-Requires-Dist: pypdfium2<5.0.0,>=4.
+Requires-Dist: pypdfium2<5.0.0,>=4.11.0
 Requires-Dist: pyclipper<2.0.0,>=1.2.0
 Requires-Dist: shapely<3.0.0,>=1.6.0
 Requires-Dist: rapidfuzz<4.0.0,>=3.0.0

@@ -275,7 +275,7 @@ Requires-Dist: pre-commit>=2.17.0; extra == "dev"
 [](https://codecov.io/gh/felixdittrich92/OnnxTR)
 [](https://app.codacy.com/gh/felixdittrich92/OnnxTR/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)
 [](https://www.codefactor.io/repository/github/felixdittrich92/onnxtr)
-[](https://pypi.org/project/OnnxTR/)
+[](https://pypi.org/project/OnnxTR/)

 > :warning: Please note that this is a wrapper around the [doctr](https://github.com/mindee/doctr) library to provide an Onnx pipeline for docTR. For feature requests which are not directly related to the Onnx pipeline, please refer to the base project.

@@ -336,11 +336,11 @@ multi_img_doc = DocumentFile.from_images(["path/to/page1.jpg", "path/to/page2.jpg"])

 ### Putting it together

-Let's use the default
+Let's use the default `ocr_predictor` model for an example:

 ```python
 from onnxtr.io import DocumentFile
-from onnxtr.models import ocr_predictor
+from onnxtr.models import ocr_predictor, EngineConfig

 model = ocr_predictor(
     det_arch='fast_base', # detection architecture

@@ -357,11 +357,15 @@ model = ocr_predictor(
     detect_language=False, # set to `True` if the language of the pages should be detected (default: False)
     # DocumentBuilder specific parameters
     resolve_lines=True, # whether words should be automatically grouped into lines (default: True)
-    resolve_blocks=
+    resolve_blocks=False, # whether lines should be automatically grouped into blocks (default: False)
     paragraph_break=0.035, # relative length of the minimum space separating paragraphs (default: 0.035)
     # OnnxTR specific parameters
     # NOTE: 8-bit quantized models are not available for FAST detection models and can in general lead to poorer accuracy
     load_in_8_bit=False, # set to `True` to load 8-bit quantized models instead of the full precision ones (default: False)
+    # Advanced engine configuration options
+    det_engine_cfg=EngineConfig(), # detection model engine configuration (default: internal predefined configuration)
+    reco_engine_cfg=EngineConfig(), # recognition model engine configuration (default: internal predefined configuration)
+    clf_engine_cfg=EngineConfig(), # classification (orientation) model engine configuration (default: internal predefined configuration)
 )
 # PDF
 doc = DocumentFile.from_pdf("path/to/your/doc.pdf")

@@ -399,6 +403,39 @@ for output in xml_output:

 ```

+<details>
+<summary>Advanced engine configuration options</summary>
+
+You can also define advanced engine configurations for the models / predictors:
+
+```python
+from onnxruntime import SessionOptions
+
+from onnxtr.models import ocr_predictor, EngineConfig
+
+general_options = SessionOptions() # For configuration options see: https://onnxruntime.ai/docs/api/python/api_summary.html#sessionoptions
+general_options.enable_cpu_mem_arena = False
+
+# NOTE: The following forces execution on the GPU only; if no GPU is available it will raise an error
+# List of strings, e.g. ["CUDAExecutionProvider", "CPUExecutionProvider"], or a list of tuples with the provider and its options, e.g.
+# [("CUDAExecutionProvider", {"device_id": 0}), ("CPUExecutionProvider", {"arena_extend_strategy": "kSameAsRequested"})]
+providers = [("CUDAExecutionProvider", {"device_id": 0})] # For available providers see: https://onnxruntime.ai/docs/execution-providers/
+
+engine_config = EngineConfig(
+    session_options=general_options,
+    providers=providers
+)
+# We use the default predictor with the custom engine configuration
+# NOTE: You can define different engine configurations for detection, recognition and classification depending on your needs
+predictor = ocr_predictor(
+    det_engine_cfg=engine_config,
+    reco_engine_cfg=engine_config,
+    clf_engine_cfg=engine_config
+)
+```
+
+</details>
+
 ## Loading custom exported models

 You can also load docTR custom exported models:

@@ -485,24 +522,26 @@ The smallest combination in OnnxTR (docTR) of `db_mobilenet_v3_large` and `crnn_mobilenet_v3_small`

 - CPU benchmarks:

-|Library
-|
-|docTR (CPU) - v0.8.1
-|**OnnxTR (CPU)** - v0.1.2
-|**OnnxTR (CPU) 8-bit** - v0.1.2
-|EasyOCR (CPU) - v1.7.1
-|**PyTesseract (CPU)** - v0.3.10
-|Surya (line) (CPU) - v0.4.4
+|Library                          |FUNSD (199 pages)              |CORD (900 pages)               |
+|---------------------------------|-------------------------------|-------------------------------|
+|docTR (CPU) - v0.8.1             | ~1.29s / Page                 | ~0.60s / Page                 |
+|**OnnxTR (CPU)** - v0.1.2        | ~0.57s / Page                 | **~0.25s / Page**             |
+|**OnnxTR (CPU) 8-bit** - v0.1.2  | **~0.38s / Page**             | **~0.14s / Page**             |
+|EasyOCR (CPU) - v1.7.1           | ~1.96s / Page                 | ~1.75s / Page                 |
+|**PyTesseract (CPU)** - v0.3.10  | **~0.50s / Page**             | ~0.52s / Page                 |
+|Surya (line) (CPU) - v0.4.4      | ~48.76s / Page                | ~35.49s / Page                |
+|PaddleOCR (CPU) - no cls - v2.7.3| ~1.27s / Page                 | ~0.38s / Page                 |

 - GPU benchmarks:

-|Library
-|
-|docTR (GPU) - v0.8.1
-|**docTR (GPU) float16** - v0.8.1| **~0.06s / Page** | **~0.03s / Page** |
-|OnnxTR (GPU) - v0.1.2
-|EasyOCR (GPU) - v1.7.1
-|Surya (GPU) float16 - v0.4.4
+|Library                              |FUNSD (199 pages)              |CORD (900 pages)               |
+|-------------------------------------|-------------------------------|-------------------------------|
+|docTR (GPU) - v0.8.1                 | ~0.07s / Page                 | ~0.05s / Page                 |
+|**docTR (GPU) float16** - v0.8.1     | **~0.06s / Page**             | **~0.03s / Page**             |
+|OnnxTR (GPU) - v0.1.2                | **~0.06s / Page**             | ~0.04s / Page                 |
+|EasyOCR (GPU) - v1.7.1               | ~0.31s / Page                 | ~0.19s / Page                 |
+|Surya (GPU) float16 - v0.4.4         | ~3.70s / Page                 | ~2.81s / Page                 |
+|**PaddleOCR (GPU) - no cls - v2.7.3**| ~0.08s / Page                 | **~0.03s / Page**             |

 ## Citation
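The two metadata bumps above (the package version and the raised `pypdfium2` floor) are easy to verify in an installed environment. A minimal sketch using only the standard library, assuming both distributions are installed:

```python
from importlib.metadata import version

# Both constraints from the PKG-INFO hunks above should hold after upgrading
assert version("onnxtr") == "0.3.0"
major, minor = (int(part) for part in version("pypdfium2").split(".")[:2])
assert (major, minor) >= (4, 11)
```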
{onnxtr-0.2.0 → onnxtr-0.3.0}/README.md

@@ -7,7 +7,7 @@
 [](https://codecov.io/gh/felixdittrich92/OnnxTR)
 [](https://app.codacy.com/gh/felixdittrich92/OnnxTR/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)
 [](https://www.codefactor.io/repository/github/felixdittrich92/onnxtr)
-[](https://pypi.org/project/OnnxTR/)
+[](https://pypi.org/project/OnnxTR/)

 > :warning: Please note that this is a wrapper around the [doctr](https://github.com/mindee/doctr) library to provide an Onnx pipeline for docTR. For feature requests which are not directly related to the Onnx pipeline, please refer to the base project.

@@ -68,11 +68,11 @@ multi_img_doc = DocumentFile.from_images(["path/to/page1.jpg", "path/to/page2.jpg"])

 ### Putting it together

-Let's use the default
+Let's use the default `ocr_predictor` model for an example:

 ```python
 from onnxtr.io import DocumentFile
-from onnxtr.models import ocr_predictor
+from onnxtr.models import ocr_predictor, EngineConfig

 model = ocr_predictor(
     det_arch='fast_base', # detection architecture

@@ -89,11 +89,15 @@ model = ocr_predictor(
     detect_language=False, # set to `True` if the language of the pages should be detected (default: False)
     # DocumentBuilder specific parameters
     resolve_lines=True, # whether words should be automatically grouped into lines (default: True)
-    resolve_blocks=
+    resolve_blocks=False, # whether lines should be automatically grouped into blocks (default: False)
     paragraph_break=0.035, # relative length of the minimum space separating paragraphs (default: 0.035)
     # OnnxTR specific parameters
     # NOTE: 8-bit quantized models are not available for FAST detection models and can in general lead to poorer accuracy
     load_in_8_bit=False, # set to `True` to load 8-bit quantized models instead of the full precision ones (default: False)
+    # Advanced engine configuration options
+    det_engine_cfg=EngineConfig(), # detection model engine configuration (default: internal predefined configuration)
+    reco_engine_cfg=EngineConfig(), # recognition model engine configuration (default: internal predefined configuration)
+    clf_engine_cfg=EngineConfig(), # classification (orientation) model engine configuration (default: internal predefined configuration)
 )
 # PDF
 doc = DocumentFile.from_pdf("path/to/your/doc.pdf")

@@ -131,6 +135,39 @@ for output in xml_output:

 ```

+<details>
+<summary>Advanced engine configuration options</summary>
+
+You can also define advanced engine configurations for the models / predictors:
+
+```python
+from onnxruntime import SessionOptions
+
+from onnxtr.models import ocr_predictor, EngineConfig
+
+general_options = SessionOptions() # For configuration options see: https://onnxruntime.ai/docs/api/python/api_summary.html#sessionoptions
+general_options.enable_cpu_mem_arena = False
+
+# NOTE: The following forces execution on the GPU only; if no GPU is available it will raise an error
+# List of strings, e.g. ["CUDAExecutionProvider", "CPUExecutionProvider"], or a list of tuples with the provider and its options, e.g.
+# [("CUDAExecutionProvider", {"device_id": 0}), ("CPUExecutionProvider", {"arena_extend_strategy": "kSameAsRequested"})]
+providers = [("CUDAExecutionProvider", {"device_id": 0})] # For available providers see: https://onnxruntime.ai/docs/execution-providers/
+
+engine_config = EngineConfig(
+    session_options=general_options,
+    providers=providers
+)
+# We use the default predictor with the custom engine configuration
+# NOTE: You can define different engine configurations for detection, recognition and classification depending on your needs
+predictor = ocr_predictor(
+    det_engine_cfg=engine_config,
+    reco_engine_cfg=engine_config,
+    clf_engine_cfg=engine_config
+)
+```
+
+</details>
+
 ## Loading custom exported models

 You can also load docTR custom exported models:

@@ -217,24 +254,26 @@ The smallest combination in OnnxTR (docTR) of `db_mobilenet_v3_large` and `crnn_mobilenet_v3_small`

 - CPU benchmarks:

-|Library
-|
-|docTR (CPU) - v0.8.1
-|**OnnxTR (CPU)** - v0.1.2
-|**OnnxTR (CPU) 8-bit** - v0.1.2
-|EasyOCR (CPU) - v1.7.1
-|**PyTesseract (CPU)** - v0.3.10
-|Surya (line) (CPU) - v0.4.4
+|Library                          |FUNSD (199 pages)              |CORD (900 pages)               |
+|---------------------------------|-------------------------------|-------------------------------|
+|docTR (CPU) - v0.8.1             | ~1.29s / Page                 | ~0.60s / Page                 |
+|**OnnxTR (CPU)** - v0.1.2        | ~0.57s / Page                 | **~0.25s / Page**             |
+|**OnnxTR (CPU) 8-bit** - v0.1.2  | **~0.38s / Page**             | **~0.14s / Page**             |
+|EasyOCR (CPU) - v1.7.1           | ~1.96s / Page                 | ~1.75s / Page                 |
+|**PyTesseract (CPU)** - v0.3.10  | **~0.50s / Page**             | ~0.52s / Page                 |
+|Surya (line) (CPU) - v0.4.4      | ~48.76s / Page                | ~35.49s / Page                |
+|PaddleOCR (CPU) - no cls - v2.7.3| ~1.27s / Page                 | ~0.38s / Page                 |

 - GPU benchmarks:

-|Library
-|
-|docTR (GPU) - v0.8.1
-|**docTR (GPU) float16** - v0.8.1| **~0.06s / Page** | **~0.03s / Page** |
-|OnnxTR (GPU) - v0.1.2
-|EasyOCR (GPU) - v1.7.1
-|Surya (GPU) float16 - v0.4.4
+|Library                              |FUNSD (199 pages)              |CORD (900 pages)               |
+|-------------------------------------|-------------------------------|-------------------------------|
+|docTR (GPU) - v0.8.1                 | ~0.07s / Page                 | ~0.05s / Page                 |
+|**docTR (GPU) float16** - v0.8.1     | **~0.06s / Page**             | **~0.03s / Page**             |
+|OnnxTR (GPU) - v0.1.2                | **~0.06s / Page**             | ~0.04s / Page                 |
+|EasyOCR (GPU) - v1.7.1               | ~0.31s / Page                 | ~0.19s / Page                 |
+|Surya (GPU) float16 - v0.4.4         | ~3.70s / Page                 | ~2.81s / Page                 |
+|**PaddleOCR (GPU) - no cls - v2.7.3**| ~0.08s / Page                 | **~0.03s / Page**             |

 ## Citation
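The GPU-only provider list in the README snippet above raises an error on machines without CUDA. A hedged variant (not part of the package docs) that falls back to CPU, using onnxruntime's standard `get_available_providers()` and the `EngineConfig` keywords shown in the diff:

```python
import onnxruntime as ort

from onnxtr.models import EngineConfig, ocr_predictor

# Request CUDA only if onnxruntime actually reports it; otherwise stay on CPU
if "CUDAExecutionProvider" in ort.get_available_providers():
    providers = [("CUDAExecutionProvider", {"device_id": 0}), "CPUExecutionProvider"]
else:
    providers = ["CPUExecutionProvider"]

engine_config = EngineConfig(providers=providers)
# The same configuration can be shared across all three predictors
predictor = ocr_predictor(
    det_engine_cfg=engine_config,
    reco_engine_cfg=engine_config,
    clf_engine_cfg=engine_config,
)
```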
{onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/io/elements.py

@@ -67,10 +67,11 @@ class Word(Element):
         confidence: the confidence associated with the text prediction
         geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
             the page's size
+        objectness_score: the objectness score of the detection
         crop_orientation: the general orientation of the crop in degrees and its confidence
     """

-    _exported_keys: List[str] = ["value", "confidence", "geometry", "crop_orientation"]
+    _exported_keys: List[str] = ["value", "confidence", "geometry", "objectness_score", "crop_orientation"]
     _children_names: List[str] = []

     def __init__(

@@ -78,12 +79,14 @@ class Word(Element):
         value: str,
         confidence: float,
         geometry: Union[BoundingBox, np.ndarray],
+        objectness_score: float,
         crop_orientation: Dict[str, Any],
     ) -> None:
         super().__init__()
         self.value = value
         self.confidence = confidence
         self.geometry = geometry
+        self.objectness_score = objectness_score
         self.crop_orientation = crop_orientation

     def render(self) -> str:

@@ -143,7 +146,7 @@ class Line(Element):
         all words in it.
     """

-    _exported_keys: List[str] = ["geometry"]
+    _exported_keys: List[str] = ["geometry", "objectness_score"]
     _children_names: List[str] = ["words"]
     words: List[Word] = []

@@ -151,7 +154,11 @@ class Line(Element):
         self,
         words: List[Word],
         geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
+        objectness_score: Optional[float] = None,
     ) -> None:
+        # Compute the objectness score of the line
+        if objectness_score is None:
+            objectness_score = float(np.mean([w.objectness_score for w in words]))
         # Resolve the geometry using the smallest enclosing bounding box
         if geometry is None:
             # Check whether this is a rotated or straight box

@@ -160,6 +167,7 @@ class Line(Element):

         super().__init__(words=words)
         self.geometry = geometry
+        self.objectness_score = objectness_score

     def render(self) -> str:
         """Renders the full text of the element"""

@@ -186,7 +194,7 @@ class Block(Element):
         all lines and artefacts in it.
     """

-    _exported_keys: List[str] = ["geometry"]
+    _exported_keys: List[str] = ["geometry", "objectness_score"]
     _children_names: List[str] = ["lines", "artefacts"]
     lines: List[Line] = []
     artefacts: List[Artefact] = []

@@ -196,7 +204,11 @@ class Block(Element):
         lines: List[Line] = [],
         artefacts: List[Artefact] = [],
         geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
+        objectness_score: Optional[float] = None,
     ) -> None:
+        # Compute the objectness score of the block
+        if objectness_score is None:
+            objectness_score = float(np.mean([w.objectness_score for line in lines for w in line.words]))
         # Resolve the geometry using the smallest enclosing bounding box
         if geometry is None:
             line_boxes = [word.geometry for line in lines for word in line.words]

@@ -208,6 +220,7 @@ class Block(Element):

         super().__init__(lines=lines, artefacts=artefacts)
         self.geometry = geometry
+        self.objectness_score = objectness_score

     def render(self, line_break: str = "\n") -> str:
         """Renders the full text of the element"""

@@ -314,7 +327,7 @@ class Page(Element):
         SubElement(
             head,
             "meta",
-            attrib={"name": "ocr-system", "content": f" {onnxtr.__version__}"},  # type: ignore[attr-defined]
+            attrib={"name": "ocr-system", "content": f"onnxtr {onnxtr.__version__}"},  # type: ignore[attr-defined]
         )
         SubElement(
             head,
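To illustrate how the new field behaves, here is a sketch inferred only from the signatures visible in the hunks above (the values are made up): `objectness_score` sits between `geometry` and `crop_orientation` in the `Word` constructor, `Line` defaults to the mean score of its words, and the key is exported alongside the others.

```python
from onnxtr.io.elements import Line, Word

word = Word(
    "hello",                          # value
    0.99,                             # recognition confidence
    ((0.1, 0.1), (0.4, 0.2)),         # relative bounding box
    0.87,                             # objectness_score (new in 0.3.0)
    {"value": 0, "confidence": 1.0},  # crop_orientation
)

# With no explicit score, Line averages the objectness scores of its words
line = Line([word])
assert line.objectness_score == 0.87

# The score is part of _exported_keys, so it shows up in export() as well
assert "objectness_score" in word.export()
```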
{onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/io/pdf.py

@@ -15,7 +15,7 @@ __all__ = ["read_pdf"]

 def read_pdf(
     file: AbstractFile,
-    scale:
+    scale: int = 2,
     rgb_mode: bool = True,
     password: Optional[str] = None,
     **kwargs: Any,

@@ -38,5 +38,8 @@ def read_pdf(
         the list of pages decoded as numpy ndarray of shape H x W x C
     """
     # Rasterise pages to numpy ndarrays with pypdfium2
-    pdf = pdfium.PdfDocument(file, password=password
-
+    pdf = pdfium.PdfDocument(file, password=password)
+    try:
+        return [page.render(scale=scale, rev_byteorder=rgb_mode, **kwargs).to_numpy() for page in pdf]
+    finally:
+        pdf.close()
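The rewrite above closes the `PdfDocument` deterministically in a `finally` block rather than leaving it to garbage collection; the call shape is unchanged. A small usage sketch (the path is a placeholder; pypdfium2's `scale` is relative to a 72 dpi baseline, so the default `scale=2` renders roughly 144 dpi bitmaps):

```python
import numpy as np

from onnxtr.io.pdf import read_pdf

pages = read_pdf("path/to/your/doc.pdf", scale=2)  # one H x W x C array per page
assert all(isinstance(page, np.ndarray) and page.ndim == 3 for page in pages)
```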
{onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/_utils.py

@@ -11,6 +11,8 @@ import cv2
 import numpy as np
 from langdetect import LangDetectException, detect_langs

+from onnxtr.utils.geometry import rotate_image
+
 __all__ = ["estimate_orientation", "get_language"]


@@ -29,56 +31,91 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float:
     return max(w / h, h / w)


-def estimate_orientation(
+def estimate_orientation(
+    img: np.ndarray,
+    general_page_orientation: Optional[Tuple[int, float]] = None,
+    n_ct: int = 70,
+    ratio_threshold_for_lines: float = 3,
+    min_confidence: float = 0.2,
+    lower_area: int = 100,
+) -> int:
     """Estimate the angle of the general document orientation based on the
     lines of the document and the assumption that they should be horizontal.

     Args:
     ----
         img: the img or bitmap to analyze (H, W, C)
+        general_page_orientation: the general orientation of the page (angle [0, 90, 180, 270 (-90)], confidence)
+            estimated by a model
         n_ct: the number of contours used for the orientation estimation
         ratio_threshold_for_lines: this is the ratio w/h used to discriminate lines
+        min_confidence: the minimum confidence to consider the general_page_orientation
+        lower_area: the minimum area of a contour to be considered

     Returns:
     -------
-        the angle of the
+        the estimated angle of the page (clockwise, negative for left side rotation, positive for right side rotation)
     """
     assert len(img.shape) == 3 and img.shape[-1] in [1, 3], f"Image shape {img.shape} not supported"
-
-
-    if
-    thresh = img.astype(np.uint8)
-    if max_value <= 255 and min_value >= 0 and img.shape[-1] == 3:
+    thresh = None
+    # Convert image to grayscale if necessary
+    if img.shape[-1] == 3:
         gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
         gray_img = cv2.medianBlur(gray_img, 5)
-        thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
-
-
-
-
-
-
-
+        thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+    else:
+        thresh = img.astype(np.uint8)  # type: ignore[assignment]
+
+    page_orientation, orientation_confidence = general_page_orientation or (None, 0.0)
+    if page_orientation and orientation_confidence >= min_confidence:
+        # We rotate the image to the general orientation, which improves the detection
+        # No expand needed, the bitmap is already padded
+        thresh = rotate_image(thresh, -page_orientation)  # type: ignore
+    else:  # That's only required if we do not work on the detection model's bin map
+        # try to merge words into lines
+        (h, w) = img.shape[:2]
+        k_x = max(1, (floor(w / 100)))
+        k_y = max(1, (floor(h / 100)))
+        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k_x, k_y))
+        thresh = cv2.dilate(thresh, kernel, iterations=1)

     # extract contours
     contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

-    # Sort contours
-    contours = sorted(
+    # Filter & sort contours
+    contours = sorted(
+        [contour for contour in contours if cv2.contourArea(contour) > lower_area],
+        key=get_max_width_length_ratio,
+        reverse=True,
+    )

     angles = []
     for contour in contours[:n_ct]:
-        _, (w, h), angle = cv2.minAreaRect(contour)
+        _, (w, h), angle = cv2.minAreaRect(contour)  # type: ignore[assignment]
         if w / h > ratio_threshold_for_lines:  # select only contours with a line-like ratio
             angles.append(angle)
         elif w / h < 1 / ratio_threshold_for_lines:  # if lines are vertical, subtract 90 degrees
             angles.append(angle - 90)

     if len(angles) == 0:
-
+        estimated_angle = 0  # in case no angle is found
     else:
         median = -median_low(angles)
-
+        estimated_angle = -round(median) if abs(median) != 0 else 0
+
+    # combine the general orientation with the estimated angle
+    if page_orientation and orientation_confidence >= min_confidence:
+        # special case where the estimated angle is mostly wrong:
+        # case 1: - and + swapped
+        # case 2: the estimated angle is completely wrong
+        # so in this case we prefer the general page orientation
+        if abs(estimated_angle) == abs(page_orientation):
+            return page_orientation
+        estimated_angle = estimated_angle if page_orientation == 0 else page_orientation + estimated_angle
+        if estimated_angle > 180:
+            estimated_angle -= 360
+
+    return estimated_angle  # return the clockwise angle (negative - left side rotation, positive - right side rotation)


 def rectify_crops(
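A sketch of the reworked `estimate_orientation` (illustrative only; the synthetic "page" below stands in for a real document image): without a hint, the angle comes purely from line-shaped contours, while a confident `general_page_orientation` tuple first rotates the bitmap and is then folded into the returned angle.

```python
import numpy as np

from onnxtr.models._utils import estimate_orientation

# A white page with two dark horizontal bars standing in for text lines
page = np.full((600, 400, 3), 255, dtype=np.uint8)
page[100:110, 50:350] = 0
page[200:210, 50:350] = 0

# Contour-based estimate only (typically ~0 for an upright page)
print(estimate_orientation(page))

# With a model hint of (angle, confidence) above min_confidence, the hint
# is combined with the contour estimate as in the hunk above
print(estimate_orientation(page, general_page_orientation=(90, 0.9)))
```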
{onnxtr-0.2.0 → onnxtr-0.3.0}/onnxtr/models/builder.py

@@ -31,7 +31,7 @@ class DocumentBuilder(NestedObject):
     def __init__(
         self,
         resolve_lines: bool = True,
-        resolve_blocks: bool =
+        resolve_blocks: bool = False,
         paragraph_break: float = 0.035,
         export_as_straight_boxes: bool = False,
     ) -> None:

@@ -223,6 +223,7 @@ class DocumentBuilder(NestedObject):
     def _build_blocks(
         self,
         boxes: np.ndarray,
+        objectness_scores: np.ndarray,
         word_preds: List[Tuple[str, float]],
         crop_orientations: List[Dict[str, Any]],
     ) -> List[Block]:

@@ -230,7 +231,8 @@

         Args:
         ----
-            boxes: bounding boxes of all detected words of the page, of shape (N,
+            boxes: bounding boxes of all detected words of the page, of shape (N, 4) or (N, 4, 2)
+            objectness_scores: objectness scores of all detected words of the page, of shape N
             word_preds: list of all detected words of the page, of shape N
             crop_orientations: list of dictionaries containing
                 the general orientation (orientations + confidences) of the crops

@@ -265,12 +267,14 @@
                 Word(
                     *word_preds[idx],
                     tuple([tuple(pt) for pt in boxes[idx].tolist()]),  # type: ignore[arg-type]
+                    float(objectness_scores[idx]),
                     crop_orientations[idx],
                 )
                 if boxes.ndim == 3
                 else Word(
                     *word_preds[idx],
                     ((boxes[idx, 0], boxes[idx, 1]), (boxes[idx, 2], boxes[idx, 3])),
+                    float(objectness_scores[idx]),
                     crop_orientations[idx],
                 )
                 for idx in line

@@ -293,6 +297,7 @@ class DocumentBuilder(NestedObject):
         self,
         pages: List[np.ndarray],
         boxes: List[np.ndarray],
+        objectness_scores: List[np.ndarray],
         text_preds: List[List[Tuple[str, float]]],
         page_shapes: List[Tuple[int, int]],
         crop_orientations: List[Dict[str, Any]],

@@ -304,8 +309,9 @@
         Args:
         ----
             pages: list of N elements, where each element represents the page image
-            boxes: list of N elements, where each element represents the localization predictions, of shape (*,
-                or (*,
+            boxes: list of N elements, where each element represents the localization predictions, of shape (*, 4)
+                or (*, 4, 2) for all words for a given page
+            objectness_scores: list of N elements, where each element represents the objectness scores
             text_preds: list of N elements, where each element is the list of all word predictions (text + confidence)
             page_shapes: shape of each page, of size N
             crop_orientations: list of N elements, where each element is

@@ -319,9 +325,9 @@
         -------
             document object
         """
-        if len(boxes) != len(text_preds) != len(crop_orientations)
-
-        ):
+        if len(boxes) != len(text_preds) != len(crop_orientations) != len(objectness_scores) or len(boxes) != len(
+            page_shapes
+        ) != len(crop_orientations) != len(objectness_scores):
             raise ValueError("All arguments are expected to be lists of the same size")

         _orientations = (

@@ -339,6 +345,7 @@
                     page,
                     self._build_blocks(
                         page_boxes,
+                        loc_scores,
                         word_preds,
                         word_crop_orientations,
                     ),

@@ -347,8 +354,16 @@
                     orientation,
                     language,
                 )
-                for page, _idx, shape, page_boxes, word_preds, word_crop_orientations, orientation, language in zip(
-                    pages,
+                for page, _idx, shape, page_boxes, loc_scores, word_preds, word_crop_orientations, orientation, language in zip(  # noqa: E501
+                    pages,
+                    range(len(boxes)),
+                    page_shapes,
+                    boxes,
+                    objectness_scores,
+                    text_preds,
+                    crop_orientations,
+                    _orientations,
+                    _languages,
                 )
             ]

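End to end, the new `objectness_scores` argument travels from `__call__` through `_build_blocks` into each `Word`. A minimal sketch of the updated call shape (hypothetical values; the positional layout is taken from the hunks above, and in normal use `ocr_predictor` assembles these lists from the detection and recognition predictors, with any trailing orientation/language arguments left at their defaults):

```python
import numpy as np

from onnxtr.models.builder import DocumentBuilder

builder = DocumentBuilder()

page = np.zeros((1024, 768, 3), dtype=np.uint8)
boxes = [np.array([[0.1, 0.1, 0.4, 0.2]])]  # one page, one word: (xmin, ymin, xmax, ymax), relative
objectness_scores = [np.array([0.87])]      # one detection score per box (new in 0.3.0)
text_preds = [[("hello", 0.99)]]
page_shapes = [(1024, 768)]
crop_orientations = [[{"value": 0, "confidence": 1.0}]]

doc = builder([page], boxes, objectness_scores, text_preds, page_shapes, crop_orientations)
word = doc.pages[0].blocks[0].lines[0].words[0]
print(word.value, word.objectness_score)  # "hello" 0.87
```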