onnxtr 0.1.2__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. {onnxtr-0.1.2 → onnxtr-0.3.0}/PKG-INFO +81 -16
  2. {onnxtr-0.1.2 → onnxtr-0.3.0}/README.md +79 -14
  3. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/io/elements.py +17 -4
  4. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/io/pdf.py +6 -3
  5. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/__init__.py +1 -0
  6. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/_utils.py +57 -20
  7. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/builder.py +24 -9
  8. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/classification/models/mobilenet.py +25 -7
  9. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/classification/predictor/base.py +1 -0
  10. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/classification/zoo.py +22 -7
  11. onnxtr-0.3.0/onnxtr/models/detection/_utils/__init__.py +1 -0
  12. onnxtr-0.3.0/onnxtr/models/detection/_utils/base.py +66 -0
  13. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/detection/models/differentiable_binarization.py +41 -11
  14. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/detection/models/fast.py +37 -9
  15. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/detection/models/linknet.py +39 -9
  16. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/detection/postprocessor/base.py +4 -3
  17. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/detection/predictor/base.py +15 -1
  18. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/detection/zoo.py +16 -3
  19. onnxtr-0.3.0/onnxtr/models/engine.py +116 -0
  20. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/predictor/base.py +69 -42
  21. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/predictor/predictor.py +22 -15
  22. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/recognition/models/crnn.py +39 -9
  23. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/recognition/models/master.py +19 -5
  24. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/recognition/models/parseq.py +20 -5
  25. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/recognition/models/sar.py +19 -5
  26. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/recognition/models/vitstr.py +31 -9
  27. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/recognition/zoo.py +12 -6
  28. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/zoo.py +22 -0
  29. onnxtr-0.3.0/onnxtr/py.typed +0 -0
  30. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/utils/geometry.py +33 -12
  31. onnxtr-0.3.0/onnxtr/version.py +1 -0
  32. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr.egg-info/PKG-INFO +81 -16
  33. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr.egg-info/SOURCES.txt +3 -0
  34. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr.egg-info/requires.txt +1 -1
  35. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr.egg-info/top_level.txt +0 -1
  36. {onnxtr-0.1.2 → onnxtr-0.3.0}/pyproject.toml +6 -2
  37. {onnxtr-0.1.2 → onnxtr-0.3.0}/setup.py +1 -1
  38. onnxtr-0.1.2/onnxtr/models/engine.py +0 -50
  39. onnxtr-0.1.2/onnxtr/version.py +0 -1
  40. {onnxtr-0.1.2 → onnxtr-0.3.0}/LICENSE +0 -0
  41. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/__init__.py +0 -0
  42. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/contrib/__init__.py +0 -0
  43. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/contrib/artefacts.py +0 -0
  44. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/contrib/base.py +0 -0
  45. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/file_utils.py +0 -0
  46. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/io/__init__.py +0 -0
  47. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/io/html.py +0 -0
  48. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/io/image.py +0 -0
  49. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/io/reader.py +0 -0
  50. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/classification/__init__.py +0 -0
  51. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/classification/models/__init__.py +0 -0
  52. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/classification/predictor/__init__.py +0 -0
  53. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/detection/__init__.py +0 -0
  54. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/detection/core.py +0 -0
  55. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/detection/models/__init__.py +0 -0
  56. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/detection/postprocessor/__init__.py +0 -0
  57. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/detection/predictor/__init__.py +0 -0
  58. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/predictor/__init__.py +0 -0
  59. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/preprocessor/__init__.py +0 -0
  60. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/preprocessor/base.py +0 -0
  61. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/recognition/__init__.py +0 -0
  62. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/recognition/core.py +0 -0
  63. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/recognition/models/__init__.py +0 -0
  64. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/recognition/predictor/__init__.py +0 -0
  65. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/recognition/predictor/_utils.py +0 -0
  66. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/recognition/predictor/base.py +0 -0
  67. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/recognition/utils.py +0 -0
  68. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/transforms/__init__.py +0 -0
  69. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/transforms/base.py +0 -0
  70. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/utils/__init__.py +0 -0
  71. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/utils/common_types.py +0 -0
  72. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/utils/data.py +0 -0
  73. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/utils/fonts.py +0 -0
  74. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/utils/multithreading.py +0 -0
  75. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/utils/reconstitution.py +0 -0
  76. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/utils/repr.py +0 -0
  77. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/utils/visualization.py +0 -0
  78. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/utils/vocabs.py +0 -0
  79. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr.egg-info/dependency_links.txt +0 -0
  80. {onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr.egg-info/zip-safe +0 -0
  81. {onnxtr-0.1.2 → onnxtr-0.3.0}/setup.cfg +0 -0
{onnxtr-0.1.2 → onnxtr-0.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: onnxtr
- Version: 0.1.2
+ Version: 0.3.0
  Summary: Onnx Text Recognition (OnnxTR): docTR Onnx-Wrapper for high-performance OCR on documents.
  Author-email: Felix Dittrich <felixdittrich92@gmail.com>
  Maintainer: Felix Dittrich
@@ -228,7 +228,7 @@ License-File: LICENSE
  Requires-Dist: numpy<2.0.0,>=1.16.0
  Requires-Dist: scipy<2.0.0,>=1.4.0
  Requires-Dist: opencv-python<5.0.0,>=4.5.0
- Requires-Dist: pypdfium2<5.0.0,>=4.0.0
+ Requires-Dist: pypdfium2<5.0.0,>=4.11.0
  Requires-Dist: pyclipper<2.0.0,>=1.2.0
  Requires-Dist: shapely<3.0.0,>=1.6.0
  Requires-Dist: rapidfuzz<4.0.0,>=3.0.0
@@ -275,7 +275,7 @@ Requires-Dist: pre-commit>=2.17.0; extra == "dev"
  [![codecov](https://codecov.io/gh/felixdittrich92/OnnxTR/graph/badge.svg?token=WVFRCQBOLI)](https://codecov.io/gh/felixdittrich92/OnnxTR)
  [![Codacy Badge](https://app.codacy.com/project/badge/Grade/4fff4d764bb14fb8b4f4afeb9587231b)](https://app.codacy.com/gh/felixdittrich92/OnnxTR/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)
  [![CodeFactor](https://www.codefactor.io/repository/github/felixdittrich92/onnxtr/badge)](https://www.codefactor.io/repository/github/felixdittrich92/onnxtr)
- [![Pypi](https://img.shields.io/badge/pypi-v0.1.1-blue.svg)](https://pypi.org/project/OnnxTR/)
+ [![Pypi](https://img.shields.io/badge/pypi-v0.3.0-blue.svg)](https://pypi.org/project/OnnxTR/)

  > :warning: Please note that this is a wrapper around the [doctr](https://github.com/mindee/doctr) library to provide a Onnx pipeline for docTR. For feature requests, which are not directly related to the Onnx pipeline, please refer to the base project.

@@ -284,8 +284,9 @@ Requires-Dist: pre-commit>=2.17.0; extra == "dev"
  What you can expect from this repository:

  - efficient ways to parse textual information (localize and identify each word) from your documents
- - a Onnx pipeline for docTR, a wrapper around the [doctr](https://github.com/mindee/doctr) library
+ - a Onnx pipeline for docTR, a wrapper around the [doctr](https://github.com/mindee/doctr) library - no PyTorch or TensorFlow dependencies
  - more lightweight package with faster inference latency and less required resources
+ - 8-Bit quantized models for faster inference on CPU

  ![OCR_example](https://github.com/felixdittrich92/OnnxTR/raw/main/docs/images/ocr.png)

@@ -335,11 +336,11 @@ multi_img_doc = DocumentFile.from_images(["path/to/page1.jpg", "path/to/page2.jp

  ### Putting it together

- Let's use the default pretrained model for an example:
+ Let's use the default `ocr_predictor` model for an example:

  ```python
  from onnxtr.io import DocumentFile
- from onnxtr.models import ocr_predictor
+ from onnxtr.models import ocr_predictor, EngineConfig

  model = ocr_predictor(
  det_arch='fast_base', # detection architecture
@@ -356,8 +357,15 @@ model = ocr_predictor(
  detect_language=False, # set to `True` if the language of the pages should be detected (default: False)
  # DocumentBuilder specific parameters
  resolve_lines=True, # whether words should be automatically grouped into lines (default: True)
- resolve_blocks=True, # whether lines should be automatically grouped into blocks (default: True)
+ resolve_blocks=False, # whether lines should be automatically grouped into blocks (default: False)
  paragraph_break=0.035, # relative length of the minimum space separating paragraphs (default: 0.035)
+ # OnnxTR specific parameters
+ # NOTE: 8-Bit quantized models are not available for FAST detection models and can in general lead to poorer accuracy
+ load_in_8_bit=False, # set to `True` to load 8-bit quantized models instead of the full precision onces (default: False)
+ # Advanced engine configuration options
+ det_engine_cfg=EngineConfig(), # detection model engine configuration (default: internal predefined configuration)
+ reco_engine_cfg=EngineConfig(), # recognition model engine configuration (default: internal predefined configuration)
+ clf_engine_cfg=EngineConfig(), # classification (orientation) model engine configuration (default: internal predefined configuration)
  )
  # PDF
  doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
@@ -395,6 +403,39 @@ for output in xml_output:

  ```

+ <details>
+ <summary>Advanced engine configuration options</summary>
+
+ You can also define advanced engine configurations for the models / predictors:
+
+ ```python
+ from onnxruntime import SessionOptions
+
+ from onnxtr.models import ocr_predictor, EngineConfig
+
+ general_options = SessionOptions() # For configuartion options see: https://onnxruntime.ai/docs/api/python/api_summary.html#sessionoptions
+ general_options.enable_cpu_mem_arena = False
+
+ # NOTE: The following would force to run only on the GPU if no GPU is available it will raise an error
+ # List of strings e.g. ["CUDAExecutionProvider", "CPUExecutionProvider"] or a list of tuples with the provider and its options e.g.
+ # [("CUDAExecutionProvider", {"device_id": 0}), ("CPUExecutionProvider", {"arena_extend_strategy": "kSameAsRequested"})]
+ providers = [("CUDAExecutionProvider", {"device_id": 0})] # For available providers see: https://onnxruntime.ai/docs/execution-providers/
+
+ engine_config = EngineConfig(
+ session_options=general_options,
+ providers=providers
+ )
+ # We use the default predictor with the custom engine configuration
+ # NOTE: You can define differnt engine configurations for detection, recognition and classification depending on your needs
+ predictor = ocr_predictor(
+ det_engine_cfg=engine_config,
+ reco_engine_cfg=engine_config,
+ clf_engine_cfg=engine_config
+ )
+ ```
+
+ </details>
+
  ## Loading custom exported models

  You can also load docTR custom exported models:
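The `EngineConfig` example added in the hunk above pins the predictor to CUDA and fails if no GPU is present. A sketch of deriving the provider list from what the installed ONNX Runtime build actually reports instead (assuming `EngineConfig` also accepts `providers` without an explicit `session_options`):

```python
import onnxruntime as ort

from onnxtr.models import EngineConfig, ocr_predictor

# get_available_providers() lists the execution providers compiled into this onnxruntime build
available = ort.get_available_providers()
providers = (
    ["CUDAExecutionProvider", "CPUExecutionProvider"]
    if "CUDAExecutionProvider" in available
    else ["CPUExecutionProvider"]
)

engine_config = EngineConfig(providers=providers)  # session options left at the library default
predictor = ocr_predictor(
    det_engine_cfg=engine_config,
    reco_engine_cfg=engine_config,
    clf_engine_cfg=engine_config,
)
```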
@@ -438,9 +479,9 @@ predictor.list_archs()
  'linknet_resnet18',
  'linknet_resnet34',
  'linknet_resnet50',
- 'fast_tiny',
- 'fast_small',
- 'fast_base'
+ 'fast_tiny', # No 8-bit support
+ 'fast_small', # No 8-bit support
+ 'fast_base' # No 8-bit support
  ],
  'recognition archs':
  [
@@ -469,14 +510,38 @@ NOTE:

  ### Benchmarks

- The benchmarks was measured on a `i7-14700K Intel CPU`.
+ The CPU benchmarks was measured on a `i7-14700K Intel CPU`.
+
+ The GPU benchmarks was measured on a `RTX 4080 Nvidia GPU`.
+
+ Benchmarking performed on the FUNSD dataset and CORD dataset.
+
+ docTR / OnnxTR models used for the benchmarks are `fast_base` (full precision) | `db_resnet50` (8-bit variant) for detection and `crnn_vgg16_bn` for recognition.
+
+ The smallest combination in OnnxTR (docTR) of `db_mobilenet_v3_large` and `crnn_mobilenet_v3_small` takes as comparison `~0.17s / Page` on the FUNSD dataset and `~0.12s / Page` on the CORD dataset in **full precision**.
+
+ - CPU benchmarks:
+
+ |Library |FUNSD (199 pages) |CORD (900 pages) |
+ |---------------------------------|-------------------------------|-------------------------------|
+ |docTR (CPU) - v0.8.1 | ~1.29s / Page | ~0.60s / Page |
+ |**OnnxTR (CPU)** - v0.1.2 | ~0.57s / Page | **~0.25s / Page** |
+ |**OnnxTR (CPU) 8-bit** - v0.1.2 | **~0.38s / Page** | **~0.14s / Page** |
+ |EasyOCR (CPU) - v1.7.1 | ~1.96s / Page | ~1.75s / Page |
+ |**PyTesseract (CPU)** - v0.3.10 | **~0.50s / Page** | ~0.52s / Page |
+ |Surya (line) (CPU) - v0.4.4 | ~48.76s / Page | ~35.49s / Page |
+ |PaddleOCR (CPU) - no cls - v2.7.3| ~1.27s / Page | ~0.38s / Page |

- MORE BENCHMARKS COMING SOON
+ - GPU benchmarks:

- |Dataset |docTR (CPU) - v0.8.1 |OnnxTR (CPU) - v0.1.1 |
- |--------------------------------|-------------------------------|-------------------------------|
- |FUNSD (199 pages) | ~1.29s / Page | ~0.57s / Page |
- |CORD (900 pages) | ~0.60s / Page | ~0.25s / Page |
+ |Library |FUNSD (199 pages) |CORD (900 pages) |
+ |-------------------------------------|-------------------------------|-------------------------------|
+ |docTR (GPU) - v0.8.1 | ~0.07s / Page | ~0.05s / Page |
+ |**docTR (GPU) float16** - v0.8.1 | **~0.06s / Page** | **~0.03s / Page** |
+ |OnnxTR (GPU) - v0.1.2 | **~0.06s / Page** | ~0.04s / Page |
+ |EasyOCR (GPU) - v1.7.1 | ~0.31s / Page | ~0.19s / Page |
+ |Surya (GPU) float16 - v0.4.4 | ~3.70s / Page | ~2.81s / Page |
+ |**PaddleOCR (GPU) - no cls - v2.7.3**| ~0.08s / Page | **~0.03s / Page** |

  ## Citation

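The 8-bit rows in the CPU table correspond to the `load_in_8_bit` flag introduced above; quantized weights exist for the DB detection and CRNN recognition models used in the benchmark, but not for the FAST detectors. A minimal sketch (assuming the `det_arch`/`reco_arch` keywords mirror docTR and that calling the predictor on a `DocumentFile` returns a `Document` as usual):

```python
from onnxtr.io import DocumentFile
from onnxtr.models import ocr_predictor

# 8-bit variants are published for e.g. db_resnet50 / crnn_vgg16_bn, not for the FAST detectors
model = ocr_predictor(det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", load_in_8_bit=True)

doc = DocumentFile.from_images(["path/to/page1.jpg"])
result = model(doc)
print(result.render())  # plain-text rendering of the detected words
```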
{onnxtr-0.1.2 → onnxtr-0.3.0}/README.md

@@ -7,7 +7,7 @@
  [![codecov](https://codecov.io/gh/felixdittrich92/OnnxTR/graph/badge.svg?token=WVFRCQBOLI)](https://codecov.io/gh/felixdittrich92/OnnxTR)
  [![Codacy Badge](https://app.codacy.com/project/badge/Grade/4fff4d764bb14fb8b4f4afeb9587231b)](https://app.codacy.com/gh/felixdittrich92/OnnxTR/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)
  [![CodeFactor](https://www.codefactor.io/repository/github/felixdittrich92/onnxtr/badge)](https://www.codefactor.io/repository/github/felixdittrich92/onnxtr)
- [![Pypi](https://img.shields.io/badge/pypi-v0.1.1-blue.svg)](https://pypi.org/project/OnnxTR/)
+ [![Pypi](https://img.shields.io/badge/pypi-v0.3.0-blue.svg)](https://pypi.org/project/OnnxTR/)

  > :warning: Please note that this is a wrapper around the [doctr](https://github.com/mindee/doctr) library to provide a Onnx pipeline for docTR. For feature requests, which are not directly related to the Onnx pipeline, please refer to the base project.

@@ -16,8 +16,9 @@
  What you can expect from this repository:

  - efficient ways to parse textual information (localize and identify each word) from your documents
- - a Onnx pipeline for docTR, a wrapper around the [doctr](https://github.com/mindee/doctr) library
+ - a Onnx pipeline for docTR, a wrapper around the [doctr](https://github.com/mindee/doctr) library - no PyTorch or TensorFlow dependencies
  - more lightweight package with faster inference latency and less required resources
+ - 8-Bit quantized models for faster inference on CPU

  ![OCR_example](https://github.com/felixdittrich92/OnnxTR/raw/main/docs/images/ocr.png)

@@ -67,11 +68,11 @@ multi_img_doc = DocumentFile.from_images(["path/to/page1.jpg", "path/to/page2.jp

  ### Putting it together

- Let's use the default pretrained model for an example:
+ Let's use the default `ocr_predictor` model for an example:

  ```python
  from onnxtr.io import DocumentFile
- from onnxtr.models import ocr_predictor
+ from onnxtr.models import ocr_predictor, EngineConfig

  model = ocr_predictor(
  det_arch='fast_base', # detection architecture
@@ -88,8 +89,15 @@ model = ocr_predictor(
  detect_language=False, # set to `True` if the language of the pages should be detected (default: False)
  # DocumentBuilder specific parameters
  resolve_lines=True, # whether words should be automatically grouped into lines (default: True)
- resolve_blocks=True, # whether lines should be automatically grouped into blocks (default: True)
+ resolve_blocks=False, # whether lines should be automatically grouped into blocks (default: False)
  paragraph_break=0.035, # relative length of the minimum space separating paragraphs (default: 0.035)
+ # OnnxTR specific parameters
+ # NOTE: 8-Bit quantized models are not available for FAST detection models and can in general lead to poorer accuracy
+ load_in_8_bit=False, # set to `True` to load 8-bit quantized models instead of the full precision onces (default: False)
+ # Advanced engine configuration options
+ det_engine_cfg=EngineConfig(), # detection model engine configuration (default: internal predefined configuration)
+ reco_engine_cfg=EngineConfig(), # recognition model engine configuration (default: internal predefined configuration)
+ clf_engine_cfg=EngineConfig(), # classification (orientation) model engine configuration (default: internal predefined configuration)
  )
  # PDF
  doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
@@ -127,6 +135,39 @@ for output in xml_output:

  ```

+ <details>
+ <summary>Advanced engine configuration options</summary>
+
+ You can also define advanced engine configurations for the models / predictors:
+
+ ```python
+ from onnxruntime import SessionOptions
+
+ from onnxtr.models import ocr_predictor, EngineConfig
+
+ general_options = SessionOptions() # For configuartion options see: https://onnxruntime.ai/docs/api/python/api_summary.html#sessionoptions
+ general_options.enable_cpu_mem_arena = False
+
+ # NOTE: The following would force to run only on the GPU if no GPU is available it will raise an error
+ # List of strings e.g. ["CUDAExecutionProvider", "CPUExecutionProvider"] or a list of tuples with the provider and its options e.g.
+ # [("CUDAExecutionProvider", {"device_id": 0}), ("CPUExecutionProvider", {"arena_extend_strategy": "kSameAsRequested"})]
+ providers = [("CUDAExecutionProvider", {"device_id": 0})] # For available providers see: https://onnxruntime.ai/docs/execution-providers/
+
+ engine_config = EngineConfig(
+ session_options=general_options,
+ providers=providers
+ )
+ # We use the default predictor with the custom engine configuration
+ # NOTE: You can define differnt engine configurations for detection, recognition and classification depending on your needs
+ predictor = ocr_predictor(
+ det_engine_cfg=engine_config,
+ reco_engine_cfg=engine_config,
+ clf_engine_cfg=engine_config
+ )
+ ```
+
+ </details>
+
  ## Loading custom exported models

  You can also load docTR custom exported models:
@@ -170,9 +211,9 @@ predictor.list_archs()
  'linknet_resnet18',
  'linknet_resnet34',
  'linknet_resnet50',
- 'fast_tiny',
- 'fast_small',
- 'fast_base'
+ 'fast_tiny', # No 8-bit support
+ 'fast_small', # No 8-bit support
+ 'fast_base' # No 8-bit support
  ],
  'recognition archs':
  [
@@ -201,14 +242,38 @@ NOTE:

  ### Benchmarks

- The benchmarks was measured on a `i7-14700K Intel CPU`.
+ The CPU benchmarks was measured on a `i7-14700K Intel CPU`.
+
+ The GPU benchmarks was measured on a `RTX 4080 Nvidia GPU`.
+
+ Benchmarking performed on the FUNSD dataset and CORD dataset.
+
+ docTR / OnnxTR models used for the benchmarks are `fast_base` (full precision) | `db_resnet50` (8-bit variant) for detection and `crnn_vgg16_bn` for recognition.
+
+ The smallest combination in OnnxTR (docTR) of `db_mobilenet_v3_large` and `crnn_mobilenet_v3_small` takes as comparison `~0.17s / Page` on the FUNSD dataset and `~0.12s / Page` on the CORD dataset in **full precision**.
+
+ - CPU benchmarks:
+
+ |Library |FUNSD (199 pages) |CORD (900 pages) |
+ |---------------------------------|-------------------------------|-------------------------------|
+ |docTR (CPU) - v0.8.1 | ~1.29s / Page | ~0.60s / Page |
+ |**OnnxTR (CPU)** - v0.1.2 | ~0.57s / Page | **~0.25s / Page** |
+ |**OnnxTR (CPU) 8-bit** - v0.1.2 | **~0.38s / Page** | **~0.14s / Page** |
+ |EasyOCR (CPU) - v1.7.1 | ~1.96s / Page | ~1.75s / Page |
+ |**PyTesseract (CPU)** - v0.3.10 | **~0.50s / Page** | ~0.52s / Page |
+ |Surya (line) (CPU) - v0.4.4 | ~48.76s / Page | ~35.49s / Page |
+ |PaddleOCR (CPU) - no cls - v2.7.3| ~1.27s / Page | ~0.38s / Page |

- MORE BENCHMARKS COMING SOON
+ - GPU benchmarks:

- |Dataset |docTR (CPU) - v0.8.1 |OnnxTR (CPU) - v0.1.1 |
- |--------------------------------|-------------------------------|-------------------------------|
- |FUNSD (199 pages) | ~1.29s / Page | ~0.57s / Page |
- |CORD (900 pages) | ~0.60s / Page | ~0.25s / Page |
+ |Library |FUNSD (199 pages) |CORD (900 pages) |
+ |-------------------------------------|-------------------------------|-------------------------------|
+ |docTR (GPU) - v0.8.1 | ~0.07s / Page | ~0.05s / Page |
+ |**docTR (GPU) float16** - v0.8.1 | **~0.06s / Page** | **~0.03s / Page** |
+ |OnnxTR (GPU) - v0.1.2 | **~0.06s / Page** | ~0.04s / Page |
+ |EasyOCR (GPU) - v1.7.1 | ~0.31s / Page | ~0.19s / Page |
+ |Surya (GPU) float16 - v0.4.4 | ~3.70s / Page | ~2.81s / Page |
+ |**PaddleOCR (GPU) - no cls - v2.7.3**| ~0.08s / Page | **~0.03s / Page** |

  ## Citation

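Both copies of the README reference an `xml_output` loop in the hunk context above. A sketch of the surrounding export calls (assuming OnnxTR keeps docTR's `export()` / `export_as_xml()` result API, where the XML export yields one `(bytes, ElementTree)` pair per page):

```python
from onnxtr.io import DocumentFile
from onnxtr.models import ocr_predictor

model = ocr_predictor()
doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
result = model(doc)

json_output = result.export()        # nested dict: pages -> blocks -> lines -> words
xml_output = result.export_as_xml()  # assumed: list of (xml bytes, ElementTree) tuples, one per page
for xml_bytes, xml_tree in xml_output:
    xml_tree.write("page_hocr.xml")  # hOCR-style XML, see the Page XML export in elements.py below
```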
{onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/io/elements.py

@@ -67,10 +67,11 @@ class Word(Element):
  confidence: the confidence associated with the text prediction
  geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
  the page's size
+ objectness_score: the objectness score of the detection
  crop_orientation: the general orientation of the crop in degrees and its confidence
  """

- _exported_keys: List[str] = ["value", "confidence", "geometry", "crop_orientation"]
+ _exported_keys: List[str] = ["value", "confidence", "geometry", "objectness_score", "crop_orientation"]
  _children_names: List[str] = []

  def __init__(
@@ -78,12 +79,14 @@ class Word(Element):
  value: str,
  confidence: float,
  geometry: Union[BoundingBox, np.ndarray],
+ objectness_score: float,
  crop_orientation: Dict[str, Any],
  ) -> None:
  super().__init__()
  self.value = value
  self.confidence = confidence
  self.geometry = geometry
+ self.objectness_score = objectness_score
  self.crop_orientation = crop_orientation

  def render(self) -> str:
@@ -143,7 +146,7 @@ class Line(Element):
  all words in it.
  """

- _exported_keys: List[str] = ["geometry"]
+ _exported_keys: List[str] = ["geometry", "objectness_score"]
  _children_names: List[str] = ["words"]
  words: List[Word] = []

@@ -151,7 +154,11 @@
  self,
  words: List[Word],
  geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
+ objectness_score: Optional[float] = None,
  ) -> None:
+ # Compute the objectness score of the line
+ if objectness_score is None:
+ objectness_score = float(np.mean([w.objectness_score for w in words]))
  # Resolve the geometry using the smallest enclosing bounding box
  if geometry is None:
  # Check whether this is a rotated or straight box
@@ -160,6 +167,7 @@

  super().__init__(words=words)
  self.geometry = geometry
+ self.objectness_score = objectness_score

  def render(self) -> str:
  """Renders the full text of the element"""
@@ -186,7 +194,7 @@ class Block(Element):
  all lines and artefacts in it.
  """

- _exported_keys: List[str] = ["geometry"]
+ _exported_keys: List[str] = ["geometry", "objectness_score"]
  _children_names: List[str] = ["lines", "artefacts"]
  lines: List[Line] = []
  artefacts: List[Artefact] = []
@@ -196,7 +204,11 @@
  lines: List[Line] = [],
  artefacts: List[Artefact] = [],
  geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
+ objectness_score: Optional[float] = None,
  ) -> None:
+ # Compute the objectness score of the line
+ if objectness_score is None:
+ objectness_score = float(np.mean([w.objectness_score for line in lines for w in line.words]))
  # Resolve the geometry using the smallest enclosing bounding box
  if geometry is None:
  line_boxes = [word.geometry for line in lines for word in line.words]
@@ -208,6 +220,7 @@

  super().__init__(lines=lines, artefacts=artefacts)
  self.geometry = geometry
+ self.objectness_score = objectness_score

  def render(self, line_break: str = "\n") -> str:
  """Renders the full text of the element"""
@@ -314,7 +327,7 @@ class Page(Element):
  SubElement(
  head,
  "meta",
- attrib={"name": "ocr-system", "content": f" {onnxtr.__version__}"}, # type: ignore[attr-defined]
+ attrib={"name": "ocr-system", "content": f"onnxtr {onnxtr.__version__}"}, # type: ignore[attr-defined]
  )
  SubElement(
  head,
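Because `objectness_score` is added to `_exported_keys` for `Word`, `Line` and `Block`, it also surfaces in the JSON export. A sketch of reading it back (export layout assumed to follow the usual pages → blocks → lines → words nesting):

```python
from onnxtr.io import DocumentFile
from onnxtr.models import ocr_predictor

model = ocr_predictor()
doc = DocumentFile.from_images(["path/to/page1.jpg"])
result = model(doc)

page = result.export()["pages"][0]
for block in page["blocks"]:
    for line in block["lines"]:
        # Line/Block scores default to the mean of their words' scores (see the hunks above)
        print(line["objectness_score"], [word["objectness_score"] for word in line["words"]])
```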
{onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/io/pdf.py

@@ -15,7 +15,7 @@ __all__ = ["read_pdf"]

  def read_pdf(
  file: AbstractFile,
- scale: float = 2,
+ scale: int = 2,
  rgb_mode: bool = True,
  password: Optional[str] = None,
  **kwargs: Any,
@@ -38,5 +38,8 @@ def read_pdf(
  the list of pages decoded as numpy ndarray of shape H x W x C
  """
  # Rasterise pages to numpy ndarrays with pypdfium2
- pdf = pdfium.PdfDocument(file, password=password, autoclose=True)
- return [page.render(scale=scale, rev_byteorder=rgb_mode, **kwargs).to_numpy() for page in pdf]
+ pdf = pdfium.PdfDocument(file, password=password)
+ try:
+ return [page.render(scale=scale, rev_byteorder=rgb_mode, **kwargs).to_numpy() for page in pdf]
+ finally:
+ pdf.close()
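For reference, a minimal sketch of calling the updated `read_pdf` directly (module path as in this diff; `scale=2` renders at roughly 144 DPI, since pdfium's base scale of 1 corresponds to 72 DPI):

```python
from onnxtr.io.pdf import read_pdf

# Returns a list of H x W x C uint8 numpy arrays; the pypdfium2 document handle
# is now closed explicitly via the try/finally added above
pages = read_pdf("path/to/your/doc.pdf", scale=2, rgb_mode=True)
print(len(pages), pages[0].shape)
```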
{onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/__init__.py

@@ -1,3 +1,4 @@
+ from .engine import EngineConfig
  from .classification import *
  from .detection import *
  from .recognition import *
{onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/_utils.py

@@ -11,6 +11,8 @@ import cv2
  import numpy as np
  from langdetect import LangDetectException, detect_langs

+ from onnxtr.utils.geometry import rotate_image
+
  __all__ = ["estimate_orientation", "get_language"]


@@ -29,56 +31,91 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float:
  return max(w / h, h / w)


- def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_lines: float = 5) -> int:
+ def estimate_orientation(
+ img: np.ndarray,
+ general_page_orientation: Optional[Tuple[int, float]] = None,
+ n_ct: int = 70,
+ ratio_threshold_for_lines: float = 3,
+ min_confidence: float = 0.2,
+ lower_area: int = 100,
+ ) -> int:
  """Estimate the angle of the general document orientation based on the
  lines of the document and the assumption that they should be horizontal.

  Args:
  ----
  img: the img or bitmap to analyze (H, W, C)
+ general_page_orientation: the general orientation of the page (angle [0, 90, 180, 270 (-90)], confidence)
+ estimated by a model
  n_ct: the number of contours used for the orientation estimation
  ratio_threshold_for_lines: this is the ratio w/h used to discriminates lines
+ min_confidence: the minimum confidence to consider the general_page_orientation
+ lower_area: the minimum area of a contour to be considered

  Returns:
  -------
- the angle of the general document orientation
+ the estimated angle of the page (clockwise, negative for left side rotation, positive for right side rotation)
  """
  assert len(img.shape) == 3 and img.shape[-1] in [1, 3], f"Image shape {img.shape} not supported"
- max_value = np.max(img)
- min_value = np.min(img)
- if max_value <= 1 and min_value >= 0 or (max_value <= 255 and min_value >= 0 and img.shape[-1] == 1):
- thresh = img.astype(np.uint8)
- if max_value <= 255 and min_value >= 0 and img.shape[-1] == 3:
+ thresh = None
+ # Convert image to grayscale if necessary
+ if img.shape[-1] == 3:
  gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  gray_img = cv2.medianBlur(gray_img, 5)
- thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] # type: ignore[assignment]
-
- # try to merge words in lines
- (h, w) = img.shape[:2]
- k_x = max(1, (floor(w / 100)))
- k_y = max(1, (floor(h / 100)))
- kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k_x, k_y))
- thresh = cv2.dilate(thresh, kernel, iterations=1) # type: ignore[assignment]
+ thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+ else:
+ thresh = img.astype(np.uint8) # type: ignore[assignment]
+
+ page_orientation, orientation_confidence = general_page_orientation or (None, 0.0)
+ if page_orientation and orientation_confidence >= min_confidence:
+ # We rotate the image to the general orientation which improves the detection
+ # No expand needed bitmap is already padded
+ thresh = rotate_image(thresh, -page_orientation) # type: ignore
+ else: # That's only required if we do not work on the detection models bin map
+ # try to merge words in lines
+ (h, w) = img.shape[:2]
+ k_x = max(1, (floor(w / 100)))
+ k_y = max(1, (floor(h / 100)))
+ kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k_x, k_y))
+ thresh = cv2.dilate(thresh, kernel, iterations=1)

  # extract contours
  contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

- # Sort contours
- contours = sorted(contours, key=get_max_width_length_ratio, reverse=True)
+ # Filter & Sort contours
+ contours = sorted(
+ [contour for contour in contours if cv2.contourArea(contour) > lower_area],
+ key=get_max_width_length_ratio,
+ reverse=True,
+ )

  angles = []
  for contour in contours[:n_ct]:
- _, (w, h), angle = cv2.minAreaRect(contour)
+ _, (w, h), angle = cv2.minAreaRect(contour) # type: ignore[assignment]
  if w / h > ratio_threshold_for_lines: # select only contours with ratio like lines
  angles.append(angle)
  elif w / h < 1 / ratio_threshold_for_lines: # if lines are vertical, substract 90 degree
  angles.append(angle - 90)

  if len(angles) == 0:
- return 0 # in case no angles is found
+ estimated_angle = 0 # in case no angles is found
  else:
  median = -median_low(angles)
- return round(median) if abs(median) != 0 else 0
+ estimated_angle = -round(median) if abs(median) != 0 else 0
+
+ # combine with the general orientation and the estimated angle
+ if page_orientation and orientation_confidence >= min_confidence:
+ # special case where the estimated angle is mostly wrong:
+ # case 1: - and + swapped
+ # case 2: estimated angle is completely wrong
+ # so in this case we prefer the general page orientation
+ if abs(estimated_angle) == abs(page_orientation):
+ return page_orientation
+ estimated_angle = estimated_angle if page_orientation == 0 else page_orientation + estimated_angle
+ if estimated_angle > 180:
+ estimated_angle -= 360
+
+ return estimated_angle # return the clockwise angle (negative - left side rotation, positive - right side rotation)


  def rectify_crops(
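A sketch of calling the reworked `estimate_orientation`, with and without a model-predicted page orientation (the blank page array is only a placeholder):

```python
import numpy as np

from onnxtr.models._utils import estimate_orientation

page = np.zeros((1024, 768, 3), dtype=np.uint8)  # placeholder page image (H, W, C)

# Pure contour-based estimate, using the new n_ct / ratio_threshold_for_lines / lower_area defaults
angle = estimate_orientation(page)

# With a prior from an orientation classifier: (angle in [0, 90, 180, 270 (-90)], confidence);
# priors below min_confidence are ignored, otherwise they are combined with the contour estimate
angle = estimate_orientation(page, general_page_orientation=(90, 0.85), min_confidence=0.2)
print(angle)  # clockwise angle: negative = left-side rotation, positive = right-side rotation
```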
{onnxtr-0.1.2 → onnxtr-0.3.0}/onnxtr/models/builder.py

@@ -31,7 +31,7 @@ class DocumentBuilder(NestedObject):
  def __init__(
  self,
  resolve_lines: bool = True,
- resolve_blocks: bool = True,
+ resolve_blocks: bool = False,
  paragraph_break: float = 0.035,
  export_as_straight_boxes: bool = False,
  ) -> None:
@@ -223,6 +223,7 @@ class DocumentBuilder(NestedObject):
  def _build_blocks(
  self,
  boxes: np.ndarray,
+ objectness_scores: np.ndarray,
  word_preds: List[Tuple[str, float]],
  crop_orientations: List[Dict[str, Any]],
  ) -> List[Block]:
@@ -230,7 +231,8 @@

  Args:
  ----
- boxes: bounding boxes of all detected words of the page, of shape (N, 5) or (N, 4, 2)
+ boxes: bounding boxes of all detected words of the page, of shape (N, 4) or (N, 4, 2)
+ objectness_scores: objectness scores of all detected words of the page, of shape N
  word_preds: list of all detected words of the page, of shape N
  crop_orientations: list of dictoinaries containing
  the general orientation (orientations + confidences) of the crops
@@ -265,12 +267,14 @@
  Word(
  *word_preds[idx],
  tuple([tuple(pt) for pt in boxes[idx].tolist()]), # type: ignore[arg-type]
+ float(objectness_scores[idx]),
  crop_orientations[idx],
  )
  if boxes.ndim == 3
  else Word(
  *word_preds[idx],
  ((boxes[idx, 0], boxes[idx, 1]), (boxes[idx, 2], boxes[idx, 3])),
+ float(objectness_scores[idx]),
  crop_orientations[idx],
  )
  for idx in line
@@ -293,6 +297,7 @@
  self,
  pages: List[np.ndarray],
  boxes: List[np.ndarray],
+ objectness_scores: List[np.ndarray],
  text_preds: List[List[Tuple[str, float]]],
  page_shapes: List[Tuple[int, int]],
  crop_orientations: List[Dict[str, Any]],
@@ -304,8 +309,9 @@
  Args:
  ----
  pages: list of N elements, where each element represents the page image
- boxes: list of N elements, where each element represents the localization predictions, of shape (*, 5)
- or (*, 6) for all words for a given page
+ boxes: list of N elements, where each element represents the localization predictions, of shape (*, 4)
+ or (*, 4, 2) for all words for a given page
+ objectness_scores: list of N elements, where each element represents the objectness scores
  text_preds: list of N elements, where each element is the list of all word prediction (text + confidence)
  page_shapes: shape of each page, of size N
  crop_orientations: list of N elements, where each element is
@@ -319,9 +325,9 @@
  -------
  document object
  """
- if len(boxes) != len(text_preds) != len(crop_orientations) or len(boxes) != len(page_shapes) != len(
- crop_orientations
- ):
+ if len(boxes) != len(text_preds) != len(crop_orientations) != len(objectness_scores) or len(boxes) != len(
+ page_shapes
+ ) != len(crop_orientations) != len(objectness_scores):
  raise ValueError("All arguments are expected to be lists of the same size")

  _orientations = (
@@ -339,6 +345,7 @@
  page,
  self._build_blocks(
  page_boxes,
+ loc_scores,
  word_preds,
  word_crop_orientations,
  ),
@@ -347,8 +354,16 @@
  orientation,
  language,
  )
- for page, _idx, shape, page_boxes, word_preds, word_crop_orientations, orientation, language in zip(
- pages, range(len(boxes)), page_shapes, boxes, text_preds, crop_orientations, _orientations, _languages
+ for page, _idx, shape, page_boxes, loc_scores, word_preds, word_crop_orientations, orientation, language in zip( # noqa: E501
+ pages,
+ range(len(boxes)),
+ page_shapes,
+ boxes,
+ objectness_scores,
+ text_preds,
+ crop_orientations,
+ _orientations,
+ _languages,
  )
  ]
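To show how the new `objectness_scores` argument propagates into the elements, a sketch constructing them by hand in the same positional order `_build_blocks` uses, i.e. `Word(value, confidence, geometry, objectness_score, crop_orientation)`; the geometry values and the crop-orientation dict keys are illustrative:

```python
from onnxtr.io.elements import Line, Word

words = [
    Word("Hello", 0.99, ((0.10, 0.10), (0.25, 0.14)), 0.90, {"value": 0, "confidence": 1.0}),
    Word("world", 0.97, ((0.27, 0.10), (0.40, 0.14)), 0.80, {"value": 0, "confidence": 1.0}),
]

line = Line(words)  # objectness_score defaults to the mean of the word scores (0.90 and 0.80)
print(line.objectness_score, line.render())
```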