deepdoctection 0.32.tar.gz → 0.33.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. See the package registry page for more details.

Files changed (154)
  1. {deepdoctection-0.32 → deepdoctection-0.33}/PKG-INFO +4 -4
  2. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/__init__.py +3 -23
  3. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/analyzer/dd.py +47 -42
  4. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/common.py +9 -5
  5. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/custom.py +5 -5
  6. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/custom_serialize.py +75 -18
  7. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/parallel_map.py +3 -3
  8. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/serialize.py +4 -4
  9. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/stats.py +3 -3
  10. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/annotation.py +39 -55
  11. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/box.py +7 -7
  12. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/convert.py +6 -6
  13. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/image.py +43 -37
  14. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/view.py +175 -151
  15. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/adapter.py +30 -24
  16. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/base.py +9 -9
  17. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/dataflow_builder.py +3 -3
  18. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/info.py +23 -25
  19. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/doclaynet.py +48 -49
  20. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/fintabnet.py +44 -45
  21. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/funsd.py +23 -23
  22. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/iiitar13k.py +8 -8
  23. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/layouttest.py +2 -2
  24. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/publaynet.py +3 -3
  25. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/pubtables1m.py +18 -18
  26. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/pubtabnet.py +30 -29
  27. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/rvlcdip.py +28 -29
  28. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/xfund.py +24 -25
  29. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/save.py +6 -6
  30. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/accmetric.py +32 -33
  31. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/base.py +8 -9
  32. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/cocometric.py +13 -12
  33. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/eval.py +26 -26
  34. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/tedsmetric.py +16 -12
  35. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/tp_eval_callback.py +7 -16
  36. deepdoctection-0.33/deepdoctection/extern/base.py +644 -0
  37. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/d2detect.py +69 -89
  38. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/deskew.py +11 -10
  39. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/doctrocr.py +81 -64
  40. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/fastlang.py +23 -16
  41. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/hfdetr.py +53 -38
  42. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/hflayoutlm.py +216 -155
  43. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/hflm.py +35 -30
  44. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/model.py +432 -255
  45. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/pdftext.py +15 -15
  46. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/pt/ptutils.py +4 -2
  47. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tessocr.py +39 -38
  48. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/texocr.py +14 -16
  49. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tfutils.py +16 -2
  50. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpcompat.py +11 -7
  51. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
  52. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
  53. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
  54. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
  55. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
  56. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
  57. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
  58. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tpdetect.py +40 -45
  59. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/cats.py +27 -29
  60. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/cocostruct.py +10 -10
  61. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/d2struct.py +20 -21
  62. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/hfstruct.py +7 -7
  63. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/laylmstruct.py +22 -24
  64. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/maputils.py +9 -10
  65. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/match.py +2 -2
  66. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/misc.py +5 -6
  67. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/pascalstruct.py +4 -4
  68. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/prodigystruct.py +5 -5
  69. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/pubstruct.py +84 -92
  70. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/tpstruct.py +3 -3
  71. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/xfundstruct.py +33 -33
  72. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/anngen.py +12 -14
  73. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/base.py +52 -106
  74. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/common.py +63 -52
  75. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/concurrency.py +14 -10
  76. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/doctectionpipe.py +24 -21
  77. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/language.py +20 -25
  78. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/layout.py +18 -16
  79. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/lm.py +49 -47
  80. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/order.py +63 -65
  81. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/refine.py +102 -109
  82. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/segment.py +156 -161
  83. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/sub_layout.py +49 -39
  84. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/text.py +37 -36
  85. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/transform.py +19 -16
  86. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/train/d2_frcnn_train.py +27 -25
  87. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/train/hf_detr_train.py +22 -18
  88. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/train/hf_layoutlm_train.py +49 -48
  89. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/train/tp_frcnn_train.py +10 -11
  90. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/concurrency.py +1 -1
  91. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/context.py +13 -6
  92. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/develop.py +4 -4
  93. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/env_info.py +51 -13
  94. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/file_utils.py +6 -11
  95. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/fs.py +22 -18
  96. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/identifier.py +2 -2
  97. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/logger.py +15 -15
  98. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/metacfg.py +7 -7
  99. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/pdf_utils.py +11 -11
  100. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/settings.py +185 -182
  101. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/tqdm.py +1 -1
  102. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/transform.py +14 -9
  103. deepdoctection-0.33/deepdoctection/utils/types.py +104 -0
  104. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/utils.py +7 -7
  105. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/viz.py +70 -69
  106. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection.egg-info/PKG-INFO +4 -4
  107. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection.egg-info/SOURCES.txt +1 -1
  108. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection.egg-info/requires.txt +3 -3
  109. {deepdoctection-0.32 → deepdoctection-0.33}/setup.cfg +3 -0
  110. {deepdoctection-0.32 → deepdoctection-0.33}/setup.py +1 -1
  111. deepdoctection-0.32/deepdoctection/extern/base.py +0 -439
  112. deepdoctection-0.32/deepdoctection/utils/detection_types.py +0 -68
  113. {deepdoctection-0.32 → deepdoctection-0.33}/LICENSE +0 -0
  114. {deepdoctection-0.32 → deepdoctection-0.33}/README.md +0 -0
  115. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/analyzer/__init__.py +0 -0
  116. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/configs/__init__.py +0 -0
  117. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/configs/conf_dd_one.yaml +0 -0
  118. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/configs/conf_tesseract.yaml +0 -0
  119. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/__init__.py +0 -0
  120. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/base.py +0 -0
  121. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/__init__.py +0 -0
  122. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/__init__.py +0 -0
  123. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/__init__.py +0 -0
  124. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
  125. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
  126. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/registry.py +0 -0
  127. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/__init__.py +0 -0
  128. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/registry.py +0 -0
  129. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/__init__.py +0 -0
  130. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/pt/__init__.py +0 -0
  131. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/pt/nms.py +0 -0
  132. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/__init__.py +0 -0
  133. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
  134. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
  135. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
  136. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
  137. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
  138. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
  139. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
  140. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
  141. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
  142. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
  143. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
  144. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/__init__.py +0 -0
  145. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/__init__.py +0 -0
  146. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/registry.py +0 -0
  147. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/py.typed +0 -0
  148. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/train/__init__.py +0 -0
  149. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/__init__.py +0 -0
  150. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/error.py +0 -0
  151. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/mocks.py +0 -0
  152. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection.egg-info/dependency_links.txt +0 -0
  153. {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection.egg-info/top_level.txt +0 -0
  154. {deepdoctection-0.32 → deepdoctection-0.33}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepdoctection
3
- Version: 0.32
3
+ Version: 0.33
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -23,7 +23,7 @@ Requires-Dist: jsonlines==3.1.0
23
23
  Requires-Dist: lazy-imports==0.3.1
24
24
  Requires-Dist: mock==4.0.3
25
25
  Requires-Dist: networkx>=2.7.1
26
- Requires-Dist: numpy>=1.21
26
+ Requires-Dist: numpy<2.0,>=1.21
27
27
  Requires-Dist: packaging>=20.0
28
28
  Requires-Dist: Pillow>=10.0.0
29
29
  Requires-Dist: pypdf>=3.16.0
@@ -40,7 +40,7 @@ Requires-Dist: jsonlines==3.1.0; extra == "tf"
40
40
  Requires-Dist: lazy-imports==0.3.1; extra == "tf"
41
41
  Requires-Dist: mock==4.0.3; extra == "tf"
42
42
  Requires-Dist: networkx>=2.7.1; extra == "tf"
43
- Requires-Dist: numpy>=1.21; extra == "tf"
43
+ Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
44
44
  Requires-Dist: packaging>=20.0; extra == "tf"
45
45
  Requires-Dist: Pillow>=10.0.0; extra == "tf"
46
46
  Requires-Dist: pypdf>=3.16.0; extra == "tf"
@@ -70,7 +70,7 @@ Requires-Dist: jsonlines==3.1.0; extra == "pt"
70
70
  Requires-Dist: lazy-imports==0.3.1; extra == "pt"
71
71
  Requires-Dist: mock==4.0.3; extra == "pt"
72
72
  Requires-Dist: networkx>=2.7.1; extra == "pt"
73
- Requires-Dist: numpy>=1.21; extra == "pt"
73
+ Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
74
74
  Requires-Dist: packaging>=20.0; extra == "pt"
75
75
  Requires-Dist: Pillow>=10.0.0; extra == "pt"
76
76
  Requires-Dist: pypdf>=3.16.0; extra == "pt"
@@ -25,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
25
25
 
26
26
  # pylint: enable=wrong-import-position
27
27
 
28
- __version__ = 0.32
28
+ __version__ = 0.33
29
29
 
30
30
  _IMPORT_STRUCTURE = {
31
31
  "analyzer": [
@@ -160,6 +160,8 @@ _IMPORT_STRUCTURE = {
160
160
  "EvalCallback",
161
161
  ],
162
162
  "extern": [
163
+ "ModelCategories",
164
+ "NerModelCategories",
163
165
  "PredictorBase",
164
166
  "DetectionResult",
165
167
  "ObjectDetector",
@@ -423,28 +425,6 @@ _IMPORT_STRUCTURE = {
423
425
  env_info = collect_env_info()
424
426
  logger.debug(LoggingRecord(msg=env_info))
425
427
 
426
- if os.environ.get("PYTORCH_AVAILABLE") and os.environ.get("DD_USE_TORCH") is None:
427
- os.environ["DD_USE_TORCH"] = "1"
428
- os.environ["USE_TORCH"] = "1"
429
- if os.environ.get("TENSORFLOW_AVAILABLE") and os.environ.get("DD_USE_TF") is None:
430
- os.environ["DD_USE_TF"] = "1"
431
- os.environ["USE_TF"] = "1"
432
- if os.environ.get("DD_USE_TORCH") and os.environ.get("DD_USE_TF"):
433
- logger.warning(
434
- "Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different "
435
- "behaviour, set DD_USE_TORCH to None before importing deepdoctection."
436
- )
437
- os.environ.pop("DD_USE_TF")
438
- os.environ.pop("USE_TF")
439
-
440
- if not os.environ.get("PYTORCH_AVAILABLE") and not os.environ.get("TENSORFLOW_AVAILABLE"):
441
- logger.warning(
442
- LoggingRecord(
443
- msg="Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
444
- "model from the library."
445
- )
446
- )
447
-
448
428
 
449
429
  # Direct imports for type-checking
450
430
  if TYPE_CHECKING:
@@ -23,10 +23,12 @@ Module for **deep**doctection analyzer.
23
23
  -user factory with a reduced config setting
24
24
  """
25
25
 
26
+ from __future__ import annotations
27
+
26
28
  import os
27
29
  from os import environ
28
30
  from shutil import copyfile
29
- from typing import List, Optional, Union
31
+ from typing import Optional, Union
30
32
 
31
33
  from lazy_imports import try_import
32
34
 
@@ -50,7 +52,7 @@ from ..pipe.refine import TableSegmentationRefinementService
50
52
  from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
51
53
  from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
52
54
  from ..pipe.text import TextExtractionService
53
- from ..utils.detection_types import Pathlike
55
+ from ..utils.env_info import ENV_VARS_TRUE
54
56
  from ..utils.error import DependencyError
55
57
  from ..utils.file_utils import detectron2_available, tensorpack_available
56
58
  from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
@@ -58,6 +60,7 @@ from ..utils.logger import LoggingRecord, logger
58
60
  from ..utils.metacfg import AttrDict, set_config_by_yaml
59
61
  from ..utils.settings import CellType, LayoutType
60
62
  from ..utils.transform import PadTransform
63
+ from ..utils.types import PathLikeOrStr
61
64
 
62
65
  with try_import() as image_guard:
63
66
  from botocore.config import Config # type: ignore
@@ -81,7 +84,7 @@ _TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
81
84
 
82
85
 
83
86
  def maybe_copy_config_to_cache(
84
- package_path: Pathlike, configs_dir_path: Pathlike, file_name: str, force_copy: bool = True
87
+ package_path: PathLikeOrStr, configs_dir_path: PathLikeOrStr, file_name: str, force_copy: bool = True
85
88
  ) -> str:
86
89
  """
87
90
  Initial copying of various files
@@ -115,7 +118,7 @@ def config_sanity_checks(cfg: AttrDict) -> None:
115
118
 
116
119
  def build_detector(
117
120
  cfg: AttrDict, mode: str
118
- ) -> Union["D2FrcnnDetector", "TPFrcnnDetector", "HFDetrDerivedDetector", "D2FrcnnTracingDetector"]:
121
+ ) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
119
122
  """Building a D2-Detector, a TP-Detector as Detr-Detector or a D2-Torch Tracing Detector according to
120
123
  the config
121
124
 
@@ -133,8 +136,8 @@ def build_detector(
133
136
  config_path = ModelCatalog.get_full_path_configs(weights)
134
137
  weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
135
138
  profile = ModelCatalog.get_profile(weights)
136
- categories = profile.categories
137
- assert categories is not None
139
+ categories = profile.categories if profile.categories is not None else {}
140
+
138
141
  if profile.model_wrapper in ("TPFrcnnDetector",):
139
142
  return TPFrcnnDetector(config_path, weights_path, categories, filter_categories=filter_categories)
140
143
  if profile.model_wrapper in ("D2FrcnnDetector",):
@@ -202,11 +205,13 @@ def build_sub_image_service(detector: ObjectDetector, cfg: AttrDict, mode: str)
202
205
  padder = None
203
206
  if mode == "ITEM":
204
207
  if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
205
- exclude_category_ids.extend(["1", "3", "4", "5", "6"])
208
+ exclude_category_ids.extend([1, 3, 4, 5, 6])
206
209
  padder = build_padder(cfg, mode)
207
- detect_result_generator = DetectResultGenerator(detector.categories, exclude_category_ids=exclude_category_ids)
210
+ detect_result_generator = DetectResultGenerator(
211
+ categories=detector.categories.categories, exclude_category_ids=exclude_category_ids
212
+ )
208
213
  return SubImageLayoutService(
209
- detector, [LayoutType.table, LayoutType.table_rotated], None, detect_result_generator, padder
214
+ detector, [LayoutType.TABLE, LayoutType.TABLE_ROTATED], None, detect_result_generator, padder
210
215
  )
211
216
 
212
217
 
@@ -233,9 +238,9 @@ def build_ocr(cfg: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer,
233
238
  )
234
239
  if cfg.OCR.USE_TEXTRACT:
235
240
  credentials_kwargs = {
236
- "aws_access_key_id": environ.get("ACCESS_KEY"),
237
- "aws_secret_access_key": environ.get("SECRET_KEY"),
238
- "config": Config(region_name=environ.get("REGION")),
241
+ "aws_access_key_id": environ.get("ACCESS_KEY", None),
242
+ "aws_secret_access_key": environ.get("SECRET_KEY", None),
243
+ "config": Config(region_name=environ.get("REGION", None)),
239
244
  }
240
245
  return TextractOcrDetector(**credentials_kwargs)
241
246
  raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
@@ -260,7 +265,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
260
265
  :param cfg: A configuration
261
266
  :return: Analyzer pipeline
262
267
  """
263
- pipe_component_list: List[PipelineComponent] = []
268
+ pipe_component_list: list[PipelineComponent] = []
264
269
 
265
270
  if cfg.USE_LAYOUT:
266
271
  d_layout = build_detector(cfg, "LAYOUT")
@@ -300,22 +305,22 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
300
305
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
301
306
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
302
307
  cfg.SEGMENTATION.CELL_CATEGORY_ID,
303
- LayoutType.table,
308
+ LayoutType.TABLE,
304
309
  [
305
- CellType.spanning,
306
- CellType.row_header,
307
- CellType.column_header,
308
- CellType.projected_row_header,
309
- LayoutType.cell,
310
+ CellType.SPANNING,
311
+ CellType.ROW_HEADER,
312
+ CellType.COLUMN_HEADER,
313
+ CellType.PROJECTED_ROW_HEADER,
314
+ LayoutType.CELL,
310
315
  ],
311
316
  [
312
- CellType.spanning,
313
- CellType.row_header,
314
- CellType.column_header,
315
- CellType.projected_row_header,
317
+ CellType.SPANNING,
318
+ CellType.ROW_HEADER,
319
+ CellType.COLUMN_HEADER,
320
+ CellType.PROJECTED_ROW_HEADER,
316
321
  ],
317
- [LayoutType.row, LayoutType.column],
318
- [CellType.row_number, CellType.column_number],
322
+ [LayoutType.ROW, LayoutType.COLUMN],
323
+ [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
319
324
  stretch_rule=cfg.SEGMENTATION.STRETCH_RULE,
320
325
  )
321
326
  pipe_component_list.append(pubtables)
@@ -327,23 +332,23 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
327
332
  cfg.SEGMENTATION.FULL_TABLE_TILING,
328
333
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
329
334
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
330
- LayoutType.table,
331
- [CellType.header, CellType.body, LayoutType.cell],
332
- [LayoutType.row, LayoutType.column],
333
- [CellType.row_number, CellType.column_number],
335
+ LayoutType.TABLE,
336
+ [CellType.HEADER, CellType.BODY, LayoutType.CELL],
337
+ [LayoutType.ROW, LayoutType.COLUMN],
338
+ [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
334
339
  cfg.SEGMENTATION.STRETCH_RULE,
335
340
  )
336
341
  pipe_component_list.append(table_segmentation)
337
342
 
338
343
  if cfg.USE_TABLE_REFINEMENT:
339
344
  table_segmentation_refinement = TableSegmentationRefinementService(
340
- [LayoutType.table, LayoutType.table_rotated],
345
+ [LayoutType.TABLE, LayoutType.TABLE_ROTATED],
341
346
  [
342
- LayoutType.cell,
343
- CellType.column_header,
344
- CellType.projected_row_header,
345
- CellType.spanning,
346
- CellType.row_header,
347
+ LayoutType.CELL,
348
+ CellType.COLUMN_HEADER,
349
+ CellType.PROJECTED_ROW_HEADER,
350
+ CellType.SPANNING,
351
+ CellType.ROW_HEADER,
347
352
  ],
348
353
  )
349
354
  pipe_component_list.append(table_segmentation_refinement)
@@ -363,7 +368,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
363
368
 
364
369
  ocr = build_ocr(cfg)
365
370
  skip_if_text_extracted = cfg.USE_PDF_MINER
366
- extract_from_roi = LayoutType.word if cfg.OCR.USE_DOCTR else None
371
+ extract_from_roi = LayoutType.WORD if cfg.OCR.USE_DOCTR else None
367
372
  text = TextExtractionService(
368
373
  ocr, skip_if_text_extracted=skip_if_text_extracted, extract_from_roi=extract_from_roi
369
374
  )
@@ -372,7 +377,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
372
377
  if cfg.USE_PDF_MINER or cfg.USE_OCR:
373
378
  match = MatchingService(
374
379
  parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
375
- child_categories=LayoutType.word,
380
+ child_categories=LayoutType.WORD,
376
381
  matching_rule=cfg.WORD_MATCHING.RULE,
377
382
  threshold=cfg.WORD_MATCHING.THRESHOLD,
378
383
  max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY,
@@ -380,7 +385,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
380
385
  pipe_component_list.append(match)
381
386
 
382
387
  order = TextOrderService(
383
- text_container=LayoutType.word,
388
+ text_container=LayoutType.WORD,
384
389
  text_block_categories=cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES,
385
390
  floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
386
391
  include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
@@ -392,7 +397,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
392
397
  pipe_component_list.append(order)
393
398
 
394
399
  page_parsing_service = PageParsingService(
395
- text_container=LayoutType.word,
400
+ text_container=LayoutType.WORD,
396
401
  floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
397
402
  include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
398
403
  )
@@ -403,8 +408,8 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
403
408
 
404
409
  def get_dd_analyzer(
405
410
  reset_config_file: bool = True,
406
- config_overwrite: Optional[List[str]] = None,
407
- path_config_file: Optional[Pathlike] = None,
411
+ config_overwrite: Optional[list[str]] = None,
412
+ path_config_file: Optional[PathLikeOrStr] = None,
408
413
  ) -> DoctectionPipe:
409
414
  """
410
415
  Factory function for creating the built-in **deep**doctection analyzer.
@@ -431,7 +436,7 @@ def get_dd_analyzer(
431
436
  :return: A DoctectionPipe instance with given configs
432
437
  """
433
438
  config_overwrite = [] if config_overwrite is None else config_overwrite
434
- lib = "TF" if os.environ.get("DD_USE_TF") else "PT"
439
+ lib = "TF" if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE else "PT"
435
440
  if lib == "TF":
436
441
  device = get_tf_device()
437
442
  elif lib == "PT":
@@ -12,7 +12,7 @@ Some DataFlow classes for transforming and processing datapoints. Many classes h
12
12
  """
13
13
  import itertools
14
14
  from copy import copy
15
- from typing import Any, Callable, Iterator, List, Union
15
+ from typing import Any, Callable, Iterator, Union
16
16
 
17
17
  import tqdm
18
18
 
@@ -164,6 +164,10 @@ class RepeatedData(ProxyDataFlow):
164
164
  Set to -1 to repeat ``ds`` infinite times.
165
165
  """
166
166
  self.num = num
167
+ if self.num != -1:
168
+ self.dfs = itertools.tee(df, self.num)
169
+ else:
170
+ self.dfs = ()
167
171
  super().__init__(df)
168
172
 
169
173
  def __len__(self) -> int:
@@ -180,8 +184,8 @@ class RepeatedData(ProxyDataFlow):
180
184
  while True:
181
185
  yield from self.df
182
186
  else:
183
- for _ in range(self.num):
184
- yield from self.df
187
+ for df in self.dfs:
188
+ yield from df
185
189
 
186
190
 
187
191
  class ConcatData(DataFlow):
@@ -197,7 +201,7 @@ class ConcatData(DataFlow):
197
201
  df = ConcatData([df_1,df_2])
198
202
  """
199
203
 
200
- def __init__(self, df_lists: List[DataFlow]) -> None:
204
+ def __init__(self, df_lists: list[DataFlow]) -> None:
201
205
  """
202
206
  :param df_lists: a list of DataFlow.
203
207
  """
@@ -233,7 +237,7 @@ class JoinData(DataFlow):
233
237
  `JoinData` will stop once the first Dataflow throws a StopIteration
234
238
  """
235
239
 
236
- def __init__(self, df_lists: List[DataFlow]) -> None:
240
+ def __init__(self, df_lists: list[DataFlow]) -> None:
237
241
  """
238
242
  :param df_lists: a list of DataFlow. When these dataflows have different sizes, JoinData will stop when any
239
243
  of them is exhausted.
@@ -21,7 +21,7 @@ from
21
21
 
22
22
  <https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/common.py>
23
23
  """
24
- from typing import Any, Callable, Iterable, Iterator, List, Optional
24
+ from typing import Any, Callable, Iterable, Iterator, Optional
25
25
 
26
26
  import numpy as np
27
27
 
@@ -54,7 +54,7 @@ class CacheData(ProxyDataFlow):
54
54
  :param shuffle: whether to shuffle the cache before yielding from it.
55
55
  """
56
56
  self.shuffle = shuffle
57
- self.buffer: List[Any] = []
57
+ self.buffer: list[Any] = []
58
58
  self._guard: Optional[DataFlowReentrantGuard] = None
59
59
  self.rng = get_rng(self)
60
60
  super().__init__(df)
@@ -78,7 +78,7 @@ class CacheData(ProxyDataFlow):
78
78
  yield dp
79
79
  self.buffer.append(dp)
80
80
 
81
- def get_cache(self) -> List[Any]:
81
+ def get_cache(self) -> list[Any]:
82
82
  """
83
83
  get the cache of the whole dataflow as a list
84
84
 
@@ -115,10 +115,10 @@ class CustomDataFromList(DataFromList):
115
115
 
116
116
  def __init__(
117
117
  self,
118
- lst: List[Any],
118
+ lst: list[Any],
119
119
  shuffle: bool = False,
120
120
  max_datapoints: Optional[int] = None,
121
- rebalance_func: Optional[Callable[[List[Any]], List[Any]]] = None,
121
+ rebalance_func: Optional[Callable[[list[Any]], list[Any]]] = None,
122
122
  ):
123
123
  """
124
124
  :param lst: the input list. Each element represents a datapoint.
@@ -19,23 +19,25 @@
19
19
  Methods that convert incoming data to dataflows.
20
20
  """
21
21
 
22
+ from __future__ import annotations
23
+
22
24
  import itertools
23
25
  import json
24
26
  import os
25
27
  from collections import defaultdict
26
28
  from pathlib import Path
27
- from typing import DefaultDict, Dict, List, Optional, Sequence, Union
29
+ from typing import Any, DefaultDict, Dict, Iterator, List, Optional, Sequence, TextIO, Union
28
30
 
29
31
  from jsonlines import Reader, Writer
30
32
  from tabulate import tabulate
31
33
  from termcolor import colored
32
34
 
33
35
  from ..utils.context import timed_operation
34
- from ..utils.detection_types import JsonDict, Pathlike
35
36
  from ..utils.error import FileExtensionError
36
37
  from ..utils.identifier import get_uuid_from_str
37
38
  from ..utils.pdf_utils import PDFStreamer
38
39
  from ..utils.tqdm import get_tqdm
40
+ from ..utils.types import JsonDict, PathLikeOrStr
39
41
  from ..utils.utils import is_file_extension
40
42
  from .base import DataFlow
41
43
  from .common import FlattenData, JoinData, MapData
@@ -53,6 +55,59 @@ def _reset_df_and_get_length(df: DataFlow) -> int:
53
55
  return length
54
56
 
55
57
 
58
+ class FileClosingIterator:
59
+ """
60
+ A custom iterator that closes the file object once the iteration is complete.
61
+
62
+ This iterator is used to ensure that the file object is properly closed after
63
+ reading the data from it. It is used in the context of reading data from a file
64
+ in a streaming manner, where the data is not loaded into memory all at once.
65
+
66
+ **Example:**
67
+
68
+ file = open(path, "r")
69
+ iterator = Reader(file)
70
+ closing_iterator = FileClosingIterator(file, iter(iterator))
71
+
72
+ df = CustomDataFromIterable(closing_iterator, max_datapoints=max_datapoints) # set up a dataflow
73
+
74
+ """
75
+
76
+ def __init__(self, file_obj: TextIO, iterator: Iterator[Any]):
77
+ """
78
+ Initializes the FileClosingIterator with a file object and its iterator.
79
+
80
+ :param file_obj (TextIO): The file object to read data from.
81
+ :param iterator (Iterator): The actual iterator of the file object.
82
+ """
83
+ self.file_obj = file_obj
84
+ self.iterator = iterator
85
+
86
+ def __iter__(self) -> FileClosingIterator:
87
+ """
88
+ Returns the iterator object itself.
89
+
90
+ :return: FileClosingIterator: The instance of the class itself.
91
+ """
92
+ return self
93
+
94
+ def __next__(self) -> Any:
95
+ """
96
+ Returns the next item from the file object's iterator.
97
+ Closes the file object if the iteration is finished.
98
+
99
+ :return: The next item from the file object's iterator.
100
+
101
+ Raises:
102
+ StopIteration: If there are no more items to return.
103
+ """
104
+ try:
105
+ return next(self.iterator)
106
+ except StopIteration as exc:
107
+ self.file_obj.close()
108
+ raise StopIteration from exc
109
+
110
+
56
111
  class SerializerJsonlines:
57
112
  """
58
113
  Serialize a dataflow from a jsonlines file. Alternatively, save a dataflow of JSON objects to a .jsonl file.
@@ -66,7 +121,7 @@ class SerializerJsonlines:
66
121
  """
67
122
 
68
123
  @staticmethod
69
- def load(path: Pathlike, max_datapoints: Optional[int] = None) -> CustomDataFromIterable:
124
+ def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> CustomDataFromIterable:
70
125
  """
71
126
  :param path: a path to a .jsonl file.
72
127
  :param max_datapoints: Will stop the iteration once max_datapoints have been streamed
@@ -75,10 +130,11 @@ class SerializerJsonlines:
75
130
  """
76
131
  file = open(path, "r") # pylint: disable=W1514,R1732
77
132
  iterator = Reader(file)
78
- return CustomDataFromIterable(iterator, max_datapoints=max_datapoints)
133
+ closing_iterator = FileClosingIterator(file, iter(iterator))
134
+ return CustomDataFromIterable(closing_iterator, max_datapoints=max_datapoints)
79
135
 
80
136
  @staticmethod
81
- def save(df: DataFlow, path: Pathlike, file_name: str, max_datapoints: Optional[int] = None) -> None:
137
+ def save(df: DataFlow, path: PathLikeOrStr, file_name: str, max_datapoints: Optional[int] = None) -> None:
82
138
  """
83
139
  Writes a dataflow iteratively to a .jsonl file. Every datapoint must be a dict where all items are serializable.
84
140
  As the length of the dataflow cannot be determined in every case max_datapoint prevents generating an
@@ -120,7 +176,7 @@ class SerializerTabsepFiles:
120
176
  """
121
177
 
122
178
  @staticmethod
123
- def load(path: Pathlike, max_datapoins: Optional[int] = None) -> CustomDataFromList:
179
+ def load(path: PathLikeOrStr, max_datapoins: Optional[int] = None) -> CustomDataFromList:
124
180
  """
125
181
  :param path: a path to a .txt file.
126
182
  :param max_datapoins: Will stop the iteration once max_datapoints have been streamed
@@ -133,7 +189,7 @@ class SerializerTabsepFiles:
133
189
  return CustomDataFromList(file_list, max_datapoints=max_datapoins)
134
190
 
135
191
  @staticmethod
136
- def save(df: DataFlow, path: Pathlike, file_name: str, max_datapoints: Optional[int] = None) -> None:
192
+ def save(df: DataFlow, path: PathLikeOrStr, file_name: str, max_datapoints: Optional[int] = None) -> None:
137
193
  """
138
194
  Writes a dataflow iteratively to a .txt file. Every datapoint must be a string.
139
195
  As the length of the dataflow cannot be determined in every case max_datapoint prevents generating an
@@ -168,7 +224,7 @@ class SerializerFiles:
168
224
 
169
225
  @staticmethod
170
226
  def load(
171
- path: Pathlike,
227
+ path: PathLikeOrStr,
172
228
  file_type: Union[str, Sequence[str]],
173
229
  max_datapoints: Optional[int] = None,
174
230
  shuffle: Optional[bool] = False,
@@ -190,15 +246,14 @@ class SerializerFiles:
190
246
  df2: DataFlow
191
247
  df3: DataFlow
192
248
 
193
- if isinstance(path, str):
194
- path = Path(path)
249
+ path = Path(path)
195
250
  if not path.exists():
196
251
  raise NotADirectoryError(f"The path {path} to the directory or file does not exist")
197
252
 
198
253
  if shuffle:
199
254
  sort = False
200
- it1 = os.walk(path, topdown=False)
201
- it2 = os.walk(path, topdown=False)
255
+ it1 = os.walk(os.fspath(path), topdown=False)
256
+ it2 = os.walk(os.fspath(path), topdown=False)
202
257
  df1 = CustomDataFromIterable(it1)
203
258
  df2 = CustomDataFromIterable(it2)
204
259
  df1 = MapData(df1, lambda dp: None if len(dp[2]) == 0 else dp)
@@ -237,7 +292,7 @@ class CocoParser:
237
292
  :param annotation_file: location of annotation file
238
293
  """
239
294
 
240
- def __init__(self, annotation_file: Optional[Pathlike] = None) -> None:
295
+ def __init__(self, annotation_file: Optional[PathLikeOrStr] = None) -> None:
241
296
  self.dataset: JsonDict = {}
242
297
  self.anns: Dict[int, JsonDict] = {}
243
298
  self.cats: Dict[int, JsonDict] = {}
@@ -465,7 +520,7 @@ class SerializerCoco:
465
520
  """
466
521
 
467
522
  @staticmethod
468
- def load(path: Pathlike, max_datapoints: Optional[int] = None) -> DataFlow:
523
+ def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
469
524
  """
470
525
  Loads a .json file and generates a dataflow.
471
526
 
@@ -478,7 +533,7 @@ class SerializerCoco:
478
533
 
479
534
  {'image':{'id',...},'annotations':[{'id':…,'bbox':...}]}
480
535
 
481
- for each single image id.
536
+ for each image id. We use the type hint CocoDatapointDict to describe this dictionary
482
537
 
483
538
  :param max_datapoints: Will stop the iteration once max_datapoints have been streamed.
484
539
  :param path: a path to a .json file.
@@ -525,7 +580,7 @@ class SerializerPdfDoc:
525
580
  """
526
581
 
527
582
  @staticmethod
528
- def load(path: Pathlike, max_datapoints: Optional[int] = None) -> DataFlow:
583
+ def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
529
584
  """
530
585
  Loads the document page wise and returns a dataflow accordingly.
531
586
 
@@ -552,14 +607,16 @@ class SerializerPdfDoc:
552
607
  return df
553
608
 
554
609
  @staticmethod
555
- def save(path: Pathlike) -> None:
610
+ def save(path: PathLikeOrStr) -> None:
556
611
  """
557
612
  Not implemented
558
613
  """
559
614
  raise NotImplementedError()
560
615
 
561
616
  @staticmethod
562
- def split(path: Pathlike, path_target: Optional[Pathlike] = None, max_datapoint: Optional[int] = None) -> None:
617
+ def split(
618
+ path: PathLikeOrStr, path_target: Optional[PathLikeOrStr] = None, max_datapoint: Optional[int] = None
619
+ ) -> None:
563
620
  """
564
621
  Split a document into single pages.
565
622
  """
@@ -23,7 +23,7 @@ import uuid
23
23
  import weakref
24
24
  from abc import ABC, abstractmethod
25
25
  from contextlib import contextmanager
26
- from typing import Any, Callable, Iterator, List, no_type_check
26
+ from typing import Any, Callable, Iterator, no_type_check
27
27
 
28
28
  import zmq
29
29
 
@@ -236,7 +236,7 @@ class MultiThreadMapData(_ParallelMapData):
236
236
  self._strict = strict
237
237
  self.num_thread = num_thread
238
238
  self.map_func = map_func
239
- self._threads: List[Any] = []
239
+ self._threads: list[Any] = []
240
240
  self._evt = None
241
241
 
242
242
  def reset_state(self) -> None:
@@ -284,7 +284,7 @@ class _MultiProcessZMQDataFlow(DataFlow, ABC):
284
284
  if os.name == "nt":
285
285
  raise EnvironmentError("ZMQ IPC doesn't support windows")
286
286
  self._reset_done = False
287
- self._procs: List[Any] = []
287
+ self._procs: list[Any] = []
288
288
  self.context = None
289
289
  self.socket = None
290
290
 
@@ -12,7 +12,7 @@ Some DataFlow classes for serialization. Many classes have been taken from
12
12
 
13
13
  import pickle
14
14
  from copy import copy
15
- from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union
15
+ from typing import Any, Iterable, Iterator, Optional, Union
16
16
 
17
17
  import numpy as np
18
18
 
@@ -23,7 +23,7 @@ from .base import DataFlow, RNGDataFlow
23
23
  class DataFromList(RNGDataFlow):
24
24
  """Wrap a list of datapoints to a DataFlow"""
25
25
 
26
- def __init__(self, lst: List[Any], shuffle: bool = True) -> None:
26
+ def __init__(self, lst: list[Any], shuffle: bool = True) -> None:
27
27
  """
28
28
  :param lst: input list. Each element is a datapoint.
29
29
  :param shuffle: shuffle data.
@@ -79,11 +79,11 @@ class FakeData(RNGDataFlow):
79
79
 
80
80
  def __init__(
81
81
  self,
82
- shapes: List[Union[List[Any], Tuple[Any]]],
82
+ shapes: list[Union[list[Any], tuple[Any]]],
83
83
  size: int = 1000,
84
84
  random: bool = True,
85
85
  dtype: str = "float32",
86
- domain: Tuple[Union[float, int], Union[float, int]] = (0, 1),
86
+ domain: tuple[Union[float, int], Union[float, int]] = (0, 1),
87
87
  ):
88
88
  """
89
89
  :param shapes: a list of lists/tuples. Shapes of each component.