deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.

Files changed (111)
  1. deepdoctection/__init__.py +8 -25
  2. deepdoctection/analyzer/dd.py +84 -71
  3. deepdoctection/dataflow/common.py +9 -5
  4. deepdoctection/dataflow/custom.py +5 -5
  5. deepdoctection/dataflow/custom_serialize.py +75 -18
  6. deepdoctection/dataflow/parallel_map.py +3 -3
  7. deepdoctection/dataflow/serialize.py +4 -4
  8. deepdoctection/dataflow/stats.py +3 -3
  9. deepdoctection/datapoint/annotation.py +78 -56
  10. deepdoctection/datapoint/box.py +7 -7
  11. deepdoctection/datapoint/convert.py +6 -6
  12. deepdoctection/datapoint/image.py +157 -75
  13. deepdoctection/datapoint/view.py +175 -151
  14. deepdoctection/datasets/adapter.py +30 -24
  15. deepdoctection/datasets/base.py +10 -10
  16. deepdoctection/datasets/dataflow_builder.py +3 -3
  17. deepdoctection/datasets/info.py +23 -25
  18. deepdoctection/datasets/instances/doclaynet.py +48 -49
  19. deepdoctection/datasets/instances/fintabnet.py +44 -45
  20. deepdoctection/datasets/instances/funsd.py +23 -23
  21. deepdoctection/datasets/instances/iiitar13k.py +8 -8
  22. deepdoctection/datasets/instances/layouttest.py +2 -2
  23. deepdoctection/datasets/instances/publaynet.py +3 -3
  24. deepdoctection/datasets/instances/pubtables1m.py +18 -18
  25. deepdoctection/datasets/instances/pubtabnet.py +30 -29
  26. deepdoctection/datasets/instances/rvlcdip.py +28 -29
  27. deepdoctection/datasets/instances/xfund.py +51 -30
  28. deepdoctection/datasets/save.py +6 -6
  29. deepdoctection/eval/accmetric.py +32 -33
  30. deepdoctection/eval/base.py +8 -9
  31. deepdoctection/eval/cocometric.py +13 -12
  32. deepdoctection/eval/eval.py +32 -26
  33. deepdoctection/eval/tedsmetric.py +16 -12
  34. deepdoctection/eval/tp_eval_callback.py +7 -16
  35. deepdoctection/extern/base.py +339 -134
  36. deepdoctection/extern/d2detect.py +69 -89
  37. deepdoctection/extern/deskew.py +11 -10
  38. deepdoctection/extern/doctrocr.py +81 -64
  39. deepdoctection/extern/fastlang.py +23 -16
  40. deepdoctection/extern/hfdetr.py +53 -38
  41. deepdoctection/extern/hflayoutlm.py +216 -155
  42. deepdoctection/extern/hflm.py +35 -30
  43. deepdoctection/extern/model.py +433 -255
  44. deepdoctection/extern/pdftext.py +15 -15
  45. deepdoctection/extern/pt/ptutils.py +4 -2
  46. deepdoctection/extern/tessocr.py +39 -38
  47. deepdoctection/extern/texocr.py +14 -16
  48. deepdoctection/extern/tp/tfutils.py +16 -2
  49. deepdoctection/extern/tp/tpcompat.py +11 -7
  50. deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
  51. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
  52. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
  53. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
  54. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
  55. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
  56. deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
  57. deepdoctection/extern/tpdetect.py +40 -45
  58. deepdoctection/mapper/cats.py +36 -40
  59. deepdoctection/mapper/cocostruct.py +16 -12
  60. deepdoctection/mapper/d2struct.py +22 -22
  61. deepdoctection/mapper/hfstruct.py +7 -7
  62. deepdoctection/mapper/laylmstruct.py +22 -24
  63. deepdoctection/mapper/maputils.py +9 -10
  64. deepdoctection/mapper/match.py +33 -2
  65. deepdoctection/mapper/misc.py +6 -7
  66. deepdoctection/mapper/pascalstruct.py +4 -4
  67. deepdoctection/mapper/prodigystruct.py +6 -6
  68. deepdoctection/mapper/pubstruct.py +84 -92
  69. deepdoctection/mapper/tpstruct.py +3 -3
  70. deepdoctection/mapper/xfundstruct.py +33 -33
  71. deepdoctection/pipe/anngen.py +39 -14
  72. deepdoctection/pipe/base.py +68 -99
  73. deepdoctection/pipe/common.py +181 -85
  74. deepdoctection/pipe/concurrency.py +14 -10
  75. deepdoctection/pipe/doctectionpipe.py +24 -21
  76. deepdoctection/pipe/language.py +20 -25
  77. deepdoctection/pipe/layout.py +18 -16
  78. deepdoctection/pipe/lm.py +49 -47
  79. deepdoctection/pipe/order.py +63 -65
  80. deepdoctection/pipe/refine.py +102 -109
  81. deepdoctection/pipe/segment.py +157 -162
  82. deepdoctection/pipe/sub_layout.py +50 -40
  83. deepdoctection/pipe/text.py +37 -36
  84. deepdoctection/pipe/transform.py +19 -16
  85. deepdoctection/train/d2_frcnn_train.py +27 -25
  86. deepdoctection/train/hf_detr_train.py +22 -18
  87. deepdoctection/train/hf_layoutlm_train.py +49 -48
  88. deepdoctection/train/tp_frcnn_train.py +10 -11
  89. deepdoctection/utils/concurrency.py +1 -1
  90. deepdoctection/utils/context.py +13 -6
  91. deepdoctection/utils/develop.py +4 -4
  92. deepdoctection/utils/env_info.py +52 -14
  93. deepdoctection/utils/file_utils.py +6 -11
  94. deepdoctection/utils/fs.py +41 -14
  95. deepdoctection/utils/identifier.py +2 -2
  96. deepdoctection/utils/logger.py +15 -15
  97. deepdoctection/utils/metacfg.py +7 -7
  98. deepdoctection/utils/pdf_utils.py +39 -14
  99. deepdoctection/utils/settings.py +188 -182
  100. deepdoctection/utils/tqdm.py +1 -1
  101. deepdoctection/utils/transform.py +14 -9
  102. deepdoctection/utils/types.py +104 -0
  103. deepdoctection/utils/utils.py +7 -7
  104. deepdoctection/utils/viz.py +70 -69
  105. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
  106. deepdoctection-0.34.dist-info/RECORD +146 -0
  107. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
  108. deepdoctection/utils/detection_types.py +0 -68
  109. deepdoctection-0.32.dist-info/RECORD +0 -146
  110. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
  111. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
deepdoctection/utils/pdf_utils.py

@@ -25,23 +25,23 @@ import sys
  from errno import ENOENT
  from io import BytesIO
  from shutil import copyfile
- from typing import Generator, List, Optional, Tuple
+ from typing import Generator, Optional

  from numpy import uint8
  from pypdf import PdfReader, PdfWriter, errors

  from .context import save_tmp_file, timeout_manager
- from .detection_types import ImageType, Pathlike
  from .error import DependencyError, FileExtensionError
  from .file_utils import pdf_to_cairo_available, pdf_to_ppm_available, qpdf_available
  from .logger import LoggingRecord, logger
+ from .types import PathLikeOrStr, PixelValues
  from .utils import is_file_extension
  from .viz import viz_handler

  __all__ = ["decrypt_pdf_document", "get_pdf_file_reader", "get_pdf_file_writer", "PDFStreamer", "pdf_to_np_array"]


- def decrypt_pdf_document(path: Pathlike) -> bool:
+ def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
  """
  Decrypting a pdf. As copying a pdf document removes the password that protects pdf, this method
  generates a copy and decrypts the copy using qpdf. The result is saved as the original

@@ -73,7 +73,7 @@ def decrypt_pdf_document(path: Pathlike) -> bool:
  return False


- def get_pdf_file_reader(path: Pathlike) -> PdfReader:
+ def get_pdf_file_reader(path: PathLikeOrStr) -> PdfReader:
  """
  Creates a file reader object from a pdf document. Will try to decrypt the document if it is
  encrypted. (See `decrypt_pdf_document` to understand what is meant with "decrypt").

@@ -107,8 +107,7 @@ def get_pdf_file_reader(path: Pathlike) -> PdfReader:
  )
  sys.exit()

- file_reader = PdfReader(open(path, "rb")) # pylint: disable=R1732
- return file_reader
+ return PdfReader(os.fspath(path))


  def get_pdf_file_writer() -> PdfWriter:

@@ -125,15 +124,27 @@ class PDFStreamer:

  **Example:**

- df = dataflow.DataFromIterable.PDFStreamer(path=path)
+ # Building a Dataflow with a PDFStreamer
+ df = dataflow.DataFromIterable(PDFStreamer(path=path))
  df.reset_state()

  for page in df:
  ... # do whatever you like

+ # Something else you can do:
+ streamer = PDFStreamer(path=path)
+ pages = len(streamer) # get the number of pages
+ random_int = random.sample(range(0, pages), 2) # select some pages
+ for ran in random_int:
+ pdf_bytes = streamer[ran] # get the page bytes directly
+
+ streamer.close() # Do not forget to close the streamer, otherwise the file will never be closed and might
+ # cause memory leaks if you open many files.
+
+
  """

- def __init__(self, path: Pathlike) -> None:
+ def __init__(self, path: PathLikeOrStr) -> None:
  """
  :param path: to a pdf.
  """

@@ -143,13 +154,27 @@ class PDFStreamer:
  def __len__(self) -> int:
  return len(self.file_reader.pages)

- def __iter__(self) -> Generator[Tuple[bytes, int], None, None]:
+ def __iter__(self) -> Generator[tuple[bytes, int], None, None]:
  for k in range(len(self)):
  buffer = BytesIO()
  writer = get_pdf_file_writer()
  writer.add_page(self.file_reader.pages[k])
  writer.write(buffer)
  yield buffer.getvalue(), k
+ self.file_reader.close()
+
+ def __getitem__(self, index: int) -> bytes:
+ buffer = BytesIO()
+ writer = get_pdf_file_writer()
+ writer.add_page(self.file_reader.pages[index])
+ writer.write(buffer)
+ return buffer.getvalue()
+
+ def close(self) -> None:
+ """
+ Close the file reader
+ """
+ self.file_reader.close()


  # The following functions are modified versions from the Python poppler wrapper

@@ -157,9 +182,9 @@ class PDFStreamer:


  def _input_to_cli_str(
- input_file_name: Pathlike, output_file_name: Pathlike, dpi: int, size: Optional[Tuple[int, int]] = None
- ) -> List[str]:
- cmd_args: List[str] = []
+ input_file_name: PathLikeOrStr, output_file_name: PathLikeOrStr, dpi: int, size: Optional[tuple[int, int]] = None
+ ) -> list[str]:
+ cmd_args: list[str] = []

  if pdf_to_ppm_available():
  command = "pdftoppm"

@@ -196,7 +221,7 @@ class PopplerError(RuntimeError):
  self.args = (status, message)


- def _run_poppler(poppler_args: List[str]) -> None:
+ def _run_poppler(poppler_args: list[str]) -> None:
  try:
  proc = subprocess.Popen(poppler_args) # pylint: disable=R1732
  except OSError as error:

@@ -209,7 +234,7 @@ def _run_poppler(poppler_args: List[str]) -> None:
  raise PopplerError(status=proc.returncode, message="Syntax Error: PDF cannot be read with Poppler")


- def pdf_to_np_array(pdf_bytes: bytes, size: Optional[Tuple[int, int]] = None, dpi: int = 200) -> ImageType:
+ def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
  """
  Convert a single pdf page from its byte representation to a numpy array. This function will save the pdf as to a tmp
  file and then call poppler via `pdftoppm` resp. `pdftocairo` if the former is not available.
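The hunks above replace the removed `deepdoctection.utils.detection_types` aliases (`Pathlike`, `ImageType`) with the new `deepdoctection.utils.types` module (`PathLikeOrStr`, `PixelValues`; see files 102 and 108 in the list) and give `PDFStreamer` random page access plus an explicit `close()`. A minimal usage sketch against the 0.34 API, not part of the diff; it assumes a local `sample.pdf` and a poppler backend (`pdftoppm` or `pdftocairo`) for the rasterization step:

```python
from pathlib import Path

from deepdoctection.utils.pdf_utils import PDFStreamer, pdf_to_np_array
from deepdoctection.utils.types import PixelValues  # 0.32: from deepdoctection.utils.detection_types import ImageType

# PathLikeOrStr: str and os.PathLike paths are both accepted
streamer = PDFStreamer(path=Path("sample.pdf"))

num_pages = len(streamer)
first_page_bytes = streamer[0]  # new in 0.34: random page access via __getitem__
image: PixelValues = pdf_to_np_array(first_page_bytes, dpi=200)  # numpy array of the rendered page

streamer.close()  # new in 0.34: explicitly closes the underlying PdfReader
```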
deepdoctection/utils/settings.py

@@ -18,11 +18,12 @@
  """
  Module for funcs and constants that maintain general settings
  """
+ from __future__ import annotations

  import os
  from enum import Enum
  from pathlib import Path
- from typing import Dict, List, Optional, Tuple, Union
+ from typing import Optional, Union

  import catalogue # type: ignore

@@ -34,7 +35,7 @@ class ObjectTypes(str, Enum):
  return f"<{self.__class__.__name__}.{self.name}>"

  @classmethod
- def from_value(cls, value: str) -> "ObjectTypes":
+ def from_value(cls, value: str) -> ObjectTypes:
  """Getting the enum member from a given string value

  :param value: string value to get the enum member

@@ -56,263 +57,268 @@ object_types_registry = catalogue.create("deepdoctection", "settings", entry_poi
  class DefaultType(ObjectTypes):
  """Type for default member"""

- default_type = "default_type"
+ DEFAULT_TYPE = "default_type"


  @object_types_registry.register("PageType")
  class PageType(ObjectTypes):
  """Type for document page properties"""

- document_type = "document_type"
- language = "language"
- angle = "angle"
+ DOCUMENT_TYPE = "document_type"
+ LANGUAGE = "language"
+ ANGLE = "angle"


  @object_types_registry.register("SummaryType")
  class SummaryType(ObjectTypes):
  """Summary type member"""

- summary = "summary"
+ SUMMARY = "summary"


  @object_types_registry.register("DocumentType")
  class DocumentType(ObjectTypes):
  """Document types"""

- letter = "letter"
- form = "form"
- email = "email"
- handwritten = "handwritten"
- advertisement = "advertisement"
- scientific_report = "scientific_report"
- scientific_publication = "scientific_publication"
- specification = "specification"
- file_folder = "file_folder"
- news_article = "news_article"
- budget = "budget"
- invoice = "invoice"
- presentation = "presentation"
- questionnaire = "questionnaire"
- resume = "resume"
- memo = "memo"
- financial_report = "financial_report"
- laws_and_regulations = "laws_and_regulations"
- government_tenders = "government_tenders"
- manuals = "manuals"
- patents = "patents"
+ LETTER = "letter"
+ FORM = "form"
+ EMAIL = "email"
+ HANDWRITTEN = "handwritten"
+ ADVERTISEMENT = "advertisement"
+ SCIENTIFIC_REPORT = "scientific_report"
+ SCIENTIFIC_PUBLICATION = "scientific_publication"
+ SPECIFICATION = "specification"
+ FILE_FOLDER = "file_folder"
+ NEWS_ARTICLE = "news_article"
+ BUDGET = "budget"
+ INVOICE = "invoice"
+ PRESENTATION = "presentation"
+ QUESTIONNAIRE = "questionnaire"
+ RESUME = "resume"
+ MEMO = "memo"
+ FINANCIAL_REPORT = "financial_report"
+ LAWS_AND_REGULATIONS = "laws_and_regulations"
+ GOVERNMENT_TENDERS = "government_tenders"
+ MANUALS = "manuals"
+ PATENTS = "patents"
+ MARK = "mark"


  @object_types_registry.register("LayoutType")
  class LayoutType(ObjectTypes):
  """Layout types"""

- table = "table"
- table_rotated = "table_rotated"
- figure = "figure"
- list = "list"
- text = "text"
- title = "title" # type: ignore
- logo = "logo"
- signature = "signature"
- caption = "caption"
- footnote = "footnote"
- formula = "formula"
- page_footer = "page_footer"
- page_header = "page_header"
- section_header = "section_header"
- page = "page"
- cell = "cell"
- row = "row"
- column = "column"
- word = "word"
- line = "line"
- background = "background"
+ TABLE = "table"
+ TABLE_ROTATED = "table_rotated"
+ FIGURE = "figure"
+ LIST = "list"
+ TEXT = "text"
+ TITLE = "title"
+ LOGO = "logo"
+ SIGNATURE = "signature"
+ CAPTION = "caption"
+ FOOTNOTE = "footnote"
+ FORMULA = "formula"
+ PAGE_FOOTER = "page_footer"
+ PAGE_HEADER = "page_header"
+ SECTION_HEADER = "section_header"
+ PAGE = "page"
+ CELL = "cell"
+ ROW = "row"
+ COLUMN = "column"
+ WORD = "word"
+ LINE = "line"
+ BACKGROUND = "background"
+ PAGE_NUMBER = "page_number"
+ KEY_VALUE_AREA = "key_value_area"
+ LIST_ITEM = "list_item"


  @object_types_registry.register("TableType")
  class TableType(ObjectTypes):
  """Types for table properties"""

- item = "item"
- number_of_rows = "number_of_rows"
- number_of_columns = "number_of_columns"
- max_row_span = "max_row_span"
- max_col_span = "max_col_span"
- html = "html"
+ ITEM = "item"
+ NUMBER_OF_ROWS = "number_of_rows"
+ NUMBER_OF_COLUMNS = "number_of_columns"
+ MAX_ROW_SPAN = "max_row_span"
+ MAX_COL_SPAN = "max_col_span"
+ HTML = "html"


  @object_types_registry.register("CellType")
  class CellType(ObjectTypes):
  """Types for cell properties"""

- header = "header"
- body = "body"
- row_number = "row_number"
- row_span = "row_span"
- row_header = "row_header"
- projected_row_header = "projected_row_header"
- column_number = "column_number"
- column_span = "column_span"
- column_header = "column_header"
- spanning = "spanning"
+ HEADER = "header"
+ BODY = "body"
+ ROW_NUMBER = "row_number"
+ ROW_SPAN = "row_span"
+ ROW_HEADER = "row_header"
+ PROJECTED_ROW_HEADER = "projected_row_header"
+ COLUMN_NUMBER = "column_number"
+ COLUMN_SPAN = "column_span"
+ COLUMN_HEADER = "column_header"
+ SPANNING = "spanning"


  @object_types_registry.register("WordType")
  class WordType(ObjectTypes):
  """Types for word properties"""

- characters = "characters"
- block = "block"
- token_class = "token_class"
- tag = "tag"
- token_tag = "token_tag"
- text_line = "text_line"
- character_type = "character_type"
- printed = "printed"
- handwritten = "handwritten"
+ CHARACTERS = "characters"
+ BLOCK = "block"
+ TOKEN_CLASS = "token_class"
+ TAG = "tag"
+ TOKEN_TAG = "token_tag"
+ TEXT_LINE = "text_line"
+ CHARACTER_TYPE = "character_type"
+ PRINTED = "printed"
+ HANDWRITTEN = "handwritten"


  @object_types_registry.register("TokenClasses")
  class TokenClasses(ObjectTypes):
  """Types for token classes"""

- header = "header"
- question = "question"
- answer = "answer"
- other = "other"
+ HEADER = "header"
+ QUESTION = "question"
+ ANSWER = "answer"
+ OTHER = "other"


  @object_types_registry.register("BioTag")
  class BioTag(ObjectTypes):
  """Types for tags"""

- begin = "B"
- inside = "I"
- outside = "O"
- single = "S"
- end = "E"
+ BEGIN = "B"
+ INSIDE = "I"
+ OUTSIDE = "O"
+ SINGLE = "S"
+ END = "E"


  @object_types_registry.register("TokenClassWithTag")
  class TokenClassWithTag(ObjectTypes):
  """Types for token classes with tags, e.g. B-answer"""

- b_answer = "B-answer"
- b_header = "B-header"
- b_question = "B-question"
- e_answer = "E-answer"
- e_header = "E-header"
- e_question = "E-question"
- i_answer = "I-answer"
- i_header = "I-header"
- i_question = "I-question"
- s_answer = "S-answer"
- s_header = "S-header"
- s_question = "S-question"
+ B_ANSWER = "B-answer"
+ B_HEADER = "B-header"
+ B_QUESTION = "B-question"
+ E_ANSWER = "E-answer"
+ E_HEADER = "E-header"
+ E_QUESTION = "E-question"
+ I_ANSWER = "I-answer"
+ I_HEADER = "I-header"
+ I_QUESTION = "I-question"
+ S_ANSWER = "S-answer"
+ S_HEADER = "S-header"
+ S_QUESTION = "S-question"


  @object_types_registry.register("Relationships")
  class Relationships(ObjectTypes):
  """Types for describing relationships between types"""

- child = "child"
- reading_order = "reading_order"
- semantic_entity_link = "semantic_entity_link"
+ CHILD = "child"
+ READING_ORDER = "reading_order"
+ SEMANTIC_ENTITY_LINK = "semantic_entity_link"
+ LAYOUT_LINK = "layout_link"


  @object_types_registry.register("Languages")
  class Languages(ObjectTypes):
  """Language types"""

- english = "eng"
- russian = "rus"
- german = "deu"
- french = "fre"
- italian = "ita"
- japanese = "jpn"
- spanish = "spa"
- cebuano = "ceb"
- turkish = "tur"
- portuguese = "por"
- ukrainian = "ukr"
- esperanto = "epo"
- polish = "pol"
- swedish = "swe"
- dutch = "dut"
- hebrew = "heb"
- chinese = "chi"
- hungarian = "hun"
- arabic = "ara"
- catalan = "cat"
- finnish = "fin"
- czech = "cze"
- persian = "per"
- serbian = "srp"
- greek = "gre"
- vietnamese = "vie"
- bulgarian = "bul"
- korean = "kor"
- norwegian = "nor"
- macedonian = "mac"
- romanian = "rum"
- indonesian = "ind"
- thai = "tha"
- armenian = "arm"
- danish = "dan"
- tamil = "tam"
- hindi = "hin"
- croatian = "hrv"
- belarusian = "bel"
- georgian = "geo"
- telugu = "tel"
- kazakh = "kaz"
- waray = "war"
- lithuanian = "lit"
- scottish = "glg"
- slovak = "slo"
- benin = "ben"
- basque = "baq"
- slovenian = "slv"
- malayalam = "mal"
- marathi = "mar"
- estonian = "est"
- azerbaijani = "aze"
- albanian = "alb"
- latin = "lat"
- bosnian = "bos"
- norwegian_nynorsk = "nno"
- urdu = "urd"
- not_defined = "nn"
+ ENGLISH = "eng"
+ RUSSIAN = "rus"
+ GERMAN = "deu"
+ FRENCH = "fre"
+ ITALIAN = "ita"
+ JAPANESE = "jpn"
+ SPANISH = "spa"
+ CEBUANO = "ceb"
+ TURKISH = "tur"
+ PORTUGUESE = "por"
+ UKRAINIAN = "ukr"
+ ESPERANTO = "epo"
+ POLISH = "pol"
+ SWEDISH = "swe"
+ DUTCH = "dut"
+ HEBREW = "heb"
+ CHINESE = "chi"
+ HUNGARIAN = "hun"
+ ARABIC = "ara"
+ CATALAN = "cat"
+ FINNISH = "fin"
+ CZECH = "cze"
+ PERSIAN = "per"
+ SERBIAN = "srp"
+ GREEK = "gre"
+ VIETNAMESE = "vie"
+ BULGARIAN = "bul"
+ KOREAN = "kor"
+ NORWEGIAN = "nor"
+ MACEDONIAN = "mac"
+ ROMANIAN = "rum"
+ INDONESIAN = "ind"
+ THAI = "tha"
+ ARMENIAN = "arm"
+ DANISH = "dan"
+ TAMIL = "tam"
+ HINDI = "hin"
+ CROATIAN = "hrv"
+ BELARUSIAN = "bel"
+ GEORGIAN = "geo"
+ TELUGU = "tel"
+ KAZAKH = "kaz"
+ WARAY = "war"
+ LITHUANIAN = "lit"
+ SCOTTISH = "glg"
+ SLOVAK = "slo"
+ BENIN = "ben"
+ BASQUE = "baq"
+ SLOVENIAN = "slv"
+ MALAYALAM = "mal"
+ MARATHI = "mar"
+ ESTONIAN = "est"
+ AZERBAIJANI = "aze"
+ ALBANIAN = "alb"
+ LATIN = "lat"
+ BOSNIAN = "bos"
+ NORWEGIAN_NOVOSIBIRSK = "nno"
+ URDU = "urd"
+ NOT_DEFINED = "nn"


  @object_types_registry.register("DatasetType")
  class DatasetType(ObjectTypes):
  """Dataset types"""

- object_detection = "object_detection"
- sequence_classification = "sequence_classification"
- token_classification = "token_classification"
- publaynet = "publaynet"
- default = "default"
+ OBJECT_DETECTION = "object_detection"
+ SEQUENCE_CLASSIFICATION = "sequence_classification"
+ TOKEN_CLASSIFICATION = "token_classification"
+ PUBLAYNET = "publaynet"
+ DEFAULT = "default"


  _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG = {
- (TokenClasses.header, BioTag.begin): TokenClassWithTag.b_header,
- (TokenClasses.header, BioTag.inside): TokenClassWithTag.i_header,
- (TokenClasses.header, BioTag.end): TokenClassWithTag.e_header,
- (TokenClasses.header, BioTag.single): TokenClassWithTag.s_header,
- (TokenClasses.answer, BioTag.begin): TokenClassWithTag.b_answer,
- (TokenClasses.answer, BioTag.inside): TokenClassWithTag.i_answer,
- (TokenClasses.answer, BioTag.end): TokenClassWithTag.e_answer,
- (TokenClasses.answer, BioTag.single): TokenClassWithTag.s_answer,
- (TokenClasses.question, BioTag.begin): TokenClassWithTag.b_question,
- (TokenClasses.question, BioTag.inside): TokenClassWithTag.i_question,
- (TokenClasses.question, BioTag.end): TokenClassWithTag.e_question,
- (TokenClasses.question, BioTag.single): TokenClassWithTag.s_question,
- (TokenClasses.other, BioTag.outside): BioTag.outside,
- (TokenClasses.header, BioTag.outside): BioTag.outside,
- (TokenClasses.answer, BioTag.outside): BioTag.outside,
- (TokenClasses.question, BioTag.outside): BioTag.outside,
+ (TokenClasses.HEADER, BioTag.BEGIN): TokenClassWithTag.B_HEADER,
+ (TokenClasses.HEADER, BioTag.INSIDE): TokenClassWithTag.I_HEADER,
+ (TokenClasses.HEADER, BioTag.END): TokenClassWithTag.E_HEADER,
+ (TokenClasses.HEADER, BioTag.SINGLE): TokenClassWithTag.S_HEADER,
+ (TokenClasses.ANSWER, BioTag.BEGIN): TokenClassWithTag.B_ANSWER,
+ (TokenClasses.ANSWER, BioTag.INSIDE): TokenClassWithTag.I_ANSWER,
+ (TokenClasses.ANSWER, BioTag.END): TokenClassWithTag.E_ANSWER,
+ (TokenClasses.ANSWER, BioTag.SINGLE): TokenClassWithTag.S_ANSWER,
+ (TokenClasses.QUESTION, BioTag.BEGIN): TokenClassWithTag.B_QUESTION,
+ (TokenClasses.QUESTION, BioTag.INSIDE): TokenClassWithTag.I_QUESTION,
+ (TokenClasses.QUESTION, BioTag.END): TokenClassWithTag.E_QUESTION,
+ (TokenClasses.QUESTION, BioTag.SINGLE): TokenClassWithTag.S_QUESTION,
+ (TokenClasses.OTHER, BioTag.OUTSIDE): BioTag.OUTSIDE,
+ (TokenClasses.HEADER, BioTag.OUTSIDE): BioTag.OUTSIDE,
+ (TokenClasses.ANSWER, BioTag.OUTSIDE): BioTag.OUTSIDE,
+ (TokenClasses.QUESTION, BioTag.OUTSIDE): BioTag.OUTSIDE,
  }
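A small sketch (not part of the diff) of how the lookup behaves after the member renames, assuming `token_class_tag_to_token_class_with_tag` (whose signature appears in the next hunk header) still resolves through the `_TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG` mapping shown above:

```python
from deepdoctection.utils.settings import (
    BioTag,
    TokenClasses,
    TokenClassWithTag,
    token_class_tag_to_token_class_with_tag,
)

# uppercase members replace the former lowercase ones; the string values are unchanged
assert token_class_tag_to_token_class_with_tag(TokenClasses.ANSWER, BioTag.BEGIN) is TokenClassWithTag.B_ANSWER

# "other" tokens (and any class tagged as outside) collapse to the plain O tag, per the mapping above
assert token_class_tag_to_token_class_with_tag(TokenClasses.OTHER, BioTag.OUTSIDE) is BioTag.OUTSIDE
```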

@@ -334,7 +340,7 @@ def token_class_tag_to_token_class_with_tag(token: ObjectTypes, tag: ObjectTypes

  def token_class_with_tag_to_token_class_and_tag(
  token_class_with_tag: ObjectTypes,
- ) -> Optional[Tuple[ObjectTypes, ObjectTypes]]:
+ ) -> Optional[tuple[ObjectTypes, ObjectTypes]]:
  """
  This is the reverse mapping from TokenClassWithTag members to TokenClasses and BioTag


@@ -358,7 +364,7 @@ def update_all_types_dict() -> None:
  _ALL_TYPES_DICT.update({e.value: e for e in obj})


- _OLD_TO_NEW_OBJ_TYPE: Dict[str, str] = {
+ _OLD_TO_NEW_OBJ_TYPE: dict[str, str] = {
  "DOC_CLASS": "document_type",
  "CHARS": "characters",
  "BIO_TAG": "tag",

@@ -381,10 +387,10 @@ def _get_new_obj_type_str(obj_type: str) -> str:
  return _OLD_TO_NEW_OBJ_TYPE.get(obj_type, obj_type)


- _BLACK_LIST: List[str] = ["B", "I", "O", "E", "S"]
+ _BLACK_LIST: list[str] = ["B", "I", "O", "E", "S"]


- def _get_black_list() -> List[str]:
+ def _get_black_list() -> list[str]:
  return _BLACK_LIST
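The bulk of this file is the rename of every `ObjectTypes` member from lowercase to UPPER_CASE while the underlying string values stay unchanged. A hedged migration sketch, assuming `from_value` keeps resolving members by those unchanged string values, as its docstring states:

```python
from deepdoctection.utils.settings import DocumentType, LayoutType

# 0.32: LayoutType.table, DocumentType.invoice  ->  0.34: LayoutType.TABLE, DocumentType.INVOICE
assert LayoutType.TABLE.value == "table"
assert DocumentType.INVOICE.value == "invoice"

# value-based lookups are therefore unaffected by the rename
assert LayoutType.from_value("table") is LayoutType.TABLE
assert DocumentType.from_value("invoice") is DocumentType.INVOICE
```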

deepdoctection/utils/tqdm.py

@@ -23,7 +23,7 @@ from typing import Dict, Optional, Union

  from tqdm import tqdm

- from .detection_types import TqdmType
+ from .types import TqdmType

  __all__ = ["get_tqdm", "get_tqdm_default_kwargs"]