deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (131) hide show
  1. deepdoctection/__init__.py +16 -29
  2. deepdoctection/analyzer/dd.py +70 -59
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/common.py +9 -5
  5. deepdoctection/dataflow/custom.py +5 -5
  6. deepdoctection/dataflow/custom_serialize.py +75 -18
  7. deepdoctection/dataflow/parallel_map.py +3 -3
  8. deepdoctection/dataflow/serialize.py +4 -4
  9. deepdoctection/dataflow/stats.py +3 -3
  10. deepdoctection/datapoint/annotation.py +41 -56
  11. deepdoctection/datapoint/box.py +9 -8
  12. deepdoctection/datapoint/convert.py +6 -6
  13. deepdoctection/datapoint/image.py +56 -44
  14. deepdoctection/datapoint/view.py +245 -150
  15. deepdoctection/datasets/__init__.py +1 -4
  16. deepdoctection/datasets/adapter.py +35 -26
  17. deepdoctection/datasets/base.py +14 -12
  18. deepdoctection/datasets/dataflow_builder.py +3 -3
  19. deepdoctection/datasets/info.py +24 -26
  20. deepdoctection/datasets/instances/doclaynet.py +51 -51
  21. deepdoctection/datasets/instances/fintabnet.py +46 -46
  22. deepdoctection/datasets/instances/funsd.py +25 -24
  23. deepdoctection/datasets/instances/iiitar13k.py +13 -10
  24. deepdoctection/datasets/instances/layouttest.py +4 -3
  25. deepdoctection/datasets/instances/publaynet.py +5 -5
  26. deepdoctection/datasets/instances/pubtables1m.py +24 -21
  27. deepdoctection/datasets/instances/pubtabnet.py +32 -30
  28. deepdoctection/datasets/instances/rvlcdip.py +30 -30
  29. deepdoctection/datasets/instances/xfund.py +26 -26
  30. deepdoctection/datasets/save.py +6 -6
  31. deepdoctection/eval/__init__.py +1 -4
  32. deepdoctection/eval/accmetric.py +32 -33
  33. deepdoctection/eval/base.py +8 -9
  34. deepdoctection/eval/cocometric.py +15 -13
  35. deepdoctection/eval/eval.py +41 -37
  36. deepdoctection/eval/tedsmetric.py +30 -23
  37. deepdoctection/eval/tp_eval_callback.py +16 -19
  38. deepdoctection/extern/__init__.py +2 -7
  39. deepdoctection/extern/base.py +339 -134
  40. deepdoctection/extern/d2detect.py +85 -113
  41. deepdoctection/extern/deskew.py +14 -11
  42. deepdoctection/extern/doctrocr.py +141 -130
  43. deepdoctection/extern/fastlang.py +27 -18
  44. deepdoctection/extern/hfdetr.py +71 -62
  45. deepdoctection/extern/hflayoutlm.py +504 -211
  46. deepdoctection/extern/hflm.py +230 -0
  47. deepdoctection/extern/model.py +488 -302
  48. deepdoctection/extern/pdftext.py +23 -19
  49. deepdoctection/extern/pt/__init__.py +1 -3
  50. deepdoctection/extern/pt/nms.py +6 -2
  51. deepdoctection/extern/pt/ptutils.py +29 -19
  52. deepdoctection/extern/tessocr.py +39 -38
  53. deepdoctection/extern/texocr.py +18 -18
  54. deepdoctection/extern/tp/tfutils.py +57 -9
  55. deepdoctection/extern/tp/tpcompat.py +21 -14
  56. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  58. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
  60. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  61. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
  62. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
  67. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
  68. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  69. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  70. deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
  71. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  72. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  73. deepdoctection/extern/tpdetect.py +45 -53
  74. deepdoctection/mapper/__init__.py +3 -8
  75. deepdoctection/mapper/cats.py +27 -29
  76. deepdoctection/mapper/cocostruct.py +10 -10
  77. deepdoctection/mapper/d2struct.py +27 -26
  78. deepdoctection/mapper/hfstruct.py +13 -8
  79. deepdoctection/mapper/laylmstruct.py +178 -37
  80. deepdoctection/mapper/maputils.py +12 -11
  81. deepdoctection/mapper/match.py +2 -2
  82. deepdoctection/mapper/misc.py +11 -9
  83. deepdoctection/mapper/pascalstruct.py +4 -4
  84. deepdoctection/mapper/prodigystruct.py +5 -5
  85. deepdoctection/mapper/pubstruct.py +84 -92
  86. deepdoctection/mapper/tpstruct.py +5 -5
  87. deepdoctection/mapper/xfundstruct.py +33 -33
  88. deepdoctection/pipe/__init__.py +1 -1
  89. deepdoctection/pipe/anngen.py +12 -14
  90. deepdoctection/pipe/base.py +52 -106
  91. deepdoctection/pipe/common.py +72 -59
  92. deepdoctection/pipe/concurrency.py +16 -11
  93. deepdoctection/pipe/doctectionpipe.py +24 -21
  94. deepdoctection/pipe/language.py +20 -25
  95. deepdoctection/pipe/layout.py +20 -16
  96. deepdoctection/pipe/lm.py +75 -105
  97. deepdoctection/pipe/order.py +194 -89
  98. deepdoctection/pipe/refine.py +111 -124
  99. deepdoctection/pipe/segment.py +156 -161
  100. deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
  101. deepdoctection/pipe/text.py +37 -36
  102. deepdoctection/pipe/transform.py +19 -16
  103. deepdoctection/train/__init__.py +6 -12
  104. deepdoctection/train/d2_frcnn_train.py +48 -41
  105. deepdoctection/train/hf_detr_train.py +41 -30
  106. deepdoctection/train/hf_layoutlm_train.py +153 -135
  107. deepdoctection/train/tp_frcnn_train.py +32 -31
  108. deepdoctection/utils/concurrency.py +1 -1
  109. deepdoctection/utils/context.py +13 -6
  110. deepdoctection/utils/develop.py +4 -4
  111. deepdoctection/utils/env_info.py +87 -125
  112. deepdoctection/utils/file_utils.py +6 -11
  113. deepdoctection/utils/fs.py +22 -18
  114. deepdoctection/utils/identifier.py +2 -2
  115. deepdoctection/utils/logger.py +16 -15
  116. deepdoctection/utils/metacfg.py +7 -7
  117. deepdoctection/utils/mocks.py +93 -0
  118. deepdoctection/utils/pdf_utils.py +11 -11
  119. deepdoctection/utils/settings.py +185 -181
  120. deepdoctection/utils/tqdm.py +1 -1
  121. deepdoctection/utils/transform.py +14 -9
  122. deepdoctection/utils/types.py +104 -0
  123. deepdoctection/utils/utils.py +7 -7
  124. deepdoctection/utils/viz.py +74 -72
  125. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
  126. deepdoctection-0.33.dist-info/RECORD +146 -0
  127. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
  128. deepdoctection/utils/detection_types.py +0 -68
  129. deepdoctection-0.31.dist-info/RECORD +0 -144
  130. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
  131. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
@@ -20,9 +20,8 @@ Module for ModelCatalog and ModelDownloadManager
20
20
  """
21
21
 
22
22
  import os
23
- from copy import copy
24
23
  from dataclasses import asdict, dataclass, field
25
- from typing import Any, Dict, List, Mapping, Optional, Union
24
+ from typing import Any, Mapping, Optional, Union
26
25
 
27
26
  import jsonlines
28
27
  from huggingface_hub import cached_download, hf_hub_url # type: ignore
@@ -32,11 +31,12 @@ from termcolor import colored
32
31
  from ..utils.fs import download, get_configs_dir_path, get_weights_dir_path
33
32
  from ..utils.logger import LoggingRecord, log_once, logger
34
33
  from ..utils.settings import CellType, Languages, LayoutType, ObjectTypes, get_type
34
+ from ..utils.types import PathLikeOrStr
35
35
 
36
36
  __all__ = ["ModelCatalog", "ModelDownloadManager", "print_model_infos", "ModelProfile"]
37
37
 
38
38
 
39
- @dataclass
39
+ @dataclass(frozen=True)
40
40
  class ModelProfile:
41
41
  """
42
42
  Class for model profile. Add for each model one ModelProfile to the ModelCatalog
@@ -45,25 +45,21 @@ class ModelProfile:
45
45
  name: str
46
46
  description: str
47
47
 
48
- size: List[int]
48
+ size: list[int]
49
49
  tp_model: bool = field(default=False)
50
50
  config: Optional[str] = field(default=None)
51
51
  preprocessor_config: Optional[str] = field(default=None)
52
52
  hf_repo_id: Optional[str] = field(default=None)
53
53
  hf_model_name: Optional[str] = field(default=None)
54
- hf_config_file: Optional[List[str]] = field(default=None)
55
- urls: Optional[List[str]] = field(default=None)
56
- categories: Optional[Dict[str, ObjectTypes]] = field(default=None)
54
+ hf_config_file: Optional[list[str]] = field(default=None)
55
+ urls: Optional[list[str]] = field(default=None)
56
+ categories: Optional[Mapping[int, ObjectTypes]] = field(default=None)
57
+ categories_orig: Optional[Mapping[str, ObjectTypes]] = field(default=None)
57
58
  dl_library: Optional[str] = field(default=None)
58
59
  model_wrapper: Optional[str] = field(default=None)
59
60
  architecture: Optional[str] = field(default=None)
60
61
 
61
- def __post_init__(self) -> None:
62
- """updating categories to ObjectTypes. This might be necessary if we load a catalog from a file"""
63
- if self.categories:
64
- self.categories = {key: get_type(val) for key, val in self.categories.items()}
65
-
66
- def as_dict(self) -> Dict[str, Any]:
62
+ def as_dict(self) -> dict[str, Any]:
67
63
  """
68
64
  returns a dict of the dataclass
69
65
  """
@@ -94,7 +90,7 @@ class ModelCatalog:
94
90
  ModelCatalog.get_full_path_configs("my_new_model")
95
91
  """
96
92
 
97
- CATALOG: Dict[str, ModelProfile] = {
93
+ CATALOG: dict[str, ModelProfile] = {
98
94
  "layout/model-800000_inf_only.data-00000-of-00001": ModelProfile(
99
95
  name="layout/model-800000_inf_only.data-00000-of-00001",
100
96
  description="Tensorpack layout model for inference purposes trained on Publaynet",
@@ -105,11 +101,11 @@ class ModelCatalog:
105
101
  hf_model_name="model-800000_inf_only",
106
102
  hf_config_file=["conf_frcnn_layout.yaml"],
107
103
  categories={
108
- "1": LayoutType.text,
109
- "2": LayoutType.title,
110
- "3": LayoutType.list,
111
- "4": LayoutType.table,
112
- "5": LayoutType.figure,
104
+ 1: LayoutType.TEXT,
105
+ 2: LayoutType.TITLE,
106
+ 3: LayoutType.LIST,
107
+ 4: LayoutType.TABLE,
108
+ 5: LayoutType.FIGURE,
113
109
  },
114
110
  dl_library="TF",
115
111
  model_wrapper="TPFrcnnDetector",
@@ -123,7 +119,7 @@ class ModelCatalog:
123
119
  hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
124
120
  hf_model_name="model-1800000_inf_only",
125
121
  hf_config_file=["conf_frcnn_cell.yaml"],
126
- categories={"1": LayoutType.cell},
122
+ categories={1: LayoutType.CELL},
127
123
  dl_library="TF",
128
124
  model_wrapper="TPFrcnnDetector",
129
125
  ),
@@ -136,7 +132,7 @@ class ModelCatalog:
136
132
  hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
137
133
  hf_model_name="model-1620000_inf_only",
138
134
  hf_config_file=["conf_frcnn_rows.yaml"],
139
- categories={"1": LayoutType.row, "2": LayoutType.column},
135
+ categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
140
136
  dl_library="TF",
141
137
  model_wrapper="TPFrcnnDetector",
142
138
  ),
@@ -149,7 +145,7 @@ class ModelCatalog:
149
145
  hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc",
150
146
  hf_model_name="model-1620000",
151
147
  hf_config_file=["conf_frcnn_rows.yaml"],
152
- categories={"1": LayoutType.row, "2": LayoutType.column},
148
+ categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
153
149
  dl_library="TF",
154
150
  model_wrapper="TPFrcnnDetector",
155
151
  ),
@@ -164,11 +160,11 @@ class ModelCatalog:
164
160
  hf_config_file=["conf_frcnn_layout.yaml"],
165
161
  dl_library="TF",
166
162
  categories={
167
- "1": LayoutType.text,
168
- "2": LayoutType.title,
169
- "3": LayoutType.list,
170
- "4": LayoutType.table,
171
- "5": LayoutType.figure,
163
+ 1: LayoutType.TEXT,
164
+ 2: LayoutType.TITLE,
165
+ 3: LayoutType.LIST,
166
+ 4: LayoutType.TABLE,
167
+ 5: LayoutType.FIGURE,
172
168
  },
173
169
  model_wrapper="TPFrcnnDetector",
174
170
  ),
@@ -181,29 +177,10 @@ class ModelCatalog:
181
177
  hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c",
182
178
  hf_model_name="model-1800000",
183
179
  hf_config_file=["conf_frcnn_cell.yaml"],
184
- categories={"1": LayoutType.cell},
180
+ categories={1: LayoutType.CELL},
185
181
  dl_library="TF",
186
182
  model_wrapper="TPFrcnnDetector",
187
183
  ),
188
- "layout/d2_model-800000-layout.pkl": ModelProfile(
189
- name="layout/d2_model-800000-layout.pkl",
190
- description="Detectron2 layout detection model trained on Publaynet",
191
- config="dd/d2/layout/CASCADE_RCNN_R_50_FPN_GN.yaml",
192
- size=[274568239],
193
- tp_model=False,
194
- hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet_inference_only",
195
- hf_model_name="d2_model-800000-layout.pkl",
196
- hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
197
- categories={
198
- "1": LayoutType.text,
199
- "2": LayoutType.title,
200
- "3": LayoutType.list,
201
- "4": LayoutType.table,
202
- "5": LayoutType.figure,
203
- },
204
- dl_library="PT",
205
- model_wrapper="D2FrcnnDetector",
206
- ),
207
184
  "layout/d2_model_0829999_layout_inf_only.pt": ModelProfile(
208
185
  name="layout/d2_model_0829999_layout_inf_only.pt",
209
186
  description="Detectron2 layout detection model trained on Publaynet",
@@ -214,11 +191,11 @@ class ModelCatalog:
214
191
  hf_model_name="d2_model_0829999_layout_inf_only.pt",
215
192
  hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
216
193
  categories={
217
- "1": LayoutType.text,
218
- "2": LayoutType.title,
219
- "3": LayoutType.list,
220
- "4": LayoutType.table,
221
- "5": LayoutType.figure,
194
+ 1: LayoutType.TEXT,
195
+ 2: LayoutType.TITLE,
196
+ 3: LayoutType.LIST,
197
+ 4: LayoutType.TABLE,
198
+ 5: LayoutType.FIGURE,
222
199
  },
223
200
  dl_library="PT",
224
201
  model_wrapper="D2FrcnnDetector",
@@ -233,11 +210,11 @@ class ModelCatalog:
233
210
  hf_model_name="d2_model_0829999_layout.pth",
234
211
  hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
235
212
  categories={
236
- "1": LayoutType.text,
237
- "2": LayoutType.title,
238
- "3": LayoutType.list,
239
- "4": LayoutType.table,
240
- "5": LayoutType.figure,
213
+ 1: LayoutType.TEXT,
214
+ 2: LayoutType.TITLE,
215
+ 3: LayoutType.LIST,
216
+ 4: LayoutType.TABLE,
217
+ 5: LayoutType.FIGURE,
241
218
  },
242
219
  dl_library="PT",
243
220
  model_wrapper="D2FrcnnDetector",
@@ -252,28 +229,15 @@ class ModelCatalog:
252
229
  hf_model_name="d2_model_0829999_layout_inf_only.ts",
253
230
  hf_config_file=["CASCADE_RCNN_R_50_FPN_GN_TS.yaml"],
254
231
  categories={
255
- "1": LayoutType.text,
256
- "2": LayoutType.title,
257
- "3": LayoutType.list,
258
- "4": LayoutType.table,
259
- "5": LayoutType.figure,
232
+ 1: LayoutType.TEXT,
233
+ 2: LayoutType.TITLE,
234
+ 3: LayoutType.LIST,
235
+ 4: LayoutType.TABLE,
236
+ 5: LayoutType.FIGURE,
260
237
  },
261
238
  dl_library="PT",
262
239
  model_wrapper="D2FrcnnTracingDetector",
263
240
  ),
264
- "cell/d2_model-1800000-cell.pkl": ModelProfile(
265
- name="cell/d2_model-1800000-cell.pkl",
266
- description="Detectron2 cell detection inference only model trained on Pubtabnet",
267
- config="dd/d2/cell/CASCADE_RCNN_R_50_FPN_GN.yaml",
268
- size=[274519039],
269
- tp_model=False,
270
- hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
271
- hf_model_name="d2_model-1800000-cell.pkl",
272
- hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
273
- categories={"1": LayoutType.cell},
274
- dl_library="PT",
275
- model_wrapper="D2FrcnnDetector",
276
- ),
277
241
  "cell/d2_model_1849999_cell_inf_only.pt": ModelProfile(
278
242
  name="cell/d2_model_1849999_cell_inf_only.pt",
279
243
  description="Detectron2 cell detection inference only model trained on Pubtabnet",
@@ -283,7 +247,7 @@ class ModelCatalog:
283
247
  hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
284
248
  hf_model_name="d2_model_1849999_cell_inf_only.pt",
285
249
  hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
286
- categories={"1": LayoutType.cell},
250
+ categories={1: LayoutType.CELL},
287
251
  dl_library="PT",
288
252
  model_wrapper="D2FrcnnDetector",
289
253
  ),
@@ -296,7 +260,7 @@ class ModelCatalog:
296
260
  hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
297
261
  hf_model_name="d2_model_1849999_cell_inf_only.ts",
298
262
  hf_config_file=["CASCADE_RCNN_R_50_FPN_GN_TS.yaml"],
299
- categories={"1": LayoutType.cell},
263
+ categories={1: LayoutType.CELL},
300
264
  dl_library="PT",
301
265
  model_wrapper="D2FrcnnTracingDetector",
302
266
  ),
@@ -309,20 +273,7 @@ class ModelCatalog:
309
273
  hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
310
274
  hf_model_name="cell/d2_model_1849999_cell.pth",
311
275
  hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
312
- categories={"1": LayoutType.cell},
313
- dl_library="PT",
314
- model_wrapper="D2FrcnnDetector",
315
- ),
316
- "item/d2_model-1620000-item.pkl": ModelProfile(
317
- name="item/d2_model-1620000-item.pkl",
318
- description="Detectron2 item detection inference only model trained on Pubtabnet",
319
- config="dd/d2/item/CASCADE_RCNN_R_50_FPN_GN.yaml",
320
- size=[274531339],
321
- tp_model=False,
322
- hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
323
- hf_model_name="d2_model-1620000-item.pkl",
324
- hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
325
- categories={"1": LayoutType.row, "2": LayoutType.column},
276
+ categories={1: LayoutType.CELL},
326
277
  dl_library="PT",
327
278
  model_wrapper="D2FrcnnDetector",
328
279
  ),
@@ -335,7 +286,7 @@ class ModelCatalog:
335
286
  hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
336
287
  hf_model_name="d2_model_1639999_item.pth",
337
288
  hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
338
- categories={"1": LayoutType.row, "2": LayoutType.column},
289
+ categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
339
290
  dl_library="PT",
340
291
  model_wrapper="D2FrcnnDetector",
341
292
  ),
@@ -348,7 +299,7 @@ class ModelCatalog:
348
299
  hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
349
300
  hf_model_name="d2_model_1639999_item_inf_only.pt",
350
301
  hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
351
- categories={"1": LayoutType.row, "2": LayoutType.column},
302
+ categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
352
303
  dl_library="PT",
353
304
  model_wrapper="D2FrcnnDetector",
354
305
  ),
@@ -361,10 +312,49 @@ class ModelCatalog:
361
312
  hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
362
313
  hf_model_name="d2_model_1639999_item_inf_only.ts",
363
314
  hf_config_file=["CASCADE_RCNN_R_50_FPN_GN_TS.yaml"],
364
- categories={"1": LayoutType.row, "2": LayoutType.column},
315
+ categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
365
316
  dl_library="PT",
366
317
  model_wrapper="D2FrcnnTracingDetector",
367
318
  ),
319
+ "nielsr/lilt-xlm-roberta-base/pytorch_model.bin": ModelProfile(
320
+ name="nielsr/lilt-xlm-roberta-base/pytorch_model.bin",
321
+ description="LiLT build with a RobertaXLM base model",
322
+ config="nielsr/lilt-xlm-roberta-base/config.json",
323
+ size=[1136743583],
324
+ tp_model=False,
325
+ hf_repo_id="nielsr/lilt-xlm-roberta-base",
326
+ hf_model_name="pytorch_model.bin",
327
+ hf_config_file=["config.json"],
328
+ dl_library="PT",
329
+ ),
330
+ "SCUT-DLVCLab/lilt-infoxlm-base/pytorch_model.bin": ModelProfile(
331
+ name="SCUT-DLVCLab/lilt-infoxlm-base/pytorch_model.bin",
332
+ description="Language-Independent Layout Transformer - InfoXLM model by stitching a pre-trained InfoXLM"
333
+ " and a pre-trained Language-Independent Layout Transformer (LiLT) together. It was introduced"
334
+ " in the paper LiLT: A Simple yet Effective Language-Independent Layout Transformer for"
335
+ " Structured Document Understanding by Wang et al. and first released in this repository.",
336
+ config="SCUT-DLVCLab/lilt-infoxlm-base/config.json",
337
+ size=[1136743583],
338
+ tp_model=False,
339
+ hf_repo_id="SCUT-DLVCLab/lilt-infoxlm-base",
340
+ hf_model_name="pytorch_model.bin",
341
+ hf_config_file=["config.json"],
342
+ dl_library="PT",
343
+ ),
344
+ "SCUT-DLVCLab/lilt-roberta-en-base/pytorch_model.bin": ModelProfile(
345
+ name="SCUT-DLVCLab/lilt-roberta-en-base/pytorch_model.bin",
346
+ description="Language-Independent Layout Transformer - RoBERTa model by stitching a pre-trained RoBERTa"
347
+ " (English) and a pre-trained Language-Independent Layout Transformer (LiLT) together. It was"
348
+ " introduced in the paper LiLT: A Simple yet Effective Language-Independent Layout Transformer"
349
+ " for Structured Document Understanding by Wang et al. and first released in this repository.",
350
+ config="SCUT-DLVCLab/lilt-roberta-en-base/config.json",
351
+ size=[523151519],
352
+ tp_model=False,
353
+ hf_repo_id="SCUT-DLVCLab/lilt-roberta-en-base",
354
+ hf_model_name="pytorch_model.bin",
355
+ hf_config_file=["config.json"],
356
+ dl_library="PT",
357
+ ),
368
358
  "microsoft/layoutlm-base-uncased/pytorch_model.bin": ModelProfile(
369
359
  name="microsoft/layoutlm-base-uncased/pytorch_model.bin",
370
360
  description="LayoutLM is a simple but effective pre-training method of text and layout for document image"
@@ -459,7 +449,7 @@ class ModelCatalog:
459
449
  hf_repo_id="microsoft/table-transformer-detection",
460
450
  hf_model_name="pytorch_model.bin",
461
451
  hf_config_file=["config.json", "preprocessor_config.json"],
462
- categories={"1": LayoutType.table, "2": LayoutType.table_rotated},
452
+ categories={1: LayoutType.TABLE, 2: LayoutType.TABLE_ROTATED},
463
453
  dl_library="PT",
464
454
  model_wrapper="HFDetrDerivedDetector",
465
455
  ),
@@ -477,12 +467,12 @@ class ModelCatalog:
477
467
  hf_model_name="pytorch_model.bin",
478
468
  hf_config_file=["config.json", "preprocessor_config.json"],
479
469
  categories={
480
- "1": LayoutType.table,
481
- "2": LayoutType.column,
482
- "3": LayoutType.row,
483
- "4": CellType.column_header,
484
- "5": CellType.projected_row_header,
485
- "6": CellType.spanning,
470
+ 1: LayoutType.TABLE,
471
+ 2: LayoutType.COLUMN,
472
+ 3: LayoutType.ROW,
473
+ 4: CellType.COLUMN_HEADER,
474
+ 5: CellType.PROJECTED_ROW_HEADER,
475
+ 6: CellType.SPANNING,
486
476
  },
487
477
  dl_library="PT",
488
478
  model_wrapper="HFDetrDerivedDetector",
@@ -494,7 +484,7 @@ class ModelCatalog:
494
484
  "https://mindee.github.io/doctr/using_doctr/using_models.html#. This is the Pytorch artefact.",
495
485
  size=[101971449],
496
486
  urls=["https://doctr-static.mindee.com/models?id=v0.3.1/db_resnet50-ac60cadc.pt&src=0"],
497
- categories={"1": LayoutType.word},
487
+ categories={1: LayoutType.WORD},
498
488
  dl_library="PT",
499
489
  model_wrapper="DoctrTextlineDetector",
500
490
  architecture="db_resnet50",
@@ -506,7 +496,7 @@ class ModelCatalog:
506
496
  "https://mindee.github.io/doctr/using_doctr/using_models.html#. This is the Tensorflow artefact.",
507
497
  size=[94178964],
508
498
  urls=["https://doctr-static.mindee.com/models?id=v0.2.0/db_resnet50-adcafc63.zip&src=0"],
509
- categories={"1": LayoutType.word},
499
+ categories={1: LayoutType.WORD},
510
500
  dl_library="TF",
511
501
  model_wrapper="DoctrTextlineDetector",
512
502
  architecture="db_resnet50",
@@ -535,195 +525,386 @@ class ModelCatalog:
535
525
  model_wrapper="DoctrTextRecognizer",
536
526
  architecture="crnn_vgg16_bn",
537
527
  ),
528
+ "FacebookAI/xlm-roberta-base": ModelProfile(
529
+ name="FacebookAI/xlm-roberta-base/pytorch_model.bin",
530
+ description="XLM-RoBERTa model pre-trained on 2.5TB of filtered CommonCrawl data containing 100 languages."
531
+ " It was introduced in the paper Unsupervised Cross-lingual Representation Learning at Scale"
532
+ " by Conneau et al. and first released in this repository.",
533
+ size=[1115590446],
534
+ tp_model=False,
535
+ config="FacebookAI/xlm-roberta-base/config.json",
536
+ hf_repo_id="FacebookAI/xlm-roberta-base",
537
+ hf_model_name="pytorch_model.bin",
538
+ hf_config_file=["config.json"],
539
+ dl_library="PT",
540
+ ),
538
541
  "fasttext/lid.176.bin": ModelProfile(
539
542
  name="fasttext/lid.176.bin",
540
543
  description="Fasttext language detection model",
541
544
  size=[131266198],
542
545
  urls=["https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"],
543
546
  categories={
544
- "__label__en": Languages.english,
545
- "__label__ru": Languages.russian,
546
- "__label__de": Languages.german,
547
- "__label__fr": Languages.french,
548
- "__label__it": Languages.italian,
549
- "__label__ja": Languages.japanese,
550
- "__label__es": Languages.spanish,
551
- "__label__ceb": Languages.cebuano,
552
- "__label__tr": Languages.turkish,
553
- "__label__pt": Languages.portuguese,
554
- "__label__uk": Languages.ukrainian,
555
- "__label__eo": Languages.esperanto,
556
- "__label__pl": Languages.polish,
557
- "__label__sv": Languages.swedish,
558
- "__label__nl": Languages.dutch,
559
- "__label__he": Languages.hebrew,
560
- "__label__zh": Languages.chinese,
561
- "__label__hu": Languages.hungarian,
562
- "__label__ar": Languages.arabic,
563
- "__label__ca": Languages.catalan,
564
- "__label__fi": Languages.finnish,
565
- "__label__cs": Languages.czech,
566
- "__label__fa": Languages.persian,
567
- "__label__sr": Languages.serbian,
568
- "__label__el": Languages.greek,
569
- "__label__vi": Languages.vietnamese,
570
- "__label__bg": Languages.bulgarian,
571
- "__label__ko": Languages.korean,
572
- "__label__no": Languages.norwegian,
573
- "__label__mk": Languages.macedonian,
574
- "__label__ro": Languages.romanian,
575
- "__label__id": Languages.indonesian,
576
- "__label__th": Languages.thai,
577
- "__label__hy": Languages.armenian,
578
- "__label__da": Languages.danish,
579
- "__label__ta": Languages.tamil,
580
- "__label__hi": Languages.hindi,
581
- "__label__hr": Languages.croatian,
582
- "__label__sh": Languages.not_defined,
583
- "__label__be": Languages.belarusian,
584
- "__label__ka": Languages.georgian,
585
- "__label__te": Languages.telugu,
586
- "__label__kk": Languages.kazakh,
587
- "__label__war": Languages.waray,
588
- "__label__lt": Languages.lithuanian,
589
- "__label__gl": Languages.scottish,
590
- "__label__sk": Languages.slovak,
591
- "__label__bn": Languages.benin,
592
- "__label__eu": Languages.basque,
593
- "__label__sl": Languages.slovenian,
594
- "__label__kn": Languages.not_defined,
595
- "__label__ml": Languages.malayalam,
596
- "__label__mr": Languages.marathi,
597
- "__label__et": Languages.estonian,
598
- "__label__az": Languages.azerbaijani,
599
- "__label__ms": Languages.not_defined,
600
- "__label__sq": Languages.albanian,
601
- "__label__la": Languages.latin,
602
- "__label__bs": Languages.bosnian,
603
- "__label__nn": Languages.norwegian_nynorsk,
604
- "__label__ur": Languages.urdu,
605
- "__label__lv": Languages.not_defined,
606
- "__label__my": Languages.not_defined,
607
- "__label__tt": Languages.not_defined,
608
- "__label__af": Languages.not_defined,
609
- "__label__oc": Languages.not_defined,
610
- "__label__nds": Languages.not_defined,
611
- "__label__ky": Languages.not_defined,
612
- "__label__ast": Languages.not_defined,
613
- "__label__tl": Languages.not_defined,
614
- "__label__is": Languages.not_defined,
615
- "__label__ia": Languages.not_defined,
616
- "__label__si": Languages.not_defined,
617
- "__label__gu": Languages.not_defined,
618
- "__label__km": Languages.not_defined,
619
- "__label__br": Languages.not_defined,
620
- "__label__ba": Languages.not_defined,
621
- "__label__uz": Languages.not_defined,
622
- "__label__bo": Languages.not_defined,
623
- "__label__pa": Languages.not_defined,
624
- "__label__vo": Languages.not_defined,
625
- "__label__als": Languages.not_defined,
626
- "__label__ne": Languages.not_defined,
627
- "__label__cy": Languages.not_defined,
628
- "__label__jbo": Languages.not_defined,
629
- "__label__fy": Languages.not_defined,
630
- "__label__mn": Languages.not_defined,
631
- "__label__lb": Languages.not_defined,
632
- "__label__ce": Languages.not_defined,
633
- "__label__ug": Languages.not_defined,
634
- "__label__tg": Languages.not_defined,
635
- "__label__sco": Languages.not_defined,
636
- "__label__sa": Languages.not_defined,
637
- "__label__cv": Languages.not_defined,
638
- "__label__jv": Languages.not_defined,
639
- "__label__min": Languages.not_defined,
640
- "__label__io": Languages.not_defined,
641
- "__label__or": Languages.not_defined,
642
- "__label__as": Languages.not_defined,
643
- "__label__new": Languages.not_defined,
644
- "__label__ga": Languages.not_defined,
645
- "__label__mg": Languages.not_defined,
646
- "__label__an": Languages.not_defined,
647
- "__label__ckb": Languages.not_defined,
648
- "__label__sw": Languages.not_defined,
649
- "__label__bar": Languages.not_defined,
650
- "__label__lmo": Languages.not_defined,
651
- "__label__yi": Languages.not_defined,
652
- "__label__arz": Languages.not_defined,
653
- "__label__mhr": Languages.not_defined,
654
- "__label__azb": Languages.not_defined,
655
- "__label__sah": Languages.not_defined,
656
- "__label__pnb": Languages.not_defined,
657
- "__label__su": Languages.not_defined,
658
- "__label__bpy": Languages.not_defined,
659
- "__label__pms": Languages.not_defined,
660
- "__label__ilo": Languages.not_defined,
661
- "__label__wuu": Languages.not_defined,
662
- "__label__ku": Languages.not_defined,
663
- "__label__ps": Languages.not_defined,
664
- "__label__ie": Languages.not_defined,
665
- "__label__xmf": Languages.not_defined,
666
- "__label__yue": Languages.not_defined,
667
- "__label__gom": Languages.not_defined,
668
- "__label__li": Languages.not_defined,
669
- "__label__mwl": Languages.not_defined,
670
- "__label__kw": Languages.not_defined,
671
- "__label__sd": Languages.not_defined,
672
- "__label__hsb": Languages.not_defined,
673
- "__label__scn": Languages.not_defined,
674
- "__label__gd": Languages.not_defined,
675
- "__label__pam": Languages.not_defined,
676
- "__label__bh": Languages.not_defined,
677
- "__label__mai": Languages.not_defined,
678
- "__label__vec": Languages.not_defined,
679
- "__label__mt": Languages.not_defined,
680
- "__label__dv": Languages.not_defined,
681
- "__label__wa": Languages.not_defined,
682
- "__label__mzn": Languages.not_defined,
683
- "__label__am": Languages.not_defined,
684
- "__label__qu": Languages.not_defined,
685
- "__label__eml": Languages.not_defined,
686
- "__label__cbk": Languages.not_defined,
687
- "__label__tk": Languages.not_defined,
688
- "__label__rm": Languages.not_defined,
689
- "__label__os": Languages.not_defined,
690
- "__label__vls": Languages.not_defined,
691
- "__label__yo": Languages.not_defined,
692
- "__label__lo": Languages.not_defined,
693
- "__label__lez": Languages.not_defined,
694
- "__label__so": Languages.not_defined,
695
- "__label__myv": Languages.not_defined,
696
- "__label__diq": Languages.not_defined,
697
- "__label__mrj": Languages.not_defined,
698
- "__label__dsb": Languages.not_defined,
699
- "__label__frr": Languages.not_defined,
700
- "__label__ht": Languages.not_defined,
701
- "__label__gn": Languages.not_defined,
702
- "__label__bxr": Languages.not_defined,
703
- "__label__kv": Languages.not_defined,
704
- "__label__sc": Languages.not_defined,
705
- "__label__nah": Languages.not_defined,
706
- "__label__krc": Languages.not_defined,
707
- "__label__bcl": Languages.not_defined,
708
- "__label__nap": Languages.not_defined,
709
- "__label__gv": Languages.not_defined,
710
- "__label__av": Languages.not_defined,
711
- "__label__rue": Languages.not_defined,
712
- "__label__xal": Languages.not_defined,
713
- "__label__pfl": Languages.not_defined,
714
- "__label__dty": Languages.not_defined,
715
- "__label__hif": Languages.not_defined,
716
- "__label__co": Languages.not_defined,
717
- "__label__lrc": Languages.not_defined,
718
- "__label__vep": Languages.not_defined,
719
- "__label__tyv": Languages.not_defined,
547
+ 1: Languages.ENGLISH,
548
+ 2: Languages.RUSSIAN,
549
+ 3: Languages.GERMAN,
550
+ 4: Languages.FRENCH,
551
+ 5: Languages.ITALIAN,
552
+ 6: Languages.JAPANESE,
553
+ 7: Languages.SPANISH,
554
+ 8: Languages.CEBUANO,
555
+ 9: Languages.TURKISH,
556
+ 10: Languages.PORTUGUESE,
557
+ 11: Languages.UKRAINIAN,
558
+ 12: Languages.ESPERANTO,
559
+ 13: Languages.POLISH,
560
+ 14: Languages.SWEDISH,
561
+ 15: Languages.DUTCH,
562
+ 16: Languages.HEBREW,
563
+ 17: Languages.CHINESE,
564
+ 18: Languages.HUNGARIAN,
565
+ 19: Languages.ARABIC,
566
+ 20: Languages.CATALAN,
567
+ 21: Languages.FINNISH,
568
+ 22: Languages.CZECH,
569
+ 23: Languages.PERSIAN,
570
+ 24: Languages.SERBIAN,
571
+ 25: Languages.GREEK,
572
+ 26: Languages.VIETNAMESE,
573
+ 27: Languages.BULGARIAN,
574
+ 28: Languages.KOREAN,
575
+ 29: Languages.NORWEGIAN,
576
+ 30: Languages.MACEDONIAN,
577
+ 31: Languages.ROMANIAN,
578
+ 32: Languages.INDONESIAN,
579
+ 33: Languages.THAI,
580
+ 34: Languages.ARMENIAN,
581
+ 35: Languages.DANISH,
582
+ 36: Languages.TAMIL,
583
+ 37: Languages.HINDI,
584
+ 38: Languages.CROATIAN,
585
+ 39: Languages.NOT_DEFINED,
586
+ 40: Languages.BELARUSIAN,
587
+ 41: Languages.GEORGIAN,
588
+ 42: Languages.TELUGU,
589
+ 43: Languages.KAZAKH,
590
+ 44: Languages.WARAY,
591
+ 45: Languages.LITHUANIAN,
592
+ 46: Languages.SCOTTISH,
593
+ 47: Languages.SLOVAK,
594
+ 48: Languages.BENIN,
595
+ 49: Languages.BASQUE,
596
+ 50: Languages.SLOVENIAN,
597
+ 51: Languages.NOT_DEFINED,
598
+ 52: Languages.MALAYALAM,
599
+ 53: Languages.MARATHI,
600
+ 54: Languages.ESTONIAN,
601
+ 55: Languages.AZERBAIJANI,
602
+ 56: Languages.NOT_DEFINED,
603
+ 57: Languages.ALBANIAN,
604
+ 58: Languages.LATIN,
605
+ 59: Languages.BOSNIAN,
606
+ 60: Languages.NORWEGIAN_NOVOSIBIRSK,
607
+ 61: Languages.URDU,
608
+ 62: Languages.NOT_DEFINED,
609
+ 63: Languages.NOT_DEFINED,
610
+ 64: Languages.NOT_DEFINED,
611
+ 65: Languages.NOT_DEFINED,
612
+ 66: Languages.NOT_DEFINED,
613
+ 67: Languages.NOT_DEFINED,
614
+ 68: Languages.NOT_DEFINED,
615
+ 69: Languages.NOT_DEFINED,
616
+ 70: Languages.NOT_DEFINED,
617
+ 71: Languages.NOT_DEFINED,
618
+ 72: Languages.NOT_DEFINED,
619
+ 73: Languages.NOT_DEFINED,
620
+ 74: Languages.NOT_DEFINED,
621
+ 75: Languages.NOT_DEFINED,
622
+ 76: Languages.NOT_DEFINED,
623
+ 77: Languages.NOT_DEFINED,
624
+ 78: Languages.NOT_DEFINED,
625
+ 79: Languages.NOT_DEFINED,
626
+ 80: Languages.NOT_DEFINED,
627
+ 81: Languages.NOT_DEFINED,
628
+ 82: Languages.NOT_DEFINED,
629
+ 83: Languages.NOT_DEFINED,
630
+ 84: Languages.NOT_DEFINED,
631
+ 85: Languages.NOT_DEFINED,
632
+ 86: Languages.NOT_DEFINED,
633
+ 87: Languages.NOT_DEFINED,
634
+ 88: Languages.NOT_DEFINED,
635
+ 89: Languages.NOT_DEFINED,
636
+ 90: Languages.NOT_DEFINED,
637
+ 91: Languages.NOT_DEFINED,
638
+ 92: Languages.NOT_DEFINED,
639
+ 93: Languages.NOT_DEFINED,
640
+ 94: Languages.NOT_DEFINED,
641
+ 95: Languages.NOT_DEFINED,
642
+ 96: Languages.NOT_DEFINED,
643
+ 97: Languages.NOT_DEFINED,
644
+ 98: Languages.NOT_DEFINED,
645
+ 99: Languages.NOT_DEFINED,
646
+ 100: Languages.NOT_DEFINED,
647
+ 101: Languages.NOT_DEFINED,
648
+ 102: Languages.NOT_DEFINED,
649
+ 103: Languages.NOT_DEFINED,
650
+ 104: Languages.NOT_DEFINED,
651
+ 105: Languages.NOT_DEFINED,
652
+ 106: Languages.NOT_DEFINED,
653
+ 107: Languages.NOT_DEFINED,
654
+ 108: Languages.NOT_DEFINED,
655
+ 109: Languages.NOT_DEFINED,
656
+ 110: Languages.NOT_DEFINED,
657
+ 111: Languages.NOT_DEFINED,
658
+ 112: Languages.NOT_DEFINED,
659
+ 113: Languages.NOT_DEFINED,
660
+ 114: Languages.NOT_DEFINED,
661
+ 115: Languages.NOT_DEFINED,
662
+ 116: Languages.NOT_DEFINED,
663
+ 117: Languages.NOT_DEFINED,
664
+ 118: Languages.NOT_DEFINED,
665
+ 119: Languages.NOT_DEFINED,
666
+ 120: Languages.NOT_DEFINED,
667
+ 121: Languages.NOT_DEFINED,
668
+ 122: Languages.NOT_DEFINED,
669
+ 123: Languages.NOT_DEFINED,
670
+ 124: Languages.NOT_DEFINED,
671
+ 125: Languages.NOT_DEFINED,
672
+ 126: Languages.NOT_DEFINED,
673
+ 127: Languages.NOT_DEFINED,
674
+ 128: Languages.NOT_DEFINED,
675
+ 129: Languages.NOT_DEFINED,
676
+ 130: Languages.NOT_DEFINED,
677
+ 131: Languages.NOT_DEFINED,
678
+ 132: Languages.NOT_DEFINED,
679
+ 133: Languages.NOT_DEFINED,
680
+ 134: Languages.NOT_DEFINED,
681
+ 135: Languages.NOT_DEFINED,
682
+ 136: Languages.NOT_DEFINED,
683
+ 137: Languages.NOT_DEFINED,
684
+ 138: Languages.NOT_DEFINED,
685
+ 139: Languages.NOT_DEFINED,
686
+ 140: Languages.NOT_DEFINED,
687
+ 141: Languages.NOT_DEFINED,
688
+ 142: Languages.NOT_DEFINED,
689
+ 143: Languages.NOT_DEFINED,
690
+ 144: Languages.NOT_DEFINED,
691
+ 145: Languages.NOT_DEFINED,
692
+ 146: Languages.NOT_DEFINED,
693
+ 147: Languages.NOT_DEFINED,
694
+ 148: Languages.NOT_DEFINED,
695
+ 149: Languages.NOT_DEFINED,
696
+ 150: Languages.NOT_DEFINED,
697
+ 151: Languages.NOT_DEFINED,
698
+ 152: Languages.NOT_DEFINED,
699
+ 153: Languages.NOT_DEFINED,
700
+ 154: Languages.NOT_DEFINED,
701
+ 155: Languages.NOT_DEFINED,
702
+ 156: Languages.NOT_DEFINED,
703
+ 157: Languages.NOT_DEFINED,
704
+ 158: Languages.NOT_DEFINED,
705
+ 159: Languages.NOT_DEFINED,
706
+ 160: Languages.NOT_DEFINED,
707
+ 161: Languages.NOT_DEFINED,
708
+ 162: Languages.NOT_DEFINED,
709
+ 163: Languages.NOT_DEFINED,
710
+ 164: Languages.NOT_DEFINED,
711
+ 165: Languages.NOT_DEFINED,
712
+ 166: Languages.NOT_DEFINED,
713
+ 167: Languages.NOT_DEFINED,
714
+ 168: Languages.NOT_DEFINED,
715
+ 169: Languages.NOT_DEFINED,
716
+ 170: Languages.NOT_DEFINED,
717
+ 171: Languages.NOT_DEFINED,
718
+ 172: Languages.NOT_DEFINED,
719
+ 173: Languages.NOT_DEFINED,
720
+ 174: Languages.NOT_DEFINED,
721
+ 175: Languages.NOT_DEFINED,
722
+ 176: Languages.NOT_DEFINED,
723
+ },
724
+ categories_orig={
725
+ "__label__en": Languages.ENGLISH,
726
+ "__label__ru": Languages.RUSSIAN,
727
+ "__label__de": Languages.GERMAN,
728
+ "__label__fr": Languages.FRENCH,
729
+ "__label__it": Languages.ITALIAN,
730
+ "__label__ja": Languages.JAPANESE,
731
+ "__label__es": Languages.SPANISH,
732
+ "__label__ceb": Languages.CEBUANO,
733
+ "__label__tr": Languages.TURKISH,
734
+ "__label__pt": Languages.PORTUGUESE,
735
+ "__label__uk": Languages.UKRAINIAN,
736
+ "__label__eo": Languages.ESPERANTO,
737
+ "__label__pl": Languages.POLISH,
738
+ "__label__sv": Languages.SWEDISH,
739
+ "__label__nl": Languages.DUTCH,
740
+ "__label__he": Languages.HEBREW,
741
+ "__label__zh": Languages.CHINESE,
742
+ "__label__hu": Languages.HUNGARIAN,
743
+ "__label__ar": Languages.ARABIC,
744
+ "__label__ca": Languages.CATALAN,
745
+ "__label__fi": Languages.FINNISH,
746
+ "__label__cs": Languages.CZECH,
747
+ "__label__fa": Languages.PERSIAN,
748
+ "__label__sr": Languages.SERBIAN,
749
+ "__label__el": Languages.GREEK,
750
+ "__label__vi": Languages.VIETNAMESE,
751
+ "__label__bg": Languages.BULGARIAN,
752
+ "__label__ko": Languages.KOREAN,
753
+ "__label__no": Languages.NORWEGIAN,
754
+ "__label__mk": Languages.MACEDONIAN,
755
+ "__label__ro": Languages.ROMANIAN,
756
+ "__label__id": Languages.INDONESIAN,
757
+ "__label__th": Languages.THAI,
758
+ "__label__hy": Languages.ARMENIAN,
759
+ "__label__da": Languages.DANISH,
760
+ "__label__ta": Languages.TAMIL,
761
+ "__label__hi": Languages.HINDI,
762
+ "__label__hr": Languages.CROATIAN,
763
+ "__label__sh": Languages.NOT_DEFINED,
764
+ "__label__be": Languages.BELARUSIAN,
765
+ "__label__ka": Languages.GEORGIAN,
766
+ "__label__te": Languages.TELUGU,
767
+ "__label__kk": Languages.KAZAKH,
768
+ "__label__war": Languages.WARAY,
769
+ "__label__lt": Languages.LITHUANIAN,
770
+ "__label__gl": Languages.SCOTTISH,
771
+ "__label__sk": Languages.SLOVAK,
772
+ "__label__bn": Languages.BENIN,
773
+ "__label__eu": Languages.BASQUE,
774
+ "__label__sl": Languages.SLOVENIAN,
775
+ "__label__kn": Languages.NOT_DEFINED,
776
+ "__label__ml": Languages.MALAYALAM,
777
+ "__label__mr": Languages.MARATHI,
778
+ "__label__et": Languages.ESTONIAN,
779
+ "__label__az": Languages.AZERBAIJANI,
780
+ "__label__ms": Languages.NOT_DEFINED,
781
+ "__label__sq": Languages.ALBANIAN,
782
+ "__label__la": Languages.LATIN,
783
+ "__label__bs": Languages.BOSNIAN,
784
+ "__label__nn": Languages.NORWEGIAN_NOVOSIBIRSK,
785
+ "__label__ur": Languages.URDU,
786
+ "__label__lv": Languages.NOT_DEFINED,
787
+ "__label__my": Languages.NOT_DEFINED,
788
+ "__label__tt": Languages.NOT_DEFINED,
789
+ "__label__af": Languages.NOT_DEFINED,
790
+ "__label__oc": Languages.NOT_DEFINED,
791
+ "__label__nds": Languages.NOT_DEFINED,
792
+ "__label__ky": Languages.NOT_DEFINED,
793
+ "__label__ast": Languages.NOT_DEFINED,
794
+ "__label__tl": Languages.NOT_DEFINED,
795
+ "__label__is": Languages.NOT_DEFINED,
796
+ "__label__ia": Languages.NOT_DEFINED,
797
+ "__label__si": Languages.NOT_DEFINED,
798
+ "__label__gu": Languages.NOT_DEFINED,
799
+ "__label__km": Languages.NOT_DEFINED,
800
+ "__label__br": Languages.NOT_DEFINED,
801
+ "__label__ba": Languages.NOT_DEFINED,
802
+ "__label__uz": Languages.NOT_DEFINED,
803
+ "__label__bo": Languages.NOT_DEFINED,
804
+ "__label__pa": Languages.NOT_DEFINED,
805
+ "__label__vo": Languages.NOT_DEFINED,
806
+ "__label__als": Languages.NOT_DEFINED,
807
+ "__label__ne": Languages.NOT_DEFINED,
808
+ "__label__cy": Languages.NOT_DEFINED,
809
+ "__label__jbo": Languages.NOT_DEFINED,
810
+ "__label__fy": Languages.NOT_DEFINED,
811
+ "__label__mn": Languages.NOT_DEFINED,
812
+ "__label__lb": Languages.NOT_DEFINED,
813
+ "__label__ce": Languages.NOT_DEFINED,
814
+ "__label__ug": Languages.NOT_DEFINED,
815
+ "__label__tg": Languages.NOT_DEFINED,
816
+ "__label__sco": Languages.NOT_DEFINED,
817
+ "__label__sa": Languages.NOT_DEFINED,
818
+ "__label__cv": Languages.NOT_DEFINED,
819
+ "__label__jv": Languages.NOT_DEFINED,
820
+ "__label__min": Languages.NOT_DEFINED,
821
+ "__label__io": Languages.NOT_DEFINED,
822
+ "__label__or": Languages.NOT_DEFINED,
823
+ "__label__as": Languages.NOT_DEFINED,
824
+ "__label__new": Languages.NOT_DEFINED,
825
+ "__label__ga": Languages.NOT_DEFINED,
826
+ "__label__mg": Languages.NOT_DEFINED,
827
+ "__label__an": Languages.NOT_DEFINED,
828
+ "__label__ckb": Languages.NOT_DEFINED,
829
+ "__label__sw": Languages.NOT_DEFINED,
830
+ "__label__bar": Languages.NOT_DEFINED,
831
+ "__label__lmo": Languages.NOT_DEFINED,
832
+ "__label__yi": Languages.NOT_DEFINED,
833
+ "__label__arz": Languages.NOT_DEFINED,
834
+ "__label__mhr": Languages.NOT_DEFINED,
835
+ "__label__azb": Languages.NOT_DEFINED,
836
+ "__label__sah": Languages.NOT_DEFINED,
837
+ "__label__pnb": Languages.NOT_DEFINED,
838
+ "__label__su": Languages.NOT_DEFINED,
839
+ "__label__bpy": Languages.NOT_DEFINED,
840
+ "__label__pms": Languages.NOT_DEFINED,
841
+ "__label__ilo": Languages.NOT_DEFINED,
842
+ "__label__wuu": Languages.NOT_DEFINED,
843
+ "__label__ku": Languages.NOT_DEFINED,
844
+ "__label__ps": Languages.NOT_DEFINED,
845
+ "__label__ie": Languages.NOT_DEFINED,
846
+ "__label__xmf": Languages.NOT_DEFINED,
847
+ "__label__yue": Languages.NOT_DEFINED,
848
+ "__label__gom": Languages.NOT_DEFINED,
849
+ "__label__li": Languages.NOT_DEFINED,
850
+ "__label__mwl": Languages.NOT_DEFINED,
851
+ "__label__kw": Languages.NOT_DEFINED,
852
+ "__label__sd": Languages.NOT_DEFINED,
853
+ "__label__hsb": Languages.NOT_DEFINED,
854
+ "__label__scn": Languages.NOT_DEFINED,
855
+ "__label__gd": Languages.NOT_DEFINED,
856
+ "__label__pam": Languages.NOT_DEFINED,
857
+ "__label__bh": Languages.NOT_DEFINED,
858
+ "__label__mai": Languages.NOT_DEFINED,
859
+ "__label__vec": Languages.NOT_DEFINED,
860
+ "__label__mt": Languages.NOT_DEFINED,
861
+ "__label__dv": Languages.NOT_DEFINED,
862
+ "__label__wa": Languages.NOT_DEFINED,
863
+ "__label__mzn": Languages.NOT_DEFINED,
864
+ "__label__am": Languages.NOT_DEFINED,
865
+ "__label__qu": Languages.NOT_DEFINED,
866
+ "__label__eml": Languages.NOT_DEFINED,
867
+ "__label__cbk": Languages.NOT_DEFINED,
868
+ "__label__tk": Languages.NOT_DEFINED,
869
+ "__label__rm": Languages.NOT_DEFINED,
870
+ "__label__os": Languages.NOT_DEFINED,
871
+ "__label__vls": Languages.NOT_DEFINED,
872
+ "__label__yo": Languages.NOT_DEFINED,
873
+ "__label__lo": Languages.NOT_DEFINED,
874
+ "__label__lez": Languages.NOT_DEFINED,
875
+ "__label__so": Languages.NOT_DEFINED,
876
+ "__label__myv": Languages.NOT_DEFINED,
877
+ "__label__diq": Languages.NOT_DEFINED,
878
+ "__label__mrj": Languages.NOT_DEFINED,
879
+ "__label__dsb": Languages.NOT_DEFINED,
880
+ "__label__frr": Languages.NOT_DEFINED,
881
+ "__label__ht": Languages.NOT_DEFINED,
882
+ "__label__gn": Languages.NOT_DEFINED,
883
+ "__label__bxr": Languages.NOT_DEFINED,
884
+ "__label__kv": Languages.NOT_DEFINED,
885
+ "__label__sc": Languages.NOT_DEFINED,
886
+ "__label__nah": Languages.NOT_DEFINED,
887
+ "__label__krc": Languages.NOT_DEFINED,
888
+ "__label__bcl": Languages.NOT_DEFINED,
889
+ "__label__nap": Languages.NOT_DEFINED,
890
+ "__label__gv": Languages.NOT_DEFINED,
891
+ "__label__av": Languages.NOT_DEFINED,
892
+ "__label__rue": Languages.NOT_DEFINED,
893
+ "__label__xal": Languages.NOT_DEFINED,
894
+ "__label__pfl": Languages.NOT_DEFINED,
895
+ "__label__dty": Languages.NOT_DEFINED,
896
+ "__label__hif": Languages.NOT_DEFINED,
897
+ "__label__co": Languages.NOT_DEFINED,
898
+ "__label__lrc": Languages.NOT_DEFINED,
899
+ "__label__vep": Languages.NOT_DEFINED,
900
+ "__label__tyv": Languages.NOT_DEFINED,
720
901
  },
721
902
  model_wrapper="FasttextLangDetector",
722
903
  ),
723
904
  }
724
905
 
725
906
  @staticmethod
726
- def get_full_path_weights(name: str) -> str:
907
+ def get_full_path_weights(name: PathLikeOrStr) -> PathLikeOrStr:
727
908
  """
728
909
  Returns the absolute path of weights.
729
910
 
@@ -734,7 +915,7 @@ class ModelCatalog:
734
915
  :return: absolute weight path
735
916
  """
736
917
  try:
737
- profile = ModelCatalog.get_profile(name)
918
+ profile = ModelCatalog.get_profile(os.fspath(name))
738
919
  except KeyError:
739
920
  logger.info(
740
921
  LoggingRecord(
@@ -754,7 +935,7 @@ class ModelCatalog:
754
935
  return os.path.join(get_weights_dir_path(), name)
755
936
 
756
937
  @staticmethod
757
- def get_full_path_configs(name: str) -> str:
938
+ def get_full_path_configs(name: PathLikeOrStr) -> PathLikeOrStr:
758
939
  """
759
940
  Return the absolute path of configs for some given weights. Alternatively, pass last a path to a config file
760
941
  (without the base path to the cache config directory).
@@ -766,7 +947,7 @@ class ModelCatalog:
766
947
  :return: absolute path to the config
767
948
  """
768
949
  try:
769
- profile = ModelCatalog.get_profile(name)
950
+ profile = ModelCatalog.get_profile(os.fspath(name))
770
951
  except KeyError:
771
952
  logger.info(
772
953
  LoggingRecord(
@@ -780,7 +961,7 @@ class ModelCatalog:
780
961
  return os.path.join(get_configs_dir_path(), name)
781
962
 
782
963
  @staticmethod
783
- def get_full_path_preprocessor_configs(name: str) -> str:
964
+ def get_full_path_preprocessor_configs(name: Union[str]) -> PathLikeOrStr:
784
965
  """
785
966
  Return the absolute path of preprocessor configs for some given weights. Preprocessor are occasionally provided
786
967
  by the transformer library.
@@ -804,21 +985,21 @@ class ModelCatalog:
804
985
  return os.path.join(get_configs_dir_path(), name)
805
986
 
806
987
  @staticmethod
807
- def get_model_list() -> List[str]:
988
+ def get_model_list() -> list[PathLikeOrStr]:
808
989
  """
809
990
  Returns a list of absolute paths of registered models.
810
991
  """
811
992
  return [os.path.join(get_weights_dir_path(), profile.name) for profile in ModelCatalog.CATALOG.values()]
812
993
 
813
994
  @staticmethod
814
- def get_profile_list() -> List[str]:
995
+ def get_profile_list() -> list[str]:
815
996
  """
816
997
  Returns a list profile keys.
817
998
  """
818
999
  return list(ModelCatalog.CATALOG.keys())
819
1000
 
820
1001
  @staticmethod
821
- def is_registered(path_weights: str) -> bool:
1002
+ def is_registered(path_weights: PathLikeOrStr) -> bool:
822
1003
  """
823
1004
  Checks if some weights belong to a registered model
824
1005
 
@@ -842,8 +1023,8 @@ class ModelCatalog:
842
1023
 
843
1024
  profile = ModelCatalog.CATALOG.get(name)
844
1025
  if profile is not None:
845
- return copy(profile)
846
- raise KeyError("Model Profile does not exist. Please make sure the model is registered")
1026
+ return profile
1027
+ raise KeyError(f"Model Profile {name} does not exist. Please make sure the model is registered")
847
1028
 
848
1029
  @staticmethod
849
1030
  def register(name: str, profile: ModelProfile) -> None:
@@ -859,7 +1040,7 @@ class ModelCatalog:
859
1040
  ModelCatalog.CATALOG[name] = profile
860
1041
 
861
1042
  @staticmethod
862
- def load_profiles_from_file(path: Optional[str] = None) -> None:
1043
+ def load_profiles_from_file(path: Optional[PathLikeOrStr] = None) -> None:
863
1044
  """
864
1045
  Load model profiles from a jsonl file and extend `CATALOG` with the new profiles.
865
1046
 
@@ -870,10 +1051,11 @@ class ModelCatalog:
870
1051
  with jsonlines.open(path) as reader:
871
1052
  for obj in reader:
872
1053
  if not obj["name"] in ModelCatalog.CATALOG:
1054
+ obj["categories"] = {int(key): get_type(val) for key, val in obj["categories"].items()}
873
1055
  ModelCatalog.register(obj["name"], ModelProfile(**obj))
874
1056
 
875
1057
  @staticmethod
876
- def save_profiles_to_file(target_path: str) -> None:
1058
+ def save_profiles_to_file(target_path: PathLikeOrStr) -> None:
877
1059
  """
878
1060
  Save model profiles to a jsonl file.
879
1061
 
@@ -889,7 +1071,7 @@ class ModelCatalog:
889
1071
  ModelCatalog.load_profiles_from_file(os.environ.get("MODEL_CATALOG", None))
890
1072
 
891
1073
 
892
- def get_tp_weight_names(name: str) -> List[str]:
1074
+ def get_tp_weight_names(name: str) -> list[str]:
893
1075
  """
894
1076
  Given a path to some model weights it will return all file names according to TP naming convention
895
1077
 
@@ -915,7 +1097,7 @@ def print_model_infos(add_description: bool = True, add_config: bool = True, add
915
1097
  num_columns = min(6, len(profiles))
916
1098
  infos = []
917
1099
  for profile in profiles:
918
- tbl_input: List[Union[Mapping[str, ObjectTypes], str]] = [profile.name]
1100
+ tbl_input: list[Union[Mapping[int, ObjectTypes], str]] = [profile.name]
919
1101
  if add_description:
920
1102
  tbl_input.append(profile.description)
921
1103
  if add_config:
@@ -950,7 +1132,7 @@ class ModelDownloadManager:
950
1132
  """
951
1133
 
952
1134
  @staticmethod
953
- def maybe_download_weights_and_configs(name: str) -> str:
1135
+ def maybe_download_weights_and_configs(name: str) -> PathLikeOrStr:
954
1136
  """
955
1137
  Check if some model is registered. If yes, it will check if their weights
956
1138
  must be downloaded. Only weights that have not the same expected size will be downloaded again.
@@ -960,7 +1142,7 @@ class ModelDownloadManager:
960
1142
  """
961
1143
 
962
1144
  absolute_path_weights = ModelCatalog.get_full_path_weights(name)
963
- file_names: List[str] = []
1145
+ file_names: list[str] = []
964
1146
  if ModelCatalog.is_registered(name):
965
1147
  profile = ModelCatalog.get_profile(name)
966
1148
  # there is nothing to download if hf_repo_id or urls is not provided
@@ -980,9 +1162,11 @@ class ModelDownloadManager:
980
1162
  else:
981
1163
  file_names.append(model_name)
982
1164
  if profile.hf_repo_id:
983
- ModelDownloadManager.load_model_from_hf_hub(profile, absolute_path_weights, file_names)
1165
+ if not os.path.isfile(absolute_path_weights):
1166
+ ModelDownloadManager.load_model_from_hf_hub(profile, absolute_path_weights, file_names)
984
1167
  absolute_path_configs = ModelCatalog.get_full_path_configs(name)
985
- ModelDownloadManager.load_configs_from_hf_hub(profile, absolute_path_configs)
1168
+ if not os.path.isfile(absolute_path_configs):
1169
+ ModelDownloadManager.load_configs_from_hf_hub(profile, absolute_path_configs)
986
1170
  else:
987
1171
  ModelDownloadManager._load_from_gd(profile, absolute_path_weights, file_names)
988
1172
 
@@ -991,7 +1175,7 @@ class ModelDownloadManager:
991
1175
  return absolute_path_weights
992
1176
 
993
1177
  @staticmethod
994
- def load_model_from_hf_hub(profile: ModelProfile, absolute_path: str, file_names: List[str]) -> None:
1178
+ def load_model_from_hf_hub(profile: ModelProfile, absolute_path: PathLikeOrStr, file_names: list[str]) -> None:
995
1179
  """
996
1180
  Load a model from the Huggingface hub for a given profile and saves the model at the directory of the given
997
1181
  path.
@@ -1017,7 +1201,7 @@ class ModelDownloadManager:
1017
1201
  )
1018
1202
 
1019
1203
  @staticmethod
1020
- def _load_from_gd(profile: ModelProfile, absolute_path: str, file_names: List[str]) -> None:
1204
+ def _load_from_gd(profile: ModelProfile, absolute_path: PathLikeOrStr, file_names: list[str]) -> None:
1021
1205
  if profile.urls is None:
1022
1206
  raise ValueError("urls cannot be None")
1023
1207
  for size, url, file_name in zip(profile.size, profile.urls, file_names):
@@ -1025,7 +1209,7 @@ class ModelDownloadManager:
1025
1209
  download(str(url), directory, file_name, int(size))
1026
1210
 
1027
1211
  @staticmethod
1028
- def load_configs_from_hf_hub(profile: ModelProfile, absolute_path: str) -> None:
1212
+ def load_configs_from_hf_hub(profile: ModelProfile, absolute_path: PathLikeOrStr) -> None:
1029
1213
  """
1030
1214
  Load config file(s) from the Huggingface hub for a given profile and saves the model at the directory of the
1031
1215
  given path.
@@ -1044,9 +1228,11 @@ class ModelDownloadManager:
1044
1228
  ModelDownloadManager._load_from_hf_hub(repo_id, file_name, directory)
1045
1229
 
1046
1230
  @staticmethod
1047
- def _load_from_hf_hub(repo_id: str, file_name: str, cache_directory: str, force_download: bool = False) -> int:
1231
+ def _load_from_hf_hub(
1232
+ repo_id: str, file_name: str, cache_directory: PathLikeOrStr, force_download: bool = False
1233
+ ) -> int:
1048
1234
  url = hf_hub_url(repo_id=repo_id, filename=file_name)
1049
- token = os.environ.get("HF_CREDENTIALS")
1235
+ token = os.environ.get("HF_CREDENTIALS", None)
1050
1236
  f_path = cached_download(
1051
1237
  url,
1052
1238
  cache_dir=cache_directory,