deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (111)
  1. deepdoctection/__init__.py +8 -25
  2. deepdoctection/analyzer/dd.py +84 -71
  3. deepdoctection/dataflow/common.py +9 -5
  4. deepdoctection/dataflow/custom.py +5 -5
  5. deepdoctection/dataflow/custom_serialize.py +75 -18
  6. deepdoctection/dataflow/parallel_map.py +3 -3
  7. deepdoctection/dataflow/serialize.py +4 -4
  8. deepdoctection/dataflow/stats.py +3 -3
  9. deepdoctection/datapoint/annotation.py +78 -56
  10. deepdoctection/datapoint/box.py +7 -7
  11. deepdoctection/datapoint/convert.py +6 -6
  12. deepdoctection/datapoint/image.py +157 -75
  13. deepdoctection/datapoint/view.py +175 -151
  14. deepdoctection/datasets/adapter.py +30 -24
  15. deepdoctection/datasets/base.py +10 -10
  16. deepdoctection/datasets/dataflow_builder.py +3 -3
  17. deepdoctection/datasets/info.py +23 -25
  18. deepdoctection/datasets/instances/doclaynet.py +48 -49
  19. deepdoctection/datasets/instances/fintabnet.py +44 -45
  20. deepdoctection/datasets/instances/funsd.py +23 -23
  21. deepdoctection/datasets/instances/iiitar13k.py +8 -8
  22. deepdoctection/datasets/instances/layouttest.py +2 -2
  23. deepdoctection/datasets/instances/publaynet.py +3 -3
  24. deepdoctection/datasets/instances/pubtables1m.py +18 -18
  25. deepdoctection/datasets/instances/pubtabnet.py +30 -29
  26. deepdoctection/datasets/instances/rvlcdip.py +28 -29
  27. deepdoctection/datasets/instances/xfund.py +51 -30
  28. deepdoctection/datasets/save.py +6 -6
  29. deepdoctection/eval/accmetric.py +32 -33
  30. deepdoctection/eval/base.py +8 -9
  31. deepdoctection/eval/cocometric.py +13 -12
  32. deepdoctection/eval/eval.py +32 -26
  33. deepdoctection/eval/tedsmetric.py +16 -12
  34. deepdoctection/eval/tp_eval_callback.py +7 -16
  35. deepdoctection/extern/base.py +339 -134
  36. deepdoctection/extern/d2detect.py +69 -89
  37. deepdoctection/extern/deskew.py +11 -10
  38. deepdoctection/extern/doctrocr.py +81 -64
  39. deepdoctection/extern/fastlang.py +23 -16
  40. deepdoctection/extern/hfdetr.py +53 -38
  41. deepdoctection/extern/hflayoutlm.py +216 -155
  42. deepdoctection/extern/hflm.py +35 -30
  43. deepdoctection/extern/model.py +433 -255
  44. deepdoctection/extern/pdftext.py +15 -15
  45. deepdoctection/extern/pt/ptutils.py +4 -2
  46. deepdoctection/extern/tessocr.py +39 -38
  47. deepdoctection/extern/texocr.py +14 -16
  48. deepdoctection/extern/tp/tfutils.py +16 -2
  49. deepdoctection/extern/tp/tpcompat.py +11 -7
  50. deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
  51. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
  52. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
  53. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
  54. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
  55. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
  56. deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
  57. deepdoctection/extern/tpdetect.py +40 -45
  58. deepdoctection/mapper/cats.py +36 -40
  59. deepdoctection/mapper/cocostruct.py +16 -12
  60. deepdoctection/mapper/d2struct.py +22 -22
  61. deepdoctection/mapper/hfstruct.py +7 -7
  62. deepdoctection/mapper/laylmstruct.py +22 -24
  63. deepdoctection/mapper/maputils.py +9 -10
  64. deepdoctection/mapper/match.py +33 -2
  65. deepdoctection/mapper/misc.py +6 -7
  66. deepdoctection/mapper/pascalstruct.py +4 -4
  67. deepdoctection/mapper/prodigystruct.py +6 -6
  68. deepdoctection/mapper/pubstruct.py +84 -92
  69. deepdoctection/mapper/tpstruct.py +3 -3
  70. deepdoctection/mapper/xfundstruct.py +33 -33
  71. deepdoctection/pipe/anngen.py +39 -14
  72. deepdoctection/pipe/base.py +68 -99
  73. deepdoctection/pipe/common.py +181 -85
  74. deepdoctection/pipe/concurrency.py +14 -10
  75. deepdoctection/pipe/doctectionpipe.py +24 -21
  76. deepdoctection/pipe/language.py +20 -25
  77. deepdoctection/pipe/layout.py +18 -16
  78. deepdoctection/pipe/lm.py +49 -47
  79. deepdoctection/pipe/order.py +63 -65
  80. deepdoctection/pipe/refine.py +102 -109
  81. deepdoctection/pipe/segment.py +157 -162
  82. deepdoctection/pipe/sub_layout.py +50 -40
  83. deepdoctection/pipe/text.py +37 -36
  84. deepdoctection/pipe/transform.py +19 -16
  85. deepdoctection/train/d2_frcnn_train.py +27 -25
  86. deepdoctection/train/hf_detr_train.py +22 -18
  87. deepdoctection/train/hf_layoutlm_train.py +49 -48
  88. deepdoctection/train/tp_frcnn_train.py +10 -11
  89. deepdoctection/utils/concurrency.py +1 -1
  90. deepdoctection/utils/context.py +13 -6
  91. deepdoctection/utils/develop.py +4 -4
  92. deepdoctection/utils/env_info.py +52 -14
  93. deepdoctection/utils/file_utils.py +6 -11
  94. deepdoctection/utils/fs.py +41 -14
  95. deepdoctection/utils/identifier.py +2 -2
  96. deepdoctection/utils/logger.py +15 -15
  97. deepdoctection/utils/metacfg.py +7 -7
  98. deepdoctection/utils/pdf_utils.py +39 -14
  99. deepdoctection/utils/settings.py +188 -182
  100. deepdoctection/utils/tqdm.py +1 -1
  101. deepdoctection/utils/transform.py +14 -9
  102. deepdoctection/utils/types.py +104 -0
  103. deepdoctection/utils/utils.py +7 -7
  104. deepdoctection/utils/viz.py +70 -69
  105. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
  106. deepdoctection-0.34.dist-info/RECORD +146 -0
  107. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
  108. deepdoctection/utils/detection_types.py +0 -68
  109. deepdoctection-0.32.dist-info/RECORD +0 -146
  110. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
  111. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
@@ -20,9 +20,8 @@ Module for ModelCatalog and ModelDownloadManager
20
20
  """
21
21
 
22
22
  import os
23
- from copy import copy
24
23
  from dataclasses import asdict, dataclass, field
25
- from typing import Any, Dict, List, Mapping, Optional, Union
24
+ from typing import Any, Mapping, Optional, Union
26
25
 
27
26
  import jsonlines
28
27
  from huggingface_hub import cached_download, hf_hub_url # type: ignore
@@ -32,11 +31,12 @@ from termcolor import colored
32
31
  from ..utils.fs import download, get_configs_dir_path, get_weights_dir_path
33
32
  from ..utils.logger import LoggingRecord, log_once, logger
34
33
  from ..utils.settings import CellType, Languages, LayoutType, ObjectTypes, get_type
34
+ from ..utils.types import PathLikeOrStr
35
35
 
36
36
  __all__ = ["ModelCatalog", "ModelDownloadManager", "print_model_infos", "ModelProfile"]
37
37
 
38
38
 
39
- @dataclass
39
+ @dataclass(frozen=True)
40
40
  class ModelProfile:
41
41
  """
42
42
  Class for model profile. Add for each model one ModelProfile to the ModelCatalog
@@ -45,25 +45,21 @@ class ModelProfile:
45
45
  name: str
46
46
  description: str
47
47
 
48
- size: List[int]
48
+ size: list[int]
49
49
  tp_model: bool = field(default=False)
50
50
  config: Optional[str] = field(default=None)
51
51
  preprocessor_config: Optional[str] = field(default=None)
52
52
  hf_repo_id: Optional[str] = field(default=None)
53
53
  hf_model_name: Optional[str] = field(default=None)
54
- hf_config_file: Optional[List[str]] = field(default=None)
55
- urls: Optional[List[str]] = field(default=None)
56
- categories: Optional[Dict[str, ObjectTypes]] = field(default=None)
54
+ hf_config_file: Optional[list[str]] = field(default=None)
55
+ urls: Optional[list[str]] = field(default=None)
56
+ categories: Optional[Mapping[int, ObjectTypes]] = field(default=None)
57
+ categories_orig: Optional[Mapping[str, ObjectTypes]] = field(default=None)
57
58
  dl_library: Optional[str] = field(default=None)
58
59
  model_wrapper: Optional[str] = field(default=None)
59
60
  architecture: Optional[str] = field(default=None)
60
61
 
61
- def __post_init__(self) -> None:
62
- """updating categories to ObjectTypes. This might be necessary if we load a catalog from a file"""
63
- if self.categories:
64
- self.categories = {key: get_type(val) for key, val in self.categories.items()}
65
-
66
- def as_dict(self) -> Dict[str, Any]:
62
+ def as_dict(self) -> dict[str, Any]:
67
63
  """
68
64
  returns a dict of the dataclass
69
65
  """
@@ -94,7 +90,7 @@ class ModelCatalog:
94
90
  ModelCatalog.get_full_path_configs("my_new_model")
95
91
  """
96
92
 
97
- CATALOG: Dict[str, ModelProfile] = {
93
+ CATALOG: dict[str, ModelProfile] = {
98
94
  "layout/model-800000_inf_only.data-00000-of-00001": ModelProfile(
99
95
  name="layout/model-800000_inf_only.data-00000-of-00001",
100
96
  description="Tensorpack layout model for inference purposes trained on Publaynet",
@@ -105,11 +101,11 @@ class ModelCatalog:
105
101
  hf_model_name="model-800000_inf_only",
106
102
  hf_config_file=["conf_frcnn_layout.yaml"],
107
103
  categories={
108
- "1": LayoutType.text,
109
- "2": LayoutType.title,
110
- "3": LayoutType.list,
111
- "4": LayoutType.table,
112
- "5": LayoutType.figure,
104
+ 1: LayoutType.TEXT,
105
+ 2: LayoutType.TITLE,
106
+ 3: LayoutType.LIST,
107
+ 4: LayoutType.TABLE,
108
+ 5: LayoutType.FIGURE,
113
109
  },
114
110
  dl_library="TF",
115
111
  model_wrapper="TPFrcnnDetector",
@@ -123,7 +119,7 @@ class ModelCatalog:
123
119
  hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
124
120
  hf_model_name="model-1800000_inf_only",
125
121
  hf_config_file=["conf_frcnn_cell.yaml"],
126
- categories={"1": LayoutType.cell},
122
+ categories={1: LayoutType.CELL},
127
123
  dl_library="TF",
128
124
  model_wrapper="TPFrcnnDetector",
129
125
  ),
@@ -136,7 +132,7 @@ class ModelCatalog:
136
132
  hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
137
133
  hf_model_name="model-1620000_inf_only",
138
134
  hf_config_file=["conf_frcnn_rows.yaml"],
139
- categories={"1": LayoutType.row, "2": LayoutType.column},
135
+ categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
140
136
  dl_library="TF",
141
137
  model_wrapper="TPFrcnnDetector",
142
138
  ),
@@ -149,7 +145,7 @@ class ModelCatalog:
149
145
  hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc",
150
146
  hf_model_name="model-1620000",
151
147
  hf_config_file=["conf_frcnn_rows.yaml"],
152
- categories={"1": LayoutType.row, "2": LayoutType.column},
148
+ categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
153
149
  dl_library="TF",
154
150
  model_wrapper="TPFrcnnDetector",
155
151
  ),
@@ -164,11 +160,11 @@ class ModelCatalog:
164
160
  hf_config_file=["conf_frcnn_layout.yaml"],
165
161
  dl_library="TF",
166
162
  categories={
167
- "1": LayoutType.text,
168
- "2": LayoutType.title,
169
- "3": LayoutType.list,
170
- "4": LayoutType.table,
171
- "5": LayoutType.figure,
163
+ 1: LayoutType.TEXT,
164
+ 2: LayoutType.TITLE,
165
+ 3: LayoutType.LIST,
166
+ 4: LayoutType.TABLE,
167
+ 5: LayoutType.FIGURE,
172
168
  },
173
169
  model_wrapper="TPFrcnnDetector",
174
170
  ),
@@ -181,7 +177,7 @@ class ModelCatalog:
181
177
  hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c",
182
178
  hf_model_name="model-1800000",
183
179
  hf_config_file=["conf_frcnn_cell.yaml"],
184
- categories={"1": LayoutType.cell},
180
+ categories={1: LayoutType.CELL},
185
181
  dl_library="TF",
186
182
  model_wrapper="TPFrcnnDetector",
187
183
  ),
@@ -195,11 +191,11 @@ class ModelCatalog:
195
191
  hf_model_name="d2_model_0829999_layout_inf_only.pt",
196
192
  hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
197
193
  categories={
198
- "1": LayoutType.text,
199
- "2": LayoutType.title,
200
- "3": LayoutType.list,
201
- "4": LayoutType.table,
202
- "5": LayoutType.figure,
194
+ 1: LayoutType.TEXT,
195
+ 2: LayoutType.TITLE,
196
+ 3: LayoutType.LIST,
197
+ 4: LayoutType.TABLE,
198
+ 5: LayoutType.FIGURE,
203
199
  },
204
200
  dl_library="PT",
205
201
  model_wrapper="D2FrcnnDetector",
@@ -214,11 +210,11 @@ class ModelCatalog:
214
210
  hf_model_name="d2_model_0829999_layout.pth",
215
211
  hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
216
212
  categories={
217
- "1": LayoutType.text,
218
- "2": LayoutType.title,
219
- "3": LayoutType.list,
220
- "4": LayoutType.table,
221
- "5": LayoutType.figure,
213
+ 1: LayoutType.TEXT,
214
+ 2: LayoutType.TITLE,
215
+ 3: LayoutType.LIST,
216
+ 4: LayoutType.TABLE,
217
+ 5: LayoutType.FIGURE,
222
218
  },
223
219
  dl_library="PT",
224
220
  model_wrapper="D2FrcnnDetector",
@@ -233,11 +229,11 @@ class ModelCatalog:
233
229
  hf_model_name="d2_model_0829999_layout_inf_only.ts",
234
230
  hf_config_file=["CASCADE_RCNN_R_50_FPN_GN_TS.yaml"],
235
231
  categories={
236
- "1": LayoutType.text,
237
- "2": LayoutType.title,
238
- "3": LayoutType.list,
239
- "4": LayoutType.table,
240
- "5": LayoutType.figure,
232
+ 1: LayoutType.TEXT,
233
+ 2: LayoutType.TITLE,
234
+ 3: LayoutType.LIST,
235
+ 4: LayoutType.TABLE,
236
+ 5: LayoutType.FIGURE,
241
237
  },
242
238
  dl_library="PT",
243
239
  model_wrapper="D2FrcnnTracingDetector",
@@ -251,7 +247,7 @@ class ModelCatalog:
251
247
  hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
252
248
  hf_model_name="d2_model_1849999_cell_inf_only.pt",
253
249
  hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
254
- categories={"1": LayoutType.cell},
250
+ categories={1: LayoutType.CELL},
255
251
  dl_library="PT",
256
252
  model_wrapper="D2FrcnnDetector",
257
253
  ),
@@ -264,7 +260,7 @@ class ModelCatalog:
264
260
  hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
265
261
  hf_model_name="d2_model_1849999_cell_inf_only.ts",
266
262
  hf_config_file=["CASCADE_RCNN_R_50_FPN_GN_TS.yaml"],
267
- categories={"1": LayoutType.cell},
263
+ categories={1: LayoutType.CELL},
268
264
  dl_library="PT",
269
265
  model_wrapper="D2FrcnnTracingDetector",
270
266
  ),
@@ -277,7 +273,7 @@ class ModelCatalog:
277
273
  hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
278
274
  hf_model_name="cell/d2_model_1849999_cell.pth",
279
275
  hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
280
- categories={"1": LayoutType.cell},
276
+ categories={1: LayoutType.CELL},
281
277
  dl_library="PT",
282
278
  model_wrapper="D2FrcnnDetector",
283
279
  ),
@@ -290,7 +286,7 @@ class ModelCatalog:
290
286
  hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
291
287
  hf_model_name="d2_model_1639999_item.pth",
292
288
  hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
293
- categories={"1": LayoutType.row, "2": LayoutType.column},
289
+ categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
294
290
  dl_library="PT",
295
291
  model_wrapper="D2FrcnnDetector",
296
292
  ),
@@ -303,7 +299,7 @@ class ModelCatalog:
303
299
  hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
304
300
  hf_model_name="d2_model_1639999_item_inf_only.pt",
305
301
  hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
306
- categories={"1": LayoutType.row, "2": LayoutType.column},
302
+ categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
307
303
  dl_library="PT",
308
304
  model_wrapper="D2FrcnnDetector",
309
305
  ),
@@ -316,7 +312,7 @@ class ModelCatalog:
316
312
  hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
317
313
  hf_model_name="d2_model_1639999_item_inf_only.ts",
318
314
  hf_config_file=["CASCADE_RCNN_R_50_FPN_GN_TS.yaml"],
319
- categories={"1": LayoutType.row, "2": LayoutType.column},
315
+ categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
320
316
  dl_library="PT",
321
317
  model_wrapper="D2FrcnnTracingDetector",
322
318
  ),
@@ -453,7 +449,7 @@ class ModelCatalog:
453
449
  hf_repo_id="microsoft/table-transformer-detection",
454
450
  hf_model_name="pytorch_model.bin",
455
451
  hf_config_file=["config.json", "preprocessor_config.json"],
456
- categories={"1": LayoutType.table, "2": LayoutType.table_rotated},
452
+ categories={1: LayoutType.TABLE, 2: LayoutType.TABLE_ROTATED},
457
453
  dl_library="PT",
458
454
  model_wrapper="HFDetrDerivedDetector",
459
455
  ),
@@ -471,12 +467,12 @@ class ModelCatalog:
471
467
  hf_model_name="pytorch_model.bin",
472
468
  hf_config_file=["config.json", "preprocessor_config.json"],
473
469
  categories={
474
- "1": LayoutType.table,
475
- "2": LayoutType.column,
476
- "3": LayoutType.row,
477
- "4": CellType.column_header,
478
- "5": CellType.projected_row_header,
479
- "6": CellType.spanning,
470
+ 1: LayoutType.TABLE,
471
+ 2: LayoutType.COLUMN,
472
+ 3: LayoutType.ROW,
473
+ 4: CellType.COLUMN_HEADER,
474
+ 5: CellType.PROJECTED_ROW_HEADER,
475
+ 6: CellType.SPANNING,
480
476
  },
481
477
  dl_library="PT",
482
478
  model_wrapper="HFDetrDerivedDetector",
@@ -488,7 +484,7 @@ class ModelCatalog:
488
484
  "https://mindee.github.io/doctr/using_doctr/using_models.html#. This is the Pytorch artefact.",
489
485
  size=[101971449],
490
486
  urls=["https://doctr-static.mindee.com/models?id=v0.3.1/db_resnet50-ac60cadc.pt&src=0"],
491
- categories={"1": LayoutType.word},
487
+ categories={1: LayoutType.WORD},
492
488
  dl_library="PT",
493
489
  model_wrapper="DoctrTextlineDetector",
494
490
  architecture="db_resnet50",
@@ -500,7 +496,7 @@ class ModelCatalog:
500
496
  "https://mindee.github.io/doctr/using_doctr/using_models.html#. This is the Tensorflow artefact.",
501
497
  size=[94178964],
502
498
  urls=["https://doctr-static.mindee.com/models?id=v0.2.0/db_resnet50-adcafc63.zip&src=0"],
503
- categories={"1": LayoutType.word},
499
+ categories={1: LayoutType.WORD},
504
500
  dl_library="TF",
505
501
  model_wrapper="DoctrTextlineDetector",
506
502
  architecture="db_resnet50",
@@ -548,189 +544,367 @@ class ModelCatalog:
548
544
  size=[131266198],
549
545
  urls=["https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"],
550
546
  categories={
551
- "__label__en": Languages.english,
552
- "__label__ru": Languages.russian,
553
- "__label__de": Languages.german,
554
- "__label__fr": Languages.french,
555
- "__label__it": Languages.italian,
556
- "__label__ja": Languages.japanese,
557
- "__label__es": Languages.spanish,
558
- "__label__ceb": Languages.cebuano,
559
- "__label__tr": Languages.turkish,
560
- "__label__pt": Languages.portuguese,
561
- "__label__uk": Languages.ukrainian,
562
- "__label__eo": Languages.esperanto,
563
- "__label__pl": Languages.polish,
564
- "__label__sv": Languages.swedish,
565
- "__label__nl": Languages.dutch,
566
- "__label__he": Languages.hebrew,
567
- "__label__zh": Languages.chinese,
568
- "__label__hu": Languages.hungarian,
569
- "__label__ar": Languages.arabic,
570
- "__label__ca": Languages.catalan,
571
- "__label__fi": Languages.finnish,
572
- "__label__cs": Languages.czech,
573
- "__label__fa": Languages.persian,
574
- "__label__sr": Languages.serbian,
575
- "__label__el": Languages.greek,
576
- "__label__vi": Languages.vietnamese,
577
- "__label__bg": Languages.bulgarian,
578
- "__label__ko": Languages.korean,
579
- "__label__no": Languages.norwegian,
580
- "__label__mk": Languages.macedonian,
581
- "__label__ro": Languages.romanian,
582
- "__label__id": Languages.indonesian,
583
- "__label__th": Languages.thai,
584
- "__label__hy": Languages.armenian,
585
- "__label__da": Languages.danish,
586
- "__label__ta": Languages.tamil,
587
- "__label__hi": Languages.hindi,
588
- "__label__hr": Languages.croatian,
589
- "__label__sh": Languages.not_defined,
590
- "__label__be": Languages.belarusian,
591
- "__label__ka": Languages.georgian,
592
- "__label__te": Languages.telugu,
593
- "__label__kk": Languages.kazakh,
594
- "__label__war": Languages.waray,
595
- "__label__lt": Languages.lithuanian,
596
- "__label__gl": Languages.scottish,
597
- "__label__sk": Languages.slovak,
598
- "__label__bn": Languages.benin,
599
- "__label__eu": Languages.basque,
600
- "__label__sl": Languages.slovenian,
601
- "__label__kn": Languages.not_defined,
602
- "__label__ml": Languages.malayalam,
603
- "__label__mr": Languages.marathi,
604
- "__label__et": Languages.estonian,
605
- "__label__az": Languages.azerbaijani,
606
- "__label__ms": Languages.not_defined,
607
- "__label__sq": Languages.albanian,
608
- "__label__la": Languages.latin,
609
- "__label__bs": Languages.bosnian,
610
- "__label__nn": Languages.norwegian_nynorsk,
611
- "__label__ur": Languages.urdu,
612
- "__label__lv": Languages.not_defined,
613
- "__label__my": Languages.not_defined,
614
- "__label__tt": Languages.not_defined,
615
- "__label__af": Languages.not_defined,
616
- "__label__oc": Languages.not_defined,
617
- "__label__nds": Languages.not_defined,
618
- "__label__ky": Languages.not_defined,
619
- "__label__ast": Languages.not_defined,
620
- "__label__tl": Languages.not_defined,
621
- "__label__is": Languages.not_defined,
622
- "__label__ia": Languages.not_defined,
623
- "__label__si": Languages.not_defined,
624
- "__label__gu": Languages.not_defined,
625
- "__label__km": Languages.not_defined,
626
- "__label__br": Languages.not_defined,
627
- "__label__ba": Languages.not_defined,
628
- "__label__uz": Languages.not_defined,
629
- "__label__bo": Languages.not_defined,
630
- "__label__pa": Languages.not_defined,
631
- "__label__vo": Languages.not_defined,
632
- "__label__als": Languages.not_defined,
633
- "__label__ne": Languages.not_defined,
634
- "__label__cy": Languages.not_defined,
635
- "__label__jbo": Languages.not_defined,
636
- "__label__fy": Languages.not_defined,
637
- "__label__mn": Languages.not_defined,
638
- "__label__lb": Languages.not_defined,
639
- "__label__ce": Languages.not_defined,
640
- "__label__ug": Languages.not_defined,
641
- "__label__tg": Languages.not_defined,
642
- "__label__sco": Languages.not_defined,
643
- "__label__sa": Languages.not_defined,
644
- "__label__cv": Languages.not_defined,
645
- "__label__jv": Languages.not_defined,
646
- "__label__min": Languages.not_defined,
647
- "__label__io": Languages.not_defined,
648
- "__label__or": Languages.not_defined,
649
- "__label__as": Languages.not_defined,
650
- "__label__new": Languages.not_defined,
651
- "__label__ga": Languages.not_defined,
652
- "__label__mg": Languages.not_defined,
653
- "__label__an": Languages.not_defined,
654
- "__label__ckb": Languages.not_defined,
655
- "__label__sw": Languages.not_defined,
656
- "__label__bar": Languages.not_defined,
657
- "__label__lmo": Languages.not_defined,
658
- "__label__yi": Languages.not_defined,
659
- "__label__arz": Languages.not_defined,
660
- "__label__mhr": Languages.not_defined,
661
- "__label__azb": Languages.not_defined,
662
- "__label__sah": Languages.not_defined,
663
- "__label__pnb": Languages.not_defined,
664
- "__label__su": Languages.not_defined,
665
- "__label__bpy": Languages.not_defined,
666
- "__label__pms": Languages.not_defined,
667
- "__label__ilo": Languages.not_defined,
668
- "__label__wuu": Languages.not_defined,
669
- "__label__ku": Languages.not_defined,
670
- "__label__ps": Languages.not_defined,
671
- "__label__ie": Languages.not_defined,
672
- "__label__xmf": Languages.not_defined,
673
- "__label__yue": Languages.not_defined,
674
- "__label__gom": Languages.not_defined,
675
- "__label__li": Languages.not_defined,
676
- "__label__mwl": Languages.not_defined,
677
- "__label__kw": Languages.not_defined,
678
- "__label__sd": Languages.not_defined,
679
- "__label__hsb": Languages.not_defined,
680
- "__label__scn": Languages.not_defined,
681
- "__label__gd": Languages.not_defined,
682
- "__label__pam": Languages.not_defined,
683
- "__label__bh": Languages.not_defined,
684
- "__label__mai": Languages.not_defined,
685
- "__label__vec": Languages.not_defined,
686
- "__label__mt": Languages.not_defined,
687
- "__label__dv": Languages.not_defined,
688
- "__label__wa": Languages.not_defined,
689
- "__label__mzn": Languages.not_defined,
690
- "__label__am": Languages.not_defined,
691
- "__label__qu": Languages.not_defined,
692
- "__label__eml": Languages.not_defined,
693
- "__label__cbk": Languages.not_defined,
694
- "__label__tk": Languages.not_defined,
695
- "__label__rm": Languages.not_defined,
696
- "__label__os": Languages.not_defined,
697
- "__label__vls": Languages.not_defined,
698
- "__label__yo": Languages.not_defined,
699
- "__label__lo": Languages.not_defined,
700
- "__label__lez": Languages.not_defined,
701
- "__label__so": Languages.not_defined,
702
- "__label__myv": Languages.not_defined,
703
- "__label__diq": Languages.not_defined,
704
- "__label__mrj": Languages.not_defined,
705
- "__label__dsb": Languages.not_defined,
706
- "__label__frr": Languages.not_defined,
707
- "__label__ht": Languages.not_defined,
708
- "__label__gn": Languages.not_defined,
709
- "__label__bxr": Languages.not_defined,
710
- "__label__kv": Languages.not_defined,
711
- "__label__sc": Languages.not_defined,
712
- "__label__nah": Languages.not_defined,
713
- "__label__krc": Languages.not_defined,
714
- "__label__bcl": Languages.not_defined,
715
- "__label__nap": Languages.not_defined,
716
- "__label__gv": Languages.not_defined,
717
- "__label__av": Languages.not_defined,
718
- "__label__rue": Languages.not_defined,
719
- "__label__xal": Languages.not_defined,
720
- "__label__pfl": Languages.not_defined,
721
- "__label__dty": Languages.not_defined,
722
- "__label__hif": Languages.not_defined,
723
- "__label__co": Languages.not_defined,
724
- "__label__lrc": Languages.not_defined,
725
- "__label__vep": Languages.not_defined,
726
- "__label__tyv": Languages.not_defined,
547
+ 1: Languages.ENGLISH,
548
+ 2: Languages.RUSSIAN,
549
+ 3: Languages.GERMAN,
550
+ 4: Languages.FRENCH,
551
+ 5: Languages.ITALIAN,
552
+ 6: Languages.JAPANESE,
553
+ 7: Languages.SPANISH,
554
+ 8: Languages.CEBUANO,
555
+ 9: Languages.TURKISH,
556
+ 10: Languages.PORTUGUESE,
557
+ 11: Languages.UKRAINIAN,
558
+ 12: Languages.ESPERANTO,
559
+ 13: Languages.POLISH,
560
+ 14: Languages.SWEDISH,
561
+ 15: Languages.DUTCH,
562
+ 16: Languages.HEBREW,
563
+ 17: Languages.CHINESE,
564
+ 18: Languages.HUNGARIAN,
565
+ 19: Languages.ARABIC,
566
+ 20: Languages.CATALAN,
567
+ 21: Languages.FINNISH,
568
+ 22: Languages.CZECH,
569
+ 23: Languages.PERSIAN,
570
+ 24: Languages.SERBIAN,
571
+ 25: Languages.GREEK,
572
+ 26: Languages.VIETNAMESE,
573
+ 27: Languages.BULGARIAN,
574
+ 28: Languages.KOREAN,
575
+ 29: Languages.NORWEGIAN,
576
+ 30: Languages.MACEDONIAN,
577
+ 31: Languages.ROMANIAN,
578
+ 32: Languages.INDONESIAN,
579
+ 33: Languages.THAI,
580
+ 34: Languages.ARMENIAN,
581
+ 35: Languages.DANISH,
582
+ 36: Languages.TAMIL,
583
+ 37: Languages.HINDI,
584
+ 38: Languages.CROATIAN,
585
+ 39: Languages.NOT_DEFINED,
586
+ 40: Languages.BELARUSIAN,
587
+ 41: Languages.GEORGIAN,
588
+ 42: Languages.TELUGU,
589
+ 43: Languages.KAZAKH,
590
+ 44: Languages.WARAY,
591
+ 45: Languages.LITHUANIAN,
592
+ 46: Languages.SCOTTISH,
593
+ 47: Languages.SLOVAK,
594
+ 48: Languages.BENIN,
595
+ 49: Languages.BASQUE,
596
+ 50: Languages.SLOVENIAN,
597
+ 51: Languages.NOT_DEFINED,
598
+ 52: Languages.MALAYALAM,
599
+ 53: Languages.MARATHI,
600
+ 54: Languages.ESTONIAN,
601
+ 55: Languages.AZERBAIJANI,
602
+ 56: Languages.NOT_DEFINED,
603
+ 57: Languages.ALBANIAN,
604
+ 58: Languages.LATIN,
605
+ 59: Languages.BOSNIAN,
606
+ 60: Languages.NORWEGIAN_NOVOSIBIRSK,
607
+ 61: Languages.URDU,
608
+ 62: Languages.NOT_DEFINED,
609
+ 63: Languages.NOT_DEFINED,
610
+ 64: Languages.NOT_DEFINED,
611
+ 65: Languages.NOT_DEFINED,
612
+ 66: Languages.NOT_DEFINED,
613
+ 67: Languages.NOT_DEFINED,
614
+ 68: Languages.NOT_DEFINED,
615
+ 69: Languages.NOT_DEFINED,
616
+ 70: Languages.NOT_DEFINED,
617
+ 71: Languages.NOT_DEFINED,
618
+ 72: Languages.NOT_DEFINED,
619
+ 73: Languages.NOT_DEFINED,
620
+ 74: Languages.NOT_DEFINED,
621
+ 75: Languages.NOT_DEFINED,
622
+ 76: Languages.NOT_DEFINED,
623
+ 77: Languages.NOT_DEFINED,
624
+ 78: Languages.NOT_DEFINED,
625
+ 79: Languages.NOT_DEFINED,
626
+ 80: Languages.NOT_DEFINED,
627
+ 81: Languages.NOT_DEFINED,
628
+ 82: Languages.NOT_DEFINED,
629
+ 83: Languages.NOT_DEFINED,
630
+ 84: Languages.NOT_DEFINED,
631
+ 85: Languages.NOT_DEFINED,
632
+ 86: Languages.NOT_DEFINED,
633
+ 87: Languages.NOT_DEFINED,
634
+ 88: Languages.NOT_DEFINED,
635
+ 89: Languages.NOT_DEFINED,
636
+ 90: Languages.NOT_DEFINED,
637
+ 91: Languages.NOT_DEFINED,
638
+ 92: Languages.NOT_DEFINED,
639
+ 93: Languages.NOT_DEFINED,
640
+ 94: Languages.NOT_DEFINED,
641
+ 95: Languages.NOT_DEFINED,
642
+ 96: Languages.NOT_DEFINED,
643
+ 97: Languages.NOT_DEFINED,
644
+ 98: Languages.NOT_DEFINED,
645
+ 99: Languages.NOT_DEFINED,
646
+ 100: Languages.NOT_DEFINED,
647
+ 101: Languages.NOT_DEFINED,
648
+ 102: Languages.NOT_DEFINED,
649
+ 103: Languages.NOT_DEFINED,
650
+ 104: Languages.NOT_DEFINED,
651
+ 105: Languages.NOT_DEFINED,
652
+ 106: Languages.NOT_DEFINED,
653
+ 107: Languages.NOT_DEFINED,
654
+ 108: Languages.NOT_DEFINED,
655
+ 109: Languages.NOT_DEFINED,
656
+ 110: Languages.NOT_DEFINED,
657
+ 111: Languages.NOT_DEFINED,
658
+ 112: Languages.NOT_DEFINED,
659
+ 113: Languages.NOT_DEFINED,
660
+ 114: Languages.NOT_DEFINED,
661
+ 115: Languages.NOT_DEFINED,
662
+ 116: Languages.NOT_DEFINED,
663
+ 117: Languages.NOT_DEFINED,
664
+ 118: Languages.NOT_DEFINED,
665
+ 119: Languages.NOT_DEFINED,
666
+ 120: Languages.NOT_DEFINED,
667
+ 121: Languages.NOT_DEFINED,
668
+ 122: Languages.NOT_DEFINED,
669
+ 123: Languages.NOT_DEFINED,
670
+ 124: Languages.NOT_DEFINED,
671
+ 125: Languages.NOT_DEFINED,
672
+ 126: Languages.NOT_DEFINED,
673
+ 127: Languages.NOT_DEFINED,
674
+ 128: Languages.NOT_DEFINED,
675
+ 129: Languages.NOT_DEFINED,
676
+ 130: Languages.NOT_DEFINED,
677
+ 131: Languages.NOT_DEFINED,
678
+ 132: Languages.NOT_DEFINED,
679
+ 133: Languages.NOT_DEFINED,
680
+ 134: Languages.NOT_DEFINED,
681
+ 135: Languages.NOT_DEFINED,
682
+ 136: Languages.NOT_DEFINED,
683
+ 137: Languages.NOT_DEFINED,
684
+ 138: Languages.NOT_DEFINED,
685
+ 139: Languages.NOT_DEFINED,
686
+ 140: Languages.NOT_DEFINED,
687
+ 141: Languages.NOT_DEFINED,
688
+ 142: Languages.NOT_DEFINED,
689
+ 143: Languages.NOT_DEFINED,
690
+ 144: Languages.NOT_DEFINED,
691
+ 145: Languages.NOT_DEFINED,
692
+ 146: Languages.NOT_DEFINED,
693
+ 147: Languages.NOT_DEFINED,
694
+ 148: Languages.NOT_DEFINED,
695
+ 149: Languages.NOT_DEFINED,
696
+ 150: Languages.NOT_DEFINED,
697
+ 151: Languages.NOT_DEFINED,
698
+ 152: Languages.NOT_DEFINED,
699
+ 153: Languages.NOT_DEFINED,
700
+ 154: Languages.NOT_DEFINED,
701
+ 155: Languages.NOT_DEFINED,
702
+ 156: Languages.NOT_DEFINED,
703
+ 157: Languages.NOT_DEFINED,
704
+ 158: Languages.NOT_DEFINED,
705
+ 159: Languages.NOT_DEFINED,
706
+ 160: Languages.NOT_DEFINED,
707
+ 161: Languages.NOT_DEFINED,
708
+ 162: Languages.NOT_DEFINED,
709
+ 163: Languages.NOT_DEFINED,
710
+ 164: Languages.NOT_DEFINED,
711
+ 165: Languages.NOT_DEFINED,
712
+ 166: Languages.NOT_DEFINED,
713
+ 167: Languages.NOT_DEFINED,
714
+ 168: Languages.NOT_DEFINED,
715
+ 169: Languages.NOT_DEFINED,
716
+ 170: Languages.NOT_DEFINED,
717
+ 171: Languages.NOT_DEFINED,
718
+ 172: Languages.NOT_DEFINED,
719
+ 173: Languages.NOT_DEFINED,
720
+ 174: Languages.NOT_DEFINED,
721
+ 175: Languages.NOT_DEFINED,
722
+ 176: Languages.NOT_DEFINED,
723
+ },
724
+ categories_orig={
725
+ "__label__en": Languages.ENGLISH,
726
+ "__label__ru": Languages.RUSSIAN,
727
+ "__label__de": Languages.GERMAN,
728
+ "__label__fr": Languages.FRENCH,
729
+ "__label__it": Languages.ITALIAN,
730
+ "__label__ja": Languages.JAPANESE,
731
+ "__label__es": Languages.SPANISH,
732
+ "__label__ceb": Languages.CEBUANO,
733
+ "__label__tr": Languages.TURKISH,
734
+ "__label__pt": Languages.PORTUGUESE,
735
+ "__label__uk": Languages.UKRAINIAN,
736
+ "__label__eo": Languages.ESPERANTO,
737
+ "__label__pl": Languages.POLISH,
738
+ "__label__sv": Languages.SWEDISH,
739
+ "__label__nl": Languages.DUTCH,
740
+ "__label__he": Languages.HEBREW,
741
+ "__label__zh": Languages.CHINESE,
742
+ "__label__hu": Languages.HUNGARIAN,
743
+ "__label__ar": Languages.ARABIC,
744
+ "__label__ca": Languages.CATALAN,
745
+ "__label__fi": Languages.FINNISH,
746
+ "__label__cs": Languages.CZECH,
747
+ "__label__fa": Languages.PERSIAN,
748
+ "__label__sr": Languages.SERBIAN,
749
+ "__label__el": Languages.GREEK,
750
+ "__label__vi": Languages.VIETNAMESE,
751
+ "__label__bg": Languages.BULGARIAN,
752
+ "__label__ko": Languages.KOREAN,
753
+ "__label__no": Languages.NORWEGIAN,
754
+ "__label__mk": Languages.MACEDONIAN,
755
+ "__label__ro": Languages.ROMANIAN,
756
+ "__label__id": Languages.INDONESIAN,
757
+ "__label__th": Languages.THAI,
758
+ "__label__hy": Languages.ARMENIAN,
759
+ "__label__da": Languages.DANISH,
760
+ "__label__ta": Languages.TAMIL,
761
+ "__label__hi": Languages.HINDI,
762
+ "__label__hr": Languages.CROATIAN,
763
+ "__label__sh": Languages.NOT_DEFINED,
764
+ "__label__be": Languages.BELARUSIAN,
765
+ "__label__ka": Languages.GEORGIAN,
766
+ "__label__te": Languages.TELUGU,
767
+ "__label__kk": Languages.KAZAKH,
768
+ "__label__war": Languages.WARAY,
769
+ "__label__lt": Languages.LITHUANIAN,
770
+ "__label__gl": Languages.SCOTTISH,
771
+ "__label__sk": Languages.SLOVAK,
772
+ "__label__bn": Languages.BENIN,
773
+ "__label__eu": Languages.BASQUE,
774
+ "__label__sl": Languages.SLOVENIAN,
775
+ "__label__kn": Languages.NOT_DEFINED,
776
+ "__label__ml": Languages.MALAYALAM,
777
+ "__label__mr": Languages.MARATHI,
778
+ "__label__et": Languages.ESTONIAN,
779
+ "__label__az": Languages.AZERBAIJANI,
780
+ "__label__ms": Languages.NOT_DEFINED,
781
+ "__label__sq": Languages.ALBANIAN,
782
+ "__label__la": Languages.LATIN,
783
+ "__label__bs": Languages.BOSNIAN,
784
+ "__label__nn": Languages.NORWEGIAN_NOVOSIBIRSK,
785
+ "__label__ur": Languages.URDU,
786
+ "__label__lv": Languages.NOT_DEFINED,
787
+ "__label__my": Languages.NOT_DEFINED,
788
+ "__label__tt": Languages.NOT_DEFINED,
789
+ "__label__af": Languages.NOT_DEFINED,
790
+ "__label__oc": Languages.NOT_DEFINED,
791
+ "__label__nds": Languages.NOT_DEFINED,
792
+ "__label__ky": Languages.NOT_DEFINED,
793
+ "__label__ast": Languages.NOT_DEFINED,
794
+ "__label__tl": Languages.NOT_DEFINED,
795
+ "__label__is": Languages.NOT_DEFINED,
796
+ "__label__ia": Languages.NOT_DEFINED,
797
+ "__label__si": Languages.NOT_DEFINED,
798
+ "__label__gu": Languages.NOT_DEFINED,
799
+ "__label__km": Languages.NOT_DEFINED,
800
+ "__label__br": Languages.NOT_DEFINED,
801
+ "__label__ba": Languages.NOT_DEFINED,
802
+ "__label__uz": Languages.NOT_DEFINED,
803
+ "__label__bo": Languages.NOT_DEFINED,
804
+ "__label__pa": Languages.NOT_DEFINED,
805
+ "__label__vo": Languages.NOT_DEFINED,
806
+ "__label__als": Languages.NOT_DEFINED,
807
+ "__label__ne": Languages.NOT_DEFINED,
808
+ "__label__cy": Languages.NOT_DEFINED,
809
+ "__label__jbo": Languages.NOT_DEFINED,
810
+ "__label__fy": Languages.NOT_DEFINED,
811
+ "__label__mn": Languages.NOT_DEFINED,
812
+ "__label__lb": Languages.NOT_DEFINED,
813
+ "__label__ce": Languages.NOT_DEFINED,
814
+ "__label__ug": Languages.NOT_DEFINED,
815
+ "__label__tg": Languages.NOT_DEFINED,
816
+ "__label__sco": Languages.NOT_DEFINED,
817
+ "__label__sa": Languages.NOT_DEFINED,
818
+ "__label__cv": Languages.NOT_DEFINED,
819
+ "__label__jv": Languages.NOT_DEFINED,
820
+ "__label__min": Languages.NOT_DEFINED,
821
+ "__label__io": Languages.NOT_DEFINED,
822
+ "__label__or": Languages.NOT_DEFINED,
823
+ "__label__as": Languages.NOT_DEFINED,
824
+ "__label__new": Languages.NOT_DEFINED,
825
+ "__label__ga": Languages.NOT_DEFINED,
826
+ "__label__mg": Languages.NOT_DEFINED,
827
+ "__label__an": Languages.NOT_DEFINED,
828
+ "__label__ckb": Languages.NOT_DEFINED,
829
+ "__label__sw": Languages.NOT_DEFINED,
830
+ "__label__bar": Languages.NOT_DEFINED,
831
+ "__label__lmo": Languages.NOT_DEFINED,
832
+ "__label__yi": Languages.NOT_DEFINED,
833
+ "__label__arz": Languages.NOT_DEFINED,
834
+ "__label__mhr": Languages.NOT_DEFINED,
835
+ "__label__azb": Languages.NOT_DEFINED,
836
+ "__label__sah": Languages.NOT_DEFINED,
837
+ "__label__pnb": Languages.NOT_DEFINED,
838
+ "__label__su": Languages.NOT_DEFINED,
839
+ "__label__bpy": Languages.NOT_DEFINED,
840
+ "__label__pms": Languages.NOT_DEFINED,
841
+ "__label__ilo": Languages.NOT_DEFINED,
842
+ "__label__wuu": Languages.NOT_DEFINED,
843
+ "__label__ku": Languages.NOT_DEFINED,
844
+ "__label__ps": Languages.NOT_DEFINED,
845
+ "__label__ie": Languages.NOT_DEFINED,
846
+ "__label__xmf": Languages.NOT_DEFINED,
847
+ "__label__yue": Languages.NOT_DEFINED,
848
+ "__label__gom": Languages.NOT_DEFINED,
849
+ "__label__li": Languages.NOT_DEFINED,
850
+ "__label__mwl": Languages.NOT_DEFINED,
851
+ "__label__kw": Languages.NOT_DEFINED,
852
+ "__label__sd": Languages.NOT_DEFINED,
853
+ "__label__hsb": Languages.NOT_DEFINED,
854
+ "__label__scn": Languages.NOT_DEFINED,
855
+ "__label__gd": Languages.NOT_DEFINED,
856
+ "__label__pam": Languages.NOT_DEFINED,
857
+ "__label__bh": Languages.NOT_DEFINED,
858
+ "__label__mai": Languages.NOT_DEFINED,
859
+ "__label__vec": Languages.NOT_DEFINED,
860
+ "__label__mt": Languages.NOT_DEFINED,
861
+ "__label__dv": Languages.NOT_DEFINED,
862
+ "__label__wa": Languages.NOT_DEFINED,
863
+ "__label__mzn": Languages.NOT_DEFINED,
864
+ "__label__am": Languages.NOT_DEFINED,
865
+ "__label__qu": Languages.NOT_DEFINED,
866
+ "__label__eml": Languages.NOT_DEFINED,
867
+ "__label__cbk": Languages.NOT_DEFINED,
868
+ "__label__tk": Languages.NOT_DEFINED,
869
+ "__label__rm": Languages.NOT_DEFINED,
870
+ "__label__os": Languages.NOT_DEFINED,
871
+ "__label__vls": Languages.NOT_DEFINED,
872
+ "__label__yo": Languages.NOT_DEFINED,
873
+ "__label__lo": Languages.NOT_DEFINED,
874
+ "__label__lez": Languages.NOT_DEFINED,
875
+ "__label__so": Languages.NOT_DEFINED,
876
+ "__label__myv": Languages.NOT_DEFINED,
877
+ "__label__diq": Languages.NOT_DEFINED,
878
+ "__label__mrj": Languages.NOT_DEFINED,
879
+ "__label__dsb": Languages.NOT_DEFINED,
880
+ "__label__frr": Languages.NOT_DEFINED,
881
+ "__label__ht": Languages.NOT_DEFINED,
882
+ "__label__gn": Languages.NOT_DEFINED,
883
+ "__label__bxr": Languages.NOT_DEFINED,
884
+ "__label__kv": Languages.NOT_DEFINED,
885
+ "__label__sc": Languages.NOT_DEFINED,
886
+ "__label__nah": Languages.NOT_DEFINED,
887
+ "__label__krc": Languages.NOT_DEFINED,
888
+ "__label__bcl": Languages.NOT_DEFINED,
889
+ "__label__nap": Languages.NOT_DEFINED,
890
+ "__label__gv": Languages.NOT_DEFINED,
891
+ "__label__av": Languages.NOT_DEFINED,
892
+ "__label__rue": Languages.NOT_DEFINED,
893
+ "__label__xal": Languages.NOT_DEFINED,
894
+ "__label__pfl": Languages.NOT_DEFINED,
895
+ "__label__dty": Languages.NOT_DEFINED,
896
+ "__label__hif": Languages.NOT_DEFINED,
897
+ "__label__co": Languages.NOT_DEFINED,
898
+ "__label__lrc": Languages.NOT_DEFINED,
899
+ "__label__vep": Languages.NOT_DEFINED,
900
+ "__label__tyv": Languages.NOT_DEFINED,
727
901
  },
728
902
  model_wrapper="FasttextLangDetector",
729
903
  ),
730
904
  }
731
905
 
732
906
  @staticmethod
733
- def get_full_path_weights(name: str) -> str:
907
+ def get_full_path_weights(name: PathLikeOrStr) -> PathLikeOrStr:
734
908
  """
735
909
  Returns the absolute path of weights.
736
910
 
@@ -741,7 +915,7 @@ class ModelCatalog:
741
915
  :return: absolute weight path
742
916
  """
743
917
  try:
744
- profile = ModelCatalog.get_profile(name)
918
+ profile = ModelCatalog.get_profile(os.fspath(name))
745
919
  except KeyError:
746
920
  logger.info(
747
921
  LoggingRecord(
@@ -761,7 +935,7 @@ class ModelCatalog:
761
935
  return os.path.join(get_weights_dir_path(), name)
762
936
 
763
937
  @staticmethod
764
- def get_full_path_configs(name: str) -> str:
938
+ def get_full_path_configs(name: PathLikeOrStr) -> PathLikeOrStr:
765
939
  """
766
940
  Return the absolute path of configs for some given weights. Alternatively, pass last a path to a config file
767
941
  (without the base path to the cache config directory).
@@ -773,7 +947,7 @@ class ModelCatalog:
773
947
  :return: absolute path to the config
774
948
  """
775
949
  try:
776
- profile = ModelCatalog.get_profile(name)
950
+ profile = ModelCatalog.get_profile(os.fspath(name))
777
951
  except KeyError:
778
952
  logger.info(
779
953
  LoggingRecord(
@@ -787,7 +961,7 @@ class ModelCatalog:
787
961
  return os.path.join(get_configs_dir_path(), name)
788
962
 
789
963
  @staticmethod
790
- def get_full_path_preprocessor_configs(name: str) -> str:
964
+ def get_full_path_preprocessor_configs(name: Union[str]) -> PathLikeOrStr:
791
965
  """
792
966
  Return the absolute path of preprocessor configs for some given weights. Preprocessor are occasionally provided
793
967
  by the transformer library.
@@ -811,21 +985,21 @@ class ModelCatalog:
811
985
  return os.path.join(get_configs_dir_path(), name)
812
986
 
813
987
  @staticmethod
814
- def get_model_list() -> List[str]:
988
+ def get_model_list() -> list[PathLikeOrStr]:
815
989
  """
816
990
  Returns a list of absolute paths of registered models.
817
991
  """
818
992
  return [os.path.join(get_weights_dir_path(), profile.name) for profile in ModelCatalog.CATALOG.values()]
819
993
 
820
994
  @staticmethod
821
- def get_profile_list() -> List[str]:
995
+ def get_profile_list() -> list[str]:
822
996
  """
823
997
  Returns a list profile keys.
824
998
  """
825
999
  return list(ModelCatalog.CATALOG.keys())
826
1000
 
827
1001
  @staticmethod
828
- def is_registered(path_weights: str) -> bool:
1002
+ def is_registered(path_weights: PathLikeOrStr) -> bool:
829
1003
  """
830
1004
  Checks if some weights belong to a registered model
831
1005
 
@@ -849,8 +1023,8 @@ class ModelCatalog:
849
1023
 
850
1024
  profile = ModelCatalog.CATALOG.get(name)
851
1025
  if profile is not None:
852
- return copy(profile)
853
- raise KeyError("Model Profile does not exist. Please make sure the model is registered")
1026
+ return profile
1027
+ raise KeyError(f"Model Profile {name} does not exist. Please make sure the model is registered")
854
1028
 
855
1029
  @staticmethod
856
1030
  def register(name: str, profile: ModelProfile) -> None:
@@ -866,7 +1040,7 @@ class ModelCatalog:
866
1040
  ModelCatalog.CATALOG[name] = profile
867
1041
 
868
1042
  @staticmethod
869
- def load_profiles_from_file(path: Optional[str] = None) -> None:
1043
+ def load_profiles_from_file(path: Optional[PathLikeOrStr] = None) -> None:
870
1044
  """
871
1045
  Load model profiles from a jsonl file and extend `CATALOG` with the new profiles.
872
1046
 
@@ -877,10 +1051,12 @@ class ModelCatalog:
877
1051
  with jsonlines.open(path) as reader:
878
1052
  for obj in reader:
879
1053
  if not obj["name"] in ModelCatalog.CATALOG:
1054
+ categories = obj.get("categories") or {}
1055
+ obj["categories"] = {int(key): get_type(val) for key, val in categories.items()}
880
1056
  ModelCatalog.register(obj["name"], ModelProfile(**obj))
881
1057
 
882
1058
  @staticmethod
883
- def save_profiles_to_file(target_path: str) -> None:
1059
+ def save_profiles_to_file(target_path: PathLikeOrStr) -> None:
884
1060
  """
885
1061
  Save model profiles to a jsonl file.
886
1062
 
@@ -896,7 +1072,7 @@ class ModelCatalog:
896
1072
  ModelCatalog.load_profiles_from_file(os.environ.get("MODEL_CATALOG", None))
897
1073
 
898
1074
 
899
- def get_tp_weight_names(name: str) -> List[str]:
1075
+ def get_tp_weight_names(name: str) -> list[str]:
900
1076
  """
901
1077
  Given a path to some model weights it will return all file names according to TP naming convention
902
1078
 
@@ -922,7 +1098,7 @@ def print_model_infos(add_description: bool = True, add_config: bool = True, add
922
1098
  num_columns = min(6, len(profiles))
923
1099
  infos = []
924
1100
  for profile in profiles:
925
- tbl_input: List[Union[Mapping[str, ObjectTypes], str]] = [profile.name]
1101
+ tbl_input: list[Union[Mapping[int, ObjectTypes], str]] = [profile.name]
926
1102
  if add_description:
927
1103
  tbl_input.append(profile.description)
928
1104
  if add_config:
@@ -957,7 +1133,7 @@ class ModelDownloadManager:
957
1133
  """
958
1134
 
959
1135
  @staticmethod
960
- def maybe_download_weights_and_configs(name: str) -> str:
1136
+ def maybe_download_weights_and_configs(name: str) -> PathLikeOrStr:
961
1137
  """
962
1138
  Check if some model is registered. If yes, it will check if their weights
963
1139
  must be downloaded. Only weights that have not the same expected size will be downloaded again.
@@ -967,7 +1143,7 @@ class ModelDownloadManager:
967
1143
  """
968
1144
 
969
1145
  absolute_path_weights = ModelCatalog.get_full_path_weights(name)
970
- file_names: List[str] = []
1146
+ file_names: list[str] = []
971
1147
  if ModelCatalog.is_registered(name):
972
1148
  profile = ModelCatalog.get_profile(name)
973
1149
  # there is nothing to download if hf_repo_id or urls is not provided
@@ -1000,7 +1176,7 @@ class ModelDownloadManager:
1000
1176
  return absolute_path_weights
1001
1177
 
1002
1178
  @staticmethod
1003
- def load_model_from_hf_hub(profile: ModelProfile, absolute_path: str, file_names: List[str]) -> None:
1179
+ def load_model_from_hf_hub(profile: ModelProfile, absolute_path: PathLikeOrStr, file_names: list[str]) -> None:
1004
1180
  """
1005
1181
  Load a model from the Huggingface hub for a given profile and saves the model at the directory of the given
1006
1182
  path.
@@ -1026,7 +1202,7 @@ class ModelDownloadManager:
1026
1202
  )
1027
1203
 
1028
1204
  @staticmethod
1029
- def _load_from_gd(profile: ModelProfile, absolute_path: str, file_names: List[str]) -> None:
1205
+ def _load_from_gd(profile: ModelProfile, absolute_path: PathLikeOrStr, file_names: list[str]) -> None:
1030
1206
  if profile.urls is None:
1031
1207
  raise ValueError("urls cannot be None")
1032
1208
  for size, url, file_name in zip(profile.size, profile.urls, file_names):
@@ -1034,7 +1210,7 @@ class ModelDownloadManager:
1034
1210
  download(str(url), directory, file_name, int(size))
1035
1211
 
1036
1212
  @staticmethod
1037
- def load_configs_from_hf_hub(profile: ModelProfile, absolute_path: str) -> None:
1213
+ def load_configs_from_hf_hub(profile: ModelProfile, absolute_path: PathLikeOrStr) -> None:
1038
1214
  """
1039
1215
  Load config file(s) from the Huggingface hub for a given profile and saves the model at the directory of the
1040
1216
  given path.
@@ -1053,9 +1229,11 @@ class ModelDownloadManager:
1053
1229
  ModelDownloadManager._load_from_hf_hub(repo_id, file_name, directory)
1054
1230
 
1055
1231
  @staticmethod
1056
- def _load_from_hf_hub(repo_id: str, file_name: str, cache_directory: str, force_download: bool = False) -> int:
1232
+ def _load_from_hf_hub(
1233
+ repo_id: str, file_name: str, cache_directory: PathLikeOrStr, force_download: bool = False
1234
+ ) -> int:
1057
1235
  url = hf_hub_url(repo_id=repo_id, filename=file_name)
1058
- token = os.environ.get("HF_CREDENTIALS")
1236
+ token = os.environ.get("HF_CREDENTIALS", None)
1059
1237
  f_path = cached_download(
1060
1238
  url,
1061
1239
  cache_dir=cache_directory,