mteb 2.7.16__py3-none-any.whl → 2.7.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. mteb/_create_dataloaders.py +16 -16
  2. mteb/_evaluators/any_sts_evaluator.py +1 -1
  3. mteb/_evaluators/classification_metrics.py +10 -1
  4. mteb/_evaluators/clustering_evaluator.py +1 -1
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +2 -2
  6. mteb/_evaluators/pair_classification_evaluator.py +3 -2
  7. mteb/_evaluators/retrieval_evaluator.py +1 -1
  8. mteb/_evaluators/retrieval_metrics.py +9 -7
  9. mteb/_evaluators/sklearn_evaluator.py +13 -6
  10. mteb/_evaluators/text/bitext_mining_evaluator.py +1 -1
  11. mteb/_evaluators/text/summarization_evaluator.py +1 -1
  12. mteb/_evaluators/zeroshot_classification_evaluator.py +1 -1
  13. mteb/abstasks/_stratification.py +13 -8
  14. mteb/abstasks/abstask.py +4 -4
  15. mteb/abstasks/classification.py +6 -4
  16. mteb/abstasks/clustering.py +1 -1
  17. mteb/abstasks/clustering_legacy.py +1 -1
  18. mteb/abstasks/image/image_text_pair_classification.py +1 -1
  19. mteb/abstasks/multilabel_classification.py +7 -5
  20. mteb/abstasks/pair_classification.py +1 -1
  21. mteb/abstasks/regression.py +3 -2
  22. mteb/abstasks/retrieval.py +8 -5
  23. mteb/abstasks/retrieval_dataset_loaders.py +27 -8
  24. mteb/abstasks/sts.py +1 -1
  25. mteb/abstasks/text/bitext_mining.py +2 -2
  26. mteb/abstasks/text/reranking.py +1 -1
  27. mteb/abstasks/text/summarization.py +1 -1
  28. mteb/abstasks/zeroshot_classification.py +1 -1
  29. mteb/benchmarks/benchmark.py +131 -3
  30. mteb/evaluate.py +2 -2
  31. mteb/leaderboard/figures.py +2 -1
  32. mteb/leaderboard/table.py +10 -2
  33. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -3
  34. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +3 -3
  35. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +8 -3
  36. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  37. mteb/models/model_implementations/bedrock_models.py +4 -4
  38. mteb/models/model_implementations/bm25.py +2 -2
  39. mteb/models/model_implementations/mcinext_models.py +2 -2
  40. mteb/models/model_implementations/openai_models.py +2 -1
  41. mteb/models/model_implementations/pylate_models.py +4 -4
  42. mteb/models/model_implementations/random_baseline.py +4 -3
  43. mteb/models/model_implementations/seed_models.py +7 -2
  44. mteb/models/model_implementations/voyage_models.py +1 -1
  45. mteb/models/models_protocols.py +2 -2
  46. mteb/models/search_wrappers.py +4 -4
  47. mteb/tasks/bitext_mining/multilingual/bible_nlp_bitext_mining.py +1 -1
  48. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  49. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  50. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  51. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  52. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +1 -1
  53. mteb/tasks/classification/ben/bengali_document_classification.py +2 -2
  54. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +2 -2
  55. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -1
  56. mteb/tasks/classification/multilingual/hin_dialect_classification.py +1 -1
  57. mteb/tasks/classification/multilingual/indic_lang_classification.py +1 -1
  58. mteb/tasks/classification/multilingual/indic_sentiment_classification.py +1 -1
  59. mteb/tasks/classification/multilingual/language_classification.py +1 -1
  60. mteb/tasks/classification/multilingual/south_african_lang_classification.py +1 -1
  61. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  62. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +2 -2
  63. mteb/tasks/classification/swa/swahili_news_classification.py +2 -2
  64. mteb/tasks/clustering/deu/ten_k_gnad_clustering_p2p.py +1 -1
  65. mteb/tasks/clustering/deu/ten_k_gnad_clustering_s2s.py +1 -1
  66. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  67. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  68. mteb/tasks/clustering/nob/vg_hierarchical_clustering.py +2 -2
  69. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  70. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  71. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  72. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  73. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +1 -1
  74. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +1 -1
  75. mteb/tasks/pair_classification/multilingual/pub_chem_wiki_pair_classification.py +1 -1
  76. mteb/tasks/pair_classification/multilingual/rte3.py +1 -1
  77. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  78. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  79. mteb/tasks/retrieval/code/code_rag.py +8 -8
  80. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  81. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  82. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  83. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  84. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  85. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  86. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  87. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
  88. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  89. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  90. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  91. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  92. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  93. mteb/tasks/retrieval/eng/bright_retrieval.py +1 -1
  94. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  95. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  96. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  97. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  98. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  99. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  100. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  101. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  102. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  103. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  104. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  105. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  106. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  107. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  108. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  109. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  110. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  111. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  112. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  113. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  114. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  115. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  116. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  117. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  118. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  119. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  120. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  121. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  122. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  123. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  124. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  125. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  126. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  127. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  128. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  129. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  130. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  131. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  132. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  133. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  134. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  135. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  136. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  137. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  138. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  139. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -2
  140. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  141. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  142. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +5 -5
  143. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +1 -0
  144. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  145. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  146. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  147. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  148. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  149. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  150. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  151. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  152. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  153. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  154. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  155. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  156. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  157. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  158. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  159. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  160. mteb/tasks/retrieval/nob/norquad.py +2 -2
  161. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  162. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  163. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  164. mteb/tasks/sts/multilingual/sem_rel24_sts.py +1 -1
  165. mteb/tasks/sts/multilingual/sts_benchmark_multilingual_sts.py +1 -1
  166. mteb/tasks/sts/por/assin2_sts.py +1 -1
  167. mteb/types/_encoder_io.py +3 -2
  168. {mteb-2.7.16.dist-info → mteb-2.7.18.dist-info}/METADATA +1 -1
  169. {mteb-2.7.16.dist-info → mteb-2.7.18.dist-info}/RECORD +173 -173
  170. {mteb-2.7.16.dist-info → mteb-2.7.18.dist-info}/WHEEL +0 -0
  171. {mteb-2.7.16.dist-info → mteb-2.7.18.dist-info}/entry_points.txt +0 -0
  172. {mteb-2.7.16.dist-info → mteb-2.7.18.dist-info}/licenses/LICENSE +0 -0
  173. {mteb-2.7.16.dist-info → mteb-2.7.18.dist-info}/top_level.txt +0 -0
@@ -150,7 +150,7 @@ class VoyageModel(AbsEncoder):
150
150
  sentences: list[str],
151
151
  batch_size: int,
152
152
  input_type: Literal["query", "document"],
153
- ) -> np.ndarray:
153
+ ) -> Array:
154
154
  embeddings, index = [], 0
155
155
 
156
156
  output_dtype = VOYAGE_DTYPE_TRANSLATION.get(
@@ -32,7 +32,7 @@ class SearchProtocol(Protocol):
32
32
  hf_split: str,
33
33
  hf_subset: str,
34
34
  encode_kwargs: EncodeKwargs,
35
- num_proc: int,
35
+ num_proc: int | None,
36
36
  ) -> None:
37
37
  """Index the corpus for retrieval.
38
38
 
@@ -56,7 +56,7 @@ class SearchProtocol(Protocol):
56
56
  top_k: int,
57
57
  encode_kwargs: EncodeKwargs,
58
58
  top_ranked: TopRankedDocumentsType | None = None,
59
- num_proc: int,
59
+ num_proc: int | None,
60
60
  ) -> RetrievalOutputType:
61
61
  """Search the corpus using the given queries.
62
62
 
@@ -59,7 +59,7 @@ class SearchEncoderWrapper:
59
59
  hf_split: str,
60
60
  hf_subset: str,
61
61
  encode_kwargs: EncodeKwargs,
62
- num_proc: int = 1,
62
+ num_proc: int | None = None,
63
63
  ) -> None:
64
64
  """Index the corpus for retrieval.
65
65
 
@@ -101,7 +101,7 @@ class SearchEncoderWrapper:
101
101
  top_k: int,
102
102
  encode_kwargs: EncodeKwargs,
103
103
  top_ranked: TopRankedDocumentsType | None = None,
104
- num_proc: int = 1,
104
+ num_proc: int | None = None,
105
105
  ) -> RetrievalOutputType:
106
106
  """Search the corpus for the given queries.
107
107
 
@@ -485,7 +485,7 @@ class SearchCrossEncoderWrapper:
485
485
  hf_split: str,
486
486
  hf_subset: str,
487
487
  encode_kwargs: EncodeKwargs,
488
- num_proc: int = 1,
488
+ num_proc: int | None = None,
489
489
  ) -> None:
490
490
  """Index the corpus for retrieval.
491
491
 
@@ -509,7 +509,7 @@ class SearchCrossEncoderWrapper:
509
509
  top_k: int,
510
510
  encode_kwargs: EncodeKwargs,
511
511
  top_ranked: TopRankedDocumentsType | None = None,
512
- num_proc: int = 1,
512
+ num_proc: int | None = None,
513
513
  ) -> RetrievalOutputType:
514
514
  """Search the corpus using the given queries.
515
515
 
@@ -914,7 +914,7 @@ class BibleNLPBitextMining(AbsTaskBitextMining):
914
914
  self.dataset_transform()
915
915
  self.data_loaded = True
916
916
 
917
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
917
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
918
918
  # Convert to standard format
919
919
  for lang in self.hf_subsets:
920
920
  l1, l2 = (l.split("_")[0] for l in lang.split("-"))
@@ -265,7 +265,7 @@ class FloresBitextMining(AbsTaskBitextMining):
265
265
  """,
266
266
  )
267
267
 
268
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
268
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
269
269
  if self.data_loaded:
270
270
  return
271
271
 
@@ -99,7 +99,7 @@ class IN22ConvBitextMining(AbsTaskBitextMining):
99
99
  """,
100
100
  )
101
101
 
102
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
102
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
103
103
  if self.data_loaded:
104
104
  return
105
105
 
@@ -93,7 +93,7 @@ class IN22GenBitextMining(AbsTaskBitextMining):
93
93
  """,
94
94
  )
95
95
 
96
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
96
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
97
97
  if self.data_loaded:
98
98
  return
99
99
 
@@ -280,7 +280,7 @@ class NTREXBitextMining(AbsTaskBitextMining):
280
280
  """,
281
281
  )
282
282
 
283
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
283
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
284
284
  if self.data_loaded:
285
285
  return
286
286
 
@@ -32,7 +32,7 @@ class RomaTalesBitextMining(AbsTaskBitextMining):
32
32
  bibtex_citation="",
33
33
  )
34
34
 
35
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
35
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
36
36
  """Load dataset from HuggingFace hub and convert it to the standard format."""
37
37
  if self.data_loaded:
38
38
  return
@@ -43,7 +43,7 @@ Islam, Tanvir},
43
43
  superseded_by="BengaliDocumentClassification.v2",
44
44
  )
45
45
 
46
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
46
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
47
47
  self.dataset = self.dataset.rename_columns(
48
48
  {"article": "text", "category": "label"}
49
49
  )
@@ -92,7 +92,7 @@ Islam, Tanvir},
92
92
  """,
93
93
  )
94
94
 
95
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
95
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
96
96
  self.dataset = self.stratified_subsampling(
97
97
  self.dataset, seed=self.seed, splits=["test"]
98
98
  )
@@ -46,7 +46,7 @@ Montoyo, Andres},
46
46
  )
47
47
  samples_per_label = 16
48
48
 
49
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
49
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
50
50
  self.dataset = self.dataset.rename_columns(
51
51
  {"comment": "text", "rating_str": "label"}
52
52
  )
@@ -99,7 +99,7 @@ Montoyo, Andres},
99
99
  )
100
100
  samples_per_label = 16
101
101
 
102
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
102
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
103
103
  self.dataset = self.stratified_subsampling(
104
104
  self.dataset, seed=self.seed, splits=["test"]
105
105
  )
@@ -46,7 +46,7 @@ Montoyo, Andres},
46
46
  )
47
47
  samples_per_label = 16
48
48
 
49
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
49
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
50
50
  self.dataset = self.dataset.rename_columns(
51
51
  {"comment": "text", "sentiment_int": "label"}
52
52
  )
@@ -60,7 +60,7 @@ class HinDialectClassification(AbsTaskClassification):
60
60
  """,
61
61
  )
62
62
 
63
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
63
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
64
64
  self.dataset = self.dataset.rename_columns(
65
65
  {"folksong": "text", "language": "label"}
66
66
  )
@@ -137,6 +137,6 @@ Okazaki, Naoaki},
137
137
  self.dataset_transform()
138
138
  self.data_loaded = True
139
139
 
140
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
140
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
141
141
  self.dataset = self.dataset.remove_columns(["language", "script"])
142
142
  self.dataset = self.dataset.rename_columns({"native sentence": "text"})
@@ -52,7 +52,7 @@ class IndicSentimentClassification(AbsTaskClassification):
52
52
  """,
53
53
  )
54
54
 
55
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
55
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
56
56
  label_map = {"Negative": 0, "Positive": 1}
57
57
  # Convert to standard format
58
58
  for lang in self.hf_subsets:
@@ -66,7 +66,7 @@ in Natural Language Processing},
66
66
  """,
67
67
  )
68
68
 
69
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
69
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
70
70
  self.dataset = self.dataset.rename_columns({"labels": "label"})
71
71
  self.dataset = self.stratified_subsampling(
72
72
  self.dataset, seed=self.seed, splits=["test"]
@@ -49,7 +49,7 @@ class SouthAfricanLangClassification(AbsTaskClassification):
49
49
  """,
50
50
  )
51
51
 
52
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
52
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
53
53
  self.dataset = self.dataset.rename_columns(
54
54
  {" text": "text", "lang_id": "label"}
55
55
  )
@@ -49,7 +49,7 @@ class TurkicClassification(AbsTaskClassification):
49
49
  )
50
50
  return dataset_lang["train"]
51
51
 
52
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
52
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
53
53
  """Load dataset from HuggingFace hub"""
54
54
  if self.data_loaded:
55
55
  return
@@ -35,7 +35,7 @@ class SlovakMovieReviewSentimentClassification(AbsTaskClassification):
35
35
  superseded_by="SlovakMovieReviewSentimentClassification.v2",
36
36
  )
37
37
 
38
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
38
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
39
39
  self.dataset = self.dataset.rename_columns({"comment": "text"})
40
40
 
41
41
  self.dataset = self.stratified_subsampling(
@@ -76,7 +76,7 @@ class SlovakMovieReviewSentimentClassificationV2(AbsTaskClassification):
76
76
  adapted_from=["SlovakMovieReviewSentimentClassification"],
77
77
  )
78
78
 
79
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
79
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
80
80
  self.dataset = self.stratified_subsampling(
81
81
  self.dataset, seed=self.seed, splits=["test"]
82
82
  )
@@ -37,7 +37,7 @@ class SwahiliNewsClassification(AbsTaskClassification):
37
37
  superseded_by="SwahiliNewsClassification.v2",
38
38
  )
39
39
 
40
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
40
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
41
41
  self.dataset = self.dataset.rename_columns(
42
42
  {"content": "text", "category": "label"}
43
43
  )
@@ -81,7 +81,7 @@ class SwahiliNewsClassificationV2(AbsTaskClassification):
81
81
  adapted_from=["SwahiliNewsClassification"],
82
82
  )
83
83
 
84
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
84
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
85
85
  self.dataset = self.stratified_subsampling(
86
86
  self.dataset, seed=self.seed, splits=["train"]
87
87
  )
@@ -63,7 +63,7 @@ class TenKGnadClusteringP2PFast(AbsTaskClustering):
63
63
  adapted_from=["TenKGnadClusteringP2P"],
64
64
  )
65
65
 
66
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
66
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
67
67
  ds = _convert_to_fast(
68
68
  self.dataset, self.input_column_name, self.label_column_name, self.seed
69
69
  )
@@ -63,7 +63,7 @@ class TenKGnadClusteringS2SFast(AbsTaskClustering):
63
63
  adapted_from=["TenKGnadClusteringS2S"],
64
64
  )
65
65
 
66
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
66
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
67
67
  ds = _convert_to_fast(
68
68
  self.dataset, self.input_column_name, self.label_column_name, self.seed
69
69
  )
@@ -51,7 +51,7 @@ class MLSUMClusteringP2P(AbsTaskClusteringLegacy):
51
51
  superseded_by="MLSUMClusteringP2P.v2",
52
52
  )
53
53
 
54
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
54
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
55
55
  """Load dataset from HuggingFace hub and convert it to the standard format."""
56
56
  if self.data_loaded:
57
57
  return
@@ -124,7 +124,7 @@ class MLSUMClusteringP2PFast(AbsTaskClustering):
124
124
  adapted_from=["MLSUMClusteringP2P"],
125
125
  )
126
126
 
127
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
127
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
128
128
  """Load dataset from HuggingFace hub and convert it to the standard format."""
129
129
  if self.data_loaded:
130
130
  return
@@ -51,7 +51,7 @@ class MLSUMClusteringS2S(AbsTaskClusteringLegacy):
51
51
  superseded_by="MLSUMClusteringS2S.v2",
52
52
  )
53
53
 
54
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
54
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
55
55
  """Load dataset from HuggingFace hub and convert it to the standard format."""
56
56
  if self.data_loaded:
57
57
  return
@@ -119,7 +119,7 @@ class MLSUMClusteringS2SFast(AbsTaskClustering):
119
119
  adapted_from=["MLSUMClusteringS2S"],
120
120
  )
121
121
 
122
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
122
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
123
123
  """Load dataset from HuggingFace hub and convert it to the standard format."""
124
124
  if self.data_loaded:
125
125
  return
@@ -45,7 +45,7 @@ class VGHierarchicalClusteringP2P(AbsTaskClustering):
45
45
  prompt="Identify the categories (e.g. sports) of given articles in Norwegian",
46
46
  )
47
47
 
48
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
48
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
49
49
  self.dataset = self.dataset.rename_columns(
50
50
  {"article": "sentences", "classes": "labels"}
51
51
  )
@@ -92,7 +92,7 @@ class VGHierarchicalClusteringS2S(AbsTaskClustering):
92
92
  prompt="Identify the categories (e.g. sports) of given articles in Norwegian",
93
93
  )
94
94
 
95
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
95
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
96
96
  self.dataset = self.dataset.rename_columns(
97
97
  {"ingress": "sentences", "classes": "labels"}
98
98
  )
@@ -53,7 +53,7 @@ class ImageCoDe(AbsTaskImageTextPairClassification):
53
53
  """,
54
54
  )
55
55
 
56
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
56
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
57
57
  if self.data_loaded:
58
58
  return
59
59
 
@@ -45,7 +45,7 @@ class SugarCrepe(AbsTaskImageTextPairClassification):
45
45
  """,
46
46
  )
47
47
 
48
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
48
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
49
49
  """Load dataset from HuggingFace hub"""
50
50
  if self.data_loaded:
51
51
  return
@@ -175,7 +175,7 @@ class mFollowIRCrossLingual(AbsTaskRetrieval): # noqa: N801
175
175
  """,
176
176
  )
177
177
 
178
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
178
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
179
179
  if self.data_loaded:
180
180
  return
181
181
 
@@ -243,7 +243,7 @@ class mFollowIR(AbsTaskRetrieval): # noqa: N801
243
243
  """,
244
244
  )
245
245
 
246
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
246
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
247
247
  if self.data_loaded:
248
248
  return
249
249
 
@@ -123,7 +123,7 @@ class CVBenchCount(AbsTaskRetrieval):
123
123
  """,
124
124
  )
125
125
 
126
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
126
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
127
127
  self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
128
128
  path=self.metadata.dataset["path"],
129
129
  splits=self.metadata.eval_splits,
@@ -165,7 +165,7 @@ class CVBenchRelation(AbsTaskRetrieval):
165
165
  """,
166
166
  )
167
167
 
168
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
168
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
169
169
  self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
170
170
  path=self.metadata.dataset["path"],
171
171
  splits=self.metadata.eval_splits,
@@ -207,7 +207,7 @@ class CVBenchDepth(AbsTaskRetrieval):
207
207
  """,
208
208
  )
209
209
 
210
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
210
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
211
211
  self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
212
212
  path=self.metadata.dataset["path"],
213
213
  splits=self.metadata.eval_splits,
@@ -249,7 +249,7 @@ class CVBenchDistance(AbsTaskRetrieval):
249
249
  """,
250
250
  )
251
251
 
252
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
252
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
253
253
  self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
254
254
  path=self.metadata.dataset["path"],
255
255
  splits=self.metadata.eval_splits,
@@ -66,7 +66,7 @@ Yih, Scott Wen-tau},
66
66
  },
67
67
  )
68
68
 
69
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
69
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
70
70
  labels = [
71
71
  "q2_label",
72
72
  "q3_label",
@@ -76,7 +76,7 @@ class PubChemSMILESPC(AbsTaskPairClassification):
76
76
  """,
77
77
  )
78
78
 
79
- def load_data(self, num_proc: int = 1, **kwargs: Any) -> None:
79
+ def load_data(self, num_proc: int | None = None, **kwargs: Any) -> None:
80
80
  if self.data_loaded:
81
81
  return
82
82
 
@@ -60,7 +60,7 @@ class PubChemWikiPairClassification(AbsTaskPairClassification):
60
60
  """,
61
61
  )
62
62
 
63
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
63
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
64
64
  _dataset = {}
65
65
  for lang in self.hf_subsets:
66
66
  _dataset[lang] = {}
@@ -52,7 +52,7 @@ Dolan, Bill},
52
52
  # sum of 4 languages after neutral filtering
53
53
  )
54
54
 
55
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
55
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
56
56
  """Load dataset from HuggingFace hub"""
57
57
  if self.data_loaded:
58
58
  return
@@ -37,7 +37,7 @@ class SadeemQuestionRetrieval(AbsTaskRetrieval):
37
37
  """,
38
38
  )
39
39
 
40
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
40
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
41
41
  if self.data_loaded:
42
42
  return
43
43
 
@@ -53,7 +53,7 @@ class CodeEditSearchRetrieval(AbsTaskRetrieval):
53
53
  """,
54
54
  )
55
55
 
56
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
56
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
57
57
  if self.data_loaded:
58
58
  return
59
59
 
@@ -51,7 +51,7 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
51
51
  **common_args,
52
52
  )
53
53
 
54
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
54
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
55
55
  """Load dataset from HuggingFace hub"""
56
56
  if self.data_loaded:
57
57
  return
@@ -59,7 +59,7 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
59
59
  self.dataset_transform()
60
60
  self.data_loaded = True
61
61
 
62
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
62
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
63
63
  """And transform to a retrieval dataset, which have the following attributes
64
64
 
65
65
  self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
@@ -108,7 +108,7 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
108
108
  **common_args,
109
109
  )
110
110
 
111
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
111
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
112
112
  """Load dataset from HuggingFace hub"""
113
113
  if self.data_loaded:
114
114
  return
@@ -116,7 +116,7 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
116
116
  self.dataset_transform()
117
117
  self.data_loaded = True
118
118
 
119
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
119
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
120
120
  """And transform to a retrieval dataset, which have the following attributes
121
121
 
122
122
  self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
@@ -168,7 +168,7 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
168
168
  **common_args,
169
169
  )
170
170
 
171
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
171
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
172
172
  """Load dataset from HuggingFace hub"""
173
173
  if self.data_loaded:
174
174
  return
@@ -176,7 +176,7 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
176
176
  self.dataset_transform()
177
177
  self.data_loaded = True
178
178
 
179
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
179
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
180
180
  """And transform to a retrieval dataset, which have the following attributes
181
181
 
182
182
  self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
@@ -225,7 +225,7 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval):
225
225
  **common_args,
226
226
  )
227
227
 
228
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
228
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
229
229
  """Load dataset from HuggingFace hub"""
230
230
  if self.data_loaded:
231
231
  return
@@ -233,7 +233,7 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval):
233
233
  self.dataset_transform()
234
234
  self.data_loaded = True
235
235
 
236
- def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
236
+ def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
237
237
  """And transform to a retrieval dataset, which have the following attributes
238
238
 
239
239
  self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
@@ -99,7 +99,7 @@ class CodeSearchNetCCRetrieval(AbsTaskRetrieval):
99
99
  """,
100
100
  )
101
101
 
102
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
102
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
103
103
  if self.data_loaded:
104
104
  return
105
105
 
@@ -97,7 +97,7 @@ class COIRCodeSearchNetRetrieval(AbsTaskRetrieval):
97
97
  """,
98
98
  )
99
99
 
100
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
100
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
101
101
  if self.data_loaded:
102
102
  return
103
103
 
@@ -34,7 +34,7 @@ class DS1000Retrieval(AbsTaskRetrieval):
34
34
  """,
35
35
  )
36
36
 
37
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
37
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
38
38
  if self.data_loaded:
39
39
  return
40
40
 
@@ -37,7 +37,7 @@ class FreshStackRetrieval(AbsTaskRetrieval):
37
37
  """,
38
38
  )
39
39
 
40
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
40
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
41
41
  if self.data_loaded:
42
42
  return
43
43
 
@@ -34,7 +34,7 @@ class HumanEvalRetrieval(AbsTaskRetrieval):
34
34
  }""",
35
35
  )
36
36
 
37
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
37
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
38
38
  if self.data_loaded:
39
39
  return
40
40
 
@@ -34,7 +34,7 @@ class MBPPRetrieval(AbsTaskRetrieval):
34
34
  """,
35
35
  )
36
36
 
37
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
37
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
38
38
  if self.data_loaded:
39
39
  return
40
40
 
@@ -36,7 +36,7 @@ class WikiSQLRetrieval(AbsTaskRetrieval):
36
36
  """,
37
37
  )
38
38
 
39
- def load_data(self, num_proc: int = 1, **kwargs) -> None:
39
+ def load_data(self, num_proc: int | None = None, **kwargs) -> None:
40
40
  if self.data_loaded:
41
41
  return
42
42