mteb 2.7.17__py3-none-any.whl → 2.7.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. mteb/_create_dataloaders.py +16 -16
  2. mteb/_evaluators/any_sts_evaluator.py +1 -1
  3. mteb/_evaluators/clustering_evaluator.py +1 -1
  4. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +2 -2
  5. mteb/_evaluators/pair_classification_evaluator.py +1 -1
  6. mteb/_evaluators/retrieval_evaluator.py +1 -1
  7. mteb/_evaluators/sklearn_evaluator.py +4 -2
  8. mteb/_evaluators/text/bitext_mining_evaluator.py +1 -1
  9. mteb/_evaluators/text/summarization_evaluator.py +1 -1
  10. mteb/_evaluators/zeroshot_classification_evaluator.py +1 -1
  11. mteb/abstasks/abstask.py +4 -4
  12. mteb/abstasks/classification.py +2 -2
  13. mteb/abstasks/clustering.py +1 -1
  14. mteb/abstasks/clustering_legacy.py +1 -1
  15. mteb/abstasks/image/image_text_pair_classification.py +1 -1
  16. mteb/abstasks/multilabel_classification.py +1 -1
  17. mteb/abstasks/pair_classification.py +1 -1
  18. mteb/abstasks/retrieval.py +8 -5
  19. mteb/abstasks/retrieval_dataset_loaders.py +27 -8
  20. mteb/abstasks/sts.py +1 -1
  21. mteb/abstasks/text/bitext_mining.py +2 -2
  22. mteb/abstasks/text/reranking.py +1 -1
  23. mteb/abstasks/text/summarization.py +1 -1
  24. mteb/abstasks/zeroshot_classification.py +1 -1
  25. mteb/evaluate.py +2 -2
  26. mteb/models/model_implementations/bm25.py +2 -2
  27. mteb/models/model_implementations/ict_time_and_querit_models.py +115 -0
  28. mteb/models/model_implementations/pylate_models.py +4 -4
  29. mteb/models/models_protocols.py +2 -2
  30. mteb/models/search_wrappers.py +4 -4
  31. mteb/tasks/bitext_mining/multilingual/bible_nlp_bitext_mining.py +1 -1
  32. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  33. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  34. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  35. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  36. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +1 -1
  37. mteb/tasks/classification/ben/bengali_document_classification.py +2 -2
  38. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +2 -2
  39. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -1
  40. mteb/tasks/classification/multilingual/hin_dialect_classification.py +1 -1
  41. mteb/tasks/classification/multilingual/indic_lang_classification.py +1 -1
  42. mteb/tasks/classification/multilingual/indic_sentiment_classification.py +1 -1
  43. mteb/tasks/classification/multilingual/language_classification.py +1 -1
  44. mteb/tasks/classification/multilingual/south_african_lang_classification.py +1 -1
  45. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  46. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +2 -2
  47. mteb/tasks/classification/swa/swahili_news_classification.py +2 -2
  48. mteb/tasks/clustering/deu/ten_k_gnad_clustering_p2p.py +1 -1
  49. mteb/tasks/clustering/deu/ten_k_gnad_clustering_s2s.py +1 -1
  50. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  51. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  52. mteb/tasks/clustering/nob/vg_hierarchical_clustering.py +2 -2
  53. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  54. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  55. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  56. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  57. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +1 -1
  58. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +1 -1
  59. mteb/tasks/pair_classification/multilingual/pub_chem_wiki_pair_classification.py +1 -1
  60. mteb/tasks/pair_classification/multilingual/rte3.py +1 -1
  61. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  62. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  63. mteb/tasks/retrieval/code/code_rag.py +8 -8
  64. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  65. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  66. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  67. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  68. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  69. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  70. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  71. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
  72. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  73. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  74. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  75. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  76. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  77. mteb/tasks/retrieval/eng/bright_retrieval.py +1 -1
  78. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  79. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  80. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  81. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  82. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  83. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  84. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  85. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  86. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  87. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  88. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  89. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  90. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  91. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  92. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  93. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  94. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  95. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  96. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  97. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  98. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  99. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  100. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  101. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  102. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  103. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  104. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  105. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  106. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  107. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  108. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  109. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  110. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  111. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  112. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  113. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  114. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  115. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  116. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  117. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  118. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  119. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  120. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  121. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  122. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  123. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -2
  124. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  125. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  126. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +5 -5
  127. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  128. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  129. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  130. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  131. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  132. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  133. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  134. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  135. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  136. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  137. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  138. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  139. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  140. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  141. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  142. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  143. mteb/tasks/retrieval/nob/norquad.py +2 -2
  144. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  145. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  146. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  147. mteb/tasks/sts/multilingual/sem_rel24_sts.py +1 -1
  148. mteb/tasks/sts/multilingual/sts_benchmark_multilingual_sts.py +1 -1
  149. mteb/tasks/sts/por/assin2_sts.py +1 -1
  150. mteb/types/_encoder_io.py +1 -1
  151. {mteb-2.7.17.dist-info → mteb-2.7.19.dist-info}/METADATA +1 -1
  152. {mteb-2.7.17.dist-info → mteb-2.7.19.dist-info}/RECORD +156 -155
  153. {mteb-2.7.17.dist-info → mteb-2.7.19.dist-info}/WHEEL +0 -0
  154. {mteb-2.7.17.dist-info → mteb-2.7.19.dist-info}/entry_points.txt +0 -0
  155. {mteb-2.7.17.dist-info → mteb-2.7.19.dist-info}/licenses/LICENSE +0 -0
  156. {mteb-2.7.17.dist-info → mteb-2.7.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,115 @@
+ from __future__ import annotations
+
+ from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
+ from mteb.models.model_meta import ModelMeta
+ from mteb.types import PromptType
+
+
+ def instruction_template(
+     instruction: str | dict, prompt_type: PromptType | None = None
+ ) -> str:
+     """Format instruction for the model."""
+     if isinstance(instruction, dict):
+         instruction = instruction.get(prompt_type.value if prompt_type else "", "")
+     elif prompt_type == PromptType.document:
+         return ""
+
+     if not instruction:
+         return ""
+     return f"Instruct: {instruction}\nQuery:"
+
+
+ multilingual_langs = [
+     "deu-Latn",
+     "ita-Latn",
+     "ara-Arab",
+     "fas-Arab",
+     "fra-Latn",
+     "hin-Deva",
+     "spa-Latn",
+     "zho-Hans",
+     "ben-Beng",
+     "eng-Latn",
+     "fin-Latn",
+     "ind-Latn",
+     "jpn-Jpan",
+     "kor-Hang",
+     "rus-Cyrl",
+     "swh-Latn",
+     "tel-Telu",
+     "tha-Thai",
+ ]
+
+ training_data = [
+     "FEVER",
+     "DuRetrieval",
+     "HotpotQA",
+     "MSMARCO",
+     "T2Retrieval",
+     "NQ",
+     "MIRACLRetrieval",
+     "MrTidyRetrieval",
+     "AmazonCounterfactualClassification",
+     "Banking77Classification",
+     "ImdbClassification",
+     "MTOPDomainClassification",
+     "ToxicConversationsClassification",
+     "TweetSentimentExtractionClassification",
+ ]
+
+ boom_4b_instructions = {
+     "AmazonCounterfactualClassification": "Classify a given Amazon customer review text as either counterfactual or not-counterfactual.",
+     "AmazonPolarityClassification": "Classify Amazon reviews into positive or negative sentiment.",
+     "AmazonReviewsClassification": "Classify the given Amazon review into its appropriate rating category.",
+     "Banking77Classification": "Given a online banking query, find the corresponding intents.",
+     "EmotionClassification": "Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise.",
+     "ImdbClassification": "Classify the sentiment expressed in the given movie review text from the IMDB dataset.",
+     "MassiveIntentClassification": "Given a user utterance as query, find the user intents.",
+     "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios.",
+     "MTOPDomainClassification": "Classify the intent domain of the given utterance in task-oriented conversation.",
+     "MTOPIntentClassification": "Classify the intent of the given utterance in task-oriented conversation.",
+     "ToxicConversationsClassification": "Classify the given comments as either toxic or not toxic.",
+     "TweetSentimentExtractionClassification": "Classify the sentiment of a given tweet as either positive, negative, or neutral.",
+     "TNews": "Classify the fine-grained category of the given news title.",
+     "ClimateFEVER": "Given a claim about climate change, retrieve documents that support or refute the claim.",
+     "ClimateFEVERHardNegatives": "Given a claim about climate change, retrieve documents that support or refute the claim.",
+     "DBPedia": "Given a query, retrieve relevant entity descriptions from DBPedia.",
+     "FEVER": "Given a claim, retrieve documents that support or refute the claim.",
+     "FEVERHardNegatives": "Given a claim, retrieve documents that support or refute the claim.",
+     "FiQA2018": "Given a financial question, retrieve user replies that best answer the question.",
+     "HotpotQA": "Given a multi-hop question, retrieve documents that can help answer the question.",
+     "HotpotQAHardNegatives": "Given a multi-hop question, retrieve documents that can help answer the question.",
+     "MSMARCO": "Given a web search query, retrieve relevant passages that answer the query.",
+     "NFCorpus": "Given a question, retrieve relevant documents that best answer the question.",
+     "NQ": "Given a question, retrieve Wikipedia passages that answer the question.",
+ }
+ # How the template actually renders each one at inference time:
+ # instruction_template(boom_4b_instructions["Banking77Classification"], PromptType.query)
+ # -> "Instruct: Given a online banking query, find the corresponding intents.\nQuery:"
+
+ boom_4b_v1 = ModelMeta(
+     loader=InstructSentenceTransformerModel,
+     loader_kwargs=dict(
+         instruction_template=instruction_template,
+     ),
+     name="ICT-TIME-and-Querit/BOOM_4B_v1",
+     model_type=["dense"],
+     languages=multilingual_langs,
+     open_weights=True,
+     adapted_from="Qwen/Qwen3-4B",
+     revision="447ab88574d27e67c428acc2b429d7d4580a4ea7",
+     release_date="2026-01-31",
+     n_parameters=4021774336,
+     n_embedding_parameters=None,
+     memory_usage_mb=7671,
+     embed_dim=2560,
+     max_tokens=32768,
+     license="apache-2.0",
+     reference="https://huggingface.co/ICT-TIME-and-Querit/BOOM_4B_v1",
+     similarity_fn_name="cosine",
+     framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
+     use_instructions=True,
+     public_training_code=None,
+     public_training_data=None,
+     training_datasets=training_data,
+ )
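Note on the new module above: the snippet below is an illustrative sketch, not code shipped in the wheel, showing what instruction_template returns for the three input shapes it handles. It assumes PromptType.query and PromptType.document carry the string values "query" and "document".

  # Illustrative only; instruction_template is the function added in the hunk above.
  from mteb.types import PromptType

  claim = "Given a claim, retrieve documents that support or refute the claim."

  # Plain string on the query side: wrapped in the Instruct/Query template.
  instruction_template(claim, PromptType.query)
  # -> "Instruct: Given a claim, retrieve documents that support or refute the claim.\nQuery:"

  # Dict instruction: looked up by prompt type; an empty lookup yields no instruction.
  instruction_template({"query": claim, "document": ""}, PromptType.document)
  # -> ""

  # Plain string on the document side: documents are embedded without an instruction.
  instruction_template(claim, PromptType.document)
  # -> ""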
@@ -53,7 +53,7 @@ class PylateSearchEncoder:
          hf_split: str,
          hf_subset: str,
          encode_kwargs: EncodeKwargs,
-         num_proc: int,
+         num_proc: int | None,
      ) -> None:
          """Index the corpus for retrieval.

@@ -89,7 +89,7 @@ class PylateSearchEncoder:
          top_k: int,
          encode_kwargs: EncodeKwargs,
          top_ranked: TopRankedDocumentsType | None = None,
-         num_proc: int,
+         num_proc: int | None,
      ) -> RetrievalOutputType:
          queries_dataloader = create_dataloader(
              queries,
@@ -150,7 +150,7 @@ class PylateSearchEncoder:
          hf_split: str,
          top_k: int,
          encode_kwargs: EncodeKwargs,
-         num_proc: int,
+         num_proc: int | None,
      ) -> dict[str, list[tuple[float, str]]]:
          from pylate import indexes, retrieve

@@ -216,7 +216,7 @@ class PylateSearchEncoder:
          hf_subset: str,
          hf_split: str,
          encode_kwargs: EncodeKwargs,
-         num_proc: int = 1,
+         num_proc: int | None = None,
      ) -> dict[str, list[tuple[float, str]]]:
          """Rerank with PyLate's rank.rerank using per-query candidates.

@@ -32,7 +32,7 @@ class SearchProtocol(Protocol):
          hf_split: str,
          hf_subset: str,
          encode_kwargs: EncodeKwargs,
-         num_proc: int,
+         num_proc: int | None,
      ) -> None:
          """Index the corpus for retrieval.

@@ -56,7 +56,7 @@ class SearchProtocol(Protocol):
          top_k: int,
          encode_kwargs: EncodeKwargs,
          top_ranked: TopRankedDocumentsType | None = None,
-         num_proc: int,
+         num_proc: int | None,
      ) -> RetrievalOutputType:
          """Search the corpus using the given queries.

@@ -59,7 +59,7 @@ class SearchEncoderWrapper:
          hf_split: str,
          hf_subset: str,
          encode_kwargs: EncodeKwargs,
-         num_proc: int = 1,
+         num_proc: int | None = None,
      ) -> None:
          """Index the corpus for retrieval.

@@ -101,7 +101,7 @@ class SearchEncoderWrapper:
          top_k: int,
          encode_kwargs: EncodeKwargs,
          top_ranked: TopRankedDocumentsType | None = None,
-         num_proc: int = 1,
+         num_proc: int | None = None,
      ) -> RetrievalOutputType:
          """Search the corpus for the given queries.

@@ -485,7 +485,7 @@ class SearchCrossEncoderWrapper:
          hf_split: str,
          hf_subset: str,
          encode_kwargs: EncodeKwargs,
-         num_proc: int = 1,
+         num_proc: int | None = None,
      ) -> None:
          """Index the corpus for retrieval.

@@ -509,7 +509,7 @@ class SearchCrossEncoderWrapper:
          top_k: int,
          encode_kwargs: EncodeKwargs,
          top_ranked: TopRankedDocumentsType | None = None,
-         num_proc: int = 1,
+         num_proc: int | None = None,
      ) -> RetrievalOutputType:
          """Search the corpus using the given queries.

@@ -914,7 +914,7 @@ class BibleNLPBitextMining(AbsTaskBitextMining):
          self.dataset_transform()
          self.data_loaded = True

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          # Convert to standard format
          for lang in self.hf_subsets:
              l1, l2 = (l.split("_")[0] for l in lang.split("-"))
@@ -265,7 +265,7 @@ class FloresBitextMining(AbsTaskBitextMining):
  """,
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          if self.data_loaded:
              return

@@ -99,7 +99,7 @@ class IN22ConvBitextMining(AbsTaskBitextMining):
  """,
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          if self.data_loaded:
              return

@@ -93,7 +93,7 @@ class IN22GenBitextMining(AbsTaskBitextMining):
  """,
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          if self.data_loaded:
              return

@@ -280,7 +280,7 @@ class NTREXBitextMining(AbsTaskBitextMining):
  """,
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          if self.data_loaded:
              return

@@ -32,7 +32,7 @@ class RomaTalesBitextMining(AbsTaskBitextMining):
          bibtex_citation="",
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          """Load dataset from HuggingFace hub and convert it to the standard format."""
          if self.data_loaded:
              return
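Every remaining hunk in this diff, like the wrapper and task hunks above, is the same mechanical change: num_proc: int = 1 becomes num_proc: int | None = None on load_data, dataset_transform, index and search signatures. As a rough illustration of why None is the natural default (this is not mteb code; strip_whitespace and the "text" column are made up, and it assumes the value is ultimately forwarded to Hugging Face datasets), None simply falls through to the datasets default instead of forcing a single worker:

  from datasets import Dataset

  def strip_whitespace(ds: Dataset, num_proc: int | None = None) -> Dataset:
      # num_proc=None is the datasets default (no multiprocessing pool);
      # an integer > 1 opts into parallel map workers instead of forcing 1.
      return ds.map(lambda row: {"text": row["text"].strip()}, num_proc=num_proc)

  ds = Dataset.from_dict({"text": ["  hello ", " world  "]})
  strip_whitespace(ds)               # default: single-process map
  strip_whitespace(ds, num_proc=2)   # explicit: two worker processes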
@@ -43,7 +43,7 @@ Islam, Tanvir},
          superseded_by="BengaliDocumentClassification.v2",
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {"article": "text", "category": "label"}
          )
@@ -92,7 +92,7 @@ Islam, Tanvir},
  """,
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          self.dataset = self.stratified_subsampling(
              self.dataset, seed=self.seed, splits=["test"]
          )
@@ -46,7 +46,7 @@ Montoyo, Andres},
      )
      samples_per_label = 16

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {"comment": "text", "rating_str": "label"}
          )
@@ -99,7 +99,7 @@ Montoyo, Andres},
      )
      samples_per_label = 16

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          self.dataset = self.stratified_subsampling(
              self.dataset, seed=self.seed, splits=["test"]
          )
@@ -46,7 +46,7 @@ Montoyo, Andres},
      )
      samples_per_label = 16

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {"comment": "text", "sentiment_int": "label"}
          )
@@ -60,7 +60,7 @@ class HinDialectClassification(AbsTaskClassification):
  """,
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {"folksong": "text", "language": "label"}
          )
@@ -137,6 +137,6 @@ Okazaki, Naoaki},
          self.dataset_transform()
          self.data_loaded = True

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          self.dataset = self.dataset.remove_columns(["language", "script"])
          self.dataset = self.dataset.rename_columns({"native sentence": "text"})
@@ -52,7 +52,7 @@ class IndicSentimentClassification(AbsTaskClassification):
  """,
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          label_map = {"Negative": 0, "Positive": 1}
          # Convert to standard format
          for lang in self.hf_subsets:
@@ -66,7 +66,7 @@ in Natural Language Processing},
  """,
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns({"labels": "label"})
          self.dataset = self.stratified_subsampling(
              self.dataset, seed=self.seed, splits=["test"]
@@ -49,7 +49,7 @@ class SouthAfricanLangClassification(AbsTaskClassification):
  """,
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {" text": "text", "lang_id": "label"}
          )
@@ -49,7 +49,7 @@ class TurkicClassification(AbsTaskClassification):
          )
          return dataset_lang["train"]

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          """Load dataset from HuggingFace hub"""
          if self.data_loaded:
              return
@@ -35,7 +35,7 @@ class SlovakMovieReviewSentimentClassification(AbsTaskClassification):
          superseded_by="SlovakMovieReviewSentimentClassification.v2",
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns({"comment": "text"})

          self.dataset = self.stratified_subsampling(
@@ -76,7 +76,7 @@ class SlovakMovieReviewSentimentClassificationV2(AbsTaskClassification):
          adapted_from=["SlovakMovieReviewSentimentClassification"],
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          self.dataset = self.stratified_subsampling(
              self.dataset, seed=self.seed, splits=["test"]
          )
@@ -37,7 +37,7 @@ class SwahiliNewsClassification(AbsTaskClassification):
          superseded_by="SwahiliNewsClassification.v2",
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {"content": "text", "category": "label"}
          )
@@ -81,7 +81,7 @@ class SwahiliNewsClassificationV2(AbsTaskClassification):
          adapted_from=["SwahiliNewsClassification"],
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          self.dataset = self.stratified_subsampling(
              self.dataset, seed=self.seed, splits=["train"]
          )
@@ -63,7 +63,7 @@ class TenKGnadClusteringP2PFast(AbsTaskClustering):
          adapted_from=["TenKGnadClusteringP2P"],
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          ds = _convert_to_fast(
              self.dataset, self.input_column_name, self.label_column_name, self.seed
          )
@@ -63,7 +63,7 @@ class TenKGnadClusteringS2SFast(AbsTaskClustering):
          adapted_from=["TenKGnadClusteringS2S"],
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          ds = _convert_to_fast(
              self.dataset, self.input_column_name, self.label_column_name, self.seed
          )
@@ -51,7 +51,7 @@ class MLSUMClusteringP2P(AbsTaskClusteringLegacy):
          superseded_by="MLSUMClusteringP2P.v2",
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          """Load dataset from HuggingFace hub and convert it to the standard format."""
          if self.data_loaded:
              return
@@ -124,7 +124,7 @@ class MLSUMClusteringP2PFast(AbsTaskClustering):
          adapted_from=["MLSUMClusteringP2P"],
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          """Load dataset from HuggingFace hub and convert it to the standard format."""
          if self.data_loaded:
              return
@@ -51,7 +51,7 @@ class MLSUMClusteringS2S(AbsTaskClusteringLegacy):
          superseded_by="MLSUMClusteringS2S.v2",
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          """Load dataset from HuggingFace hub and convert it to the standard format."""
          if self.data_loaded:
              return
@@ -119,7 +119,7 @@ class MLSUMClusteringS2SFast(AbsTaskClustering):
          adapted_from=["MLSUMClusteringS2S"],
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          """Load dataset from HuggingFace hub and convert it to the standard format."""
          if self.data_loaded:
              return
@@ -45,7 +45,7 @@ class VGHierarchicalClusteringP2P(AbsTaskClustering):
          prompt="Identify the categories (e.g. sports) of given articles in Norwegian",
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {"article": "sentences", "classes": "labels"}
          )
@@ -92,7 +92,7 @@ class VGHierarchicalClusteringS2S(AbsTaskClustering):
          prompt="Identify the categories (e.g. sports) of given articles in Norwegian",
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {"ingress": "sentences", "classes": "labels"}
          )
@@ -53,7 +53,7 @@ class ImageCoDe(AbsTaskImageTextPairClassification):
  """,
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          if self.data_loaded:
              return

@@ -45,7 +45,7 @@ class SugarCrepe(AbsTaskImageTextPairClassification):
  """,
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          """Load dataset from HuggingFace hub"""
          if self.data_loaded:
              return
@@ -175,7 +175,7 @@ class mFollowIRCrossLingual(AbsTaskRetrieval): # noqa: N801
  """,
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          if self.data_loaded:
              return

@@ -243,7 +243,7 @@ class mFollowIR(AbsTaskRetrieval): # noqa: N801
  """,
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          if self.data_loaded:
              return

@@ -123,7 +123,7 @@ class CVBenchCount(AbsTaskRetrieval):
  """,
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
              path=self.metadata.dataset["path"],
              splits=self.metadata.eval_splits,
@@ -165,7 +165,7 @@ class CVBenchRelation(AbsTaskRetrieval):
  """,
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
              path=self.metadata.dataset["path"],
              splits=self.metadata.eval_splits,
@@ -207,7 +207,7 @@ class CVBenchDepth(AbsTaskRetrieval):
  """,
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
              path=self.metadata.dataset["path"],
              splits=self.metadata.eval_splits,
@@ -249,7 +249,7 @@ class CVBenchDistance(AbsTaskRetrieval):
  """,
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
              path=self.metadata.dataset["path"],
              splits=self.metadata.eval_splits,
@@ -66,7 +66,7 @@ Yih, Scott Wen-tau},
          },
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          labels = [
              "q2_label",
              "q3_label",
@@ -76,7 +76,7 @@ class PubChemSMILESPC(AbsTaskPairClassification):
  """,
      )

-     def load_data(self, num_proc: int = 1, **kwargs: Any) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs: Any) -> None:
          if self.data_loaded:
              return

@@ -60,7 +60,7 @@ class PubChemWikiPairClassification(AbsTaskPairClassification):
  """,
      )

-     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
+     def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
          _dataset = {}
          for lang in self.hf_subsets:
              _dataset[lang] = {}
@@ -52,7 +52,7 @@ Dolan, Bill},
          # sum of 4 languages after neutral filtering
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          """Load dataset from HuggingFace hub"""
          if self.data_loaded:
              return
@@ -37,7 +37,7 @@ class SadeemQuestionRetrieval(AbsTaskRetrieval):
  """,
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          if self.data_loaded:
              return

@@ -53,7 +53,7 @@ class CodeEditSearchRetrieval(AbsTaskRetrieval):
  """,
      )

-     def load_data(self, num_proc: int = 1, **kwargs) -> None:
+     def load_data(self, num_proc: int | None = None, **kwargs) -> None:
          if self.data_loaded:
              return
