mteb 2.7.11__py3-none-any.whl → 2.7.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. mteb/abstasks/abstask.py +2 -1
  2. mteb/models/model_implementations/nomic_models.py +40 -0
  3. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +110 -10
  4. mteb/models/model_implementations/ops_colqwen3_models.py +267 -0
  5. mteb/models/model_implementations/querit_models.py +245 -0
  6. mteb/tasks/bitext_mining/multilingual/bible_nlp_bitext_mining.py +1 -1
  7. mteb/tasks/classification/ben/bengali_document_classification.py +2 -2
  8. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +2 -2
  9. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -1
  10. mteb/tasks/classification/multilingual/hin_dialect_classification.py +1 -1
  11. mteb/tasks/classification/multilingual/indic_lang_classification.py +1 -1
  12. mteb/tasks/classification/multilingual/indic_sentiment_classification.py +1 -1
  13. mteb/tasks/classification/multilingual/language_classification.py +1 -1
  14. mteb/tasks/classification/multilingual/south_african_lang_classification.py +1 -1
  15. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +2 -2
  16. mteb/tasks/classification/swa/swahili_news_classification.py +2 -2
  17. mteb/tasks/clustering/deu/ten_k_gnad_clustering_p2p.py +1 -1
  18. mteb/tasks/clustering/deu/ten_k_gnad_clustering_s2s.py +1 -1
  19. mteb/tasks/clustering/nob/vg_hierarchical_clustering.py +2 -2
  20. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +1 -1
  21. mteb/tasks/pair_classification/multilingual/pub_chem_wiki_pair_classification.py +1 -1
  22. mteb/tasks/retrieval/code/code_rag.py +4 -4
  23. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  24. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
  25. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
  26. mteb/tasks/retrieval/nob/norquad.py +1 -1
  27. mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
  28. mteb/tasks/sts/multilingual/sem_rel24_sts.py +1 -1
  29. mteb/tasks/sts/multilingual/sts_benchmark_multilingual_sts.py +1 -1
  30. mteb/tasks/sts/por/assin2_sts.py +1 -1
  31. {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/METADATA +1 -3
  32. {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/RECORD +36 -34
  33. {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/WHEEL +1 -1
  34. {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/entry_points.txt +0 -0
  35. {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/licenses/LICENSE +0 -0
  36. {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/top_level.txt +0 -0
mteb/models/model_implementations/querit_models.py
@@ -0,0 +1,245 @@
+ from __future__ import annotations
+
+ import logging
+ from typing import TYPE_CHECKING, Any
+
+ import torch
+ from tqdm.auto import tqdm
+
+ from mteb.models.model_meta import ModelMeta
+
+ from .rerankers_custom import RerankerWrapper
+
+ if TYPE_CHECKING:
+     from torch.utils.data import DataLoader
+
+     from mteb.abstasks.task_metadata import TaskMetadata
+     from mteb.types import BatchedInput, PromptType
+
+ logger = logging.getLogger(__name__)
+
+
+ class QueritWrapper(RerankerWrapper):
+     """Multi-GPU / multi-process reranker wrapper for mteb evaluation.
+
+     Supports flattening all query-passage pairs without explicit grouping.
+     """
+
+     def __init__(
+         self,
+         model_name: str,
+         **kwargs: Any,
+     ) -> None:
+         super().__init__(model_name, **kwargs)
+         from transformers import AutoModel, AutoTokenizer
+
+         if not self.device:
+             self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         model_args = {}
+         if self.fp_options:
+             model_args["torch_dtype"] = self.fp_options
+         self.model = AutoModel.from_pretrained(
+             model_name, trust_remote_code=True, **model_args
+         )
+         logger.info(f"Using model {model_name}")
+
+         self.model.to(self.device)
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             model_name, trust_remote_code=True
+         )
+         if "[CLS]" not in self.tokenizer.get_vocab():
+             raise ValueError("Tokenizer missing required special token '[CLS]'")
+         self.cls_token_id = self.tokenizer.convert_tokens_to_ids("[CLS]")
+         self.pad_token_id = self.tokenizer.pad_token_id or 0
+
+         self.max_length = (
+             min(kwargs.get("max_length", 4096), self.tokenizer.model_max_length) - 1
+         )  # model_max_length is sometimes a very large number / max int
+         logger.info(f"Using max_length of {self.max_length}, 1 token for [CLS]")
+         self.model.eval()
+
+     def process_inputs(
+         self,
+         pairs: list[str],
+     ) -> dict[str, torch.Tensor]:
+         """Encode a batch of (query, document) pairs:
+
+         - Concatenate prompt + Query + Content
+         - Append [CLS] at the end
+         - Left-pad to max_length
+         - Generate a custom attention mask based on block types
+         """
+         # Construct input texts
+         enc = self.tokenizer(
+             pairs,
+             add_special_tokens=False,
+             truncation=True,
+             max_length=self.max_length,
+             padding=False,
+         )
+
+         input_ids_list: list[list[int]] = []
+         attn_mask_list: list[torch.Tensor] = []
+
+         for ids in enc["input_ids"]:
+             # Append [CLS] token
+             ids = ids + [self.cls_token_id]
+             block_types = [1] * (len(ids) - 1) + [2]  # content + CLS
+
+             # Pad or truncate
+             if len(ids) < self.max_length:
+                 pad_len = self.max_length - len(ids)
+                 ids = [self.pad_token_id] * pad_len + ids
+                 block_types = [0] * pad_len + block_types
+             else:
+                 ids = ids[-self.max_length :]
+                 block_types = block_types[-self.max_length :]
+
+             attn = self.compute_mask_content_cls(block_types)
+             input_ids_list.append(ids)
+             attn_mask_list.append(attn)
+
+         input_ids = torch.tensor(input_ids_list, dtype=torch.long, device=self.device)
+         attention_mask = torch.stack(attn_mask_list, dim=0).to(self.device)
+
+         return {"input_ids": input_ids, "attention_mask": attention_mask}
+
+     @torch.inference_mode()
+     def predict(
+         self,
+         inputs1: DataLoader[BatchedInput],
+         inputs2: DataLoader[BatchedInput],
+         *,
+         task_metadata: TaskMetadata,
+         hf_split: str,
+         hf_subset: str,
+         prompt_type: PromptType | None = None,
+         **kwargs: Any,
+     ) -> list[float]:
+         """Predict relevance scores for query-passage pairs.
+
+         Supports both single-process and multi-process/multi-GPU modes.
+         """
+         # Flatten all pairs from the mteb DataLoaders
+         queries = [text for batch in inputs1 for text in batch["text"]]
+         passages = [text for batch in inputs2 for text in batch["text"]]
+
+         instructions = None
+         if "instruction" in inputs2.dataset.features:
+             instructions = [text for batch in inputs1 for text in batch["instruction"]]
+
+         num_pairs = len(queries)
+         if num_pairs == 0:
+             return []
+         final_scores: list[float] = []
+
+         batch_size = kwargs.get("batch_size", self.batch_size)
+         with tqdm(total=num_pairs, desc="Scoring", ncols=100) as pbar:
+             for start in range(0, num_pairs, batch_size):
+                 end = min(start + batch_size, num_pairs)
+                 batch_q = queries[start:end]
+                 batch_d = passages[start:end]
+
+                 batch_instructions = (
+                     instructions[start:end]
+                     if instructions is not None
+                     else [None] * len(batch_q)
+                 )
+                 pairs = [
+                     self.format_instruction(instr, query, doc)
+                     for instr, query, doc in zip(batch_instructions, batch_q, batch_d)
+                 ]
+                 enc = self.process_inputs(pairs)
+                 out = self.model(**enc)
+                 scores = out["score"].squeeze(-1).detach().float().cpu().tolist()
+
+                 if not isinstance(scores, list):
+                     scores = [scores]
+
+                 final_scores.extend(scores)
+                 pbar.update(len(scores))
+
+         return final_scores
+
+     @staticmethod
+     def format_instruction(instruction: str | None, query: str, doc: str) -> str:
+         if instruction is None:
+             output = f"Judge whether the Content meets the requirements based on the Query. Query: {query}; Content: {doc}"
+         else:
+             output = f"{instruction} Query: {query}; Content: {doc}"
+         return output
+
+     @staticmethod
+     def compute_mask_content_cls(block_types: list[int]) -> torch.Tensor:
+         """Create a custom attention mask based on token block types:
+
+         - 0: padding → ignored
+         - 1: content → causal attention to previous content only
+         - 2: [CLS] → causal attention to all non-padding tokens
+
+         Args:
+             block_types: List of token types for one sequence
+
+         Returns:
+             [1, seq_len, seq_len] boolean attention mask (True = allowed to attend)
+         """
+         pos = torch.tensor(block_types, dtype=torch.long)
+         n = pos.shape[0]
+         if n == 0:
+             return torch.empty((0, 0), dtype=torch.bool, device=pos.device)
+
+         row_types = pos.view(n, 1)
+         col_types = pos.view(1, n)
+
+         row_idx = torch.arange(n, device=pos.device).view(n, 1)
+         col_idx = torch.arange(n, device=pos.device).view(1, n)
+         causal_mask = col_idx <= row_idx
+
+         # Content tokens only attend to previous content
+         mask_content = (row_types == 1) & (col_types == 1) & causal_mask
+
+         # [CLS] attends to all non-pad tokens (causal)
+         mask_cls = (row_types == 2) & (col_types != 0) & causal_mask
+
+         type_mask = mask_content | mask_cls
+         return type_mask.unsqueeze(0)
+
+
+ querit_reranker_training_data = {
+     "MIRACLRanking",  # https://huggingface.co/datasets/mteb/MIRACLReranking
+     "MrTidyRetrieval",  # https://huggingface.co/datasets/mteb/mrtidy
+     "ruri-v3-dataset-reranker",  # https://huggingface.co/datasets/cl-nagoya/ruri-v3-dataset-reranker
+     "MultiLongDocReranking",  # https://huggingface.co/datasets/Shitao/MLDR
+     "MindSmallReranking",  # https://huggingface.co/datasets/mteb/MindSmallReranking
+     "MSMARCO",  # https://huggingface.co/datasets/mteb/msmarco
+     "CQADupStack",  # https://huggingface.co/datasets/mteb/cqadupstack-*
+     "AskUbuntuDupQuestions",  # https://github.com/taolei87/askubuntu & The corpus and queries that overlap with mteb/askubuntudupquestions-reranking have been removed.
+     "T2Reranking",  # https://huggingface.co/datasets/THUIR/T2Ranking & The corpus and queries that overlap with mteb/T2Reranking have been removed.
+ }
+
+ model_meta = ModelMeta(
+     loader=QueritWrapper,
+     loader_kwargs={
+         "fp_options": "bfloat16",
+     },
+     name="Querit/Querit",
+     model_type=["cross-encoder"],
+     languages=["eng-Latn"],
+     open_weights=True,
+     revision="5ad2649cc4defb7e1361262260e9a781f14b08bc",
+     release_date="2026-01-24",
+     n_parameters=4919636992,
+     n_embedding_parameters=131907584,
+     embed_dim=1024,
+     memory_usage_mb=9383.0,
+     max_tokens=4096,
+     reference="https://huggingface.co/Querit/Querit",
+     similarity_fn_name=None,
+     training_datasets=querit_reranker_training_data,
+     license="apache-2.0",
+     framework=["PyTorch"],
+     use_instructions=None,
+     public_training_code=None,
+     public_training_data=None,
+     citation=None,
+ )
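
The two static helpers on QueritWrapper are self-contained, so the block-type masking scheme above can be checked in isolation. A minimal sketch, assuming the module path matches the file listing above (mteb.models.model_implementations.querit_models):

    from mteb.models.model_implementations.querit_models import QueritWrapper

    # 2 padding tokens, 3 content tokens, trailing [CLS]
    block_types = [0, 0, 1, 1, 1, 2]
    mask = QueritWrapper.compute_mask_content_cls(block_types)
    assert mask.shape == (1, 6, 6)
    # Last row is the [CLS] token: it attends to every non-pad position.
    # mask[0, -1] -> tensor([False, False,  True,  True,  True,  True])

    # Default prompt used when a task provides no instruction:
    pair = QueritWrapper.format_instruction(None, "what is mteb?", "MTEB is a benchmark.")
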
mteb/tasks/bitext_mining/multilingual/bible_nlp_bitext_mining.py
@@ -914,7 +914,7 @@ class BibleNLPBitextMining(AbsTaskBitextMining):
          self.dataset_transform()
          self.data_loaded = True

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          # Convert to standard format
          for lang in self.hf_subsets:
              l1, l2 = (l.split("_")[0] for l in lang.split("-"))
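
Every remaining task-file hunk in this diff makes the same one-line change: dataset_transform gains a num_proc parameter and a **kwargs catch-all, so overrides keep working when the evaluation code passes extra keyword arguments. A minimal sketch of the new pattern with a hypothetical task (the class and column names are invented; whether and how the base class forwards num_proc to datasets operations is not visible in this diff):

    from mteb.abstasks import AbsTaskClassification  # import path assumed

    class MyNewsClassification(AbsTaskClassification):
        def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
            # Rename columns to the standard text/label schema.
            self.dataset = self.dataset.rename_columns(
                {"content": "text", "category": "label"}
            )
            # num_proc could be threaded into multiprocessing-aware calls, e.g.:
            # self.dataset = self.dataset.map(clean_fn, num_proc=num_proc)
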
mteb/tasks/classification/ben/bengali_document_classification.py
@@ -43,7 +43,7 @@ Islam, Tanvir},
          superseded_by="BengaliDocumentClassification.v2",
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {"article": "text", "category": "label"}
          )
@@ -92,7 +92,7 @@ Islam, Tanvir},
          """,
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.stratified_subsampling(
              self.dataset, seed=self.seed, splits=["test"]
          )
mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py
@@ -46,7 +46,7 @@ Montoyo, Andres},
      )
      samples_per_label = 16

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {"comment": "text", "rating_str": "label"}
          )
@@ -99,7 +99,7 @@ Montoyo, Andres},
      )
      samples_per_label = 16

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.stratified_subsampling(
              self.dataset, seed=self.seed, splits=["test"]
          )
mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py
@@ -46,7 +46,7 @@ Montoyo, Andres},
      )
      samples_per_label = 16

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {"comment": "text", "sentiment_int": "label"}
          )
mteb/tasks/classification/multilingual/hin_dialect_classification.py
@@ -60,7 +60,7 @@ class HinDialectClassification(AbsTaskClassification):
          """,
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {"folksong": "text", "language": "label"}
          )
mteb/tasks/classification/multilingual/indic_lang_classification.py
@@ -137,6 +137,6 @@ Okazaki, Naoaki},
          self.dataset_transform()
          self.data_loaded = True

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.dataset.remove_columns(["language", "script"])
          self.dataset = self.dataset.rename_columns({"native sentence": "text"})
mteb/tasks/classification/multilingual/indic_sentiment_classification.py
@@ -52,7 +52,7 @@ class IndicSentimentClassification(AbsTaskClassification):
          """,
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          label_map = {"Negative": 0, "Positive": 1}
          # Convert to standard format
          for lang in self.hf_subsets:
mteb/tasks/classification/multilingual/language_classification.py
@@ -66,7 +66,7 @@ in Natural Language Processing},
          """,
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns({"labels": "label"})
          self.dataset = self.stratified_subsampling(
              self.dataset, seed=self.seed, splits=["test"]
mteb/tasks/classification/multilingual/south_african_lang_classification.py
@@ -49,7 +49,7 @@ class SouthAfricanLangClassification(AbsTaskClassification):
          """,
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {" text": "text", "lang_id": "label"}
          )
mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py
@@ -35,7 +35,7 @@ class SlovakMovieReviewSentimentClassification(AbsTaskClassification):
          superseded_by="SlovakMovieReviewSentimentClassification.v2",
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns({"comment": "text"})

          self.dataset = self.stratified_subsampling(
@@ -76,7 +76,7 @@ class SlovakMovieReviewSentimentClassificationV2(AbsTaskClassification):
          adapted_from=["SlovakMovieReviewSentimentClassification"],
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.stratified_subsampling(
              self.dataset, seed=self.seed, splits=["test"]
          )
mteb/tasks/classification/swa/swahili_news_classification.py
@@ -37,7 +37,7 @@ class SwahiliNewsClassification(AbsTaskClassification):
          superseded_by="SwahiliNewsClassification.v2",
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {"content": "text", "category": "label"}
          )
@@ -81,7 +81,7 @@ class SwahiliNewsClassificationV2(AbsTaskClassification):
          adapted_from=["SwahiliNewsClassification"],
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.stratified_subsampling(
              self.dataset, seed=self.seed, splits=["train"]
          )
mteb/tasks/clustering/deu/ten_k_gnad_clustering_p2p.py
@@ -63,7 +63,7 @@ class TenKGnadClusteringP2PFast(AbsTaskClustering):
          adapted_from=["TenKGnadClusteringP2P"],
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          ds = _convert_to_fast(
              self.dataset, self.input_column_name, self.label_column_name, self.seed
          )
mteb/tasks/clustering/deu/ten_k_gnad_clustering_s2s.py
@@ -63,7 +63,7 @@ class TenKGnadClusteringS2SFast(AbsTaskClustering):
          adapted_from=["TenKGnadClusteringS2S"],
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          ds = _convert_to_fast(
              self.dataset, self.input_column_name, self.label_column_name, self.seed
          )
mteb/tasks/clustering/nob/vg_hierarchical_clustering.py
@@ -45,7 +45,7 @@ class VGHierarchicalClusteringP2P(AbsTaskClustering):
          prompt="Identify the categories (e.g. sports) of given articles in Norwegian",
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {"article": "sentences", "classes": "labels"}
          )
@@ -92,7 +92,7 @@ class VGHierarchicalClusteringS2S(AbsTaskClustering):
          prompt="Identify the categories (e.g. sports) of given articles in Norwegian",
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {"ingress": "sentences", "classes": "labels"}
          )
mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py
@@ -66,7 +66,7 @@ Yih, Scott Wen-tau},
          },
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          labels = [
              "q2_label",
              "q3_label",
mteb/tasks/pair_classification/multilingual/pub_chem_wiki_pair_classification.py
@@ -60,7 +60,7 @@ class PubChemWikiPairClassification(AbsTaskPairClassification):
          """,
      )

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          _dataset = {}
          for lang in self.hf_subsets:
              _dataset[lang] = {}
mteb/tasks/retrieval/code/code_rag.py
@@ -59,7 +59,7 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
          self.dataset_transform()
          self.data_loaded = True

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          """And transform to a retrieval dataset, which have the following attributes

          self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
@@ -116,7 +116,7 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
          self.dataset_transform()
          self.data_loaded = True

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          """And transform to a retrieval dataset, which have the following attributes

          self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
@@ -176,7 +176,7 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
          self.dataset_transform()
          self.data_loaded = True

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          """And transform to a retrieval dataset, which have the following attributes

          self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
@@ -233,7 +233,7 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval):
          self.dataset_transform()
          self.data_loaded = True

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          """And transform to a retrieval dataset, which have the following attributes

          self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
mteb/tasks/retrieval/dan/dan_fever_retrieval.py
@@ -55,7 +55,7 @@ Derczynski, Leon},
          self.dataset_transform()
          self.data_loaded = True

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          """And transform to a retrieval dataset, which have the following attributes

          self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
mteb/tasks/retrieval/dan/tv2_nordretrieval.py
@@ -68,7 +68,7 @@ Piperidis, Stelios},
          self.dataset_transform()
          self.data_loaded = True

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          """And transform to a retrieval dataset, which have the following attributes

          self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py
@@ -44,7 +44,7 @@ class TwitterHjerneRetrieval(AbsTaskRetrieval):
          self.dataset_transform()
          self.data_loaded = True

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          """And transform to a retrieval dataset, which have the following attributes

          self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
mteb/tasks/retrieval/nob/norquad.py
@@ -58,7 +58,7 @@ Fishel, Mark},
          self.dataset_transform()
          self.data_loaded = True

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          """And transform to a retrieval dataset, which have the following attributes

          self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
mteb/tasks/retrieval/nob/snl_retrieval.py
@@ -45,7 +45,7 @@ class SNLRetrieval(AbsTaskRetrieval):
          self.dataset_transform()
          self.data_loaded = True

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          """And transform to a retrieval dataset, which have the following attributes

          self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
mteb/tasks/sts/multilingual/sem_rel24_sts.py
@@ -66,6 +66,6 @@ Seid Muhie Yimam and Saif M. Mohammad},
      min_score = 0
      max_score = 1

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          for lang, subset in self.dataset.items():
              self.dataset[lang] = subset.rename_column("label", "score")
mteb/tasks/sts/multilingual/sts_benchmark_multilingual_sts.py
@@ -56,6 +56,6 @@ class STSBenchmarkMultilingualSTS(AbsTaskSTS):
      min_score = 0
      max_score = 5

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          for lang, subset in self.dataset.items():
              self.dataset[lang] = subset.rename_column("similarity_score", "score")
mteb/tasks/sts/por/assin2_sts.py
@@ -39,7 +39,7 @@ class Assin2STS(AbsTaskSTS):
      min_score = 1
      max_score = 5

-     def dataset_transform(self) -> None:
+     def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
          self.dataset = self.dataset.rename_columns(
              {
                  "premise": "sentence1",
{mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mteb
- Version: 2.7.11
+ Version: 2.7.13
  Summary: Massive Text Embedding Benchmark
  Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
  Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
@@ -32,8 +32,6 @@ Requires-Dist: rich>=0.0.0
  Requires-Dist: pytrec-eval-terrier>=0.5.6
  Requires-Dist: pydantic>=2.0.0
  Requires-Dist: polars>=0.20.22
- Requires-Dist: torch; python_full_version < "3.14"
- Requires-Dist: torch>=2.9.0; python_full_version >= "3.14"
  Provides-Extra: image
  Requires-Dist: torchvision>0.2.1; extra == "image"
  Requires-Dist: transformers[torch-vision,vision]; extra == "image"