mteb 2.7.11__py3-none-any.whl → 2.7.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/abstasks/abstask.py +2 -1
- mteb/models/model_implementations/nomic_models.py +40 -0
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +110 -10
- mteb/models/model_implementations/ops_colqwen3_models.py +267 -0
- mteb/models/model_implementations/querit_models.py +245 -0
- mteb/tasks/bitext_mining/multilingual/bible_nlp_bitext_mining.py +1 -1
- mteb/tasks/classification/ben/bengali_document_classification.py +2 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/hin_dialect_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/language_classification.py +1 -1
- mteb/tasks/classification/multilingual/south_african_lang_classification.py +1 -1
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +2 -2
- mteb/tasks/clustering/deu/ten_k_gnad_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/ten_k_gnad_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/vg_hierarchical_clustering.py +2 -2
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +1 -1
- mteb/tasks/pair_classification/multilingual/pub_chem_wiki_pair_classification.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +4 -4
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +1 -1
- mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
- mteb/tasks/sts/multilingual/sem_rel24_sts.py +1 -1
- mteb/tasks/sts/multilingual/sts_benchmark_multilingual_sts.py +1 -1
- mteb/tasks/sts/por/assin2_sts.py +1 -1
- {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/METADATA +1 -3
- {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/RECORD +36 -34
- {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/WHEEL +1 -1
- {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.11.dist-info → mteb-2.7.13.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
|
|
6
|
+
import torch
|
|
7
|
+
from tqdm.auto import tqdm
|
|
8
|
+
|
|
9
|
+
from mteb.models.model_meta import ModelMeta
|
|
10
|
+
|
|
11
|
+
from .rerankers_custom import RerankerWrapper
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from torch.utils.data import DataLoader
|
|
15
|
+
|
|
16
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
17
|
+
from mteb.types import BatchedInput, PromptType
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class QueritWrapper(RerankerWrapper):
    """Multi-GPU / multi-process reranker wrapper for mteb evaluation.

    Flattens all query-passage pairs from the evaluation DataLoaders and
    scores them in batches; no explicit per-query grouping is required.
    """

    def __init__(
        self,
        model_name: str,
        **kwargs: Any,
    ) -> None:
        """Load the model and tokenizer onto the selected device.

        Args:
            model_name: HF hub id or local path of the reranker checkpoint.
            **kwargs: Forwarded to ``RerankerWrapper``. ``max_length``
                (default 4096) caps the tokenized sequence length.

        Raises:
            ValueError: If the tokenizer vocabulary lacks the ``[CLS]``
                token required by the scoring head.
        """
        super().__init__(model_name, **kwargs)
        from transformers import AutoModel, AutoTokenizer

        if not self.device:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model_args = {}
        if self.fp_options:
            # e.g. "bfloat16"; passed straight through to from_pretrained.
            model_args["torch_dtype"] = self.fp_options
        self.model = AutoModel.from_pretrained(
            model_name, trust_remote_code=True, **model_args
        )
        logger.info(f"Using model {model_name}")

        self.model.to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name, trust_remote_code=True
        )
        if "[CLS]" not in self.tokenizer.get_vocab():
            raise ValueError("Tokenizer missing required special token '[CLS]'")
        self.cls_token_id = self.tokenizer.convert_tokens_to_ids("[CLS]")
        # Fall back to 0 when the tokenizer defines no pad token.
        self.pad_token_id = self.tokenizer.pad_token_id or 0

        # Reserve one position for the appended [CLS]; model_max_length is
        # sometimes a huge sentinel (max int), hence the min().
        self.max_length = (
            min(kwargs.get("max_length", 4096), self.tokenizer.model_max_length) - 1
        )  # sometimes it's a v large number/max int
        logger.info(f"Using max_length of {self.max_length}, 1 token for [CLS]")
        self.model.eval()

    def process_inputs(
        self,
        pairs: list[str],
    ) -> dict[str, torch.Tensor]:
        """Encode a batch of formatted (query, document) pair strings.

        For each pair: tokenize without special tokens, append ``[CLS]`` at
        the end, left-pad (or left-truncate) to ``self.max_length``, and
        build a custom per-sequence attention mask from block types.

        Args:
            pairs: Prompt strings produced by ``format_instruction``.

        Returns:
            Dict with ``input_ids`` (long, [batch, max_length]) and
            ``attention_mask`` (bool, [batch, 1, max_length, max_length]),
            both placed on ``self.device``.
        """
        # Construct input texts
        enc = self.tokenizer(
            pairs,
            add_special_tokens=False,
            truncation=True,
            max_length=self.max_length,
            padding=False,
        )

        input_ids_list: list[list[int]] = []
        attn_mask_list: list[torch.Tensor] = []

        for ids in enc["input_ids"]:
            # Append [CLS] token; block type 1 = content, 2 = CLS.
            ids = ids + [self.cls_token_id]
            block_types = [1] * (len(ids) - 1) + [2]  # content + CLS

            # Left-pad shorter sequences; left-truncate longer ones so the
            # trailing [CLS] is always preserved.
            if len(ids) < self.max_length:
                pad_len = self.max_length - len(ids)
                ids = [self.pad_token_id] * pad_len + ids
                block_types = [0] * pad_len + block_types
            else:
                # NOTE(review): sequences tokenized to exactly max_length
                # reach max_length + 1 after [CLS] and lose one leading
                # content token here — confirm this is intended.
                ids = ids[-self.max_length :]
                block_types = block_types[-self.max_length :]

            attn = self.compute_mask_content_cls(block_types)
            input_ids_list.append(ids)
            attn_mask_list.append(attn)

        input_ids = torch.tensor(input_ids_list, dtype=torch.long, device=self.device)
        attention_mask = torch.stack(attn_mask_list, dim=0).to(self.device)

        return {"input_ids": input_ids, "attention_mask": attention_mask}

    @torch.inference_mode()
    def predict(
        self,
        inputs1: DataLoader[BatchedInput],
        inputs2: DataLoader[BatchedInput],
        *,
        task_metadata: TaskMetadata,
        hf_split: str,
        hf_subset: str,
        prompt_type: PromptType | None = None,
        **kwargs: Any,
    ) -> list[float]:
        """Predict relevance scores for query-passage pairs.

        Supports both single-process and multi-process/multi-GPU modes.

        Args:
            inputs1: DataLoader of query batches (``text``, optionally
                ``instruction``).
            inputs2: DataLoader of passage batches (``text``), aligned
                pairwise with ``inputs1``.
            task_metadata: Task metadata (unused here; part of the API).
            hf_split: Dataset split name (unused here; part of the API).
            hf_subset: Dataset subset name (unused here; part of the API).
            prompt_type: Optional prompt type (unused here; part of the API).
            **kwargs: May override ``batch_size``.

        Returns:
            One relevance score per (query, passage) pair, in input order.
        """
        # Flatten all pairs from the mteb DataLoaders
        queries = [text for batch in inputs1 for text in batch["text"]]
        passages = [text for batch in inputs2 for text in batch["text"]]

        instructions = None
        # NOTE(review): feature presence is checked on inputs2 but the
        # instruction texts are read from inputs1 — confirm this asymmetry
        # is intentional.
        if "instruction" in inputs2.dataset.features:
            instructions = [text for batch in inputs1 for text in batch["instruction"]]

        num_pairs = len(queries)
        if num_pairs == 0:
            return []
        final_scores: list[float] = []

        batch_size = kwargs.get("batch_size", self.batch_size)
        with tqdm(total=num_pairs, desc="Scoring", ncols=100) as pbar:
            for start in range(0, num_pairs, batch_size):
                end = min(start + batch_size, num_pairs)
                batch_q = queries[start:end]
                batch_d = passages[start:end]

                batch_instructions = (
                    instructions[start:end]
                    if instructions is not None
                    else [None] * len(batch_q)
                )
                pairs = [
                    self.format_instruction(instr, query, doc)
                    for instr, query, doc in zip(batch_instructions, batch_q, batch_d)
                ]
                enc = self.process_inputs(pairs)
                out = self.model(**enc)
                scores = out["score"].squeeze(-1).detach().float().cpu().tolist()

                # A batch of one squeezes to a 0-d tensor whose tolist() is
                # a bare float; normalize back to a list.
                if not isinstance(scores, list):
                    scores = [scores]

                final_scores.extend(scores)
                pbar.update(len(scores))

        return final_scores

    @staticmethod
    def format_instruction(instruction: str | None, query: str, doc: str) -> str:
        """Build the single prompt string scored by the model.

        Args:
            instruction: Optional task instruction; a default judging
                instruction is used when ``None``.
            query: Query text.
            doc: Document (passage) text.

        Returns:
            The concatenated prompt string.
        """
        if instruction is None:
            output = f"Judge whether the Content meets the requirements based on the Query. Query: {query}; Content: {doc}"
        else:
            output = f"{instruction} Query: {query}; Content: {doc}"
        return output

    @staticmethod
    def compute_mask_content_cls(block_types: list[int]) -> torch.Tensor:
        """Create a custom attention mask based on token block types.

        Block types:
            - 0: padding → ignored
            - 1: content → causal attention to previous content only
            - 2: [CLS] → causal attention to all non-padding tokens

        Args:
            block_types: List of token types for one sequence.

        Returns:
            [1, seq_len, seq_len] boolean attention mask (True = allowed
            to attend).
        """
        pos = torch.tensor(block_types, dtype=torch.long)
        n = pos.shape[0]
        if n == 0:
            # Keep the documented [1, seq_len, seq_len] rank even when
            # empty (the original returned a rank-2 (0, 0) tensor here,
            # inconsistent with the non-empty branch).
            return torch.empty((1, 0, 0), dtype=torch.bool, device=pos.device)

        row_types = pos.view(n, 1)
        col_types = pos.view(1, n)

        row_idx = torch.arange(n, device=pos.device).view(n, 1)
        col_idx = torch.arange(n, device=pos.device).view(1, n)
        causal_mask = col_idx <= row_idx

        # Content tokens only attend to previous content
        mask_content = (row_types == 1) & (col_types == 1) & causal_mask

        # [CLS] attends to all non-pad tokens (causal)
        mask_cls = (row_types == 2) & (col_types != 0) & causal_mask

        type_mask = mask_content | mask_cls
        return type_mask.unsqueeze(0)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# Task names of the datasets the Querit reranker was trained on,
# listed alphabetically with their public sources.
querit_reranker_training_data = {
    "AskUbuntuDupQuestions",  # https://github.com/taolei87/askubuntu & The corpus and queries that overlap with mteb/askubuntudupquestions-reranking have been removed.
    "CQADupStack",  # https://huggingface.co/datasets/mteb/cqadupstack-*
    # NOTE(review): key reads "MIRACLRanking" but the linked dataset is
    # MIRACLReranking — confirm the intended task name.
    "MIRACLRanking",  # https://huggingface.co/datasets/mteb/MIRACLReranking
    "MSMARCO",  # https://huggingface.co/datasets/mteb/msmarco
    "MindSmallReranking",  # https://huggingface.co/datasets/mteb/MindSmallReranking
    "MrTidyRetrieval",  # https://huggingface.co/datasets/mteb/mrtidy
    "MultiLongDocReranking",  # https://huggingface.co/datasets/Shitao/MLDR
    "T2Reranking",  # https://huggingface.co/datasets/THUIR/T2Ranking & The corpus and queries that overlap with mteb/T2Reranking have been removed.
    "ruri-v3-dataset-reranker",  # https://huggingface.co/datasets/cl-nagoya/ruri-v3-dataset-reranker
}
|
|
219
|
+
|
|
220
|
+
# Registry entry for the Querit cross-encoder reranker.
model_meta = ModelMeta(
    loader=QueritWrapper,
    loader_kwargs={
        "fp_options": "bfloat16",  # loaded in bf16 via torch_dtype
    },
    name="Querit/Querit",
    model_type=["cross-encoder"],
    languages=["eng-Latn"],
    open_weights=True,
    revision="5ad2649cc4defb7e1361262260e9a781f14b08bc",
    release_date="2026-01-24",
    n_parameters=4919636992,
    n_embedding_parameters=131907584,
    embed_dim=1024,
    memory_usage_mb=9383.0,
    max_tokens=4096,
    reference="https://huggingface.co/Querit/Querit",
    similarity_fn_name=None,  # cross-encoder: scores pairs, no embedding similarity
    training_datasets=querit_reranker_training_data,
    license="apache-2.0",
    framework=["PyTorch"],
    use_instructions=None,
    public_training_code=None,
    public_training_data=None,
    citation=None,
)
|
|
@@ -914,7 +914,7 @@ class BibleNLPBitextMining(AbsTaskBitextMining):
|
|
|
914
914
|
self.dataset_transform()
|
|
915
915
|
self.data_loaded = True
|
|
916
916
|
|
|
917
|
-
def dataset_transform(self) -> None:
|
|
917
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
918
918
|
# Convert to standard format
|
|
919
919
|
for lang in self.hf_subsets:
|
|
920
920
|
l1, l2 = (l.split("_")[0] for l in lang.split("-"))
|
|
@@ -43,7 +43,7 @@ Islam, Tanvir},
|
|
|
43
43
|
superseded_by="BengaliDocumentClassification.v2",
|
|
44
44
|
)
|
|
45
45
|
|
|
46
|
-
def dataset_transform(self) -> None:
|
|
46
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
47
47
|
self.dataset = self.dataset.rename_columns(
|
|
48
48
|
{"article": "text", "category": "label"}
|
|
49
49
|
)
|
|
@@ -92,7 +92,7 @@ Islam, Tanvir},
|
|
|
92
92
|
""",
|
|
93
93
|
)
|
|
94
94
|
|
|
95
|
-
def dataset_transform(self) -> None:
|
|
95
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
96
96
|
self.dataset = self.stratified_subsampling(
|
|
97
97
|
self.dataset, seed=self.seed, splits=["test"]
|
|
98
98
|
)
|
|
@@ -46,7 +46,7 @@ Montoyo, Andres},
|
|
|
46
46
|
)
|
|
47
47
|
samples_per_label = 16
|
|
48
48
|
|
|
49
|
-
def dataset_transform(self) -> None:
|
|
49
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
50
50
|
self.dataset = self.dataset.rename_columns(
|
|
51
51
|
{"comment": "text", "rating_str": "label"}
|
|
52
52
|
)
|
|
@@ -99,7 +99,7 @@ Montoyo, Andres},
|
|
|
99
99
|
)
|
|
100
100
|
samples_per_label = 16
|
|
101
101
|
|
|
102
|
-
def dataset_transform(self) -> None:
|
|
102
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
103
103
|
self.dataset = self.stratified_subsampling(
|
|
104
104
|
self.dataset, seed=self.seed, splits=["test"]
|
|
105
105
|
)
|
|
@@ -46,7 +46,7 @@ Montoyo, Andres},
|
|
|
46
46
|
)
|
|
47
47
|
samples_per_label = 16
|
|
48
48
|
|
|
49
|
-
def dataset_transform(self) -> None:
|
|
49
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
50
50
|
self.dataset = self.dataset.rename_columns(
|
|
51
51
|
{"comment": "text", "sentiment_int": "label"}
|
|
52
52
|
)
|
|
@@ -60,7 +60,7 @@ class HinDialectClassification(AbsTaskClassification):
|
|
|
60
60
|
""",
|
|
61
61
|
)
|
|
62
62
|
|
|
63
|
-
def dataset_transform(self) -> None:
|
|
63
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
64
64
|
self.dataset = self.dataset.rename_columns(
|
|
65
65
|
{"folksong": "text", "language": "label"}
|
|
66
66
|
)
|
|
@@ -137,6 +137,6 @@ Okazaki, Naoaki},
|
|
|
137
137
|
self.dataset_transform()
|
|
138
138
|
self.data_loaded = True
|
|
139
139
|
|
|
140
|
-
def dataset_transform(self) -> None:
|
|
140
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
141
141
|
self.dataset = self.dataset.remove_columns(["language", "script"])
|
|
142
142
|
self.dataset = self.dataset.rename_columns({"native sentence": "text"})
|
|
@@ -52,7 +52,7 @@ class IndicSentimentClassification(AbsTaskClassification):
|
|
|
52
52
|
""",
|
|
53
53
|
)
|
|
54
54
|
|
|
55
|
-
def dataset_transform(self) -> None:
|
|
55
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
56
56
|
label_map = {"Negative": 0, "Positive": 1}
|
|
57
57
|
# Convert to standard format
|
|
58
58
|
for lang in self.hf_subsets:
|
|
@@ -66,7 +66,7 @@ in Natural Language Processing},
|
|
|
66
66
|
""",
|
|
67
67
|
)
|
|
68
68
|
|
|
69
|
-
def dataset_transform(self) -> None:
|
|
69
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
70
70
|
self.dataset = self.dataset.rename_columns({"labels": "label"})
|
|
71
71
|
self.dataset = self.stratified_subsampling(
|
|
72
72
|
self.dataset, seed=self.seed, splits=["test"]
|
|
@@ -49,7 +49,7 @@ class SouthAfricanLangClassification(AbsTaskClassification):
|
|
|
49
49
|
""",
|
|
50
50
|
)
|
|
51
51
|
|
|
52
|
-
def dataset_transform(self) -> None:
|
|
52
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
53
53
|
self.dataset = self.dataset.rename_columns(
|
|
54
54
|
{" text": "text", "lang_id": "label"}
|
|
55
55
|
)
|
|
@@ -35,7 +35,7 @@ class SlovakMovieReviewSentimentClassification(AbsTaskClassification):
|
|
|
35
35
|
superseded_by="SlovakMovieReviewSentimentClassification.v2",
|
|
36
36
|
)
|
|
37
37
|
|
|
38
|
-
def dataset_transform(self) -> None:
|
|
38
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
39
39
|
self.dataset = self.dataset.rename_columns({"comment": "text"})
|
|
40
40
|
|
|
41
41
|
self.dataset = self.stratified_subsampling(
|
|
@@ -76,7 +76,7 @@ class SlovakMovieReviewSentimentClassificationV2(AbsTaskClassification):
|
|
|
76
76
|
adapted_from=["SlovakMovieReviewSentimentClassification"],
|
|
77
77
|
)
|
|
78
78
|
|
|
79
|
-
def dataset_transform(self) -> None:
|
|
79
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
80
80
|
self.dataset = self.stratified_subsampling(
|
|
81
81
|
self.dataset, seed=self.seed, splits=["test"]
|
|
82
82
|
)
|
|
@@ -37,7 +37,7 @@ class SwahiliNewsClassification(AbsTaskClassification):
|
|
|
37
37
|
superseded_by="SwahiliNewsClassification.v2",
|
|
38
38
|
)
|
|
39
39
|
|
|
40
|
-
def dataset_transform(self) -> None:
|
|
40
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
41
41
|
self.dataset = self.dataset.rename_columns(
|
|
42
42
|
{"content": "text", "category": "label"}
|
|
43
43
|
)
|
|
@@ -81,7 +81,7 @@ class SwahiliNewsClassificationV2(AbsTaskClassification):
|
|
|
81
81
|
adapted_from=["SwahiliNewsClassification"],
|
|
82
82
|
)
|
|
83
83
|
|
|
84
|
-
def dataset_transform(self) -> None:
|
|
84
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
85
85
|
self.dataset = self.stratified_subsampling(
|
|
86
86
|
self.dataset, seed=self.seed, splits=["train"]
|
|
87
87
|
)
|
|
@@ -63,7 +63,7 @@ class TenKGnadClusteringP2PFast(AbsTaskClustering):
|
|
|
63
63
|
adapted_from=["TenKGnadClusteringP2P"],
|
|
64
64
|
)
|
|
65
65
|
|
|
66
|
-
def dataset_transform(self) -> None:
|
|
66
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
67
67
|
ds = _convert_to_fast(
|
|
68
68
|
self.dataset, self.input_column_name, self.label_column_name, self.seed
|
|
69
69
|
)
|
|
@@ -63,7 +63,7 @@ class TenKGnadClusteringS2SFast(AbsTaskClustering):
|
|
|
63
63
|
adapted_from=["TenKGnadClusteringS2S"],
|
|
64
64
|
)
|
|
65
65
|
|
|
66
|
-
def dataset_transform(self) -> None:
|
|
66
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
67
67
|
ds = _convert_to_fast(
|
|
68
68
|
self.dataset, self.input_column_name, self.label_column_name, self.seed
|
|
69
69
|
)
|
|
@@ -45,7 +45,7 @@ class VGHierarchicalClusteringP2P(AbsTaskClustering):
|
|
|
45
45
|
prompt="Identify the categories (e.g. sports) of given articles in Norwegian",
|
|
46
46
|
)
|
|
47
47
|
|
|
48
|
-
def dataset_transform(self) -> None:
|
|
48
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
49
49
|
self.dataset = self.dataset.rename_columns(
|
|
50
50
|
{"article": "sentences", "classes": "labels"}
|
|
51
51
|
)
|
|
@@ -92,7 +92,7 @@ class VGHierarchicalClusteringS2S(AbsTaskClustering):
|
|
|
92
92
|
prompt="Identify the categories (e.g. sports) of given articles in Norwegian",
|
|
93
93
|
)
|
|
94
94
|
|
|
95
|
-
def dataset_transform(self) -> None:
|
|
95
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
96
96
|
self.dataset = self.dataset.rename_columns(
|
|
97
97
|
{"ingress": "sentences", "classes": "labels"}
|
|
98
98
|
)
|
|
@@ -60,7 +60,7 @@ class PubChemWikiPairClassification(AbsTaskPairClassification):
|
|
|
60
60
|
""",
|
|
61
61
|
)
|
|
62
62
|
|
|
63
|
-
def dataset_transform(self) -> None:
|
|
63
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
64
64
|
_dataset = {}
|
|
65
65
|
for lang in self.hf_subsets:
|
|
66
66
|
_dataset[lang] = {}
|
|
@@ -59,7 +59,7 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
|
|
|
59
59
|
self.dataset_transform()
|
|
60
60
|
self.data_loaded = True
|
|
61
61
|
|
|
62
|
-
def dataset_transform(self) -> None:
|
|
62
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
63
63
|
"""And transform to a retrieval dataset, which have the following attributes
|
|
64
64
|
|
|
65
65
|
self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
|
|
@@ -116,7 +116,7 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
|
|
|
116
116
|
self.dataset_transform()
|
|
117
117
|
self.data_loaded = True
|
|
118
118
|
|
|
119
|
-
def dataset_transform(self) -> None:
|
|
119
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
120
120
|
"""And transform to a retrieval dataset, which have the following attributes
|
|
121
121
|
|
|
122
122
|
self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
|
|
@@ -176,7 +176,7 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
|
|
|
176
176
|
self.dataset_transform()
|
|
177
177
|
self.data_loaded = True
|
|
178
178
|
|
|
179
|
-
def dataset_transform(self) -> None:
|
|
179
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
180
180
|
"""And transform to a retrieval dataset, which have the following attributes
|
|
181
181
|
|
|
182
182
|
self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
|
|
@@ -233,7 +233,7 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval):
|
|
|
233
233
|
self.dataset_transform()
|
|
234
234
|
self.data_loaded = True
|
|
235
235
|
|
|
236
|
-
def dataset_transform(self) -> None:
|
|
236
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
237
237
|
"""And transform to a retrieval dataset, which have the following attributes
|
|
238
238
|
|
|
239
239
|
self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
|
|
@@ -55,7 +55,7 @@ Derczynski, Leon},
|
|
|
55
55
|
self.dataset_transform()
|
|
56
56
|
self.data_loaded = True
|
|
57
57
|
|
|
58
|
-
def dataset_transform(self) -> None:
|
|
58
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
59
59
|
"""And transform to a retrieval dataset, which have the following attributes
|
|
60
60
|
|
|
61
61
|
self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
|
|
@@ -68,7 +68,7 @@ Piperidis, Stelios},
|
|
|
68
68
|
self.dataset_transform()
|
|
69
69
|
self.data_loaded = True
|
|
70
70
|
|
|
71
|
-
def dataset_transform(self) -> None:
|
|
71
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
72
72
|
"""And transform to a retrieval dataset, which have the following attributes
|
|
73
73
|
|
|
74
74
|
self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
|
|
@@ -44,7 +44,7 @@ class TwitterHjerneRetrieval(AbsTaskRetrieval):
|
|
|
44
44
|
self.dataset_transform()
|
|
45
45
|
self.data_loaded = True
|
|
46
46
|
|
|
47
|
-
def dataset_transform(self) -> None:
|
|
47
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
48
48
|
"""And transform to a retrieval dataset, which have the following attributes
|
|
49
49
|
|
|
50
50
|
self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
|
|
@@ -58,7 +58,7 @@ Fishel, Mark},
|
|
|
58
58
|
self.dataset_transform()
|
|
59
59
|
self.data_loaded = True
|
|
60
60
|
|
|
61
|
-
def dataset_transform(self) -> None:
|
|
61
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
62
62
|
"""And transform to a retrieval dataset, which have the following attributes
|
|
63
63
|
|
|
64
64
|
self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
|
|
@@ -45,7 +45,7 @@ class SNLRetrieval(AbsTaskRetrieval):
|
|
|
45
45
|
self.dataset_transform()
|
|
46
46
|
self.data_loaded = True
|
|
47
47
|
|
|
48
|
-
def dataset_transform(self) -> None:
|
|
48
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
49
49
|
"""And transform to a retrieval dataset, which have the following attributes
|
|
50
50
|
|
|
51
51
|
self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
|
|
@@ -66,6 +66,6 @@ Seid Muhie Yimam and Saif M. Mohammad},
|
|
|
66
66
|
min_score = 0
|
|
67
67
|
max_score = 1
|
|
68
68
|
|
|
69
|
-
def dataset_transform(self) -> None:
|
|
69
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
70
70
|
for lang, subset in self.dataset.items():
|
|
71
71
|
self.dataset[lang] = subset.rename_column("label", "score")
|
|
@@ -56,6 +56,6 @@ class STSBenchmarkMultilingualSTS(AbsTaskSTS):
|
|
|
56
56
|
min_score = 0
|
|
57
57
|
max_score = 5
|
|
58
58
|
|
|
59
|
-
def dataset_transform(self) -> None:
|
|
59
|
+
def dataset_transform(self, num_proc: int = 1, **kwargs) -> None:
|
|
60
60
|
for lang, subset in self.dataset.items():
|
|
61
61
|
self.dataset[lang] = subset.rename_column("similarity_score", "score")
|
mteb/tasks/sts/por/assin2_sts.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mteb
|
|
3
|
-
Version: 2.7.
|
|
3
|
+
Version: 2.7.13
|
|
4
4
|
Summary: Massive Text Embedding Benchmark
|
|
5
5
|
Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
|
|
6
6
|
Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
|
|
@@ -32,8 +32,6 @@ Requires-Dist: rich>=0.0.0
|
|
|
32
32
|
Requires-Dist: pytrec-eval-terrier>=0.5.6
|
|
33
33
|
Requires-Dist: pydantic>=2.0.0
|
|
34
34
|
Requires-Dist: polars>=0.20.22
|
|
35
|
-
Requires-Dist: torch; python_full_version < "3.14"
|
|
36
|
-
Requires-Dist: torch>=2.9.0; python_full_version >= "3.14"
|
|
37
35
|
Provides-Extra: image
|
|
38
36
|
Requires-Dist: torchvision>0.2.1; extra == "image"
|
|
39
37
|
Requires-Dist: transformers[torch-vision,vision]; extra == "image"
|