mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +17 -18
- mteb/_evaluators/any_sts_evaluator.py +3 -3
- mteb/_evaluators/clustering_evaluator.py +2 -2
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
- mteb/_evaluators/pair_classification_evaluator.py +5 -3
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +11 -10
- mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
- mteb/_evaluators/text/summarization_evaluator.py +23 -18
- mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -10
- mteb/abstasks/clustering.py +19 -15
- mteb/abstasks/clustering_legacy.py +10 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +23 -19
- mteb/abstasks/pair_classification.py +20 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +8 -5
- mteb/abstasks/task_metadata.py +31 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/benchmark.py +4 -2
- mteb/benchmarks/benchmarks/__init__.py +4 -0
- mteb/benchmarks/benchmarks/benchmarks.py +112 -11
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +182 -29
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +63 -49
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +44 -33
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +162 -34
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +10 -6
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +21 -3
- mteb/models/instruct_wrapper.py +28 -8
- mteb/models/model_implementations/align_models.py +1 -1
- mteb/models/model_implementations/andersborges.py +4 -4
- mteb/models/model_implementations/ara_models.py +1 -1
- mteb/models/model_implementations/arctic_models.py +8 -8
- mteb/models/model_implementations/b1ade_models.py +1 -1
- mteb/models/model_implementations/bge_models.py +45 -21
- mteb/models/model_implementations/bica_model.py +3 -3
- mteb/models/model_implementations/blip2_models.py +2 -2
- mteb/models/model_implementations/blip_models.py +16 -16
- mteb/models/model_implementations/bm25.py +4 -4
- mteb/models/model_implementations/bmretriever_models.py +6 -4
- mteb/models/model_implementations/cadet_models.py +1 -1
- mteb/models/model_implementations/cde_models.py +11 -4
- mteb/models/model_implementations/clip_models.py +6 -6
- mteb/models/model_implementations/clips_models.py +3 -3
- mteb/models/model_implementations/codefuse_models.py +5 -5
- mteb/models/model_implementations/codesage_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +5 -5
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/colpali_models.py +3 -3
- mteb/models/model_implementations/colqwen_models.py +8 -8
- mteb/models/model_implementations/colsmol_models.py +2 -2
- mteb/models/model_implementations/conan_models.py +1 -1
- mteb/models/model_implementations/dino_models.py +42 -42
- mteb/models/model_implementations/e5_instruct.py +23 -4
- mteb/models/model_implementations/e5_models.py +9 -9
- mteb/models/model_implementations/e5_v.py +6 -6
- mteb/models/model_implementations/eagerworks_models.py +1 -1
- mteb/models/model_implementations/emillykkejensen_models.py +6 -6
- mteb/models/model_implementations/en_code_retriever.py +1 -1
- mteb/models/model_implementations/euler_models.py +2 -2
- mteb/models/model_implementations/fa_models.py +9 -9
- mteb/models/model_implementations/facebookai.py +14 -2
- mteb/models/model_implementations/geogpt_models.py +1 -1
- mteb/models/model_implementations/gme_v_models.py +6 -5
- mteb/models/model_implementations/google_models.py +1 -1
- mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
- mteb/models/model_implementations/gritlm_models.py +2 -2
- mteb/models/model_implementations/gte_models.py +25 -13
- mteb/models/model_implementations/hinvec_models.py +1 -1
- mteb/models/model_implementations/ibm_granite_models.py +30 -6
- mteb/models/model_implementations/inf_models.py +2 -2
- mteb/models/model_implementations/jasper_models.py +2 -2
- mteb/models/model_implementations/jina_clip.py +48 -10
- mteb/models/model_implementations/jina_models.py +18 -11
- mteb/models/model_implementations/kblab.py +12 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
- mteb/models/model_implementations/kfst.py +1 -1
- mteb/models/model_implementations/kowshik24_models.py +1 -1
- mteb/models/model_implementations/lgai_embedding_models.py +1 -1
- mteb/models/model_implementations/linq_models.py +1 -1
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +6 -6
- mteb/models/model_implementations/llm2vec_models.py +8 -8
- mteb/models/model_implementations/mcinext_models.py +4 -1
- mteb/models/model_implementations/mdbr_models.py +17 -3
- mteb/models/model_implementations/misc_models.py +68 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +1 -1
- mteb/models/model_implementations/moco_models.py +4 -4
- mteb/models/model_implementations/mod_models.py +1 -1
- mteb/models/model_implementations/model2vec_models.py +14 -14
- mteb/models/model_implementations/moka_models.py +1 -1
- mteb/models/model_implementations/nbailab.py +3 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
- mteb/models/model_implementations/nomic_models.py +30 -15
- mteb/models/model_implementations/nomic_models_vision.py +1 -1
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
- mteb/models/model_implementations/nvidia_models.py +151 -19
- mteb/models/model_implementations/octen_models.py +61 -2
- mteb/models/model_implementations/openclip_models.py +13 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
- mteb/models/model_implementations/ops_moa_models.py +1 -1
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +1 -1
- mteb/models/model_implementations/piccolo_models.py +1 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +4 -4
- mteb/models/model_implementations/pylate_models.py +10 -9
- mteb/models/model_implementations/qodo_models.py +2 -2
- mteb/models/model_implementations/qtack_models.py +1 -1
- mteb/models/model_implementations/qwen3_models.py +3 -3
- mteb/models/model_implementations/qzhou_models.py +2 -2
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +2 -2
- mteb/models/model_implementations/reasonir_model.py +1 -1
- mteb/models/model_implementations/repllama_models.py +3 -3
- mteb/models/model_implementations/rerankers_custom.py +12 -6
- mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
- mteb/models/model_implementations/richinfoai_models.py +1 -1
- mteb/models/model_implementations/ru_sentence_models.py +20 -20
- mteb/models/model_implementations/ruri_models.py +10 -10
- mteb/models/model_implementations/salesforce_models.py +3 -3
- mteb/models/model_implementations/samilpwc_models.py +1 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
- mteb/models/model_implementations/searchmap_models.py +1 -1
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
- mteb/models/model_implementations/sentence_transformers_models.py +124 -22
- mteb/models/model_implementations/shuu_model.py +1 -1
- mteb/models/model_implementations/siglip_models.py +20 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
- mteb/models/model_implementations/stella_models.py +17 -4
- mteb/models/model_implementations/tarka_models.py +2 -2
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +1 -1
- mteb/models/model_implementations/uae_models.py +7 -1
- mteb/models/model_implementations/vdr_models.py +1 -1
- mteb/models/model_implementations/vi_vn_models.py +6 -6
- mteb/models/model_implementations/vlm2vec_models.py +3 -3
- mteb/models/model_implementations/voyage_models.py +84 -0
- mteb/models/model_implementations/voyage_v.py +9 -7
- mteb/models/model_implementations/youtu_models.py +1 -1
- mteb/models/model_implementations/yuan_models.py +1 -1
- mteb/models/model_implementations/yuan_models_en.py +1 -1
- mteb/models/model_meta.py +80 -31
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
- mteb/models/search_wrappers.py +33 -18
- mteb/models/sentence_transformer_wrapper.py +50 -25
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +29 -21
- mteb/results/model_result.py +52 -22
- mteb/results/task_result.py +80 -58
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +12 -0
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/_evaluators/text/summarization_evaluator.py
CHANGED
@@ -1,6 +1,6 @@
 import logging
 import sys
-from typing import
+from typing import TypedDict
 
 import numpy as np
 import torch
@@ -12,6 +12,7 @@ from mteb._evaluators.evaluator import Evaluator
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
 from mteb.similarity_functions import cos_sim, dot_score
+from mteb.types import EncodeKwargs
 
 # if later than python 3.13 use typing module
 if sys.version_info >= (3, 13):
@@ -94,7 +95,7 @@ class SummarizationEvaluator(Evaluator):
         self,
         model: EncoderProtocol,
         *,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
     ) -> SummarizationDistances:
         # Get the human & machine summaries for the text in one go for all
         human_lens = [len(human_summaries) for human_summaries in self.human_summaries]
@@ -135,10 +136,10 @@ class SummarizationEvaluator(Evaluator):
         )
 
         # Split the embeddings into the original human & machine summaries
-
+        embs_human_summaries_all_split = np.split(
             embs_human_summaries_all, np.cumsum(human_lens)[:-1]
         )
-
+        embs_machine_summaries_all_split = np.split(
             embs_machine_summaries_all, np.cumsum(machine_lens)[:-1]
         )
 
@@ -148,7 +149,9 @@ class SummarizationEvaluator(Evaluator):
         all_human_scores = []
 
         for i, (embs_human_summaries, embs_machine_summaries) in tqdm(
-            enumerate(
+            enumerate(
+                zip(embs_human_summaries_all_split, embs_machine_summaries_all_split)
+            ),
             desc="Scoring",
             total=len(self.human_summaries),
         ):
@@ -164,7 +167,7 @@ class SummarizationEvaluator(Evaluator):
             dot_scores = dot_score(emb_machine_summary, embs_human_summaries)
 
             _sim_score = [
-                float(model.similarity(emb_machine_summary, emb_human_summary))
+                float(model.similarity(emb_machine_summary, emb_human_summary))
                 for emb_human_summary in embs_human_summaries
             ]
             sim_score = torch.tensor(_sim_score)
@@ -216,17 +219,19 @@ class SummarizationEvaluator(Evaluator):
             strict=True,
         ):
             cosine_spearman_scores.append(
-                spearmanr(human_scores, cosine_pred_scores).statistic
+                float(spearmanr(human_scores, cosine_pred_scores).statistic)
             )
             cosine_pearson_scores.append(
-                pearsonr(human_scores, cosine_pred_scores).statistic
+                float(pearsonr(human_scores, cosine_pred_scores).statistic)
             )
             dot_spearman_scores.append(
-                spearmanr(human_scores, dot_pred_scores).statistic
+                float(spearmanr(human_scores, dot_pred_scores).statistic)
+            )
+            dot_pearson_scores.append(
+                float(pearsonr(human_scores, dot_pred_scores).statistic)
             )
-
-
-            pearson_scores.append(pearsonr(human_scores, sim_scores).statistic)
+            spearman_scores.append(float(spearmanr(human_scores, sim_scores).statistic))
+            pearson_scores.append(float(pearsonr(human_scores, sim_scores).statistic))
 
         return SummarizationMetrics(
             pearson=float(np.mean(pearson_scores)),
@@ -273,10 +278,10 @@ class DeprecatedSummarizationEvaluator(SummarizationEvaluator):
             pearson_scores.append(pearsonr(human_scores, sim_scores))
 
         return SummarizationMetrics(
-            pearson=float(np.mean(pearson_scores)),
-            spearman=float(np.mean(spearman_scores)),
-            cosine_spearman=float(np.mean(cosine_spearman_scores)),
-            cosine_pearson=float(np.mean(cosine_pearson_scores)),
-            dot_pearson=float(np.mean(dot_pearson_scores)),
-            dot_spearman=float(np.mean(dot_spearman_scores)),
+            pearson=float(np.mean(pearson_scores)),  # type: ignore[arg-type]
+            spearman=float(np.mean(spearman_scores)),  # type: ignore[arg-type]
+            cosine_spearman=float(np.mean(cosine_spearman_scores)),  # type: ignore[arg-type]
+            cosine_pearson=float(np.mean(cosine_pearson_scores)),  # type: ignore[arg-type]
+            dot_pearson=float(np.mean(dot_pearson_scores)),  # type: ignore[arg-type]
+            dot_spearman=float(np.mean(dot_spearman_scores)),  # type: ignore[arg-type]
         )
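The `np.split` change above is the heart of the batching trick in `SummarizationEvaluator`: all human and machine summaries are encoded in one call, then the flat embedding matrix is cut back into per-document groups using the cumulative group lengths. A minimal sketch of the idiom, with illustrative shapes and names rather than mteb's API:

```python
import numpy as np

# Embeddings for 3 documents with 2, 3, and 1 summaries each,
# encoded in a single batch and stacked into one (6, dim) array.
lens = [2, 3, 1]
embs = np.arange(6 * 4, dtype=np.float32).reshape(6, 4)

# np.cumsum gives each group's end offset; dropping the last offset
# yields exactly the split points np.split expects.
groups = np.split(embs, np.cumsum(lens)[:-1])

assert [g.shape[0] for g in groups] == lens  # [2, 3, 1]
```

The same release also wraps each `spearmanr(...).statistic` / `pearsonr(...).statistic` in `float(...)`, so the score lists hold plain Python floats before `np.mean` is applied.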
mteb/_evaluators/zeroshot_classification_evaluator.py
CHANGED
@@ -1,5 +1,4 @@
 import logging
-from typing import Any
 
 from datasets import Dataset
 
@@ -10,7 +9,7 @@ from mteb._create_dataloaders import (
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
 from mteb.similarity_functions import similarity
-from mteb.types import Array
+from mteb.types import Array, EncodeKwargs
 
 from .evaluator import Evaluator
 
@@ -38,7 +37,10 @@ class ZeroShotClassificationEvaluator(Evaluator):
         self.hf_subset = hf_subset
 
     def __call__(
-        self,
+        self,
+        model: EncoderProtocol,
+        *,
+        encode_kwargs: EncodeKwargs,
    ) -> Array:
        dataloader = create_dataloader(
            self.dataset,
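This is the same signature tightening applied across the release: a bare or `dict[str, Any]`-typed `encode_kwargs` parameter becomes the `EncodeKwargs` TypedDict imported from `mteb.types`. A hedged sketch of why a TypedDict helps here; the field names below are illustrative assumptions, not the real `EncodeKwargs` definition:

```python
from typing import TypedDict


class EncodeKwargsSketch(TypedDict, total=False):
    # Hypothetical fields for illustration only; the actual
    # EncodeKwargs lives in mteb.types._encoder_io.
    batch_size: int
    show_progress_bar: bool


def encode_all(texts: list[str], *, encode_kwargs: EncodeKwargsSketch) -> None:
    # A type checker now rejects unknown keys and wrong value types,
    # which a plain dict[str, Any] would silently accept.
    batch_size = encode_kwargs.get("batch_size", 32)
    print(f"encoding {len(texts)} texts with batch_size={batch_size}")


encode_all(["a", "b"], encode_kwargs={"batch_size": 16})
```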
mteb/abstasks/_data_filter/filters.py
CHANGED
@@ -61,7 +61,7 @@ def filter_unclear_label(
    for text, label in zip(ds[input_column], ds[label_column]):
        key = text.strip().lower()
        normalized.setdefault(key, set()).add(
-            label if isinstance(label, (str, int, float)) else tuple(label)
+            label if isinstance(label, (str, int, float)) else tuple(label)  # type: ignore[arg-type]
        )
 
    bad_texts = {t for t, labels in normalized.items() if len(labels) > 1}
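The one-line change only appeases the type checker; the surrounding logic is the interesting part: texts are normalized (stripped, lowercased) and any normalized form that maps to more than one label is dropped as ambiguous. A standalone sketch of that dedup idea, with names assumed from the visible context:

```python
def drop_ambiguous(texts: list[str], labels: list[str]) -> list[tuple[str, str]]:
    normalized: dict[str, set[str]] = {}
    for text, label in zip(texts, labels):
        normalized.setdefault(text.strip().lower(), set()).add(label)

    # A text is "unclear" when its normalized form carries conflicting
    # labels; every copy of such a text is removed.
    bad = {key for key, found in normalized.items() if len(found) > 1}
    return [(t, l) for t, l in zip(texts, labels) if t.strip().lower() not in bad]


assert drop_ambiguous(["Hi", "hi ", "bye"], ["pos", "neg", "neg"]) == [("bye", "neg")]
```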
mteb/abstasks/_statistics_calculation.py
CHANGED
@@ -2,7 +2,8 @@ from __future__ import annotations
 
 import hashlib
 from collections import Counter
-from
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, cast
 
 from mteb.types import TopRankedDocumentsType
 from mteb.types.statistics import (
@@ -52,7 +53,7 @@ def calculate_image_statistics(images: list[Image.Image]) -> ImageStatistics:
    seen_hashes: set[str] = set()
 
    for img in images:
-        width, height = img.size
+        width, height = img.size
        img_heights.append(height)
        img_widths.append(width)
 
@@ -82,17 +83,24 @@ def calculate_label_statistics(labels: list[int | list[int]]) -> LabelStatistics
        LabelStatistics: A dictionary containing the descriptive statistics.
 
    """
+    total_labels: list[int | None] = []
+
    if not isinstance(labels[0], list):
-
-
-
+        # single label classification
+        single_label = cast(list[int], labels)
+        label_len = [1] * len(single_label)
+        total_label_len = len(single_label)
+        total_labels.extend(single_label)
    elif isinstance(labels[0], list):
        # multilabel classification
-
+        multilabel_labels = cast(list[list[int]], labels)
+        label_len = [len(l) for l in multilabel_labels]
        total_label_len = sum(label_len)
-
-
-
+        for l in multilabel_labels:
+            if l and len(l) > 0:
+                total_labels.extend(l)
+            else:
+                total_labels.append(None)
    else:
        raise ValueError(
            "Labels must be a list of integers or a list of lists of integers."
@@ -159,7 +167,7 @@ def calculate_top_ranked_statistics(
 
 
 def calculate_relevant_docs_statistics(
-    relevant_docs:
+    relevant_docs: Mapping[str, Mapping[str, int]],
 ) -> RelevantDocsStatistics:
    qrels_lengths = [len(relevant_docs[qid]) for qid in relevant_docs]
    unique_qrels = len({doc for qid in relevant_docs for doc in relevant_docs[qid]})
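The rewritten `calculate_label_statistics` relies on `typing.cast` to narrow `list[int | list[int]]` to the branch-specific element type after the runtime `isinstance` check, which the type checker cannot infer on its own. A small sketch of the same narrowing pattern (simplified; not the full statistics code):

```python
from typing import cast


def label_lengths(labels: list[int | list[int]]) -> list[int]:
    if not isinstance(labels[0], list):
        # Single-label: cast tells the checker every element is an int.
        single = cast(list[int], labels)
        return [1] * len(single)
    # Multilabel: each element is itself a list of label ids.
    multi = cast(list[list[int]], labels)
    return [len(l) for l in multi]


assert label_lengths([1, 2, 3]) == [1, 1, 1]
assert label_lengths([[1, 2], [], [3]]) == [2, 0, 1]
```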
mteb/abstasks/_stratification.py
CHANGED
@@ -39,6 +39,7 @@ Bibtex:
 """
 
 import itertools
+from typing import Any
 
 import numpy as np
 import scipy.sparse as sp
@@ -119,8 +120,10 @@ def _get_most_desired_combination(samples_with_combination: dict):
        if support_size == 0:
            continue
        if currently_chosen is None or (
-            best_number_of_combinations
-            and best_support_size
+            best_number_of_combinations is not None
+            and best_support_size is not None
+            and best_number_of_combinations < number_of_combinations
+            and best_support_size > support_size
        ):
            currently_chosen = combination
            best_number_of_combinations, best_support_size = (
@@ -162,7 +165,7 @@ class IterativeStratification(_BaseKFold):
        self._rng_state = check_random_state(random_state)
        need_shuffle = shuffle or random_state is not None
        self.order = order
-        super().__init__(
+        super().__init__(
            n_splits,
            shuffle=need_shuffle,
            random_state=self._rng_state if need_shuffle else None,
@@ -172,8 +175,7 @@ class IterativeStratification(_BaseKFold):
            self.percentage_per_fold = sample_distribution_per_fold
        else:
            self.percentage_per_fold = [
-                1 / float(self.n_splits)
-                for _ in range(self.n_splits)  # type: ignore
+                1 / float(self.n_splits) for _ in range(self.n_splits)
            ]
 
    def _prepare_stratification(
@@ -182,9 +184,9 @@ class IterativeStratification(_BaseKFold):
        list[list[int]],
        dict[int, bool],
        list[list[int]],
-        list[list[
-        dict[
-        list[list[
+        list[list[Any]],
+        dict[str, list[Any]],
+        list[list[Any]],
    ]:
        """Prepares variables for performing stratification
 
@@ -206,14 +208,14 @@ class IterativeStratification(_BaseKFold):
        """
        self.n_samples, self.n_labels = y.shape
        self.desired_samples_per_fold = np.array(
-            [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]
+            [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]
        )
        rows = sp.lil_matrix(y).rows
        rows_used = dict.fromkeys(range(self.n_samples), False)
        all_combinations = []
-        per_row_combinations = [[] for i in range(self.n_samples)]
-        samples_with_combination = {}
-        folds = [[] for _ in range(self.n_splits)]
+        per_row_combinations: list[list[Any]] = [[] for i in range(self.n_samples)]
+        samples_with_combination: dict[str, list[Any]] = {}
+        folds: list[list[int]] = [[] for _ in range(self.n_splits)]
 
        # for every row
        for sample_index, label_assignment in enumerate(rows):
@@ -229,21 +231,19 @@ class IterativeStratification(_BaseKFold):
            all_combinations.append(combination)
            per_row_combinations[sample_index].append(combination)
 
-        all_combinations = [list(x) for x in set(all_combinations)]
-
        self.desired_samples_per_combination_per_fold = {
            combination: np.array(
                [
                    len(evidence_for_combination) * self.percentage_per_fold[j]
-                    for j in range(self.n_splits)
+                    for j in range(self.n_splits)
                ]
            )
            for combination, evidence_for_combination in samples_with_combination.items()
        }
        return (
-            rows,
+            rows.tolist(),
            rows_used,
-            all_combinations,
+            [list(x) for x in set(all_combinations)],
            per_row_combinations,
            samples_with_combination,
            folds,
@@ -328,7 +328,7 @@ class IterativeStratification(_BaseKFold):
            per_row_combinations,
            samples_with_combination,
            folds,
-        ) = self._prepare_stratification(y)
+        ) = self._prepare_stratification(y)
 
        self._distribute_positive_evidence(
            rows_used, folds, samples_with_combination, per_row_combinations
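The `_get_most_desired_combination` fix replaces bare truthiness tests with explicit `is not None` checks and spells out the full comparison. The distinction matters because a legitimate value of `0` is falsy, so a truthiness test conflates "no best candidate yet" with "best candidate has value zero". A tiny sketch of the failure mode (variable name mirrors the diff):

```python
best_number_of_combinations = 0  # a real value from an earlier iteration

# Truthiness check: 0 is falsy, so the value is wrongly treated as unset.
print(bool(best_number_of_combinations))        # False
# Explicit None check distinguishes "unset" from "zero".
print(best_number_of_combinations is not None)  # True
```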
mteb/abstasks/abstask.py
CHANGED
@@ -1,10 +1,11 @@
 import json
 import logging
+import warnings
 from abc import ABC, abstractmethod
-from collections.abc import Sequence
+from collections.abc import Mapping, Sequence
 from copy import copy
 from pathlib import Path
-from typing import Any, cast
+from typing import Any, Literal, cast
 
 import numpy as np
 from datasets import ClassLabel, Dataset, DatasetDict, load_dataset
@@ -22,6 +23,7 @@ from mteb.models import (
    SearchProtocol,
 )
 from mteb.types import HFSubset, Modalities, ScoresDict
+from mteb.types._encoder_io import EncodeKwargs
 from mteb.types.statistics import DescriptiveStatistics, SplitDescriptiveStatistics
 
 logger = logging.getLogger(__name__)
@@ -78,8 +80,8 @@ class AbsTask(ABC):
    """
 
    metadata: TaskMetadata
-    abstask_prompt: str
-    _eval_splits:
+    abstask_prompt: str
+    _eval_splits: Sequence[str] | None = None
    dataset: dict[HFSubset, DatasetDict] | None = None
    data_loaded: bool = False
    hf_subsets: list[HFSubset]
@@ -102,9 +104,9 @@ class AbsTask(ABC):
    def check_if_dataset_is_superseded(self) -> None:
        """Check if the dataset is superseded by a newer version."""
        if self.superseded_by:
-
-
-            )
+            msg = f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}'. We recommend using the newer version of the dataset unless you are running a specific benchmark. See `get_task('{self.superseded_by}').metadata.description` to get a description of the task and changes."
+            logger.warning(msg)
+            warnings.warn(msg)
 
    def dataset_transform(self):
        """A transform operations applied to the dataset after loading.
@@ -120,10 +122,10 @@ class AbsTask(ABC):
        split: str = "test",
        subsets_to_run: list[HFSubset] | None = None,
        *,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
        prediction_folder: Path | None = None,
        **kwargs: Any,
-    ) ->
+    ) -> Mapping[HFSubset, ScoresDict]:
        """Evaluates an MTEB compatible model on the task.
 
        Args:
@@ -195,12 +197,12 @@ class AbsTask(ABC):
    @abstractmethod
    def _evaluate_subset(
        self,
-        model:
+        model: MTEBModels,
        data_split: Dataset,
        *,
-        encode_kwargs: dict[str, Any],
        hf_split: str,
        hf_subset: str,
+        encode_kwargs: EncodeKwargs,
        prediction_folder: Path | None = None,
        **kwargs: Any,
    ) -> ScoresDict:
@@ -210,7 +212,7 @@ class AbsTask(ABC):
 
    def _save_task_predictions(
        self,
-        predictions:
+        predictions: Mapping[str, Any] | list[Any],
        model: MTEBModels,
        prediction_folder: Path,
        hf_split: str,
@@ -226,7 +228,7 @@ class AbsTask(ABC):
            hf_subset: The subset of the dataset (e.g. "en").
        """
        predictions_path = self._predictions_path(prediction_folder)
-        existing_results = {
+        existing_results: dict[str, Any] = {
            "mteb_model_meta": {
                "model_name": model.mteb_model_meta.name,
                "revision": model.mteb_model_meta.revision,
@@ -326,7 +328,7 @@ class AbsTask(ABC):
            )
        else:
            # some of monolingual datasets explicitly adding the split name to the dataset name
-            self.dataset = load_dataset(**self.metadata.dataset)
+            self.dataset = load_dataset(**self.metadata.dataset)
        self.dataset_transform()
        self.data_loaded = True
 
@@ -362,15 +364,19 @@ class AbsTask(ABC):
        """
        from mteb.abstasks import AbsTaskClassification
 
-
+        existing_stats = self.metadata.descriptive_stats
+
+        if existing_stats is not None and not overwrite_results:
            logger.info("Loading metadata descriptive statistics from cache.")
-            return
+            return existing_stats
 
        if not self.data_loaded:
            self.load_data()
 
        descriptive_stats: dict[str, DescriptiveStatistics] = {}
-        hf_subset_stat
+        hf_subset_stat: Literal["hf_subset_descriptive_stats"] = (
+            "hf_subset_descriptive_stats"
+        )
        eval_splits = self.metadata.eval_splits
        if isinstance(self, AbsTaskClassification):
            eval_splits.append(self.train_split)
@@ -381,7 +387,7 @@ class AbsTask(ABC):
            logger.info(f"Processing metadata for split {split}")
            if self.metadata.is_multilingual:
                descriptive_stats[split] = (
-                    self._calculate_descriptive_statistics_from_split(
+                    self._calculate_descriptive_statistics_from_split(  # type: ignore[assignment]
                        split, compute_overall=True
                    )
                )
@@ -400,7 +406,7 @@ class AbsTask(ABC):
                descriptive_stats[split][hf_subset_stat][hf_subset] = split_details
            else:
                split_details = self._calculate_descriptive_statistics_from_split(split)
-                descriptive_stats[split] = split_details
+                descriptive_stats[split] = split_details  # type: ignore[assignment]
 
        with self.metadata.descriptive_stat_path.open("w") as f:
            json.dump(descriptive_stats, f, indent=4)
@@ -437,7 +443,7 @@ class AbsTask(ABC):
 
        return self.metadata.languages
 
-    def filter_eval_splits(self, eval_splits:
+    def filter_eval_splits(self, eval_splits: Sequence[str] | None) -> Self:
        """Filter the evaluation splits of the task.
 
        Args:
@@ -451,9 +457,9 @@ class AbsTask(ABC):
 
    def filter_languages(
        self,
-        languages:
-        script:
-        hf_subsets:
+        languages: Sequence[str] | None,
+        script: Sequence[str] | None = None,
+        hf_subsets: Sequence[HFSubset] | None = None,
        exclusive_language_filter: bool = False,
    ) -> Self:
        """Filter the languages of the task.
@@ -499,12 +505,14 @@ class AbsTask(ABC):
        self.hf_subsets = subsets_to_keep
        return self
 
-    def _add_main_score(self, scores:
+    def _add_main_score(self, scores: ScoresDict) -> None:
        scores["main_score"] = scores[self.metadata.main_score]
 
    def _upload_dataset_to_hub(
        self, repo_name: str, fields: list[str] | dict[str, str]
    ) -> None:
+        if self.dataset is None:
+            raise ValueError("Dataset not loaded")
        if self.metadata.is_multilingual:
            for config in self.metadata.eval_langs:
                logger.info(f"Converting {config} of {self.metadata.name}")
@@ -574,7 +582,7 @@ class AbsTask(ABC):
            return False
 
    @property
-    def eval_splits(self) ->
+    def eval_splits(self) -> Sequence[str]:
        """Returns the evaluation splits of the task."""
        if self._eval_splits:
            return self._eval_splits
@@ -607,9 +615,8 @@ class AbsTask(ABC):
            self.data_loaded = False
            logger.info(f"Unloaded dataset {self.metadata.name} from memory.")
        else:
-
-
-            )
+            msg = f"Dataset `{self.metadata.name}` is not loaded, cannot unload it."
+            logger.warning(msg)
 
    @property
    def superseded_by(self) -> str | None:
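A pattern repeated in this file (and in `aggregated_task.py` below) is building a warning message once and emitting it through both the module logger and Python's `warnings` machinery, which the new `import warnings` supports. A minimal sketch of that dual-reporting idiom:

```python
import logging
import warnings

logger = logging.getLogger(__name__)


def check_superseded(name: str, superseded_by: str | None) -> None:
    if superseded_by:
        msg = f"Dataset '{name}' is superseded by '{superseded_by}'."
        # logger.warning lands in configured log handlers; warnings.warn
        # additionally surfaces once per call site for interactive users
        # and is catchable in test runners.
        logger.warning(msg)
        warnings.warn(msg)


check_superseded("OldTask", "NewTask")
```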
mteb/abstasks/aggregate_task_metadata.py
CHANGED
@@ -5,7 +5,6 @@ from pydantic import ConfigDict, Field, model_validator
 from typing_extensions import Self
 
 from mteb.types import (
-    HFSubset,
    ISOLanguageScript,
    Languages,
    Licenses,
@@ -60,14 +59,7 @@ class AggregateTaskMetadata(TaskMetadata):
    reference: str | None = None
    bibtex_citation: str | None = None
 
-    @
-    def hf_subsets_to_langscripts(self) -> dict[HFSubset, list[ISOLanguageScript]]:
-        """Return a dictionary mapping huggingface subsets to languages."""
-        if isinstance(self.eval_langs, dict):
-            return self.eval_langs
-        return {"default": self.eval_langs}  # type: ignore
-
-    @model_validator(mode="after")  # type: ignore
+    @model_validator(mode="after")
    def _compute_unfilled_cases(self) -> Self:
        if not self.eval_langs:
            self.eval_langs = self._compute_eval_langs()
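With the `hf_subsets_to_langscripts` override gone, the class keeps a single `@model_validator(mode="after")` hook, now without `# type: ignore`. A hedged sketch of how such an after-validator fills unset fields on a constructed model (a simplified stand-in, not the real metadata class):

```python
from pydantic import BaseModel, model_validator
from typing_extensions import Self


class TaskMetaSketch(BaseModel):
    name: str
    eval_langs: list[str] = []

    @model_validator(mode="after")
    def _fill_defaults(self) -> Self:
        # Runs after field validation, so it can derive unset fields
        # from the ones already populated.
        if not self.eval_langs:
            self.eval_langs = ["eng-Latn"]
        return self


assert TaskMetaSketch(name="demo").eval_langs == ["eng-Latn"]
```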
mteb/abstasks/aggregated_task.py
CHANGED
@@ -1,14 +1,15 @@
 import logging
+import warnings
+from collections.abc import Mapping
 from pathlib import Path
 from typing import Any
 
 import numpy as np
 from datasets import Dataset, DatasetDict
-from typing_extensions import Self
 
 from mteb.models.models_protocols import MTEBModels
 from mteb.results.task_result import TaskResult
-from mteb.types import HFSubset, ScoresDict
+from mteb.types import EncodeKwargs, HFSubset, ScoresDict
 from mteb.types.statistics import DescriptiveStatistics
 
 from .abstask import AbsTask
@@ -32,7 +33,7 @@ class AbsTaskAggregate(AbsTask):
 
    def task_results_to_scores(
        self, task_results: list[TaskResult]
-    ) -> dict[str,
+    ) -> dict[str, Mapping[HFSubset, ScoresDict]]:
        """The function that aggregated scores. Can be redefined to allow for custom aggregations.
 
        Args:
@@ -41,7 +42,7 @@ class AbsTaskAggregate(AbsTask):
        Returns:
            A dictionary with the aggregated scores.
        """
-        scores = {}
+        scores: dict[str, Mapping[HFSubset, ScoresDict]] = {}
        subsets = (
            self.metadata.eval_langs.keys()
            if isinstance(self.metadata.eval_langs, dict)
@@ -113,40 +114,20 @@ class AbsTaskAggregate(AbsTask):
        )
        mteb_versions = {tr.mteb_version for tr in task_results}
        if len(mteb_versions) != 1:
-
-
-            )
+            msg = f"All tasks of {self.metadata.name} is not run using the same version. different versions found are: {mteb_versions}"
+            logger.warning(msg)
+            warnings.warn(msg)
        task_res.mteb_version = None
        task_res.mteb_version = task_results[0].mteb_version
        return task_res
 
-    def check_if_dataset_is_superseded(self) -> None:
-        """Check if the dataset is superseded by a newer version"""
-        if self.superseded_by:
-            logger.warning(
-                f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}', you might consider using the newer version of the dataset."
-            )
-
-    def filter_eval_splits(self, eval_splits: list[str] | None) -> Self:
-        """Filter the evaluation splits of the task.
-
-        Args:
-            eval_splits: List of splits to evaluate on. If None, all splits in metadata
-            are used.
-
-        Returns:
-            The task with filtered evaluation splits.
-        """
-        self._eval_splits = eval_splits
-        return self
-
    def evaluate(
        self,
        model: MTEBModels,
        split: str = "test",
        subsets_to_run: list[HFSubset] | None = None,
        *,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
        prediction_folder: Path | None = None,
        **kwargs: Any,
    ) -> dict[HFSubset, ScoresDict]:
@@ -160,7 +141,7 @@ class AbsTaskAggregate(AbsTask):
        self,
        model: MTEBModels,
        data_split: DatasetDict | Dataset,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
        **kwargs: Any,
    ) -> ScoresDict:
        raise NotImplementedError(
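`task_results_to_scores` now spells out its nested return type, `dict[str, Mapping[HFSubset, ScoresDict]]`: task name, then HF subset, then a score dictionary. A rough sketch of consuming that shape (illustrative only; the real method reads `TaskResult` objects):

```python
from collections.abc import Mapping

ScoresDictSketch = dict[str, float]  # stand-in for mteb.types.ScoresDict


def mean_main_score(per_task: dict[str, Mapping[str, ScoresDictSketch]]) -> float:
    # Average main_score over every task and every HF subset.
    mains = [
        scores["main_score"]
        for subsets in per_task.values()
        for scores in subsets.values()
    ]
    return sum(mains) / len(mains)


results = {
    "TaskA": {"default": {"main_score": 0.8}},
    "TaskB": {"eng": {"main_score": 0.6}, "fra": {"main_score": 0.7}},
}
assert abs(mean_main_score(results) - 0.7) < 1e-9
```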
|