mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +17 -18
- mteb/_evaluators/any_sts_evaluator.py +3 -3
- mteb/_evaluators/clustering_evaluator.py +2 -2
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
- mteb/_evaluators/pair_classification_evaluator.py +5 -3
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +11 -10
- mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
- mteb/_evaluators/text/summarization_evaluator.py +23 -18
- mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -10
- mteb/abstasks/clustering.py +19 -15
- mteb/abstasks/clustering_legacy.py +10 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +23 -19
- mteb/abstasks/pair_classification.py +20 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +8 -5
- mteb/abstasks/task_metadata.py +31 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/benchmark.py +4 -2
- mteb/benchmarks/benchmarks/__init__.py +4 -0
- mteb/benchmarks/benchmarks/benchmarks.py +112 -11
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +182 -29
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +63 -49
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +44 -33
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +162 -34
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +10 -6
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +21 -3
- mteb/models/instruct_wrapper.py +28 -8
- mteb/models/model_implementations/align_models.py +1 -1
- mteb/models/model_implementations/andersborges.py +4 -4
- mteb/models/model_implementations/ara_models.py +1 -1
- mteb/models/model_implementations/arctic_models.py +8 -8
- mteb/models/model_implementations/b1ade_models.py +1 -1
- mteb/models/model_implementations/bge_models.py +45 -21
- mteb/models/model_implementations/bica_model.py +3 -3
- mteb/models/model_implementations/blip2_models.py +2 -2
- mteb/models/model_implementations/blip_models.py +16 -16
- mteb/models/model_implementations/bm25.py +4 -4
- mteb/models/model_implementations/bmretriever_models.py +6 -4
- mteb/models/model_implementations/cadet_models.py +1 -1
- mteb/models/model_implementations/cde_models.py +11 -4
- mteb/models/model_implementations/clip_models.py +6 -6
- mteb/models/model_implementations/clips_models.py +3 -3
- mteb/models/model_implementations/codefuse_models.py +5 -5
- mteb/models/model_implementations/codesage_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +5 -5
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/colpali_models.py +3 -3
- mteb/models/model_implementations/colqwen_models.py +8 -8
- mteb/models/model_implementations/colsmol_models.py +2 -2
- mteb/models/model_implementations/conan_models.py +1 -1
- mteb/models/model_implementations/dino_models.py +42 -42
- mteb/models/model_implementations/e5_instruct.py +23 -4
- mteb/models/model_implementations/e5_models.py +9 -9
- mteb/models/model_implementations/e5_v.py +6 -6
- mteb/models/model_implementations/eagerworks_models.py +1 -1
- mteb/models/model_implementations/emillykkejensen_models.py +6 -6
- mteb/models/model_implementations/en_code_retriever.py +1 -1
- mteb/models/model_implementations/euler_models.py +2 -2
- mteb/models/model_implementations/fa_models.py +9 -9
- mteb/models/model_implementations/facebookai.py +14 -2
- mteb/models/model_implementations/geogpt_models.py +1 -1
- mteb/models/model_implementations/gme_v_models.py +6 -5
- mteb/models/model_implementations/google_models.py +1 -1
- mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
- mteb/models/model_implementations/gritlm_models.py +2 -2
- mteb/models/model_implementations/gte_models.py +25 -13
- mteb/models/model_implementations/hinvec_models.py +1 -1
- mteb/models/model_implementations/ibm_granite_models.py +30 -6
- mteb/models/model_implementations/inf_models.py +2 -2
- mteb/models/model_implementations/jasper_models.py +2 -2
- mteb/models/model_implementations/jina_clip.py +48 -10
- mteb/models/model_implementations/jina_models.py +18 -11
- mteb/models/model_implementations/kblab.py +12 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
- mteb/models/model_implementations/kfst.py +1 -1
- mteb/models/model_implementations/kowshik24_models.py +1 -1
- mteb/models/model_implementations/lgai_embedding_models.py +1 -1
- mteb/models/model_implementations/linq_models.py +1 -1
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +6 -6
- mteb/models/model_implementations/llm2vec_models.py +8 -8
- mteb/models/model_implementations/mcinext_models.py +4 -1
- mteb/models/model_implementations/mdbr_models.py +17 -3
- mteb/models/model_implementations/misc_models.py +68 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +1 -1
- mteb/models/model_implementations/moco_models.py +4 -4
- mteb/models/model_implementations/mod_models.py +1 -1
- mteb/models/model_implementations/model2vec_models.py +14 -14
- mteb/models/model_implementations/moka_models.py +1 -1
- mteb/models/model_implementations/nbailab.py +3 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
- mteb/models/model_implementations/nomic_models.py +30 -15
- mteb/models/model_implementations/nomic_models_vision.py +1 -1
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
- mteb/models/model_implementations/nvidia_models.py +151 -19
- mteb/models/model_implementations/octen_models.py +61 -2
- mteb/models/model_implementations/openclip_models.py +13 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
- mteb/models/model_implementations/ops_moa_models.py +1 -1
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +1 -1
- mteb/models/model_implementations/piccolo_models.py +1 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +4 -4
- mteb/models/model_implementations/pylate_models.py +10 -9
- mteb/models/model_implementations/qodo_models.py +2 -2
- mteb/models/model_implementations/qtack_models.py +1 -1
- mteb/models/model_implementations/qwen3_models.py +3 -3
- mteb/models/model_implementations/qzhou_models.py +2 -2
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +2 -2
- mteb/models/model_implementations/reasonir_model.py +1 -1
- mteb/models/model_implementations/repllama_models.py +3 -3
- mteb/models/model_implementations/rerankers_custom.py +12 -6
- mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
- mteb/models/model_implementations/richinfoai_models.py +1 -1
- mteb/models/model_implementations/ru_sentence_models.py +20 -20
- mteb/models/model_implementations/ruri_models.py +10 -10
- mteb/models/model_implementations/salesforce_models.py +3 -3
- mteb/models/model_implementations/samilpwc_models.py +1 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
- mteb/models/model_implementations/searchmap_models.py +1 -1
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
- mteb/models/model_implementations/sentence_transformers_models.py +124 -22
- mteb/models/model_implementations/shuu_model.py +1 -1
- mteb/models/model_implementations/siglip_models.py +20 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
- mteb/models/model_implementations/stella_models.py +17 -4
- mteb/models/model_implementations/tarka_models.py +2 -2
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +1 -1
- mteb/models/model_implementations/uae_models.py +7 -1
- mteb/models/model_implementations/vdr_models.py +1 -1
- mteb/models/model_implementations/vi_vn_models.py +6 -6
- mteb/models/model_implementations/vlm2vec_models.py +3 -3
- mteb/models/model_implementations/voyage_models.py +84 -0
- mteb/models/model_implementations/voyage_v.py +9 -7
- mteb/models/model_implementations/youtu_models.py +1 -1
- mteb/models/model_implementations/yuan_models.py +1 -1
- mteb/models/model_implementations/yuan_models_en.py +1 -1
- mteb/models/model_meta.py +80 -31
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
- mteb/models/search_wrappers.py +33 -18
- mteb/models/sentence_transformer_wrapper.py +50 -25
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +29 -21
- mteb/results/model_result.py +52 -22
- mteb/results/task_result.py +80 -58
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +12 -0
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/abstasks/classification.py
CHANGED

@@ -16,7 +16,7 @@ from sklearn.metrics import (

 from mteb._evaluators.sklearn_evaluator import SklearnEvaluator, SklearnModelProtocol
 from mteb.models import EncoderProtocol, MTEBModels
-from mteb.types import HFSubset, ScoresDict
+from mteb.types import EncodeKwargs, HFSubset, ScoresDict
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -98,9 +98,8 @@ class AbsTaskClassification(AbsTask):
         text: str (for text) or PIL.Image (for image). Column name can be changed via `input_column_name` attribute.
         label: int. Column name can be changed via `label_column_name` attribute.
         evaluator_model: The model to use for evaluation. Can be any sklearn compatible model. Default is `LogisticRegression`.
-
-
-        n_experiments: Number of experiments to run. Default is 10.
+        samples_per_label: Number of samples per label to use for training the evaluator model. Default is 8.
+        n_experiments: Number of experiments to run. Default is 10.
         train_split: Name of the split to use for training the evaluator model. Default is "train".
         label_column_name: Name of the column containing the labels. Default is "label".
         input_column_name: Name of the column containing the input data. Default is "text".
@@ -126,7 +125,7 @@ class AbsTaskClassification(AbsTask):
         split: str = "test",
         subsets_to_run: list[HFSubset] | None = None,
         *,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> dict[HFSubset, ScoresDict]:
@@ -143,6 +142,9 @@ class AbsTaskClassification(AbsTask):
         if not self.data_loaded:
             self.load_data()

+        if self.dataset is None:
+            raise RuntimeError("Dataset not loaded.")
+
         if "random_state" in self.evaluator_model.get_params():
             self.evaluator_model = self.evaluator_model.set_params(
                 random_state=self.seed
@@ -175,19 +177,22 @@ class AbsTaskClassification(AbsTask):
         )
         self._add_main_score(scores[hf_subset])

-        return scores
+        return scores  # type: ignore[return-value]

     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: DatasetDict,
         *,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> FullClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         train_split = data_split[self.train_split]
         eval_split = data_split[hf_split]

@@ -237,7 +242,7 @@ class AbsTaskClassification(AbsTask):
             # ap will be none for non binary classification tasks
             k: (
                 float(np.mean(values))
-                if (values := [s[k] for s in scores if s[k] is not None])
+                if (values := [s[k] for s in scores if s[k] is not None])  # type: ignore[literal-required]
                 else np.nan
             )
             for k in scores[0].keys()
@@ -245,7 +250,7 @@ class AbsTaskClassification(AbsTask):
         logger.info(f"Running {self.metadata.name} - Finished.")
         return FullClassificationMetrics(
             scores_per_experiment=scores,
-            **avg_scores,
+            **avg_scores,  # type: ignore[typeddict-item]
         )

     def _calculate_scores(
mteb/abstasks/clustering.py
CHANGED

@@ -3,7 +3,7 @@ import logging
 import random
 from collections import defaultdict
 from pathlib import Path
-from typing import Any
+from typing import Any, cast

 import numpy as np
 from datasets import Dataset, DatasetDict
@@ -11,8 +11,8 @@ from sklearn.cluster import MiniBatchKMeans
 from sklearn.metrics.cluster import v_measure_score

 from mteb._create_dataloaders import create_dataloader
-from mteb.models import EncoderProtocol
-from mteb.types import HFSubset, ScoresDict
+from mteb.models import EncoderProtocol, MTEBModels
+from mteb.types import Array, EncodeKwargs, HFSubset, ScoresDict
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -34,7 +34,7 @@ MultilingualDataset = dict[HFSubset, DatasetDict]


 def _evaluate_clustering_bootstrapped(
-    embeddings:
+    embeddings: Array,
     labels: list[list[str]],
     n_clusters: int,
     cluster_size: int,
@@ -61,21 +61,21 @@ def _evaluate_clustering_bootstrapped(
     max_depth = max(map(len, labels))
     # Evaluate on each level til max depth
     for i_level in range(max_depth):
-        level_labels = []
+        level_labels: list[str | int] = []
         # Assign -1 to gold label if the level is not there
         for label in labels:
             if len(label) > i_level:
                 level_labels.append(label[i_level])
             else:
                 level_labels.append(-1)
-
+        np_level_labels = np.array(level_labels)
         valid_idx = np.array(
-            [level_label != -1 for level_label in
+            [level_label != -1 for level_label in np_level_labels]
         )  # Could be level_labels != -1 but fails with FutureWarning: elementwise comparison failed
-
+        np_level_labels = np_level_labels[valid_idx]
         level_embeddings = embeddings[valid_idx]
         clustering_model = MiniBatchKMeans(
-            n_clusters=np.unique(
+            n_clusters=np.unique(np_level_labels).size,
             batch_size=kmean_batch_size,
             init="k-means++",
             n_init=1,  # default when kmeans++ is used
@@ -87,7 +87,7 @@ def _evaluate_clustering_bootstrapped(
         cluster_indices = rng_state.choices(range(n_embeddings), k=cluster_size)

         _embeddings = level_embeddings[cluster_indices]
-        _labels =
+        _labels = np_level_labels[cluster_indices]
         cluster_assignment = clustering_model.fit_predict(_embeddings)
         v_measure = v_measure_score(_labels, cluster_assignment)
         v_measures[f"Level {i_level}"].append(v_measure)
@@ -153,15 +153,19 @@ class AbsTaskClustering(AbsTask):

     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> ScoresDict:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError(
+                "Expected encoder model to be an instance of EncoderProtocol."
+            )
         if (
             self.max_document_to_embed is not None
             and self.max_fraction_of_documents_to_embed is not None
@@ -182,13 +186,13 @@ class AbsTaskClustering(AbsTask):
                 self.max_fraction_of_documents_to_embed * len(data_split)
             )
         else:
-            max_documents_to_embed = self.max_document_to_embed
+            max_documents_to_embed = cast(int, self.max_document_to_embed)

-        max_documents_to_embed = min(len(data_split), max_documents_to_embed)
+        max_documents_to_embed = min(len(data_split), max_documents_to_embed)
         example_indices = self.rng_state.sample(
             range(len(data_split)), k=max_documents_to_embed
         )
-        downsampled_dataset = data_split.select(example_indices)
+        downsampled_dataset = data_split.select(example_indices)

         downsampled_dataset = downsampled_dataset.select_columns(
             [self.input_column_name, self.label_column_name]
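
The `np_level_labels` refactor above just gives the label sequence a stable NumPy type before masking and sampling. Stripped of mteb's task plumbing, the bootstrapped V-measure evaluation it feeds reduces to roughly the following sketch (toy data, dimensions, and iteration counts are made up):

    import numpy as np
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.metrics.cluster import v_measure_score

    rng = np.random.default_rng(42)
    embeddings = rng.normal(size=(1000, 32))   # stand-in for model embeddings
    labels = rng.integers(0, 4, size=1000)     # gold cluster labels

    v_measures = []
    for _ in range(10):  # bootstrap iterations
        # Sample a fixed-size subset with replacement, then cluster it with
        # as many centroids as there are distinct gold labels in the sample.
        idx = rng.choice(len(embeddings), size=256, replace=True)
        sample_emb, sample_labels = embeddings[idx], labels[idx]
        km = MiniBatchKMeans(
            n_clusters=np.unique(sample_labels).size,
            batch_size=512,
            init="k-means++",
            n_init=1,
        )
        assignment = km.fit_predict(sample_emb)
        v_measures.append(v_measure_score(sample_labels, assignment))

    print(float(np.mean(v_measures)))

Bootstrapping over subsamples keeps the k-means step cheap on large corpora while the mean V-measure stays a stable estimate of clustering quality.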
mteb/abstasks/clustering_legacy.py
CHANGED

@@ -8,8 +8,8 @@ from scipy.optimize import linear_sum_assignment
 from sklearn import metrics

 from mteb._evaluators import ClusteringEvaluator
-from mteb.models import EncoderProtocol
-from mteb.types import ScoresDict
+from mteb.models import EncoderProtocol, MTEBModels
+from mteb.types import EncodeKwargs, ScoresDict
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -80,15 +80,18 @@ class AbsTaskClusteringLegacy(AbsTask):

     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> ScoresDict:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         data_split = data_split.select_columns(
             [self.input_column_name, self.label_column_name]
         )
@@ -139,9 +142,6 @@ class AbsTaskClusteringLegacy(AbsTask):
         }
         return scores

-        data_split = data_split.select_columns(
-            [self.input_column_name, self.label_column_name]
-        )
         evaluator = self.evaluator(
             data_split,
             input_column_name=self.input_column_name,
@@ -151,10 +151,10 @@ class AbsTaskClusteringLegacy(AbsTask):
             hf_subset=hf_subset,
             **kwargs,
         )
-
+        evaluate_clusters = evaluator(model, encode_kwargs=encode_kwargs)
         if prediction_folder:
             self._save_task_predictions(
-
+                evaluate_clusters,
                 model,
                 prediction_folder,
                 hf_subset=hf_subset,
@@ -163,7 +163,7 @@ class AbsTaskClusteringLegacy(AbsTask):

         return self._compute_metrics(
             data_split[self.label_column_name],
-
+            evaluate_clusters,
         )

     def _compute_metrics(
mteb/abstasks/image/image_text_pair_classification.py
CHANGED

@@ -12,7 +12,8 @@ from mteb.abstasks._statistics_calculation import (
     calculate_text_statistics,
 )
 from mteb.abstasks.abstask import AbsTask
-from mteb.models.models_protocols import EncoderProtocol
+from mteb.models.models_protocols import EncoderProtocol, MTEBModels
+from mteb.types import EncodeKwargs
 from mteb.types.statistics import (
     ImageStatistics,
     SplitDescriptiveStatistics,
@@ -116,15 +117,17 @@ class AbsTaskImageTextPairClassification(AbsTask):

     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> ImageTextPairClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
         select_columns = []
         for columns in (self.images_column_names, self.texts_column_names):
             if isinstance(columns, str):
@@ -154,7 +157,7 @@ class AbsTaskImageTextPairClassification(AbsTask):
             hf_subset=hf_subset,
             **kwargs,
         )
-        scores = evaluator(model, encode_kwargs=encode_kwargs)
+        scores: list[torch.Tensor] = evaluator(model, encode_kwargs=encode_kwargs)  # type: ignore[assignment]
         if prediction_folder:
             self._save_task_predictions(
                 [score.tolist() for score in scores],
mteb/abstasks/multilabel_classification.py
CHANGED

@@ -16,7 +16,8 @@ from typing_extensions import override
 from mteb._create_dataloaders import create_dataloader
 from mteb._evaluators.classification_metrics import hamming_score
 from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
-from mteb.models import EncoderProtocol
+from mteb.models import EncoderProtocol, MTEBModels
+from mteb.types import Array, EncodeKwargs

 from .classification import AbsTaskClassification

@@ -24,14 +25,14 @@ logger = logging.getLogger(__name__)


 def _evaluate_classifier(
-    embeddings_train:
+    embeddings_train: Array,
     y_train: np.ndarray,
-    embeddings_test:
+    embeddings_test: Array,
     classifier: SklearnModelProtocol,
 ) -> tuple[np.ndarray, SklearnModelProtocol]:
-
-
-    return
+    classifier_copy: SklearnModelProtocol = clone(classifier)
+    classifier_copy.fit(embeddings_train, y_train)
+    return classifier_copy.predict(embeddings_test), classifier_copy


 class MultilabelClassificationMetrics(TypedDict):
@@ -69,25 +70,28 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
         input_column_name: Name of the column containing the input text.
         label_column_name: Name of the column containing the labels.
         samples_per_label: Number of samples to use pr. label. These samples are embedded and a classifier is fit using the labels and samples.
-
+        evaluator_model: Classifier to use for evaluation. Must implement the SklearnModelProtocol.
     """

-
+    evaluator_model: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5)
     input_column_name: str = "text"
     label_column_name: str = "label"

     @override
-    def _evaluate_subset(
+    def _evaluate_subset(  # type: ignore[override]
         self,
-        model:
+        model: MTEBModels,
         data_split: DatasetDict,
         *,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> FullMultilabelClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         if isinstance(data_split, DatasetDict):
             data_split = data_split.select_columns(
                 [self.input_column_name, self.label_column_name]
@@ -165,7 +169,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
             y_train = train_split.select(sample_indices)[self.label_column_name]
             y_train = binarizer.transform(y_train)
             y_pred, current_classifier = _evaluate_classifier(
-                X_train, y_train, X_test, self.
+                X_train, y_train, X_test, self.evaluator_model
             )
             if prediction_folder:
                 all_predictions.append(y_pred.tolist())
@@ -185,19 +189,20 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
         )

         avg_scores: dict[str, Any] = {
-            k: np.mean([s[k] for s in scores])
+            k: np.mean([s[k] for s in scores])  # type: ignore[literal-required]
+            for k in scores[0].keys()
         }
         logger.info("Running multilabel classification - Finished.")
         return FullMultilabelClassificationMetrics(
             scores_per_experiment=scores,
-            **avg_scores,
+            **avg_scores,  # type: ignore[typeddict-item]
         )

-    def _calculate_scores(
+    def _calculate_scores(  # type: ignore[override]
         self,
         y_test: np.ndarray,
         y_pred: np.ndarray,
-        x_test_embedding:
+        x_test_embedding: Array,
         current_classifier: SklearnModelProtocol,
     ) -> MultilabelClassificationMetrics:
         accuracy = current_classifier.score(x_test_embedding, y_test)
@@ -232,10 +237,9 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
         """
         sample_indices = []
         if idxs is None:
-            idxs = np.arange(len(y))
+            idxs = list(np.arange(len(y)))
         self.np_rng.shuffle(idxs)
-
-        label_counter = defaultdict(int)
+        label_counter: dict[int, int] = defaultdict(int)
         for i in idxs:
             if any((label_counter[label] < samples_per_label) for label in y[i]):
                 sample_indices.append(i)
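
The newly filled-in `_evaluate_classifier` body clones the template classifier before fitting, so every experiment starts from a fresh, unfitted copy. A self-contained sketch of that sklearn pattern, with invented toy data:

    import numpy as np
    from sklearn.base import clone
    from sklearn.neighbors import KNeighborsClassifier

    template = KNeighborsClassifier(n_neighbors=5)

    X_train = np.random.rand(100, 16)
    y_train = np.random.randint(0, 2, size=(100, 3))  # multilabel targets
    X_test = np.random.rand(20, 16)

    # clone() copies hyperparameters but not fitted state, so repeated
    # experiments never leak a previous fit into the next one.
    clf = clone(template)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(y_pred.shape)  # (20, 3)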
mteb/abstasks/pair_classification.py
CHANGED

@@ -18,8 +18,8 @@ from mteb.abstasks._statistics_calculation import (
 )
 from mteb.abstasks.abstask import AbsTask
 from mteb.models.model_meta import ScoringFunction
-from mteb.models.models_protocols import EncoderProtocol
-from mteb.types import PromptType
+from mteb.models.models_protocols import EncoderProtocol, MTEBModels
+from mteb.types import EncodeKwargs, PromptType
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -44,8 +44,8 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics):
     """

     num_samples: int
-    number_of_characters: int
-    unique_pairs: int
+    number_of_characters: int | None
+    unique_pairs: int | None

     text1_statistics: TextStatistics | None
     image1_statistics: ImageStatistics | None
@@ -79,15 +79,18 @@ class AbsTaskPairClassification(AbsTask):

     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: Dataset,
         *,
         hf_split: str,
         hf_subset: str,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
         **kwargs,
     ) -> dict[str, float]:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         if self.metadata.modalities == ["text"]:
             # for compatibility with v1 version where datasets were stored in a single row
             data_split = data_split[0] if len(data_split) == 1 else data_split
@@ -120,7 +123,7 @@ class AbsTaskPairClassification(AbsTask):
         self, similarity_scores: PairClassificationDistances, labels: list[int]
     ) -> dict[str, float]:
         logger.info("Computing metrics...")
-
+        np_labels = np.asarray(labels)
         output_scores = {}
         max_scores = defaultdict(list)
         for short_name, scores, reverse in [
@@ -142,7 +145,7 @@ class AbsTaskPairClassification(AbsTask):
             ],
             [ScoringFunction.DOT_PRODUCT.value, similarity_scores["dot_scores"], True],
         ]:
-            metrics = self._compute_metrics_values(scores,
+            metrics = self._compute_metrics_values(scores, np_labels, reverse)  # type: ignore[arg-type]
             for metric_name, metric_value in metrics.items():
                 output_scores[f"{short_name}_{metric_name}"] = metric_value
                 max_scores[metric_name].append(metric_value)
@@ -237,6 +240,12 @@ class AbsTaskPairClassification(AbsTask):

     def _push_dataset_to_hub(self, repo_name: str) -> None:
         # previously pair classification datasets were stored in a single row
+        if self.dataset is None:
+            # overall this shouldn't happen as we check for dataset before pushing to hub
+            # added here for type checking purposes
+            raise RuntimeError(
+                "Dataset not loaded. To load dataset run `task.load_data()`."
+            )
         if self.metadata.is_multilingual:
             for subset in self.dataset:
                 for split in self.dataset[subset]:
@@ -290,13 +299,13 @@ class AbsTaskPairClassification(AbsTask):
         )

     def _find_best_acc_and_threshold(
-        self, scores:
+        self, scores: list[float], labels: np.ndarray, high_score_more_similar: bool
     ) -> tuple[float, float]:
         rows = list(zip(scores, labels))
         rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)

         max_acc = 0
-        best_threshold = -1
+        best_threshold = -1.0
         positive_so_far = 0
         remaining_negatives = sum(np.array(labels) == 0)

@@ -323,7 +332,7 @@ class AbsTaskPairClassification(AbsTask):

         rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)

-        best_f1 = best_precision = best_recall = 0
+        best_f1 = best_precision = best_recall = 0.0
         threshold = 0
         nextract = 0
         ncorrect = 0
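
The `-1.0` / `0.0` literal changes in the threshold helpers keep the accumulators consistently typed as floats. For context, the single-pass accuracy-threshold search being annotated here resembles the classic sentence-transformers recipe; a sketch under that assumption, not mteb's exact implementation:

    import numpy as np


    def find_best_acc_and_threshold(
        scores: list[float], labels: np.ndarray, high_score_more_similar: bool
    ) -> tuple[float, float]:
        rows = sorted(
            zip(scores, labels), key=lambda x: x[0], reverse=high_score_more_similar
        )

        max_acc = 0.0
        best_threshold = -1.0
        positive_so_far = 0
        remaining_negatives = int(np.sum(np.asarray(labels) == 0))

        # Walk the sorted scores once; at each cut, everything seen so far is
        # predicted "similar" and everything after it "dissimilar".
        for i in range(len(rows) - 1):
            _, label = rows[i]
            positive_so_far += int(label == 1)
            remaining_negatives -= int(label == 0)
            acc = (positive_so_far + remaining_negatives) / len(rows)
            if acc > max_acc:
                max_acc = acc
                best_threshold = (rows[i][0] + rows[i + 1][0]) / 2
        return max_acc, best_threshold

Sorting once and sweeping makes the search O(n log n) instead of re-scoring every candidate threshold from scratch.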
mteb/abstasks/regression.py
CHANGED

@@ -84,10 +84,10 @@ class AbsTaskRegression(AbsTaskClassification):
         n_samples: Number of samples to use for training the regression model. If the dataset has fewer samples than n_samples, all samples are used.
         abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
         evaluator_model: The model to use for evaluation. Can be any sklearn compatible model. Default is `LinearRegression`.
-
+
     """

-    evaluator: type[
+    evaluator: type[SklearnEvaluator] = SklearnEvaluator
     evaluator_model: SklearnModelProtocol = LinearRegression(n_jobs=-1)

     train_split: str = "train"
@@ -113,7 +113,7 @@ class AbsTaskRegression(AbsTaskClassification):
         )["train"]
         return train_split_sampled, []

-    def _calculate_scores(
+    def _calculate_scores(  # type: ignore[override]
         self,
         y_test: np.ndarray | list[int],
         y_pred: np.ndarray,
@@ -183,7 +183,7 @@ class AbsTaskRegression(AbsTaskClassification):

         return dataset_dict

-    def _calculate_descriptive_statistics_from_split(
+    def _calculate_descriptive_statistics_from_split(  # type: ignore[override]
         self, split: str, hf_subset: str | None = None, compute_overall: bool = False
     ) -> RegressionDescriptiveStatistics:
         train_text = []
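
Most of this file's changes are `# type: ignore[override]` annotations. mypy emits the `[override]` error when a subclass changes an inherited method's signature in a way that breaks substitutability; a tiny illustration with hypothetical classes:

    class Base:
        def _calculate_scores(
            self, y_test: list[int], y_pred: list[int]
        ) -> dict[str, float]:
            return {}


    class Regression(Base):
        # mypy flags this with error code [override]: list[float] and
        # list[int] are incompatible parameter types because list is
        # invariant. The ignore comment silences the check where the
        # divergence is intentional.
        def _calculate_scores(  # type: ignore[override]
            self, y_test: list[float], y_pred: list[float]
        ) -> dict[str, float]:
            return {}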
mteb/abstasks/retrieval.py
CHANGED

@@ -1,7 +1,7 @@
 import json
 import logging
 from collections import defaultdict
-from collections.abc import Callable, Sequence
+from collections.abc import Callable, Mapping, Sequence
 from pathlib import Path
 from time import time
 from typing import Any, Literal
@@ -25,6 +25,7 @@ from mteb.models import (
     SearchProtocol,
 )
 from mteb.types import (
+    EncodeKwargs,
     HFSubset,
     QueryDatasetType,
     RelevantDocumentsType,
@@ -184,17 +185,17 @@ class AbsTaskRetrieval(AbsTask):
             return queries, corpus

         if self.metadata.is_multilingual:
-            for subset in self.queries:
-                for split in self.queries[subset]:
-                    queries = self.queries[subset][split]
-                    corpus = self.corpus[subset][split]
+            for subset in self.queries:  # type: ignore[attr-defined]
+                for split in self.queries[subset]:  # type: ignore[attr-defined]
+                    queries = self.queries[subset][split]  # type: ignore[attr-defined]
+                    corpus = self.corpus[subset][split]  # type: ignore[attr-defined]

                     (
                         self.dataset[subset][split]["queries"],
                         self.dataset[subset][split]["corpus"],
                     ) = _process_split(queries, corpus)

-                    self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[
+                    self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[  # type: ignore[attr-defined]
                         subset
                     ][split]
                     if hasattr(self, "instructions"):
@@ -211,15 +212,15 @@ class AbsTaskRetrieval(AbsTask):
                 ][split]
         else:
             subset = "default"
-            for split in self.queries:
-                queries = self.queries[split]
-                corpus = self.corpus[split]
+            for split in self.queries:  # type: ignore[attr-defined]
+                queries = self.queries[split]  # type: ignore[attr-defined]
+                corpus = self.corpus[split]  # type: ignore[attr-defined]
                 (
                     self.dataset[subset][split]["queries"],
                     self.dataset[subset][split]["corpus"],
                 ) = _process_split(queries, corpus)

-                self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[
+                self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[  # type: ignore[attr-defined]
                     split
                 ].copy()
                 if hasattr(self, "instructions"):
@@ -235,9 +236,9 @@ class AbsTaskRetrieval(AbsTask):
                     split
                 ].copy()

-        del self.queries
-        del self.corpus
-        del self.relevant_docs
+        del self.queries  # type: ignore[attr-defined]
+        del self.corpus  # type: ignore[attr-defined]
+        del self.relevant_docs  # type: ignore[attr-defined]
         if hasattr(self, "instructions"):
             del self.instructions
         if hasattr(self, "top_ranked"):
@@ -283,10 +284,10 @@ class AbsTaskRetrieval(AbsTask):
         split: str = "test",
         subsets_to_run: list[HFSubset] | None = None,
         *,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
-        **kwargs,
-    ) ->
+        **kwargs: Any,
+    ) -> Mapping[HFSubset, ScoresDict]:
         """Evaluate the model on the retrieval task.

         Args:
@@ -320,7 +321,7 @@ class AbsTaskRetrieval(AbsTask):
         self,
         model: MTEBModels,
         data_split: RetrievalSplitData,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
@@ -357,6 +358,8 @@ class AbsTaskRetrieval(AbsTask):
             **kwargs,
         )

+        search_model: SearchProtocol
+
         if isinstance(model, EncoderProtocol) and not isinstance(model, SearchProtocol):
             search_model = SearchEncoderWrapper(model)
         elif isinstance(model, CrossEncoderProtocol):
@@ -578,11 +581,12 @@ class AbsTaskRetrieval(AbsTask):
             if isinstance(data[split][subset_item], Dataset):
                 sections[split] = data[split][subset_item]
             elif converter is not None:
+                subset_data = data[split][subset_item]
+                if subset_data is None:
+                    continue
+
                 sections[split] = Dataset.from_list(
-                    [
-                        converter(idx, item)
-                        for idx, item in data[split][subset_item].items()
-                    ]
+                    [converter(idx, item) for idx, item in subset_data.items()]
                 )
             else:
                 raise ValueError(
@@ -680,7 +684,7 @@ class AbsTaskRetrieval(AbsTask):

         top_k_sorted = defaultdict(list)
         for query_id, values in top_ranked.items():
-            sorted_keys = sorted(values, key=values
+            sorted_keys = sorted(values, key=lambda k: values[k], reverse=True)
             top_k_sorted[query_id] = sorted_keys[: self._top_k]

         self.dataset[subset][split]["top_ranked"] = top_k_sorted
@@ -688,10 +692,10 @@ class AbsTaskRetrieval(AbsTask):


 def _process_relevant_docs(
-    collection:
+    collection: Mapping[str, Mapping[str, int]],
     hf_subset: str,
     split: str,
-) -> dict[str, dict[str,
+) -> dict[str, dict[str, int]]:
     """Collections can contain overlapping ids in different splits. Prepend split and subset to avoid this

     Returns:
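
The rewritten `sorted(values, key=lambda k: values[k], reverse=True)` makes the per-query top-k truncation explicit: iterating a dict yields its keys, and the lambda ranks each document id by its score. A minimal sketch with invented scores:

    top_ranked = {
        "q1": {"d1": 0.2, "d2": 0.9, "d3": 0.5},
        "q2": {"d1": 0.7, "d4": 0.1},
    }
    top_k = 2

    top_k_sorted: dict[str, list[str]] = {}
    for query_id, values in top_ranked.items():
        # Sort doc ids by their score, descending, then keep only the k best.
        sorted_keys = sorted(values, key=lambda k: values[k], reverse=True)
        top_k_sorted[query_id] = sorted_keys[:top_k]

    print(top_k_sorted)  # {'q1': ['d2', 'd3'], 'q2': ['d1', 'd4']}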
|