mteb 2.5.3__py3-none-any.whl → 2.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +10 -15
- mteb/_evaluators/any_sts_evaluator.py +1 -4
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +17 -16
- mteb/_evaluators/sklearn_evaluator.py +9 -8
- mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
- mteb/_evaluators/text/summarization_evaluator.py +20 -16
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +27 -21
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +3 -16
- mteb/abstasks/classification.py +10 -4
- mteb/abstasks/clustering.py +18 -14
- mteb/abstasks/clustering_legacy.py +8 -8
- mteb/abstasks/image/image_text_pair_classification.py +5 -3
- mteb/abstasks/multilabel_classification.py +20 -16
- mteb/abstasks/pair_classification.py +18 -9
- mteb/abstasks/regression.py +3 -3
- mteb/abstasks/retrieval.py +12 -9
- mteb/abstasks/sts.py +6 -3
- mteb/abstasks/task_metadata.py +20 -16
- mteb/abstasks/text/bitext_mining.py +36 -25
- mteb/abstasks/text/reranking.py +7 -5
- mteb/abstasks/text/summarization.py +8 -3
- mteb/abstasks/zeroshot_classification.py +5 -2
- mteb/benchmarks/benchmark.py +4 -2
- mteb/benchmarks/benchmarks/benchmarks.py +22 -1
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +21 -18
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +8 -8
- mteb/cli/generate_model_card.py +39 -20
- mteb/deprecated_evaluator.py +56 -43
- mteb/evaluate.py +35 -29
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +25 -27
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +1 -1
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +2 -2
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +2 -1
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +30 -13
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +8 -1
- mteb/models/instruct_wrapper.py +11 -5
- mteb/models/model_implementations/andersborges.py +2 -2
- mteb/models/model_implementations/blip_models.py +8 -8
- mteb/models/model_implementations/bm25.py +1 -1
- mteb/models/model_implementations/clip_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/dino_models.py +23 -23
- mteb/models/model_implementations/emillykkejensen_models.py +3 -3
- mteb/models/model_implementations/jina_clip.py +1 -1
- mteb/models/model_implementations/jina_models.py +1 -1
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +3 -3
- mteb/models/model_implementations/moco_models.py +2 -2
- mteb/models/model_implementations/model2vec_models.py +1 -1
- mteb/models/model_implementations/nomic_models.py +8 -8
- mteb/models/model_implementations/openclip_models.py +7 -7
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -1
- mteb/models/model_implementations/repllama_models.py +2 -2
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
- mteb/models/model_implementations/siglip_models.py +10 -10
- mteb/models/model_implementations/vlm2vec_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +4 -4
- mteb/models/model_meta.py +30 -14
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +5 -5
- mteb/models/search_wrappers.py +22 -10
- mteb/models/sentence_transformer_wrapper.py +9 -4
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +25 -19
- mteb/results/model_result.py +49 -21
- mteb/results/task_result.py +45 -51
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/METADATA +1 -1
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/RECORD +105 -104
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/WHEEL +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/top_level.txt +0 -0
mteb/abstasks/abstask.py
CHANGED
@@ -2,10 +2,10 @@ import json
 import logging
 import warnings
 from abc import ABC, abstractmethod
-from collections.abc import Sequence
+from collections.abc import Mapping, Sequence
 from copy import copy
 from pathlib import Path
-from typing import Any, cast
+from typing import Any, Literal, cast

 import numpy as np
 from datasets import ClassLabel, Dataset, DatasetDict, load_dataset
@@ -79,8 +79,8 @@ class AbsTask(ABC):
     """

     metadata: TaskMetadata
-    abstask_prompt: str
-    _eval_splits:
+    abstask_prompt: str
+    _eval_splits: Sequence[str] | None = None
     dataset: dict[HFSubset, DatasetDict] | None = None
     data_loaded: bool = False
     hf_subsets: list[HFSubset]
@@ -124,7 +124,7 @@ class AbsTask(ABC):
         encode_kwargs: dict[str, Any],
         prediction_folder: Path | None = None,
         **kwargs: Any,
-    ) ->
+    ) -> Mapping[HFSubset, ScoresDict]:
         """Evaluates an MTEB compatible model on the task.

         Args:
@@ -196,12 +196,12 @@ class AbsTask(ABC):
     @abstractmethod
     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs: dict[str, Any],
         hf_split: str,
         hf_subset: str,
+        encode_kwargs: dict[str, Any],
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> ScoresDict:
@@ -211,7 +211,7 @@ class AbsTask(ABC):

     def _save_task_predictions(
         self,
-        predictions:
+        predictions: Mapping[str, Any] | list[Any],
         model: MTEBModels,
         prediction_folder: Path,
         hf_split: str,
@@ -227,7 +227,7 @@ class AbsTask(ABC):
             hf_subset: The subset of the dataset (e.g. "en").
         """
         predictions_path = self._predictions_path(prediction_folder)
-        existing_results = {
+        existing_results: dict[str, Any] = {
             "mteb_model_meta": {
                 "model_name": model.mteb_model_meta.name,
                 "revision": model.mteb_model_meta.revision,
@@ -327,7 +327,7 @@ class AbsTask(ABC):
             )
         else:
             # some of monolingual datasets explicitly adding the split name to the dataset name
-            self.dataset = load_dataset(**self.metadata.dataset)
+            self.dataset = load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True

@@ -363,15 +363,19 @@ class AbsTask(ABC):
         """
         from mteb.abstasks import AbsTaskClassification

-
+        existing_stats = self.metadata.descriptive_stats
+
+        if existing_stats is not None and not overwrite_results:
             logger.info("Loading metadata descriptive statistics from cache.")
-            return
+            return existing_stats

         if not self.data_loaded:
             self.load_data()

         descriptive_stats: dict[str, DescriptiveStatistics] = {}
-        hf_subset_stat
+        hf_subset_stat: Literal["hf_subset_descriptive_stats"] = (
+            "hf_subset_descriptive_stats"
+        )
         eval_splits = self.metadata.eval_splits
         if isinstance(self, AbsTaskClassification):
             eval_splits.append(self.train_split)
@@ -382,7 +386,7 @@ class AbsTask(ABC):
             logger.info(f"Processing metadata for split {split}")
             if self.metadata.is_multilingual:
                 descriptive_stats[split] = (
-                    self._calculate_descriptive_statistics_from_split(
+                    self._calculate_descriptive_statistics_from_split(  # type: ignore[assignment]
                         split, compute_overall=True
                     )
                 )
@@ -401,7 +405,7 @@ class AbsTask(ABC):
                 descriptive_stats[split][hf_subset_stat][hf_subset] = split_details
             else:
                 split_details = self._calculate_descriptive_statistics_from_split(split)
-                descriptive_stats[split] = split_details
+                descriptive_stats[split] = split_details  # type: ignore[assignment]

         with self.metadata.descriptive_stat_path.open("w") as f:
             json.dump(descriptive_stats, f, indent=4)
@@ -438,7 +442,7 @@ class AbsTask(ABC):

         return self.metadata.languages

-    def filter_eval_splits(self, eval_splits:
+    def filter_eval_splits(self, eval_splits: Sequence[str] | None) -> Self:
         """Filter the evaluation splits of the task.

         Args:
@@ -452,9 +456,9 @@ class AbsTask(ABC):

     def filter_languages(
         self,
-        languages:
-        script:
-        hf_subsets:
+        languages: Sequence[str] | None,
+        script: Sequence[str] | None = None,
+        hf_subsets: Sequence[HFSubset] | None = None,
         exclusive_language_filter: bool = False,
     ) -> Self:
         """Filter the languages of the task.
@@ -500,12 +504,14 @@ class AbsTask(ABC):
         self.hf_subsets = subsets_to_keep
         return self

-    def _add_main_score(self, scores:
+    def _add_main_score(self, scores: ScoresDict) -> None:
         scores["main_score"] = scores[self.metadata.main_score]

     def _upload_dataset_to_hub(
         self, repo_name: str, fields: list[str] | dict[str, str]
     ) -> None:
+        if self.dataset is None:
+            raise ValueError("Dataset not loaded")
         if self.metadata.is_multilingual:
             for config in self.metadata.eval_langs:
                 logger.info(f"Converting {config} of {self.metadata.name}")
@@ -575,7 +581,7 @@ class AbsTask(ABC):
         return False

     @property
-    def eval_splits(self) ->
+    def eval_splits(self) -> Sequence[str]:
         """Returns the evaluation splits of the task."""
         if self._eval_splits:
             return self._eval_splits
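For orientation, a minimal sketch of how the re-typed AbsTask surface above is used from the public API. This is not part of the diff; the task name and split/language values are illustrative assumptions.

import mteb

# Any MTEB task works here; the name is only an example.
task = mteb.get_tasks(tasks=["Banking77Classification"])[0]

# filter_eval_splits / filter_languages accept any Sequence[str] in 2.5.5
# and still return Self, so the calls chain as before.
task = task.filter_eval_splits(["test"]).filter_languages(["eng"])

# eval_splits is now typed as Sequence[str].
print(task.eval_splits)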
mteb/abstasks/aggregate_task_metadata.py
CHANGED

@@ -5,7 +5,6 @@ from pydantic import ConfigDict, Field, model_validator
 from typing_extensions import Self

 from mteb.types import (
-    HFSubset,
     ISOLanguageScript,
     Languages,
     Licenses,
@@ -60,14 +59,7 @@ class AggregateTaskMetadata(TaskMetadata):
     reference: str | None = None
     bibtex_citation: str | None = None

-    @
-    def hf_subsets_to_langscripts(self) -> dict[HFSubset, list[ISOLanguageScript]]:
-        """Return a dictionary mapping huggingface subsets to languages."""
-        if isinstance(self.eval_langs, dict):
-            return self.eval_langs
-        return {"default": self.eval_langs}  # type: ignore
-
-    @model_validator(mode="after")  # type: ignore
+    @model_validator(mode="after")
     def _compute_unfilled_cases(self) -> Self:
         if not self.eval_langs:
             self.eval_langs = self._compute_eval_langs()
mteb/abstasks/aggregated_task.py
CHANGED
@@ -1,11 +1,11 @@
 import logging
 import warnings
+from collections.abc import Mapping
 from pathlib import Path
 from typing import Any

 import numpy as np
 from datasets import Dataset, DatasetDict
-from typing_extensions import Self

 from mteb.models.models_protocols import MTEBModels
 from mteb.results.task_result import TaskResult
@@ -33,7 +33,7 @@ class AbsTaskAggregate(AbsTask):

     def task_results_to_scores(
         self, task_results: list[TaskResult]
-    ) -> dict[str,
+    ) -> dict[str, Mapping[HFSubset, ScoresDict]]:
         """The function that aggregated scores. Can be redefined to allow for custom aggregations.

         Args:
@@ -42,7 +42,7 @@ class AbsTaskAggregate(AbsTask):
         Returns:
             A dictionary with the aggregated scores.
         """
-        scores = {}
+        scores: dict[str, Mapping[HFSubset, ScoresDict]] = {}
         subsets = (
             self.metadata.eval_langs.keys()
             if isinstance(self.metadata.eval_langs, dict)
@@ -121,19 +121,6 @@ class AbsTaskAggregate(AbsTask):
         task_res.mteb_version = task_results[0].mteb_version
         return task_res

-    def filter_eval_splits(self, eval_splits: list[str] | None) -> Self:
-        """Filter the evaluation splits of the task.
-
-        Args:
-            eval_splits: List of splits to evaluate on. If None, all splits in metadata
-                are used.
-
-        Returns:
-            The task with filtered evaluation splits.
-        """
-        self._eval_splits = eval_splits
-        return self
-
     def evaluate(
         self,
         model: MTEBModels,
mteb/abstasks/classification.py
CHANGED
@@ -143,6 +143,9 @@ class AbsTaskClassification(AbsTask):
         if not self.data_loaded:
             self.load_data()

+        if self.dataset is None:
+            raise RuntimeError("Dataset not loaded.")
+
         if "random_state" in self.evaluator_model.get_params():
             self.evaluator_model = self.evaluator_model.set_params(
                 random_state=self.seed
@@ -175,11 +178,11 @@ class AbsTaskClassification(AbsTask):
             )
             self._add_main_score(scores[hf_subset])

-        return scores
+        return scores  # type: ignore[return-value]

     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: DatasetDict,
         *,
         encode_kwargs: dict[str, Any],
@@ -188,6 +191,9 @@ class AbsTaskClassification(AbsTask):
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> FullClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         train_split = data_split[self.train_split]
         eval_split = data_split[hf_split]

@@ -237,7 +243,7 @@ class AbsTaskClassification(AbsTask):
             # ap will be none for non binary classification tasks
             k: (
                 float(np.mean(values))
-                if (values := [s[k] for s in scores if s[k] is not None])
+                if (values := [s[k] for s in scores if s[k] is not None])  # type: ignore[literal-required]
                 else np.nan
             )
             for k in scores[0].keys()
@@ -245,7 +251,7 @@ class AbsTaskClassification(AbsTask):
         logger.info(f"Running {self.metadata.name} - Finished.")
         return FullClassificationMetrics(
             scores_per_experiment=scores,
-            **avg_scores,
+            **avg_scores,  # type: ignore[typeddict-item]
         )

     def _calculate_scores(
mteb/abstasks/clustering.py
CHANGED
@@ -3,7 +3,7 @@ import logging
 import random
 from collections import defaultdict
 from pathlib import Path
-from typing import Any
+from typing import Any, cast

 import numpy as np
 from datasets import Dataset, DatasetDict
@@ -11,8 +11,8 @@ from sklearn.cluster import MiniBatchKMeans
 from sklearn.metrics.cluster import v_measure_score

 from mteb._create_dataloaders import create_dataloader
-from mteb.models import EncoderProtocol
-from mteb.types import HFSubset, ScoresDict
+from mteb.models import EncoderProtocol, MTEBModels
+from mteb.types import Array, HFSubset, ScoresDict
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -34,7 +34,7 @@ MultilingualDataset = dict[HFSubset, DatasetDict]


 def _evaluate_clustering_bootstrapped(
-    embeddings:
+    embeddings: Array,
     labels: list[list[str]],
     n_clusters: int,
     cluster_size: int,
@@ -61,21 +61,21 @@ def _evaluate_clustering_bootstrapped(
     max_depth = max(map(len, labels))
     # Evaluate on each level til max depth
     for i_level in range(max_depth):
-        level_labels = []
+        level_labels: list[str | int] = []
         # Assign -1 to gold label if the level is not there
         for label in labels:
             if len(label) > i_level:
                 level_labels.append(label[i_level])
             else:
                 level_labels.append(-1)
-
+        np_level_labels = np.array(level_labels)
         valid_idx = np.array(
-            [level_label != -1 for level_label in
+            [level_label != -1 for level_label in np_level_labels]
         )  # Could be level_labels != -1 but fails with FutureWarning: elementwise comparison failed
-
+        np_level_labels = np_level_labels[valid_idx]
         level_embeddings = embeddings[valid_idx]
         clustering_model = MiniBatchKMeans(
-            n_clusters=np.unique(
+            n_clusters=np.unique(np_level_labels).size,
             batch_size=kmean_batch_size,
             init="k-means++",
             n_init=1,  # default when kmeans++ is used
@@ -87,7 +87,7 @@ def _evaluate_clustering_bootstrapped(
         cluster_indices = rng_state.choices(range(n_embeddings), k=cluster_size)

         _embeddings = level_embeddings[cluster_indices]
-        _labels =
+        _labels = np_level_labels[cluster_indices]
         cluster_assignment = clustering_model.fit_predict(_embeddings)
         v_measure = v_measure_score(_labels, cluster_assignment)
         v_measures[f"Level {i_level}"].append(v_measure)
@@ -153,7 +153,7 @@ class AbsTaskClustering(AbsTask):

     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: Dataset,
         *,
         encode_kwargs: dict[str, Any],
@@ -162,6 +162,10 @@ class AbsTaskClustering(AbsTask):
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> ScoresDict:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError(
+                "Expected encoder model to be an instance of EncoderProtocol."
+            )
         if (
             self.max_document_to_embed is not None
             and self.max_fraction_of_documents_to_embed is not None
@@ -182,13 +186,13 @@ class AbsTaskClustering(AbsTask):
                 self.max_fraction_of_documents_to_embed * len(data_split)
             )
         else:
-            max_documents_to_embed = self.max_document_to_embed
+            max_documents_to_embed = cast(int, self.max_document_to_embed)

-        max_documents_to_embed = min(len(data_split), max_documents_to_embed)
+        max_documents_to_embed = min(len(data_split), max_documents_to_embed)
         example_indices = self.rng_state.sample(
             range(len(data_split)), k=max_documents_to_embed
         )
-        downsampled_dataset = data_split.select(example_indices)
+        downsampled_dataset = data_split.select(example_indices)

         downsampled_dataset = downsampled_dataset.select_columns(
             [self.input_column_name, self.label_column_name]
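The bootstrapped clustering loop above samples a subset of embeddings, fits MiniBatchKMeans with as many clusters as distinct labels, and scores the assignment with v_measure_score. A compact, self-contained sketch of that loop on synthetic data (array sizes, seeds, and iteration counts are arbitrary assumptions, not mteb defaults):

import random

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.cluster import v_measure_score

rng = random.Random(42)
embeddings = np.random.RandomState(0).randn(500, 16)
labels = np.random.RandomState(1).randint(0, 4, size=500)

v_measures = []
for _ in range(5):  # a handful of bootstrap iterations
    idx = rng.choices(range(len(embeddings)), k=200)
    sample_emb, sample_labels = embeddings[idx], labels[idx]
    model = MiniBatchKMeans(
        n_clusters=np.unique(sample_labels).size,
        batch_size=64,
        init="k-means++",
        n_init=1,
    )
    assignment = model.fit_predict(sample_emb)
    v_measures.append(v_measure_score(sample_labels, assignment))

print(float(np.mean(v_measures)))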
mteb/abstasks/clustering_legacy.py
CHANGED

@@ -8,7 +8,7 @@ from scipy.optimize import linear_sum_assignment
 from sklearn import metrics

 from mteb._evaluators import ClusteringEvaluator
-from mteb.models import EncoderProtocol
+from mteb.models import EncoderProtocol, MTEBModels
 from mteb.types import ScoresDict
 from mteb.types.statistics import (
     ImageStatistics,
@@ -80,7 +80,7 @@ class AbsTaskClusteringLegacy(AbsTask):

     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: Dataset,
         *,
         encode_kwargs: dict[str, Any],
@@ -89,6 +89,9 @@ class AbsTaskClusteringLegacy(AbsTask):
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> ScoresDict:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         data_split = data_split.select_columns(
             [self.input_column_name, self.label_column_name]
         )
@@ -139,9 +142,6 @@ class AbsTaskClusteringLegacy(AbsTask):
             }
             return scores

-        data_split = data_split.select_columns(
-            [self.input_column_name, self.label_column_name]
-        )
         evaluator = self.evaluator(
             data_split,
             input_column_name=self.input_column_name,
@@ -151,10 +151,10 @@ class AbsTaskClusteringLegacy(AbsTask):
             hf_subset=hf_subset,
             **kwargs,
         )
-
+        evaluate_clusters = evaluator(model, encode_kwargs=encode_kwargs)
         if prediction_folder:
             self._save_task_predictions(
-
+                evaluate_clusters,
                 model,
                 prediction_folder,
                 hf_subset=hf_subset,
@@ -163,7 +163,7 @@ class AbsTaskClusteringLegacy(AbsTask):

         return self._compute_metrics(
             data_split[self.label_column_name],
-
+            evaluate_clusters,
         )

     def _compute_metrics(
mteb/abstasks/image/image_text_pair_classification.py
CHANGED

@@ -12,7 +12,7 @@ from mteb.abstasks._statistics_calculation import (
     calculate_text_statistics,
 )
 from mteb.abstasks.abstask import AbsTask
-from mteb.models.models_protocols import EncoderProtocol
+from mteb.models.models_protocols import EncoderProtocol, MTEBModels
 from mteb.types.statistics import (
     ImageStatistics,
     SplitDescriptiveStatistics,
@@ -116,7 +116,7 @@ class AbsTaskImageTextPairClassification(AbsTask):

     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: Dataset,
         *,
         encode_kwargs: dict[str, Any],
@@ -125,6 +125,8 @@ class AbsTaskImageTextPairClassification(AbsTask):
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> ImageTextPairClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
         select_columns = []
         for columns in (self.images_column_names, self.texts_column_names):
             if isinstance(columns, str):
@@ -154,7 +156,7 @@ class AbsTaskImageTextPairClassification(AbsTask):
             hf_subset=hf_subset,
             **kwargs,
         )
-        scores = evaluator(model, encode_kwargs=encode_kwargs)
+        scores: list[torch.Tensor] = evaluator(model, encode_kwargs=encode_kwargs)  # type: ignore[assignment]
         if prediction_folder:
             self._save_task_predictions(
                 [score.tolist() for score in scores],
mteb/abstasks/multilabel_classification.py
CHANGED

@@ -16,7 +16,8 @@ from typing_extensions import override
 from mteb._create_dataloaders import create_dataloader
 from mteb._evaluators.classification_metrics import hamming_score
 from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
-from mteb.models import EncoderProtocol
+from mteb.models import EncoderProtocol, MTEBModels
+from mteb.types import Array

 from .classification import AbsTaskClassification

@@ -24,14 +25,14 @@ logger = logging.getLogger(__name__)


 def _evaluate_classifier(
-    embeddings_train:
+    embeddings_train: Array,
     y_train: np.ndarray,
-    embeddings_test:
+    embeddings_test: Array,
     classifier: SklearnModelProtocol,
 ) -> tuple[np.ndarray, SklearnModelProtocol]:
-
-
-    return
+    classifier_copy: SklearnModelProtocol = clone(classifier)
+    classifier_copy.fit(embeddings_train, y_train)
+    return classifier_copy.predict(embeddings_test), classifier_copy


 class MultilabelClassificationMetrics(TypedDict):
@@ -72,14 +73,14 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
         evaluator: Classifier to use for evaluation. Must implement the SklearnModelProtocol.
     """

-    evaluator: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5)
+    evaluator: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5)  # type: ignore[assignment]
     input_column_name: str = "text"
     label_column_name: str = "label"

     @override
-    def _evaluate_subset(
+    def _evaluate_subset(  # type: ignore[override]
         self,
-        model:
+        model: MTEBModels,
         data_split: DatasetDict,
         *,
         encode_kwargs: dict[str, Any],
@@ -88,6 +89,9 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> FullMultilabelClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         if isinstance(data_split, DatasetDict):
             data_split = data_split.select_columns(
                 [self.input_column_name, self.label_column_name]
@@ -185,19 +189,20 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
         )

         avg_scores: dict[str, Any] = {
-            k: np.mean([s[k] for s in scores])
+            k: np.mean([s[k] for s in scores])  # type: ignore[literal-required]
+            for k in scores[0].keys()
         }
         logger.info("Running multilabel classification - Finished.")
         return FullMultilabelClassificationMetrics(
             scores_per_experiment=scores,
-            **avg_scores,
+            **avg_scores,  # type: ignore[typeddict-item]
         )

-    def _calculate_scores(
+    def _calculate_scores(  # type: ignore[override]
         self,
         y_test: np.ndarray,
         y_pred: np.ndarray,
-        x_test_embedding:
+        x_test_embedding: Array,
         current_classifier: SklearnModelProtocol,
     ) -> MultilabelClassificationMetrics:
         accuracy = current_classifier.score(x_test_embedding, y_test)
@@ -232,10 +237,9 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
         """
         sample_indices = []
         if idxs is None:
-            idxs = np.arange(len(y))
+            idxs = list(np.arange(len(y)))
         self.np_rng.shuffle(idxs)
-
-        label_counter = defaultdict(int)
+        label_counter: dict[int, int] = defaultdict(int)
         for i in idxs:
             if any((label_counter[label] < samples_per_label) for label in y[i]):
                 sample_indices.append(i)
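As filled in above, _evaluate_classifier clones the configured sklearn estimator, fits the clone on the train embeddings, and predicts on the test embeddings, so the shared default classifier is never mutated. A minimal sketch of that pattern on a synthetic multilabel toy problem (the data and shapes are invented; the estimator mirrors the KNeighborsClassifier default shown above):

import numpy as np
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

rs = np.random.RandomState(0)
X_train, X_test = rs.randn(100, 8), rs.randn(20, 8)
# Multilabel targets: each row is a binary indicator vector over 3 labels.
y_train = rs.randint(0, 2, size=(100, 3))

classifier = KNeighborsClassifier(n_neighbors=5)

# Same pattern as _evaluate_classifier: fit a clone, never the shared instance.
fitted = clone(classifier)
fitted.fit(X_train, y_train)
predictions = fitted.predict(X_test)
print(predictions.shape)  # (20, 3)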
mteb/abstasks/pair_classification.py
CHANGED

@@ -18,7 +18,7 @@ from mteb.abstasks._statistics_calculation import (
 )
 from mteb.abstasks.abstask import AbsTask
 from mteb.models.model_meta import ScoringFunction
-from mteb.models.models_protocols import EncoderProtocol
+from mteb.models.models_protocols import EncoderProtocol, MTEBModels
 from mteb.types import PromptType
 from mteb.types.statistics import (
     ImageStatistics,
@@ -44,8 +44,8 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics):
     """

     num_samples: int
-    number_of_characters: int
-    unique_pairs: int
+    number_of_characters: int | None
+    unique_pairs: int | None

     text1_statistics: TextStatistics | None
     image1_statistics: ImageStatistics | None
@@ -79,7 +79,7 @@ class AbsTaskPairClassification(AbsTask):

     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: Dataset,
         *,
         hf_split: str,
@@ -88,6 +88,9 @@ class AbsTaskPairClassification(AbsTask):
         prediction_folder: Path | None = None,
         **kwargs,
     ) -> dict[str, float]:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         if self.metadata.modalities == ["text"]:
             # for compatibility with v1 version where datasets were stored in a single row
             data_split = data_split[0] if len(data_split) == 1 else data_split
@@ -120,7 +123,7 @@ class AbsTaskPairClassification(AbsTask):
         self, similarity_scores: PairClassificationDistances, labels: list[int]
     ) -> dict[str, float]:
         logger.info("Computing metrics...")
-
+        np_labels = np.asarray(labels)
         output_scores = {}
         max_scores = defaultdict(list)
         for short_name, scores, reverse in [
@@ -142,7 +145,7 @@ class AbsTaskPairClassification(AbsTask):
             ],
             [ScoringFunction.DOT_PRODUCT.value, similarity_scores["dot_scores"], True],
         ]:
-            metrics = self._compute_metrics_values(scores,
+            metrics = self._compute_metrics_values(scores, np_labels, reverse)  # type: ignore[arg-type]
             for metric_name, metric_value in metrics.items():
                 output_scores[f"{short_name}_{metric_name}"] = metric_value
                 max_scores[metric_name].append(metric_value)
@@ -237,6 +240,12 @@ class AbsTaskPairClassification(AbsTask):

     def _push_dataset_to_hub(self, repo_name: str) -> None:
         # previously pair classification datasets were stored in a single row
+        if self.dataset is None:
+            # overall this shouldn't happen as we check for dataset before pushing to hub
+            # added here for type checking purposes
+            raise RuntimeError(
+                "Dataset not loaded. To load dataset run `task.load_data()`."
+            )
         if self.metadata.is_multilingual:
             for subset in self.dataset:
                 for split in self.dataset[subset]:
@@ -290,13 +299,13 @@ class AbsTaskPairClassification(AbsTask):
         )

     def _find_best_acc_and_threshold(
-        self, scores:
+        self, scores: list[float], labels: np.ndarray, high_score_more_similar: bool
     ) -> tuple[float, float]:
         rows = list(zip(scores, labels))
         rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)

         max_acc = 0
-        best_threshold = -1
+        best_threshold = -1.0
         positive_so_far = 0
         remaining_negatives = sum(np.array(labels) == 0)

@@ -323,7 +332,7 @@ class AbsTaskPairClassification(AbsTask):

         rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)

-        best_f1 = best_precision = best_recall = 0
+        best_f1 = best_precision = best_recall = 0.0
         threshold = 0
         nextract = 0
         ncorrect = 0
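The threshold search retyped above sorts pairs by similarity score and keeps the cut-off that maximizes accuracy, with everything above the threshold predicted as a positive pair. A standalone, simplified sketch of that sweep on toy scores and labels (this is an illustration of the idea, not mteb's exact method, which also handles the descending/ascending flag and returns both values):

import numpy as np

scores = [0.9, 0.8, 0.6, 0.4, 0.2]   # similarity scores, higher = more similar
labels = np.array([1, 1, 0, 1, 0])   # gold pair labels

rows = sorted(zip(scores, labels), key=lambda x: x[0], reverse=True)

max_acc, best_threshold = 0.0, -1.0
positives_so_far = 0
remaining_negatives = int(np.sum(labels == 0))

for i in range(len(rows) - 1):
    _, label = rows[i]
    positives_so_far += label == 1
    remaining_negatives -= label == 0
    # Pairs above the threshold count as positive, pairs below as negative.
    acc = (positives_so_far + remaining_negatives) / len(rows)
    if acc > max_acc:
        max_acc = acc
        best_threshold = (rows[i][0] + rows[i + 1][0]) / 2

print(max_acc, best_threshold)  # 0.8, 0.7 for this toy data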