mteb 2.5.2__py3-none-any.whl → 2.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +10 -15
- mteb/_evaluators/any_sts_evaluator.py +1 -4
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +17 -16
- mteb/_evaluators/sklearn_evaluator.py +9 -8
- mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
- mteb/_evaluators/text/summarization_evaluator.py +20 -16
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +33 -27
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +7 -26
- mteb/abstasks/classification.py +10 -4
- mteb/abstasks/clustering.py +18 -14
- mteb/abstasks/clustering_legacy.py +8 -8
- mteb/abstasks/image/image_text_pair_classification.py +5 -3
- mteb/abstasks/multilabel_classification.py +20 -16
- mteb/abstasks/pair_classification.py +18 -9
- mteb/abstasks/regression.py +3 -3
- mteb/abstasks/retrieval.py +12 -9
- mteb/abstasks/sts.py +6 -3
- mteb/abstasks/task_metadata.py +22 -19
- mteb/abstasks/text/bitext_mining.py +36 -25
- mteb/abstasks/text/reranking.py +7 -5
- mteb/abstasks/text/summarization.py +8 -3
- mteb/abstasks/zeroshot_classification.py +5 -2
- mteb/benchmarks/benchmark.py +2 -2
- mteb/cache.py +27 -22
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +15 -10
- mteb/cli/generate_model_card.py +10 -7
- mteb/deprecated_evaluator.py +60 -46
- mteb/evaluate.py +39 -30
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +1 -1
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +7 -5
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +8 -1
- mteb/models/instruct_wrapper.py +11 -5
- mteb/models/model_implementations/andersborges.py +2 -2
- mteb/models/model_implementations/blip_models.py +8 -8
- mteb/models/model_implementations/bm25.py +1 -1
- mteb/models/model_implementations/clip_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/dino_models.py +23 -23
- mteb/models/model_implementations/emillykkejensen_models.py +3 -3
- mteb/models/model_implementations/gme_v_models.py +4 -3
- mteb/models/model_implementations/jina_clip.py +1 -1
- mteb/models/model_implementations/jina_models.py +1 -1
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +3 -3
- mteb/models/model_implementations/mcinext_models.py +4 -1
- mteb/models/model_implementations/moco_models.py +2 -2
- mteb/models/model_implementations/model2vec_models.py +1 -1
- mteb/models/model_implementations/nomic_models.py +8 -8
- mteb/models/model_implementations/openclip_models.py +7 -7
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -1
- mteb/models/model_implementations/repllama_models.py +2 -2
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
- mteb/models/model_implementations/siglip_models.py +10 -10
- mteb/models/model_implementations/vlm2vec_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +4 -4
- mteb/models/model_meta.py +14 -13
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
- mteb/models/search_wrappers.py +26 -12
- mteb/models/sentence_transformer_wrapper.py +19 -14
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +28 -20
- mteb/results/model_result.py +52 -22
- mteb/results/task_result.py +55 -58
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/METADATA +1 -1
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/RECORD +104 -103
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/WHEEL +0 -0
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/top_level.txt +0 -0
mteb/_create_dataloaders.py
CHANGED
@@ -1,4 +1,5 @@
 import logging
+import warnings
 from collections.abc import Callable
 from typing import Any, cast

@@ -113,11 +114,8 @@ def _create_text_dataloader_for_queries(
     )


-_warned_about_user_role = False
-
-
 def _convert_conv_history_to_query(
-    row: dict[str, list[str] | Conversation],
+    row: dict[str, str | list[str] | Conversation],
 ) -> dict[str, str | Conversation]:
     """Convert a conversation history to a single query string.

@@ -127,21 +125,18 @@ def _convert_conv_history_to_query(
     Returns:
         The updated row with the "query" and "text" fields set to the conversation string, and the "conversation" field set to the list of ConversationTurn.
     """
-    global _warned_about_user_role
-
     conversation = row["text"]
     # if it's a list of strings, just join them
     if isinstance(conversation, list) and isinstance(conversation[0], str):
-
-        conv_str = "; ".join(
+        conversation_ = cast(list[str], conversation)
+        conv_str = "; ".join(conversation_)
         current_conversation = [
-            ConversationTurn(role="user", content=message) for message in
+            ConversationTurn(role="user", content=message) for message in conversation_
         ]
-
-
-
-
-            _warned_about_user_role = True
+        warnings.warn(
+            "Conversations are a list of strings. Used 'user' role for all turns.",
+            category=UserWarning,
+        )
    # otherwise, it's a list of dictionaries, which we need to convert to strings
    elif isinstance(conversation, list) and isinstance(conversation[0], dict):
        conv = []
@@ -178,7 +173,7 @@ def _convert_conv_history_to_query(

     row["text"] = conv_str
     row["conversation"] = current_conversation
-    return row
+    return cast(dict[str, str | list[ConversationTurn]], row)


 def _create_dataloader_for_queries_conversation(
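The hunk above swaps a module-level `_warned_about_user_role` flag for a plain warnings.warn call. A minimal sketch (not mteb code; the function name is invented) of why that is usually enough: Python's default warning filter already de-duplicates an identical warning raised from the same location, so repeated calls only print it once.

import warnings


def join_conversation(turns: list[str]) -> str:
    # the "default" warning filter shows a given warning only once per call
    # site, so no manual "already warned" flag is needed
    warnings.warn(
        "Conversations are a list of strings. Used 'user' role for all turns.",
        category=UserWarning,
    )
    return "; ".join(turns)


if __name__ == "__main__":
    for _ in range(3):
        join_conversation(["hi", "how are you?"])  # the warning prints once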
mteb/_evaluators/any_sts_evaluator.py
CHANGED
@@ -57,10 +57,7 @@ class AnySTSEvaluator(Evaluator):
         self.input2_prompt_type = input2_prompt_type

     def __call__(
-        self,
-        model: EncoderProtocol,
-        *,
-        encode_kwargs: dict[str, Any],
+        self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
     ) -> STSEvaluatorScores:
         logger.info("Running semantic similarity - Encoding samples (1/2)")
         embeddings1 = model.encode(
mteb/_evaluators/evaluator.py
CHANGED
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from collections.abc import Iterable, Mapping
 from typing import Any

 from mteb.abstasks.abstask import _set_seed
@@ -18,7 +19,7 @@ class Evaluator(ABC):
     @abstractmethod
     def __call__(
         self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
-    ) ->
+    ) -> Mapping[str, float] | Iterable[Any]:
         """This is called during training to evaluate the model.

         It returns scores.
mteb/_evaluators/image/imagetext_pairclassification_evaluator.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import logging
+from collections.abc import Sequence
 from typing import TYPE_CHECKING, Any

 import torch
@@ -61,8 +62,8 @@ class ImageTextPairClassificationEvaluator(Evaluator):
     def __init__(
         self,
         dataset,
-        images_column_names: str |
-        texts_column_names: str |
+        images_column_names: str | Sequence[str],
+        texts_column_names: str | Sequence[str],
         num_images_per_sample: int,
         num_texts_per_sample: int,
         task_metadata: TaskMetadata,
@@ -82,10 +83,8 @@ class ImageTextPairClassificationEvaluator(Evaluator):
         self.hf_split = hf_split
         self.hf_subset = hf_subset

-    def __call__(
-        self,
-        model: EncoderProtocol,
-        encode_kwargs: dict[str, Any],
+    def __call__(  # type: ignore[override]
+        self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
     ) -> list[torch.Tensor]:
         images = []
         if isinstance(self.images_column_names, str):
mteb/_evaluators/pair_classification_evaluator.py
CHANGED
@@ -148,7 +148,9 @@ class PairClassificationEvaluator(Evaluator):
         hf_subset: str,
         **encode_kwargs: Any,
     ) -> np.ndarray:
-        index_map
+        index_map = {}
+        all_unique_texts: list[str] = []
+        all_texts_indexes = []
         for text in all_texts:
             text_hash = hash(text)
             if text_hash not in index_map:
mteb/_evaluators/retrieval_metrics.py
CHANGED
@@ -1,5 +1,6 @@
 import logging
 from collections import defaultdict
+from collections.abc import Mapping
 from typing import Any

 import numpy as np
@@ -15,7 +16,7 @@ logger = logging.getLogger(__name__)

 def mrr(
     qrels: RelevantDocumentsType,
-    results:
+    results: Mapping[str, Mapping[str, float]],
     k_values: list[int],
 ) -> dict[str, list[float]]:
     mrr_metrics = defaultdict(list)
@@ -32,7 +33,7 @@ def mrr(
             doc_id for doc_id in qrels[query_id] if qrels[query_id][doc_id] > 0
         }
         for k in k_values:
-            rr = 0
+            rr = 0.0
             for rank, hit in enumerate(top_hits[query_id][0:k]):
                 if hit[0] in query_relevant_docs:
                     rr = 1.0 / (rank + 1)
@@ -45,8 +46,8 @@ def recall_cap(
     qrels: RelevantDocumentsType,
     results: dict[str, dict[str, float]],
     k_values: list[int],
-) -> dict[str, list[float]]:
-    capped_recall = defaultdict(list)
+) -> dict[str, list[float | None]]:
+    capped_recall: dict[str, list[float | None]] = defaultdict(list)

     k_max = max(k_values)

@@ -188,7 +189,7 @@ def evaluate_p_mrr_change(
     Returns:
         A dictionary with the scores, including "p-MRR", "og" and "changed" keys.
     """
-    followir_scores = defaultdict(dict)
+    followir_scores: dict[str, float | dict[str, float]] = defaultdict(dict)

     qrels_sep = {
         "og": {k: v for k, v in qrels.items() if k.endswith("-og")},
@@ -227,7 +228,7 @@ def evaluate_p_mrr_change(
             ndcg, _map, recall, precision, naucs, avg_mrr, naucs_mrr, cv_recall, {}
         )
         for key, value in scores_dict.items():
-            followir_scores[name][key] = value
+            followir_scores[name][key] = value  # type: ignore[index]

     return followir_scores

@@ -254,8 +255,8 @@ def confidence_scores(sim_scores: list[float]) -> dict[str, float]:
     sim_scores_sorted = sorted(sim_scores)[::-1]

     cs_max = sim_scores_sorted[0]
-    cs_std = np.std(sim_scores)
-    cs_diff1 =
+    cs_std = float(np.std(sim_scores))
+    cs_diff1 = 0.0
     if len(sim_scores) > 1:
         cs_diff1 = sim_scores_sorted[0] - sim_scores_sorted[1]
     elif len(sim_scores) == 1:
@@ -410,7 +411,7 @@ def make_score_dict(
     cv_recall: dict[str, float],
     task_scores: dict[str, float],
     previous_results_model_meta: dict[str, Any] | None = None,
-) -> dict[str,
+) -> dict[str, Any]:
    return {
        **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()},
        **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()},
@@ -528,7 +529,7 @@ def max_over_subqueries(


 def calculate_retrieval_scores(
-    results:
+    results: Mapping[str, Mapping[str, float]],
     qrels: RelevantDocumentsType,
     k_values: list[int],
     skip_first_result: bool = False,
@@ -576,7 +577,7 @@ def calculate_retrieval_scores(


 def evaluate_abstention(
-    results:
+    results: Mapping[str, Mapping[str, float]],
     metric_scores: dict[str, list[float]],
 ) -> dict[str, float]:
     """Computes normalized Area Under the Curve on a set of evaluated instances as presented in the paper https://arxiv.org/abs/2402.12997
@@ -591,21 +592,21 @@ def evaluate_abstention(
     all_sim_scores = [list(results[qid].values()) for qid in list(results.keys())]
     all_conf_scores = [confidence_scores(sim_scores) for sim_scores in all_sim_scores]
     conf_fcts = list(all_conf_scores[0].keys())
-
+    all_conf_scores_ = {
         fct: np.array([x[fct] for x in all_conf_scores]) for fct in conf_fcts
     }
-
+    metric_scores_ = {k: np.array(v) for k, v in metric_scores.items()}
     naucs = {}

-    for metric_name, scores in
-        for fct, conf_scores in
+    for metric_name, scores in metric_scores_.items():
+        for fct, conf_scores in all_conf_scores_.items():
            naucs[f"nAUC_{metric_name}_{fct}"] = nauc(conf_scores, scores)

     return naucs


 def calculate_cv_recall(
-    results:
+    results: Mapping[str, Mapping[str, float]],
     qrels: RelevantDocumentsType,
     k_values: list[int],
     skip_first_result: bool = False,
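Several signatures above are widened from concrete dicts to read-only Mapping types. An illustrative sketch of what that buys callers (hypothetical function and data, not the mteb API): any mapping-like object, including an immutable MappingProxyType, is accepted without changing the function body.

from collections.abc import Mapping
from types import MappingProxyType


def best_doc_per_query(results: Mapping[str, Mapping[str, float]]) -> dict[str, str]:
    # pick the highest-scoring document id for every query id
    return {qid: max(docs, key=docs.get) for qid, docs in results.items() if docs}


scores = {"q1": {"d1": 0.9, "d2": 0.4}, "q2": {"d3": 0.7}}
print(best_doc_per_query(scores))                    # {'q1': 'd1', 'q2': 'd3'}
print(best_doc_per_query(MappingProxyType(scores)))  # read-only mappings work too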
mteb/_evaluators/sklearn_evaluator.py
CHANGED
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Protocol
+from typing import Any, Protocol, cast

 import numpy as np
 from datasets import Dataset
@@ -9,7 +9,7 @@ from typing_extensions import Self
 from mteb._create_dataloaders import create_dataloader
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
-from mteb.types import BatchedInput
+from mteb.types import Array, BatchedInput

 from .evaluator import Evaluator

@@ -17,11 +17,11 @@ logger = logging.getLogger(__name__)


 class SklearnModelProtocol(Protocol):
-    def fit(self, X:
-    def predict(self, X:
+    def fit(self, X: Array, y: np.ndarray | list[int]) -> None: ...  # noqa: N803
+    def predict(self, X: Array) -> np.ndarray: ...  # noqa: N803
     def get_params(self) -> dict[str, Any]: ...
-    def set_params(self, **kwargs: dict[str, Any]) -> Self: ...
-    def score(self, X:
+    def set_params(self, random_state: int, **kwargs: dict[str, Any]) -> Self: ...
+    def score(self, X: Array, y: np.ndarray | list[int]) -> float: ...  # noqa: N803


 class SklearnEvaluator(Evaluator):
@@ -71,8 +71,8 @@ class SklearnEvaluator(Evaluator):
         model: EncoderProtocol,
         *,
         encode_kwargs: dict[str, Any],
-        test_cache:
-    ) -> tuple[np.ndarray,
+        test_cache: Array | None = None,
+    ) -> tuple[np.ndarray, Array]:
         """Classification evaluation by training a sklearn classifier on the embeddings of the training set and evaluating on the embeddings of the test set.

         Args:
@@ -104,6 +104,7 @@ class SklearnEvaluator(Evaluator):
             hf_subset=self.hf_subset,
             **encode_kwargs,
         )
+        test_cache = cast(Array, test_cache)

         logger.info("Running - Fitting classifier...")
         y_train = self.train_dataset[self.label_column_name]
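SklearnModelProtocol above is a structural Protocol rather than a base class. A rough sketch under that assumption (simplified signatures, illustrative names, not the mteb definitions): any scikit-learn style estimator exposing the matching methods satisfies such a protocol without inheriting from anything.

from typing import Any, Protocol

import numpy as np
from sklearn.linear_model import LogisticRegression


class ClassifierLike(Protocol):
    # simplified stand-in for a fit/predict/score protocol
    def fit(self, X: np.ndarray, y: np.ndarray) -> Any: ...
    def predict(self, X: np.ndarray) -> np.ndarray: ...
    def score(self, X: np.ndarray, y: np.ndarray) -> float: ...


def train_and_score(clf: ClassifierLike, X: np.ndarray, y: np.ndarray) -> float:
    clf.fit(X, y)
    return clf.score(X, y)


X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])
print(train_and_score(LogisticRegression(), X, y))  # accuracy on the training data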
mteb/_evaluators/text/bitext_mining_evaluator.py
CHANGED
@@ -1,7 +1,6 @@
 import logging
 from typing import Any

-import numpy as np
 import torch
 from datasets import Dataset
 from tqdm.auto import tqdm
@@ -10,6 +9,7 @@ from mteb._create_dataloaders import _create_dataloader_from_texts
 from mteb._evaluators.evaluator import Evaluator
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
+from mteb.types import Array

 logger = logging.getLogger(__name__)

@@ -69,11 +69,11 @@ class BitextMiningEvaluator(Evaluator):

     def _similarity_search(
         self,
-        query_embeddings:
-        corpus_embeddings:
+        query_embeddings: Array,
+        corpus_embeddings: Array,
         model: EncoderProtocol,
         query_chunk_size: int = 100,
-        corpus_chunk_size: int =
+        corpus_chunk_size: int = 500_000,
     ) -> list[dict[str, float]]:
         """This function performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings.

@@ -104,13 +104,15 @@ class BitextMiningEvaluator(Evaluator):
         ):
             query_embeddings = query_embeddings.to(corpus_embeddings.device)

-        queries_result_list
+        queries_result_list: list[list[dict[str, float]]] = [
+            [] for _ in range(len(query_embeddings))
+        ]

         for query_start_idx in range(0, len(query_embeddings), query_chunk_size):
             # Iterate over chunks of the corpus
             for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
                 # Compute cosine similarities
-                similarity_scores = model.similarity(
+                similarity_scores = model.similarity(
                     query_embeddings[
                         query_start_idx : query_start_idx + query_chunk_size
                     ],
@@ -120,15 +122,17 @@ class BitextMiningEvaluator(Evaluator):
                 )

                 # Get top-k scores
-
-                torch.
-
-
-
-
+                cos_scores_top_k_values_tensor, cos_scores_top_k_idx_tensor = (
+                    torch.topk(
+                        torch.tensor(similarity_scores),
+                        1,
+                        dim=1,
+                        largest=True,
+                        sorted=False,
+                    )
                 )
-                cos_scores_top_k_values =
-                cos_scores_top_k_idx =
+                cos_scores_top_k_values = cos_scores_top_k_values_tensor.cpu().tolist()
+                cos_scores_top_k_idx = cos_scores_top_k_idx_tensor.cpu().tolist()

                 for query_itr in range(len(similarity_scores)):
                     for sub_corpus_id, score in zip(
@@ -141,11 +145,14 @@ class BitextMiningEvaluator(Evaluator):
                             {"corpus_id": corpus_id, "score": score}
                         )

+        result_queries_list: list[dict[str, float]] = [
+            {} for _ in range(len(query_embeddings))
+        ]
         # Sort and strip to top_k results
         for idx in range(len(queries_result_list)):
             queries_result_list[idx] = sorted(
                 queries_result_list[idx], key=lambda x: x["score"], reverse=True
             )
-
+            result_queries_list[idx] = queries_result_list[idx][0]

-        return
+        return result_queries_list
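The reworked _similarity_search keeps only the best corpus match per query by running torch.topk with k=1 over each corpus chunk. A self-contained sketch of that chunked top-1 pattern (random data and chunk sizes here are invented; only the topk call mirrors the diff):

import torch

queries = torch.nn.functional.normalize(torch.randn(8, 16), dim=1)
corpus = torch.nn.functional.normalize(torch.randn(1000, 16), dim=1)

best_scores, best_ids = [], []
for start in range(0, len(corpus), 256):  # iterate over corpus chunks
    sims = queries @ corpus[start : start + 256].T  # cosine similarity
    values, idx = torch.topk(sims, 1, dim=1, largest=True, sorted=False)
    best_scores.append(values)
    best_ids.append(idx + start)  # shift chunk-local indices back to the full corpus

# keep the single best corpus entry per query across all chunks
scores = torch.cat(best_scores, dim=1)
ids = torch.cat(best_ids, dim=1)
winner = scores.argmax(dim=1, keepdim=True)
print(torch.gather(ids, 1, winner).squeeze(1))  # one corpus index per query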
mteb/_evaluators/text/summarization_evaluator.py
CHANGED
@@ -135,10 +135,10 @@ class SummarizationEvaluator(Evaluator):
         )

         # Split the embeddings into the original human & machine summaries
-
+        embs_human_summaries_all_split = np.split(
             embs_human_summaries_all, np.cumsum(human_lens)[:-1]
         )
-
+        embs_machine_summaries_all_split = np.split(
             embs_machine_summaries_all, np.cumsum(machine_lens)[:-1]
         )

@@ -148,7 +148,9 @@
         all_human_scores = []

         for i, (embs_human_summaries, embs_machine_summaries) in tqdm(
-            enumerate(
+            enumerate(
+                zip(embs_human_summaries_all_split, embs_machine_summaries_all_split)
+            ),
             desc="Scoring",
             total=len(self.human_summaries),
         ):
@@ -164,7 +166,7 @@
             dot_scores = dot_score(emb_machine_summary, embs_human_summaries)

             _sim_score = [
-                float(model.similarity(emb_machine_summary, emb_human_summary))
+                float(model.similarity(emb_machine_summary, emb_human_summary))
                 for emb_human_summary in embs_human_summaries
             ]
             sim_score = torch.tensor(_sim_score)
@@ -216,17 +218,19 @@
             strict=True,
         ):
             cosine_spearman_scores.append(
-                spearmanr(human_scores, cosine_pred_scores).statistic
+                float(spearmanr(human_scores, cosine_pred_scores).statistic)
             )
             cosine_pearson_scores.append(
-                pearsonr(human_scores, cosine_pred_scores).statistic
+                float(pearsonr(human_scores, cosine_pred_scores).statistic)
             )
             dot_spearman_scores.append(
-                spearmanr(human_scores, dot_pred_scores).statistic
+                float(spearmanr(human_scores, dot_pred_scores).statistic)
+            )
+            dot_pearson_scores.append(
+                float(pearsonr(human_scores, dot_pred_scores).statistic)
             )
-
-
-            pearson_scores.append(pearsonr(human_scores, sim_scores).statistic)
+            spearman_scores.append(float(spearmanr(human_scores, sim_scores).statistic))
+            pearson_scores.append(float(pearsonr(human_scores, sim_scores).statistic))

         return SummarizationMetrics(
             pearson=float(np.mean(pearson_scores)),
@@ -273,10 +277,10 @@ class DeprecatedSummarizationEvaluator(SummarizationEvaluator):
             pearson_scores.append(pearsonr(human_scores, sim_scores))

         return SummarizationMetrics(
-            pearson=float(np.mean(pearson_scores)),
-            spearman=float(np.mean(spearman_scores)),
-            cosine_spearman=float(np.mean(cosine_spearman_scores)),
-            cosine_pearson=float(np.mean(cosine_pearson_scores)),
-            dot_pearson=float(np.mean(dot_pearson_scores)),
-            dot_spearman=float(np.mean(dot_spearman_scores)),
+            pearson=float(np.mean(pearson_scores)),  # type: ignore[arg-type]
+            spearman=float(np.mean(spearman_scores)),  # type: ignore[arg-type]
+            cosine_spearman=float(np.mean(cosine_spearman_scores)),  # type: ignore[arg-type]
+            cosine_pearson=float(np.mean(cosine_pearson_scores)),  # type: ignore[arg-type]
+            dot_pearson=float(np.mean(dot_pearson_scores)),  # type: ignore[arg-type]
+            dot_spearman=float(np.mean(dot_spearman_scores)),  # type: ignore[arg-type]
         )
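The summarization hunks name the results of np.split over np.cumsum(...)[:-1], which cuts a stacked embedding matrix back into per-example groups. A small sketch of that pattern with made-up sizes:

import numpy as np

lens = [2, 3, 1]                       # summaries per example
flat = np.arange(6 * 4).reshape(6, 4)  # 6 embeddings of dimension 4, stacked

# np.cumsum(lens)[:-1] gives the row offsets [2, 5] where each group ends
groups = np.split(flat, np.cumsum(lens)[:-1])
for size, group in zip(lens, groups):
    assert group.shape == (size, 4)
print([g.shape for g in groups])  # [(2, 4), (3, 4), (1, 4)]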
mteb/abstasks/_data_filter/filters.py
CHANGED
@@ -61,7 +61,7 @@ def filter_unclear_label(
     for text, label in zip(ds[input_column], ds[label_column]):
         key = text.strip().lower()
         normalized.setdefault(key, set()).add(
-            label if isinstance(label, (str, int, float)) else tuple(label)
+            label if isinstance(label, (str, int, float)) else tuple(label)  # type: ignore[arg-type]
         )

     bad_texts = {t for t, labels in normalized.items() if len(labels) > 1}
mteb/abstasks/_statistics_calculation.py
CHANGED
@@ -2,7 +2,8 @@ from __future__ import annotations

 import hashlib
 from collections import Counter
-from
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, cast

 from mteb.types import TopRankedDocumentsType
 from mteb.types.statistics import (
@@ -52,7 +53,7 @@ def calculate_image_statistics(images: list[Image.Image]) -> ImageStatistics:
     seen_hashes: set[str] = set()

     for img in images:
-        width, height = img.size
+        width, height = img.size
         img_heights.append(height)
         img_widths.append(width)

@@ -82,17 +83,24 @@ def calculate_label_statistics(labels: list[int | list[int]]) -> LabelStatistics
         LabelStatistics: A dictionary containing the descriptive statistics.

     """
+    total_labels: list[int | None] = []
+
     if not isinstance(labels[0], list):
-
-
-
+        # single label classification
+        single_label = cast(list[int], labels)
+        label_len = [1] * len(single_label)
+        total_label_len = len(single_label)
+        total_labels.extend(single_label)
     elif isinstance(labels[0], list):
         # multilabel classification
-
+        multilabel_labels = cast(list[list[int]], labels)
+        label_len = [len(l) for l in multilabel_labels]
         total_label_len = sum(label_len)
-
-
-
+        for l in multilabel_labels:
+            if l and len(l) > 0:
+                total_labels.extend(l)
+            else:
+                total_labels.append(None)
     else:
         raise ValueError(
             "Labels must be a list of integers or a list of lists of integers."
@@ -159,7 +167,7 @@ def calculate_top_ranked_statistics(


 def calculate_relevant_docs_statistics(
-    relevant_docs:
+    relevant_docs: Mapping[str, Mapping[str, int]],
 ) -> RelevantDocsStatistics:
     qrels_lengths = [len(relevant_docs[qid]) for qid in relevant_docs]
     unique_qrels = len({doc for qid in relevant_docs for doc in relevant_docs[qid]})
mteb/abstasks/_stratification.py
CHANGED
@@ -39,6 +39,7 @@ Bibtex:
 """

 import itertools
+from typing import Any

 import numpy as np
 import scipy.sparse as sp
@@ -119,8 +120,10 @@ def _get_most_desired_combination(samples_with_combination: dict):
         if support_size == 0:
             continue
         if currently_chosen is None or (
-            best_number_of_combinations
-            and best_support_size
+            best_number_of_combinations is not None
+            and best_support_size is not None
+            and best_number_of_combinations < number_of_combinations
+            and best_support_size > support_size
         ):
             currently_chosen = combination
             best_number_of_combinations, best_support_size = (
@@ -162,7 +165,7 @@ class IterativeStratification(_BaseKFold):
         self._rng_state = check_random_state(random_state)
         need_shuffle = shuffle or random_state is not None
         self.order = order
-        super().__init__(
+        super().__init__(
             n_splits,
             shuffle=need_shuffle,
             random_state=self._rng_state if need_shuffle else None,
@@ -172,8 +175,7 @@ class IterativeStratification(_BaseKFold):
             self.percentage_per_fold = sample_distribution_per_fold
         else:
             self.percentage_per_fold = [
-                1 / float(self.n_splits)
-                for _ in range(self.n_splits)  # type: ignore
+                1 / float(self.n_splits) for _ in range(self.n_splits)
             ]

     def _prepare_stratification(
@@ -182,9 +184,9 @@ class IterativeStratification(_BaseKFold):
         list[list[int]],
         dict[int, bool],
         list[list[int]],
-        list[list[
-        dict[
-        list[list[
+        list[list[Any]],
+        dict[str, list[Any]],
+        list[list[Any]],
     ]:
         """Prepares variables for performing stratification

@@ -206,14 +208,14 @@ class IterativeStratification(_BaseKFold):
         """
         self.n_samples, self.n_labels = y.shape
         self.desired_samples_per_fold = np.array(
-            [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]
+            [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]
         )
         rows = sp.lil_matrix(y).rows
         rows_used = dict.fromkeys(range(self.n_samples), False)
         all_combinations = []
-        per_row_combinations = [[] for i in range(self.n_samples)]
-        samples_with_combination = {}
-        folds = [[] for _ in range(self.n_splits)]
+        per_row_combinations: list[list[Any]] = [[] for i in range(self.n_samples)]
+        samples_with_combination: dict[str, list[Any]] = {}
+        folds: list[list[int]] = [[] for _ in range(self.n_splits)]

         # for every row
         for sample_index, label_assignment in enumerate(rows):
@@ -229,21 +231,19 @@ class IterativeStratification(_BaseKFold):
             all_combinations.append(combination)
             per_row_combinations[sample_index].append(combination)

-        all_combinations = [list(x) for x in set(all_combinations)]
-
         self.desired_samples_per_combination_per_fold = {
             combination: np.array(
                 [
                     len(evidence_for_combination) * self.percentage_per_fold[j]
-                    for j in range(self.n_splits)
+                    for j in range(self.n_splits)
                 ]
             )
             for combination, evidence_for_combination in samples_with_combination.items()
         }
         return (
-            rows,
+            rows.tolist(),
             rows_used,
-            all_combinations,
+            [list(x) for x in set(all_combinations)],
             per_row_combinations,
             samples_with_combination,
             folds,
@@ -328,7 +328,7 @@ class IterativeStratification(_BaseKFold):
             per_row_combinations,
             samples_with_combination,
             folds,
-        ) = self._prepare_stratification(y)
+        ) = self._prepare_stratification(y)

         self._distribute_positive_evidence(
             rows_used, folds, samples_with_combination, per_row_combinations