mteb 2.5.3__py3-none-any.whl → 2.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. mteb/_create_dataloaders.py +10 -15
  2. mteb/_evaluators/any_sts_evaluator.py +1 -4
  3. mteb/_evaluators/evaluator.py +2 -1
  4. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
  5. mteb/_evaluators/pair_classification_evaluator.py +3 -1
  6. mteb/_evaluators/retrieval_metrics.py +17 -16
  7. mteb/_evaluators/sklearn_evaluator.py +9 -8
  8. mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
  9. mteb/_evaluators/text/summarization_evaluator.py +20 -16
  10. mteb/abstasks/_data_filter/filters.py +1 -1
  11. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  12. mteb/abstasks/_statistics_calculation.py +18 -10
  13. mteb/abstasks/_stratification.py +18 -18
  14. mteb/abstasks/abstask.py +27 -21
  15. mteb/abstasks/aggregate_task_metadata.py +1 -9
  16. mteb/abstasks/aggregated_task.py +3 -16
  17. mteb/abstasks/classification.py +10 -4
  18. mteb/abstasks/clustering.py +18 -14
  19. mteb/abstasks/clustering_legacy.py +8 -8
  20. mteb/abstasks/image/image_text_pair_classification.py +5 -3
  21. mteb/abstasks/multilabel_classification.py +20 -16
  22. mteb/abstasks/pair_classification.py +18 -9
  23. mteb/abstasks/regression.py +3 -3
  24. mteb/abstasks/retrieval.py +12 -9
  25. mteb/abstasks/sts.py +6 -3
  26. mteb/abstasks/task_metadata.py +20 -16
  27. mteb/abstasks/text/bitext_mining.py +36 -25
  28. mteb/abstasks/text/reranking.py +7 -5
  29. mteb/abstasks/text/summarization.py +8 -3
  30. mteb/abstasks/zeroshot_classification.py +5 -2
  31. mteb/benchmarks/benchmark.py +2 -2
  32. mteb/cache.py +20 -18
  33. mteb/cli/_display_tasks.py +2 -2
  34. mteb/cli/build_cli.py +5 -5
  35. mteb/cli/generate_model_card.py +6 -4
  36. mteb/deprecated_evaluator.py +56 -43
  37. mteb/evaluate.py +35 -29
  38. mteb/filter_tasks.py +25 -26
  39. mteb/get_tasks.py +25 -27
  40. mteb/languages/language_scripts.py +5 -3
  41. mteb/leaderboard/app.py +1 -1
  42. mteb/load_results.py +12 -12
  43. mteb/models/abs_encoder.py +2 -2
  44. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  45. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  46. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +2 -1
  47. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +30 -13
  48. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  49. mteb/models/get_model_meta.py +8 -1
  50. mteb/models/instruct_wrapper.py +11 -5
  51. mteb/models/model_implementations/andersborges.py +2 -2
  52. mteb/models/model_implementations/blip_models.py +8 -8
  53. mteb/models/model_implementations/bm25.py +1 -1
  54. mteb/models/model_implementations/clip_models.py +3 -3
  55. mteb/models/model_implementations/cohere_models.py +1 -1
  56. mteb/models/model_implementations/cohere_v.py +2 -2
  57. mteb/models/model_implementations/dino_models.py +23 -23
  58. mteb/models/model_implementations/emillykkejensen_models.py +3 -3
  59. mteb/models/model_implementations/jina_clip.py +1 -1
  60. mteb/models/model_implementations/jina_models.py +1 -1
  61. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
  62. mteb/models/model_implementations/llm2clip_models.py +3 -3
  63. mteb/models/model_implementations/moco_models.py +2 -2
  64. mteb/models/model_implementations/model2vec_models.py +1 -1
  65. mteb/models/model_implementations/nomic_models.py +8 -8
  66. mteb/models/model_implementations/openclip_models.py +7 -7
  67. mteb/models/model_implementations/random_baseline.py +3 -3
  68. mteb/models/model_implementations/rasgaard_models.py +1 -1
  69. mteb/models/model_implementations/repllama_models.py +2 -2
  70. mteb/models/model_implementations/rerankers_custom.py +3 -3
  71. mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
  72. mteb/models/model_implementations/siglip_models.py +10 -10
  73. mteb/models/model_implementations/vlm2vec_models.py +1 -1
  74. mteb/models/model_implementations/voyage_v.py +4 -4
  75. mteb/models/model_meta.py +11 -12
  76. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +5 -5
  77. mteb/models/search_wrappers.py +22 -10
  78. mteb/models/sentence_transformer_wrapper.py +9 -4
  79. mteb/py.typed +0 -0
  80. mteb/results/benchmark_results.py +25 -19
  81. mteb/results/model_result.py +49 -21
  82. mteb/results/task_result.py +45 -51
  83. mteb/similarity_functions.py +11 -7
  84. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  85. mteb/tasks/classification/est/estonian_valence.py +1 -1
  86. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  87. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  88. mteb/tasks/retrieval/code/code_rag.py +12 -12
  89. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  90. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  91. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  92. mteb/tasks/retrieval/nob/norquad.py +2 -2
  93. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  94. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  95. mteb/types/_result.py +2 -1
  96. mteb/types/statistics.py +9 -3
  97. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/METADATA +1 -1
  98. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/RECORD +102 -101
  99. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/WHEEL +0 -0
  100. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/entry_points.txt +0 -0
  101. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/licenses/LICENSE +0 -0
  102. {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/top_level.txt +0 -0
mteb/_create_dataloaders.py
@@ -1,4 +1,5 @@
 import logging
+import warnings
 from collections.abc import Callable
 from typing import Any, cast

@@ -113,11 +114,8 @@ def _create_text_dataloader_for_queries(
 )


-_warned_about_user_role = False
-
-
 def _convert_conv_history_to_query(
-    row: dict[str, list[str] | Conversation],
+    row: dict[str, str | list[str] | Conversation],
 ) -> dict[str, str | Conversation]:
     """Convert a conversation history to a single query string.

@@ -127,21 +125,18 @@ def _convert_conv_history_to_query(
     Returns:
         The updated row with the "query" and "text" fields set to the conversation string, and the "conversation" field set to the list of ConversationTurn.
     """
-    global _warned_about_user_role
-
     conversation = row["text"]
     # if it's a list of strings, just join them
     if isinstance(conversation, list) and isinstance(conversation[0], str):
-        conversation = cast(list[str], conversation)
-        conv_str = "; ".join(conversation)
+        conversation_ = cast(list[str], conversation)
+        conv_str = "; ".join(conversation_)
         current_conversation = [
-            ConversationTurn(role="user", content=message) for message in conversation
+            ConversationTurn(role="user", content=message) for message in conversation_
         ]
-        if not _warned_about_user_role:
-            logger.warning(
-                "Conversations are a list of strings. Used 'user' role for all turns."
-            )
-            _warned_about_user_role = True
+        warnings.warn(
+            "Conversations are a list of strings. Used 'user' role for all turns.",
+            category=UserWarning,
+        )
     # otherwise, it's a list of dictionaries, which we need to convert to strings
     elif isinstance(conversation, list) and isinstance(conversation[0], dict):
         conv = []
@@ -178,7 +173,7 @@ def _convert_conv_history_to_query(

     row["text"] = conv_str
     row["conversation"] = current_conversation
-    return row
+    return cast(dict[str, str | list[ConversationTurn]], row)


 def _create_dataloader_for_queries_conversation(
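
Note on the change above: the module-level `_warned_about_user_role` flag is replaced with `warnings.warn`, which delegates de-duplication to Python's warning filters (the default filter reports a given warning only once per call site). The sketch below only illustrates that standard-library behaviour; `join_turns` is a hypothetical helper, not part of mteb.

    import warnings


    def join_turns(turns: list[str]) -> str:
        # Under the default warning filter this UserWarning is shown only once
        # per call site (message, category, module, lineno), so repeated calls
        # do not flood the output.
        warnings.warn(
            "Conversations are a list of strings. Used 'user' role for all turns.",
            category=UserWarning,
        )
        return "; ".join(turns)


    for query in (["hi", "how are you?"], ["find the docs"]):
        print(join_turns(query))

Callers that accept the all-'user' fallback can silence it with `warnings.filterwarnings("ignore", message="Conversations are a list of strings.*")`.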
mteb/_evaluators/any_sts_evaluator.py
@@ -57,10 +57,7 @@ class AnySTSEvaluator(Evaluator):
         self.input2_prompt_type = input2_prompt_type

     def __call__(
-        self,
-        model: EncoderProtocol,
-        *,
-        encode_kwargs: dict[str, Any],
+        self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
     ) -> STSEvaluatorScores:
         logger.info("Running semantic similarity - Encoding samples (1/2)")
         embeddings1 = model.encode(
mteb/_evaluators/evaluator.py
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from collections.abc import Iterable, Mapping
 from typing import Any

 from mteb.abstasks.abstask import _set_seed
@@ -18,7 +19,7 @@ class Evaluator(ABC):
     @abstractmethod
     def __call__(
         self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
-    ) -> dict[str, float]:
+    ) -> Mapping[str, float] | Iterable[Any]:
         """This is called during training to evaluate the model.

         It returns scores.
mteb/_evaluators/image/imagetext_pairclassification_evaluator.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import logging
+from collections.abc import Sequence
 from typing import TYPE_CHECKING, Any

 import torch
@@ -61,8 +62,8 @@ class ImageTextPairClassificationEvaluator(Evaluator):
     def __init__(
         self,
         dataset,
-        images_column_names: str | list[str],
-        texts_column_names: str | list[str],
+        images_column_names: str | Sequence[str],
+        texts_column_names: str | Sequence[str],
         num_images_per_sample: int,
         num_texts_per_sample: int,
         task_metadata: TaskMetadata,
@@ -82,10 +83,8 @@ class ImageTextPairClassificationEvaluator(Evaluator):
         self.hf_split = hf_split
         self.hf_subset = hf_subset

-    def __call__(
-        self,
-        model: EncoderProtocol,
-        encode_kwargs: dict[str, Any],
+    def __call__(  # type: ignore[override]
+        self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
     ) -> list[torch.Tensor]:
         images = []
         if isinstance(self.images_column_names, str):
mteb/_evaluators/pair_classification_evaluator.py
@@ -148,7 +148,9 @@ class PairClassificationEvaluator(Evaluator):
         hf_subset: str,
         **encode_kwargs: Any,
     ) -> np.ndarray:
-        index_map, all_unique_texts, all_texts_indexes = {}, [], []
+        index_map = {}
+        all_unique_texts: list[str] = []
+        all_texts_indexes = []
         for text in all_texts:
             text_hash = hash(text)
             if text_hash not in index_map:
mteb/_evaluators/retrieval_metrics.py
@@ -1,5 +1,6 @@
 import logging
 from collections import defaultdict
+from collections.abc import Mapping
 from typing import Any

 import numpy as np
@@ -15,7 +16,7 @@ logger = logging.getLogger(__name__)

 def mrr(
     qrels: RelevantDocumentsType,
-    results: dict[str, dict[str, float]],
+    results: Mapping[str, Mapping[str, float]],
     k_values: list[int],
 ) -> dict[str, list[float]]:
     mrr_metrics = defaultdict(list)
@@ -32,7 +33,7 @@ def mrr(
             doc_id for doc_id in qrels[query_id] if qrels[query_id][doc_id] > 0
         }
         for k in k_values:
-            rr = 0
+            rr = 0.0
            for rank, hit in enumerate(top_hits[query_id][0:k]):
                if hit[0] in query_relevant_docs:
                    rr = 1.0 / (rank + 1)
@@ -45,8 +46,8 @@ def recall_cap(
     qrels: RelevantDocumentsType,
     results: dict[str, dict[str, float]],
     k_values: list[int],
-) -> dict[str, list[float]]:
-    capped_recall = defaultdict(list)
+) -> dict[str, list[float | None]]:
+    capped_recall: dict[str, list[float | None]] = defaultdict(list)

     k_max = max(k_values)

@@ -188,7 +189,7 @@ def evaluate_p_mrr_change(
     Returns:
         A dictionary with the scores, including "p-MRR", "og" and "changed" keys.
     """
-    followir_scores = defaultdict(dict)
+    followir_scores: dict[str, float | dict[str, float]] = defaultdict(dict)

     qrels_sep = {
         "og": {k: v for k, v in qrels.items() if k.endswith("-og")},
@@ -227,7 +228,7 @@ def evaluate_p_mrr_change(
             ndcg, _map, recall, precision, naucs, avg_mrr, naucs_mrr, cv_recall, {}
         )
         for key, value in scores_dict.items():
-            followir_scores[name][key] = value
+            followir_scores[name][key] = value  # type: ignore[index]

     return followir_scores

@@ -254,8 +255,8 @@ def confidence_scores(sim_scores: list[float]) -> dict[str, float]:
     sim_scores_sorted = sorted(sim_scores)[::-1]

     cs_max = sim_scores_sorted[0]
-    cs_std = np.std(sim_scores)
-    cs_diff1 = None
+    cs_std = float(np.std(sim_scores))
+    cs_diff1 = 0.0
     if len(sim_scores) > 1:
         cs_diff1 = sim_scores_sorted[0] - sim_scores_sorted[1]
     elif len(sim_scores) == 1:
@@ -410,7 +411,7 @@ def make_score_dict(
     cv_recall: dict[str, float],
     task_scores: dict[str, float],
     previous_results_model_meta: dict[str, Any] | None = None,
-) -> dict[str, float]:
+) -> dict[str, Any]:
     return {
         **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()},
         **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()},
@@ -528,7 +529,7 @@ def max_over_subqueries(


 def calculate_retrieval_scores(
-    results: dict[str, dict[str, float]],
+    results: Mapping[str, Mapping[str, float]],
     qrels: RelevantDocumentsType,
     k_values: list[int],
     skip_first_result: bool = False,
@@ -576,7 +577,7 @@ def calculate_retrieval_scores(


 def evaluate_abstention(
-    results: dict[str, dict[str, float]],
+    results: Mapping[str, Mapping[str, float]],
     metric_scores: dict[str, list[float]],
 ) -> dict[str, float]:
     """Computes normalized Area Under the Curve on a set of evaluated instances as presented in the paper https://arxiv.org/abs/2402.12997
@@ -591,21 +592,21 @@ def evaluate_abstention(
     all_sim_scores = [list(results[qid].values()) for qid in list(results.keys())]
     all_conf_scores = [confidence_scores(sim_scores) for sim_scores in all_sim_scores]
     conf_fcts = list(all_conf_scores[0].keys())
-    all_conf_scores = {
+    all_conf_scores_ = {
         fct: np.array([x[fct] for x in all_conf_scores]) for fct in conf_fcts
     }
-    metric_scores = {k: np.array(v) for k, v in metric_scores.items()}
+    metric_scores_ = {k: np.array(v) for k, v in metric_scores.items()}
     naucs = {}

-    for metric_name, scores in metric_scores.items():
-        for fct, conf_scores in all_conf_scores.items():
+    for metric_name, scores in metric_scores_.items():
+        for fct, conf_scores in all_conf_scores_.items():
             naucs[f"nAUC_{metric_name}_{fct}"] = nauc(conf_scores, scores)

     return naucs


 def calculate_cv_recall(
-    results: dict[str, dict[str, float]],
+    results: Mapping[str, Mapping[str, float]],
     qrels: RelevantDocumentsType,
     k_values: list[int],
     skip_first_result: bool = False,
mteb/_evaluators/sklearn_evaluator.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Protocol
+from typing import Any, Protocol, cast

 import numpy as np
 from datasets import Dataset
@@ -9,7 +9,7 @@ from typing_extensions import Self
 from mteb._create_dataloaders import create_dataloader
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
-from mteb.types import BatchedInput
+from mteb.types import Array, BatchedInput

 from .evaluator import Evaluator

@@ -17,11 +17,11 @@ logger = logging.getLogger(__name__)


 class SklearnModelProtocol(Protocol):
-    def fit(self, X: np.ndarray, y: np.ndarray | list[int]) -> None: ...  # noqa: N803
-    def predict(self, X: np.ndarray) -> np.ndarray: ...  # noqa: N803
+    def fit(self, X: Array, y: np.ndarray | list[int]) -> None: ...  # noqa: N803
+    def predict(self, X: Array) -> np.ndarray: ...  # noqa: N803
     def get_params(self) -> dict[str, Any]: ...
-    def set_params(self, **kwargs: dict[str, Any]) -> Self: ...
-    def score(self, X: np.ndarray, y: np.ndarray | list[int]) -> float: ...  # noqa: N803
+    def set_params(self, random_state: int, **kwargs: dict[str, Any]) -> Self: ...
+    def score(self, X: Array, y: np.ndarray | list[int]) -> float: ...  # noqa: N803


 class SklearnEvaluator(Evaluator):
@@ -71,8 +71,8 @@ class SklearnEvaluator(Evaluator):
         model: EncoderProtocol,
         *,
         encode_kwargs: dict[str, Any],
-        test_cache: np.ndarray | None = None,
-    ) -> tuple[np.ndarray, np.ndarray]:
+        test_cache: Array | None = None,
+    ) -> tuple[np.ndarray, Array]:
         """Classification evaluation by training a sklearn classifier on the embeddings of the training set and evaluating on the embeddings of the test set.

         Args:
@@ -104,6 +104,7 @@ class SklearnEvaluator(Evaluator):
             hf_subset=self.hf_subset,
             **encode_kwargs,
         )
+        test_cache = cast(Array, test_cache)

         logger.info("Running - Fitting classifier...")
         y_train = self.train_dataset[self.label_column_name]
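
For readers unfamiliar with `typing.Protocol`: `SklearnModelProtocol` is matched structurally, so any estimator-like object with compatible method signatures satisfies it without inheriting from it. Below is a minimal sketch with a trimmed two-method protocol; `MajorityClassifier` is a made-up class, not an mteb or scikit-learn one.

    from __future__ import annotations

    from typing import Protocol

    import numpy as np


    class TrimmedModelProtocol(Protocol):
        def fit(self, X: np.ndarray, y: np.ndarray | list[int]) -> None: ...  # noqa: N803
        def predict(self, X: np.ndarray) -> np.ndarray: ...  # noqa: N803


    class MajorityClassifier:
        """Toy classifier that always predicts the most frequent training label."""

        def fit(self, X: np.ndarray, y: np.ndarray | list[int]) -> None:  # noqa: N803
            values, counts = np.unique(np.asarray(y), return_counts=True)
            self._majority = values[np.argmax(counts)]

        def predict(self, X: np.ndarray) -> np.ndarray:  # noqa: N803
            return np.full(len(X), self._majority)


    # Accepted by a type checker purely because the method signatures match.
    clf: TrimmedModelProtocol = MajorityClassifier()
    clf.fit(np.zeros((4, 2)), [1, 0, 1, 1])
    print(clf.predict(np.zeros((3, 2))))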
mteb/_evaluators/text/bitext_mining_evaluator.py
@@ -1,7 +1,6 @@
 import logging
 from typing import Any

-import numpy as np
 import torch
 from datasets import Dataset
 from tqdm.auto import tqdm
@@ -10,6 +9,7 @@ from mteb._create_dataloaders import _create_dataloader_from_texts
 from mteb._evaluators.evaluator import Evaluator
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
+from mteb.types import Array

 logger = logging.getLogger(__name__)

@@ -69,11 +69,11 @@ class BitextMiningEvaluator(Evaluator):

     def _similarity_search(
         self,
-        query_embeddings: np.ndarray,
-        corpus_embeddings: np.ndarray,
+        query_embeddings: Array,
+        corpus_embeddings: Array,
         model: EncoderProtocol,
         query_chunk_size: int = 100,
-        corpus_chunk_size: int = 500000,
+        corpus_chunk_size: int = 500_000,
     ) -> list[dict[str, float]]:
         """This function performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings.

@@ -104,13 +104,15 @@ class BitextMiningEvaluator(Evaluator):
         ):
             query_embeddings = query_embeddings.to(corpus_embeddings.device)

-        queries_result_list = [[] for _ in range(len(query_embeddings))]
+        queries_result_list: list[list[dict[str, float]]] = [
+            [] for _ in range(len(query_embeddings))
+        ]

         for query_start_idx in range(0, len(query_embeddings), query_chunk_size):
             # Iterate over chunks of the corpus
             for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
                 # Compute cosine similarities
-                similarity_scores = model.similarity(  # type: ignore
+                similarity_scores = model.similarity(
                     query_embeddings[
                         query_start_idx : query_start_idx + query_chunk_size
                     ],
@@ -120,15 +122,17 @@ class BitextMiningEvaluator(Evaluator):
                 )

                 # Get top-k scores
-                cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
-                    torch.tensor(similarity_scores),
-                    1,
-                    dim=1,
-                    largest=True,
-                    sorted=False,
+                cos_scores_top_k_values_tensor, cos_scores_top_k_idx_tensor = (
+                    torch.topk(
+                        torch.tensor(similarity_scores),
+                        1,
+                        dim=1,
+                        largest=True,
+                        sorted=False,
+                    )
                 )
-                cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
-                cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()
+                cos_scores_top_k_values = cos_scores_top_k_values_tensor.cpu().tolist()
+                cos_scores_top_k_idx = cos_scores_top_k_idx_tensor.cpu().tolist()

                 for query_itr in range(len(similarity_scores)):
                     for sub_corpus_id, score in zip(
@@ -141,11 +145,14 @@
                             {"corpus_id": corpus_id, "score": score}
                         )

+        result_queries_list: list[dict[str, float]] = [
+            {} for _ in range(len(query_embeddings))
+        ]
         # Sort and strip to top_k results
         for idx in range(len(queries_result_list)):
             queries_result_list[idx] = sorted(
                 queries_result_list[idx], key=lambda x: x["score"], reverse=True
             )
-            queries_result_list[idx] = queries_result_list[idx][0]
+            result_queries_list[idx] = queries_result_list[idx][0]

-        return queries_result_list
+        return result_queries_list
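
The reshaped `torch.topk` call above returns a `(values, indices)` pair; with `k=1` along `dim=1` it keeps the single best-scoring corpus entry per query row. A standalone sketch with made-up scores:

    import torch

    # Made-up 2 x 3 similarity matrix: rows are queries, columns are corpus entries.
    scores = torch.tensor([[0.1, 0.9, 0.3],
                           [0.7, 0.2, 0.4]])

    # topk returns (values, indices); k=1 keeps only the best column per row.
    values, indices = torch.topk(scores, 1, dim=1, largest=True, sorted=False)
    print(values.squeeze(1).tolist())   # highest score in each row
    print(indices.squeeze(1).tolist())  # column index of that score, e.g. [1, 0]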
mteb/_evaluators/text/summarization_evaluator.py
@@ -135,10 +135,10 @@
         )

         # Split the embeddings into the original human & machine summaries
-        embs_human_summaries_all = np.split(
+        embs_human_summaries_all_split = np.split(
             embs_human_summaries_all, np.cumsum(human_lens)[:-1]
         )
-        embs_machine_summaries_all = np.split(
+        embs_machine_summaries_all_split = np.split(
             embs_machine_summaries_all, np.cumsum(machine_lens)[:-1]
         )

@@ -148,7 +148,9 @@ class SummarizationEvaluator(Evaluator):
         all_human_scores = []

         for i, (embs_human_summaries, embs_machine_summaries) in tqdm(
-            enumerate(zip(embs_human_summaries_all, embs_machine_summaries_all)),
+            enumerate(
+                zip(embs_human_summaries_all_split, embs_machine_summaries_all_split)
+            ),
             desc="Scoring",
             total=len(self.human_summaries),
         ):
@@ -164,7 +166,7 @@ class SummarizationEvaluator(Evaluator):
                 dot_scores = dot_score(emb_machine_summary, embs_human_summaries)

                 _sim_score = [
-                    float(model.similarity(emb_machine_summary, emb_human_summary))  # type: ignore
+                    float(model.similarity(emb_machine_summary, emb_human_summary))
                     for emb_human_summary in embs_human_summaries
                 ]
                 sim_score = torch.tensor(_sim_score)
@@ -216,17 +218,19 @@ class SummarizationEvaluator(Evaluator):
             strict=True,
         ):
             cosine_spearman_scores.append(
-                spearmanr(human_scores, cosine_pred_scores).statistic
+                float(spearmanr(human_scores, cosine_pred_scores).statistic)
             )
             cosine_pearson_scores.append(
-                pearsonr(human_scores, cosine_pred_scores).statistic
+                float(pearsonr(human_scores, cosine_pred_scores).statistic)
             )
             dot_spearman_scores.append(
-                spearmanr(human_scores, dot_pred_scores).statistic
+                float(spearmanr(human_scores, dot_pred_scores).statistic)
+            )
+            dot_pearson_scores.append(
+                float(pearsonr(human_scores, dot_pred_scores).statistic)
             )
-            dot_pearson_scores.append(pearsonr(human_scores, dot_pred_scores).statistic)
-            spearman_scores.append(spearmanr(human_scores, sim_scores).statistic)
-            pearson_scores.append(pearsonr(human_scores, sim_scores).statistic)
+            spearman_scores.append(float(spearmanr(human_scores, sim_scores).statistic))
+            pearson_scores.append(float(pearsonr(human_scores, sim_scores).statistic))

         return SummarizationMetrics(
             pearson=float(np.mean(pearson_scores)),
@@ -273,10 +277,10 @@ class DeprecatedSummarizationEvaluator(SummarizationEvaluator):
             pearson_scores.append(pearsonr(human_scores, sim_scores))

         return SummarizationMetrics(
-            pearson=float(np.mean(pearson_scores)),
-            spearman=float(np.mean(spearman_scores)),
-            cosine_spearman=float(np.mean(cosine_spearman_scores)),
-            cosine_pearson=float(np.mean(cosine_pearson_scores)),
-            dot_pearson=float(np.mean(dot_pearson_scores)),
-            dot_spearman=float(np.mean(dot_spearman_scores)),
+            pearson=float(np.mean(pearson_scores)),  # type: ignore[arg-type]
+            spearman=float(np.mean(spearman_scores)),  # type: ignore[arg-type]
+            cosine_spearman=float(np.mean(cosine_spearman_scores)),  # type: ignore[arg-type]
+            cosine_pearson=float(np.mean(cosine_pearson_scores)),  # type: ignore[arg-type]
+            dot_pearson=float(np.mean(dot_pearson_scores)),  # type: ignore[arg-type]
+            dot_spearman=float(np.mean(dot_spearman_scores)),  # type: ignore[arg-type]
         )
mteb/abstasks/_data_filter/filters.py
@@ -61,7 +61,7 @@ def filter_unclear_label(
     for text, label in zip(ds[input_column], ds[label_column]):
         key = text.strip().lower()
         normalized.setdefault(key, set()).add(
-            label if isinstance(label, (str, int, float)) else tuple(label)
+            label if isinstance(label, (str, int, float)) else tuple(label)  # type: ignore[arg-type]
         )

     bad_texts = {t for t, labels in normalized.items() if len(labels) > 1}
mteb/abstasks/_data_filter/task_pipelines.py
@@ -89,6 +89,9 @@ def process_classification(
         subset=None,
     )

+    if task.dataset is None:
+        raise ValueError("Task dataset is None.")
+
     new_ds = {}
     for subset in task.dataset:
         new_ds[subset] = clean_dataset(
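
The added `task.dataset is None` guard is the usual fail-fast way to narrow an optional attribute before iterating over it. A minimal sketch under assumed types (`Task` here is a stand-in dataclass, not the mteb class):

    from __future__ import annotations

    from dataclasses import dataclass


    @dataclass
    class Task:
        dataset: dict[str, list[str]] | None = None


    def subset_names(task: Task) -> list[str]:
        # Raising when the attribute is None fails fast and lets a type checker
        # treat task.dataset as a plain dict for the rest of the function.
        if task.dataset is None:
            raise ValueError("Task dataset is None.")
        return list(task.dataset)


    print(subset_names(Task(dataset={"train": ["a"], "test": ["b"]})))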
mteb/abstasks/_statistics_calculation.py
@@ -2,7 +2,8 @@ from __future__ import annotations

 import hashlib
 from collections import Counter
-from typing import TYPE_CHECKING
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, cast

 from mteb.types import TopRankedDocumentsType
 from mteb.types.statistics import (
@@ -52,7 +53,7 @@ def calculate_image_statistics(images: list[Image.Image]) -> ImageStatistics:
     seen_hashes: set[str] = set()

     for img in images:
-        width, height = img.size  # type: ignore
+        width, height = img.size
         img_heights.append(height)
         img_widths.append(width)

@@ -82,17 +83,24 @@ def calculate_label_statistics(labels: list[int | list[int]]) -> LabelStatistics
         LabelStatistics: A dictionary containing the descriptive statistics.

     """
+    total_labels: list[int | None] = []
+
     if not isinstance(labels[0], list):
-        label_len = [1] * len(labels)
-        total_label_len = len(labels)
-        total_labels = labels
+        # single label classification
+        single_label = cast(list[int], labels)
+        label_len = [1] * len(single_label)
+        total_label_len = len(single_label)
+        total_labels.extend(single_label)
     elif isinstance(labels[0], list):
         # multilabel classification
-        label_len = [len(l) for l in labels]
+        multilabel_labels = cast(list[list[int]], labels)
+        label_len = [len(l) for l in multilabel_labels]
         total_label_len = sum(label_len)
-        total_labels = []
-        for l in labels:
-            total_labels.extend(l if len(l) > 0 else [None])
+        for l in multilabel_labels:
+            if l and len(l) > 0:
+                total_labels.extend(l)
+            else:
+                total_labels.append(None)
     else:
         raise ValueError(
             "Labels must be a list of integers or a list of lists of integers."
@@ -159,7 +167,7 @@ def calculate_top_ranked_statistics(


 def calculate_relevant_docs_statistics(
-    relevant_docs: dict[str, dict[str, float]],
+    relevant_docs: Mapping[str, Mapping[str, int]],
 ) -> RelevantDocsStatistics:
     qrels_lengths = [len(relevant_docs[qid]) for qid in relevant_docs]
     unique_qrels = len({doc for qid in relevant_docs for doc in relevant_docs[qid]})
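
The reworked `calculate_label_statistics` above relies on `typing.cast` to tell the type checker which branch of the `list[int | list[int]]` union applies after the `isinstance` check; `cast` is a no-op at runtime. A minimal sketch (`label_lengths` is a hypothetical helper, not the mteb function):

    from typing import cast


    def label_lengths(labels: list[int | list[int]]) -> list[int]:
        # cast() does nothing at runtime; it only narrows the union for the type
        # checker after isinstance() has already distinguished the two cases.
        if isinstance(labels[0], list):
            multilabel = cast(list[list[int]], labels)
            return [len(row) for row in multilabel]
        single = cast(list[int], labels)
        return [1] * len(single)


    print(label_lengths([1, 0, 2]))          # [1, 1, 1]
    print(label_lengths([[1, 2], [], [3]]))  # [2, 0, 1]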
mteb/abstasks/_stratification.py
@@ -39,6 +39,7 @@ Bibtex:
 """

 import itertools
+from typing import Any

 import numpy as np
 import scipy.sparse as sp
@@ -119,8 +120,10 @@ def _get_most_desired_combination(samples_with_combination: dict):
         if support_size == 0:
             continue
         if currently_chosen is None or (
-            best_number_of_combinations < number_of_combinations  # type: ignore
-            and best_support_size > support_size  # type: ignore
+            best_number_of_combinations is not None
+            and best_support_size is not None
+            and best_number_of_combinations < number_of_combinations
+            and best_support_size > support_size
         ):
             currently_chosen = combination
             best_number_of_combinations, best_support_size = (
@@ -162,7 +165,7 @@ class IterativeStratification(_BaseKFold):
         self._rng_state = check_random_state(random_state)
         need_shuffle = shuffle or random_state is not None
         self.order = order
-        super().__init__(  # type: ignore
+        super().__init__(
             n_splits,
             shuffle=need_shuffle,
             random_state=self._rng_state if need_shuffle else None,
@@ -172,8 +175,7 @@ class IterativeStratification(_BaseKFold):
             self.percentage_per_fold = sample_distribution_per_fold
         else:
             self.percentage_per_fold = [
-                1 / float(self.n_splits)
-                for _ in range(self.n_splits)  # type: ignore
+                1 / float(self.n_splits) for _ in range(self.n_splits)
             ]

     def _prepare_stratification(
@@ -182,9 +184,9 @@
         list[list[int]],
         dict[int, bool],
         list[list[int]],
-        list[list[list[int]]],
-        dict[tuple[int, ...], list[int]],
-        list[list[int]],
+        list[list[Any]],
+        dict[str, list[Any]],
+        list[list[Any]],
     ]:
         """Prepares variables for performing stratification

@@ -206,14 +208,14 @@
         """
         self.n_samples, self.n_labels = y.shape
         self.desired_samples_per_fold = np.array(
-            [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]  # type: ignore
+            [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]
         )
         rows = sp.lil_matrix(y).rows
         rows_used = dict.fromkeys(range(self.n_samples), False)
         all_combinations = []
-        per_row_combinations = [[] for i in range(self.n_samples)]
-        samples_with_combination = {}
-        folds = [[] for _ in range(self.n_splits)]  # type: ignore
+        per_row_combinations: list[list[Any]] = [[] for i in range(self.n_samples)]
+        samples_with_combination: dict[str, list[Any]] = {}
+        folds: list[list[int]] = [[] for _ in range(self.n_splits)]

         # for every row
         for sample_index, label_assignment in enumerate(rows):
@@ -229,21 +231,19 @@
                 all_combinations.append(combination)
                 per_row_combinations[sample_index].append(combination)

-        all_combinations = [list(x) for x in set(all_combinations)]
-
         self.desired_samples_per_combination_per_fold = {
             combination: np.array(
                 [
                     len(evidence_for_combination) * self.percentage_per_fold[j]
-                    for j in range(self.n_splits)  # type: ignore
+                    for j in range(self.n_splits)
                 ]
             )
             for combination, evidence_for_combination in samples_with_combination.items()
         }
         return (
-            rows,
+            rows.tolist(),
             rows_used,
-            all_combinations,
+            [list(x) for x in set(all_combinations)],
             per_row_combinations,
             samples_with_combination,
             folds,
@@ -328,7 +328,7 @@
             per_row_combinations,
             samples_with_combination,
             folds,
-        ) = self._prepare_stratification(y)  # type: ignore
+        ) = self._prepare_stratification(y)

         self._distribute_positive_evidence(
             rows_used, folds, samples_with_combination, per_row_combinations