mteb-2.2.2-py3-none-any.whl → mteb-2.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. mteb/__init__.py +4 -0
  2. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  3. mteb/evaluate.py +38 -7
  4. mteb/models/__init__.py +4 -1
  5. mteb/models/cache_wrappers/__init__.py +2 -1
  6. mteb/models/model_implementations/colpali_models.py +4 -4
  7. mteb/models/model_implementations/colqwen_models.py +206 -2
  8. mteb/models/model_implementations/eagerworks_models.py +163 -0
  9. mteb/models/model_implementations/euler_models.py +25 -0
  10. mteb/models/model_implementations/google_models.py +1 -1
  11. mteb/models/model_implementations/jina_models.py +203 -5
  12. mteb/models/model_implementations/nb_sbert.py +1 -1
  13. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +10 -11
  14. mteb/models/model_implementations/nvidia_models.py +1 -1
  15. mteb/models/model_implementations/ops_moa_models.py +2 -2
  16. mteb/models/model_implementations/promptriever_models.py +4 -4
  17. mteb/models/model_implementations/qwen3_models.py +3 -3
  18. mteb/models/model_implementations/qzhou_models.py +1 -1
  19. mteb/models/model_implementations/random_baseline.py +8 -18
  20. mteb/models/model_implementations/vdr_models.py +1 -0
  21. mteb/models/model_implementations/yuan_models_en.py +57 -0
  22. mteb/models/search_encoder_index/__init__.py +7 -0
  23. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  24. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  25. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  26. mteb/models/search_wrappers.py +157 -41
  27. mteb/results/model_result.py +2 -1
  28. mteb/results/task_result.py +12 -0
  29. mteb/similarity_functions.py +49 -0
  30. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  31. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  32. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  33. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  34. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +3 -3
  35. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/METADATA +6 -1
  36. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/RECORD +40 -31
  37. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/WHEEL +0 -0
  38. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/entry_points.txt +0 -0
  39. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/licenses/LICENSE +0 -0
  40. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/top_level.txt +0 -0
mteb/models/search_wrappers.py

@@ -21,6 +21,7 @@ from mteb.types import (
 )
 
 from .models_protocols import CrossEncoderProtocol, EncoderProtocol
+from .search_encoder_index.search_backend_protocol import IndexEncoderSearchProtocol
 
 logger = logging.getLogger(__name__)
 
@@ -28,13 +29,19 @@ logger = logging.getLogger(__name__)
 class SearchEncoderWrapper:
     """Wrapper for Encoder models to be used in search tasks."""
 
-    corpus_chunk_size = 50_000
     task_corpus: CorpusDatasetType | None
 
-    def __init__(self, model: EncoderProtocol):
+    def __init__(
+        self,
+        model: EncoderProtocol,
+        corpus_chunk_size: int = 50_000,
+        index_backend: IndexEncoderSearchProtocol | None = None,
+    ) -> None:
         self.model = model
         self.task_corpus = None
         self.mteb_model_meta = model.mteb_model_meta
+        self.corpus_chunk_size = corpus_chunk_size
+        self.index_backend = index_backend
 
     def index(
         self,
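The constructor change turns the previously hard-coded class attribute into an injectable parameter and adds an optional ANN backend. A minimal wiring sketch under stated assumptions: the FaissSearchIndex class and its module path appear in this release's file list, but its constructor arguments and the checkpoint name below are illustrative only.

    import mteb
    from mteb.models.search_encoder_index.search_indexes.faiss_search_index import (
        FaissSearchIndex,
    )
    from mteb.models.search_wrappers import SearchEncoderWrapper

    # Any object satisfying EncoderProtocol works; the checkpoint is illustrative.
    model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")

    wrapper = SearchEncoderWrapper(
        model,
        corpus_chunk_size=10_000,          # was a fixed class attribute (50_000)
        index_backend=FaissSearchIndex(),  # assumed no-arg constructor; None keeps exact search
    )

Passing index_backend=None (the default) preserves the previous behavior, so existing callers are unaffected.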
@@ -56,6 +63,22 @@ class SearchEncoderWrapper:
         """
         # Always retain corpus for potential reranking or fallback flows
         self.task_corpus = corpus
+        if self.index_backend is not None:
+            all_doc_embeddings = self.model.encode(
+                create_dataloader(
+                    corpus,
+                    task_metadata,
+                    prompt_type=PromptType.document,
+                    **encode_kwargs,
+                ),
+                task_metadata=task_metadata,
+                hf_split=hf_split,
+                hf_subset=hf_subset,
+                prompt_type=PromptType.document,
+                **encode_kwargs,
+            )
+
+            self.index_backend.add_documents(all_doc_embeddings, corpus["id"])
 
     def search(
         self,
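This hunk, together with the search() changes below, pins down the surface a backend must expose: add_documents, search, and clear. A reconstruction of what IndexEncoderSearchProtocol (from the new mteb/models/search_encoder_index/search_backend_protocol.py) plausibly looks like, inferred purely from these call sites; the method names match the diff, but the parameter and return annotations are assumptions.

    from typing import Callable, Protocol

    from mteb.types import Array


    class IndexEncoderSearchProtocol(Protocol):
        """Surface inferred from the call sites in this diff; not the actual source."""

        def add_documents(self, embeddings: Array, ids: list[str]) -> None:
            """Store document embeddings keyed by corpus ID (called from index())."""
            ...

        def search(
            self,
            query_embeddings: Array,
            top_k: int,
            *,
            similarity_fn: Callable[[Array, Array], Array],
            top_ranked: dict[str, list[str]] | None,
            query_idx_to_id: dict[int, str] | None,
        ) -> tuple[list[list[float]], list[list[int]]]:
            """Return per-query (top-k scores, top-k corpus indices)."""
            ...

        def clear(self) -> None:
            """Release the index once a split has been evaluated."""
            ...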
@@ -105,27 +128,74 @@
 
         if top_ranked is not None:
             logger.info("Reranking pre-ranked documents...")
-            result_heaps = self._rerank_documents(
-                query_idx_to_id=query_idx_to_id,
-                query_embeddings=query_embeddings,
-                top_ranked=top_ranked,
-                top_k=top_k,
-                task_metadata=task_metadata,
-                hf_subset=hf_subset,
-                hf_split=hf_split,
-                encode_kwargs=encode_kwargs,
-            )
+            if self.index_backend is None:
+                result_heaps = self._rerank_documents(
+                    query_idx_to_id=query_idx_to_id,
+                    query_embeddings=query_embeddings,
+                    top_ranked=top_ranked,
+                    top_k=top_k,
+                    task_metadata=task_metadata,
+                    hf_subset=hf_subset,
+                    hf_split=hf_split,
+                    encode_kwargs=encode_kwargs,
+                )
+            else:
+                cos_scores_top_k_values, cos_scores_top_k_idx = (
+                    self.index_backend.search(
+                        query_embeddings,
+                        top_k,
+                        similarity_fn=self.model.similarity,
+                        top_ranked=top_ranked,
+                        query_idx_to_id=query_idx_to_id,
+                    )
+                )
+                result_heaps = {qid: [] for qid in query_idx_to_id.values()}
+                for query_itr in range(len(query_embeddings)):
+                    result_heaps = self._rerank_sort_results(
+                        result_heaps=result_heaps,
+                        query_id=query_idx_to_id[query_itr],
+                        ranked_ids=top_ranked[query_idx_to_id[query_itr]],
+                        scores_top_k_idx=torch.tensor(
+                            [cos_scores_top_k_idx[query_itr]]
+                        ),
+                        scores_top_k_values=torch.tensor(
+                            [cos_scores_top_k_values[query_itr]]
+                        ),
+                    )
+                self.index_backend.clear()
         else:
             logger.info("Performing full corpus search...")
-            result_heaps = self._full_corpus_search(
-                query_idx_to_id=query_idx_to_id,
-                query_embeddings=query_embeddings,
-                task_metadata=task_metadata,
-                hf_subset=hf_subset,
-                hf_split=hf_split,
-                top_k=top_k,
-                encode_kwargs=encode_kwargs,
-            )
+            if self.index_backend is None:
+                result_heaps = self._full_corpus_search(
+                    query_idx_to_id=query_idx_to_id,
+                    query_embeddings=query_embeddings,
+                    task_metadata=task_metadata,
+                    hf_subset=hf_subset,
+                    hf_split=hf_split,
+                    top_k=top_k,
+                    encode_kwargs=encode_kwargs,
+                )
+            else:
+                cos_scores_top_k_values, cos_scores_top_k_idx = (
+                    self.index_backend.search(
+                        query_embeddings,
+                        top_k,
+                        similarity_fn=self.model.similarity,
+                        top_ranked=None,
+                        query_idx_to_id=None,
+                    )
+                )
+                result_heaps = {qid: [] for qid in query_idx_to_id.values()}
+                result_heaps = self._sort_full_corpus_results(
+                    result_heaps=result_heaps,
+                    query_idx_to_id=query_idx_to_id,
+                    query_embeddings=query_embeddings,
+                    cos_scores_top_k_idx=cos_scores_top_k_idx,
+                    cos_scores_top_k_values=cos_scores_top_k_values,
+                    sub_corpus_ids=self.task_corpus["id"],
+                    top_k=top_k,
+                )
+                self.index_backend.clear()
 
         # Reset the task corpus dataloader to None to free up memory
         self.task_corpus = None
@@ -192,19 +262,45 @@
         cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
 
         sub_corpus_ids = list(sub_corpus_ids)
-        for query_itr in range(len(query_embeddings)):
-            query_id = query_idx_to_id[query_itr]
-            for sub_corpus_id, score in zip(
-                cos_scores_top_k_idx[query_itr],
-                cos_scores_top_k_values[query_itr],
-            ):
-                corpus_id = sub_corpus_ids[sub_corpus_id]
-                if len(result_heaps[query_id]) < top_k:
-                    # push item on the heap
-                    heapq.heappush(result_heaps[query_id], (score, corpus_id))
-                else:
-                    # If item is larger than the smallest in the heap, push it on the heap then pop the smallest element
-                    heapq.heappushpop(result_heaps[query_id], (score, corpus_id))
+        result_heaps = self._sort_full_corpus_results(
+            result_heaps=result_heaps,
+            query_idx_to_id=query_idx_to_id,
+            query_embeddings=query_embeddings,
+            cos_scores_top_k_idx=cos_scores_top_k_idx,
+            cos_scores_top_k_values=cos_scores_top_k_values,
+            sub_corpus_ids=sub_corpus_ids,
+            top_k=top_k,
+        )
+        return result_heaps
+
+    def _sort_full_corpus_results(
+        self,
+        result_heaps: dict[str, list[tuple[float, str]]],
+        query_idx_to_id: dict[int, str],
+        query_embeddings: Array,
+        cos_scores_top_k_idx: list[list[int]],
+        cos_scores_top_k_values: list[list[float]],
+        sub_corpus_ids: list[str],
+        top_k: int,
+    ) -> dict[str, list[tuple[float, str]]]:
+        """Maintain a bounded top-k heap of scored documents for each query.
+
+        Returns:
+            A dictionary mapping query IDs to a list of tuples, each containing a relevance score and a document ID.
+        """
+        for query_itr in range(len(query_embeddings)):
+            query_id = query_idx_to_id[query_itr]
+            for sub_corpus_id, score in zip(
+                cos_scores_top_k_idx[query_itr],
+                cos_scores_top_k_values[query_itr],
+            ):
+                corpus_id = sub_corpus_ids[sub_corpus_id]
+                if len(result_heaps[query_id]) < top_k:
+                    # Push the item onto the heap
+                    heapq.heappush(result_heaps[query_id], (score, corpus_id))
+                else:
+                    # If the item is larger than the smallest in the heap, push it, then pop the smallest element
+                    heapq.heappushpop(result_heaps[query_id], (score, corpus_id))
         return result_heaps
 
     def _rerank_documents(
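The extracted helper preserves the existing bounded-heap trick: a min-heap capped at top_k entries, where heappushpop only displaces the current minimum when the incoming score beats it. A standalone illustration of the pattern with toy data (not mteb API):

    import heapq

    top_k = 3
    scored = [("d1", 0.12), ("d2", 0.91), ("d3", 0.55), ("d4", 0.77), ("d5", 0.30)]

    heap: list[tuple[float, str]] = []
    for doc_id, score in scored:
        if len(heap) < top_k:
            heapq.heappush(heap, (score, doc_id))
        else:
            # Pushes, then pops the smallest, so a low score is evicted immediately.
            heapq.heappushpop(heap, (score, doc_id))

    print(sorted(heap, reverse=True))  # [(0.91, 'd2'), (0.77, 'd4'), (0.55, 'd3')]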
@@ -279,14 +375,34 @@
             scores_top_k_values = scores_top_k_values.cpu()
             scores_top_k_idx = scores_top_k_idx.cpu()
 
-            # Build result heap
-            for doc_idx, score in zip(
-                scores_top_k_idx[0].tolist(),
-                scores_top_k_values[0].tolist(),
-            ):
-                corpus_id = ranked_ids[doc_idx]
-                heapq.heappush(result_heaps[query_id], (score, corpus_id))
+            result_heaps = self._rerank_sort_results(
+                result_heaps=result_heaps,
+                query_id=query_id,
+                ranked_ids=ranked_ids,
+                scores_top_k_idx=scores_top_k_idx,
+                scores_top_k_values=scores_top_k_values,
+            )
+        return result_heaps
+
+    def _rerank_sort_results(
+        self,
+        result_heaps: dict[str, list[tuple[float, str]]],
+        query_id: str,
+        ranked_ids: list[str],
+        scores_top_k_idx: torch.Tensor,
+        scores_top_k_values: torch.Tensor,
+    ) -> dict[str, list[tuple[float, str]]]:
+        """Push the scored documents for a single query onto its result heap.
 
+        Returns:
+            The result heaps, updated with (relevance score, document ID) tuples for this query.
+        """
+        for doc_idx, score in zip(
+            scores_top_k_idx[0].tolist(),
+            scores_top_k_values[0].tolist(),
+        ):
+            corpus_id = ranked_ids[doc_idx]
+            heapq.heappush(result_heaps[query_id], (score, corpus_id))
         return result_heaps
 
     def encode(
mteb/results/model_result.py

@@ -22,7 +22,7 @@ from mteb.types import (
     SplitName,
 )
 
-from .task_result import TaskResult
+from .task_result import TaskError, TaskResult
 
 logger = logging.getLogger(__name__)
 
@@ -82,6 +82,7 @@ class ModelResult(BaseModel):
             protected_namespaces=(),
         )
     )
+    exceptions: list[TaskError] | None = None
 
     def __repr__(self) -> str:
         n_entries = len(self.task_results)
mteb/results/task_result.py

@@ -840,3 +840,15 @@ class TaskResult(BaseModel):
             )
         )
         return results
+
+
+class TaskError(BaseModel):
+    """A class to represent an error that occurred during the evaluation of a task.
+
+    Attributes:
+        task_name: The name of the MTEB task.
+        exception: The error message raised during evaluation.
+    """
+
+    task_name: str
+    exception: str
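Together with the new ModelResult.exceptions field above, failed tasks can now travel alongside successful results instead of vanishing from a run. A self-contained sketch of how the record might be built; the failing-evaluation loop is hypothetical, only TaskError and the exceptions field come from this diff.

    from mteb.results.task_result import TaskError

    errors: list[TaskError] = []
    for task_name in ["MultiLongDocReranking", "MIRACLReranking"]:
        try:
            raise RuntimeError("CUDA out of memory")  # stand-in for a failing evaluation
        except Exception as exc:
            errors.append(TaskError(task_name=task_name, exception=str(exc)))

    print(errors[0].task_name, "->", errors[0].exception)
    # A ModelResult built from this run could then carry exceptions=errors.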
mteb/similarity_functions.py

@@ -1,6 +1,7 @@
 import torch
 
 from mteb.models import EncoderProtocol
+from mteb.models.model_meta import ScoringFunction
 from mteb.types import Array
 
 
@@ -38,6 +39,54 @@ def compute_pairwise_similarity(
     return pairwise_cos_sim(embedding1, embedding2)
 
 
+def select_similarity(
+    embedding1: Array,
+    embedding2: Array,
+    similarity_fn: ScoringFunction,
+) -> Array:
+    """Compute similarity between two sets of embeddings using the specified similarity function.
+
+    Args:
+        embedding1: The first set of embeddings.
+        embedding2: The second set of embeddings.
+        similarity_fn: The similarity function to use (COSINE, DOT_PRODUCT, EUCLIDEAN).
+
+    Returns:
+        Array: The computed similarity scores.
+    """
+    if similarity_fn is ScoringFunction.COSINE:
+        return cos_sim(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.DOT_PRODUCT:
+        return dot_score(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.EUCLIDEAN:
+        return euclidean_sim(embedding1, embedding2)
+    raise ValueError(f"Unsupported similarity function: {similarity_fn}")
+
+
+def select_pairwise_similarity(
+    embedding1: Array,
+    embedding2: Array,
+    similarity_fn: ScoringFunction,
+) -> Array:
+    """Compute pairwise similarity between two sets of embeddings using the specified similarity function.
+
+    Args:
+        embedding1: The first set of embeddings.
+        embedding2: The second set of embeddings.
+        similarity_fn: The similarity function to use (COSINE, DOT_PRODUCT, EUCLIDEAN).
+
+    Returns:
+        Array: The computed pairwise similarity scores.
+    """
+    if similarity_fn is ScoringFunction.COSINE:
+        return pairwise_cos_sim(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.DOT_PRODUCT:
+        return pairwise_dot_score(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.EUCLIDEAN:
+        return pairwise_euclidean_sim(embedding1, embedding2)
+    raise ValueError(f"Unsupported similarity function: {similarity_fn}")
+
+
 def _normalize_embeddings(embeddings: Array) -> torch.Tensor:
     """Normalizes the embeddings matrix, so that each sentence embedding has unit length.
 
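Both dispatchers route to the similarity helpers already defined in this module, so a quick smoke test only needs toy tensors. The shape comments assume the usual convention: the matrix variants score every query-document pair, while the pairwise variants score row i against row i.

    import torch

    from mteb.models.model_meta import ScoringFunction
    from mteb.similarity_functions import select_pairwise_similarity, select_similarity

    queries = torch.randn(4, 8)  # 4 embeddings of dimension 8
    docs = torch.randn(6, 8)     # 6 embeddings of dimension 8

    cos = select_similarity(queries, docs, ScoringFunction.COSINE)       # (4, 6) matrix
    dot = select_similarity(queries, docs, ScoringFunction.DOT_PRODUCT)  # (4, 6) matrix

    # Pairwise variants expect equal-length batches.
    pair = select_pairwise_similarity(queries, docs[:4], ScoringFunction.EUCLIDEAN)  # (4,)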
mteb/tasks/reranking/multilingual/__init__.py

@@ -1,6 +1,7 @@
 from .esci_reranking import ESCIReranking
 from .hume_wikipedia_reranking_multilingual import HUMEWikipediaRerankingMultilingual
 from .miracl_reranking import MIRACLReranking
+from .multi_long_doc_reranking import MultiLongDocReranking
 from .wikipedia_reranking_multilingual import WikipediaRerankingMultilingual
 from .x_glue_wpr_reranking import XGlueWPRReranking
 
@@ -8,6 +9,7 @@ __all__ = [
     "ESCIReranking",
     "HUMEWikipediaRerankingMultilingual",
     "MIRACLReranking",
+    "MultiLongDocReranking",
    "WikipediaRerankingMultilingual",
    "XGlueWPRReranking",
 ]
mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py (new file)

@@ -0,0 +1,70 @@
+from mteb.abstasks.retrieval import AbsTaskRetrieval
+from mteb.abstasks.task_metadata import TaskMetadata
+
+
+class MultiLongDocReranking(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="MultiLongDocReranking",
+        description=(
+            "Reranking version of MultiLongDocRetrieval (MLDR). MLDR is a Multilingual Long-Document "
+            "Retrieval dataset built on Wikipedia, Wudao and mC4, covering 13 typologically diverse languages. "
+            "Specifically, we sample lengthy articles from the Wikipedia, Wudao and mC4 datasets and randomly "
+            "choose paragraphs from them. We then use GPT-3.5 to generate questions based on these paragraphs; "
+            "each generated question and its sampled article constitute a new text pair in the dataset."
+        ),
+        reference="https://huggingface.co/datasets/Shitao/MLDR",
+        dataset={
+            "path": "mteb/MultiLongDocReranking",
+            "revision": "ad09ce14c17bce6edae151b7f6ef12e15d91dbf3",
+        },
+        type="Reranking",
+        category="t2t",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs={
+            "ar": ["ara-Arab"],
+            "de": ["deu-Latn"],
+            "en": ["eng-Latn"],
+            "es": ["spa-Latn"],
+            "fr": ["fra-Latn"],
+            "hi": ["hin-Deva"],
+            "it": ["ita-Latn"],
+            "ja": ["jpn-Jpan"],
+            "ko": ["kor-Kore"],
+            "pt": ["por-Latn"],
+            "ru": ["rus-Cyrl"],
+            "th": ["tha-Thai"],
+            "zh": ["zho-Hans"],
+        },
+        main_score="ndcg_at_10",
+        date=(
+            "2000-01-01",
+            "2024-12-31",
+        ),  # Not stated in the paper; estimated from the publication date and constituent datasets
+        domains=[
+            "Encyclopaedic",
+            "Written",
+            "Web",
+            "Non-fiction",
+            "Fiction",
+        ],  # narrativeqa, wikipedia, wudao, mC4
+        task_subtypes=[],
+        license="mit",
+        annotations_creators="LM-generated",  # gpt-3.5
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@misc{bge-m3,
+  archiveprefix = {arXiv},
+  author = {Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
+  eprint = {2402.03216},
+  primaryclass = {cs.CL},
+  title = {BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation},
+  year = {2024},
+}
+""",
+        prompt={
+            "query": "Given a question, rerank long documents based on their relevance to answer the question"
+        },
+        adapted_from=["MultiLongDocRetrieval"],
+    )
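Once exported from the reranking package (see the __init__.py hunk above), the task is selectable by name. A sketch assuming mteb v2's get_tasks/get_model/evaluate entry points; the checkpoint name is illustrative.

    import mteb

    tasks = mteb.get_tasks(tasks=["MultiLongDocReranking"])
    model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")  # illustrative

    # evaluate() lives in mteb/evaluate.py, which this release also touches.
    results = mteb.evaluate(model, tasks=tasks)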
mteb/tasks/retrieval/eng/vidore_bench_retrieval.py

@@ -351,6 +351,7 @@ class VidoreSyntheticDocQAAIRetrieval(AbsTaskRetrieval):
         }
         """,
         prompt={"query": "Find a screenshot that is relevant to the user's question."},
+        adapted_from=["VidoreDocVQARetrieval"],
     )
 
     def load_data(self) -> None:

@@ -394,6 +395,7 @@ class VidoreSyntheticDocQAEnergyRetrieval(AbsTaskRetrieval):
         }
         """,
         prompt={"query": "Find a screenshot that is relevant to the user's question."},
+        adapted_from=["VidoreDocVQARetrieval"],
     )
 
     def load_data(self) -> None:

@@ -437,6 +439,7 @@ class VidoreSyntheticDocQAGovernmentReportsRetrieval(AbsTaskRetrieval):
         }
         """,
         prompt={"query": "Find a screenshot that is relevant to the user's question."},
+        adapted_from=["VidoreDocVQARetrieval"],
     )
 
     def load_data(self) -> None:

@@ -480,6 +483,7 @@ class VidoreSyntheticDocQAHealthcareIndustryRetrieval(AbsTaskRetrieval):
         }
         """,
         prompt={"query": "Find a screenshot that is relevant to the user's question."},
+        adapted_from=["VidoreDocVQARetrieval"],
     )
 
     def load_data(self) -> None: