mteb 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. mteb/__init__.py +4 -0
  2. mteb/models/__init__.py +4 -1
  3. mteb/models/cache_wrappers/__init__.py +2 -1
  4. mteb/models/model_implementations/eagerworks_models.py +163 -0
  5. mteb/models/model_implementations/google_models.py +1 -1
  6. mteb/models/model_implementations/nb_sbert.py +1 -1
  7. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +2 -2
  8. mteb/models/model_implementations/nvidia_models.py +1 -1
  9. mteb/models/model_implementations/ops_moa_models.py +2 -2
  10. mteb/models/model_implementations/promptriever_models.py +4 -4
  11. mteb/models/model_implementations/qwen3_models.py +3 -3
  12. mteb/models/model_implementations/qzhou_models.py +1 -1
  13. mteb/models/model_implementations/random_baseline.py +8 -18
  14. mteb/models/search_encoder_index/__init__.py +7 -0
  15. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  16. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  17. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  18. mteb/models/search_wrappers.py +157 -41
  19. mteb/similarity_functions.py +49 -0
  20. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  21. {mteb-2.2.1.dist-info → mteb-2.3.0.dist-info}/METADATA +3 -1
  22. {mteb-2.2.1.dist-info → mteb-2.3.0.dist-info}/RECORD +26 -21
  23. {mteb-2.2.1.dist-info → mteb-2.3.0.dist-info}/WHEEL +0 -0
  24. {mteb-2.2.1.dist-info → mteb-2.3.0.dist-info}/entry_points.txt +0 -0
  25. {mteb-2.2.1.dist-info → mteb-2.3.0.dist-info}/licenses/LICENSE +0 -0
  26. {mteb-2.2.1.dist-info → mteb-2.3.0.dist-info}/top_level.txt +0 -0
mteb/__init__.py CHANGED
@@ -9,8 +9,10 @@ from mteb.filter_tasks import filter_tasks
  from mteb.get_tasks import get_task, get_tasks
  from mteb.load_results import load_results
  from mteb.models import (
+     CacheBackendProtocol,
      CrossEncoderProtocol,
      EncoderProtocol,
+     IndexEncoderSearchProtocol,
      SearchProtocol,
      SentenceTransformerEncoderWrapper,
  )
@@ -27,8 +29,10 @@ __all__ = [
      "AbsTask",
      "Benchmark",
      "BenchmarkResults",
+     "CacheBackendProtocol",
      "CrossEncoderProtocol",
      "EncoderProtocol",
+     "IndexEncoderSearchProtocol",
      "SearchProtocol",
      "SentenceTransformerEncoderWrapper",
      "TaskMetadata",
mteb/models/__init__.py CHANGED
@@ -1,4 +1,4 @@
- from .cache_wrappers import CachedEmbeddingWrapper
+ from .cache_wrappers import CacheBackendProtocol, CachedEmbeddingWrapper
  from .model_meta import ModelMeta
  from .models_protocols import (
      CrossEncoderProtocol,
@@ -6,6 +6,7 @@ from .models_protocols import (
      MTEBModels,
      SearchProtocol,
  )
+ from .search_encoder_index.search_backend_protocol import IndexEncoderSearchProtocol
  from .search_wrappers import SearchCrossEncoderWrapper, SearchEncoderWrapper
  from .sentence_transformer_wrapper import (
      CrossEncoderWrapper,
@@ -14,10 +15,12 @@ from .sentence_transformer_wrapper import (
  )

  __all__ = [
+     "CacheBackendProtocol",
      "CachedEmbeddingWrapper",
      "CrossEncoderProtocol",
      "CrossEncoderWrapper",
      "EncoderProtocol",
+     "IndexEncoderSearchProtocol",
      "MTEBModels",
      "ModelMeta",
      "SearchCrossEncoderWrapper",
mteb/models/cache_wrappers/__init__.py CHANGED
@@ -1,3 +1,4 @@
+ from .cache_backend_protocol import CacheBackendProtocol
  from .cache_wrapper import CachedEmbeddingWrapper

- __all__ = ["CachedEmbeddingWrapper"]
+ __all__ = ["CacheBackendProtocol", "CachedEmbeddingWrapper"]
mteb/models/model_implementations/eagerworks_models.py ADDED
@@ -0,0 +1,163 @@
+ from typing import Any
+
+ import torch
+ from torch.utils.data import DataLoader
+ from tqdm.auto import tqdm
+
+ from mteb._requires_package import (
+     requires_image_dependencies,
+     requires_package,
+ )
+ from mteb.abstasks.task_metadata import TaskMetadata
+ from mteb.models.abs_encoder import AbsEncoder
+ from mteb.models.model_meta import ModelMeta, ScoringFunction
+ from mteb.types import Array, BatchedInput, PromptType
+
+
+ class EagerEmbedV1Wrapper(AbsEncoder):
+     """Wrapper for EagerEmbed single-vector embedding models."""
+
+     def __init__(
+         self,
+         model_name: str,
+         revision: str | None = None,
+         device: str | None = None,
+         image_size: int = 784,
+         **kwargs,
+     ):
+         requires_image_dependencies()
+         requires_package(
+             self, "qwen_vl_utils", model_name, "pip install mteb[eager_embed]"
+         )
+         from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
+
+         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+         self.image_size = image_size
+
+         # Load model
+         self.mdl = Qwen3VLForConditionalGeneration.from_pretrained(model_name, **kwargs)
+         self.mdl = self.mdl.to(self.device)
+         self.mdl.eval()
+
+         # Load processor
+         self.processor = AutoProcessor.from_pretrained(model_name)
+
+     def get_embedding(self, last_hidden_state: torch.Tensor) -> torch.Tensor:
+         """Extract embeddings from last token of last hidden state."""
+         reps = last_hidden_state[:, -1]
+         return reps
+
+     def encode(
+         self,
+         inputs: DataLoader[BatchedInput],
+         *,
+         task_metadata: TaskMetadata,
+         hf_split: str,
+         hf_subset: str,
+         prompt_type: PromptType | None = None,
+         **kwargs: Any,
+     ) -> Array:
+         """Encode inputs (text and/or images) into embeddings."""
+         from qwen_vl_utils import process_vision_info
+
+         all_embeddings: list[torch.Tensor] = []
+
+         with torch.no_grad():
+             for batch in tqdm(inputs, desc="Encoding"):
+                 batch_texts = batch.get("text", [])
+                 batch_images = batch.get("image", [])
+
+                 messages = []
+                 for i in range(max(len(batch_texts), len(batch_images))):
+                     text_content = batch_texts[i] if batch_texts else ""
+                     image_content = batch_images[i] if batch_images else None
+
+                     query_prefix = "Query: " if prompt_type == PromptType.query else ""
+                     content = [
+                         {"type": "text", "text": f"{query_prefix}{text_content}"}
+                     ]
+
+                     if image_content is not None:
+                         content.append(
+                             {
+                                 "type": "image",
+                                 "image": image_content,
+                                 "resized_height": self.image_size,
+                                 "resized_width": self.image_size,
+                             }
+                         )
+
+                     messages.append([{"role": "user", "content": content}])
+
+                 # Prepare inputs
+                 texts = [
+                     self.processor.apply_chat_template(
+                         msg, tokenize=False, add_generation_prompt=False
+                     )
+                     + "<|endoftext|>"
+                     for msg in messages
+                 ]
+
+                 image_inputs = None
+                 video_inputs = None
+                 if batch_images:
+                     image_inputs, video_inputs = process_vision_info(messages)
+
+                 model_inputs = self.processor(
+                     text=texts,
+                     images=image_inputs,
+                     videos=video_inputs,
+                     padding="longest",
+                     return_tensors="pt",
+                 ).to(self.device)
+
+                 # Get embeddings
+                 output = self.mdl(
+                     **model_inputs, return_dict=True, output_hidden_states=True
+                 )
+                 embeddings = self.get_embedding(output.hidden_states[-1])
+                 embeddings = embeddings.cpu().to(torch.float32)
+                 embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1)
+
+                 all_embeddings.append(embeddings)
+
+         return torch.cat(all_embeddings, dim=0)
+
+
+ EAGER_EMBED_V1_CITATION = """@article{EagerEmbed,
+     title={Eager Embed V1: Multimodal Dense Embeddings for Retrieval},
+     author={Juan Pablo Balarini},
+     year={2025},
+     publisher={Eagerworks},
+     url={https://github.com/eagerworks/eager-embed},
+ }"""
+
+ EAGER_EMBED_V1_TRAINING_DATASETS = {"colpali", "bge-ir", "pixmo-docs", "wiki-ss"}
+
+ Eager_Embed_V1 = ModelMeta(
+     loader=EagerEmbedV1Wrapper,
+     loader_kwargs=dict(
+         dtype=torch.float16,
+         image_size=784,
+     ),
+     name="eagerworks/eager-embed-v1",
+     languages=["fra-Latn", "spa-Latn", "eng-Latn", "deu-Latn"],
+     revision="a6bec272729c5056e2c26618ce085205c82a3b3c",
+     release_date="2025-11-20",
+     modalities=["image", "text"],
+     n_parameters=4_000_000_000,
+     memory_usage_mb=16929,
+     max_tokens=262144,
+     embed_dim=2560,
+     license="apache-2.0",
+     open_weights=True,
+     framework=["Tevatron"],
+     reference="https://huggingface.co/eagerworks/eager-embed-v1",
+     similarity_fn_name=ScoringFunction.COSINE,
+     use_instructions=True,
+     training_datasets=EAGER_EMBED_V1_TRAINING_DATASETS,
+     citation=EAGER_EMBED_V1_CITATION,
+     adapted_from="https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct",
+     public_training_code="https://github.com/eagerworks/eager-embed",
+     public_training_data="https://github.com/eagerworks/eager-embed/blob/main/dataset_config.yaml",
+ )
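A hedged usage sketch for the new entry: assuming `mteb.get_model` resolves the registry name above and the `eager-embed` extra (plus a CUDA device with ~17 GB free, per `memory_usage_mb`) is available, the model loads like any other registered model:

```python
# Sketch, not release code: requires `pip install mteb[eager-embed]`.
import mteb

model = mteb.get_model("eagerworks/eager-embed-v1")  # instantiates EagerEmbedV1Wrapper
```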
@@ -275,5 +275,5 @@ embedding_gemma_300m = ModelMeta(
275
275
  public_training_data=None,
276
276
  training_datasets=GECKO_TRAINING_DATA,
277
277
  similarity_fn_name="cosine",
278
- memory_usage_mb=578,
278
+ memory_usage_mb=1155,
279
279
  )
mteb/models/model_implementations/nb_sbert.py CHANGED
@@ -11,7 +11,7 @@ nb_sbert = ModelMeta(
      revision="b95656350a076aeafd2d23763660f80655408cc6",
      release_date="2022-11-23",
      n_parameters=1_780_000_000,
-     memory_usage_mb=197,
+     memory_usage_mb=678,
      embed_dim=4096,
      license="apache-2.0",
      max_tokens=75,
mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py CHANGED
@@ -146,7 +146,7 @@ llama_nemoretriever_colembed_1b_v1 = ModelMeta(
      release_date="2025-06-27",
      modalities=["image", "text"],
      n_parameters=2_418_000_000,
-     memory_usage_mb=9224,
+     memory_usage_mb=4610,
      max_tokens=8192,
      embed_dim=2048,
      license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
@@ -172,7 +172,7 @@ llama_nemoretriever_colembed_3b_v1 = ModelMeta(
      release_date="2025-06-27",
      modalities=["image", "text"],
      n_parameters=4_407_000_000,
-     memory_usage_mb=16811,
+     memory_usage_mb=8403,
      max_tokens=8192,
      embed_dim=3072,
      license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
mteb/models/model_implementations/nvidia_models.py CHANGED
@@ -146,7 +146,7 @@ NV_embed_v1 = ModelMeta(
      revision="570834afd5fef5bf3a3c2311a2b6e0a66f6f4f2c",
      release_date="2024-09-13",  # initial commit of hf model.
      n_parameters=7_850_000_000,
-     memory_usage_mb=29945,
+     memory_usage_mb=14975,
      embed_dim=4096,
      license="cc-by-nc-4.0",
      max_tokens=32768,
mteb/models/model_implementations/ops_moa_models.py CHANGED
@@ -27,7 +27,7 @@ ops_moa_conan_embedding = ModelMeta(
      languages=["zho-Hans"],
      loader=OPSWrapper,
      n_parameters=int(343 * 1e6),
-     memory_usage_mb=2e3,
+     memory_usage_mb=1308,
      max_tokens=512,
      embed_dim=1536,
      license="cc-by-nc-4.0",
@@ -58,7 +58,7 @@ ops_moa_yuan_embedding = ModelMeta(
      languages=["zho-Hans"],
      loader=OPSWrapper,
      n_parameters=int(343 * 1e6),
-     memory_usage_mb=2e3,
+     memory_usage_mb=1242,
      max_tokens=512,
      embed_dim=1536,
      license="cc-by-nc-4.0",
mteb/models/model_implementations/promptriever_models.py CHANGED
@@ -80,7 +80,7 @@ promptriever_llama2 = ModelMeta(
      revision="01c7f73d771dfac7d292323805ebc428287df4f9-30b14e3813c0fa45facfd01a594580c3fe5ecf23",  # base-peft revision
      release_date="2024-09-15",
      n_parameters=7_000_000_000,
-     memory_usage_mb=27,
+     memory_usage_mb=26703,
      max_tokens=4096,
      embed_dim=4096,
      license="apache-2.0",
@@ -115,7 +115,7 @@ promptriever_llama3 = ModelMeta(
      },
      release_date="2024-09-15",
      n_parameters=8_000_000_000,
-     memory_usage_mb=31,
+     memory_usage_mb=30518,
      max_tokens=8192,
      embed_dim=4096,
      license="apache-2.0",
@@ -143,7 +143,7 @@ promptriever_llama3_instruct = ModelMeta(
      revision="5206a32e0bd3067aef1ce90f5528ade7d866253f-8b677258615625122c2eb7329292b8c402612c21",  # base-peft revision
      release_date="2024-09-15",
      n_parameters=8_000_000_000,
-     memory_usage_mb=31,
+     memory_usage_mb=30518,
      max_tokens=8192,
      embed_dim=4096,
      training_datasets={
@@ -175,7 +175,7 @@ promptriever_mistral_v1 = ModelMeta(
      revision="7231864981174d9bee8c7687c24c8344414eae6b-876d63e49b6115ecb6839893a56298fadee7e8f5",  # base-peft revision
      release_date="2024-09-15",
      n_parameters=7_000_000_000,
-     memory_usage_mb=27,
+     memory_usage_mb=26703,
      training_datasets={
          # "samaya-ai/msmarco-w-instructions",
          "mMARCO-NL",  # translation not trained on
mteb/models/model_implementations/qwen3_models.py CHANGED
@@ -139,7 +139,7 @@ Qwen3_Embedding_0B6 = ModelMeta(
      revision="b22da495047858cce924d27d76261e96be6febc0",  # Commit of @tomaarsen
      release_date="2025-06-05",
      n_parameters=595776512,
-     memory_usage_mb=2272,
+     memory_usage_mb=1136,
      embed_dim=1024,
      max_tokens=32768,
      license="apache-2.0",
@@ -161,7 +161,7 @@ Qwen3_Embedding_4B = ModelMeta(
      revision="636cd9bf47d976946cdbb2b0c3ca0cb2f8eea5ff",  # Commit of @tomaarsen
      release_date="2025-06-05",
      n_parameters=4021774336,
-     memory_usage_mb=15341,
+     memory_usage_mb=7671,
      embed_dim=2560,
      max_tokens=32768,
      license="apache-2.0",
@@ -183,7 +183,7 @@ Qwen3_Embedding_8B = ModelMeta(
      revision="4e423935c619ae4df87b646a3ce949610c66241c",  # Commit of @tomaarsen
      release_date="2025-06-05",
      n_parameters=7567295488,
-     memory_usage_mb=28866,
+     memory_usage_mb=14433,
      embed_dim=4096,
      max_tokens=32768,
      license="apache-2.0",
mteb/models/model_implementations/qzhou_models.py CHANGED
@@ -63,7 +63,7 @@ QZhou_Embedding = ModelMeta(
      revision="f1e6c03ee3882e7b9fa5cec91217715272e433b8",
      release_date="2025-08-24",
      n_parameters=7_070_619_136,
-     memory_usage_mb=29070,
+     memory_usage_mb=14436,
      embed_dim=3584,
      license="apache-2.0",
      max_tokens=8192,
mteb/models/model_implementations/random_baseline.py CHANGED
@@ -8,6 +8,10 @@ from torch.utils.data import DataLoader

  from mteb.abstasks.task_metadata import TaskMetadata
  from mteb.models.model_meta import ModelMeta
+ from mteb.similarity_functions import (
+     select_pairwise_similarity,
+     select_similarity,
+ )
  from mteb.types._encoder_io import Array, BatchedInput, PromptType


@@ -155,15 +159,9 @@ class RandomEncoderBaseline:
          Returns:
              Cosine similarity matrix between the two sets of embeddings
          """
-         norm1 = np.linalg.norm(
-             embeddings1.reshape(-1, self.embedding_dim), axis=1, keepdims=True
-         )
-         norm2 = np.linalg.norm(
-             embeddings2.reshape(-1, self.embedding_dim), axis=1, keepdims=True
+         return select_similarity(
+             embeddings1, embeddings2, self.mteb_model_meta.similarity_fn_name
          )
-         normalized1 = embeddings1 / (norm1 + 1e-10)
-         normalized2 = embeddings2 / (norm2 + 1e-10)
-         return np.dot(normalized1, normalized2.T)

      def similarity_pairwise(
          self,
@@ -179,17 +177,9 @@ class RandomEncoderBaseline:
          Returns:
              Cosine similarity for each pair of embeddings
          """
-         norm1 = np.linalg.norm(
-             embeddings1.reshape(-1, self.embedding_dim), axis=1, keepdims=True
-         )
-         norm2 = np.linalg.norm(
-             embeddings2.reshape(-1, self.embedding_dim), axis=1, keepdims=True
+         return select_pairwise_similarity(
+             embeddings1, embeddings2, self.mteb_model_meta.similarity_fn_name
          )
-         normalized1 = embeddings1 / (norm1 + 1e-10)
-         normalized2 = embeddings2 / (norm2 + 1e-10)
-         normalized1 = np.asarray(normalized1)
-         normalized2 = np.asarray(normalized2)
-         return np.sum(normalized1 * normalized2, axis=1)


  random_encoder_baseline = ModelMeta(
mteb/models/search_encoder_index/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .search_backend_protocol import IndexEncoderSearchProtocol
+ from .search_indexes import FaissSearchIndex
+
+ __all__ = [
+     "FaissSearchIndex",
+     "IndexEncoderSearchProtocol",
+ ]
mteb/models/search_encoder_index/search_backend_protocol.py ADDED
@@ -0,0 +1,50 @@
+ from collections.abc import Callable
+ from typing import Protocol
+
+ from mteb.types import Array, TopRankedDocumentsType
+
+
+ class IndexEncoderSearchProtocol(Protocol):
+     """Protocol for search backends used in encoder-based retrieval."""
+
+     def add_documents(
+         self,
+         embeddings: Array,
+         idxs: list[str],
+     ) -> None:
+         """Add documents to the search backend.
+
+         Args:
+             embeddings: Embeddings of the documents to add.
+             idxs: IDs of the documents to add.
+         """
+
+     def search(
+         self,
+         embeddings: Array,
+         top_k: int,
+         similarity_fn: Callable[[Array, Array], Array],
+         top_ranked: TopRankedDocumentsType | None = None,
+         query_idx_to_id: dict[int, str] | None = None,
+     ) -> tuple[list[list[float]], list[list[int]]]:
+         """Search through added corpus embeddings or rerank top-ranked documents.
+
+         Supports both full-corpus and reranking search modes:
+         - Full-corpus mode: `top_ranked=None`, uses added corpus embeddings.
+         - Reranking mode: `top_ranked` contains mapping {query_id: [doc_ids]}.
+
+         Args:
+             embeddings: Query embeddings, shape (num_queries, dim).
+             top_k: Number of top results to return.
+             similarity_fn: Function to compute similarity between query and corpus.
+             top_ranked: Mapping of query_id -> list of candidate doc_ids. Used for reranking.
+             query_idx_to_id: Mapping of query index -> query_id. Used for reranking.
+
+         Returns:
+             A tuple (top_k_values, top_k_indices), for each query:
+             - top_k_values: List of top-k similarity scores.
+             - top_k_indices: List of indices of the top-k documents in the added corpus.
+         """
+
+     def clear(self) -> None:
+         """Clear all stored documents and embeddings from the backend."""
mteb/models/search_encoder_index/search_indexes/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .faiss_search_index import FaissSearchIndex
+
+ __all__ = [
+     "FaissSearchIndex",
+ ]
mteb/models/search_encoder_index/search_indexes/faiss_search_index.py ADDED
@@ -0,0 +1,157 @@
+ import logging
+ from collections.abc import Callable
+
+ import numpy as np
+ import torch
+
+ from mteb._requires_package import requires_package
+ from mteb.models.model_meta import ScoringFunction
+ from mteb.models.models_protocols import EncoderProtocol
+ from mteb.types import Array, TopRankedDocumentsType
+
+ logger = logging.getLogger(__name__)
+
+
+ class FaissSearchIndex:
+     """FAISS-based backend for encoder-based search.
+
+     Supports both full-corpus retrieval and reranking (via `top_ranked`).
+
+     Notes:
+         - Stores *all* embeddings in memory (IndexFlatIP or IndexFlatL2).
+         - Expects embeddings to be normalized if cosine similarity is desired.
+     """
+
+     _normalize: bool = False
+
+     def __init__(self, model: EncoderProtocol) -> None:
+         requires_package(
+             self,
+             "faiss",
+             "FAISS-based search",
+             install_instruction="pip install mteb[faiss-cpu]",
+         )
+
+         import faiss
+         from faiss import IndexFlatIP, IndexFlatL2
+
+         # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
+         if model.mteb_model_meta.similarity_fn_name is ScoringFunction.DOT_PRODUCT:
+             self.index_type = IndexFlatIP
+         elif model.mteb_model_meta.similarity_fn_name is ScoringFunction.COSINE:
+             self.index_type = IndexFlatIP
+             self._normalize = True
+         elif model.mteb_model_meta.similarity_fn_name is ScoringFunction.EUCLIDEAN:
+             self.index_type = IndexFlatL2
+         else:
+             raise ValueError(
+                 f"FAISS backend does not support similarity function {model.mteb_model_meta.similarity_fn_name}. "
+                 f"Available: {ScoringFunction.DOT_PRODUCT}, {ScoringFunction.COSINE}."
+             )
+
+         self.idxs: list[str] = []
+         self.index: faiss.Index | None = None
+
+     def add_documents(self, embeddings: Array, idxs: list[str]) -> None:
+         """Add all document embeddings and their IDs to FAISS index."""
+         import faiss
+
+         if isinstance(embeddings, torch.Tensor):
+             embeddings = embeddings.detach().cpu().numpy()
+
+         embeddings = embeddings.astype(np.float32)
+         self.idxs.extend(idxs)
+
+         if self._normalize:
+             faiss.normalize_L2(embeddings)
+
+         dim = embeddings.shape[1]
+         if self.index is None:
+             self.index = self.index_type(dim)
+
+         self.index.add(embeddings)
+         logger.info(f"FAISS index built with {len(idxs)} vectors of dim {dim}.")
+
+     def search(
+         self,
+         embeddings: Array,
+         top_k: int,
+         similarity_fn: Callable[[Array, Array], Array],
+         top_ranked: TopRankedDocumentsType | None = None,
+         query_idx_to_id: dict[int, str] | None = None,
+     ) -> tuple[list[list[float]], list[list[int]]]:
+         """Search using FAISS."""
+         import faiss
+
+         if self.index is None:
+             raise ValueError("No index built. Call add_document() first.")
+
+         if isinstance(embeddings, torch.Tensor):
+             embeddings = embeddings.detach().cpu().numpy()
+
+         if self._normalize:
+             faiss.normalize_L2(embeddings)
+
+         if top_ranked is not None:
+             if query_idx_to_id is None:
+                 raise ValueError("query_idx_to_id must be provided when reranking.")
+
+             similarities, ids = self._reranking(
+                 embeddings,
+                 top_k,
+                 top_ranked=top_ranked,
+                 query_idx_to_id=query_idx_to_id,
+             )
+         else:
+             similarities, ids = self.index.search(embeddings.astype(np.float32), top_k)
+             similarities = similarities.tolist()
+             ids = ids.tolist()
+
+         if issubclass(self.index_type, faiss.IndexFlatL2):
+             similarities = -np.sqrt(np.maximum(similarities, 0))
+
+         return similarities, ids
+
+     def _reranking(
+         self,
+         embeddings: Array,
+         top_k: int,
+         top_ranked: TopRankedDocumentsType | None = None,
+         query_idx_to_id: dict[int, str] | None = None,
+     ) -> tuple[list[list[float]], list[list[int]]]:
+         doc_id_to_idx = {doc_id: i for i, doc_id in enumerate(self.idxs)}
+         scores_all: list[list[float]] = []
+         idxs_all: list[list[int]] = []
+
+         for query_idx, query_emb in enumerate(embeddings):
+             query_id = query_idx_to_id[query_idx]
+             ranked_ids = top_ranked.get(query_id)
+             if not ranked_ids:
+                 logger.warning(f"No top-ranked documents for query {query_id}")
+                 scores_all.append([])
+                 idxs_all.append([])
+                 continue
+
+             candidate_indices = [doc_id_to_idx[doc_id] for doc_id in ranked_ids]
+             d = self.index.d
+             candidate_embs = np.vstack(
+                 [self.index.reconstruct(idx) for idx in candidate_indices]
+             )
+             sub_reranking_index = self.index_type(d)
+             sub_reranking_index.add(candidate_embs)
+
+             # Search returns scores and indices in one call
+             scores, local_indices = sub_reranking_index.search(
+                 query_emb.reshape(1, -1).astype(np.float32),
+                 min(top_k, len(candidate_indices)),
+             )
+             # faiss will output 2d arrays even for single query
+             scores_all.append(scores[0].tolist())
+             idxs_all.append(local_indices[0].tolist())
+
+         return scores_all, idxs_all
+
+     def clear(self) -> None:
+         """Clear all stored documents and embeddings from the backend."""
+         self.index = None
+         self.idxs = []
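A quick exercise of the class above with toy data (sketch; assumes `pip install mteb[faiss-cpu]`). The stub model only needs to expose `mteb_model_meta.similarity_fn_name`, which is the single attribute the constructor reads:

```python
# Sketch: driving FaissSearchIndex directly with random vectors.
from types import SimpleNamespace

import numpy as np

from mteb.models.model_meta import ScoringFunction
from mteb.models.search_encoder_index import FaissSearchIndex

stub = SimpleNamespace(
    mteb_model_meta=SimpleNamespace(similarity_fn_name=ScoringFunction.COSINE)
)
index = FaissSearchIndex(stub)

rng = np.random.default_rng(0)
docs = rng.random((100, 32), dtype=np.float32)
index.add_documents(docs, idxs=[f"doc-{i}" for i in range(100)])

# similarity_fn is part of the protocol signature but goes unused on the
# FAISS full-corpus path, so a placeholder is safe here.
queries = rng.random((4, 32), dtype=np.float32)
scores, ids = index.search(queries, top_k=5, similarity_fn=None)
index.clear()
```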
mteb/models/search_wrappers.py CHANGED
@@ -21,6 +21,7 @@ from mteb.types import (
  )

  from .models_protocols import CrossEncoderProtocol, EncoderProtocol
+ from .search_encoder_index.search_backend_protocol import IndexEncoderSearchProtocol

  logger = logging.getLogger(__name__)

@@ -28,13 +29,19 @@ logger = logging.getLogger(__name__)
  class SearchEncoderWrapper:
      """Wrapper for Encoder models to be used in search tasks."""

-     corpus_chunk_size = 50_000
      task_corpus: CorpusDatasetType | None

-     def __init__(self, model: EncoderProtocol):
+     def __init__(
+         self,
+         model: EncoderProtocol,
+         corpus_chunk_size: int = 50_000,
+         index_backend: IndexEncoderSearchProtocol | None = None,
+     ) -> None:
          self.model = model
          self.task_corpus = None
          self.mteb_model_meta = model.mteb_model_meta
+         self.corpus_chunk_size = corpus_chunk_size
+         self.index_backend = index_backend


      def index(
@@ -56,6 +63,22 @@ class SearchEncoderWrapper:
          """
          # Always retain corpus for potential reranking or fallback flows
          self.task_corpus = corpus
+         if self.index_backend is not None:
+             all_doc_embeddings = self.model.encode(
+                 create_dataloader(
+                     corpus,
+                     task_metadata,
+                     prompt_type=PromptType.document,
+                     **encode_kwargs,
+                 ),
+                 task_metadata=task_metadata,
+                 hf_split=hf_split,
+                 hf_subset=hf_subset,
+                 prompt_type=PromptType.document,
+                 **encode_kwargs,
+             )
+
+             self.index_backend.add_documents(all_doc_embeddings, corpus["id"])

      def search(
          self,
@@ -105,27 +128,74 @@ class SearchEncoderWrapper:

          if top_ranked is not None:
              logger.info("Reranking pre-ranked documents...")
-             result_heaps = self._rerank_documents(
-                 query_idx_to_id=query_idx_to_id,
-                 query_embeddings=query_embeddings,
-                 top_ranked=top_ranked,
-                 top_k=top_k,
-                 task_metadata=task_metadata,
-                 hf_subset=hf_subset,
-                 hf_split=hf_split,
-                 encode_kwargs=encode_kwargs,
-             )
+             if self.index_backend is None:
+                 result_heaps = self._rerank_documents(
+                     query_idx_to_id=query_idx_to_id,
+                     query_embeddings=query_embeddings,
+                     top_ranked=top_ranked,
+                     top_k=top_k,
+                     task_metadata=task_metadata,
+                     hf_subset=hf_subset,
+                     hf_split=hf_split,
+                     encode_kwargs=encode_kwargs,
+                 )
+             else:
+                 cos_scores_top_k_values, cos_scores_top_k_idx = (
+                     self.index_backend.search(
+                         query_embeddings,
+                         top_k,
+                         similarity_fn=self.model.similarity,
+                         top_ranked=top_ranked,
+                         query_idx_to_id=query_idx_to_id,
+                     )
+                 )
+                 result_heaps = {qid: [] for qid in query_idx_to_id.values()}
+                 for query_itr in range(len(query_embeddings)):
+                     result_heaps = self._rerank_sort_results(
+                         result_heaps=result_heaps,
+                         query_id=query_idx_to_id[query_itr],
+                         ranked_ids=top_ranked[query_idx_to_id[query_itr]],
+                         scores_top_k_idx=torch.tensor(
+                             [cos_scores_top_k_idx[query_itr]]
+                         ),
+                         scores_top_k_values=torch.tensor(
+                             [cos_scores_top_k_values[query_itr]]
+                         ),
+                     )
+                 self.index_backend.clear()
          else:
              logger.info("Performing full corpus search...")
-             result_heaps = self._full_corpus_search(
-                 query_idx_to_id=query_idx_to_id,
-                 query_embeddings=query_embeddings,
-                 task_metadata=task_metadata,
-                 hf_subset=hf_subset,
-                 hf_split=hf_split,
-                 top_k=top_k,
-                 encode_kwargs=encode_kwargs,
-             )
+             if self.index_backend is None:
+                 result_heaps = self._full_corpus_search(
+                     query_idx_to_id=query_idx_to_id,
+                     query_embeddings=query_embeddings,
+                     task_metadata=task_metadata,
+                     hf_subset=hf_subset,
+                     hf_split=hf_split,
+                     top_k=top_k,
+                     encode_kwargs=encode_kwargs,
+                 )
+             else:
+                 cos_scores_top_k_values, cos_scores_top_k_idx = (
+                     self.index_backend.search(
+                         query_embeddings,
+                         top_k,
+                         similarity_fn=self.model.similarity,
+                         top_ranked=None,
+                         query_idx_to_id=None,
+                     )
+                 )
+                 result_heaps = {qid: [] for qid in query_idx_to_id.values()}
+                 result_heaps = self._sort_full_corpus_results(
+                     result_heaps=result_heaps,
+                     query_idx_to_id=query_idx_to_id,
+                     query_embeddings=query_embeddings,
+                     cos_scores_top_k_idx=cos_scores_top_k_idx,
+                     cos_scores_top_k_values=cos_scores_top_k_values,
+                     sub_corpus_ids=self.task_corpus["id"],
+                     top_k=top_k,
+                 )
+                 self.index_backend.clear()

          # Reset the task corpus dataloader to None to free up memory
          self.task_corpus = None
@@ -192,19 +262,45 @@ class SearchEncoderWrapper:
          cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()

          sub_corpus_ids = list(sub_corpus_ids)
-         for query_itr in range(len(query_embeddings)):
-             query_id = query_idx_to_id[query_itr]
-             for sub_corpus_id, score in zip(
-                 cos_scores_top_k_idx[query_itr],
-                 cos_scores_top_k_values[query_itr],
-             ):
-                 corpus_id = sub_corpus_ids[sub_corpus_id]
-                 if len(result_heaps[query_id]) < top_k:
-                     # push item on the heap
-                     heapq.heappush(result_heaps[query_id], (score, corpus_id))
-                 else:
-                     # If item is larger than the smallest in the heap, push it on the heap then pop the smallest element
-                     heapq.heappushpop(result_heaps[query_id], (score, corpus_id))
+         result_heaps = self._sort_full_corpus_results(
+             result_heaps=result_heaps,
+             query_idx_to_id=query_idx_to_id,
+             query_embeddings=query_embeddings,
+             cos_scores_top_k_idx=cos_scores_top_k_idx,
+             cos_scores_top_k_values=cos_scores_top_k_values,
+             sub_corpus_ids=sub_corpus_ids,
+             top_k=top_k,
+         )
+         return result_heaps
+
+     def _sort_full_corpus_results(
+         self,
+         result_heaps: dict[str, list[tuple[float, str]]],
+         query_idx_to_id: dict[int, str],
+         query_embeddings: Array,
+         cos_scores_top_k_idx: list[list[int]],
+         cos_scores_top_k_values: list[list[float]],
+         sub_corpus_ids: list[str],
+         top_k: int,
+     ) -> dict[str, list[tuple[float, str]]]:
+         """Sort the heaps into descending order lists.
+
+         Returns:
+             A dictionary mapping query IDs to a sorted list of tuples, each containing a relevance score and a document ID.
+         """
+         for query_itr in range(len(query_embeddings)):
+             query_id = query_idx_to_id[query_itr]
+             for sub_corpus_id, score in zip(
+                 cos_scores_top_k_idx[query_itr],
+                 cos_scores_top_k_values[query_itr],
+             ):
+                 corpus_id = sub_corpus_ids[sub_corpus_id]
+                 if len(result_heaps[query_id]) < top_k:
+                     # push item on the heap
+                     heapq.heappush(result_heaps[query_id], (score, corpus_id))
+                 else:
+                     # If item is larger than the smallest in the heap, push it on the heap then pop the smallest element
+                     heapq.heappushpop(result_heaps[query_id], (score, corpus_id))
          return result_heaps

      def _rerank_documents(
@@ -279,14 +375,34 @@ class SearchEncoderWrapper:
          scores_top_k_values = scores_top_k_values.cpu()
          scores_top_k_idx = scores_top_k_idx.cpu()

-         # Build result heap
-         for doc_idx, score in zip(
-             scores_top_k_idx[0].tolist(),
-             scores_top_k_values[0].tolist(),
-         ):
-             corpus_id = ranked_ids[doc_idx]
-             heapq.heappush(result_heaps[query_id], (score, corpus_id))
+         result_heaps = self._rerank_sort_results(
+             result_heaps=result_heaps,
+             query_id=query_id,
+             ranked_ids=ranked_ids,
+             scores_top_k_idx=scores_top_k_idx,
+             scores_top_k_values=scores_top_k_values,
+         )
+         return result_heaps
+
+     def _rerank_sort_results(
+         self,
+         result_heaps: list[tuple[float, str]],
+         query_id: str,
+         ranked_ids: list[str],
+         scores_top_k_idx: torch.Tensor,
+         scores_top_k_values: torch.Tensor,
+     ) -> list[tuple[float, str]]:
+         """Sort the heap into descending order list.

+         Returns:
+             A sorted list of tuples, each containing a relevance score and a document ID.
+         """
+         for doc_idx, score in zip(
+             scores_top_k_idx[0].tolist(),
+             scores_top_k_values[0].tolist(),
+         ):
+             corpus_id = ranked_ids[doc_idx]
+             heapq.heappush(result_heaps[query_id], (score, corpus_id))
          return result_heaps

      def encode(
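Putting the pieces together: the reworked constructor lets callers tune the chunk size (previously a fixed class attribute) and opt into an index backend, with `None` keeping the prior in-memory flow. A wiring sketch, assuming `mteb.get_model` resolves the registry name below:

```python
# Sketch: opting a wrapped encoder into the FAISS index backend.
import mteb
from mteb.models import SearchEncoderWrapper
from mteb.models.search_encoder_index import FaissSearchIndex

encoder = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")  # any EncoderProtocol
search_model = SearchEncoderWrapper(
    encoder,
    corpus_chunk_size=10_000,                 # was a hard-coded 50_000 before 2.3.0
    index_backend=FaissSearchIndex(encoder),  # default None keeps chunked in-memory search
)
```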
mteb/similarity_functions.py CHANGED
@@ -1,6 +1,7 @@
  import torch

  from mteb.models import EncoderProtocol
+ from mteb.models.model_meta import ScoringFunction
  from mteb.types import Array


@@ -38,6 +39,54 @@ def compute_pairwise_similarity(
      return pairwise_cos_sim(embedding1, embedding2)


+ def select_similarity(
+     embedding1: Array,
+     embedding2: Array,
+     similarity_fn: ScoringFunction,
+ ) -> Array:
+     """Compute similarity between two sets of embeddings using the specified similarity function.
+
+     Args:
+         embedding1: The first set of embeddings.
+         embedding2: The second set of embeddings.
+         similarity_fn: The similarity function to use (COSINE, DOT_PRODUCT, EUCLIDEAN).
+
+     Returns:
+         Array: The computed similarity scores.
+     """
+     if similarity_fn is ScoringFunction.COSINE:
+         return cos_sim(embedding1, embedding2)
+     elif similarity_fn is ScoringFunction.DOT_PRODUCT:
+         return dot_score(embedding1, embedding2)
+     elif similarity_fn is ScoringFunction.EUCLIDEAN:
+         return euclidean_sim(embedding1, embedding2)
+     raise ValueError(f"Unsupported similarity function: {similarity_fn}")
+
+
+ def select_pairwise_similarity(
+     embedding1: Array,
+     embedding2: Array,
+     similarity_fn: ScoringFunction,
+ ) -> Array:
+     """Compute pairwise similarity between two sets of embeddings using the specified similarity function.
+
+     Args:
+         embedding1: The first set of embeddings.
+         embedding2: The second set of embeddings.
+         similarity_fn: The similarity function to use (COSINE, DOT_PRODUCT, EUCLIDEAN).
+
+     Returns:
+         Array: The computed pairwise similarity scores.
+     """
+     if similarity_fn is ScoringFunction.COSINE:
+         return pairwise_cos_sim(embedding1, embedding2)
+     elif similarity_fn is ScoringFunction.DOT_PRODUCT:
+         return pairwise_dot_score(embedding1, embedding2)
+     elif similarity_fn is ScoringFunction.EUCLIDEAN:
+         return pairwise_euclidean_sim(embedding1, embedding2)
+     raise ValueError(f"Unsupported similarity function: {similarity_fn}")
+
+
  def _normalize_embeddings(embeddings: Array) -> torch.Tensor:
      """Normalizes the embeddings matrix, so that each sentence embedding has unit length.

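The new helpers dispatch on `ScoringFunction`, which is what `random_baseline.py` above now relies on. A small sketch:

```python
# Sketch: dispatching on ScoringFunction with the new helpers.
import torch

from mteb.models.model_meta import ScoringFunction
from mteb.similarity_functions import select_pairwise_similarity, select_similarity

a, b = torch.randn(4, 8), torch.randn(4, 8)
matrix = select_similarity(a, b, ScoringFunction.COSINE)               # full (4, 4) matrix
pairs = select_pairwise_similarity(a, b, ScoringFunction.DOT_PRODUCT)  # one score per row pair
```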
mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py CHANGED
@@ -55,6 +55,7 @@ def _load_data(
          },
          remove_columns=["corpus-id"],
      )
+     corpus_ds = corpus_ds.select_columns(["id", "image"])

      qrels_ds = load_dataset(
          path,
@@ -64,7 +65,7 @@ def _load_data(
      )

      if langs is None:
-         queries[split] = query_ds
+         queries[split] = query_ds.select_columns(["id", "text"])
          corpus[split] = corpus_ds
          relevant_docs[split] = {}
          for row in qrels_ds:
@@ -75,7 +76,8 @@ def _load_data(
              relevant_docs[split][qid][did] = int(row["score"])
      else:
          for lang in langs:
-             queries[lang][split] = query_ds.filter(lambda x: x["language"] == lang)
+             filtered_query_ds = query_ds.filter(lambda x: x["language"] == lang)
+             queries[lang][split] = filtered_query_ds.select_columns(["id", "text"])

              corpus[lang][split] = corpus_ds

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mteb
3
- Version: 2.2.1
3
+ Version: 2.3.0
4
4
  Summary: Massive Text Embedding Benchmark
5
5
  Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
6
6
  Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
@@ -100,6 +100,8 @@ Provides-Extra: llama-embed-nemotron
100
100
  Requires-Dist: transformers==4.51.0; extra == "llama-embed-nemotron"
101
101
  Provides-Extra: faiss-cpu
102
102
  Requires-Dist: faiss-cpu>=1.12.0; extra == "faiss-cpu"
103
+ Provides-Extra: eager-embed
104
+ Requires-Dist: qwen_vl_utils>=0.0.14; extra == "eager-embed"
103
105
  Dynamic: license-file
104
106
 
105
107
  <h1 align="center">
{mteb-2.2.1.dist-info → mteb-2.3.0.dist-info}/RECORD CHANGED
@@ -1,4 +1,4 @@
- mteb/__init__.py,sha256=bl3K7IHalVzP27SIQ3qdo52vy9aAuvMsIKOx0h5P6Sk,1256
+ mteb/__init__.py,sha256=h2kru--zMEC0mmLQ688kggdDpBH7dxYz1HhLVHbRjcI,1376
  mteb/__main__.py,sha256=KKWed4HW-OpfpJhCuKDNDPuAAIoppQY1g2gRuCdAmlw,34
  mteb/_create_dataloaders.py,sha256=9aUHM1q2q748XHax_YYcPBmckIOOVCrJ_N2bJYVbn3s,14338
  mteb/_helpful_enum.py,sha256=jh73N1jlcpg7RGz4bj8UpctiMNvqvHpp9wrB7SYEzIU,510
@@ -11,7 +11,7 @@ mteb/evaluate.py,sha256=nSRKXlQikwtd4qb2Ruckn756IAgTigDjXdl-W2nhI6M,17993
  mteb/filter_tasks.py,sha256=5XE1OYmgDDoJYnXwFf4ma_PIT_Lekzs420sQF_kpCiY,7240
  mteb/get_tasks.py,sha256=6Gc18a2bZoLQV1Ms_qdr2KieAqIXg8TDg4l7ZN8rW2I,14218
  mteb/load_results.py,sha256=Xw2ZX7BToU92WwUTQUQKPAgPhX7ucyRRdoCrxAoPHdI,6414
- mteb/similarity_functions.py,sha256=ZkBapSGDXKE5ipTG2FpeFnAC1iWwiVkrAidmKK_I4bI,8799
+ mteb/similarity_functions.py,sha256=ySSnrKl4cSKOWfyIKQPVTJtxuy2ZNfcv0COXDp22QlQ,10630
  mteb/_evaluators/__init__.py,sha256=Ag1_RWpxBGMpujzd3FZjI40gY_KQKIpY31tJPuk-hFg,1013
  mteb/_evaluators/_download.py,sha256=jntlcURbJxcxUjTmn2D9Tu6ZnWgDc9t5bY8p9CZCqv4,586
  mteb/_evaluators/any_sts_evaluator.py,sha256=f0V3NDP5Bfp8qEeBwP8E-Enj5F5NbFze-kGmzlkObQA,3762
@@ -1428,15 +1428,15 @@ mteb/leaderboard/benchmark_selector.py,sha256=hnXdo_Kj4UUAruFl6nZkCxAQ88IEfbaH8E
  mteb/leaderboard/figures.py,sha256=Rq20LFpaUhQD4tuKp7P7ExQtAjonMLibgO3ud0ykMag,7491
  mteb/leaderboard/table.py,sha256=qs0H_Gt9FzRvzb-AL0YlqEe0YAsdYsVX3QlncfCBEqg,7828
  mteb/leaderboard/text_segments.py,sha256=iMIkS04QQjPbT-SkU0x6fOcS8xRbUYevryu9HydipKM,6570
- mteb/models/__init__.py,sha256=ycGU-x60LT0OFyP4CYa5pQhM7J5hCimubuT56va9wfM,741
+ mteb/models/__init__.py,sha256=ABTuoqiBjBtBWW3LYY7ItBHdylR6jWoy06HH0g6j6fU,910
  mteb/models/abs_encoder.py,sha256=m0JkRfRPMYadDgBR9eozRloI31ZSWkSzDFINpwbfLZk,16533
  mteb/models/get_model_meta.py,sha256=VpZZNINk-QrNeVpPZnlqzlLhtBs8G84eRwTzAb_gRD4,9108
  mteb/models/instruct_wrapper.py,sha256=Ty4nfEvioycL_uATkhd0PGuyeB5Xc9xrRd6HOGgb-tc,9005
  mteb/models/model_meta.py,sha256=b-Nel9nX5bJk4cgJnqkBzEKyMY7uXvxlCBSxmmH1Ios,14769
  mteb/models/models_protocols.py,sha256=D2hYWn_UBGMaKtRwBx3u0B0ni6lHJjSzTxX21XFNwIc,8917
- mteb/models/search_wrappers.py,sha256=9PrS12afZInQKnmky2zdDrY_tVaC-Lwx__3zmoFIgn0,15475
+ mteb/models/search_wrappers.py,sha256=AcMhjQyKdeitUjnaqgnP3_zTeVSum8rz1sjBRddHUVQ,20328
  mteb/models/sentence_transformer_wrapper.py,sha256=n5CMsM6Lpg_CFHH0NkpJusMsaLUTt-L9vRmFINQ961k,12338
- mteb/models/cache_wrappers/__init__.py,sha256=j3JBHN73Tr7uMUO92FEvKXstnybxrPpGWmKXU2lAoIE,88
+ mteb/models/cache_wrappers/__init__.py,sha256=1w1TnMwulWJSzNkLXjbh5MY3sqgHWc6vUntYn49i9X8,169
  mteb/models/cache_wrappers/cache_backend_protocol.py,sha256=TR7kD7KbN1J4piszIecpegtLZYGy7sRHZt3SDWlImKk,1665
  mteb/models/cache_wrappers/cache_wrapper.py,sha256=KLDeOCe_ndQshbZa5ep2u3jovsl--tfpQzvt9EXyxCA,6589
  mteb/models/cache_wrappers/cache_backends/__init__.py,sha256=hN2Tq7cpTxoOYSCJ1Wnpvb8dEm-kQLfCCahT1N9Bacw,123
@@ -1471,13 +1471,14 @@ mteb/models/model_implementations/dino_models.py,sha256=QFgaFHR5YKrylqJGSljXCBn2
  mteb/models/model_implementations/e5_instruct.py,sha256=9R4GoSFicgqNDCh3HhTN_8L1qhzuEKvatjHYn3T9zlU,7676
  mteb/models/model_implementations/e5_models.py,sha256=vsqkmm6XzZn9ROj_OUR0j2KiN75MEuQsOPeoyc1AeYg,10937
  mteb/models/model_implementations/e5_v.py,sha256=_9W7I0ryIzx_H9eCkzwdm8iHdGX1LIjKGXkhSh_zNv8,6690
+ mteb/models/model_implementations/eagerworks_models.py,sha256=NOQkCUqn9jLSpf9p6KyaIHnJxYV1MNlr2z7hO2AcRSc,5744
  mteb/models/model_implementations/emillykkejensen_models.py,sha256=QdhGqCm_1-AURkrniZj2S1MjwwIVOPMzLvpgfJq-3EQ,2779
  mteb/models/model_implementations/en_code_retriever.py,sha256=leZ-0M6LrunocY3XQBYZU1uevDRopeyR5ujIhwqBbd8,1043
  mteb/models/model_implementations/evaclip_models.py,sha256=cPMGYLDIq4s8zJxb4vPXqJ-rqwPaq7KOh2QZSO6cDas,8000
  mteb/models/model_implementations/fa_models.py,sha256=WGal70_ezITWoNdjcMdbOCTSCtoaXzuPadYstLVXxhg,7478
  mteb/models/model_implementations/geogpt_models.py,sha256=Juv86SwhgQX80lVLjAFtim2aSiJT1AcgjniyyiKyk1Q,1923
  mteb/models/model_implementations/gme_v_models.py,sha256=NkfgR3_UdZzoBt1NnalVou6LOR-F7qXM4by9EbAVrys,13568
- mteb/models/model_implementations/google_models.py,sha256=P3Kg6G8UI6JA17MMlhhoDj54SCAcRP8uZ1CudL-Caf0,9295
+ mteb/models/model_implementations/google_models.py,sha256=ROo83udaUmPx0U_qfFuS55DSrCILVsRZu3oLp_P-srg,9296
  mteb/models/model_implementations/granite_vision_embedding_models.py,sha256=uqQ5-e_a-ADv3gf3sR9Drk0S4x8Gy8mZkpL-E4X16TM,7241
  mteb/models/model_implementations/gritlm_models.py,sha256=aS_CuioL95JAQMYiaKlGuAWU9wZjabn268Xut3bD8-w,3005
  mteb/models/model_implementations/gte_models.py,sha256=o26Xyu_tucUlP435Q_jB4-bl0xckgj4wtbutTwhYgIo,10073
@@ -1503,25 +1504,25 @@ mteb/models/model_implementations/moco_models.py,sha256=Kl0nBsqkG3crYoo5YulFq1fv
  mteb/models/model_implementations/model2vec_models.py,sha256=D-EY-6P-cKKunbgzk4DHzJL1ogpWYFhpHbTLb8qQjJw,13765
  mteb/models/model_implementations/moka_models.py,sha256=Y5do7Z4JyGxabYrjHhkBLqCKTQKotniS-f4kOgXJjag,4995
  mteb/models/model_implementations/mxbai_models.py,sha256=33ta2BnhvKYBUgE89wFgPNf-CnOb7ooumZvqHOvbZsA,3593
- mteb/models/model_implementations/nb_sbert.py,sha256=Ab0OXvOzEqlFFO1679mMiCdJvMSnvcfCoeFMLzNL74I,861
+ mteb/models/model_implementations/nb_sbert.py,sha256=dF3WBn6ERIK7Oqp-tXdLn11Gf0Z7RKLhAoCq0YHxEug,861
  mteb/models/model_implementations/no_instruct_sentence_models.py,sha256=6i-xbLRRNKuDpU-hwklwdQjgu1wnz5CecLSoc6kyd7Q,3976
  mteb/models/model_implementations/nomic_models.py,sha256=mT-v5Gs5-sRH8-ziCw_CtxB9ox3C6FtwWJjNghNrunw,11334
  mteb/models/model_implementations/nomic_models_vision.py,sha256=gEEieMThvw4p-QhRH0G_9-WWTvj-jqOlgFsh6O07dbc,6731
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py,sha256=zLo-GSghLsIpIZguAdg0Vf18Dn-mdizwQDrWiSwupr0,6171
- mteb/models/model_implementations/nvidia_models.py,sha256=aeex14J1yGxc2wm8kCDIgxwo_uy0Fu9y9liFVD-LDOg,21555
+ mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py,sha256=j71ijIGeYammmtPO6O_IQvPHtSRgwvonDjh8QhfwU64,6170
+ mteb/models/model_implementations/nvidia_models.py,sha256=acVverAt77lURkILCVkCdXsWgY1BJoG1-ugB7yIhlIM,21555
  mteb/models/model_implementations/openai_models.py,sha256=2tJyEapIW-GtB3ZOXIHwGjSZGgJl2daE_UsbzH4NhBM,9620
  mteb/models/model_implementations/openclip_models.py,sha256=W8XcokgLU1nSmMaWpYXkWWizVd3sQezcP02YtF2fXpo,11436
  mteb/models/model_implementations/opensearch_neural_sparse_models.py,sha256=fuxIjOx_kPoDps5C7LW3JllG-AZj4ktqeTNgJESHZh4,8351
- mteb/models/model_implementations/ops_moa_models.py,sha256=vTZYi6aYe2UYI_BBaoMad3anozknUoadVQw_EBKX3MU,2411
+ mteb/models/model_implementations/ops_moa_models.py,sha256=luWw1j2iTMx1z1ydLCjvCI89E9Yvge7ruEawivJTmfE,2413
  mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py,sha256=qGXv71qRjNCIFluZOwvfBlFlKKyN2bXBokwUPk4KHmM,1066
  mteb/models/model_implementations/piccolo_models.py,sha256=d8Dtkv_ZTUOCmJLLOuwquq-gX-2UfKvAtl_LvAS0Xi0,2113
- mteb/models/model_implementations/promptriever_models.py,sha256=2iB1n4ZSX0NyDBBk9rryKSwdjhc3D6TgUhhlzJxys8E,6316
+ mteb/models/model_implementations/promptriever_models.py,sha256=S7uWes_P74p3OZR_KBJHJN_ezlvvRx2__46DMCWqV5M,6328
  mteb/models/model_implementations/pylate_models.py,sha256=yINGQL97S4xjj74-FTWpO4KHX-E9NDOEeyQWyRmmnaE,14772
  mteb/models/model_implementations/qodo_models.py,sha256=JDqffDlQiOEariyheybOIf3iNkqot2gTkEIHWDnRbUE,2037
  mteb/models/model_implementations/qtack_models.py,sha256=biZLH5E3UWIcMZXIZNGgBZFEUvovPpAo6vUyL776W1w,1224
- mteb/models/model_implementations/qwen3_models.py,sha256=IpPJC_y-Kt_1Bq6nT2lX3-lMl-hl0BV6VhAL8Czfyfg,5133
- mteb/models/model_implementations/qzhou_models.py,sha256=90tsGftdCX90Gvu63bxtMyzczGi_u9bsUlg4WAVeH9Q,3536
- mteb/models/model_implementations/random_baseline.py,sha256=NH-epZ73BYvV6qYgu09BsoiFxBWypUNLdJIXRp5QPhM,8030
+ mteb/models/model_implementations/qwen3_models.py,sha256=F_o6ciD-6gLFfIlQYD9MsNvcbkmGzJ39eKpFlEog1rM,5132
+ mteb/models/model_implementations/qzhou_models.py,sha256=7KaZpHdap-YyK0QxOMHxU0W2aGismx7GZv_bNXkEOcI,3536
+ mteb/models/model_implementations/random_baseline.py,sha256=1VNnWBSi0Ph_RLON6clOuQI-Kli5BRtiiDFZMrTj7PM,7489
  mteb/models/model_implementations/rasgaard_models.py,sha256=a8F3kDSBWHH0UR7wRioOrWGQUxtloD5mU7EG27iM-68,1260
  mteb/models/model_implementations/reasonir_model.py,sha256=wSCcJpUgZ0pG2g3vTEzYNmPlPG_CVn_rR0ENVCines0,2218
  mteb/models/model_implementations/repllama_models.py,sha256=89HoqEpzkNysHeuf_-YhU8WETamHTogSRztGIRo6G1s,7321
@@ -1553,6 +1554,10 @@ mteb/models/model_implementations/voyage_v.py,sha256=WnvwYNVv3c5K0ChzGA3v2iTQX2e
  mteb/models/model_implementations/xyz_models.py,sha256=TePlrH6EHwRPO87U_J3Yce9-XHCn_X7I2cJ_6BZ2fUY,1296
  mteb/models/model_implementations/youtu_models.py,sha256=NB74E6z-_36HyXb8GXKn8CrmRLN68uX9eH4xcS57zl0,5938
  mteb/models/model_implementations/yuan_models.py,sha256=yZ6ki6YFaoVrJ_2pPSRQaMKOsIOUo3GtmhPx1qeUl2w,939
+ mteb/models/search_encoder_index/__init__.py,sha256=3QFacIuFyEiI7ocsSkb3Lp2S2L7MLkpHCMIJ201fowA,182
+ mteb/models/search_encoder_index/search_backend_protocol.py,sha256=TSjlx88stJcMldbAeVqNCf8JsQvE-B5rf5SBRw90isY,1890
+ mteb/models/search_encoder_index/search_indexes/__init__.py,sha256=Wm60_oUemUpFsvrCMW111dcPH2L2rt1iZrXMskXmG7o,88
+ mteb/models/search_encoder_index/search_indexes/faiss_search_index.py,sha256=WMs3QbbYV13fRuT3dakmdVMZLFdc_9ZzSupS3QxlbVQ,5555
  mteb/results/__init__.py,sha256=EXQqK4Am5eIYzD52dpcGAFSdqnC38oE6JHN302oidHc,158
  mteb/results/benchmark_results.py,sha256=OWqeBxbNsPmOKRhxY980N5CikpdJXToDGJGTXUe64Lw,18209
  mteb/results/model_result.py,sha256=Wdbkpxq7_geliYDr4558i6txDVdsHL-Y9WAv_u7thlI,13689
@@ -2354,7 +2359,7 @@ mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py,sha256=LJGpx4RkS
  mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py,sha256=Mmcvrt_1cIxPfHZfUzSURPZyaaweGiB02im1ZszlS6M,6837
  mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py,sha256=iFUQUlO_ogBdQBVYBQW3o-AJDQ792yg1pJtRxA5I3Qo,3796
  mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py,sha256=UduWKefwP7bPYxiDlztPEvSWXmTdw0xElglMbPY6XhA,4449
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py,sha256=UEvRarrOnziRmY3SSYmDwWutfrRqe4EQBU-BON6qjug,8913
+ mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py,sha256=vOfiruHywYkP8pccdAuGLyYyFTw1zK0qcXDnUFA8Z5A,9091
  mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py,sha256=wOoC--IVTz0dR6RMVICbz6OWxfCyVahGDSfX_TScCgA,16934
  mteb/tasks/retrieval/multilingual/web_faq_retrieval.py,sha256=TM-Q98yXZny_PKHAFNEvw9o9ET_L6VM3aNis1NJ9DgM,2686
  mteb/tasks/retrieval/multilingual/wikipedia_retrieval_multilingual.py,sha256=zyqAt63bHXNU_I37jb891pwWUyGzZUGkXCyhWlRbed8,1569
@@ -2558,9 +2563,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
  mteb/types/_result.py,sha256=CRAUc5IvqI3_9SyXDwv-PWLCXwXdZem9RePeYESRtuw,996
  mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
  mteb/types/statistics.py,sha256=YwJsxTf1eaCI_RE-J37a-gK5wDeGAsmkeZKoZCFihSo,3755
- mteb-2.2.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- mteb-2.2.1.dist-info/METADATA,sha256=0o9W1431q71D-h6_mUMwyuEhk9wbFC-31DyDA0vFDhY,13573
- mteb-2.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- mteb-2.2.1.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
- mteb-2.2.1.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
- mteb-2.2.1.dist-info/RECORD,,
+ mteb-2.3.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ mteb-2.3.0.dist-info/METADATA,sha256=ZXHapOBoYWzV3b_qYW_igqfvEsqDVXky9deYaQDAOFI,13662
+ mteb-2.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ mteb-2.3.0.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
+ mteb-2.3.0.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
+ mteb-2.3.0.dist-info/RECORD,,
The remaining dist-info files (WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt) are renamed with the version bump but otherwise unchanged.