mteb 2.5.2__py3-none-any.whl → 2.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +10 -15
- mteb/_evaluators/any_sts_evaluator.py +1 -4
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +17 -16
- mteb/_evaluators/sklearn_evaluator.py +9 -8
- mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
- mteb/_evaluators/text/summarization_evaluator.py +20 -16
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +33 -27
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +7 -26
- mteb/abstasks/classification.py +10 -4
- mteb/abstasks/clustering.py +18 -14
- mteb/abstasks/clustering_legacy.py +8 -8
- mteb/abstasks/image/image_text_pair_classification.py +5 -3
- mteb/abstasks/multilabel_classification.py +20 -16
- mteb/abstasks/pair_classification.py +18 -9
- mteb/abstasks/regression.py +3 -3
- mteb/abstasks/retrieval.py +12 -9
- mteb/abstasks/sts.py +6 -3
- mteb/abstasks/task_metadata.py +22 -19
- mteb/abstasks/text/bitext_mining.py +36 -25
- mteb/abstasks/text/reranking.py +7 -5
- mteb/abstasks/text/summarization.py +8 -3
- mteb/abstasks/zeroshot_classification.py +5 -2
- mteb/benchmarks/benchmark.py +2 -2
- mteb/cache.py +27 -22
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +15 -10
- mteb/cli/generate_model_card.py +10 -7
- mteb/deprecated_evaluator.py +60 -46
- mteb/evaluate.py +39 -30
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +1 -1
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +7 -5
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +8 -1
- mteb/models/instruct_wrapper.py +11 -5
- mteb/models/model_implementations/andersborges.py +2 -2
- mteb/models/model_implementations/blip_models.py +8 -8
- mteb/models/model_implementations/bm25.py +1 -1
- mteb/models/model_implementations/clip_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/dino_models.py +23 -23
- mteb/models/model_implementations/emillykkejensen_models.py +3 -3
- mteb/models/model_implementations/gme_v_models.py +4 -3
- mteb/models/model_implementations/jina_clip.py +1 -1
- mteb/models/model_implementations/jina_models.py +1 -1
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +3 -3
- mteb/models/model_implementations/mcinext_models.py +4 -1
- mteb/models/model_implementations/moco_models.py +2 -2
- mteb/models/model_implementations/model2vec_models.py +1 -1
- mteb/models/model_implementations/nomic_models.py +8 -8
- mteb/models/model_implementations/openclip_models.py +7 -7
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -1
- mteb/models/model_implementations/repllama_models.py +2 -2
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
- mteb/models/model_implementations/siglip_models.py +10 -10
- mteb/models/model_implementations/vlm2vec_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +4 -4
- mteb/models/model_meta.py +14 -13
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
- mteb/models/search_wrappers.py +26 -12
- mteb/models/sentence_transformer_wrapper.py +19 -14
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +28 -20
- mteb/results/model_result.py +52 -22
- mteb/results/task_result.py +55 -58
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/METADATA +1 -1
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/RECORD +104 -103
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/WHEEL +0 -0
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/top_level.txt +0 -0
mteb/models/cache_wrappers/cache_backends/numpy_cache.py
CHANGED

@@ -1,11 +1,11 @@
 import json
 import logging
+import warnings
 from pathlib import Path
+from typing import Any

 import numpy as np

-from mteb.types import BatchedInput
-
 from ._hash_utils import _hash_item

 logger = logging.getLogger(__name__)
@@ -14,7 +14,7 @@ logger = logging.getLogger(__name__)
 class NumpyCache:
     """Generic vector cache for both text and images."""

-    def __init__(self, directory: str | Path, initial_vectors: int =
+    def __init__(self, directory: str | Path, initial_vectors: int = 100_000):
         self.directory = Path(directory)
         self.directory.mkdir(parents=True, exist_ok=True)
         self.vectors_file = self.directory / "vectors.npy"
@@ -27,7 +27,7 @@ class NumpyCache:
         logger.info(f"Initialized VectorCacheMap in directory: {self.directory}")
         self._initialize_vectors_file()

-    def add(self,
+    def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None:
         """Add a vector to the cache."""
         try:
             if self.vector_dim is None:
@@ -38,12 +38,17 @@ class NumpyCache:
                 self._save_dimension()
                 logger.info(f"Initialized vector dimension to {self.vector_dim}")

-
+            if self.vectors is None:
+                raise RuntimeError(
+                    "Vectors file not initialized. Call _initialize_vectors_file() first."
+                )
+
+            for item, vec in zip(items, vectors):
                 item_hash = _hash_item(item)
                 if item_hash in self.hash_to_index:
-
-
-                    )
+                    msg = f"Hash collision or duplicate item for hash {item_hash}. Overwriting existing vector."
+                    logger.warning(msg)
+                    warnings.warn(msg)
                     index = self.hash_to_index[item_hash]
                 else:
                     index = len(self.hash_to_index)
@@ -74,18 +79,26 @@ class NumpyCache:
                 shape=(self.initial_vectors, self.vector_dim),
             )
         else:
-            self.vectors = np.memmap(
-
+            self.vectors = np.memmap(
+                self.vectors_file,
+                dtype="float32",
+                mode="r+",
+                shape=(-1, self.vector_dim),
+            )
         logger.info(f"Vectors file initialized with shape: {self.vectors.shape}")

     def _double_vectors_file(self) -> None:
+        if self.vectors is None or self.vector_dim is None:
+            raise RuntimeError(
+                "Vectors file not initialized. Call _initialize_vectors_file() first."
+            )
         current_size = len(self.vectors)
         new_size = current_size * 2
         logger.info(f"Doubling vectors file from {current_size} to {new_size} vectors")
         self.vectors.flush()
         new_vectors = np.memmap(
-            self.vectors_file,
-            dtype=
+            str(self.vectors_file),
+            dtype=np.float32,
             mode="r+",
             shape=(new_size, self.vector_dim),
         )
@@ -107,9 +120,9 @@ class NumpyCache:
                 f"Loaded vector dimension {self.vector_dim} from {self.dimension_file}"
             )
         else:
-
-
-            )
+            msg = "Dimension file not found. Vector dimension remains uninitialized."
+            logger.warning(msg)
+            warnings.warn(msg)

     def save(self) -> None:
         """Persist VectorCacheMap to disk."""
@@ -146,25 +159,30 @@ class NumpyCache:

                 if self.vector_dim is not None:
                     self.vectors = np.memmap(
-                        self.vectors_file,
+                        self.vectors_file,
+                        dtype="float32",
+                        mode="r+",
+                        shape=(-1, self.vector_dim),
                     )
-                    self.vectors = self.vectors.reshape(-1, self.vector_dim)
                     logger.info(f"Loaded vectors file with shape: {self.vectors.shape}")
                 else:
-
-
-                    )
+                    msg = "Vector dimension not set. Unable to load vectors file."
+                    logger.warning(msg)
+                    warnings.warn(msg)
                 logger.info(f"Loaded VectorCacheMap from {self.directory}")
             else:
-
-
-                )
+                msg = "No existing files found. Initialized empty VectorCacheMap."
+                logger.warning(msg)
+                warnings.warn(msg)
         except Exception as e:
             logger.error(f"Error loading VectorCacheMap: {str(e)}")
             raise

-    def get_vector(self, item:
+    def get_vector(self, item: dict[str, Any]) -> np.ndarray | None:
         """Retrieve vector from index by hash."""
+        if self.vectors is None:
+            return None
+
         try:
             item_hash = _hash_item(item)
             if item_hash not in self.hash_to_index:
@@ -176,7 +194,7 @@ class NumpyCache:
             logger.error(f"Error retrieving vector for item: {str(e)}")
             raise

-    def __contains__(self, item:
+    def __contains__(self, item: dict[str, Any]) -> bool:
         return _hash_item(item) in self.hash_to_index

     def __del__(self):
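The dominant change in numpy_cache.py is two-channel error reporting: conditions whose old handling is truncated by the diff viewer now go through paired logger.warning and warnings.warn calls, and the new None guards on self.vectors fail fast with a RuntimeError. A minimal, runnable sketch of that pattern and of how a caller can escalate it; the report helper is illustrative, not part of mteb:

import logging
import warnings

logger = logging.getLogger(__name__)

def report(msg: str) -> None:
    # The 2.5.4 pattern: log for operators, warn for programmatic callers.
    logger.warning(msg)
    warnings.warn(msg)

# Callers that want these soft failures to be fatal (e.g. in tests) can escalate:
with warnings.catch_warnings():
    warnings.simplefilter("error")  # any warnings.warn(...) now raises UserWarning
    try:
        report("Dimension file not found. Vector dimension remains uninitialized.")
    except UserWarning as exc:
        print(f"escalated: {exc}")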
mteb/models/cache_wrappers/cache_wrapper.py
CHANGED

@@ -90,9 +90,9 @@ class CachedEmbeddingWrapper:
         try:
             cache = self._get_or_create_cache(task_name)

-            uncached_items: list[
+            uncached_items: list[dict[str, Any]] = []
             uncached_indices: list[int] = []
-            all_items = inputs.dataset
+            all_items: Dataset = inputs.dataset
             cached_vectors: dict[int, np.ndarray] = {}

             for i, item in enumerate(all_items):
mteb/models/get_model_meta.py
CHANGED

@@ -93,7 +93,14 @@ def get_model(
     meta = get_model_meta(model_name, revision)
     model = meta.load_model(**kwargs)

-
+    if kwargs:
+        logger.info(
+            f"Model '{model_name}' loaded with additional arguments: {list(kwargs.keys())}"
+        )
+        meta = meta.model_copy(deep=True)
+        meta.loader_kwargs |= kwargs
+
+    model.mteb_model_meta = meta  # type: ignore[misc]
     return model

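The get_model change means extra loader kwargs are no longer dropped after loading: they are logged and merged into a deep copy of the ModelMeta (pydantic's model_copy) before the copy is attached as model.mteb_model_meta, so the shared registry entry is not mutated. A short usage sketch; the model name and keyword argument are illustrative, not from the diff:

import mteb

# kwargs are forwarded to the loader and, as of 2.5.4, also recorded on
# the per-call metadata copy attached to the returned model.
model = mteb.get_model("intfloat/multilingual-e5-small", device="cpu")
print(model.mteb_model_meta.loader_kwargs)  # now includes {"device": "cpu"}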
mteb/models/instruct_wrapper.py
CHANGED

@@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
 def instruct_wrapper(
     model_name_or_path: str,
     mode: str,
-    instruction_template: str | Callable[[str], str] | None = None,
+    instruction_template: str | Callable[[str, PromptType | None], str] | None = None,
     **kwargs,
 ):
     """Instruct wrapper for models. Uses GritLM to pass instructions to the model.
@@ -40,7 +40,9 @@ def instruct_wrapper(
         self,
         model_name_or_path: str,
         mode: str,
-        instruction_template: str
+        instruction_template: str
+        | Callable[[str, PromptType | None], str]
+        | None = None,
         **kwargs,
     ):
         if (
@@ -82,8 +84,11 @@ def instruct_wrapper(
             logger.info(
                 f"Using instruction: '{instruction}' for task: '{task_metadata.name}'"
             )
-            embeddings = super().encode(
-                _inputs,
+            embeddings = super().encode(  # type: ignore[safe-super]
+                _inputs,  # type: ignore[arg-type]
+                instruction=instruction,
+                *args,
+                **kwargs,
             )
             if isinstance(embeddings, torch.Tensor):
                 # sometimes in kwargs can be return_tensors=True
@@ -141,7 +146,7 @@ class InstructSentenceTransformerModel(AbsEncoder):
             )

         self.instruction_template = instruction_template
-        tokenizer_params = {}
+        tokenizer_params: dict[str, Any] = {}
         if add_eos_token:
             tokenizer_params["add_eos_token"] = add_eos_token
         if max_seq_length is not None:
@@ -193,6 +198,7 @@ class InstructSentenceTransformerModel(AbsEncoder):
             The encoded input in a numpy array or torch tensor of the shape (Number of sentences) x (Embedding dimension).
         """
         sentences = [text for batch in inputs for text in batch["text"]]
+        instruction: str | None
         instruction = self.get_task_instruction(task_metadata, prompt_type)

         # to passage prompts won't be applied to passages
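The instruct_wrapper changes widen callable instruction templates from Callable[[str], str] to Callable[[str, PromptType | None], str], so a template can branch on whether it is formatting a query or a passage, and the resolved instruction is now forwarded to super().encode. A sketch of a conforming template; the PromptType import path and enum members are assumptions:

from mteb.types import PromptType  # import location assumed

def asymmetric_template(instruction: str, prompt_type: PromptType | None) -> str:
    # Apply the instruction on the query side only; passages stay bare.
    # PromptType.query / PromptType.passage assumed as the enum members.
    if prompt_type == PromptType.query:
        return f"Instruct: {instruction}\nQuery: "
    return ""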
mteb/models/model_implementations/andersborges.py
CHANGED

@@ -4,7 +4,7 @@ from mteb.models.model_implementations.model2vec_models import Model2VecModel
 from mteb.models.model_meta import ModelMeta, ScoringFunction

 model2vecdk = ModelMeta(
-    loader=Model2VecModel,
+    loader=Model2VecModel,
     name="andersborges/model2vecdk",
     model_type=["dense"],
     languages=["dan-Latn"],
@@ -35,7 +35,7 @@ model2vecdk = ModelMeta(


 model2vecdk_stem = ModelMeta(
-    loader=Model2VecModel,
+    loader=Model2VecModel,
     name="andersborges/model2vecdk-stem",
     model_type=["dense"],
     languages=["dan-Latn"],
mteb/models/model_implementations/blip_models.py
CHANGED

@@ -128,7 +128,7 @@ class BLIPModel(AbsEncoder):

 # in descending order of usage (downloads from huggingface)
 blip_image_captioning_large = ModelMeta(
-    loader=BLIPModel,
+    loader=BLIPModel,
     name="Salesforce/blip-image-captioning-large",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -156,7 +156,7 @@ blip_image_captioning_large = ModelMeta(
 )

 blip_image_captioning_base = ModelMeta(
-    loader=BLIPModel,
+    loader=BLIPModel,
     name="Salesforce/blip-image-captioning-base",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -185,7 +185,7 @@ blip_image_captioning_base = ModelMeta(


 blip_vqa_base = ModelMeta(
-    loader=BLIPModel,
+    loader=BLIPModel,
     name="Salesforce/blip-vqa-base",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -212,7 +212,7 @@ blip_vqa_base = ModelMeta(
 )

 blip_vqa_capfilt_large = ModelMeta(
-    loader=BLIPModel,
+    loader=BLIPModel,
     name="Salesforce/blip-vqa-capfilt-large",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -239,7 +239,7 @@ blip_vqa_capfilt_large = ModelMeta(
 )

 blip_itm_base_coco = ModelMeta(
-    loader=BLIPModel,
+    loader=BLIPModel,
     name="Salesforce/blip-itm-base-coco",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -266,7 +266,7 @@ blip_itm_base_coco = ModelMeta(
 )

 blip_itm_large_coco = ModelMeta(
-    loader=BLIPModel,
+    loader=BLIPModel,
     name="Salesforce/blip-itm-large-coco",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -294,7 +294,7 @@ blip_itm_large_coco = ModelMeta(
 )

 blip_itm_base_flickr = ModelMeta(
-    loader=BLIPModel,
+    loader=BLIPModel,
     name="Salesforce/blip-itm-base-flickr",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -322,7 +322,7 @@ blip_itm_base_flickr = ModelMeta(
 )

 blip_itm_large_flickr = ModelMeta(
-    loader=BLIPModel,
+    loader=BLIPModel,
     name="Salesforce/blip-itm-large-flickr",
     model_type=["dense"],
     languages=["eng-Latn"],
mteb/models/model_implementations/bm25.py
CHANGED

@@ -113,7 +113,7 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:

         def encode(self, texts: list[str]):
             """Encode input text as term vectors"""
-            return bm25s.tokenize(texts, stopwords=self.stopwords, stemmer=self.stemmer)
+            return bm25s.tokenize(texts, stopwords=self.stopwords, stemmer=self.stemmer)

     return BM25Search(**kwargs)

mteb/models/model_implementations/clip_models.py
CHANGED

@@ -115,7 +115,7 @@ CLIP_CITATION = """


 clip_vit_large_patch14 = ModelMeta(
-    loader=CLIPModel,
+    loader=CLIPModel,
     name="openai/clip-vit-large-patch14",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -139,7 +139,7 @@ clip_vit_large_patch14 = ModelMeta(
 )

 clip_vit_base_patch32 = ModelMeta(
-    loader=CLIPModel,
+    loader=CLIPModel,
     name="openai/clip-vit-base-patch32",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -163,7 +163,7 @@ clip_vit_base_patch32 = ModelMeta(
 )

 clip_vit_base_patch16 = ModelMeta(
-    loader=CLIPModel,
+    loader=CLIPModel,
     name="openai/clip-vit-base-patch16",
     model_type=["dense"],
     languages=["eng-Latn"],
mteb/models/model_implementations/cohere_models.py
CHANGED

@@ -222,7 +222,7 @@ class CohereTextEmbeddingModel(AbsEncoder):
     ) -> None:
         requires_package(self, "cohere", model_name, "pip install 'mteb[cohere]'")

-        import cohere
+        import cohere

         self.model_name = model_name.removeprefix("Cohere/Cohere-")
         self.sep = sep
mteb/models/model_implementations/cohere_v.py
CHANGED

@@ -378,7 +378,7 @@ def cohere_v_loader(model_name, **kwargs):


 cohere_mult_3 = ModelMeta(
-    loader=cohere_v_loader,
+    loader=cohere_v_loader,
     loader_kwargs={"model_name": "embed-multilingual-v3.0"},
     name="cohere/embed-multilingual-v3.0",
     model_type=["dense"],
@@ -402,7 +402,7 @@ cohere_mult_3 = ModelMeta(
 )

 cohere_eng_3 = ModelMeta(
-    loader=cohere_v_loader,
+    loader=cohere_v_loader,
     loader_kwargs={"model_name": "embed-english-v3.0"},
     name="cohere/embed-english-v3.0",
     model_type=["dense"],
mteb/models/model_implementations/dino_models.py
CHANGED

@@ -104,7 +104,7 @@ dinov2_training_datasets = set(


 dinov2_small = ModelMeta(
-    loader=DINOModel,
+    loader=DINOModel,
     name="facebook/dinov2-small",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -125,7 +125,7 @@ dinov2_small = ModelMeta(
     use_instructions=False,
     training_datasets=dinov2_training_datasets,
     citation="""@misc{oquab2023dinov2,
-    title={DINOv2: Learning Robust Visual Features without Supervision},
+    title={DINOv2: Learning Robust Visual Features without Supervision},
     author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski},
     year={2023},
     eprint={2304.07193},
@@ -135,7 +135,7 @@ dinov2_small = ModelMeta(
 )

 dinov2_base = ModelMeta(
-    loader=DINOModel,
+    loader=DINOModel,
     name="facebook/dinov2-base",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -156,7 +156,7 @@ dinov2_base = ModelMeta(
     use_instructions=False,
     training_datasets=dinov2_training_datasets,
     citation="""@misc{oquab2023dinov2,
-    title={DINOv2: Learning Robust Visual Features without Supervision},
+    title={DINOv2: Learning Robust Visual Features without Supervision},
     author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski},
     year={2023},
     eprint={2304.07193},
@@ -166,7 +166,7 @@ dinov2_base = ModelMeta(
 )

 dinov2_large = ModelMeta(
-    loader=DINOModel,
+    loader=DINOModel,
     name="facebook/dinov2-large",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -187,7 +187,7 @@ dinov2_large = ModelMeta(
     use_instructions=False,
     training_datasets=dinov2_training_datasets,
     citation="""@misc{oquab2023dinov2,
-    title={DINOv2: Learning Robust Visual Features without Supervision},
+    title={DINOv2: Learning Robust Visual Features without Supervision},
     author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski},
     year={2023},
     eprint={2304.07193},
@@ -197,7 +197,7 @@ dinov2_large = ModelMeta(
 )

 dinov2_giant = ModelMeta(
-    loader=DINOModel,
+    loader=DINOModel,
     name="facebook/dinov2-giant",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -218,7 +218,7 @@ dinov2_giant = ModelMeta(
     use_instructions=False,
     training_datasets=dinov2_training_datasets,
     citation="""@misc{oquab2023dinov2,
-    title={DINOv2: Learning Robust Visual Features without Supervision},
+    title={DINOv2: Learning Robust Visual Features without Supervision},
     author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski},
     year={2023},
     eprint={2304.07193},
@@ -253,7 +253,7 @@ webssl_dino300m_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -284,7 +284,7 @@ webssl_dino1b_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -315,7 +315,7 @@ webssl_dino2b_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -346,7 +346,7 @@ webssl_dino3b_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -377,7 +377,7 @@ webssl_dino5b_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -408,7 +408,7 @@ webssl_dino7b_full8b_224 = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -439,7 +439,7 @@ webssl_dino7b_full8b_378 = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -470,7 +470,7 @@ webssl_dino7b_full8b_518 = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -502,7 +502,7 @@ webssl_dino2b_light2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -533,7 +533,7 @@ webssl_dino2b_heavy2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -564,7 +564,7 @@ webssl_dino3b_light2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -595,7 +595,7 @@ webssl_dino3b_heavy2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -626,7 +626,7 @@ webssl_mae300m_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -657,7 +657,7 @@ webssl_mae700m_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -688,7 +688,7 @@ webssl_mae1b_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
mteb/models/model_implementations/emillykkejensen_models.py
CHANGED

@@ -2,7 +2,7 @@ from mteb.models.model_meta import ModelMeta
 from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader

 embedding_gemma_300m_scandi = ModelMeta(
-    loader=sentence_transformers_loader,
+    loader=sentence_transformers_loader,
     name="emillykkejensen/EmbeddingGemma-Scandi-300m",
     model_type=["dense"],
     languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"],
@@ -35,7 +35,7 @@ embedding_gemma_300m_scandi = ModelMeta(


 qwen_scandi = ModelMeta(
-    loader=sentence_transformers_loader,
+    loader=sentence_transformers_loader,
     name="emillykkejensen/Qwen3-Embedding-Scandi-0.6B",
     model_type=["dense"],
     languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"],
@@ -59,7 +59,7 @@ qwen_scandi = ModelMeta(


 mmbert_scandi = ModelMeta(
-    loader=sentence_transformers_loader,
+    loader=sentence_transformers_loader,
     name="emillykkejensen/mmBERTscandi-base-embedding",
     model_type=["dense"],
     languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"],
mteb/models/model_implementations/gme_v_models.py
CHANGED

@@ -2,6 +2,7 @@ from __future__ import annotations

 import logging
 import math
+import warnings
 from typing import TYPE_CHECKING, Any

 import torch
@@ -261,9 +262,9 @@ def smart_resize(
     w_bar = ceil_by_factor(width * beta, factor)

     if max(h_bar, w_bar) / min(h_bar, w_bar) > MAX_RATIO:
-
-
-        )
+        msg = f"Absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(h_bar, w_bar) / min(h_bar, w_bar)}"
+        logger.warning(msg)
+        warnings.warn(msg)
         if h_bar > w_bar:
             h_bar = w_bar * MAX_RATIO
         else: