mteb 2.5.2__py3-none-any.whl → 2.5.4__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
Files changed (104)
  1. mteb/_create_dataloaders.py +10 -15
  2. mteb/_evaluators/any_sts_evaluator.py +1 -4
  3. mteb/_evaluators/evaluator.py +2 -1
  4. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
  5. mteb/_evaluators/pair_classification_evaluator.py +3 -1
  6. mteb/_evaluators/retrieval_metrics.py +17 -16
  7. mteb/_evaluators/sklearn_evaluator.py +9 -8
  8. mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
  9. mteb/_evaluators/text/summarization_evaluator.py +20 -16
  10. mteb/abstasks/_data_filter/filters.py +1 -1
  11. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  12. mteb/abstasks/_statistics_calculation.py +18 -10
  13. mteb/abstasks/_stratification.py +18 -18
  14. mteb/abstasks/abstask.py +33 -27
  15. mteb/abstasks/aggregate_task_metadata.py +1 -9
  16. mteb/abstasks/aggregated_task.py +7 -26
  17. mteb/abstasks/classification.py +10 -4
  18. mteb/abstasks/clustering.py +18 -14
  19. mteb/abstasks/clustering_legacy.py +8 -8
  20. mteb/abstasks/image/image_text_pair_classification.py +5 -3
  21. mteb/abstasks/multilabel_classification.py +20 -16
  22. mteb/abstasks/pair_classification.py +18 -9
  23. mteb/abstasks/regression.py +3 -3
  24. mteb/abstasks/retrieval.py +12 -9
  25. mteb/abstasks/sts.py +6 -3
  26. mteb/abstasks/task_metadata.py +22 -19
  27. mteb/abstasks/text/bitext_mining.py +36 -25
  28. mteb/abstasks/text/reranking.py +7 -5
  29. mteb/abstasks/text/summarization.py +8 -3
  30. mteb/abstasks/zeroshot_classification.py +5 -2
  31. mteb/benchmarks/benchmark.py +2 -2
  32. mteb/cache.py +27 -22
  33. mteb/cli/_display_tasks.py +2 -2
  34. mteb/cli/build_cli.py +15 -10
  35. mteb/cli/generate_model_card.py +10 -7
  36. mteb/deprecated_evaluator.py +60 -46
  37. mteb/evaluate.py +39 -30
  38. mteb/filter_tasks.py +25 -26
  39. mteb/get_tasks.py +29 -30
  40. mteb/languages/language_scripts.py +5 -3
  41. mteb/leaderboard/app.py +1 -1
  42. mteb/load_results.py +12 -12
  43. mteb/models/abs_encoder.py +7 -5
  44. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  45. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  46. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  47. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  48. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  49. mteb/models/get_model_meta.py +8 -1
  50. mteb/models/instruct_wrapper.py +11 -5
  51. mteb/models/model_implementations/andersborges.py +2 -2
  52. mteb/models/model_implementations/blip_models.py +8 -8
  53. mteb/models/model_implementations/bm25.py +1 -1
  54. mteb/models/model_implementations/clip_models.py +3 -3
  55. mteb/models/model_implementations/cohere_models.py +1 -1
  56. mteb/models/model_implementations/cohere_v.py +2 -2
  57. mteb/models/model_implementations/dino_models.py +23 -23
  58. mteb/models/model_implementations/emillykkejensen_models.py +3 -3
  59. mteb/models/model_implementations/gme_v_models.py +4 -3
  60. mteb/models/model_implementations/jina_clip.py +1 -1
  61. mteb/models/model_implementations/jina_models.py +1 -1
  62. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
  63. mteb/models/model_implementations/llm2clip_models.py +3 -3
  64. mteb/models/model_implementations/mcinext_models.py +4 -1
  65. mteb/models/model_implementations/moco_models.py +2 -2
  66. mteb/models/model_implementations/model2vec_models.py +1 -1
  67. mteb/models/model_implementations/nomic_models.py +8 -8
  68. mteb/models/model_implementations/openclip_models.py +7 -7
  69. mteb/models/model_implementations/random_baseline.py +3 -3
  70. mteb/models/model_implementations/rasgaard_models.py +1 -1
  71. mteb/models/model_implementations/repllama_models.py +2 -2
  72. mteb/models/model_implementations/rerankers_custom.py +3 -3
  73. mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
  74. mteb/models/model_implementations/siglip_models.py +10 -10
  75. mteb/models/model_implementations/vlm2vec_models.py +1 -1
  76. mteb/models/model_implementations/voyage_v.py +4 -4
  77. mteb/models/model_meta.py +14 -13
  78. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  79. mteb/models/search_wrappers.py +26 -12
  80. mteb/models/sentence_transformer_wrapper.py +19 -14
  81. mteb/py.typed +0 -0
  82. mteb/results/benchmark_results.py +28 -20
  83. mteb/results/model_result.py +52 -22
  84. mteb/results/task_result.py +55 -58
  85. mteb/similarity_functions.py +11 -7
  86. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  87. mteb/tasks/classification/est/estonian_valence.py +1 -1
  88. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  89. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  90. mteb/tasks/retrieval/code/code_rag.py +12 -12
  91. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  92. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  93. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  94. mteb/tasks/retrieval/nob/norquad.py +2 -2
  95. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  96. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  97. mteb/types/_result.py +2 -1
  98. mteb/types/statistics.py +9 -3
  99. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/METADATA +1 -1
  100. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/RECORD +104 -103
  101. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/WHEEL +0 -0
  102. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/entry_points.txt +0 -0
  103. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/licenses/LICENSE +0 -0
  104. {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/top_level.txt +0 -0
mteb/models/cache_wrappers/cache_backends/numpy_cache.py

@@ -1,11 +1,11 @@
 import json
 import logging
+import warnings
 from pathlib import Path
+from typing import Any
 
 import numpy as np
 
-from mteb.types import BatchedInput
-
 from ._hash_utils import _hash_item
 
 logger = logging.getLogger(__name__)
@@ -14,7 +14,7 @@ logger = logging.getLogger(__name__)
 class NumpyCache:
     """Generic vector cache for both text and images."""
 
-    def __init__(self, directory: str | Path, initial_vectors: int = 100000):
+    def __init__(self, directory: str | Path, initial_vectors: int = 100_000):
         self.directory = Path(directory)
         self.directory.mkdir(parents=True, exist_ok=True)
         self.vectors_file = self.directory / "vectors.npy"
@@ -27,7 +27,7 @@ class NumpyCache:
         logger.info(f"Initialized VectorCacheMap in directory: {self.directory}")
         self._initialize_vectors_file()
 
-    def add(self, item: list[BatchedInput], vectors: np.ndarray) -> None:
+    def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None:
         """Add a vector to the cache."""
         try:
             if self.vector_dim is None:
@@ -38,12 +38,17 @@ class NumpyCache:
                 self._save_dimension()
                 logger.info(f"Initialized vector dimension to {self.vector_dim}")
 
-            for item, vec in zip(item, vectors):
+            if self.vectors is None:
+                raise RuntimeError(
+                    "Vectors file not initialized. Call _initialize_vectors_file() first."
+                )
+
+            for item, vec in zip(items, vectors):
                 item_hash = _hash_item(item)
                 if item_hash in self.hash_to_index:
-                    logger.warning(
-                        "Hash collision or duplicate item. Overwriting existing vector."
-                    )
+                    msg = f"Hash collision or duplicate item for hash {item_hash}. Overwriting existing vector."
+                    logger.warning(msg)
+                    warnings.warn(msg)
                     index = self.hash_to_index[item_hash]
                 else:
                     index = len(self.hash_to_index)
@@ -74,18 +79,26 @@ class NumpyCache:
                 shape=(self.initial_vectors, self.vector_dim),
             )
         else:
-            self.vectors = np.memmap(self.vectors_file, dtype="float32", mode="r+")
-            self.vectors = self.vectors.reshape(-1, self.vector_dim)
+            self.vectors = np.memmap(
+                self.vectors_file,
+                dtype="float32",
+                mode="r+",
+                shape=(-1, self.vector_dim),
+            )
         logger.info(f"Vectors file initialized with shape: {self.vectors.shape}")
 
     def _double_vectors_file(self) -> None:
+        if self.vectors is None or self.vector_dim is None:
+            raise RuntimeError(
+                "Vectors file not initialized. Call _initialize_vectors_file() first."
+            )
         current_size = len(self.vectors)
         new_size = current_size * 2
         logger.info(f"Doubling vectors file from {current_size} to {new_size} vectors")
         self.vectors.flush()
         new_vectors = np.memmap(
-            self.vectors_file,
-            dtype="float32",
+            str(self.vectors_file),
+            dtype=np.float32,
             mode="r+",
             shape=(new_size, self.vector_dim),
         )
@@ -107,9 +120,9 @@ class NumpyCache:
                 f"Loaded vector dimension {self.vector_dim} from {self.dimension_file}"
             )
         else:
-            logger.warning(
-                "Dimension file not found. Vector dimension remains uninitialized."
-            )
+            msg = "Dimension file not found. Vector dimension remains uninitialized."
+            logger.warning(msg)
+            warnings.warn(msg)
 
     def save(self) -> None:
         """Persist VectorCacheMap to disk."""
@@ -146,25 +159,30 @@ class NumpyCache:
 
                 if self.vector_dim is not None:
                     self.vectors = np.memmap(
-                        self.vectors_file, dtype="float32", mode="r+"
+                        self.vectors_file,
+                        dtype="float32",
+                        mode="r+",
+                        shape=(-1, self.vector_dim),
                     )
-                    self.vectors = self.vectors.reshape(-1, self.vector_dim)
                     logger.info(f"Loaded vectors file with shape: {self.vectors.shape}")
                 else:
-                    logger.warning(
-                        "Vector dimension not set. Unable to load vectors file."
-                    )
+                    msg = "Vector dimension not set. Unable to load vectors file."
+                    logger.warning(msg)
+                    warnings.warn(msg)
                 logger.info(f"Loaded VectorCacheMap from {self.directory}")
             else:
-                logger.warning(
-                    "No existing files found. Initialized empty VectorCacheMap."
-                )
+                msg = "No existing files found. Initialized empty VectorCacheMap."
+                logger.warning(msg)
+                warnings.warn(msg)
        except Exception as e:
            logger.error(f"Error loading VectorCacheMap: {str(e)}")
            raise
 
-    def get_vector(self, item: BatchedInput) -> np.ndarray | None:
+    def get_vector(self, item: dict[str, Any]) -> np.ndarray | None:
         """Retrieve vector from index by hash."""
+        if self.vectors is None:
+            return None
+
         try:
             item_hash = _hash_item(item)
             if item_hash not in self.hash_to_index:
@@ -176,7 +194,7 @@ class NumpyCache:
             logger.error(f"Error retrieving vector for item: {str(e)}")
             raise
 
-    def __contains__(self, item: BatchedInput) -> bool:
+    def __contains__(self, item: dict[str, Any]) -> bool:
         return _hash_item(item) in self.hash_to_index
 
     def __del__(self):
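With 2.5.4, NumpyCache is typed against plain dicts rather than BatchedInput, and get_vector returns None when no memmap is loaded instead of failing. A minimal usage sketch; the cache directory, item fields, and embedding width below are illustrative assumptions, not taken from the diff:

import numpy as np

from mteb.models.cache_wrappers.cache_backends.numpy_cache import NumpyCache

items = [{"text": "a small cat"}, {"text": "a large dog"}]  # plain-dict items
vectors = np.random.rand(2, 384).astype("float32")  # assumed 384-dim encoder output

cache = NumpyCache("/tmp/mteb-vector-cache")  # hypothetical scratch directory
cache.add(items, vectors)  # first add() fixes vector_dim and creates the memmap

if items[0] in cache:  # __contains__ hashes the dict via _hash_item
    vec = cache.get_vector(items[0])  # now returns None if the vectors file is absent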
mteb/models/cache_wrappers/cache_wrapper.py

@@ -90,9 +90,9 @@ class CachedEmbeddingWrapper:
         try:
             cache = self._get_or_create_cache(task_name)
 
-            uncached_items: list[BatchedInput] = []
+            uncached_items: list[dict[str, Any]] = []
             uncached_indices: list[int] = []
-            all_items = inputs.dataset
+            all_items: Dataset = inputs.dataset
             cached_vectors: dict[int, np.ndarray] = {}
 
             for i, item in enumerate(all_items):
mteb/models/get_model_meta.py

@@ -93,7 +93,14 @@ def get_model(
     meta = get_model_meta(model_name, revision)
     model = meta.load_model(**kwargs)
 
-    model.mteb_model_meta = meta  # type: ignore
+    if kwargs:
+        logger.info(
+            f"Model '{model_name}' loaded with additional arguments: {list(kwargs.keys())}"
+        )
+        meta = meta.model_copy(deep=True)
+        meta.loader_kwargs |= kwargs
+
+    model.mteb_model_meta = meta  # type: ignore[misc]
     return model
 
 
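get_model now merges call-time loader kwargs into a deep copy of the ModelMeta, so the attached metadata records how the model was actually loaded while the shared registry entry stays unmodified. A sketch of the observable behaviour; the model name and the device kwarg are illustrative assumptions:

import mteb

# Extra keyword arguments are forwarded to the model loader.
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2", device="cpu")

# As of 2.5.4 the attached meta is a deep copy with the kwargs merged in,
# so the registry's original ModelMeta is not mutated.
print(model.mteb_model_meta.loader_kwargs)  # expected to include {"device": "cpu"}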
 
mteb/models/instruct_wrapper.py

@@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
 def instruct_wrapper(
     model_name_or_path: str,
     mode: str,
-    instruction_template: str | Callable[[str], str] | None = None,
+    instruction_template: str | Callable[[str, PromptType | None], str] | None = None,
     **kwargs,
 ):
     """Instruct wrapper for models. Uses GritLM to pass instructions to the model.
@@ -40,7 +40,9 @@
             self,
             model_name_or_path: str,
             mode: str,
-            instruction_template: str | Callable[[str, PromptType], str] | None = None,
+            instruction_template: str
+            | Callable[[str, PromptType | None], str]
+            | None = None,
             **kwargs,
         ):
             if (
@@ -82,8 +84,11 @@ def instruct_wrapper(
             logger.info(
                 f"Using instruction: '{instruction}' for task: '{task_metadata.name}'"
             )
-            embeddings = super().encode(
-                _inputs, instruction=instruction, *args, **kwargs
+            embeddings = super().encode(  # type: ignore[safe-super]
+                _inputs,  # type: ignore[arg-type]
+                instruction=instruction,
+                *args,
+                **kwargs,
             )
             if isinstance(embeddings, torch.Tensor):
                 # sometimes in kwargs can be return_tensors=True
@@ -141,7 +146,7 @@ class InstructSentenceTransformerModel(AbsEncoder):
         )
 
         self.instruction_template = instruction_template
-        tokenizer_params = {}
+        tokenizer_params: dict[str, Any] = {}
         if add_eos_token:
             tokenizer_params["add_eos_token"] = add_eos_token
         if max_seq_length is not None:
@@ -193,6 +198,7 @@ class InstructSentenceTransformerModel(AbsEncoder):
             The encoded input in a numpy array or torch tensor of the shape (Number of sentences) x (Embedding dimension).
         """
         sentences = [text for batch in inputs for text in batch["text"]]
+        instruction: str | None
         instruction = self.get_task_instruction(task_metadata, prompt_type)
 
         # to passage prompts won't be applied to passages
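The instruction template callable is now typed to receive the prompt type as well, and the prompt type may be None. A sketch of a template matching the widened signature; the PromptType import location and member value are assumptions, not shown in the diff:

from mteb.types import PromptType  # assumed import location

def my_instruction_template(instruction: str, prompt_type: PromptType | None) -> str:
    # Documents are often embedded without an instruction prefix.
    if prompt_type is not None and prompt_type.value == "document":
        return ""
    return f"Instruct: {instruction}\nQuery: "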
mteb/models/model_implementations/andersborges.py

@@ -4,7 +4,7 @@ from mteb.models.model_implementations.model2vec_models import Model2VecModel
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 
 model2vecdk = ModelMeta(
-    loader=Model2VecModel,  # type: ignore
+    loader=Model2VecModel,
     name="andersborges/model2vecdk",
     model_type=["dense"],
     languages=["dan-Latn"],
@@ -35,7 +35,7 @@ model2vecdk = ModelMeta(
 
 
 model2vecdk_stem = ModelMeta(
-    loader=Model2VecModel,  # type: ignore
+    loader=Model2VecModel,
     name="andersborges/model2vecdk-stem",
     model_type=["dense"],
     languages=["dan-Latn"],
mteb/models/model_implementations/blip_models.py

@@ -128,7 +128,7 @@ class BLIPModel(AbsEncoder):
 
 # in descending order of usage (downloads from huggingface)
 blip_image_captioning_large = ModelMeta(
-    loader=BLIPModel,  # type: ignore
+    loader=BLIPModel,
     name="Salesforce/blip-image-captioning-large",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -156,7 +156,7 @@ blip_image_captioning_large = ModelMeta(
 )
 
 blip_image_captioning_base = ModelMeta(
-    loader=BLIPModel,  # type: ignore
+    loader=BLIPModel,
     name="Salesforce/blip-image-captioning-base",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -185,7 +185,7 @@ blip_image_captioning_base = ModelMeta(
 
 
 blip_vqa_base = ModelMeta(
-    loader=BLIPModel,  # type: ignore
+    loader=BLIPModel,
     name="Salesforce/blip-vqa-base",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -212,7 +212,7 @@ blip_vqa_base = ModelMeta(
 )
 
 blip_vqa_capfilt_large = ModelMeta(
-    loader=BLIPModel,  # type: ignore
+    loader=BLIPModel,
     name="Salesforce/blip-vqa-capfilt-large",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -239,7 +239,7 @@ blip_vqa_capfilt_large = ModelMeta(
 )
 
 blip_itm_base_coco = ModelMeta(
-    loader=BLIPModel,  # type: ignore
+    loader=BLIPModel,
     name="Salesforce/blip-itm-base-coco",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -266,7 +266,7 @@ blip_itm_base_coco = ModelMeta(
 )
 
 blip_itm_large_coco = ModelMeta(
-    loader=BLIPModel,  # type: ignore
+    loader=BLIPModel,
     name="Salesforce/blip-itm-large-coco",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -294,7 +294,7 @@ blip_itm_large_coco = ModelMeta(
 )
 
 blip_itm_base_flickr = ModelMeta(
-    loader=BLIPModel,  # type: ignore
+    loader=BLIPModel,
     name="Salesforce/blip-itm-base-flickr",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -322,7 +322,7 @@ blip_itm_base_flickr = ModelMeta(
 )
 
 blip_itm_large_flickr = ModelMeta(
-    loader=BLIPModel,  # type: ignore
+    loader=BLIPModel,
     name="Salesforce/blip-itm-large-flickr",
     model_type=["dense"],
     languages=["eng-Latn"],
mteb/models/model_implementations/bm25.py

@@ -113,7 +113,7 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
 
         def encode(self, texts: list[str]):
             """Encode input text as term vectors"""
-            return bm25s.tokenize(texts, stopwords=self.stopwords, stemmer=self.stemmer)  # type: ignore
+            return bm25s.tokenize(texts, stopwords=self.stopwords, stemmer=self.stemmer)
 
     return BM25Search(**kwargs)
 
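The encode method delegates to bm25s.tokenize, which applies stopword removal and stemming before indexing. A standalone sketch of that tokenize-and-retrieve flow, following the bm25s README; the corpus and query are illustrative:

import bm25s
import Stemmer  # PyStemmer, the optional stemmer bm25s accepts

corpus = ["a cat sat on the mat", "dogs chase cats"]
stemmer = Stemmer.Stemmer("english")

corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)
retriever = bm25s.BM25()
retriever.index(corpus_tokens)

query_tokens = bm25s.tokenize("cat on a mat", stopwords="en", stemmer=stemmer)
results, scores = retriever.retrieve(query_tokens, k=2)  # document ids and BM25 scores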
 
mteb/models/model_implementations/clip_models.py

@@ -115,7 +115,7 @@ CLIP_CITATION = """
 
 
 clip_vit_large_patch14 = ModelMeta(
-    loader=CLIPModel,  # type: ignore
+    loader=CLIPModel,
     name="openai/clip-vit-large-patch14",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -139,7 +139,7 @@ clip_vit_large_patch14 = ModelMeta(
 )
 
 clip_vit_base_patch32 = ModelMeta(
-    loader=CLIPModel,  # type: ignore
+    loader=CLIPModel,
     name="openai/clip-vit-base-patch32",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -163,7 +163,7 @@ clip_vit_base_patch32 = ModelMeta(
 )
 
 clip_vit_base_patch16 = ModelMeta(
-    loader=CLIPModel,  # type: ignore
+    loader=CLIPModel,
     name="openai/clip-vit-base-patch16",
     model_type=["dense"],
     languages=["eng-Latn"],
mteb/models/model_implementations/cohere_models.py

@@ -222,7 +222,7 @@ class CohereTextEmbeddingModel(AbsEncoder):
     ) -> None:
         requires_package(self, "cohere", model_name, "pip install 'mteb[cohere]'")
 
-        import cohere  # type: ignore
+        import cohere
 
         self.model_name = model_name.removeprefix("Cohere/Cohere-")
         self.sep = sep
mteb/models/model_implementations/cohere_v.py

@@ -378,7 +378,7 @@ def cohere_v_loader(model_name, **kwargs):
 
 
 cohere_mult_3 = ModelMeta(
-    loader=cohere_v_loader,  # type: ignore
+    loader=cohere_v_loader,
     loader_kwargs={"model_name": "embed-multilingual-v3.0"},
     name="cohere/embed-multilingual-v3.0",
     model_type=["dense"],
@@ -402,7 +402,7 @@ cohere_mult_3 = ModelMeta(
 )
 
 cohere_eng_3 = ModelMeta(
-    loader=cohere_v_loader,  # type: ignore
+    loader=cohere_v_loader,
     loader_kwargs={"model_name": "embed-english-v3.0"},
     name="cohere/embed-english-v3.0",
     model_type=["dense"],
mteb/models/model_implementations/dino_models.py

@@ -104,7 +104,7 @@ dinov2_training_datasets = set(
 
 
 dinov2_small = ModelMeta(
-    loader=DINOModel,  # type: ignore
+    loader=DINOModel,
     name="facebook/dinov2-small",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -125,7 +125,7 @@ dinov2_small = ModelMeta(
     use_instructions=False,
     training_datasets=dinov2_training_datasets,
     citation="""@misc{oquab2023dinov2,
-    title={DINOv2: Learning Robust Visual Features without Supervision},
+    title={DINOv2: Learning Robust Visual Features without Supervision},
     author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski},
     year={2023},
     eprint={2304.07193},
@@ -135,7 +135,7 @@ dinov2_small = ModelMeta(
 )
 
 dinov2_base = ModelMeta(
-    loader=DINOModel,  # type: ignore
+    loader=DINOModel,
     name="facebook/dinov2-base",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -156,7 +156,7 @@ dinov2_base = ModelMeta(
     use_instructions=False,
     training_datasets=dinov2_training_datasets,
     citation="""@misc{oquab2023dinov2,
-    title={DINOv2: Learning Robust Visual Features without Supervision},
+    title={DINOv2: Learning Robust Visual Features without Supervision},
     author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski},
     year={2023},
     eprint={2304.07193},
@@ -166,7 +166,7 @@ dinov2_base = ModelMeta(
 )
 
 dinov2_large = ModelMeta(
-    loader=DINOModel,  # type: ignore
+    loader=DINOModel,
     name="facebook/dinov2-large",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -187,7 +187,7 @@ dinov2_large = ModelMeta(
     use_instructions=False,
     training_datasets=dinov2_training_datasets,
     citation="""@misc{oquab2023dinov2,
-    title={DINOv2: Learning Robust Visual Features without Supervision},
+    title={DINOv2: Learning Robust Visual Features without Supervision},
     author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski},
     year={2023},
     eprint={2304.07193},
@@ -197,7 +197,7 @@ dinov2_large = ModelMeta(
 )
 
 dinov2_giant = ModelMeta(
-    loader=DINOModel,  # type: ignore
+    loader=DINOModel,
     name="facebook/dinov2-giant",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -218,7 +218,7 @@ dinov2_giant = ModelMeta(
     use_instructions=False,
     training_datasets=dinov2_training_datasets,
     citation="""@misc{oquab2023dinov2,
-    title={DINOv2: Learning Robust Visual Features without Supervision},
+    title={DINOv2: Learning Robust Visual Features without Supervision},
     author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski},
     year={2023},
     eprint={2304.07193},
@@ -253,7 +253,7 @@ webssl_dino300m_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -284,7 +284,7 @@ webssl_dino1b_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -315,7 +315,7 @@ webssl_dino2b_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -346,7 +346,7 @@ webssl_dino3b_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -377,7 +377,7 @@ webssl_dino5b_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -408,7 +408,7 @@ webssl_dino7b_full8b_224 = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -439,7 +439,7 @@ webssl_dino7b_full8b_378 = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -470,7 +470,7 @@ webssl_dino7b_full8b_518 = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -502,7 +502,7 @@ webssl_dino2b_light2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -533,7 +533,7 @@ webssl_dino2b_heavy2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -564,7 +564,7 @@ webssl_dino3b_light2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -595,7 +595,7 @@ webssl_dino3b_heavy2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -626,7 +626,7 @@ webssl_mae300m_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -657,7 +657,7 @@ webssl_mae700m_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
@@ -688,7 +688,7 @@ webssl_mae1b_full2b = ModelMeta(
     use_instructions=False,
     training_datasets=webssl_dino_training_datasets,
     citation="""@article{fan2025scaling,
-    title={Scaling Language-Free Visual Representation Learning},
+    title={Scaling Language-Free Visual Representation Learning},
     author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
     year={2025},
     eprint={2504.01017},
mteb/models/model_implementations/emillykkejensen_models.py

@@ -2,7 +2,7 @@ from mteb.models.model_meta import ModelMeta
 from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
 
 embedding_gemma_300m_scandi = ModelMeta(
-    loader=sentence_transformers_loader,  # type: ignore
+    loader=sentence_transformers_loader,
     name="emillykkejensen/EmbeddingGemma-Scandi-300m",
     model_type=["dense"],
     languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"],
@@ -35,7 +35,7 @@ embedding_gemma_300m_scandi = ModelMeta(
 
 
 qwen_scandi = ModelMeta(
-    loader=sentence_transformers_loader,  # type: ignore
+    loader=sentence_transformers_loader,
     name="emillykkejensen/Qwen3-Embedding-Scandi-0.6B",
     model_type=["dense"],
     languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"],
@@ -59,7 +59,7 @@ qwen_scandi = ModelMeta(
 
 
 mmbert_scandi = ModelMeta(
-    loader=sentence_transformers_loader,  # type: ignore
+    loader=sentence_transformers_loader,
     name="emillykkejensen/mmBERTscandi-base-embedding",
     model_type=["dense"],
     languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"],
mteb/models/model_implementations/gme_v_models.py

@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import logging
 import math
+import warnings
 from typing import TYPE_CHECKING, Any
 
 import torch
@@ -261,9 +262,9 @@
     w_bar = ceil_by_factor(width * beta, factor)
 
     if max(h_bar, w_bar) / min(h_bar, w_bar) > MAX_RATIO:
-        logger.warning(
-            f"Absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(h_bar, w_bar) / min(h_bar, w_bar)}"
-        )
+        msg = f"Absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(h_bar, w_bar) / min(h_bar, w_bar)}"
+        logger.warning(msg)
+        warnings.warn(msg)
     if h_bar > w_bar:
         h_bar = w_bar * MAX_RATIO
     else:
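As in numpy_cache.py above, this release duplicates the message onto both the logging and warnings channels. A sketch of the recurring pattern; the helper name is ours, not the library's:

import logging
import warnings

logger = logging.getLogger(__name__)

def _warn_both(msg: str) -> None:
    logger.warning(msg)  # always reaches configured log handlers
    warnings.warn(msg)  # also surfaces to interactive users; warning filters
                        # can deduplicate it or escalate it to an error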
mteb/models/model_implementations/jina_clip.py

@@ -121,7 +121,7 @@ class JinaCLIPModel(AbsEncoder):
 
 
 jina_clip_v1 = ModelMeta(
-    loader=JinaCLIPModel,  # type: ignore
+    loader=JinaCLIPModel,
     name="jinaai/jina-clip-v1",
     model_type=["dense"],
     languages=["eng-Latn"],
mteb/models/model_implementations/jina_models.py

@@ -795,7 +795,7 @@ jina_embeddings_v4 = ModelMeta(
 
 
 jina_embeddings_v3 = ModelMeta(
-    loader=JinaWrapper,  # type: ignore
+    loader=JinaWrapper,
     loader_kwargs=dict(
         trust_remote_code=True,
         model_prompts={