mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +17 -18
- mteb/_evaluators/any_sts_evaluator.py +3 -3
- mteb/_evaluators/clustering_evaluator.py +2 -2
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
- mteb/_evaluators/pair_classification_evaluator.py +5 -3
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +11 -10
- mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
- mteb/_evaluators/text/summarization_evaluator.py +23 -18
- mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -10
- mteb/abstasks/clustering.py +19 -15
- mteb/abstasks/clustering_legacy.py +10 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +23 -19
- mteb/abstasks/pair_classification.py +20 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +8 -5
- mteb/abstasks/task_metadata.py +31 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/benchmark.py +4 -2
- mteb/benchmarks/benchmarks/__init__.py +4 -0
- mteb/benchmarks/benchmarks/benchmarks.py +112 -11
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +182 -29
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +63 -49
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +44 -33
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +162 -34
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +10 -6
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +21 -3
- mteb/models/instruct_wrapper.py +28 -8
- mteb/models/model_implementations/align_models.py +1 -1
- mteb/models/model_implementations/andersborges.py +4 -4
- mteb/models/model_implementations/ara_models.py +1 -1
- mteb/models/model_implementations/arctic_models.py +8 -8
- mteb/models/model_implementations/b1ade_models.py +1 -1
- mteb/models/model_implementations/bge_models.py +45 -21
- mteb/models/model_implementations/bica_model.py +3 -3
- mteb/models/model_implementations/blip2_models.py +2 -2
- mteb/models/model_implementations/blip_models.py +16 -16
- mteb/models/model_implementations/bm25.py +4 -4
- mteb/models/model_implementations/bmretriever_models.py +6 -4
- mteb/models/model_implementations/cadet_models.py +1 -1
- mteb/models/model_implementations/cde_models.py +11 -4
- mteb/models/model_implementations/clip_models.py +6 -6
- mteb/models/model_implementations/clips_models.py +3 -3
- mteb/models/model_implementations/codefuse_models.py +5 -5
- mteb/models/model_implementations/codesage_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +5 -5
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/colpali_models.py +3 -3
- mteb/models/model_implementations/colqwen_models.py +8 -8
- mteb/models/model_implementations/colsmol_models.py +2 -2
- mteb/models/model_implementations/conan_models.py +1 -1
- mteb/models/model_implementations/dino_models.py +42 -42
- mteb/models/model_implementations/e5_instruct.py +23 -4
- mteb/models/model_implementations/e5_models.py +9 -9
- mteb/models/model_implementations/e5_v.py +6 -6
- mteb/models/model_implementations/eagerworks_models.py +1 -1
- mteb/models/model_implementations/emillykkejensen_models.py +6 -6
- mteb/models/model_implementations/en_code_retriever.py +1 -1
- mteb/models/model_implementations/euler_models.py +2 -2
- mteb/models/model_implementations/fa_models.py +9 -9
- mteb/models/model_implementations/facebookai.py +14 -2
- mteb/models/model_implementations/geogpt_models.py +1 -1
- mteb/models/model_implementations/gme_v_models.py +6 -5
- mteb/models/model_implementations/google_models.py +1 -1
- mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
- mteb/models/model_implementations/gritlm_models.py +2 -2
- mteb/models/model_implementations/gte_models.py +25 -13
- mteb/models/model_implementations/hinvec_models.py +1 -1
- mteb/models/model_implementations/ibm_granite_models.py +30 -6
- mteb/models/model_implementations/inf_models.py +2 -2
- mteb/models/model_implementations/jasper_models.py +2 -2
- mteb/models/model_implementations/jina_clip.py +48 -10
- mteb/models/model_implementations/jina_models.py +18 -11
- mteb/models/model_implementations/kblab.py +12 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
- mteb/models/model_implementations/kfst.py +1 -1
- mteb/models/model_implementations/kowshik24_models.py +1 -1
- mteb/models/model_implementations/lgai_embedding_models.py +1 -1
- mteb/models/model_implementations/linq_models.py +1 -1
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +6 -6
- mteb/models/model_implementations/llm2vec_models.py +8 -8
- mteb/models/model_implementations/mcinext_models.py +4 -1
- mteb/models/model_implementations/mdbr_models.py +17 -3
- mteb/models/model_implementations/misc_models.py +68 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +1 -1
- mteb/models/model_implementations/moco_models.py +4 -4
- mteb/models/model_implementations/mod_models.py +1 -1
- mteb/models/model_implementations/model2vec_models.py +14 -14
- mteb/models/model_implementations/moka_models.py +1 -1
- mteb/models/model_implementations/nbailab.py +3 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
- mteb/models/model_implementations/nomic_models.py +30 -15
- mteb/models/model_implementations/nomic_models_vision.py +1 -1
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
- mteb/models/model_implementations/nvidia_models.py +151 -19
- mteb/models/model_implementations/octen_models.py +61 -2
- mteb/models/model_implementations/openclip_models.py +13 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
- mteb/models/model_implementations/ops_moa_models.py +1 -1
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +1 -1
- mteb/models/model_implementations/piccolo_models.py +1 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +4 -4
- mteb/models/model_implementations/pylate_models.py +10 -9
- mteb/models/model_implementations/qodo_models.py +2 -2
- mteb/models/model_implementations/qtack_models.py +1 -1
- mteb/models/model_implementations/qwen3_models.py +3 -3
- mteb/models/model_implementations/qzhou_models.py +2 -2
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +2 -2
- mteb/models/model_implementations/reasonir_model.py +1 -1
- mteb/models/model_implementations/repllama_models.py +3 -3
- mteb/models/model_implementations/rerankers_custom.py +12 -6
- mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
- mteb/models/model_implementations/richinfoai_models.py +1 -1
- mteb/models/model_implementations/ru_sentence_models.py +20 -20
- mteb/models/model_implementations/ruri_models.py +10 -10
- mteb/models/model_implementations/salesforce_models.py +3 -3
- mteb/models/model_implementations/samilpwc_models.py +1 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
- mteb/models/model_implementations/searchmap_models.py +1 -1
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
- mteb/models/model_implementations/sentence_transformers_models.py +124 -22
- mteb/models/model_implementations/shuu_model.py +1 -1
- mteb/models/model_implementations/siglip_models.py +20 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
- mteb/models/model_implementations/stella_models.py +17 -4
- mteb/models/model_implementations/tarka_models.py +2 -2
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +1 -1
- mteb/models/model_implementations/uae_models.py +7 -1
- mteb/models/model_implementations/vdr_models.py +1 -1
- mteb/models/model_implementations/vi_vn_models.py +6 -6
- mteb/models/model_implementations/vlm2vec_models.py +3 -3
- mteb/models/model_implementations/voyage_models.py +84 -0
- mteb/models/model_implementations/voyage_v.py +9 -7
- mteb/models/model_implementations/youtu_models.py +1 -1
- mteb/models/model_implementations/yuan_models.py +1 -1
- mteb/models/model_implementations/yuan_models_en.py +1 -1
- mteb/models/model_meta.py +80 -31
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
- mteb/models/search_wrappers.py +33 -18
- mteb/models/sentence_transformer_wrapper.py +50 -25
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +29 -21
- mteb/results/model_result.py +52 -22
- mteb/results/task_result.py +80 -58
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +12 -0
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
|
@@ -25,6 +25,9 @@ VOYAGE_DTYPE_TRANSLATION = {
|
|
|
25
25
|
|
|
26
26
|
# Total token limits per model based on VoyageAI documentation
|
|
27
27
|
VOYAGE_TOTAL_TOKEN_LIMITS = {
|
|
28
|
+
"voyage-4-large": 120_000,
|
|
29
|
+
"voyage-4": 320_000,
|
|
30
|
+
"voyage-4-lite": 1_000_000,
|
|
28
31
|
"voyage-3.5-lite": 1_000_000,
|
|
29
32
|
"voyage-3.5": 320_000,
|
|
30
33
|
"voyage-2": 320_000,
|
|
@@ -206,6 +209,84 @@ model_prompts = {
|
|
|
206
209
|
PromptType.document.value: "document",
|
|
207
210
|
}
|
|
208
211
|
|
|
212
|
+
voyage_4 = ModelMeta(
|
|
213
|
+
name="voyageai/voyage-4",
|
|
214
|
+
model_type=["dense"],
|
|
215
|
+
revision="1",
|
|
216
|
+
release_date="2026-01-15",
|
|
217
|
+
languages=None, # supported languages not specified
|
|
218
|
+
loader=VoyageModel,
|
|
219
|
+
loader_kwargs=dict(
|
|
220
|
+
max_tokens=32000,
|
|
221
|
+
model_prompts=model_prompts,
|
|
222
|
+
),
|
|
223
|
+
max_tokens=32000,
|
|
224
|
+
embed_dim=1024,
|
|
225
|
+
open_weights=False,
|
|
226
|
+
n_parameters=None,
|
|
227
|
+
memory_usage_mb=None,
|
|
228
|
+
license=None,
|
|
229
|
+
reference="https://blog.voyageai.com/2026/01/15/voyage-4/",
|
|
230
|
+
similarity_fn_name="cosine",
|
|
231
|
+
framework=["API"],
|
|
232
|
+
use_instructions=True,
|
|
233
|
+
training_datasets=VOYAGE_TRAINING_DATA,
|
|
234
|
+
public_training_code=None,
|
|
235
|
+
public_training_data=None,
|
|
236
|
+
)
|
|
237
|
+
|
|
238
|
+
voyage_4_lite = ModelMeta(
|
|
239
|
+
name="voyageai/voyage-4-lite",
|
|
240
|
+
model_type=["dense"],
|
|
241
|
+
revision="1",
|
|
242
|
+
release_date="2026-01-15",
|
|
243
|
+
languages=None, # supported languages not specified
|
|
244
|
+
loader=VoyageModel,
|
|
245
|
+
loader_kwargs=dict(
|
|
246
|
+
max_tokens=32000,
|
|
247
|
+
model_prompts=model_prompts,
|
|
248
|
+
),
|
|
249
|
+
max_tokens=32000,
|
|
250
|
+
embed_dim=1024,
|
|
251
|
+
open_weights=False,
|
|
252
|
+
n_parameters=None,
|
|
253
|
+
memory_usage_mb=None,
|
|
254
|
+
license=None,
|
|
255
|
+
reference="https://blog.voyageai.com/2026/01/15/voyage-4/",
|
|
256
|
+
similarity_fn_name="cosine",
|
|
257
|
+
framework=["API"],
|
|
258
|
+
use_instructions=True,
|
|
259
|
+
training_datasets=VOYAGE_TRAINING_DATA,
|
|
260
|
+
public_training_code=None,
|
|
261
|
+
public_training_data=None,
|
|
262
|
+
)
|
|
263
|
+
|
|
264
|
+
voyage_4_large = ModelMeta(
|
|
265
|
+
name="voyageai/voyage-4-large",
|
|
266
|
+
model_type=["dense"],
|
|
267
|
+
revision="1",
|
|
268
|
+
release_date="2026-01-15",
|
|
269
|
+
languages=None, # supported languages not specified
|
|
270
|
+
loader=VoyageModel,
|
|
271
|
+
loader_kwargs=dict(
|
|
272
|
+
max_tokens=32000,
|
|
273
|
+
model_prompts=model_prompts,
|
|
274
|
+
),
|
|
275
|
+
max_tokens=32000,
|
|
276
|
+
embed_dim=1024,
|
|
277
|
+
open_weights=False,
|
|
278
|
+
n_parameters=None,
|
|
279
|
+
memory_usage_mb=None,
|
|
280
|
+
license=None,
|
|
281
|
+
reference="https://blog.voyageai.com/2026/01/15/voyage-4/",
|
|
282
|
+
similarity_fn_name="cosine",
|
|
283
|
+
framework=["API"],
|
|
284
|
+
use_instructions=True,
|
|
285
|
+
training_datasets=VOYAGE_TRAINING_DATA,
|
|
286
|
+
public_training_code=None,
|
|
287
|
+
public_training_data=None,
|
|
288
|
+
)
|
|
289
|
+
|
|
209
290
|
voyage_3_large = ModelMeta(
|
|
210
291
|
name="voyageai/voyage-3-large", # Date of publication of this post https://blog.voyageai.com/2025/01/07/voyage-3-large/
|
|
211
292
|
model_type=["dense"],
|
|
@@ -230,6 +311,7 @@ voyage_3_large = ModelMeta(
|
|
|
230
311
|
training_datasets=VOYAGE_TRAINING_DATA,
|
|
231
312
|
public_training_code=None,
|
|
232
313
|
public_training_data=None,
|
|
314
|
+
superseded_by="voyageai/voyage-4-large",
|
|
233
315
|
)
|
|
234
316
|
|
|
235
317
|
|
|
@@ -257,6 +339,7 @@ voyage_3_5 = ModelMeta(
|
|
|
257
339
|
training_datasets=VOYAGE_TRAINING_DATA,
|
|
258
340
|
public_training_code=None,
|
|
259
341
|
public_training_data=None,
|
|
342
|
+
superseded_by="voyageai/voyage-4",
|
|
260
343
|
)
|
|
261
344
|
|
|
262
345
|
voyage_3_5_int8 = ModelMeta(
|
|
@@ -571,6 +654,7 @@ voyage_3_lite = ModelMeta(
|
|
|
571
654
|
training_datasets=VOYAGE_TRAINING_DATA,
|
|
572
655
|
public_training_code=None,
|
|
573
656
|
public_training_data=None,
|
|
657
|
+
superseded_by="voyageai/voyage-4-lite",
|
|
574
658
|
)
|
|
575
659
|
|
|
576
660
|
voyage_3_exp = ModelMeta(
|
|
@@ -16,6 +16,8 @@ from mteb.types import Array, BatchedInput, PromptType
|
|
|
16
16
|
if TYPE_CHECKING:
|
|
17
17
|
from PIL import Image
|
|
18
18
|
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
19
21
|
|
|
20
22
|
def _downsample_image(
|
|
21
23
|
image: Image.Image, max_pixels: int = 16000000, target_longest_side: int = 4000
|
|
@@ -37,18 +39,18 @@ def _downsample_image(
|
|
|
37
39
|
new_width = int(width * (target_longest_side / height))
|
|
38
40
|
|
|
39
41
|
new_size = (new_width, new_height)
|
|
40
|
-
|
|
42
|
+
logger.info(
|
|
41
43
|
f"Downsampling image from {width}x{height} to {new_width}x{new_height}"
|
|
42
44
|
)
|
|
43
|
-
return image.resize(new_size, Image.LANCZOS)
|
|
45
|
+
return image.resize(new_size, Image.LANCZOS)
|
|
44
46
|
if width > height:
|
|
45
47
|
if width > 10000:
|
|
46
|
-
|
|
47
|
-
return image.resize((10000, height), Image.LANCZOS)
|
|
48
|
+
logger.error("Processing extremely wide images.")
|
|
49
|
+
return image.resize((10000, height), Image.LANCZOS)
|
|
48
50
|
else:
|
|
49
51
|
if height > 10000:
|
|
50
|
-
|
|
51
|
-
return image.resize((width, 10000), Image.LANCZOS)
|
|
52
|
+
logger.error("Processing extremely high images.")
|
|
53
|
+
return image.resize((width, 10000), Image.LANCZOS)
|
|
52
54
|
return image
|
|
53
55
|
|
|
54
56
|
|
|
@@ -202,7 +204,7 @@ def voyage_v_loader(model_name, **kwargs):
|
|
|
202
204
|
|
|
203
205
|
|
|
204
206
|
voyage_v = ModelMeta(
|
|
205
|
-
loader=voyage_v_loader,
|
|
207
|
+
loader=voyage_v_loader,
|
|
206
208
|
name="voyageai/voyage-multimodal-3",
|
|
207
209
|
model_type=["dense"],
|
|
208
210
|
languages=[], # Unknown
|
|
@@ -127,7 +127,7 @@ Youtu_Embedding_V1 = ModelMeta(
|
|
|
127
127
|
max_tokens=8192,
|
|
128
128
|
reference="https://huggingface.co/tencent/Youtu-Embedding",
|
|
129
129
|
similarity_fn_name="cosine",
|
|
130
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
130
|
+
framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
|
|
131
131
|
use_instructions=True,
|
|
132
132
|
public_training_code=None,
|
|
133
133
|
public_training_data=None,
|
|
@@ -26,7 +26,7 @@ yuan_embedding_2_zh = ModelMeta(
|
|
|
26
26
|
max_tokens=512,
|
|
27
27
|
reference="https://huggingface.co/IEITYuan/Yuan-embedding-2.0-zh",
|
|
28
28
|
similarity_fn_name="cosine",
|
|
29
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
29
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors"],
|
|
30
30
|
use_instructions=False,
|
|
31
31
|
public_training_code=None,
|
|
32
32
|
public_training_data=None,
|
|
@@ -49,7 +49,7 @@ yuan_embedding_2_en = ModelMeta(
|
|
|
49
49
|
license="apache-2.0",
|
|
50
50
|
reference="https://huggingface.co/IEITYuan/Yuan-embedding-2.0-en",
|
|
51
51
|
similarity_fn_name="cosine",
|
|
52
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
52
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors"],
|
|
53
53
|
use_instructions=True,
|
|
54
54
|
public_training_code=None,
|
|
55
55
|
public_training_data=None,
|
mteb/models/model_meta.py
CHANGED
|
@@ -17,11 +17,13 @@ from huggingface_hub import (
|
|
|
17
17
|
get_safetensors_metadata,
|
|
18
18
|
hf_hub_download,
|
|
19
19
|
list_repo_commits,
|
|
20
|
+
model_info,
|
|
20
21
|
repo_exists,
|
|
21
22
|
)
|
|
22
23
|
from huggingface_hub.errors import (
|
|
23
24
|
EntryNotFoundError,
|
|
24
25
|
GatedRepoError,
|
|
26
|
+
HFValidationError,
|
|
25
27
|
NotASafetensorsRepoError,
|
|
26
28
|
RepositoryNotFoundError,
|
|
27
29
|
SafetensorsParsingError,
|
|
@@ -55,6 +57,10 @@ FRAMEWORKS = Literal[
|
|
|
55
57
|
"PyLate",
|
|
56
58
|
"ColBERT",
|
|
57
59
|
"ColPali",
|
|
60
|
+
"GGUF",
|
|
61
|
+
"safetensors",
|
|
62
|
+
"ONNX",
|
|
63
|
+
"Transformers",
|
|
58
64
|
]
|
|
59
65
|
|
|
60
66
|
MODEL_TYPES = Literal["dense", "cross-encoder", "late-interaction"]
|
|
@@ -81,9 +87,6 @@ def _get_loader_name(
|
|
|
81
87
|
return loader.__name__
|
|
82
88
|
|
|
83
89
|
|
|
84
|
-
_SENTENCE_TRANSFORMER_LIB_NAME = "Sentence Transformers"
|
|
85
|
-
|
|
86
|
-
|
|
87
90
|
class ModelMeta(BaseModel):
|
|
88
91
|
"""The model metadata object.
|
|
89
92
|
|
|
@@ -250,7 +253,7 @@ class ModelMeta(BaseModel):
|
|
|
250
253
|
)
|
|
251
254
|
return v
|
|
252
255
|
|
|
253
|
-
def load_model(self, **kwargs: Any) -> MTEBModels:
|
|
256
|
+
def load_model(self, device: str | None = None, **kwargs: Any) -> MTEBModels:
|
|
254
257
|
"""Loads the model using the specified loader function."""
|
|
255
258
|
if self.loader is None:
|
|
256
259
|
raise NotImplementedError(
|
|
@@ -262,11 +265,11 @@ class ModelMeta(BaseModel):
|
|
|
262
265
|
# Allow overwrites
|
|
263
266
|
_kwargs = self.loader_kwargs.copy()
|
|
264
267
|
_kwargs.update(kwargs)
|
|
268
|
+
if device is not None:
|
|
269
|
+
_kwargs["device"] = device
|
|
265
270
|
|
|
266
|
-
model:
|
|
267
|
-
|
|
268
|
-
)
|
|
269
|
-
model.mteb_model_meta = self # type: ignore
|
|
271
|
+
model: MTEBModels = self.loader(self.name, revision=self.revision, **_kwargs)
|
|
272
|
+
model.mteb_model_meta = self # type: ignore[misc]
|
|
270
273
|
return model
|
|
271
274
|
|
|
272
275
|
def model_name_as_path(self) -> str:
|
|
@@ -307,7 +310,7 @@ class ModelMeta(BaseModel):
|
|
|
307
310
|
embedding_dim = None
|
|
308
311
|
max_tokens = None
|
|
309
312
|
|
|
310
|
-
if model_name and compute_metadata and
|
|
313
|
+
if model_name and compute_metadata and _repo_exists(model_name):
|
|
311
314
|
reference = "https://huggingface.co/" + model_name
|
|
312
315
|
card = ModelCard.load(model_name)
|
|
313
316
|
card_data: ModelCardData = card.data
|
|
@@ -318,22 +321,17 @@ class ModelMeta(BaseModel):
|
|
|
318
321
|
model_config = None
|
|
319
322
|
logger.warning(f"Can't get configuration for {model_name}. Error: {e}")
|
|
320
323
|
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
)
|
|
325
|
-
frameworks.append(_SENTENCE_TRANSFORMER_LIB_NAME)
|
|
326
|
-
else:
|
|
327
|
-
msg = "Model library not recognized, defaulting to Sentence Transformers loader."
|
|
328
|
-
logger.warning(msg)
|
|
329
|
-
warnings.warn(msg)
|
|
324
|
+
hf_frameworks = (
|
|
325
|
+
cls._get_frameworks_from_hf_tags(model_name) if model_name else []
|
|
326
|
+
)
|
|
327
|
+
frameworks.extend(hf_frameworks)
|
|
330
328
|
|
|
331
329
|
if revision is None:
|
|
332
330
|
revisions = _get_repo_commits(model_name, "model")
|
|
333
331
|
revision = revisions[0].commit_id if revisions else None
|
|
334
332
|
|
|
335
333
|
release_date = cls.fetch_release_date(model_name)
|
|
336
|
-
model_license = card_data.license
|
|
334
|
+
model_license = card_data.license if card_data.license != "other" else None
|
|
337
335
|
n_parameters = cls._calculate_num_parameters_from_hub(model_name)
|
|
338
336
|
memory_usage_mb = cls._calculate_memory_usage_mb(model_name, n_parameters)
|
|
339
337
|
if model_config and hasattr(model_config, "hidden_size"):
|
|
@@ -386,8 +384,6 @@ class ModelMeta(BaseModel):
|
|
|
386
384
|
else model.model_card_data.base_model
|
|
387
385
|
)
|
|
388
386
|
meta = cls._from_hub(name, revision, compute_metadata)
|
|
389
|
-
if _SENTENCE_TRANSFORMER_LIB_NAME not in meta.framework:
|
|
390
|
-
meta.framework.append("Sentence Transformers")
|
|
391
387
|
meta.revision = model.model_card_data.base_model_revision or meta.revision
|
|
392
388
|
meta.max_tokens = model.max_seq_length
|
|
393
389
|
meta.embed_dim = model.get_sentence_embedding_dimension()
|
|
@@ -413,11 +409,9 @@ class ModelMeta(BaseModel):
|
|
|
413
409
|
The generated ModelMeta.
|
|
414
410
|
"""
|
|
415
411
|
meta = cls._from_hub(model, revision, compute_metadata)
|
|
416
|
-
if _SENTENCE_TRANSFORMER_LIB_NAME not in meta.framework:
|
|
417
|
-
meta.framework.append("Sentence Transformers")
|
|
418
412
|
meta.modalities = ["text"]
|
|
419
413
|
|
|
420
|
-
if model and compute_metadata and
|
|
414
|
+
if model and compute_metadata and _repo_exists(model):
|
|
421
415
|
# have max_seq_length field
|
|
422
416
|
sbert_config = _get_json_from_hub(
|
|
423
417
|
model, "sentence_bert_config.json", "model", revision=revision
|
|
@@ -435,7 +429,7 @@ class ModelMeta(BaseModel):
|
|
|
435
429
|
and config_sbert.get("similarity_fn_name") is not None
|
|
436
430
|
):
|
|
437
431
|
meta.similarity_fn_name = ScoringFunction.from_str(
|
|
438
|
-
config_sbert
|
|
432
|
+
config_sbert["similarity_fn_name"]
|
|
439
433
|
)
|
|
440
434
|
else:
|
|
441
435
|
meta.similarity_fn_name = ScoringFunction.COSINE
|
|
@@ -461,8 +455,6 @@ class ModelMeta(BaseModel):
|
|
|
461
455
|
from mteb.models import CrossEncoderWrapper
|
|
462
456
|
|
|
463
457
|
meta = cls._from_hub(model.model.name_or_path, revision, compute_metadata)
|
|
464
|
-
if _SENTENCE_TRANSFORMER_LIB_NAME not in meta.framework:
|
|
465
|
-
meta.framework.append("Sentence Transformers")
|
|
466
458
|
meta.revision = model.config._commit_hash or meta.revision
|
|
467
459
|
meta.loader = CrossEncoderWrapper
|
|
468
460
|
meta.embed_dim = None
|
|
@@ -511,10 +503,12 @@ class ModelMeta(BaseModel):
|
|
|
511
503
|
if adapted_training_datasets is not None:
|
|
512
504
|
training_datasets |= adapted_training_datasets
|
|
513
505
|
except (ValueError, KeyError) as e:
|
|
514
|
-
|
|
506
|
+
msg = f"Could not get source model: {e} in MTEB"
|
|
507
|
+
logger.warning(msg)
|
|
508
|
+
warnings.warn(msg)
|
|
515
509
|
|
|
516
510
|
return_dataset = training_datasets.copy()
|
|
517
|
-
visited = set()
|
|
511
|
+
visited: set[str] = set()
|
|
518
512
|
|
|
519
513
|
for dataset in training_datasets:
|
|
520
514
|
similar_tasks = _collect_similar_tasks(dataset, visited)
|
|
@@ -548,6 +542,8 @@ class ModelMeta(BaseModel):
|
|
|
548
542
|
|
|
549
543
|
@staticmethod
|
|
550
544
|
def _calculate_num_parameters_from_hub(model_name: str | None = None) -> int | None:
|
|
545
|
+
if not model_name:
|
|
546
|
+
return None
|
|
551
547
|
try:
|
|
552
548
|
safetensors_metadata = get_safetensors_metadata(model_name)
|
|
553
549
|
if len(safetensors_metadata.parameter_count) >= 0:
|
|
@@ -561,7 +557,7 @@ class ModelMeta(BaseModel):
|
|
|
561
557
|
logger.warning(
|
|
562
558
|
f"Can't calculate number of parameters for {model_name}. Got error {e}"
|
|
563
559
|
)
|
|
564
|
-
|
|
560
|
+
return None
|
|
565
561
|
|
|
566
562
|
def calculate_num_parameters_from_hub(self) -> int | None:
|
|
567
563
|
"""Calculates the number of parameters in the model.
|
|
@@ -624,7 +620,7 @@ class ModelMeta(BaseModel):
|
|
|
624
620
|
if "API" in self.framework or self.name is None:
|
|
625
621
|
return None
|
|
626
622
|
|
|
627
|
-
return self._calculate_memory_usage_mb(self.
|
|
623
|
+
return self._calculate_memory_usage_mb(self.name, self.n_parameters)
|
|
628
624
|
|
|
629
625
|
@staticmethod
|
|
630
626
|
def fetch_release_date(model_name: str) -> StrDate | None:
|
|
@@ -640,6 +636,43 @@ class ModelMeta(BaseModel):
|
|
|
640
636
|
return release_date
|
|
641
637
|
return None
|
|
642
638
|
|
|
639
|
+
@staticmethod
|
|
640
|
+
def _get_frameworks_from_hf_tags(model_name: str) -> list[FRAMEWORKS]:
|
|
641
|
+
"""Extract frameworks supported by the model from HuggingFace model tags.
|
|
642
|
+
|
|
643
|
+
Args:
|
|
644
|
+
model_name: HuggingFace model name
|
|
645
|
+
|
|
646
|
+
Returns:
|
|
647
|
+
List of framework names found in tags. Defaults to empty list if no frameworks found.
|
|
648
|
+
"""
|
|
649
|
+
try:
|
|
650
|
+
info = model_info(model_name)
|
|
651
|
+
if not info.tags:
|
|
652
|
+
return []
|
|
653
|
+
except Exception as e:
|
|
654
|
+
logger.warning(
|
|
655
|
+
f"Failed to fetch frameworks from HuggingFace tags for {model_name}: {e}"
|
|
656
|
+
)
|
|
657
|
+
return []
|
|
658
|
+
|
|
659
|
+
# Mapping from HuggingFace tags to MTEB framework names
|
|
660
|
+
tag_to_framework: dict[str, FRAMEWORKS] = {
|
|
661
|
+
"sentence-transformers": "Sentence Transformers",
|
|
662
|
+
"transformers": "Transformers",
|
|
663
|
+
"onnx": "ONNX",
|
|
664
|
+
"safetensors": "safetensors",
|
|
665
|
+
"gguf": "GGUF",
|
|
666
|
+
}
|
|
667
|
+
|
|
668
|
+
frameworks: list[FRAMEWORKS] = []
|
|
669
|
+
|
|
670
|
+
for framework_tag in tag_to_framework.keys():
|
|
671
|
+
if framework_tag in info.tags:
|
|
672
|
+
frameworks.append(tag_to_framework[framework_tag])
|
|
673
|
+
|
|
674
|
+
return frameworks
|
|
675
|
+
|
|
643
676
|
def to_python(self) -> str:
|
|
644
677
|
"""Returns a string representation of the model."""
|
|
645
678
|
return _pydantic_instance_to_code(self)
|
|
@@ -784,3 +817,19 @@ def _get_file_on_hub(
|
|
|
784
817
|
except (GatedRepoError, RepositoryNotFoundError, EntryNotFoundError) as e:
|
|
785
818
|
logger.warning(f"Can't get file {file_name} of {repo_id}: {e}")
|
|
786
819
|
return None
|
|
820
|
+
|
|
821
|
+
|
|
822
|
+
def _repo_exists(repo_id: str, repo_type: str | None = None) -> bool:
|
|
823
|
+
"""Checks if a repository exists on HuggingFace Hub.
|
|
824
|
+
|
|
825
|
+
Repo exists will raise HFValidationError for invalid local paths
|
|
826
|
+
|
|
827
|
+
Args:
|
|
828
|
+
repo_id: The repository ID.
|
|
829
|
+
repo_type: The type of repository (e.g., "model", "dataset", "space").
|
|
830
|
+
"""
|
|
831
|
+
try:
|
|
832
|
+
return repo_exists(repo_id=repo_id, repo_type=repo_type)
|
|
833
|
+
except HFValidationError as e:
|
|
834
|
+
logger.warning(f"Can't check existence of {repo_id}: {e}")
|
|
835
|
+
return False
|
mteb/models/models_protocols.py
CHANGED
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
|
|
2
2
|
|
|
3
3
|
from torch.utils.data import DataLoader
|
|
4
|
+
from typing_extensions import Unpack
|
|
4
5
|
|
|
5
6
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
6
7
|
from mteb.types import (
|
|
7
8
|
Array,
|
|
8
9
|
BatchedInput,
|
|
9
10
|
CorpusDatasetType,
|
|
11
|
+
EncodeKwargs,
|
|
10
12
|
PromptType,
|
|
11
13
|
QueryDatasetType,
|
|
12
14
|
RetrievalOutputType,
|
|
@@ -28,7 +30,7 @@ class SearchProtocol(Protocol):
|
|
|
28
30
|
task_metadata: TaskMetadata,
|
|
29
31
|
hf_split: str,
|
|
30
32
|
hf_subset: str,
|
|
31
|
-
encode_kwargs:
|
|
33
|
+
encode_kwargs: EncodeKwargs,
|
|
32
34
|
) -> None:
|
|
33
35
|
"""Index the corpus for retrieval.
|
|
34
36
|
|
|
@@ -49,7 +51,7 @@ class SearchProtocol(Protocol):
|
|
|
49
51
|
hf_split: str,
|
|
50
52
|
hf_subset: str,
|
|
51
53
|
top_k: int,
|
|
52
|
-
encode_kwargs:
|
|
54
|
+
encode_kwargs: EncodeKwargs,
|
|
53
55
|
top_ranked: TopRankedDocumentsType | None = None,
|
|
54
56
|
) -> RetrievalOutputType:
|
|
55
57
|
"""Search the corpus using the given queries.
|
|
@@ -83,12 +85,19 @@ class EncoderProtocol(Protocol):
|
|
|
83
85
|
In general the interface is kept aligned with sentence-transformers interface. In cases where exceptions occurs these are handled within MTEB.
|
|
84
86
|
"""
|
|
85
87
|
|
|
86
|
-
def __init__(
|
|
88
|
+
def __init__(
|
|
89
|
+
self,
|
|
90
|
+
model_name: str,
|
|
91
|
+
revision: str | None,
|
|
92
|
+
device: str | None = None,
|
|
93
|
+
**kwargs: Any,
|
|
94
|
+
) -> None:
|
|
87
95
|
"""The initialization function for the encoder. Used when calling it from the mteb run CLI.
|
|
88
96
|
|
|
89
97
|
Args:
|
|
90
98
|
model_name: Name of the model
|
|
91
99
|
revision: revision of the model
|
|
100
|
+
device: Device used to load the model
|
|
92
101
|
kwargs: Any additional kwargs
|
|
93
102
|
"""
|
|
94
103
|
...
|
|
@@ -101,7 +110,7 @@ class EncoderProtocol(Protocol):
|
|
|
101
110
|
hf_split: str,
|
|
102
111
|
hf_subset: str,
|
|
103
112
|
prompt_type: PromptType | None = None,
|
|
104
|
-
**kwargs:
|
|
113
|
+
**kwargs: Unpack[EncodeKwargs],
|
|
105
114
|
) -> Array:
|
|
106
115
|
"""Encodes the given sentences using the encoder.
|
|
107
116
|
|
|
@@ -181,12 +190,19 @@ class CrossEncoderProtocol(Protocol):
|
|
|
181
190
|
In general the interface is kept aligned with sentence-transformers interface. In cases where exceptions occurs these are handled within MTEB.
|
|
182
191
|
"""
|
|
183
192
|
|
|
184
|
-
def __init__(
|
|
193
|
+
def __init__(
|
|
194
|
+
self,
|
|
195
|
+
model_name: str,
|
|
196
|
+
revision: str | None,
|
|
197
|
+
device: str | None = None,
|
|
198
|
+
**kwargs: Any,
|
|
199
|
+
) -> None:
|
|
185
200
|
"""The initialization function for the encoder. Used when calling it from the mteb run CLI.
|
|
186
201
|
|
|
187
202
|
Args:
|
|
188
203
|
model_name: Name of the model
|
|
189
204
|
revision: revision of the model
|
|
205
|
+
device: Device used to load the model
|
|
190
206
|
kwargs: Any additional kwargs
|
|
191
207
|
"""
|
|
192
208
|
...
|
|
@@ -200,7 +216,7 @@ class CrossEncoderProtocol(Protocol):
|
|
|
200
216
|
hf_split: str,
|
|
201
217
|
hf_subset: str,
|
|
202
218
|
prompt_type: PromptType | None = None,
|
|
203
|
-
**kwargs:
|
|
219
|
+
**kwargs: Unpack[EncodeKwargs],
|
|
204
220
|
) -> Array:
|
|
205
221
|
"""Predicts relevance scores for pairs of inputs. Note that, unlike the encoder, the cross-encoder can compare across inputs.
|
|
206
222
|
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import warnings
|
|
2
3
|
from collections.abc import Callable
|
|
3
4
|
|
|
4
5
|
import numpy as np
|
|
@@ -108,7 +109,7 @@ class FaissSearchIndex:
|
|
|
108
109
|
ids = ids.tolist()
|
|
109
110
|
|
|
110
111
|
if issubclass(self.index_type, faiss.IndexFlatL2):
|
|
111
|
-
similarities = -np.sqrt(np.maximum(similarities, 0))
|
|
112
|
+
similarities = (-np.sqrt(np.maximum(similarities, 0))).tolist()
|
|
112
113
|
|
|
113
114
|
return similarities, ids
|
|
114
115
|
|
|
@@ -116,8 +117,8 @@ class FaissSearchIndex:
|
|
|
116
117
|
self,
|
|
117
118
|
embeddings: Array,
|
|
118
119
|
top_k: int,
|
|
119
|
-
top_ranked: TopRankedDocumentsType
|
|
120
|
-
query_idx_to_id: dict[int, str]
|
|
120
|
+
top_ranked: TopRankedDocumentsType,
|
|
121
|
+
query_idx_to_id: dict[int, str],
|
|
121
122
|
) -> tuple[list[list[float]], list[list[int]]]:
|
|
122
123
|
doc_id_to_idx = {doc_id: i for i, doc_id in enumerate(self.idxs)}
|
|
123
124
|
scores_all: list[list[float]] = []
|
|
@@ -127,15 +128,17 @@ class FaissSearchIndex:
|
|
|
127
128
|
query_id = query_idx_to_id[query_idx]
|
|
128
129
|
ranked_ids = top_ranked.get(query_id)
|
|
129
130
|
if not ranked_ids:
|
|
130
|
-
|
|
131
|
+
msg = f"No top-ranked documents for query {query_id}"
|
|
132
|
+
logger.warning(msg)
|
|
133
|
+
warnings.warn(msg)
|
|
131
134
|
scores_all.append([])
|
|
132
135
|
idxs_all.append([])
|
|
133
136
|
continue
|
|
134
137
|
|
|
135
138
|
candidate_indices = [doc_id_to_idx[doc_id] for doc_id in ranked_ids]
|
|
136
|
-
d = self.index.d
|
|
139
|
+
d = self.index.d # type: ignore[union-attr]
|
|
137
140
|
candidate_embs = np.vstack(
|
|
138
|
-
[self.index.reconstruct(idx) for idx in candidate_indices]
|
|
141
|
+
[self.index.reconstruct(idx) for idx in candidate_indices] # type: ignore[union-attr]
|
|
139
142
|
)
|
|
140
143
|
sub_reranking_index = self.index_type(d)
|
|
141
144
|
sub_reranking_index.add(candidate_embs)
|