mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff reflects the contents of two publicly released versions of this package as they appear in their public registry, and is provided for informational purposes only.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +17 -18
- mteb/_evaluators/any_sts_evaluator.py +3 -3
- mteb/_evaluators/clustering_evaluator.py +2 -2
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
- mteb/_evaluators/pair_classification_evaluator.py +5 -3
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +11 -10
- mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
- mteb/_evaluators/text/summarization_evaluator.py +23 -18
- mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -10
- mteb/abstasks/clustering.py +19 -15
- mteb/abstasks/clustering_legacy.py +10 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +23 -19
- mteb/abstasks/pair_classification.py +20 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +8 -5
- mteb/abstasks/task_metadata.py +31 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/benchmark.py +4 -2
- mteb/benchmarks/benchmarks/__init__.py +4 -0
- mteb/benchmarks/benchmarks/benchmarks.py +112 -11
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +182 -29
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +63 -49
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +44 -33
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +162 -34
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +10 -6
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +21 -3
- mteb/models/instruct_wrapper.py +28 -8
- mteb/models/model_implementations/align_models.py +1 -1
- mteb/models/model_implementations/andersborges.py +4 -4
- mteb/models/model_implementations/ara_models.py +1 -1
- mteb/models/model_implementations/arctic_models.py +8 -8
- mteb/models/model_implementations/b1ade_models.py +1 -1
- mteb/models/model_implementations/bge_models.py +45 -21
- mteb/models/model_implementations/bica_model.py +3 -3
- mteb/models/model_implementations/blip2_models.py +2 -2
- mteb/models/model_implementations/blip_models.py +16 -16
- mteb/models/model_implementations/bm25.py +4 -4
- mteb/models/model_implementations/bmretriever_models.py +6 -4
- mteb/models/model_implementations/cadet_models.py +1 -1
- mteb/models/model_implementations/cde_models.py +11 -4
- mteb/models/model_implementations/clip_models.py +6 -6
- mteb/models/model_implementations/clips_models.py +3 -3
- mteb/models/model_implementations/codefuse_models.py +5 -5
- mteb/models/model_implementations/codesage_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +5 -5
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/colpali_models.py +3 -3
- mteb/models/model_implementations/colqwen_models.py +8 -8
- mteb/models/model_implementations/colsmol_models.py +2 -2
- mteb/models/model_implementations/conan_models.py +1 -1
- mteb/models/model_implementations/dino_models.py +42 -42
- mteb/models/model_implementations/e5_instruct.py +23 -4
- mteb/models/model_implementations/e5_models.py +9 -9
- mteb/models/model_implementations/e5_v.py +6 -6
- mteb/models/model_implementations/eagerworks_models.py +1 -1
- mteb/models/model_implementations/emillykkejensen_models.py +6 -6
- mteb/models/model_implementations/en_code_retriever.py +1 -1
- mteb/models/model_implementations/euler_models.py +2 -2
- mteb/models/model_implementations/fa_models.py +9 -9
- mteb/models/model_implementations/facebookai.py +14 -2
- mteb/models/model_implementations/geogpt_models.py +1 -1
- mteb/models/model_implementations/gme_v_models.py +6 -5
- mteb/models/model_implementations/google_models.py +1 -1
- mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
- mteb/models/model_implementations/gritlm_models.py +2 -2
- mteb/models/model_implementations/gte_models.py +25 -13
- mteb/models/model_implementations/hinvec_models.py +1 -1
- mteb/models/model_implementations/ibm_granite_models.py +30 -6
- mteb/models/model_implementations/inf_models.py +2 -2
- mteb/models/model_implementations/jasper_models.py +2 -2
- mteb/models/model_implementations/jina_clip.py +48 -10
- mteb/models/model_implementations/jina_models.py +18 -11
- mteb/models/model_implementations/kblab.py +12 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
- mteb/models/model_implementations/kfst.py +1 -1
- mteb/models/model_implementations/kowshik24_models.py +1 -1
- mteb/models/model_implementations/lgai_embedding_models.py +1 -1
- mteb/models/model_implementations/linq_models.py +1 -1
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +6 -6
- mteb/models/model_implementations/llm2vec_models.py +8 -8
- mteb/models/model_implementations/mcinext_models.py +4 -1
- mteb/models/model_implementations/mdbr_models.py +17 -3
- mteb/models/model_implementations/misc_models.py +68 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +1 -1
- mteb/models/model_implementations/moco_models.py +4 -4
- mteb/models/model_implementations/mod_models.py +1 -1
- mteb/models/model_implementations/model2vec_models.py +14 -14
- mteb/models/model_implementations/moka_models.py +1 -1
- mteb/models/model_implementations/nbailab.py +3 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
- mteb/models/model_implementations/nomic_models.py +30 -15
- mteb/models/model_implementations/nomic_models_vision.py +1 -1
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
- mteb/models/model_implementations/nvidia_models.py +151 -19
- mteb/models/model_implementations/octen_models.py +61 -2
- mteb/models/model_implementations/openclip_models.py +13 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
- mteb/models/model_implementations/ops_moa_models.py +1 -1
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +1 -1
- mteb/models/model_implementations/piccolo_models.py +1 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +4 -4
- mteb/models/model_implementations/pylate_models.py +10 -9
- mteb/models/model_implementations/qodo_models.py +2 -2
- mteb/models/model_implementations/qtack_models.py +1 -1
- mteb/models/model_implementations/qwen3_models.py +3 -3
- mteb/models/model_implementations/qzhou_models.py +2 -2
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +2 -2
- mteb/models/model_implementations/reasonir_model.py +1 -1
- mteb/models/model_implementations/repllama_models.py +3 -3
- mteb/models/model_implementations/rerankers_custom.py +12 -6
- mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
- mteb/models/model_implementations/richinfoai_models.py +1 -1
- mteb/models/model_implementations/ru_sentence_models.py +20 -20
- mteb/models/model_implementations/ruri_models.py +10 -10
- mteb/models/model_implementations/salesforce_models.py +3 -3
- mteb/models/model_implementations/samilpwc_models.py +1 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
- mteb/models/model_implementations/searchmap_models.py +1 -1
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
- mteb/models/model_implementations/sentence_transformers_models.py +124 -22
- mteb/models/model_implementations/shuu_model.py +1 -1
- mteb/models/model_implementations/siglip_models.py +20 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
- mteb/models/model_implementations/stella_models.py +17 -4
- mteb/models/model_implementations/tarka_models.py +2 -2
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +1 -1
- mteb/models/model_implementations/uae_models.py +7 -1
- mteb/models/model_implementations/vdr_models.py +1 -1
- mteb/models/model_implementations/vi_vn_models.py +6 -6
- mteb/models/model_implementations/vlm2vec_models.py +3 -3
- mteb/models/model_implementations/voyage_models.py +84 -0
- mteb/models/model_implementations/voyage_v.py +9 -7
- mteb/models/model_implementations/youtu_models.py +1 -1
- mteb/models/model_implementations/yuan_models.py +1 -1
- mteb/models/model_implementations/yuan_models_en.py +1 -1
- mteb/models/model_meta.py +80 -31
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
- mteb/models/search_wrappers.py +33 -18
- mteb/models/sentence_transformer_wrapper.py +50 -25
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +29 -21
- mteb/results/model_result.py +52 -22
- mteb/results/task_result.py +80 -58
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +12 -0
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/models/model_implementations/ru_sentence_models.py

@@ -250,7 +250,7 @@ rubert_tiny = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/cointegrated/rubert-tiny",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
     use_instructions=False,
     public_training_code="https://gist.github.com/avidale/7bc6350f26196918bf339c01261f5c60",
     training_datasets={
@@ -276,7 +276,7 @@ rubert_tiny2 = ModelMeta(
     max_tokens=2048,
     reference="https://huggingface.co/cointegrated/rubert-tiny2",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=False,
     public_training_code="https://colab.research.google.com/drive/1mSWfIQ6PIlteLVZ9DKKpcorycgLIKZLf?usp=sharing",
     training_datasets=set(
@@ -303,7 +303,7 @@ sbert_large_nlu_ru = ModelMeta(
     max_tokens=512,  # best guess
     reference="https://huggingface.co/ai-forever/sbert_large_nlu_ru",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
     use_instructions=False,
     public_training_code=None,
     public_training_data=None,
@@ -329,7 +329,7 @@ sbert_large_mt_nlu_ru = ModelMeta(
     max_tokens=512,  # best guess
     reference="https://huggingface.co/ai-forever/sbert_large_mt_nlu_ru",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
     use_instructions=False,
     public_training_code=None,
     public_training_data=None,
@@ -357,7 +357,7 @@ user_base_ru = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/deepvk/USER-base",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors"],
     adapted_from="https://huggingface.co/deepvk/deberta-v1-base",
     use_instructions=True,
     citation="""@misc{deepvk2024user,
@@ -418,7 +418,7 @@ user_bge_m3 = ModelMeta(
     max_tokens=8194,
     reference="https://huggingface.co/deepvk/USER-base",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors"],
     adapted_from="BAAI/bge-m3",
     use_instructions=False,
     training_datasets={
@@ -469,7 +469,7 @@ deberta_v1_ru = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/deepvk/deberta-v1-base",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
     use_instructions=False,
     # Wikipedia, Books, Twitter comments, Pikabu, Proza.ru, Film subtitles, News websites, and Social corpus
     public_training_code=None,
@@ -500,7 +500,7 @@ rubert_base_cased = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/DeepPavlov/rubert-base-cased",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "Transformers"],
     use_instructions=False,
     public_training_code=None,
     public_training_data=None,
@@ -536,7 +536,7 @@ distilrubert_small_cased_conversational = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/DeepPavlov/distilrubert-small-cased-conversational",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "Transformers"],
     use_instructions=False,
     public_training_code=None,
     public_training_data=None,
@@ -571,7 +571,7 @@ rubert_base_cased_sentence = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/DeepPavlov/rubert-base-cased-sentence",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "Transformers"],
     use_instructions=False,
     public_training_code=None,
     public_training_data=None,
@@ -596,7 +596,7 @@ labse_en_ru = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/cointegrated/LaBSE-en-ru",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
     use_instructions=False,
     public_training_code="https://colab.research.google.com/drive/1dnPRn0-ugj3vZgSpyCC9sgslM2SuSfHy?usp=sharing",
     public_training_data=None,
@@ -624,7 +624,7 @@ rubert_tiny_turbo = ModelMeta(
     max_tokens=2048,
     reference="https://huggingface.co/sergeyzh/rubert-tiny-turbo",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=False,
     public_training_code=None,
     public_training_data=None,
@@ -647,7 +647,7 @@ rubert_mini_frida = ModelMeta(
     max_tokens=2048,
     reference="https://huggingface.co/sergeyzh/rubert-mini-frida",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=True,
     public_training_code=None,
     public_training_data=None,
@@ -675,7 +675,7 @@ labse_ru_turbo = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/sergeyzh/LaBSE-ru-turbo",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=False,
     training_datasets=turbo_models_datasets,
     public_training_code=None,
@@ -745,7 +745,7 @@ rosberta_ru_en = ModelMeta(
     },
     public_training_data=None,
     public_training_code=None,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     citation="""@misc{snegirev2024russianfocusedembeddersexplorationrumteb,
       title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design},
      author={Artem Snegirev and Maria Tikhonova and Anna Maksimova and Alena Fenogenova and Alexander Abramov},
@@ -895,7 +895,7 @@ frida = ModelMeta(
     training_datasets=frida_training_datasets,
     public_training_data=None,
     public_training_code=None,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     citation=None,
 )
 
@@ -924,7 +924,7 @@ giga_embeddings = ModelMeta(
     max_tokens=4096,
     reference="https://huggingface.co/ai-sage/Giga-Embeddings-instruct",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=True,
     public_training_code=None,
     public_training_data=None,
@@ -956,7 +956,7 @@ berta = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/sergeyzh/BERTA",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=True,
     training_datasets=berta_training_datasets,
     public_training_code=None,
@@ -1034,7 +1034,7 @@ user2_small = ModelMeta(
     training_datasets=user2_training_data,
     public_training_data=None,
     public_training_code="https://github.com/BlessedTatonka/some_code/tree/2899f27d51efdf4217fc6453799ff197e9792f1e",
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors"],
     citation="""@misc{deepvk2025user,
       title={USER2},
       author={Malashenko, Boris and Spirin, Egor and Sokolov Andrey},
@@ -1067,7 +1067,7 @@ user2_base = ModelMeta(
     training_datasets=user2_training_data,
     public_training_data=None,
     public_training_code="https://github.com/BlessedTatonka/some_code/tree/2899f27d51efdf4217fc6453799ff197e9792f1e",
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors"],
     citation="""@misc{deepvk2025user,
       title={USER2},
       author={Malashenko, Boris and Spirin, Egor and Sokolov Andrey},
mteb/models/model_implementations/ruri_models.py

@@ -44,7 +44,7 @@ cl_nagoya_ruri_v3_30m = ModelMeta(
     max_tokens=8192,
     reference="https://huggingface.co/cl-nagoya/ruri-v3-30m",
     similarity_fn_name="cosine",
-    framework=["PyTorch", "Sentence Transformers"],
+    framework=["PyTorch", "Sentence Transformers", "safetensors"],
     use_instructions=True,
     superseded_by=None,
     training_datasets={
@@ -75,7 +75,7 @@ cl_nagoya_ruri_v3_70m = ModelMeta(
     max_tokens=8192,
     reference="https://huggingface.co/cl-nagoya/ruri-v3-70m",
     similarity_fn_name="cosine",
-    framework=["PyTorch", "Sentence Transformers"],
+    framework=["PyTorch", "Sentence Transformers", "safetensors"],
     use_instructions=True,
     superseded_by=None,
     training_datasets={"MrTidyRetrieval", "MIRACLRetrieval"},
@@ -104,7 +104,7 @@ cl_nagoya_ruri_v3_130m = ModelMeta(
     max_tokens=8192,
     reference="https://huggingface.co/cl-nagoya/ruri-v3-130m",
     similarity_fn_name="cosine",
-    framework=["PyTorch", "Sentence Transformers"],
+    framework=["PyTorch", "Sentence Transformers", "safetensors"],
     use_instructions=True,
     superseded_by=None,
     training_datasets={"MrTidyRetrieval", "MIRACLRetrieval"},
@@ -133,7 +133,7 @@ cl_nagoya_ruri_v3_310m = ModelMeta(
     max_tokens=8192,
     reference="https://huggingface.co/cl-nagoya/ruri-v3-310m",
     similarity_fn_name="cosine",
-    framework=["PyTorch", "Sentence Transformers"],
+    framework=["PyTorch", "Sentence Transformers", "safetensors"],
     use_instructions=True,
     superseded_by=None,
     training_datasets={"MrTidyRetrieval", "MIRACLRetrieval"},
@@ -163,7 +163,7 @@ cl_nagoya_ruri_small_v2 = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/cl-nagoya/ruri-small-v2",
     similarity_fn_name="cosine",
-    framework=["PyTorch", "Sentence Transformers"],
+    framework=["PyTorch", "Sentence Transformers", "safetensors"],
     use_instructions=True,
     adapted_from="line-corporation/line-distilbert-base-japanese",
     superseded_by=None,
@@ -192,7 +192,7 @@ cl_nagoya_ruri_base_v2 = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/cl-nagoya/ruri-base-v2",
     similarity_fn_name="cosine",
-    framework=["PyTorch", "Sentence Transformers"],
+    framework=["PyTorch", "Sentence Transformers", "safetensors"],
     use_instructions=True,
     adapted_from="tohoku-nlp/bert-base-japanese-v3",
     superseded_by=None,
@@ -221,7 +221,7 @@ cl_nagoya_ruri_large_v2 = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/cl-nagoya/ruri-large-v2",
     similarity_fn_name="cosine",
-    framework=["PyTorch", "Sentence Transformers"],
+    framework=["PyTorch", "Sentence Transformers", "safetensors"],
     use_instructions=True,
     adapted_from="tohoku-nlp/bert-large-japanese-v2",
     superseded_by=None,
@@ -251,7 +251,7 @@ cl_nagoya_ruri_small_v1 = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/cl-nagoya/ruri-small",
     similarity_fn_name="cosine",
-    framework=["PyTorch", "Sentence Transformers"],
+    framework=["PyTorch", "Sentence Transformers", "safetensors"],
     use_instructions=True,
     adapted_from="line-corporation/line-distilbert-base-japanese",
     superseded_by="cl-nagoya/ruri-small-v2",
@@ -280,7 +280,7 @@ cl_nagoya_ruri_base_v1 = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/cl-nagoya/ruri-base",
     similarity_fn_name="cosine",
-    framework=["PyTorch", "Sentence Transformers"],
+    framework=["PyTorch", "Sentence Transformers", "safetensors"],
     use_instructions=True,
     adapted_from="tohoku-nlp/bert-base-japanese-v3",
     superseded_by="cl-nagoya/ruri-base-v2",
@@ -310,7 +310,7 @@ cl_nagoya_ruri_large_v1 = ModelMeta(
     max_tokens=512,
     reference="https://huggingface.co/cl-nagoya/ruri-large",
     similarity_fn_name="cosine",
-    framework=["PyTorch", "Sentence Transformers"],
+    framework=["PyTorch", "Sentence Transformers", "safetensors"],
     use_instructions=True,
     adapted_from="tohoku-nlp/bert-large-japanese-v2",
     superseded_by="cl-nagoya/ruri-large-v2",
mteb/models/model_implementations/salesforce_models.py

@@ -58,7 +58,7 @@ SFR_Embedding_2_R = ModelMeta(
     max_tokens=32768,
     reference="https://huggingface.co/Salesforce/SFR-Embedding-2_R",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=True,
     adapted_from="intfloat/e5-mistral-7b-instruct",
     public_training_code=None,
@@ -96,7 +96,7 @@ SFR_Embedding_Code_2B_R = ModelMeta(
     max_tokens=8192,
     reference="https://huggingface.co/Salesforce/SFR-Embedding-Code-2B_R",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=True,
     adapted_from="google/gemma-2-2b-it",
     public_training_code=None,
@@ -134,7 +134,7 @@ SFR_Embedding_Mistral = ModelMeta(
     max_tokens=32768,
     reference="https://huggingface.co/Salesforce/SFR-Embedding-Mistral",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=True,
     public_training_code=None,
     public_training_data=None,
mteb/models/model_implementations/samilpwc_models.py

@@ -57,7 +57,7 @@ samilpwc_expr = ModelMeta(
     max_tokens=514,
     reference="https://huggingface.co/SamilPwC-AXNode-GenAI/PwC-Embedding_expr",
     similarity_fn_name="cosine",
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=True,
     public_training_code=None,
     public_training_data=None,
mteb/models/model_implementations/sarashina_embedding_models.py

@@ -130,7 +130,7 @@ sbintuitions_sarashina_embedding_v2_1b = ModelMeta(
     max_tokens=8192,
     reference="https://huggingface.co/sbintuitions/sarashina-embedding-v2-1b",
     similarity_fn_name="cosine",
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=True,
     adapted_from="sbintuitions/sarashina2.2-1b",
     superseded_by=None,
@@ -156,7 +156,7 @@ sbintuitions_sarashina_embedding_v1_1b = ModelMeta(
     max_tokens=8192,
     reference="https://huggingface.co/sbintuitions/sarashina-embedding-v1-1b",
     similarity_fn_name="cosine",
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=False,
     adapted_from="sbintuitions/sarashina2.1-1b",
     superseded_by="sbintuitions/sarashina-embedding-v2-1b",
mteb/models/model_implementations/searchmap_models.py

@@ -33,7 +33,7 @@ searchmap_preview = ModelMeta(
     max_tokens=8192,
     reference="https://huggingface.co/VPLabs/SearchMap_Preview",
     similarity_fn_name="cosine",
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
     public_training_code=None,
     public_training_data=None,
     training_datasets=None,
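Every hunk above makes the same one-line change: the `framework` list of a `ModelMeta` entry gains `"Transformers"` and/or `"safetensors"`. A minimal sketch of how the updated field surfaces to users; it assumes `mteb.get_model_meta` (whose module also changes in this release) resolves a model name to its `ModelMeta` entry:

```python
import mteb

# Assumption: get_model_meta looks the name up in mteb's model registry.
meta = mteb.get_model_meta("cointegrated/rubert-tiny2")

# After this release the framework list also advertises the HF stack:
# ["Sentence Transformers", "PyTorch", "safetensors", "Transformers"]
print(meta.framework)
```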
mteb/models/model_implementations/seed_1_6_embedding_models_1215.py

@@ -4,13 +4,15 @@ import base64
 import logging
 import os
 import time
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
 from io import BytesIO
 from typing import TYPE_CHECKING, Any
 
 import requests
 import torch
 from torch.utils.data import DataLoader
+from tqdm import tqdm
 
 from mteb._requires_package import requires_package
 from mteb.abstasks.task_metadata import TaskMetadata
@@ -26,114 +28,6 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
-
-def pil_to_base64(image, format="jpeg"):
-    if image is None:
-        return None
-    buffer = BytesIO()
-    image.save(buffer, format=format)
-    img_bytes = buffer.getvalue()
-    encoded_bytes = base64.b64encode(img_bytes)
-    return encoded_bytes.decode("utf-8")
-
-
-def multimodal_embedding(image_base64=None, text_content=None):
-    auth_token = os.getenv("VOLCES_AUTH_TOKEN")
-    model_name = "doubao-embedding-vision-251215"
-    api_url = "https://ark.cn-beijing.volces.com/api/v3/embeddings/multimodal"
-
-    headers = {
-        "Authorization": f"Bearer {auth_token}",
-        "x-ark-vlm1": "true",
-        "Content-Type": "application/json",
-    }
-
-    if image_base64 is not None and text_content is None:
-        inputs = []
-        for image in image_base64:
-            image_format = "jpeg"
-            image_data = f"data:image/{image_format};base64,{image}"
-            inputs.append({"type": "image_url", "image_url": {"url": image_data}})
-
-        payload = {"model": model_name, "input": inputs}
-    elif image_base64 is None and text_content is not None:
-        payload = {
-            "model": model_name,
-            "input": [
-                {"type": "text", "text": text_content},
-            ],
-        }
-    else:
-        inputs = []
-        for image in image_base64:
-            image_format = "jpeg"
-            image_data = f"data:image/{image_format};base64,{image}"
-            inputs.append({"type": "image_url", "image_url": {"url": image_data}})
-        inputs.append({"type": "text", "text": text_content})
-        payload = {"model": model_name, "input": inputs}
-
-    try:
-        response = requests.post(url=api_url, headers=headers, json=payload, timeout=10)
-
-        response.raise_for_status()
-        return response.json()
-
-    except requests.exceptions.HTTPError as http_err:
-        logger.error(f"HTTP error ({http_err.response.status_code}): {http_err}")
-    except requests.exceptions.JSONDecodeError:
-        logger.error("Error:The response is not in valid JSON format")
-    except requests.exceptions.Timeout:
-        logger.error("Error:Request timeout")
-    except Exception as e:
-        logger.error(f"Unknown error: {str(e)}")
-
-    return None
-
-
-def multi_thread_encode(sentences, batch_size=1, max_workers=8):
-    batches = []
-    for idx in range(0, len(sentences), batch_size):
-        batches.append((idx // batch_size, sentences[idx : idx + batch_size]))
-
-    n_batches = len(batches)
-    results = [None] * n_batches  # Pre-allocated result list
-    all_embeddings = []  # Final ordered embeddings
-
-    def _process_batch(batch_idx, batch_sentences):
-        sentence = batch_sentences[0]
-
-        retries = 5
-        while retries > 0:
-            try:
-                resp = multimodal_embedding(text_content=sentence)
-                embedding = torch.tensor(resp["data"]["embedding"])
-                break
-            except Exception as e:
-                time.sleep(1)
-                logger.warning(f"Retrying... {retries} retries left. Error: {str(e)}")
-                retries -= 1
-                if retries == 0:
-                    raise e
-        return batch_idx, embedding
-
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        futures = {
-            executor.submit(_process_batch, idx, batch): idx for idx, batch in batches
-        }
-
-        for future in as_completed(futures):
-            batch_idx, embeddings = future.result()
-            results[batch_idx] = embeddings
-
-    for batch_embeddings in results:
-        all_embeddings.append(batch_embeddings)
-
-    all_embeddings = torch.stack(all_embeddings, dim=0)
-    all_embeddings = torch.nn.functional.normalize(all_embeddings, dim=-1)
-
-    return all_embeddings.float().cpu()
-
-
 doubao_embedding_training_data = (
     {
         "PawsXPairClassification",
@@ -166,25 +60,80 @@ class Seed16EmbeddingWrapper(AbsEncoder):
             "pip install mteb[ark]",
             "tiktoken",
         )
-        import tiktoken
 
         self._model_name = model_name
        self._max_tokens = 32768
         self._embed_dim = embed_dim
         self._available_embed_dims = [2048, 1024]
-        self._encoding = tiktoken.get_encoding(tokenizer_name)
 
-    def
-
+    def pil_to_base64(self, image, format="jpeg"):
+        if image is None:
+            return None
+        buffer = BytesIO()
+        image.save(buffer, format=format)
+        img_bytes = buffer.getvalue()
+        encoded_bytes = base64.b64encode(img_bytes)
+        return encoded_bytes.decode("utf-8")
+
+    def multimodal_embedding(self, instruction, image_base64, text_content):
+        auth_token = os.getenv("VOLCES_AUTH_TOKEN")
+        model_name = "doubao-embedding-vision-251215"
+        api_url = "https://ark.cn-beijing.volces.com/api/v3/embeddings/multimodal"
+
+        headers = {
+            "Authorization": f"Bearer {auth_token}",
+            "x-ark-vlm1": "true",
+            "Content-Type": "application/json",
+        }
 
-
-
+        if text_content is not None and len(text_content) > self._max_tokens:
+            text_content = text_content[: self._max_tokens]
+
+        if image_base64 is not None and text_content is None:
+            inputs = []
+            for image in image_base64:
+                image_format = "jpeg"
+                image_data = f"data:image/{image_format};base64,{image}"
+                inputs.append({"type": "image_url", "image_url": {"url": image_data}})
+
+            payload = {"model": model_name, "input": inputs}
+        elif image_base64 is None and text_content is not None:
+            payload = {
+                "model": model_name,
+                "instruction": instruction,
+                "input": [
+                    {"type": "text", "text": text_content},
+                ],
+            }
+        else:
+            inputs = []
+            for image in image_base64:
+                image_format = "jpeg"
+                image_data = f"data:image/{image_format};base64,{image}"
+                inputs.append({"type": "image_url", "image_url": {"url": image_data}})
+            inputs.append({"type": "text", "text": text_content})
+            payload = {"model": model_name, "input": inputs}
+
+        max_retries = 3
+        retry_count = 0
+
+        while retry_count < max_retries:
+            response = requests.post(
+                url=api_url, headers=headers, json=payload, timeout=30
+            )
 
-
-
-
-
-
+            if response.status_code != 200:
+                retry_count += 1
+                time.sleep(3)
+                continue
+
+            response_json = response.json()
+            return response_json
+
+        raise Exception(
+            f"Request failed with status code {response.status_code}. "
+            f"Response: {response.text}"
+        )
 
     def get_fused_embeddings(
         self,
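The new `multimodal_embedding` method replaces the old try/except logging with a bounded retry loop: up to three POSTs, a 3-second pause after a non-200 status, and a hard failure once retries are exhausted. A self-contained sketch of that pattern using only `requests` and the standard library (the URL and names are placeholders, not the Ark endpoint):

```python
import time

import requests


def post_with_retries(url, payload, headers, max_retries=3, backoff_s=3.0):
    """POST a JSON payload, retrying on non-200 responses, then fail loudly."""
    response = None
    for _ in range(max_retries):
        response = requests.post(url, headers=headers, json=payload, timeout=30)
        if response.status_code == 200:
            return response.json()
        time.sleep(backoff_s)  # fixed pause between attempts
    raise RuntimeError(
        f"Request failed with status code {response.status_code}. "
        f"Response: {response.text}"
    )
```

Note that, like the method in the diff, this retries only on bad status codes; network-level exceptions (e.g. timeouts) propagate immediately.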
@@ -204,59 +153,69 @@ class Seed16EmbeddingWrapper(AbsEncoder):
         if images is not None and texts is not None:
             assert len(texts) == len(images)
             batch_len = len(texts)
-            images_base64 = [pil_to_base64(image) for image in images]
+            images_base64 = [self.pil_to_base64(image) for image in images]
         elif images is None:
             batch_len = len(texts)
             images_base64 = [None for _ in range(batch_len)]
         elif texts is None:
             batch_len = len(images)
-            images_base64 = [pil_to_base64(image) for image in images]
+            images_base64 = [self.pil_to_base64(image) for image in images]
         else:
             raise ValueError("images and texts cannot be None at the same time")
 
-
-
+        def process_item(
+            i, prompt_type, task_name, texts, images_base64, multimodal_embedding
+        ):
             if (
                 prompt_type == PromptType("query") or prompt_type is None
             ) and task_name in TASK_NAME_TO_INSTRUCTION:
                 instruction = TASK_NAME_TO_INSTRUCTION[task_name]
                 instruction = instruction.rstrip("{}").rstrip("\n")
-
-
-
-
-                        + "\n Query:{}"
-                    ).format(texts[i])
-                else:
-                    input_text = (
-                        "Target_modality:Text.\n Instruction:"
-                        + instruction
-                        + "\n Query:"
-                    )
+                instruction = (
+                    "Target_modality:Text.\n Instruction:" + instruction + "\n Query:"
+                )
+                input_text = texts[i]
             else:
                 if texts[i] != "" and images_base64[i] is not None:
-                    instruction = "Instruction: Compress the
-                    input_text =
+                    instruction = "Instruction: Compress the text and image into one word.\n Query:"
+                    input_text = texts[i]
                 elif texts[i] != "":
                     instruction = (
-                        "Instruction: Compress the
+                        "Instruction: Compress the text into one word.\n Query:"
                     )
-                    input_text =
+                    input_text = texts[i]
                 elif images_base64[i] is not None:
                     instruction = (
-                        "Instruction: Compress the
+                        "Instruction: Compress the image into one word.\n Query:"
                     )
-                    input_text =
+                    input_text = None
                 else:
                     raise ValueError("image and text are both None")
 
             resp = multimodal_embedding(
-
+                instruction=instruction,
+                image_base64=images_base64[i],
+                text_content=input_text,
             )
             embedding = torch.tensor(resp["data"]["embedding"])
             embedding = torch.reshape(embedding, (1, -1))
+            return embedding
+
+        outputs = []
+        process_partial = partial(
+            process_item,
+            prompt_type=prompt_type,
+            task_name=task_name,
+            texts=texts,
+            images_base64=images_base64,
+            multimodal_embedding=self.multimodal_embedding,
+        )
+        with ThreadPoolExecutor(max_workers=15) as executor:
+            futures = [executor.submit(process_partial, i) for i in range(batch_len)]
+            for future in tqdm(futures, total=batch_len, desc="Encoding"):
+                outputs.append(future.result())
 
-        outputs = torch.stack(outputs, dim=0)
+        outputs = torch.stack(outputs, dim=0).squeeze(1)
 
         if self._embed_dim is not None:
             outputs = outputs[:, : self._embed_dim]
@@ -273,13 +232,21 @@ class Seed16EmbeddingWrapper(AbsEncoder):
         prompt_type: PromptType | None = None,
         **kwargs: Any,
     ) -> Array:
-
-
+        if "text" in inputs.dataset.features:
+            sentences = [text for batch in inputs for text in batch["text"]]
+        else:
+            sentences = None
+
+        if "image" in inputs.dataset.features:
+            images = [image for batch in inputs for image in batch["image"]]
+        else:
+            images = None
 
         return self.get_fused_embeddings(
             texts=sentences,
             images=images,
             task_name=task_metadata.name,
+            prompt_type=prompt_type,
             **kwargs,
         )
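The deleted `multi_thread_encode` kept results ordered by pairing `as_completed` with a pre-allocated list indexed by batch id; the rewritten `get_fused_embeddings` gets the same guarantee more simply by iterating the futures in submission order. A stdlib-only sketch of that ordering trick (function names here are illustrative, not part of mteb):

```python
from concurrent.futures import ThreadPoolExecutor


def encode_ordered(items, encode_one, max_workers=15):
    """Fan items out to a thread pool while keeping results in input order.

    future.result() blocks until that particular future finishes, so
    walking the futures list in submission order yields result i for
    item i even though the workers complete out of order.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(encode_one, item) for item in items]
        return [future.result() for future in futures]
```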