mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (527)
  1. mteb/__init__.py +6 -0
  2. mteb/_create_dataloaders.py +22 -20
  3. mteb/_evaluators/any_sts_evaluator.py +23 -14
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +3 -3
  6. mteb/_evaluators/evaluator.py +4 -2
  7. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
  8. mteb/_evaluators/pair_classification_evaluator.py +34 -40
  9. mteb/_evaluators/retrieval_evaluator.py +2 -2
  10. mteb/_evaluators/retrieval_metrics.py +18 -17
  11. mteb/_evaluators/sklearn_evaluator.py +25 -37
  12. mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
  13. mteb/_evaluators/text/summarization_evaluator.py +27 -20
  14. mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
  15. mteb/abstasks/_data_filter/__init__.py +0 -0
  16. mteb/abstasks/_data_filter/filters.py +125 -0
  17. mteb/abstasks/_data_filter/task_pipelines.py +105 -0
  18. mteb/abstasks/_statistics_calculation.py +23 -11
  19. mteb/abstasks/_stratification.py +18 -18
  20. mteb/abstasks/abstask.py +35 -28
  21. mteb/abstasks/aggregate_task_metadata.py +1 -9
  22. mteb/abstasks/aggregated_task.py +10 -29
  23. mteb/abstasks/classification.py +15 -12
  24. mteb/abstasks/clustering.py +20 -16
  25. mteb/abstasks/clustering_legacy.py +13 -10
  26. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  27. mteb/abstasks/multilabel_classification.py +33 -22
  28. mteb/abstasks/pair_classification.py +27 -11
  29. mteb/abstasks/regression.py +4 -4
  30. mteb/abstasks/retrieval.py +28 -24
  31. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  32. mteb/abstasks/sts.py +14 -4
  33. mteb/abstasks/task_metadata.py +32 -33
  34. mteb/abstasks/text/bitext_mining.py +39 -28
  35. mteb/abstasks/text/reranking.py +8 -6
  36. mteb/abstasks/text/summarization.py +10 -5
  37. mteb/abstasks/zeroshot_classification.py +8 -4
  38. mteb/benchmarks/_create_table.py +84 -37
  39. mteb/benchmarks/benchmark.py +77 -16
  40. mteb/benchmarks/benchmarks/__init__.py +12 -0
  41. mteb/benchmarks/benchmarks/benchmarks.py +361 -16
  42. mteb/benchmarks/get_benchmark.py +14 -53
  43. mteb/cache.py +227 -37
  44. mteb/cli/_display_tasks.py +2 -2
  45. mteb/cli/build_cli.py +110 -14
  46. mteb/cli/generate_model_card.py +43 -23
  47. mteb/deprecated_evaluator.py +71 -62
  48. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  49. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  50. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  54. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  55. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  56. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  57. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  58. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  59. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  60. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  61. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  62. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  63. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  64. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  65. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  66. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  67. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  68. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  69. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  72. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  73. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  74. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  75. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  81. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  82. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  83. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  84. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  85. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  86. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  87. mteb/evaluate.py +106 -75
  88. mteb/filter_tasks.py +25 -26
  89. mteb/get_tasks.py +29 -30
  90. mteb/languages/language_scripts.py +5 -3
  91. mteb/leaderboard/app.py +414 -151
  92. mteb/leaderboard/benchmark_selector.py +14 -5
  93. mteb/leaderboard/figures.py +13 -15
  94. mteb/leaderboard/table.py +82 -17
  95. mteb/load_results.py +12 -12
  96. mteb/models/__init__.py +4 -1
  97. mteb/models/abs_encoder.py +31 -23
  98. mteb/models/cache_wrappers/__init__.py +2 -1
  99. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  100. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
  101. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  102. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  103. mteb/models/cache_wrappers/cache_wrapper.py +3 -3
  104. mteb/models/get_model_meta.py +25 -118
  105. mteb/models/instruct_wrapper.py +33 -9
  106. mteb/models/model_implementations/align_models.py +8 -1
  107. mteb/models/model_implementations/amazon_models.py +1 -0
  108. mteb/models/model_implementations/andersborges.py +65 -0
  109. mteb/models/model_implementations/ara_models.py +9 -1
  110. mteb/models/model_implementations/arctic_models.py +16 -8
  111. mteb/models/model_implementations/b1ade_models.py +2 -1
  112. mteb/models/model_implementations/bedrock_models.py +4 -0
  113. mteb/models/model_implementations/bge_models.py +101 -17
  114. mteb/models/model_implementations/bica_model.py +35 -0
  115. mteb/models/model_implementations/blip2_models.py +13 -2
  116. mteb/models/model_implementations/blip_models.py +43 -16
  117. mteb/models/model_implementations/bm25.py +5 -4
  118. mteb/models/model_implementations/bmretriever_models.py +10 -4
  119. mteb/models/model_implementations/cadet_models.py +10 -1
  120. mteb/models/model_implementations/cde_models.py +25 -4
  121. mteb/models/model_implementations/clip_models.py +9 -6
  122. mteb/models/model_implementations/clips_models.py +100 -0
  123. mteb/models/model_implementations/codefuse_models.py +165 -3
  124. mteb/models/model_implementations/codesage_models.py +18 -3
  125. mteb/models/model_implementations/cohere_models.py +13 -6
  126. mteb/models/model_implementations/cohere_v.py +7 -2
  127. mteb/models/model_implementations/colpali_models.py +17 -9
  128. mteb/models/model_implementations/colqwen_models.py +275 -5
  129. mteb/models/model_implementations/colsmol_models.py +4 -2
  130. mteb/models/model_implementations/conan_models.py +2 -1
  131. mteb/models/model_implementations/dino_models.py +194 -23
  132. mteb/models/model_implementations/e5_instruct.py +27 -4
  133. mteb/models/model_implementations/e5_models.py +21 -110
  134. mteb/models/model_implementations/e5_v.py +7 -6
  135. mteb/models/model_implementations/eagerworks_models.py +164 -0
  136. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  137. mteb/models/model_implementations/en_code_retriever.py +2 -1
  138. mteb/models/model_implementations/euler_models.py +32 -0
  139. mteb/models/model_implementations/evaclip_models.py +4 -0
  140. mteb/models/model_implementations/fa_models.py +67 -9
  141. mteb/models/model_implementations/facebookai.py +205 -0
  142. mteb/models/model_implementations/geogpt_models.py +2 -1
  143. mteb/models/model_implementations/gme_v_models.py +17 -10
  144. mteb/models/model_implementations/google_models.py +17 -6
  145. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
  146. mteb/models/model_implementations/gritlm_models.py +4 -2
  147. mteb/models/model_implementations/gte_models.py +99 -9
  148. mteb/models/model_implementations/hinvec_models.py +2 -1
  149. mteb/models/model_implementations/human.py +1 -0
  150. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  151. mteb/models/model_implementations/inf_models.py +4 -2
  152. mteb/models/model_implementations/jasper_models.py +256 -3
  153. mteb/models/model_implementations/jina_clip.py +49 -10
  154. mteb/models/model_implementations/jina_models.py +222 -11
  155. mteb/models/model_implementations/kalm_models.py +203 -25
  156. mteb/models/model_implementations/kblab.py +37 -0
  157. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  158. mteb/models/model_implementations/kfst.py +25 -0
  159. mteb/models/model_implementations/kowshik24_models.py +32 -0
  160. mteb/models/model_implementations/lens_models.py +2 -0
  161. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  162. mteb/models/model_implementations/linq_models.py +4 -3
  163. mteb/models/model_implementations/listconranker.py +2 -2
  164. mteb/models/model_implementations/llm2clip_models.py +9 -6
  165. mteb/models/model_implementations/llm2vec_models.py +16 -8
  166. mteb/models/model_implementations/mcinext_models.py +7 -1
  167. mteb/models/model_implementations/mdbr_models.py +19 -3
  168. mteb/models/model_implementations/misc_models.py +422 -60
  169. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  170. mteb/models/model_implementations/mme5_models.py +2 -1
  171. mteb/models/model_implementations/moco_models.py +15 -4
  172. mteb/models/model_implementations/mod_models.py +191 -0
  173. mteb/models/model_implementations/model2vec_models.py +27 -14
  174. mteb/models/model_implementations/moka_models.py +4 -1
  175. mteb/models/model_implementations/nbailab.py +70 -0
  176. mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
  177. mteb/models/model_implementations/nomic_models.py +173 -6
  178. mteb/models/model_implementations/nomic_models_vision.py +8 -3
  179. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
  180. mteb/models/model_implementations/nvidia_models.py +155 -20
  181. mteb/models/model_implementations/octen_models.py +254 -0
  182. mteb/models/model_implementations/openai_models.py +20 -16
  183. mteb/models/model_implementations/openclip_models.py +37 -13
  184. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
  185. mteb/models/model_implementations/ops_moa_models.py +5 -3
  186. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  187. mteb/models/model_implementations/pawan_models.py +39 -0
  188. mteb/models/model_implementations/piccolo_models.py +9 -1
  189. mteb/models/model_implementations/pixie_models.py +56 -0
  190. mteb/models/model_implementations/promptriever_models.py +12 -8
  191. mteb/models/model_implementations/pylate_models.py +46 -12
  192. mteb/models/model_implementations/qodo_models.py +4 -2
  193. mteb/models/model_implementations/qtack_models.py +2 -1
  194. mteb/models/model_implementations/qwen3_models.py +9 -6
  195. mteb/models/model_implementations/qzhou_models.py +5 -3
  196. mteb/models/model_implementations/random_baseline.py +19 -24
  197. mteb/models/model_implementations/rasgaard_models.py +34 -0
  198. mteb/models/model_implementations/reasonir_model.py +2 -1
  199. mteb/models/model_implementations/repllama_models.py +5 -3
  200. mteb/models/model_implementations/rerankers_custom.py +15 -9
  201. mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
  202. mteb/models/model_implementations/richinfoai_models.py +2 -1
  203. mteb/models/model_implementations/ru_sentence_models.py +71 -20
  204. mteb/models/model_implementations/ruri_models.py +322 -0
  205. mteb/models/model_implementations/salesforce_models.py +6 -3
  206. mteb/models/model_implementations/samilpwc_models.py +2 -1
  207. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  208. mteb/models/model_implementations/searchmap_models.py +2 -1
  209. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  210. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
  211. mteb/models/model_implementations/seed_models.py +1 -0
  212. mteb/models/model_implementations/sentence_transformers_models.py +177 -18
  213. mteb/models/model_implementations/shuu_model.py +32 -31
  214. mteb/models/model_implementations/siglip_models.py +30 -20
  215. mteb/models/model_implementations/slm_models.py +416 -0
  216. mteb/models/model_implementations/sonar_models.py +1 -0
  217. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  218. mteb/models/model_implementations/stella_models.py +23 -4
  219. mteb/models/model_implementations/tarka_models.py +376 -0
  220. mteb/models/model_implementations/text2vec_models.py +9 -3
  221. mteb/models/model_implementations/ua_sentence_models.py +11 -1
  222. mteb/models/model_implementations/uae_models.py +8 -1
  223. mteb/models/model_implementations/vdr_models.py +3 -1
  224. mteb/models/model_implementations/vi_vn_models.py +45 -6
  225. mteb/models/model_implementations/vista_models.py +2 -0
  226. mteb/models/model_implementations/vlm2vec_models.py +5 -3
  227. mteb/models/model_implementations/voyage_models.py +99 -0
  228. mteb/models/model_implementations/voyage_v.py +17 -9
  229. mteb/models/model_implementations/xyz_models.py +1 -0
  230. mteb/models/model_implementations/youtu_models.py +2 -1
  231. mteb/models/model_implementations/yuan_models.py +34 -0
  232. mteb/models/model_implementations/yuan_models_en.py +58 -0
  233. mteb/models/model_meta.py +498 -29
  234. mteb/models/models_protocols.py +22 -6
  235. mteb/models/search_encoder_index/__init__.py +7 -0
  236. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  237. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  238. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
  239. mteb/models/search_wrappers.py +197 -65
  240. mteb/models/sentence_transformer_wrapper.py +52 -32
  241. mteb/models/vllm_wrapper.py +327 -0
  242. mteb/py.typed +0 -0
  243. mteb/results/benchmark_results.py +114 -65
  244. mteb/results/model_result.py +63 -26
  245. mteb/results/task_result.py +117 -77
  246. mteb/similarity_functions.py +60 -7
  247. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  248. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  249. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  251. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  252. mteb/tasks/classification/ara/ajgt.py +1 -2
  253. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  256. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  257. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  258. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  259. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  260. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  261. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  262. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  263. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  264. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  265. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  266. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  267. mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
  268. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  269. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  270. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  271. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  272. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  273. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  274. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  275. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  276. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  277. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  278. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  279. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  280. mteb/tasks/classification/eng/news_classification.py +1 -2
  281. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  282. mteb/tasks/classification/eng/patent_classification.py +1 -2
  283. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  284. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  285. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  286. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  287. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  288. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  289. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  290. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  291. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  292. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  293. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  294. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  295. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  296. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  297. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  298. mteb/tasks/classification/est/estonian_valence.py +2 -3
  299. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  300. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  301. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  302. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  303. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  304. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  305. mteb/tasks/classification/heb/__init__.py +6 -1
  306. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  307. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  308. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  309. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  310. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  311. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  312. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  313. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  314. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  315. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  316. mteb/tasks/classification/kor/klue_tc.py +1 -2
  317. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  318. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  319. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
  320. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  321. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  322. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  323. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  324. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  325. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  326. mteb/tasks/classification/multilingual/scala_classification.py +2 -3
  327. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  328. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  329. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  330. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  331. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  332. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  333. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  334. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  335. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  336. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  337. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  338. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  339. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  340. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  341. mteb/tasks/classification/pol/polish_classification.py +3 -6
  342. mteb/tasks/classification/ron/moroco.py +1 -2
  343. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  344. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  345. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  346. mteb/tasks/classification/rus/headline_classification.py +1 -2
  347. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  348. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  349. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  350. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  351. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  352. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  353. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  354. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  355. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  356. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  357. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  358. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  359. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  360. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  361. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  362. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  363. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  364. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  365. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  366. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  367. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  368. mteb/tasks/classification/tur/__init__.py +4 -0
  369. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  370. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  371. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  372. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  373. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  374. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  375. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  376. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  377. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  378. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  379. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  380. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  381. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  382. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  383. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  384. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  385. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  386. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  387. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  388. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  389. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  390. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  391. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  392. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  393. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  394. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  395. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  396. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  397. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  398. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  399. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  400. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  401. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  402. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  403. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  404. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  405. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  406. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  407. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  408. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  409. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  410. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  411. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  412. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  413. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  414. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  415. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  416. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  417. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  418. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  419. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  420. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  421. mteb/tasks/pair_classification/rus/terra.py +51 -25
  422. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  423. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  424. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  425. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  426. mteb/tasks/reranking/jpn/__init__.py +9 -1
  427. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  428. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  429. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  430. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  431. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  432. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  433. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  434. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  435. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  436. mteb/tasks/retrieval/code/code_rag.py +12 -12
  437. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  438. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  439. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  440. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  441. mteb/tasks/retrieval/eng/__init__.py +2 -0
  442. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  443. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  445. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  446. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  447. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  448. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  449. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  450. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  451. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  452. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  453. mteb/tasks/retrieval/kor/__init__.py +16 -1
  454. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  455. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  456. mteb/tasks/retrieval/multilingual/__init__.py +24 -0
  457. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  458. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  459. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  460. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  461. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  462. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  463. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  464. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  465. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  466. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
  467. mteb/tasks/retrieval/nld/__init__.py +8 -4
  468. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  469. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  470. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  471. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  472. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  473. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  474. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  475. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  476. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  477. mteb/tasks/retrieval/nob/norquad.py +2 -2
  478. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  479. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  480. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  481. mteb/tasks/retrieval/vie/__init__.py +14 -6
  482. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  483. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
  484. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  485. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  486. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  487. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  488. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  489. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  490. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  491. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  492. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  493. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  494. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
  495. mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
  496. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  497. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  498. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
  499. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
  500. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  501. mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
  502. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  503. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  504. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  505. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  506. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  507. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  508. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  509. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  510. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  511. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  512. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  513. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  514. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  515. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  516. mteb/types/__init__.py +2 -0
  517. mteb/types/_encoder_io.py +19 -2
  518. mteb/types/_result.py +2 -1
  519. mteb/types/statistics.py +9 -3
  520. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
  521. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
  522. mteb/models/model_implementations/mxbai_models.py +0 -102
  523. mteb/models/model_implementations/nb_sbert.py +0 -25
  524. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  525. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  526. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  527. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -91,10 +91,6 @@ class OpenAIModel(AbsEncoder):
91
91
 
92
92
  from openai import NotGiven
93
93
 
94
- if self.model_name == "text-embedding-ada-002" and self._embed_dim is not None:
95
- logger.warning(
96
- "Reducing embedding size available only for text-embedding-3-* models"
97
- )
98
94
  sentences = [text for batch in inputs for text in batch["text"]]
99
95
 
100
96
  mask_sents = [(i, t) for i, t in enumerate(sentences) if t.strip()]
@@ -122,13 +118,22 @@ class OpenAIModel(AbsEncoder):
122
118
 
123
119
  no_empty_embeddings = []
124
120
 
121
+ # Set dimensions only for models that support it
122
+ dimensions = (
123
+ self._embed_dim or NotGiven()
124
+ if not self.model_name == "text-embedding-ada-002"
125
+ else NotGiven()
126
+ )
127
+ default_kwargs = dict(
128
+ model=self.model_name,
129
+ encoding_format="float",
130
+ dimensions=dimensions,
131
+ )
132
+
125
133
  for sublist in tqdm(sublists, leave=False, disable=not show_progress_bar):
126
134
  try:
127
135
  response = self._client.embeddings.create(
128
- input=sublist,
129
- model=self.model_name,
130
- encoding_format="float",
131
- dimensions=self._embed_dim or NotGiven(),
136
+ input=sublist, **default_kwargs
132
137
  )
133
138
  except Exception as e:
134
139
  # Sleep due to too many requests
@@ -138,19 +143,13 @@ class OpenAIModel(AbsEncoder):
138
143
  time.sleep(10)
139
144
  try:
140
145
  response = self._client.embeddings.create(
141
- input=sublist,
142
- model=self.model_name,
143
- encoding_format="float",
144
- dimensions=self._embed_dim or NotGiven(),
146
+ input=sublist, **default_kwargs
145
147
  )
146
148
  except Exception as e:
147
149
  logger.info("Sleeping for 60 seconds due to error", e)
148
150
  time.sleep(60)
149
151
  response = self._client.embeddings.create(
150
- input=sublist,
151
- model=self.model_name,
152
- encoding_format="float",
153
- dimensions=self._embed_dim or NotGiven(),
152
+ input=sublist, **default_kwargs
154
153
  )
155
154
  no_empty_embeddings.extend(self._to_numpy(response))
156
155
 
@@ -168,6 +167,7 @@ class OpenAIModel(AbsEncoder):
168
167
 
169
168
  text_embedding_3_small = ModelMeta(
170
169
  name="openai/text-embedding-3-small",
170
+ model_type=["dense"],
171
171
  revision="3",
172
172
  release_date="2024-01-25",
173
173
  languages=None, # supported languages not specified
@@ -192,6 +192,7 @@ text_embedding_3_small = ModelMeta(
192
192
  )
193
193
  text_embedding_3_large = ModelMeta(
194
194
  name="openai/text-embedding-3-large",
195
+ model_type=["dense"],
195
196
  revision="3",
196
197
  release_date="2024-01-25",
197
198
  languages=None, # supported languages not specified
@@ -216,6 +217,7 @@ text_embedding_3_large = ModelMeta(
216
217
  )
217
218
  text_embedding_ada_002 = ModelMeta(
218
219
  name="openai/text-embedding-ada-002",
220
+ model_type=["dense"],
219
221
  revision="3",
220
222
  release_date="2022-12-15",
221
223
  languages=None, # supported languages not specified
@@ -241,6 +243,7 @@ text_embedding_ada_002 = ModelMeta(
241
243
 
242
244
  text_embedding_3_small_512 = ModelMeta(
243
245
  name="openai/text-embedding-3-small (embed_dim=512)",
246
+ model_type=["dense"],
244
247
  revision="3",
245
248
  release_date="2024-01-25",
246
249
  languages=None, # supported languages not specified
@@ -267,6 +270,7 @@ text_embedding_3_small_512 = ModelMeta(
267
270
 
268
271
  text_embedding_3_large_512 = ModelMeta(
269
272
  name="openai/text-embedding-3-large (embed_dim=512)",
273
+ model_type=["dense"],
270
274
  revision="3",
271
275
  release_date="2024-01-25",
272
276
  languages=None, # supported languages not specified
@@ -10,6 +10,14 @@ from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
11
  from mteb.types import Array, BatchedInput, PromptType
12
12
 
13
+ OPENCLIP_CITATION = """@inproceedings{cherti2023reproducible,
14
+ title={Reproducible scaling laws for contrastive language-image learning},
15
+ author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
16
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
17
+ pages={2818--2829},
18
+ year={2023}
19
+ }"""
20
+
13
21
 
14
22
  def openclip_loader(model_name, **kwargs):
15
23
  requires_package(
@@ -112,8 +120,9 @@ def openclip_loader(model_name, **kwargs):
112
120
 
113
121
 
114
122
  CLIP_ViT_L_14_DataComp_XL_s13B_b90K = ModelMeta(
115
- loader=openclip_loader, # type: ignore
123
+ loader=openclip_loader,
116
124
  name="laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K",
125
+ model_type=["dense"],
117
126
  languages=["eng-Latn"],
118
127
  revision="84c9828e63dc9a9351d1fe637c346d4c1c4db341",
119
128
  release_date="2023-04-26",
@@ -133,11 +142,13 @@ CLIP_ViT_L_14_DataComp_XL_s13B_b90K = ModelMeta(
133
142
  training_datasets=set(
134
143
  # DataComp-1B
135
144
  ),
145
+ citation=OPENCLIP_CITATION,
136
146
  )
137
147
 
138
148
  CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta(
139
- loader=openclip_loader, # type: ignore
149
+ loader=openclip_loader,
140
150
  name="laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K",
151
+ model_type=["dense"],
141
152
  languages=["eng-Latn"],
142
153
  revision="f0e2ffa09cbadab3db6a261ec1ec56407ce42912",
143
154
  release_date="2023-04-26",
@@ -150,18 +161,20 @@ CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta(
150
161
  open_weights=True,
151
162
  public_training_code="https://github.com/mlfoundations/open_clip",
152
163
  public_training_data="https://huggingface.co/datasets/mlfoundations/datacomp_1b",
153
- framework=["PyTorch"],
164
+ framework=["PyTorch", "safetensors"],
154
165
  reference="https://huggingface.co/laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K",
155
166
  similarity_fn_name=ScoringFunction.COSINE,
156
167
  use_instructions=False,
157
168
  training_datasets=set(
158
169
  # DataComp-1B
159
170
  ),
171
+ citation=OPENCLIP_CITATION,
160
172
  )
161
173
 
162
174
  CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta(
163
- loader=openclip_loader, # type: ignore
175
+ loader=openclip_loader,
164
176
  name="laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K",
177
+ model_type=["dense"],
165
178
  languages=["eng-Latn"],
166
179
  revision="d110532e8d4ff91c574ee60a342323f28468b287",
167
180
  release_date="2023-04-26",
@@ -181,11 +194,13 @@ CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta(
181
194
  training_datasets=set(
182
195
  # DataComp-1B
183
196
  ),
197
+ citation=OPENCLIP_CITATION,
184
198
  )
185
199
 
186
200
  CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta(
187
- loader=openclip_loader, # type: ignore
201
+ loader=openclip_loader,
188
202
  name="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
203
+ model_type=["dense"],
189
204
  languages=["eng-Latn"],
190
205
  revision="bc7788f151930d91b58474715fdce5524ad9a189",
191
206
  release_date="2023-01-23",
@@ -198,18 +213,20 @@ CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta(
198
213
  open_weights=True,
199
214
  public_training_code="https://github.com/mlfoundations/open_clip",
200
215
  public_training_data="https://laion.ai/blog/laion-5b/",
201
- framework=["PyTorch"],
216
+ framework=["PyTorch", "safetensors"],
202
217
  reference="https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
203
218
  similarity_fn_name=ScoringFunction.COSINE,
204
219
  use_instructions=False,
205
220
  training_datasets=set(
206
221
  # 2 Billion sample English subset of LAION-5B
207
222
  ),
223
+ citation=OPENCLIP_CITATION,
208
224
  )
209
225
 
210
226
  CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta(
211
- loader=openclip_loader, # type: ignore
227
+ loader=openclip_loader,
212
228
  name="laion/CLIP-ViT-g-14-laion2B-s34B-b88K",
229
+ model_type=["dense"],
213
230
  languages=["eng-Latn"],
214
231
  revision="15efd0f6ac0c40c0f9da7becca03c974d7012604",
215
232
  release_date="2023-03-06",
@@ -222,18 +239,20 @@ CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta(
222
239
  open_weights=True,
223
240
  public_training_code="https://github.com/mlfoundations/open_clip",
224
241
  public_training_data="https://laion.ai/blog/laion-5b/",
225
- framework=["PyTorch"],
242
+ framework=["PyTorch", "safetensors"],
226
243
  reference="https://huggingface.co/laion/CLIP-ViT-g-14-laion2B-s34B-b88K",
227
244
  similarity_fn_name=ScoringFunction.COSINE,
228
245
  use_instructions=False,
229
246
  training_datasets=set(
230
247
  # 2 Billion sample English subset of LAION-5B
231
248
  ),
249
+ citation=OPENCLIP_CITATION,
232
250
  )
233
251
 
234
252
  CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta(
235
- loader=openclip_loader, # type: ignore
253
+ loader=openclip_loader,
236
254
  name="laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
255
+ model_type=["dense"],
237
256
  languages=["eng-Latn"],
238
257
  revision="de081ac0a0ca8dc9d1533eed1ae884bb8ae1404b",
239
258
  release_date="2022-09-15",
@@ -246,18 +265,20 @@ CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta(
246
265
  open_weights=True,
247
266
  public_training_code="https://github.com/mlfoundations/open_clip",
248
267
  public_training_data="https://laion.ai/blog/laion-5b/",
249
- framework=["PyTorch"],
268
+ framework=["PyTorch", "safetensors"],
250
269
  reference="https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
251
270
  similarity_fn_name=ScoringFunction.COSINE,
252
271
  use_instructions=False,
253
272
  training_datasets=set(
254
273
  # 2 Billion sample English subset of LAION-5B
255
274
  ),
275
+ citation=OPENCLIP_CITATION,
256
276
  )
257
277
 
258
278
  CLIP_ViT_L_14_laion2B_s32B_b82K = ModelMeta(
259
- loader=openclip_loader, # type: ignore
279
+ loader=openclip_loader,
260
280
  name="laion/CLIP-ViT-L-14-laion2B-s32B-b82K",
281
+ model_type=["dense"],
261
282
  languages=["eng-Latn"],
262
283
  revision="1627032197142fbe2a7cfec626f4ced3ae60d07a",
263
284
  release_date="2022-09-15",
@@ -270,18 +291,20 @@ CLIP_ViT_L_14_laion2B_s32B_b82K = ModelMeta(
270
291
  open_weights=True,
271
292
  public_training_code="https://github.com/mlfoundations/open_clip",
272
293
  public_training_data="https://laion.ai/blog/laion-5b/",
273
- framework=["PyTorch"],
294
+ framework=["PyTorch", "safetensors"],
274
295
  reference="https://huggingface.co/laion/CLIP-ViT-L-14-laion2B-s32B-b82K",
275
296
  similarity_fn_name=ScoringFunction.COSINE,
276
297
  use_instructions=False,
277
298
  training_datasets=set(
278
299
  # 2 Billion sample English subset of LAION-5B
279
300
  ),
301
+ citation=OPENCLIP_CITATION,
280
302
  )
281
303
 
282
304
  CLIP_ViT_B_32_laion2B_s34B_b79K = ModelMeta(
283
305
  loader=openclip_loader,
284
306
  name="laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
307
+ model_type=["dense"],
285
308
  languages=["eng-Latn"],
286
309
  revision="08f73555f1b2fb7c82058aebbd492887a94968ef",
287
310
  release_date="2022-09-15",
@@ -294,11 +317,12 @@ CLIP_ViT_B_32_laion2B_s34B_b79K = ModelMeta(
294
317
  open_weights=True,
295
318
  public_training_code="https://github.com/mlfoundations/open_clip",
296
319
  public_training_data="https://laion.ai/blog/laion-5b/",
297
- framework=["PyTorch"],
320
+ framework=["PyTorch", "safetensors"],
298
321
  reference="https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
299
322
  similarity_fn_name=ScoringFunction.COSINE,
300
323
  use_instructions=False,
301
324
  training_datasets=set(
302
325
  # 2 Billion sample English subset of LAION-5B
303
326
  ),
327
+ citation=OPENCLIP_CITATION,
304
328
  )
@@ -128,6 +128,7 @@ class SparseEncoderWrapper(AbsEncoder):
128
128
 
129
129
  opensearch_neural_sparse_encoding_doc_v3_gte = ModelMeta(
130
130
  name="opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte",
131
+ model_type=["dense"],
131
132
  languages=["eng-Latn"],
132
133
  open_weights=True,
133
134
  revision="a8abaa916125ee512a7a8f4d706d07eb0128a8e6",
@@ -139,7 +140,7 @@ opensearch_neural_sparse_encoding_doc_v3_gte = ModelMeta(
139
140
  max_tokens=8192,
140
141
  reference="https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte",
141
142
  similarity_fn_name="dot",
142
- framework=["Sentence Transformers", "PyTorch"],
143
+ framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
143
144
  public_training_code="https://github.com/zhichao-aws/opensearch-sparse-model-tuning-sample",
144
145
  public_training_data=True,
145
146
  use_instructions=True,
@@ -153,6 +154,7 @@ opensearch_neural_sparse_encoding_doc_v3_gte = ModelMeta(
153
154
 
154
155
  opensearch_neural_sparse_encoding_doc_v3_distill = ModelMeta(
155
156
  name="opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
157
+ model_type=["dense"],
156
158
  languages=["eng-Latn"],
157
159
  open_weights=True,
158
160
  revision="babf71f3c48695e2e53a978208e8aba48335e3c0",
@@ -164,7 +166,7 @@ opensearch_neural_sparse_encoding_doc_v3_distill = ModelMeta(
164
166
  max_tokens=512,
165
167
  reference="https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
166
168
  similarity_fn_name="dot",
167
- framework=["Sentence Transformers", "PyTorch"],
169
+ framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
168
170
  public_training_code="https://github.com/zhichao-aws/opensearch-sparse-model-tuning-sample",
169
171
  public_training_data=True,
170
172
  use_instructions=True,
@@ -174,6 +176,7 @@ opensearch_neural_sparse_encoding_doc_v3_distill = ModelMeta(
174
176
 
175
177
  opensearch_neural_sparse_encoding_doc_v2_distill = ModelMeta(
176
178
  name="opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill",
179
+ model_type=["dense"],
177
180
  languages=["eng-Latn"],
178
181
  open_weights=True,
179
182
  revision="8921a26c78b8559d6604eb1f5c0b74c079bee38f",
@@ -185,7 +188,7 @@ opensearch_neural_sparse_encoding_doc_v2_distill = ModelMeta(
185
188
  max_tokens=512,
186
189
  reference="https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill",
187
190
  similarity_fn_name="dot",
188
- framework=["Sentence Transformers", "PyTorch"],
191
+ framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
189
192
  public_training_code="https://github.com/zhichao-aws/opensearch-sparse-model-tuning-sample",
190
193
  public_training_data=True,
191
194
  use_instructions=True,
@@ -196,6 +199,7 @@ opensearch_neural_sparse_encoding_doc_v2_distill = ModelMeta(
196
199
 
197
200
  opensearch_neural_sparse_encoding_doc_v2_mini = ModelMeta(
198
201
  name="opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini",
202
+ model_type=["dense"],
199
203
  languages=["eng-Latn"],
200
204
  open_weights=True,
201
205
  revision="4af867a426867dfdd744097531046f4289a32fdd",
@@ -207,7 +211,7 @@ opensearch_neural_sparse_encoding_doc_v2_mini = ModelMeta(
207
211
  max_tokens=512,
208
212
  reference="https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini",
209
213
  similarity_fn_name="dot",
210
- framework=["Sentence Transformers", "PyTorch"],
214
+ framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
211
215
  public_training_code="https://github.com/zhichao-aws/opensearch-sparse-model-tuning-sample",
212
216
  public_training_data=True,
213
217
  use_instructions=True,
@@ -217,6 +221,7 @@ opensearch_neural_sparse_encoding_doc_v2_mini = ModelMeta(
217
221
 
218
222
  opensearch_neural_sparse_encoding_doc_v1 = ModelMeta(
219
223
  name="opensearch-project/opensearch-neural-sparse-encoding-doc-v1",
224
+ model_type=["dense"],
220
225
  languages=["eng-Latn"],
221
226
  open_weights=True,
222
227
  revision="98cdcbd72867c547f72f2b7b7bed9cdf9f09922d",
@@ -228,7 +233,7 @@ opensearch_neural_sparse_encoding_doc_v1 = ModelMeta(
228
233
  max_tokens=512,
229
234
  reference="https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v1",
230
235
  similarity_fn_name="dot",
231
- framework=["Sentence Transformers", "PyTorch"],
236
+ framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
232
237
  public_training_code="https://github.com/zhichao-aws/opensearch-sparse-model-tuning-sample",
233
238
  public_training_data=True,
234
239
  use_instructions=True,
@@ -22,12 +22,13 @@ class OPSWrapper(AbsEncoder):
22
22
 
23
23
  ops_moa_conan_embedding = ModelMeta(
24
24
  name="OpenSearch-AI/Ops-MoA-Conan-embedding-v1",
25
+ model_type=["dense"],
25
26
  revision="46dcd58753f3daa920c66f89e47086a534089350",
26
27
  release_date="2025-03-26",
27
28
  languages=["zho-Hans"],
28
29
  loader=OPSWrapper,
29
30
  n_parameters=int(343 * 1e6),
30
- memory_usage_mb=2e3,
31
+ memory_usage_mb=1308,
31
32
  max_tokens=512,
32
33
  embed_dim=1536,
33
34
  license="cc-by-nc-4.0",
@@ -53,19 +54,20 @@ ops_moa_conan_embedding = ModelMeta(
53
54
 
54
55
  ops_moa_yuan_embedding = ModelMeta(
55
56
  name="OpenSearch-AI/Ops-MoA-Yuan-embedding-1.0",
57
+ model_type=["dense"],
56
58
  revision="23712d0766417b0eb88a2513c6e212a58b543268",
57
59
  release_date="2025-03-26",
58
60
  languages=["zho-Hans"],
59
61
  loader=OPSWrapper,
60
62
  n_parameters=int(343 * 1e6),
61
- memory_usage_mb=2e3,
63
+ memory_usage_mb=1242,
62
64
  max_tokens=512,
63
65
  embed_dim=1536,
64
66
  license="cc-by-nc-4.0",
65
67
  open_weights=True,
66
68
  public_training_code=None,
67
69
  public_training_data=None,
68
- framework=["PyTorch", "Sentence Transformers"],
70
+ framework=["PyTorch", "Sentence Transformers", "safetensors"],
69
71
  reference="https://huggingface.co/OpenSearch-AI/Ops-MoA-Yuan-embedding-1.0",
70
72
  similarity_fn_name="cosine",
71
73
  use_instructions=False,
@@ -14,7 +14,7 @@ solon_embeddings_1_1 = ModelMeta(
14
14
  max_tokens=8192,
15
15
  reference="https://huggingface.co/OrdalieTech/Solon-embeddings-mini-beta-1.1",
16
16
  similarity_fn_name="cosine",
17
- framework=["Sentence Transformers", "PyTorch"],
17
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
18
18
  use_instructions=False,
19
19
  public_training_data=(
20
20
  "https://huggingface.co/datasets/PleIAs/common_corpus; "
@@ -0,0 +1,39 @@
1
+ from mteb.models.model_meta import (
2
+ ModelMeta,
3
+ ScoringFunction,
4
+ )
5
+ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
6
+
7
+ PAWAN_EMBD_CITATION = """@misc{medhi2025pawanembd,
8
+ title={PawanEmbd-68M: Distilled Embedding Model},
9
+ author={Medhi, D.},
10
+ year={2025},
11
+ url={https://huggingface.co/dmedhi/PawanEmbd-68M}
12
+ }"""
13
+
14
+ pawan_embd_68m = ModelMeta(
15
+ loader=sentence_transformers_loader,
16
+ name="dmedhi/PawanEmbd-68M",
17
+ model_type=["dense"],
18
+ languages=["eng-Latn"],
19
+ open_weights=True,
20
+ revision="32f295145802bdbd65699ad65fd27d2a5b69a909",
21
+ release_date="2025-12-08",
22
+ n_parameters=68_000_000,
23
+ memory_usage_mb=260,
24
+ embed_dim=768,
25
+ license="apache-2.0",
26
+ max_tokens=512,
27
+ reference="https://huggingface.co/dmedhi/PawanEmbd-68M",
28
+ similarity_fn_name=ScoringFunction.COSINE,
29
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
30
+ adapted_from="ibm-granite/granite-embedding-278m-multilingual",
31
+ superseded_by=None,
32
+ public_training_code=None,
33
+ public_training_data=None,
34
+ use_instructions=False,
35
+ training_datasets={
36
+ "AllNLI",
37
+ },
38
+ citation=PAWAN_EMBD_CITATION,
39
+ )
@@ -6,6 +6,7 @@ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loade
6
6
  piccolo_base_zh = ModelMeta(
7
7
  loader=sentence_transformers_loader,
8
8
  name="sensenova/piccolo-base-zh",
9
+ model_type=["dense"],
9
10
  languages=["zho-Hans"],
10
11
  open_weights=True,
11
12
  revision="47c0a63b8f667c3482e05b2fd45577bb19252196",
@@ -17,7 +18,7 @@ piccolo_base_zh = ModelMeta(
17
18
  max_tokens=512,
18
19
  reference="https://huggingface.co/sensenova/piccolo-base-zh",
19
20
  similarity_fn_name=ScoringFunction.COSINE,
20
- framework=["Sentence Transformers", "PyTorch"],
21
+ framework=["Sentence Transformers", "PyTorch", "Transformers"],
21
22
  use_instructions=False,
22
23
  superseded_by=None,
23
24
  adapted_from=None,
@@ -29,6 +30,7 @@ piccolo_base_zh = ModelMeta(
29
30
  piccolo_large_zh_v2 = ModelMeta(
30
31
  loader=sentence_transformers_loader,
31
32
  name="sensenova/piccolo-large-zh-v2",
33
+ model_type=["dense"],
32
34
  languages=["zho-Hans"],
33
35
  open_weights=False, # They "temporarily" removed it in may last year
34
36
  # "Due to certain internal company considerations"
@@ -48,4 +50,10 @@ piccolo_large_zh_v2 = ModelMeta(
48
50
  public_training_code=None,
49
51
  public_training_data=None,
50
52
  training_datasets=None, # They don't say
53
+ citation="""@misc{2405.06932,
54
+ Author = {Junqin Huang and Zhongjie Hu and Zihao Jing and Mengya Gao and Yichao Wu},
55
+ Title = {Piccolo2: General Text Embedding with Multi-task Hybrid Loss Training},
56
+ Year = {2024},
57
+ Eprint = {arXiv:2405.06932},
58
+ }""",
51
59
  )
@@ -0,0 +1,56 @@
1
+ from mteb.models.model_implementations.arctic_models import (
2
+ ARCTIC_V2_CITATION,
3
+ LANGUAGES_V2_0,
4
+ arctic_v2_training_datasets,
5
+ )
6
+ from mteb.models.model_meta import (
7
+ ModelMeta,
8
+ ScoringFunction,
9
+ )
10
+ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
11
+
12
+ PIXIE_RUNE_V1_CITATION = """@misc{TelePIX-PIXIE-Rune-v1.0,
13
+ title = {PIXIE-Rune-v1.0},
14
+ author = {TelePIX AI Research Team and Bongmin Kim},
15
+ year = {2026},
16
+ howpublished = {Hugging Face model card},
17
+ url = {https://huggingface.co/telepix/PIXIE-Rune-v1.0}
18
+ }"""
19
+
20
+ PIXIE_RUNE_V1_PROMPTS = {
21
+ "query": "query: ",
22
+ "document": "",
23
+ }
24
+
25
+ # it is further fine-tuned on TelePIX proprietary IR data (not public).
26
+ pixie_rune_v1_training_datasets = set(arctic_v2_training_datasets) | {
27
+ "TelePIX-Proprietary-IR-Triplets",
28
+ }
29
+
30
+ pixie_rune_v1_0 = ModelMeta(
31
+ loader=sentence_transformers_loader,
32
+ loader_kwargs={
33
+ "model_prompts": PIXIE_RUNE_V1_PROMPTS,
34
+ },
35
+ name="telepix/PIXIE-Rune-v1.0",
36
+ model_type=["dense"],
37
+ revision="b2486496da71191626666a88f9bfec844933a134",
38
+ release_date="2026-01-15",
39
+ languages=LANGUAGES_V2_0,
40
+ open_weights=True,
41
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
42
+ n_parameters=567754752,
43
+ memory_usage_mb=2166,
44
+ max_tokens=6144,
45
+ embed_dim=1024,
46
+ license="apache-2.0",
47
+ reference="https://huggingface.co/telepix/PIXIE-Rune-v1.0",
48
+ similarity_fn_name=ScoringFunction.COSINE,
49
+ use_instructions=True,
50
+ adapted_from="Snowflake/snowflake-arctic-embed-l-v2.0",
51
+ superseded_by=None,
52
+ public_training_code=None,
53
+ public_training_data=None,
54
+ training_datasets=pixie_rune_v1_training_datasets,
55
+ citation=PIXIE_RUNE_V1_CITATION + "\n\n" + ARCTIC_V2_CITATION,
56
+ )
@@ -75,12 +75,13 @@ promptriever_llama2 = ModelMeta(
75
75
  model_prompts=model_prompts,
76
76
  ),
77
77
  name="samaya-ai/promptriever-llama2-7b-v1",
78
+ model_type=["dense"],
78
79
  languages=["eng-Latn"],
79
80
  open_weights=True,
80
81
  revision="01c7f73d771dfac7d292323805ebc428287df4f9-30b14e3813c0fa45facfd01a594580c3fe5ecf23", # base-peft revision
81
82
  release_date="2024-09-15",
82
83
  n_parameters=7_000_000_000,
83
- memory_usage_mb=27,
84
+ memory_usage_mb=26703,
84
85
  max_tokens=4096,
85
86
  embed_dim=4096,
86
87
  license="apache-2.0",
@@ -89,7 +90,7 @@ promptriever_llama2 = ModelMeta(
89
90
  ),
90
91
  reference="https://huggingface.co/samaya-ai/promptriever-llama2-7b-v1",
91
92
  similarity_fn_name=ScoringFunction.COSINE,
92
- framework=["PyTorch", "Tevatron"],
93
+ framework=["PyTorch", "Tevatron", "safetensors"],
93
94
  use_instructions=True,
94
95
  citation=PROMPTRIEVER_CITATION,
95
96
  public_training_code=None,
@@ -106,6 +107,7 @@ promptriever_llama3 = ModelMeta(
106
107
  model_prompts=model_prompts,
107
108
  ),
108
109
  name="samaya-ai/promptriever-llama3.1-8b-v1",
110
+ model_type=["dense"],
109
111
  languages=["eng-Latn"],
110
112
  open_weights=True,
111
113
  revision="48d6d0fc4e02fb1269b36940650a1b7233035cbb-2ead22cfb1b0e0c519c371c63c2ab90ffc511b8a", # base-peft revision
@@ -115,13 +117,13 @@ promptriever_llama3 = ModelMeta(
115
117
  },
116
118
  release_date="2024-09-15",
117
119
  n_parameters=8_000_000_000,
118
- memory_usage_mb=31,
120
+ memory_usage_mb=30518,
119
121
  max_tokens=8192,
120
122
  embed_dim=4096,
121
123
  license="apache-2.0",
122
124
  reference="https://huggingface.co/samaya-ai/promptriever-llama3.1-8b-v1",
123
125
  similarity_fn_name=ScoringFunction.COSINE,
124
- framework=["PyTorch", "Tevatron"],
126
+ framework=["PyTorch", "Tevatron", "safetensors"],
125
127
  use_instructions=True,
126
128
  citation=PROMPTRIEVER_CITATION,
127
129
  public_training_code=None,
@@ -138,12 +140,13 @@ promptriever_llama3_instruct = ModelMeta(
138
140
  model_prompts=model_prompts,
139
141
  ),
140
142
  name="samaya-ai/promptriever-llama3.1-8b-instruct-v1",
143
+ model_type=["dense"],
141
144
  languages=["eng-Latn"],
142
145
  open_weights=True,
143
146
  revision="5206a32e0bd3067aef1ce90f5528ade7d866253f-8b677258615625122c2eb7329292b8c402612c21", # base-peft revision
144
147
  release_date="2024-09-15",
145
148
  n_parameters=8_000_000_000,
146
- memory_usage_mb=31,
149
+ memory_usage_mb=30518,
147
150
  max_tokens=8192,
148
151
  embed_dim=4096,
149
152
  training_datasets={
@@ -153,7 +156,7 @@ promptriever_llama3_instruct = ModelMeta(
153
156
  license="apache-2.0",
154
157
  reference="https://huggingface.co/samaya-ai/promptriever-llama3.1-8b-instruct-v1",
155
158
  similarity_fn_name=ScoringFunction.COSINE,
156
- framework=["PyTorch", "Tevatron"],
159
+ framework=["PyTorch", "Tevatron", "safetensors"],
157
160
  use_instructions=True,
158
161
  citation=PROMPTRIEVER_CITATION,
159
162
  public_training_code=None,
@@ -170,12 +173,13 @@ promptriever_mistral_v1 = ModelMeta(
170
173
  model_prompts=model_prompts,
171
174
  ),
172
175
  name="samaya-ai/promptriever-mistral-v0.1-7b-v1",
176
+ model_type=["dense"],
173
177
  languages=["eng-Latn"],
174
178
  open_weights=True,
175
179
  revision="7231864981174d9bee8c7687c24c8344414eae6b-876d63e49b6115ecb6839893a56298fadee7e8f5", # base-peft revision
176
180
  release_date="2024-09-15",
177
181
  n_parameters=7_000_000_000,
178
- memory_usage_mb=27,
182
+ memory_usage_mb=26703,
179
183
  training_datasets={
180
184
  # "samaya-ai/msmarco-w-instructions",
181
185
  "mMARCO-NL", # translation not trained on
@@ -185,7 +189,7 @@ promptriever_mistral_v1 = ModelMeta(
185
189
  license="apache-2.0",
186
190
  reference="https://huggingface.co/samaya-ai/promptriever-mistral-v0.1-7b-v1",
187
191
  similarity_fn_name=ScoringFunction.COSINE,
188
- framework=["PyTorch", "Tevatron"],
192
+ framework=["PyTorch", "Tevatron", "safetensors"],
189
193
  use_instructions=True,
190
194
  citation=PROMPTRIEVER_CITATION,
191
195
  public_training_code=None,