PyPI - mteb - Versions diffs - 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl - Mend

mteb 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (412)

mteb/languages/language_scripts.py CHANGED Viewed

@@ -3,6 +3,8 @@ from dataclasses import dataclass
 from typing_extensions import Self
+from mteb.languages import check_language_code
 @dataclass
 class LanguageScripts:
@@ -46,8 +48,10 @@ class LanguageScripts:
                 if len(lang_script) == 2:
                     normalized_langs.add(lang_script[0])
                     lang_script_codes.add(lang)
+                    check_language_code(lang)
                     script_codes.add(lang_script[1])
                 else:
+                    check_language_code(lang)
                     normalized_langs.add(lang)
         return cls(

mteb/leaderboard/app.py CHANGED Viewed

@@ -107,7 +107,9 @@ def _update_description(
     description += f" - **Number of task types**: {n_task_types}\n"
     description += f" - **Number of domains**: {n_domains}\n"
     if benchmark.reference is not None:
-        description += f"\n[Click for More Info]({benchmark.reference})"
+        description += (
+            f'\n<a href="{benchmark.reference}" target="_blank">Click for More Info</a>'
+        )
     return description
@@ -137,7 +139,7 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
     df["languages"] = df["languages"].map(_format_list)
     df = df.sort_values("name")
     df["domains"] = df["domains"].map(_format_list)
-    df["name"] = "[" + df["name"] + "](" + df["reference"] + ")"
+    df["name"] = f'<a href="{df["reference"]}" target="_blank">{df["name"]}</a>'
     df["modalities"] = df["modalities"].map(_format_list)
     df = df.rename(
         columns={
@@ -318,7 +320,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
             """
         ## Embedding Leaderboard
-        This leaderboard compares 100+ text and image embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. Anyone is welcome [to add a model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md), [add benchmarks](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_benchmark.md), [help us improve zero-shot annotations](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91) or [propose other changes to the leaderboard](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/leaderboard).
+        This leaderboard compares 100+ text and image embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. Anyone is welcome [to add a model](https://embeddings-benchmark.github.io/mteb/contributing/adding_a_model/), [add benchmarks](https://embeddings-benchmark.github.io/mteb/contributing/adding_a_benchmark/), [help us improve zero-shot annotations](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91) or [propose other changes to the leaderboard](https://github.com/embeddings-benchmark/mteb/issues/new?template=enhancement.yaml).
         """
         )
         gr.Markdown(

mteb/leaderboard/benchmark_selector.py CHANGED Viewed

@@ -73,6 +73,7 @@ GP_BENCHMARK_ENTRIES = [
                         "MTEB(fra, v1)",
                         "MTEB(jpn, v1)",
                         "MTEB(kor, v1)",
+                        "MTEB(nld, v1)",
                         "MTEB(pol, v1)",
                         "MTEB(rus, v1)",
                         "MTEB(fas, v2)",
@@ -109,10 +110,11 @@ R_BENCHMARK_ENTRIES = [
             MenuEntry(
                 "Image",
                 description=None,
-                open=False,
+                open=True,
                 benchmarks=[
-                    mteb.get_benchmark("VisualDocumentRetrieval"),
+                    mteb.get_benchmark("ViDoRe(v3)"),
                     mteb.get_benchmark("JinaVDR"),
+                    MenuEntry("Other", [mteb.get_benchmark("ViDoRe(v1&v2)")]),
                 ],
             ),
             MenuEntry(

mteb/leaderboard/text_segments.py CHANGED Viewed

@@ -53,7 +53,7 @@ ACKNOWLEDGEMENT = """
         <img src="https://play-lh.googleusercontent.com/HdfHZ5jnfMM1Ep7XpPaVdFIVSRx82wKlRC_qmnHx9H1E4aWNp4WKoOcH0x95NAnuYg" width="60" height="55" style="padding: 10px;">
     </a>
     <a href="https://huggingface.co">
-        <img src="https://raw.githubusercontent.com/embeddings-benchmark/mteb/main/docs/images/hf_logo.png" width="60" height="55" style="padding: 10px;">
+        <img src="https://raw.githubusercontent.com/embeddings-benchmark/mteb/main/docs/images/logos/hf_logo.png" width="60" height="55" style="padding: 10px;">
     </a>
 </div>

mteb/models/cache_wrappers/cache_wrapper.py CHANGED Viewed

@@ -112,7 +112,7 @@ class CachedEmbeddingWrapper:
                     dataset,
                     task_metadata=task_metadata,
                     prompt_type=prompt_type,
-                    batch_size=batch_size,
+                    **kwargs,
                 )
                 new_vectors = self._model.encode(
                     dl,

mteb/models/instruct_wrapper.py CHANGED Viewed

@@ -153,6 +153,9 @@ class InstructSentenceTransformerModel(AbsEncoder):
         self.model_name = model_name
         self.model = SentenceTransformer(model_name, revision=revision, **kwargs)
+        if max_seq_length:
+            # https://github.com/huggingface/sentence-transformers/issues/3575
+            self.model.max_seq_length = max_seq_length
         self.apply_instruction_to_passages = apply_instruction_to_passages
         self.prompts_dict = prompts_dict

mteb/models/model_implementations/align_models.py CHANGED Viewed

@@ -124,4 +124,10 @@ align_base = ModelMeta(
     training_datasets=set(
         #  COYO-700M
     ),
+    citation="""@misc{kakaobrain2022coyo-align,
+    title         = {COYO-ALIGN},
+    author        = {Yoon, Boogeo and Lee, Youhan and Baek, Woonhyuk},
+    year          = {2022},
+    howpublished  = {https://github.com/kakaobrain/coyo-align},
+}""",
 )

mteb/models/model_implementations/andersborges.py ADDED Viewed

@@ -0,0 +1,51 @@
+import numpy as np
+from mteb.models.model_implementations.model2vec_models import Model2VecModel
+from mteb.models.model_meta import ModelMeta, ScoringFunction
+model2vecdk = ModelMeta(
+    loader=Model2VecModel,  # type: ignore
+    name="andersborges/model2vecdk",
+    languages=["dan-Latn"],
+    open_weights=True,
+    revision="cb576c78dcc1b729e4612645f61db59929d69e61",
+    release_date="2025-11-21",
+    n_parameters=48042496,
+    memory_usage_mb=183,
+    max_tokens=np.inf,
+    embed_dim=256,
+    license="mit",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["NumPy", "Sentence Transformers"],
+    reference="https://huggingface.co/andersborges/model2vecdk",
+    use_instructions=False,
+    adapted_from="https://huggingface.co/jealk/TTC-L2V-supervised-2",
+    superseded_by=None,
+    training_datasets=set(),  # distilled
+    public_training_code="https://github.com/andersborges/dkmodel2vec",
+    public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
+)
+model2vecdk_stem = ModelMeta(
+    loader=Model2VecModel,  # type: ignore
+    name="andersborges/model2vecdk-stem",
+    languages=["dan-Latn"],
+    open_weights=True,
+    revision="cb576c78dcc1b729e4612645f61db59929d69e61",
+    release_date="2025-11-21",
+    n_parameters=48578560,
+    memory_usage_mb=185,
+    max_tokens=np.inf,
+    embed_dim=256,
+    license="mit",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["NumPy", "Sentence Transformers"],
+    reference="https://huggingface.co/andersborges/model2vecdk",
+    use_instructions=False,
+    adapted_from="https://huggingface.co/jealk/TTC-L2V-supervised-2",
+    superseded_by=None,
+    training_datasets=set(),  # distilled
+    public_training_code="https://github.com/andersborges/dkmodel2vec",
+    public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
+)

mteb/models/model_implementations/ara_models.py CHANGED Viewed

@@ -23,4 +23,11 @@ arabic_triplet_matryoshka = ModelMeta(
     training_datasets=set(
         #  "akhooli/arabic-triplets-1m-curated-sims-len"
     ),
+    citation="""
+    @article{nacar2025gate,
+    title={GATE: General Arabic Text Embedding for Enhanced Semantic Textual Similarity with Matryoshka Representation Learning and Hybrid Loss Training},
+    author={Nacar, Omer and Koubaa, Anis and Sibaee, Serry and Al-Habashi, Yasser and Ammar, Adel and Boulila, Wadii},
+    journal={arXiv preprint arXiv:2505.24581},
+    year={2025}
+}""",
 )

mteb/models/model_implementations/b1ade_models.py CHANGED Viewed

@@ -2,7 +2,7 @@ from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
 b1ade_training_data = {
-    # We are in teh process of submitting a paper outlining our process of creating b1ade using model merging and knowledge distillation.
+    # We are in the process of submitting a paper outlining our process of creating b1ade using model merging and knowledge distillation.
     # Similar to mixedbread models, we do not train on any data (except the MSMarco training split) of MTEB.
     "MSMARCO",
 }

mteb/models/model_implementations/bge_models.py CHANGED Viewed

@@ -62,7 +62,7 @@ bge_m3_training_data = {
     # mMARCO-ZH
     # LawGPT
     # NLI-zh2, LeCaRDv2,
-    # NLI, MultiLongDoc (their syntetic)
+    # NLI, MultiLongDoc (their synthetic)
     # + synthetic data
 }
@@ -141,7 +141,6 @@ bge_chinese_training_data = {
 # https://huggingface.co/BAAI/bge-m3/discussions/29
 bgem3_languages = [
     "afr-Latn",  # af
-    # als
     "amh-Ethi",  # am
     # an
     # ar
@@ -151,7 +150,6 @@ bgem3_languages = [
     # av
     # az
     "azj-Latn",  # azb
-    # ba
     # bar
     # bcl
     "ben-Beng",  # be

mteb/models/model_implementations/blip2_models.py CHANGED Viewed

@@ -10,6 +10,13 @@ from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.types import Array, BatchedInput, PromptType
+BLIP2_CITATION = """@inproceedings{li2023blip2,
+    title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
+    author={Junnan Li and Dongxu Li and Silvio Savarese and Steven Hoi},
+    year={2023},
+    booktitle={ICML},
+}"""
 def blip2_loader(model_name, **kwargs):
     requires_package(
@@ -176,6 +183,7 @@ blip2_opt_2_7b = ModelMeta(
     similarity_fn_name=ScoringFunction.COSINE,
     use_instructions=False,
     training_datasets=blip2_training_datasets,
+    citation=BLIP2_CITATION,
 )
 blip2_opt_6_7b_coco = ModelMeta(
@@ -198,4 +206,5 @@ blip2_opt_6_7b_coco = ModelMeta(
     similarity_fn_name=ScoringFunction.COSINE,
     use_instructions=False,
     training_datasets=blip2_training_datasets,
+    citation=BLIP2_CITATION,
 )

mteb/models/model_implementations/blip_models.py CHANGED Viewed

@@ -10,6 +10,17 @@ from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.types import Array, BatchedInput, PromptType
+BLIP_CITATION = """@misc{https://doi.org/10.48550/arxiv.2201.12086,
+    doi = {10.48550/ARXIV.2201.12086},
+    url = {https://arxiv.org/abs/2201.12086},
+    author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
+    keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
+    title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
+    publisher = {arXiv},
+    year = {2022},
+    copyright = {Creative Commons Attribution 4.0 International}
+}"""
 class BLIPModel(AbsEncoder):
     def __init__(
@@ -140,6 +151,7 @@ blip_image_captioning_large = ModelMeta(
         # CC3M+CC12M+SBU
         # LAION115M
     ),
+    citation=BLIP_CITATION,
 )
 blip_image_captioning_base = ModelMeta(
@@ -166,6 +178,7 @@ blip_image_captioning_base = ModelMeta(
         # CC3M+CC12M+SBU
         # LAION115M
     ),
+    citation=BLIP_CITATION,
 )
@@ -192,6 +205,7 @@ blip_vqa_base = ModelMeta(
         # CC3M+CC12M+SBU
         # LAION115M
     ),
+    citation=BLIP_CITATION,
 )
 blip_vqa_capfilt_large = ModelMeta(
@@ -217,6 +231,7 @@ blip_vqa_capfilt_large = ModelMeta(
         # CC3M+CC12M+SBU
         # LAION115M
     ),
+    citation=BLIP_CITATION,
 )
 blip_itm_base_coco = ModelMeta(
@@ -242,6 +257,7 @@ blip_itm_base_coco = ModelMeta(
         # CC3M+CC12M+SBU
         # LAION115M
     ),
+    citation=BLIP_CITATION,
 )
 blip_itm_large_coco = ModelMeta(
@@ -268,6 +284,7 @@ blip_itm_large_coco = ModelMeta(
         # CC3M+CC12M+SBU
         # LAION115M
     ),
+    citation=BLIP_CITATION,
 )
 blip_itm_base_flickr = ModelMeta(
@@ -294,6 +311,7 @@ blip_itm_base_flickr = ModelMeta(
         # LAION115M
         # Flickr30k
     ),
+    citation=BLIP_CITATION,
 )
 blip_itm_large_flickr = ModelMeta(
@@ -319,4 +337,5 @@ blip_itm_large_flickr = ModelMeta(
         # CC3M+CC12M+SBU
         # LAION115M
     ),
+    citation=BLIP_CITATION,
 )

mteb/models/model_implementations/bmretriever_models.py CHANGED Viewed

@@ -48,7 +48,7 @@ class BMRetrieverWrapper(InstructSentenceTransformerModel):
         if padding_side is not None:
             tokenizer_params["padding_side"] = padding_side
         kwargs.setdefault("tokenizer_args", {}).update(tokenizer_params)
-        kwargs.setdefault("config_args", {}).update(revison=revision)
+        kwargs.setdefault("config_args", {}).update(revision=revision)
         transformer = Transformer(
             model_name,

mteb/models/model_implementations/cadet_models.py CHANGED Viewed

@@ -3,6 +3,13 @@ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loade
 from .bge_models import bge_m3_training_data
+CADET_CITATION = """@article{tamber2025conventionalcontrastivelearningfalls,
+    title={Conventional Contrastive Learning Often Falls Short: Improving Dense Retrieval with Cross-Encoder Listwise Distillation and Synthetic Data},
+    author={Manveer Singh Tamber and Suleman Kazi and Vivek Sourabh and Jimmy Lin},
+    journal={arXiv:2505.19274},
+    year={2025}
+}"""
 cadet_training_data = {
     # we train with the corpora of FEVER, MSMARCO, and DBPEDIA. We only train with synthetic generated queries.
     # However, we do use queries from MSMARCO as examples for synthetic query generation.
@@ -46,4 +53,5 @@ cadet_embed = ModelMeta(
     public_training_data="https://github.com/manveertamber/cadet-dense-retrieval",
     training_datasets=cadet_training_data,
     adapted_from="intfloat/e5-base-unsupervised",
+    citation=CADET_CITATION,
 )

mteb/models/model_implementations/cde_models.py CHANGED Viewed

@@ -24,6 +24,16 @@ if TYPE_CHECKING:
     )
 logger = logging.getLogger(__name__)
+CDE_CITATION = """@misc{morris2024contextualdocumentembeddings,
+    title={Contextual Document Embeddings},
+    author={John X. Morris and Alexander M. Rush},
+    year={2024},
+    eprint={2410.02525},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL},
+    url={https://arxiv.org/abs/2410.02525},
+}"""
 class CDEWrapper(SentenceTransformerEncoderWrapper):
     dataset_embeddings: torch.Tensor | None = None
@@ -217,6 +227,7 @@ cde_small_v1 = ModelMeta(
     training_datasets=bge_full_data,
     public_training_code="https://github.com/jxmorris12/cde",
     public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
+    citation=CDE_CITATION,
 )
 cde_small_v2 = ModelMeta(
@@ -244,4 +255,5 @@ cde_small_v2 = ModelMeta(
     training_datasets=bge_full_data,
     public_training_code="https://github.com/jxmorris12/cde",
     public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
+    citation=CDE_CITATION,
 )

mteb/models/model_implementations/codefuse_models.py CHANGED Viewed

@@ -2,6 +2,18 @@ from mteb.models import ModelMeta
 from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
 from mteb.types import PromptType
+F2LLM_CITATION = """@article{2025F2LLM,
+    title={F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data},
+    author={Ziyin Zhang and Zihan Liao and Hang Yu and Peng Di and Rui Wang},
+    journal={CoRR},
+    volume={abs/2510.02294},
+    year={2025},
+    url={https://doi.org/10.48550/arXiv.2510.02294},
+    doi={10.48550/ARXIV.2510.02294},
+    eprinttype={arXiv},
+    eprint={2510.02294}
+}"""
 training_datasets = {
     "MSMARCO",
     "ArguAna",
@@ -146,6 +158,7 @@ F2LLM_0B6 = ModelMeta(
     public_training_code="https://github.com/codefuse-ai/F2LLM",
     public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
     training_datasets=training_datasets,
+    citation=F2LLM_CITATION,
 )
 F2LLM_1B7 = ModelMeta(
@@ -174,6 +187,7 @@ F2LLM_1B7 = ModelMeta(
     public_training_code="https://github.com/codefuse-ai/F2LLM",
     public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
     training_datasets=training_datasets,
+    citation=F2LLM_CITATION,
 )
 F2LLM_4B = ModelMeta(
@@ -202,4 +216,5 @@ F2LLM_4B = ModelMeta(
     public_training_code="https://github.com/codefuse-ai/F2LLM",
     public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
     training_datasets=training_datasets,
+    citation=F2LLM_CITATION,
 )

mteb/models/model_implementations/codesage_models.py CHANGED Viewed

@@ -1,6 +1,15 @@
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
+CODESAGE_CITATION = """@inproceedings{
+    zhang2024code,
+    title={{CODE} {REPRESENTATION} {LEARNING} {AT} {SCALE}},
+    author={Dejiao Zhang and Wasi Uddin Ahmad and Ming Tan and Hantian Ding and Ramesh Nallapati and Dan Roth and Xiaofei Ma and Bing Xiang},
+    booktitle={The Twelfth International Conference on Learning Representations},
+    year={2024},
+    url={https://openreview.net/forum?id=vfzRRjumpX}
+}"""
 codesage_languages = [
     "python-Code",
     "javascript-Code",
@@ -33,6 +42,7 @@ codesage_large = ModelMeta(
         "CodeSearchNetRetrieval",
         "CodeSearchNetCCRetrieval",
     },
+    citation=CODESAGE_CITATION,
 )
 codesage_base = ModelMeta(
@@ -58,6 +68,7 @@ codesage_base = ModelMeta(
         "CodeSearchNetRetrieval",
         "CodeSearchNetCCRetrieval",
     },
+    citation=CODESAGE_CITATION,
 )
 codesage_small = ModelMeta(
@@ -83,4 +94,5 @@ codesage_small = ModelMeta(
         "CodeSearchNetRetrieval",
         "CodeSearchNetCCRetrieval",
     },
+    citation=CODESAGE_CITATION,
 )

mteb/models/model_implementations/cohere_models.py CHANGED Viewed

@@ -221,7 +221,7 @@ class CohereTextEmbeddingModel(AbsEncoder):
     ) -> None:
         import cohere  # type: ignore
-        self.model_name = model_name.lstrip("Cohere/Cohere-")
+        self.model_name = model_name.removeprefix("Cohere/Cohere-")
         self.sep = sep
         self.model_prompts = self.validate_task_to_prompt_name(model_prompts)
         if embedding_type not in get_args(EmbeddingType):

mteb/models/model_implementations/colqwen_models.py CHANGED Viewed

@@ -220,3 +220,60 @@ colnomic_7b = ModelMeta(
     training_datasets=COLNOMIC_TRAINING_DATA,
     citation=COLNOMIC_CITATION,
 )
+EVOQWEN_TRAINING_DATA = {
+    "colpali_train_set",
+    "VisRAG-Ret-Train-Synthetic-data",
+    "VisRAG-Ret-Train-In-domain-data",
+}
+evoqwen25_vl_retriever_3b_v1 = ModelMeta(
+    loader=ColQwen2_5Wrapper,
+    loader_kwargs=dict(
+        torch_dtype=torch.float16, attn_implementation="flash_attention_2"
+    ),
+    name="ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-3B-v1",
+    languages=["eng-Latn"],
+    revision="aeacaa2775f2758d82721eb1cf2f5daf1a392da9",
+    release_date="2025-11-04",
+    modalities=["image", "text"],
+    n_parameters=3_000_000_000,
+    memory_usage_mb=7200,
+    max_tokens=128000,
+    embed_dim=128,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code="https://github.com/illuin-tech/colpali",
+    public_training_data="https://huggingface.co/datasets/vidore/colpali_train_set",
+    framework=["ColPali"],
+    reference="https://huggingface.co/ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-3B-v1",
+    similarity_fn_name="MaxSim",
+    use_instructions=True,
+    training_datasets=EVOQWEN_TRAINING_DATA,
+)
+evoqwen25_vl_retriever_7b_v1 = ModelMeta(
+    loader=ColQwen2_5Wrapper,
+    loader_kwargs=dict(
+        torch_dtype=torch.float16, attn_implementation="flash_attention_2"
+    ),
+    name="ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-7B-v1",
+    languages=["eng-Latn"],
+    revision="8952ac6ee0e7de2e9211b165921518caf9202110",
+    release_date="2025-11-04",
+    modalities=["image", "text"],
+    n_parameters=7_000_000_000,
+    memory_usage_mb=14400,
+    max_tokens=128000,
+    embed_dim=128,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code="https://github.com/illuin-tech/colpali",
+    public_training_data="https://huggingface.co/datasets/vidore/colpali_train_set",
+    framework=["ColPali"],
+    reference="https://huggingface.co/ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-7B-v1",
+    similarity_fn_name="MaxSim",
+    use_instructions=True,
+    training_datasets=EVOQWEN_TRAINING_DATA,
+)

mteb/models/model_implementations/emillykkejensen_models.py ADDED Viewed

@@ -0,0 +1,70 @@
+from mteb.models.model_meta import ModelMeta
+from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
+embedding_gemma_300m_scandi = ModelMeta(
+    loader=sentence_transformers_loader,  # type: ignore
+    name="emillykkejensen/EmbeddingGemma-Scandi-300m",
+    languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"],
+    open_weights=True,
+    revision="9f3307b9f601db564a9190cb475324d128dcfe86",
+    release_date="2025-10-17",
+    n_parameters=307_581_696,
+    embed_dim=768,
+    max_tokens=2048,
+    license="apache-2.0",
+    reference="https://huggingface.co/emillykkejensen/EmbeddingGemma-Scandi-300m",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
+    training_datasets=set(),
+    similarity_fn_name="cosine",  # type: ignore[arg-type]
+    adapted_from="google/embeddinggemma-300m",
+    memory_usage_mb=578,
+)
+qwen_scandi = ModelMeta(
+    loader=sentence_transformers_loader,  # type: ignore
+    name="emillykkejensen/Qwen3-Embedding-Scandi-0.6B",
+    languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"],
+    open_weights=True,
+    revision="cf1e7ba36ebd3d605549d8f02930a18e17b54513",
+    release_date="2025-10-17",
+    n_parameters=595776512,
+    memory_usage_mb=2272,
+    embed_dim=1024,
+    max_tokens=32768,
+    license="apache-2.0",
+    reference="https://huggingface.co/emillykkejensen/Qwen3-Embedding-Scandi-0.6B",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
+    training_datasets=set(),
+    similarity_fn_name="cosine",  # type: ignore[arg-type]
+    adapted_from="Qwen/Qwen3-Embedding-0.6B",
+)
+mmbert_scandi = ModelMeta(
+    loader=sentence_transformers_loader,  # type: ignore
+    name="emillykkejensen/mmBERTscandi-base-embedding",
+    languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"],
+    open_weights=True,
+    revision="82d74c7a5d8e1ddf31b132865df2d16b2b0294ee",
+    release_date="2025-10-17",
+    n_parameters=306939648,
+    memory_usage_mb=1171,
+    embed_dim=768,
+    max_tokens=8192,
+    license="apache-2.0",
+    reference="https://huggingface.co/emillykkejensen/Qwen3-Embedding-Scandi-0.6B",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
+    training_datasets=set(),
+    similarity_fn_name="cosine",  # type: ignore[arg-type]
+    adapted_from="jonasaise/scandmmBERT-base-scandinavian",
+)

mteb/models/model_implementations/gme_v_models.py CHANGED Viewed

@@ -39,7 +39,7 @@ class Encoder(torch.nn.Module):
         self.max_length = max_length
         self.normalize = normalize
         self.processor.tokenizer.padding_side = "right"
-        self.defualt_instruction = "You are a helpful assistant."
+        self.default_instruction = "You are a helpful assistant."
     def forward(
         self,
@@ -103,7 +103,7 @@ class Encoder(torch.nn.Module):
         instruction=None,
         **kwargs,
     ):
-        instruction = instruction or self.defualt_instruction
+        instruction = instruction or self.default_instruction
         # Inputs must be batched
         input_texts, input_images = [], []
         for t, i in zip(texts, images):

mteb/models/model_implementations/ibm_granite_models.py CHANGED Viewed

@@ -79,7 +79,7 @@ granite_training_data = {
     "MIRACLReranking",
     # Multilingual MrTydi Triples
     "MrTidyRetrieval",
-    # Sadeeem Question Asnwering
+    # Sadeeem Question Answering
     # DBPedia Title-Body Pairs
     "DBPedia",
     "DBPedia-NL",  # translated from hotpotQA (not trained on)

mteb/models/model_implementations/inf_models.py CHANGED Viewed

@@ -4,7 +4,7 @@ from mteb.models.model_meta import (
 )
 from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
-inf_retreiver_v1_training_data = {
+inf_retriever_v1_training_data = {
     # eng_Latn
     "ArguAna",
     "CQADupstackRetrieval",
@@ -66,7 +66,7 @@ inf_retriever_v1 = ModelMeta(
     adapted_from="Alibaba-NLP/gte-Qwen2-7B-instruct",
     public_training_code=None,
     public_training_data=None,
-    training_datasets=inf_retreiver_v1_training_data,
+    training_datasets=inf_retriever_v1_training_data,
     citation=INF_RETRIEVER_CITATION,
 )
@@ -92,6 +92,6 @@ inf_retriever_v1_1_5b = ModelMeta(
     adapted_from="Alibaba-NLP/gte-Qwen2-1.5B-instruct",
     public_training_code=None,
     public_training_data=None,
-    training_datasets=inf_retreiver_v1_training_data,
+    training_datasets=inf_retriever_v1_training_data,
     citation=INF_RETRIEVER_CITATION,
 )

mteb 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl

mteb 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl