mteb-2.1.10-py3-none-any.whl → mteb-2.1.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/mteb/benchmarks/benchmarks/benchmarks.py
+++ b/mteb/benchmarks/benchmarks/benchmarks.py
@@ -1647,7 +1647,7 @@ MTEB_NL = Benchmark(
     exclusive_language_filter=True,
     tasks=[
         # Classification
-        "DutchBookReviewSentimentClassification",
+        "DutchBookReviewSentimentClassification.v2",
         "MassiveIntentClassification",
         "MassiveScenarioClassification",
         "SIB200Classification",
@@ -1678,10 +1678,10 @@ MTEB_NL = Benchmark(
         # # Reranking
         "WikipediaRerankingMultilingual",
         # # Retrieval
-        "ArguAna-NL",
-        "SCIDOCS-NL",
-        "SciFact-NL",
-        "NFCorpus-NL",
+        "ArguAna-NL.v2",
+        "SCIDOCS-NL.v2",
+        "SciFact-NL.v2",
+        "NFCorpus-NL.v2",
         "BelebeleRetrieval",
         "WebFAQRetrieval",
         "DutchNewsArticlesRetrieval",
--- a/mteb/models/model_implementations/emillykkejensen_models.py
+++ b/mteb/models/model_implementations/emillykkejensen_models.py
@@ -19,7 +19,7 @@ embedding_gemma_300m_scandi = ModelMeta(
     public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
     training_datasets=set(),
     similarity_fn_name="cosine",  # type: ignore[arg-type]
-    adapted_from="emillykkejensen/EmbeddingGemma-Scandi-300m",
+    adapted_from="google/embeddinggemma-300m",
     memory_usage_mb=578,
 )
 
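This is a metadata-only correction: the Scandinavian EmbeddingGemma fine-tune apparently listed itself as its adapted_from source, and now points at the google/embeddinggemma-300m base model. A hedged sketch for reading the field back, assuming the entry is registered under its Hugging Face repo id:

    import mteb

    # Assumption: the ModelMeta above is registered under this repo id.
    meta = mteb.get_model_meta("emillykkejensen/EmbeddingGemma-Scandi-300m")
    print(meta.adapted_from)  # "google/embeddinggemma-300m" as of 2.1.12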
--- a/mteb/models/model_implementations/jasper_models.py
+++ b/mteb/models/model_implementations/jasper_models.py
@@ -7,14 +7,34 @@ from torch.utils.data import DataLoader
 
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
+from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
+from mteb.models.model_implementations.bge_models import (
+    bge_chinese_training_data,
+    bge_full_data,
+    bge_m3_training_data,
+)
+from mteb.models.model_implementations.e5_instruct import E5_MISTRAL_TRAINING_DATA
+from mteb.models.model_implementations.nvidia_models import nvidia_training_datasets
+from mteb.models.model_implementations.qzhou_models import qzhou_training_data
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.types import Array, BatchedInput, PromptType
 
-from .nvidia_models import nvidia_training_datasets
-
 logger = logging.getLogger(__name__)
 
 
+def instruction_template(
+    instruction: str, prompt_type: PromptType | None = None
+) -> str:
+    if not instruction or prompt_type == PromptType.document:
+        return ""
+    if isinstance(instruction, dict):
+        if prompt_type is None:
+            instruction = "Given a web search query, retrieve relevant passages that answer the query"
+        else:
+            instruction = instruction[prompt_type]
+    return f"Instruct: {instruction}\nQuery:"
+
+
 class JasperModel(AbsEncoder):
     def __init__(
         self,
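The instruction_template helper added above builds query-side prompts only: document inputs always get an empty instruction, matching apply_instruction_to_passages=False in the ModelMeta added below. A small sketch of its behavior, importing it from the module this diff touches (PromptType.query is assumed to be the query-side enum member):

    from mteb.models.model_implementations.jasper_models import instruction_template
    from mteb.types import PromptType

    # Queries are wrapped in the Instruct/Query scaffold.
    print(instruction_template("Retrieve relevant passages", PromptType.query))
    # -> Instruct: Retrieve relevant passages
    #    Query:

    # Documents are embedded with no instruction at all.
    print(instruction_template("Retrieve relevant passages", PromptType.document))
    # -> "" (empty string)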
@@ -114,3 +134,34 @@ jasper_en_v1 = ModelMeta(
     }
     """,
 )
+
+Jasper_Token_Compression_600M = ModelMeta(
+    loader=InstructSentenceTransformerModel,
+    loader_kwargs=dict(
+        instruction_template=instruction_template,
+        apply_instruction_to_passages=False,
+        trust_remote_code=True,
+    ),
+    name="infgrad/Jasper-Token-Compression-600M",
+    languages=["eng-Latn", "zho-Hans"],
+    open_weights=True,
+    revision="06a100f753a5a96d9e583b3af79c6fcdfacc4719",
+    release_date="2025-11-14",
+    n_parameters=595776512,
+    memory_usage_mb=2272,
+    embed_dim=2048,
+    license="mit",
+    max_tokens=32768,
+    reference="https://huggingface.co/infgrad/Jasper-Token-Compression-600M",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    public_training_code=None,
+    # public_training_data: unsupervised data for distillation
+    public_training_data="https://huggingface.co/datasets/infgrad/jasper_text_distill_dataset",
+    training_datasets=bge_m3_training_data
+    | bge_chinese_training_data
+    | bge_full_data
+    | E5_MISTRAL_TRAINING_DATA
+    | qzhou_training_data,
+)
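The new registration can be exercised end to end. A minimal sketch, assuming network access to Hugging Face and mteb's v2 evaluate flow; "NFCorpus" is just an arbitrary example task:

    import mteb

    # Loading by name routes through InstructSentenceTransformerModel with the
    # loader_kwargs above (trust_remote_code=True runs the repo's custom code).
    model = mteb.get_model("infgrad/Jasper-Token-Compression-600M")
    tasks = mteb.get_tasks(tasks=["NFCorpus"])
    results = mteb.evaluate(model, tasks=tasks)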
--- mteb-2.1.10.dist-info/METADATA
+++ mteb-2.1.12.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.1.10
+Version: 2.1.12
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
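A quick way to confirm which of the two wheels is actually installed, using only the standard library:

    from importlib.metadata import version

    # Reads the installed distribution's METADATA shown above.
    print(version("mteb"))  # "2.1.12" after upgrading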
--- mteb-2.1.10.dist-info/RECORD
+++ mteb-2.1.12.dist-info/RECORD
@@ -56,7 +56,7 @@ mteb/benchmarks/_create_table.py,sha256=z3iqa5dajLk0DYxEE9EeO1qpR3VJXokg8ZQ2rdUk
 mteb/benchmarks/benchmark.py,sha256=70RlMyyg_wkWTlU_IbfLl-KaqRWXGCKTd8fWe9X-AQE,4173
 mteb/benchmarks/get_benchmark.py,sha256=-n_O-gitRKZi48gJKNgGuI36hsP7yLVSiwulnMHN7Gw,3935
 mteb/benchmarks/benchmarks/__init__.py,sha256=UD6YjWPDVPSQdUhmD-4rho08Gs5LU9pS_C2jX5eUns0,2102
-mteb/benchmarks/benchmarks/benchmarks.py,sha256=v7n2fPGOC66zzBhS1nfSthY55DQnGNg-hGa4XfT21Vg,89941
+mteb/benchmarks/benchmarks/benchmarks.py,sha256=KDJanVYs3BkFn74VHwarZ8HJ2DX6EIgcVYBrlyjbv9I,89956
 mteb/benchmarks/benchmarks/rteb_benchmarks.py,sha256=QnCSrTTaBfcRlAQp2Nu81tgv1idMXqiM16Fp2zKJ5Ys,10607
 mteb/cli/__init__.py,sha256=v-csUr3eUZElIvrGB6QGtaIdndDfNWEe9oZchsGsJpg,64
 mteb/cli/_display_tasks.py,sha256=7A06dT9sSoTz6shyMvskPxuc5eHY_H7PGPlROzMP0yw,2196
@@ -1469,7 +1469,7 @@ mteb/models/model_implementations/dino_models.py,sha256=QFgaFHR5YKrylqJGSljXCBn2
 mteb/models/model_implementations/e5_instruct.py,sha256=9R4GoSFicgqNDCh3HhTN_8L1qhzuEKvatjHYn3T9zlU,7676
 mteb/models/model_implementations/e5_models.py,sha256=vsqkmm6XzZn9ROj_OUR0j2KiN75MEuQsOPeoyc1AeYg,10937
 mteb/models/model_implementations/e5_v.py,sha256=_9W7I0ryIzx_H9eCkzwdm8iHdGX1LIjKGXkhSh_zNv8,6690
-mteb/models/model_implementations/emillykkejensen_models.py,sha256=1DEAGdSZZXDFbbw0YH-vkLm9Y-wthgbOJCRTIpV3Jeo,2795
+mteb/models/model_implementations/emillykkejensen_models.py,sha256=QdhGqCm_1-AURkrniZj2S1MjwwIVOPMzLvpgfJq-3EQ,2779
 mteb/models/model_implementations/en_code_retriever.py,sha256=leZ-0M6LrunocY3XQBYZU1uevDRopeyR5ujIhwqBbd8,1043
 mteb/models/model_implementations/evaclip_models.py,sha256=cPMGYLDIq4s8zJxb4vPXqJ-rqwPaq7KOh2QZSO6cDas,8000
 mteb/models/model_implementations/fa_models.py,sha256=WGal70_ezITWoNdjcMdbOCTSCtoaXzuPadYstLVXxhg,7478
@@ -1483,7 +1483,7 @@ mteb/models/model_implementations/hinvec_models.py,sha256=I_d_dSNVaGIwMIwyvTlaPA
 mteb/models/model_implementations/human.py,sha256=klMpuMAtYH92EIEwNMEhne_Baf9fNiTg1DNWYD11P44,532
 mteb/models/model_implementations/ibm_granite_models.py,sha256=YCT0jbgawy19ps5l8QlxpQoJLjq8Nh-3R-e6yxS0DRM,7902
 mteb/models/model_implementations/inf_models.py,sha256=lvXUFhAYDltq2_Xa9MHcwfhh1V20rbJLSgON76tkj6w,2906
-mteb/models/model_implementations/jasper_models.py,sha256=KzjVnQ1HwaVO9Z7kk1ZkjFrhvlKupeWCmkSljnZv-IM,4071
+mteb/models/model_implementations/jasper_models.py,sha256=yf6gNPTWl05rAJrao8lIpw0wld6xdmPx9PhDwbGHSlc,6037
 mteb/models/model_implementations/jina_clip.py,sha256=CfiIxbhKspjQajNtObCfGPHOWPk6uLn4cuwydQHFTMo,5118
 mteb/models/model_implementations/jina_models.py,sha256=QWoesiTygdFTLcdGpdx26wOUI1AXRz3jLmxGHJ0WMNE,29919
 mteb/models/model_implementations/kalm_models.py,sha256=FmW7Z5Qs6WYBLuKvql3u4IJW36kj4k-Ypah8qTBEBkg,59837
@@ -2554,9 +2554,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
 mteb/types/_result.py,sha256=CRAUc5IvqI3_9SyXDwv-PWLCXwXdZem9RePeYESRtuw,996
 mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
 mteb/types/statistics.py,sha256=YwJsxTf1eaCI_RE-J37a-gK5wDeGAsmkeZKoZCFihSo,3755
-mteb-2.1.10.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mteb-2.1.10.dist-info/METADATA,sha256=LClBepxtjXoGssnPn6QgdAukEqJerTX67OC7zoKhdiE,13574
-mteb-2.1.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mteb-2.1.10.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
-mteb-2.1.10.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
-mteb-2.1.10.dist-info/RECORD,,
+mteb-2.1.12.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mteb-2.1.12.dist-info/METADATA,sha256=mXrsN01rI1osGl_9epUwEI7BjLmwXSxJECQjR7BmoJM,13574
+mteb-2.1.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mteb-2.1.12.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
+mteb-2.1.12.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
+mteb-2.1.12.dist-info/RECORD,,