mteb 2.1.9__py3-none-any.whl → 2.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1647,7 +1647,7 @@ MTEB_NL = Benchmark(
1647
1647
  exclusive_language_filter=True,
1648
1648
  tasks=[
1649
1649
  # Classification
1650
- "DutchBookReviewSentimentClassification",
1650
+ "DutchBookReviewSentimentClassification.v2",
1651
1651
  "MassiveIntentClassification",
1652
1652
  "MassiveScenarioClassification",
1653
1653
  "SIB200Classification",
@@ -1678,10 +1678,10 @@ MTEB_NL = Benchmark(
1678
1678
  # # Reranking
1679
1679
  "WikipediaRerankingMultilingual",
1680
1680
  # # Retrieval
1681
- "ArguAna-NL",
1682
- "SCIDOCS-NL",
1683
- "SciFact-NL",
1684
- "NFCorpus-NL",
1681
+ "ArguAna-NL.v2",
1682
+ "SCIDOCS-NL.v2",
1683
+ "SciFact-NL.v2",
1684
+ "NFCorpus-NL.v2",
1685
1685
  "BelebeleRetrieval",
1686
1686
  "WebFAQRetrieval",
1687
1687
  "DutchNewsArticlesRetrieval",
@@ -7,14 +7,34 @@ from torch.utils.data import DataLoader
7
7
 
8
8
  from mteb.abstasks.task_metadata import TaskMetadata
9
9
  from mteb.models.abs_encoder import AbsEncoder
10
+ from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
11
+ from mteb.models.model_implementations.bge_models import (
12
+ bge_chinese_training_data,
13
+ bge_full_data,
14
+ bge_m3_training_data,
15
+ )
16
+ from mteb.models.model_implementations.e5_instruct import E5_MISTRAL_TRAINING_DATA
17
+ from mteb.models.model_implementations.nvidia_models import nvidia_training_datasets
18
+ from mteb.models.model_implementations.qzhou_models import qzhou_training_data
10
19
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
20
  from mteb.types import Array, BatchedInput, PromptType
12
21
 
13
- from .nvidia_models import nvidia_training_datasets
14
-
15
22
  logger = logging.getLogger(__name__)
16
23
 
17
24
 
25
+ def instruction_template(
26
+ instruction: str, prompt_type: PromptType | None = None
27
+ ) -> str:
28
+ if not instruction or prompt_type == PromptType.document:
29
+ return ""
30
+ if isinstance(instruction, dict):
31
+ if prompt_type is None:
32
+ instruction = "Given a web search query, retrieve relevant passages that answer the query"
33
+ else:
34
+ instruction = instruction[prompt_type]
35
+ return f"Instruct: {instruction}\nQuery:"
36
+
37
+
18
38
  class JasperModel(AbsEncoder):
19
39
  def __init__(
20
40
  self,
@@ -114,3 +134,34 @@ jasper_en_v1 = ModelMeta(
114
134
  }
115
135
  """,
116
136
  )
137
+
138
+ Jasper_Token_Compression_600M = ModelMeta(
139
+ loader=InstructSentenceTransformerModel,
140
+ loader_kwargs=dict(
141
+ instruction_template=instruction_template,
142
+ apply_instruction_to_passages=False,
143
+ trust_remote_code=True,
144
+ ),
145
+ name="infgrad/Jasper-Token-Compression-600M",
146
+ languages=["eng-Latn", "zho-Hans"],
147
+ open_weights=True,
148
+ revision="06a100f753a5a96d9e583b3af79c6fcdfacc4719",
149
+ release_date="2025-11-14",
150
+ n_parameters=595776512,
151
+ memory_usage_mb=2272,
152
+ embed_dim=2048,
153
+ license="mit",
154
+ max_tokens=32768,
155
+ reference="https://huggingface.co/infgrad/Jasper-Token-Compression-600M",
156
+ similarity_fn_name="cosine",
157
+ framework=["Sentence Transformers", "PyTorch"],
158
+ use_instructions=True,
159
+ public_training_code=None,
160
+ # public_training_data: unsupervised data for distillation
161
+ public_training_data="https://huggingface.co/datasets/infgrad/jasper_text_distill_dataset",
162
+ training_datasets=bge_m3_training_data
163
+ | bge_chinese_training_data
164
+ | bge_full_data
165
+ | E5_MISTRAL_TRAINING_DATA
166
+ | qzhou_training_data,
167
+ )
@@ -46,10 +46,17 @@ class GeorgianFAQRetrieval(AbsTaskRetrieval):
46
46
  split=_EVAL_SPLIT,
47
47
  revision=self.metadata.dataset["revision"],
48
48
  )
49
- question_ids = {
50
- question: _id for _id, question in enumerate(set(data["question"]))
51
- }
52
- answer_ids = {answer: _id for _id, answer in enumerate(set(data["answer"]))}
49
+
50
+ question_ids = {}
51
+ answer_ids = {}
52
+
53
+ for row in data:
54
+ question = row["question"]
55
+ answer = row["answer"]
56
+ if question not in question_ids:
57
+ question_ids[question] = len(question_ids)
58
+ if answer not in answer_ids:
59
+ answer_ids[answer] = len(answer_ids)
53
60
 
54
61
  for row in data:
55
62
  question = row["question"]
@@ -230,10 +230,11 @@ class BelebeleRetrieval(AbsTaskRetrieval):
230
230
  ds_corpus = self.dataset[lang_corpus]
231
231
  ds_question = self.dataset[lang_question]
232
232
 
233
- question_ids = {
234
- question: _id
235
- for _id, question in enumerate(set(ds_question["question"]))
236
- }
233
+ question_ids = {}
234
+ for row in ds_question:
235
+ question = row["question"]
236
+ if question not in question_ids:
237
+ question_ids[question] = len(question_ids)
237
238
 
238
239
  link_to_context_id = {}
239
240
  context_idx = 0
@@ -32,10 +32,15 @@ def _load_publichealthqa_data(
32
32
  split=split,
33
33
  revision=revision,
34
34
  )
35
- question_ids = {
36
- question: _id for _id, question in enumerate(set(data["question"]))
37
- }
38
- answer_ids = {answer: _id for _id, answer in enumerate(set(data["answer"]))}
35
+
36
+ question_ids = {}
37
+ answer_ids = {}
38
+
39
+ for row in data:
40
+ if row["question"] is not None and row["question"] not in question_ids:
41
+ question_ids[row["question"]] = len(question_ids)
42
+ if row["answer"] is not None and row["answer"] not in answer_ids:
43
+ answer_ids[row["answer"]] = len(answer_ids)
39
44
 
40
45
  for row in data:
41
46
  if row["question"] is None or row["answer"] is None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mteb
3
- Version: 2.1.9
3
+ Version: 2.1.11
4
4
  Summary: Massive Text Embedding Benchmark
5
5
  Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
6
6
  Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
@@ -56,7 +56,7 @@ mteb/benchmarks/_create_table.py,sha256=z3iqa5dajLk0DYxEE9EeO1qpR3VJXokg8ZQ2rdUk
56
56
  mteb/benchmarks/benchmark.py,sha256=70RlMyyg_wkWTlU_IbfLl-KaqRWXGCKTd8fWe9X-AQE,4173
57
57
  mteb/benchmarks/get_benchmark.py,sha256=-n_O-gitRKZi48gJKNgGuI36hsP7yLVSiwulnMHN7Gw,3935
58
58
  mteb/benchmarks/benchmarks/__init__.py,sha256=UD6YjWPDVPSQdUhmD-4rho08Gs5LU9pS_C2jX5eUns0,2102
59
- mteb/benchmarks/benchmarks/benchmarks.py,sha256=v7n2fPGOC66zzBhS1nfSthY55DQnGNg-hGa4XfT21Vg,89941
59
+ mteb/benchmarks/benchmarks/benchmarks.py,sha256=KDJanVYs3BkFn74VHwarZ8HJ2DX6EIgcVYBrlyjbv9I,89956
60
60
  mteb/benchmarks/benchmarks/rteb_benchmarks.py,sha256=QnCSrTTaBfcRlAQp2Nu81tgv1idMXqiM16Fp2zKJ5Ys,10607
61
61
  mteb/cli/__init__.py,sha256=v-csUr3eUZElIvrGB6QGtaIdndDfNWEe9oZchsGsJpg,64
62
62
  mteb/cli/_display_tasks.py,sha256=7A06dT9sSoTz6shyMvskPxuc5eHY_H7PGPlROzMP0yw,2196
@@ -1483,7 +1483,7 @@ mteb/models/model_implementations/hinvec_models.py,sha256=I_d_dSNVaGIwMIwyvTlaPA
1483
1483
  mteb/models/model_implementations/human.py,sha256=klMpuMAtYH92EIEwNMEhne_Baf9fNiTg1DNWYD11P44,532
1484
1484
  mteb/models/model_implementations/ibm_granite_models.py,sha256=YCT0jbgawy19ps5l8QlxpQoJLjq8Nh-3R-e6yxS0DRM,7902
1485
1485
  mteb/models/model_implementations/inf_models.py,sha256=lvXUFhAYDltq2_Xa9MHcwfhh1V20rbJLSgON76tkj6w,2906
1486
- mteb/models/model_implementations/jasper_models.py,sha256=KzjVnQ1HwaVO9Z7kk1ZkjFrhvlKupeWCmkSljnZv-IM,4071
1486
+ mteb/models/model_implementations/jasper_models.py,sha256=yf6gNPTWl05rAJrao8lIpw0wld6xdmPx9PhDwbGHSlc,6037
1487
1487
  mteb/models/model_implementations/jina_clip.py,sha256=CfiIxbhKspjQajNtObCfGPHOWPk6uLn4cuwydQHFTMo,5118
1488
1488
  mteb/models/model_implementations/jina_models.py,sha256=QWoesiTygdFTLcdGpdx26wOUI1AXRz3jLmxGHJ0WMNE,29919
1489
1489
  mteb/models/model_implementations/kalm_models.py,sha256=FmW7Z5Qs6WYBLuKvql3u4IJW36kj4k-Ypah8qTBEBkg,59837
@@ -2326,12 +2326,12 @@ mteb/tasks/retrieval/jpn/nlp_journal_abs_intro_retrieval.py,sha256=EEOQpTC6vEPUL
2326
2326
  mteb/tasks/retrieval/jpn/nlp_journal_title_abs_retrieval.py,sha256=JOOW_5pRKHzVn8wTOY0fhxLJ6Ns7wlQHoGHGIYVovAQ,3056
2327
2327
  mteb/tasks/retrieval/jpn/nlp_journal_title_intro_retrieval.py,sha256=aVFTFiANWrIz68FjHv9KBqlhpWlsmi9EAP052gECzaU,3078
2328
2328
  mteb/tasks/retrieval/kat/__init__.py,sha256=H4phkKqg_yZzkK7T62aCMBzjbGZzLKJ-MngrQlPbW3A,93
2329
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py,sha256=XKEmWaMuxJraXfMF4k7S8CFy3KdLUphad7z8Nz29jp4,2345
2329
+ mteb/tasks/retrieval/kat/georgian_faq_retrieval.py,sha256=4zyodSYCtHtBW9WKIGxFZaTXDrtHuaf3uyfIsDRGBqM,2494
2330
2330
  mteb/tasks/retrieval/kor/__init__.py,sha256=zNjAS2VRjeYX5u4vqev6dGOo_R3i9uSzxAsduZ0po4I,138
2331
2331
  mteb/tasks/retrieval/kor/auto_rag_retrieval.py,sha256=tgffW8zMpDSv1FCOdS4_4SL5zKQj70JVSt_RKs3CgKY,1576
2332
2332
  mteb/tasks/retrieval/kor/ko_strategy_qa.py,sha256=jk13ORetYtF0q36h8ljD6TeTHUwvK5F5ZbDoMCP3eWk,1156
2333
2333
  mteb/tasks/retrieval/multilingual/__init__.py,sha256=mfVGkoB4DO5ktlg8ia-4nImFVmZcqXh1XkgCkIff0tY,6765
2334
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py,sha256=AEShsRxcoCNC2C02dj-go6eFr008NLZ2cnebAmdo4Sk,6956
2334
+ mteb/tasks/retrieval/multilingual/belebele_retrieval.py,sha256=gaVLEwuLEwMutMi9V-obpiYKbpllX2QNm2j3MVeebfE,7027
2335
2335
  mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py,sha256=_6r34ZvRiLVENYcrd87NjilybGaetBwKFEbO29zYmBU,4676
2336
2336
  mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py,sha256=Puy0PjpRr4M_Bbxdl7oWfa7pQGM04zaRaTNlnhyKejM,4677
2337
2337
  mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py,sha256=dwzo2sqjamM_xkSiC-jbapyhDFezSJpM4S8KfBsuLPk,4562
@@ -2346,7 +2346,7 @@ mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py,sha256=3uGnj3O92_02zXZnPW
2346
2346
  mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py,sha256=xF4GjBmJVgw6c8VGVh-5QLN_4i_NKeoAzqRWmA_pfnw,2440
2347
2347
  mteb/tasks/retrieval/multilingual/neu_clir2022_retrieval.py,sha256=bkGMvMxG2toYL98kv85BvVpSZ-rVeWvB5FFIzXhdPO4,2749
2348
2348
  mteb/tasks/retrieval/multilingual/neu_clir2023_retrieval.py,sha256=0cALhuU3ZU5c_y7tDIyiMc7Onv-qC7YwfnimZVb8-rg,2793
2349
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py,sha256=CGOTbGuwYNrzbefB76QY88fnb8CdJkN9bPhAssNtLvA,3502
2349
+ mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py,sha256=LJGpx4RkSJPXldN0SlMA6PbG1x8R2l-Hupc9q1xfleg,3667
2350
2350
  mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py,sha256=Mmcvrt_1cIxPfHZfUzSURPZyaaweGiB02im1ZszlS6M,6837
2351
2351
  mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py,sha256=iFUQUlO_ogBdQBVYBQW3o-AJDQ792yg1pJtRxA5I3Qo,3796
2352
2352
  mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py,sha256=UduWKefwP7bPYxiDlztPEvSWXmTdw0xElglMbPY6XhA,4449
@@ -2554,9 +2554,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
2554
2554
  mteb/types/_result.py,sha256=CRAUc5IvqI3_9SyXDwv-PWLCXwXdZem9RePeYESRtuw,996
2555
2555
  mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
2556
2556
  mteb/types/statistics.py,sha256=YwJsxTf1eaCI_RE-J37a-gK5wDeGAsmkeZKoZCFihSo,3755
2557
- mteb-2.1.9.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
2558
- mteb-2.1.9.dist-info/METADATA,sha256=yUOXi6O_wkyskXKnHDcDB6SqLZg5Q5Nc_a_qK7Pngpc,13573
2559
- mteb-2.1.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
2560
- mteb-2.1.9.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
2561
- mteb-2.1.9.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
2562
- mteb-2.1.9.dist-info/RECORD,,
2557
+ mteb-2.1.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
2558
+ mteb-2.1.11.dist-info/METADATA,sha256=lAjW0-rC-ibKTw8ErbU3AVXabOp8pD-Uw6goYdBTJlI,13574
2559
+ mteb-2.1.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
2560
+ mteb-2.1.11.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
2561
+ mteb-2.1.11.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
2562
+ mteb-2.1.11.dist-info/RECORD,,
File without changes