mteb-2.1.9-py3-none-any.whl → mteb-2.1.11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/benchmarks/benchmarks/benchmarks.py +5 -5
- mteb/models/model_implementations/jasper_models.py +53 -2
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- {mteb-2.1.9.dist-info → mteb-2.1.11.dist-info}/METADATA +1 -1
- {mteb-2.1.9.dist-info → mteb-2.1.11.dist-info}/RECORD +11 -11
- {mteb-2.1.9.dist-info → mteb-2.1.11.dist-info}/WHEEL +0 -0
- {mteb-2.1.9.dist-info → mteb-2.1.11.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.9.dist-info → mteb-2.1.11.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.9.dist-info → mteb-2.1.11.dist-info}/top_level.txt +0 -0
mteb/benchmarks/benchmarks/benchmarks.py

@@ -1647,7 +1647,7 @@ MTEB_NL = Benchmark(
         exclusive_language_filter=True,
         tasks=[
             # Classification
-            "DutchBookReviewSentimentClassification",
+            "DutchBookReviewSentimentClassification.v2",
             "MassiveIntentClassification",
             "MassiveScenarioClassification",
             "SIB200Classification",
@@ -1678,10 +1678,10 @@ MTEB_NL = Benchmark(
             # # Reranking
             "WikipediaRerankingMultilingual",
             # # Retrieval
-            "ArguAna-NL",
-            "SCIDOCS-NL",
-            "SciFact-NL",
-            "NFCorpus-NL",
+            "ArguAna-NL.v2",
+            "SCIDOCS-NL.v2",
+            "SciFact-NL.v2",
+            "NFCorpus-NL.v2",
             "BelebeleRetrieval",
             "WebFAQRetrieval",
             "DutchNewsArticlesRetrieval",
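In the MTEB(nl) benchmark definition, the Dutch book-review classification task and four Dutch retrieval tasks are swapped for their .v2 variants. As a quick check that the renamed tasks resolve against the new release, here is a minimal sketch using mteb's standard get_tasks helper (it assumes mteb >= 2.1.11 is installed; the loop only prints the resolved task names):

import mteb

# Resolve the renamed .v2 tasks by name (this fails on mteb versions that only
# ship the old names).
tasks = mteb.get_tasks(
    tasks=[
        "DutchBookReviewSentimentClassification.v2",
        "ArguAna-NL.v2",
        "SCIDOCS-NL.v2",
        "SciFact-NL.v2",
        "NFCorpus-NL.v2",
    ]
)
for task in tasks:
    print(task.metadata.name)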
mteb/models/model_implementations/jasper_models.py

@@ -7,14 +7,34 @@ from torch.utils.data import DataLoader
 
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
+from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
+from mteb.models.model_implementations.bge_models import (
+    bge_chinese_training_data,
+    bge_full_data,
+    bge_m3_training_data,
+)
+from mteb.models.model_implementations.e5_instruct import E5_MISTRAL_TRAINING_DATA
+from mteb.models.model_implementations.nvidia_models import nvidia_training_datasets
+from mteb.models.model_implementations.qzhou_models import qzhou_training_data
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.types import Array, BatchedInput, PromptType
 
-from .nvidia_models import nvidia_training_datasets
-
 logger = logging.getLogger(__name__)
 
 
+def instruction_template(
+    instruction: str, prompt_type: PromptType | None = None
+) -> str:
+    if not instruction or prompt_type == PromptType.document:
+        return ""
+    if isinstance(instruction, dict):
+        if prompt_type is None:
+            instruction = "Given a web search query, retrieve relevant passages that answer the query"
+        else:
+            instruction = instruction[prompt_type]
+    return f"Instruct: {instruction}\nQuery:"
+
+
 class JasperModel(AbsEncoder):
     def __init__(
         self,
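The new module-level instruction_template is what the InstructSentenceTransformerModel loader below uses to build prompts: documents get no instruction prefix, a plain string instruction is wrapped as "Instruct: {instruction}\nQuery:", and a per-prompt-type dict passed without a prompt type falls back to a generic web-search instruction. A minimal sketch of that behaviour, assuming mteb 2.1.11 is installed (expected output shown in comments):

from mteb.models.model_implementations.jasper_models import instruction_template
from mteb.types import PromptType

# Documents are encoded without any instruction prefix.
print(repr(instruction_template("Retrieve relevant passages", PromptType.document)))
# ''

# A plain string instruction with no prompt type is wrapped directly.
print(instruction_template("Retrieve relevant passages"))
# Instruct: Retrieve relevant passages
# Query:

# A dict of per-prompt-type instructions without a prompt type falls back to the
# generic web-search instruction baked into the template.
print(instruction_template({"query": "Find the FAQ answer"}))
# Instruct: Given a web search query, retrieve relevant passages that answer the query
# Query: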
@@ -114,3 +134,34 @@ jasper_en_v1 = ModelMeta(
     }
     """,
 )
+
+Jasper_Token_Compression_600M = ModelMeta(
+    loader=InstructSentenceTransformerModel,
+    loader_kwargs=dict(
+        instruction_template=instruction_template,
+        apply_instruction_to_passages=False,
+        trust_remote_code=True,
+    ),
+    name="infgrad/Jasper-Token-Compression-600M",
+    languages=["eng-Latn", "zho-Hans"],
+    open_weights=True,
+    revision="06a100f753a5a96d9e583b3af79c6fcdfacc4719",
+    release_date="2025-11-14",
+    n_parameters=595776512,
+    memory_usage_mb=2272,
+    embed_dim=2048,
+    license="mit",
+    max_tokens=32768,
+    reference="https://huggingface.co/infgrad/Jasper-Token-Compression-600M",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    public_training_code=None,
+    # public_training_data: unsupervised data for distillation
+    public_training_data="https://huggingface.co/datasets/infgrad/jasper_text_distill_dataset",
+    training_datasets=bge_m3_training_data
+    | bge_chinese_training_data
+    | bge_full_data
+    | E5_MISTRAL_TRAINING_DATA
+    | qzhou_training_data,
+)
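The accompanying ModelMeta registers infgrad/Jasper-Token-Compression-600M (about 596 M parameters, 2048-dimensional embeddings, 32 k max tokens, MIT license) as an instruction-following Sentence Transformers model loaded with trust_remote_code=True and with instructions applied to queries only. A hedged sketch of picking it up through the registry; mteb.get_model and mteb.get_tasks are the standard entry points, while the mteb.evaluate call reflects the 2.x API and may need adjusting to whichever runner your installed version exposes:

import mteb

# Load the newly registered checkpoint via its ModelMeta; this downloads roughly
# 2.3 GB of weights and executes the model's remote code, matching the loader
# kwargs above.
model = mteb.get_model("infgrad/Jasper-Token-Compression-600M")

# GeorgianFAQRetrieval is one of the tasks touched in this same release.
tasks = mteb.get_tasks(tasks=["GeorgianFAQRetrieval"])

# mteb 2.x exposes a top-level evaluate entry point; older versions use the MTEB
# class runner instead.
results = mteb.evaluate(model, tasks)
print(results)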
mteb/tasks/retrieval/kat/georgian_faq_retrieval.py

@@ -46,10 +46,17 @@ class GeorgianFAQRetrieval(AbsTaskRetrieval):
             split=_EVAL_SPLIT,
             revision=self.metadata.dataset["revision"],
         )
-
-
-        }
-
+
+        question_ids = {}
+        answer_ids = {}
+
+        for row in data:
+            question = row["question"]
+            answer = row["answer"]
+            if question not in question_ids:
+                question_ids[question] = len(question_ids)
+            if answer not in answer_ids:
+                answer_ids[answer] = len(answer_ids)
 
         for row in data:
             question = row["question"]
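The GeorgianFAQ loader now assigns query and document IDs by first occurrence, so duplicated questions or answers collapse to a single ID and the numbering is deterministic across runs; the Belebele and PublicHealthQA hunks below apply the same pattern. A self-contained sketch of the pattern with made-up rows:

# First-occurrence ID assignment: each unique string gets the next integer ID the
# first time it is seen, and repeated rows reuse the existing ID.
rows = [
    {"question": "How do I register?", "answer": "Fill in the form."},
    {"question": "How do I register?", "answer": "Fill in the form."},
    {"question": "What are the fees?", "answer": "There are none."},
]

question_ids: dict[str, int] = {}
answer_ids: dict[str, int] = {}
for row in rows:
    if row["question"] not in question_ids:
        question_ids[row["question"]] = len(question_ids)
    if row["answer"] not in answer_ids:
        answer_ids[row["answer"]] = len(answer_ids)

print(question_ids)  # {'How do I register?': 0, 'What are the fees?': 1}
print(answer_ids)    # {'Fill in the form.': 0, 'There are none.': 1}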
mteb/tasks/retrieval/multilingual/belebele_retrieval.py

@@ -230,10 +230,11 @@ class BelebeleRetrieval(AbsTaskRetrieval):
             ds_corpus = self.dataset[lang_corpus]
             ds_question = self.dataset[lang_question]
 
-            question_ids = {
-
-
-
+            question_ids = {}
+            for row in ds_question:
+                question = row["question"]
+                if question not in question_ids:
+                    question_ids[question] = len(question_ids)
 
             link_to_context_id = {}
             context_idx = 0
mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py

@@ -32,10 +32,15 @@ def _load_publichealthqa_data(
             split=split,
             revision=revision,
         )
-
-
-        }
-
+
+        question_ids = {}
+        answer_ids = {}
+
+        for row in data:
+            if row["question"] is not None and row["question"] not in question_ids:
+                question_ids[row["question"]] = len(question_ids)
+            if row["answer"] is not None and row["answer"] not in answer_ids:
+                answer_ids[row["answer"]] = len(answer_ids)
 
         for row in data:
             if row["question"] is None or row["answer"] is None:
{mteb-2.1.9.dist-info → mteb-2.1.11.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.1.9
+Version: 2.1.11
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
{mteb-2.1.9.dist-info → mteb-2.1.11.dist-info}/RECORD

@@ -56,7 +56,7 @@ mteb/benchmarks/_create_table.py,sha256=z3iqa5dajLk0DYxEE9EeO1qpR3VJXokg8ZQ2rdUk
 mteb/benchmarks/benchmark.py,sha256=70RlMyyg_wkWTlU_IbfLl-KaqRWXGCKTd8fWe9X-AQE,4173
 mteb/benchmarks/get_benchmark.py,sha256=-n_O-gitRKZi48gJKNgGuI36hsP7yLVSiwulnMHN7Gw,3935
 mteb/benchmarks/benchmarks/__init__.py,sha256=UD6YjWPDVPSQdUhmD-4rho08Gs5LU9pS_C2jX5eUns0,2102
-mteb/benchmarks/benchmarks/benchmarks.py,sha256=
+mteb/benchmarks/benchmarks/benchmarks.py,sha256=KDJanVYs3BkFn74VHwarZ8HJ2DX6EIgcVYBrlyjbv9I,89956
 mteb/benchmarks/benchmarks/rteb_benchmarks.py,sha256=QnCSrTTaBfcRlAQp2Nu81tgv1idMXqiM16Fp2zKJ5Ys,10607
 mteb/cli/__init__.py,sha256=v-csUr3eUZElIvrGB6QGtaIdndDfNWEe9oZchsGsJpg,64
 mteb/cli/_display_tasks.py,sha256=7A06dT9sSoTz6shyMvskPxuc5eHY_H7PGPlROzMP0yw,2196
@@ -1483,7 +1483,7 @@ mteb/models/model_implementations/hinvec_models.py,sha256=I_d_dSNVaGIwMIwyvTlaPA
 mteb/models/model_implementations/human.py,sha256=klMpuMAtYH92EIEwNMEhne_Baf9fNiTg1DNWYD11P44,532
 mteb/models/model_implementations/ibm_granite_models.py,sha256=YCT0jbgawy19ps5l8QlxpQoJLjq8Nh-3R-e6yxS0DRM,7902
 mteb/models/model_implementations/inf_models.py,sha256=lvXUFhAYDltq2_Xa9MHcwfhh1V20rbJLSgON76tkj6w,2906
-mteb/models/model_implementations/jasper_models.py,sha256=
+mteb/models/model_implementations/jasper_models.py,sha256=yf6gNPTWl05rAJrao8lIpw0wld6xdmPx9PhDwbGHSlc,6037
 mteb/models/model_implementations/jina_clip.py,sha256=CfiIxbhKspjQajNtObCfGPHOWPk6uLn4cuwydQHFTMo,5118
 mteb/models/model_implementations/jina_models.py,sha256=QWoesiTygdFTLcdGpdx26wOUI1AXRz3jLmxGHJ0WMNE,29919
 mteb/models/model_implementations/kalm_models.py,sha256=FmW7Z5Qs6WYBLuKvql3u4IJW36kj4k-Ypah8qTBEBkg,59837
@@ -2326,12 +2326,12 @@ mteb/tasks/retrieval/jpn/nlp_journal_abs_intro_retrieval.py,sha256=EEOQpTC6vEPUL
 mteb/tasks/retrieval/jpn/nlp_journal_title_abs_retrieval.py,sha256=JOOW_5pRKHzVn8wTOY0fhxLJ6Ns7wlQHoGHGIYVovAQ,3056
 mteb/tasks/retrieval/jpn/nlp_journal_title_intro_retrieval.py,sha256=aVFTFiANWrIz68FjHv9KBqlhpWlsmi9EAP052gECzaU,3078
 mteb/tasks/retrieval/kat/__init__.py,sha256=H4phkKqg_yZzkK7T62aCMBzjbGZzLKJ-MngrQlPbW3A,93
-mteb/tasks/retrieval/kat/georgian_faq_retrieval.py,sha256=
+mteb/tasks/retrieval/kat/georgian_faq_retrieval.py,sha256=4zyodSYCtHtBW9WKIGxFZaTXDrtHuaf3uyfIsDRGBqM,2494
 mteb/tasks/retrieval/kor/__init__.py,sha256=zNjAS2VRjeYX5u4vqev6dGOo_R3i9uSzxAsduZ0po4I,138
 mteb/tasks/retrieval/kor/auto_rag_retrieval.py,sha256=tgffW8zMpDSv1FCOdS4_4SL5zKQj70JVSt_RKs3CgKY,1576
 mteb/tasks/retrieval/kor/ko_strategy_qa.py,sha256=jk13ORetYtF0q36h8ljD6TeTHUwvK5F5ZbDoMCP3eWk,1156
 mteb/tasks/retrieval/multilingual/__init__.py,sha256=mfVGkoB4DO5ktlg8ia-4nImFVmZcqXh1XkgCkIff0tY,6765
-mteb/tasks/retrieval/multilingual/belebele_retrieval.py,sha256=
+mteb/tasks/retrieval/multilingual/belebele_retrieval.py,sha256=gaVLEwuLEwMutMi9V-obpiYKbpllX2QNm2j3MVeebfE,7027
 mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py,sha256=_6r34ZvRiLVENYcrd87NjilybGaetBwKFEbO29zYmBU,4676
 mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py,sha256=Puy0PjpRr4M_Bbxdl7oWfa7pQGM04zaRaTNlnhyKejM,4677
 mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py,sha256=dwzo2sqjamM_xkSiC-jbapyhDFezSJpM4S8KfBsuLPk,4562
@@ -2346,7 +2346,7 @@ mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py,sha256=3uGnj3O92_02zXZnPW
 mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py,sha256=xF4GjBmJVgw6c8VGVh-5QLN_4i_NKeoAzqRWmA_pfnw,2440
 mteb/tasks/retrieval/multilingual/neu_clir2022_retrieval.py,sha256=bkGMvMxG2toYL98kv85BvVpSZ-rVeWvB5FFIzXhdPO4,2749
 mteb/tasks/retrieval/multilingual/neu_clir2023_retrieval.py,sha256=0cALhuU3ZU5c_y7tDIyiMc7Onv-qC7YwfnimZVb8-rg,2793
-mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py,sha256=
+mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py,sha256=LJGpx4RkSJPXldN0SlMA6PbG1x8R2l-Hupc9q1xfleg,3667
 mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py,sha256=Mmcvrt_1cIxPfHZfUzSURPZyaaweGiB02im1ZszlS6M,6837
 mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py,sha256=iFUQUlO_ogBdQBVYBQW3o-AJDQ792yg1pJtRxA5I3Qo,3796
 mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py,sha256=UduWKefwP7bPYxiDlztPEvSWXmTdw0xElglMbPY6XhA,4449
@@ -2554,9 +2554,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
 mteb/types/_result.py,sha256=CRAUc5IvqI3_9SyXDwv-PWLCXwXdZem9RePeYESRtuw,996
 mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
 mteb/types/statistics.py,sha256=YwJsxTf1eaCI_RE-J37a-gK5wDeGAsmkeZKoZCFihSo,3755
-mteb-2.1.
-mteb-2.1.
-mteb-2.1.
-mteb-2.1.
-mteb-2.1.
-mteb-2.1.
+mteb-2.1.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mteb-2.1.11.dist-info/METADATA,sha256=lAjW0-rC-ibKTw8ErbU3AVXabOp8pD-Uw6goYdBTJlI,13574
+mteb-2.1.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mteb-2.1.11.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
+mteb-2.1.11.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
+mteb-2.1.11.dist-info/RECORD,,
The remaining dist-info files ({mteb-2.1.9.dist-info → mteb-2.1.11.dist-info}/WHEEL, entry_points.txt, licenses/LICENSE, and top_level.txt) are unchanged apart from the renamed directory.