mteb 2.1.10__py3-none-any.whl → 2.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/benchmarks/benchmarks/benchmarks.py +5 -5
- mteb/models/model_implementations/jasper_models.py +53 -2
- {mteb-2.1.10.dist-info → mteb-2.1.11.dist-info}/METADATA +1 -1
- {mteb-2.1.10.dist-info → mteb-2.1.11.dist-info}/RECORD +8 -8
- {mteb-2.1.10.dist-info → mteb-2.1.11.dist-info}/WHEEL +0 -0
- {mteb-2.1.10.dist-info → mteb-2.1.11.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.10.dist-info → mteb-2.1.11.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.10.dist-info → mteb-2.1.11.dist-info}/top_level.txt +0 -0
|
@@ -1647,7 +1647,7 @@ MTEB_NL = Benchmark(
|
|
|
1647
1647
|
exclusive_language_filter=True,
|
|
1648
1648
|
tasks=[
|
|
1649
1649
|
# Classification
|
|
1650
|
-
"DutchBookReviewSentimentClassification",
|
|
1650
|
+
"DutchBookReviewSentimentClassification.v2",
|
|
1651
1651
|
"MassiveIntentClassification",
|
|
1652
1652
|
"MassiveScenarioClassification",
|
|
1653
1653
|
"SIB200Classification",
|
|
@@ -1678,10 +1678,10 @@ MTEB_NL = Benchmark(
|
|
|
1678
1678
|
# # Reranking
|
|
1679
1679
|
"WikipediaRerankingMultilingual",
|
|
1680
1680
|
# # Retrieval
|
|
1681
|
-
"ArguAna-NL",
|
|
1682
|
-
"SCIDOCS-NL",
|
|
1683
|
-
"SciFact-NL",
|
|
1684
|
-
"NFCorpus-NL",
|
|
1681
|
+
"ArguAna-NL.v2",
|
|
1682
|
+
"SCIDOCS-NL.v2",
|
|
1683
|
+
"SciFact-NL.v2",
|
|
1684
|
+
"NFCorpus-NL.v2",
|
|
1685
1685
|
"BelebeleRetrieval",
|
|
1686
1686
|
"WebFAQRetrieval",
|
|
1687
1687
|
"DutchNewsArticlesRetrieval",
|
|
@@ -7,14 +7,34 @@ from torch.utils.data import DataLoader
|
|
|
7
7
|
|
|
8
8
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
9
9
|
from mteb.models.abs_encoder import AbsEncoder
|
|
10
|
+
from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
|
|
11
|
+
from mteb.models.model_implementations.bge_models import (
|
|
12
|
+
bge_chinese_training_data,
|
|
13
|
+
bge_full_data,
|
|
14
|
+
bge_m3_training_data,
|
|
15
|
+
)
|
|
16
|
+
from mteb.models.model_implementations.e5_instruct import E5_MISTRAL_TRAINING_DATA
|
|
17
|
+
from mteb.models.model_implementations.nvidia_models import nvidia_training_datasets
|
|
18
|
+
from mteb.models.model_implementations.qzhou_models import qzhou_training_data
|
|
10
19
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
11
20
|
from mteb.types import Array, BatchedInput, PromptType
|
|
12
21
|
|
|
13
|
-
from .nvidia_models import nvidia_training_datasets
|
|
14
|
-
|
|
15
22
|
logger = logging.getLogger(__name__)
|
|
16
23
|
|
|
17
24
|
|
|
25
|
+
def instruction_template(
|
|
26
|
+
instruction: str, prompt_type: PromptType | None = None
|
|
27
|
+
) -> str:
|
|
28
|
+
if not instruction or prompt_type == PromptType.document:
|
|
29
|
+
return ""
|
|
30
|
+
if isinstance(instruction, dict):
|
|
31
|
+
if prompt_type is None:
|
|
32
|
+
instruction = "Given a web search query, retrieve relevant passages that answer the query"
|
|
33
|
+
else:
|
|
34
|
+
instruction = instruction[prompt_type]
|
|
35
|
+
return f"Instruct: {instruction}\nQuery:"
|
|
36
|
+
|
|
37
|
+
|
|
18
38
|
class JasperModel(AbsEncoder):
|
|
19
39
|
def __init__(
|
|
20
40
|
self,
|
|
@@ -114,3 +134,34 @@ jasper_en_v1 = ModelMeta(
|
|
|
114
134
|
}
|
|
115
135
|
""",
|
|
116
136
|
)
|
|
137
|
+
|
|
138
|
+
Jasper_Token_Compression_600M = ModelMeta(
|
|
139
|
+
loader=InstructSentenceTransformerModel,
|
|
140
|
+
loader_kwargs=dict(
|
|
141
|
+
instruction_template=instruction_template,
|
|
142
|
+
apply_instruction_to_passages=False,
|
|
143
|
+
trust_remote_code=True,
|
|
144
|
+
),
|
|
145
|
+
name="infgrad/Jasper-Token-Compression-600M",
|
|
146
|
+
languages=["eng-Latn", "zho-Hans"],
|
|
147
|
+
open_weights=True,
|
|
148
|
+
revision="06a100f753a5a96d9e583b3af79c6fcdfacc4719",
|
|
149
|
+
release_date="2025-11-14",
|
|
150
|
+
n_parameters=595776512,
|
|
151
|
+
memory_usage_mb=2272,
|
|
152
|
+
embed_dim=2048,
|
|
153
|
+
license="mit",
|
|
154
|
+
max_tokens=32768,
|
|
155
|
+
reference="https://huggingface.co/infgrad/Jasper-Token-Compression-600M",
|
|
156
|
+
similarity_fn_name="cosine",
|
|
157
|
+
framework=["Sentence Transformers", "PyTorch"],
|
|
158
|
+
use_instructions=True,
|
|
159
|
+
public_training_code=None,
|
|
160
|
+
# public_training_data: unsupervised data for distillation
|
|
161
|
+
public_training_data="https://huggingface.co/datasets/infgrad/jasper_text_distill_dataset",
|
|
162
|
+
training_datasets=bge_m3_training_data
|
|
163
|
+
| bge_chinese_training_data
|
|
164
|
+
| bge_full_data
|
|
165
|
+
| E5_MISTRAL_TRAINING_DATA
|
|
166
|
+
| qzhou_training_data,
|
|
167
|
+
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mteb
|
|
3
|
-
Version: 2.1.10
|
|
3
|
+
Version: 2.1.11
|
|
4
4
|
Summary: Massive Text Embedding Benchmark
|
|
5
5
|
Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
|
|
6
6
|
Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
|
|
@@ -56,7 +56,7 @@ mteb/benchmarks/_create_table.py,sha256=z3iqa5dajLk0DYxEE9EeO1qpR3VJXokg8ZQ2rdUk
|
|
|
56
56
|
mteb/benchmarks/benchmark.py,sha256=70RlMyyg_wkWTlU_IbfLl-KaqRWXGCKTd8fWe9X-AQE,4173
|
|
57
57
|
mteb/benchmarks/get_benchmark.py,sha256=-n_O-gitRKZi48gJKNgGuI36hsP7yLVSiwulnMHN7Gw,3935
|
|
58
58
|
mteb/benchmarks/benchmarks/__init__.py,sha256=UD6YjWPDVPSQdUhmD-4rho08Gs5LU9pS_C2jX5eUns0,2102
|
|
59
|
-
mteb/benchmarks/benchmarks/benchmarks.py,sha256=
|
|
59
|
+
mteb/benchmarks/benchmarks/benchmarks.py,sha256=KDJanVYs3BkFn74VHwarZ8HJ2DX6EIgcVYBrlyjbv9I,89956
|
|
60
60
|
mteb/benchmarks/benchmarks/rteb_benchmarks.py,sha256=QnCSrTTaBfcRlAQp2Nu81tgv1idMXqiM16Fp2zKJ5Ys,10607
|
|
61
61
|
mteb/cli/__init__.py,sha256=v-csUr3eUZElIvrGB6QGtaIdndDfNWEe9oZchsGsJpg,64
|
|
62
62
|
mteb/cli/_display_tasks.py,sha256=7A06dT9sSoTz6shyMvskPxuc5eHY_H7PGPlROzMP0yw,2196
|
|
@@ -1483,7 +1483,7 @@ mteb/models/model_implementations/hinvec_models.py,sha256=I_d_dSNVaGIwMIwyvTlaPA
|
|
|
1483
1483
|
mteb/models/model_implementations/human.py,sha256=klMpuMAtYH92EIEwNMEhne_Baf9fNiTg1DNWYD11P44,532
|
|
1484
1484
|
mteb/models/model_implementations/ibm_granite_models.py,sha256=YCT0jbgawy19ps5l8QlxpQoJLjq8Nh-3R-e6yxS0DRM,7902
|
|
1485
1485
|
mteb/models/model_implementations/inf_models.py,sha256=lvXUFhAYDltq2_Xa9MHcwfhh1V20rbJLSgON76tkj6w,2906
|
|
1486
|
-
mteb/models/model_implementations/jasper_models.py,sha256=
|
|
1486
|
+
mteb/models/model_implementations/jasper_models.py,sha256=yf6gNPTWl05rAJrao8lIpw0wld6xdmPx9PhDwbGHSlc,6037
|
|
1487
1487
|
mteb/models/model_implementations/jina_clip.py,sha256=CfiIxbhKspjQajNtObCfGPHOWPk6uLn4cuwydQHFTMo,5118
|
|
1488
1488
|
mteb/models/model_implementations/jina_models.py,sha256=QWoesiTygdFTLcdGpdx26wOUI1AXRz3jLmxGHJ0WMNE,29919
|
|
1489
1489
|
mteb/models/model_implementations/kalm_models.py,sha256=FmW7Z5Qs6WYBLuKvql3u4IJW36kj4k-Ypah8qTBEBkg,59837
|
|
@@ -2554,9 +2554,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
|
|
|
2554
2554
|
mteb/types/_result.py,sha256=CRAUc5IvqI3_9SyXDwv-PWLCXwXdZem9RePeYESRtuw,996
|
|
2555
2555
|
mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
|
|
2556
2556
|
mteb/types/statistics.py,sha256=YwJsxTf1eaCI_RE-J37a-gK5wDeGAsmkeZKoZCFihSo,3755
|
|
2557
|
-
mteb-2.1.10.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
2558
|
-
mteb-2.1.
|
|
2559
|
-
mteb-2.1.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
2560
|
-
mteb-2.1.10.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
|
|
2561
|
-
mteb-2.1.10.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
|
|
2562
|
-
mteb-2.1.10.dist-info/RECORD,,
|
|
2557
|
+
mteb-2.1.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
2558
|
+
mteb-2.1.11.dist-info/METADATA,sha256=lAjW0-rC-ibKTw8ErbU3AVXabOp8pD-Uw6goYdBTJlI,13574
|
|
2559
|
+
mteb-2.1.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
2560
|
+
mteb-2.1.11.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
|
|
2561
|
+
mteb-2.1.11.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
|
|
2562
|
+
mteb-2.1.11.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|