mteb 2.3.4__py3-none-any.whl → 2.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +57 -0
- mteb/cache.py +5 -1
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/leaderboard/figures.py +1 -1
- mteb/models/model_implementations/kowshik24_models.py +22 -0
- mteb/models/model_implementations/ruri_models.py +312 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +166 -0
- mteb/models/search_wrappers.py +1 -1
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -3
- {mteb-2.3.4.dist-info → mteb-2.3.6.dist-info}/METADATA +1 -1
- {mteb-2.3.4.dist-info → mteb-2.3.6.dist-info}/RECORD +17 -13
- {mteb-2.3.4.dist-info → mteb-2.3.6.dist-info}/WHEEL +0 -0
- {mteb-2.3.4.dist-info → mteb-2.3.6.dist-info}/entry_points.txt +0 -0
- {mteb-2.3.4.dist-info → mteb-2.3.6.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.3.4.dist-info → mteb-2.3.6.dist-info}/top_level.txt +0 -0
mteb/benchmarks/benchmarks/__init__.py
CHANGED
@@ -12,6 +12,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
     FA_MTEB_2,
     HUME,
     JINA_VDR,
+    JMTEB_V2,
     LONG_EMBED,
     MIEB_ENG,
     MIEB_IMG,
@@ -75,6 +76,7 @@ __all__ = [
     "HUME",
     "HUME",
     "JINA_VDR",
+    "JMTEB_V2",
     "LONG_EMBED",
     "MIEB_ENG",
     "MIEB_IMG",
mteb/benchmarks/benchmarks/benchmarks.py
CHANGED
@@ -2562,3 +2562,60 @@ HUME = HUMEBenchmark(
     citation=None,
     contacts=["AdnanElAssadi56", "KennethEnevoldsen", "isaac-chung", "Samoed"],
 )
+
+JMTEB_V2 = Benchmark(
+    name="JMTEB(v2)",
+    display_name="Japanese",
+    icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg",
+    tasks=get_tasks(
+        languages=["jpn"],
+        tasks=[
+            # Clustering (3)
+            "LivedoorNewsClustering.v2",
+            "MewsC16JaClustering",
+            "SIB200ClusteringS2S",
+            # Classification (7)
+            "AmazonReviewsClassification",
+            "AmazonCounterfactualClassification",
+            "MassiveIntentClassification",
+            "MassiveScenarioClassification",
+            "JapaneseSentimentClassification",
+            "SIB200Classification",
+            "WRIMEClassification",
+            # STS (2)
+            "JSTS",
+            "JSICK",
+            # Retrieval (11)
+            "JaqketRetrieval",
+            "MrTidyRetrieval",
+            "JaGovFaqsRetrieval",
+            "NLPJournalTitleAbsRetrieval.V2",
+            "NLPJournalTitleIntroRetrieval.V2",
+            "NLPJournalAbsIntroRetrieval.V2",
+            "NLPJournalAbsArticleRetrieval.V2",
+            "JaCWIRRetrieval",
+            "MIRACLRetrieval",
+            "MintakaRetrieval",
+            "MultiLongDocRetrieval",
+            # Reranking (5)
+            "ESCIReranking",
+            "JQaRAReranking",
+            "JaCWIRReranking",
+            "MIRACLReranking",
+            "MultiLongDocReranking",
+        ],
+    ),
+    description="JMTEB is a benchmark for evaluating Japanese text embedding models. In v2, we have extended the benchmark to 28 datasets, enabling more comprehensive evaluation compared with v1 (MTEB(jpn, v1)).",
+    reference="https://github.com/sbintuitions/JMTEB",
+    citation=r"""
+@article{li2025jmteb,
+  author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan and Fukuchi, Akihiko and Shibata, Tomohide and Kawahara, Daisuke},
+  issue = {3},
+  journal = {Vol.2025-NL-265,No.3,1-15},
+  month = {sep},
+  title = {{JMTEB and JMTEB-lite: Japanese Massive Text Embedding Benchmark and Its Lightweight Version}},
+  year = {2025},
+}
+""",
+    contacts=["lsz05"],
+)
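Once this release is installed, the new benchmark should be resolvable by name. A minimal sketch of looking it up (this assumes the top-level `mteb.get_benchmark` helper that the RECORD section further below lists; the exact evaluation entry point may differ between 2.x releases):

    import mteb

    # Resolve the newly registered Japanese benchmark and list its 28 tasks.
    benchmark = mteb.get_benchmark("JMTEB(v2)")
    for task in benchmark.tasks:
        print(task.metadata.name)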
mteb/cache.py
CHANGED
@@ -243,7 +243,11 @@ class ResultCache:
             f"No results repository found in {results_directory}, cloning it from {remote}"
         )
 
-        subprocess.run(
+        subprocess.run(
+            ["git", "clone", "--depth", "1", remote, "remote"],
+            cwd=self.cache_path,
+            check=True,
+        )
 
         return results_directory
 
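The new call is a standard shallow clone with error checking. A standalone sketch with illustrative values (the repository URL and cache directory below are placeholders, not taken from the package):

    import subprocess

    # --depth 1 clones only the latest commit; check=True raises CalledProcessError
    # on a non-zero git exit status instead of failing silently.
    subprocess.run(
        ["git", "clone", "--depth", "1", "https://github.com/embeddings-benchmark/results", "remote"],
        cwd="/tmp/mteb-cache",  # placeholder cache directory
        check=True,
    )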
mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json
ADDED
@@ -0,0 +1,60 @@
+{
+  "test": {
+    "num_samples": 1085,
+    "number_texts_intersect_with_train": 0,
+    "text_statistics": {
+      "total_text_length": 115359,
+      "min_text_length": 8,
+      "average_text_length": 106.32165898617511,
+      "max_text_length": 2722,
+      "unique_texts": 1085
+    },
+    "image_statistics": null,
+    "label_statistics": {
+      "min_labels_per_text": 1,
+      "average_label_per_text": 1.0,
+      "max_labels_per_text": 1,
+      "unique_labels": 3,
+      "labels": {
+        "0": {
+          "count": 868
+        },
+        "1": {
+          "count": 190
+        },
+        "2": {
+          "count": 27
+        }
+      }
+    }
+  },
+  "train": {
+    "num_samples": 7176,
+    "number_texts_intersect_with_train": null,
+    "text_statistics": {
+      "total_text_length": 830248,
+      "min_text_length": 5,
+      "average_text_length": 115.69788182831661,
+      "max_text_length": 4759,
+      "unique_texts": 7176
+    },
+    "image_statistics": null,
+    "label_statistics": {
+      "min_labels_per_text": 1,
+      "average_label_per_text": 1.0,
+      "max_labels_per_text": 1,
+      "unique_labels": 3,
+      "labels": {
+        "0": {
+          "count": 4933
+        },
+        "1": {
+          "count": 2047
+        },
+        "2": {
+          "count": 196
+        }
+      }
+    }
+  }
+}
mteb/leaderboard/figures.py
CHANGED
@@ -117,7 +117,7 @@ def _performance_size_plot(df: pd.DataFrame) -> go.Figure:
     df["Max Tokens"] = df["Max Tokens"].map(_parse_float)
     df["Log(Tokens)"] = np.log10(df["Max Tokens"])
     df["Mean (Task)"] = df["Mean (Task)"].map(_parse_float)
-    df = df.dropna(
+    df = df[df["Mean (Task)"] > 0].dropna(
         subset=["Mean (Task)", "Number of Parameters", "Embedding Dimensions"]
     )
     if not len(df.index):
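The leaderboard change filters out rows with a non-positive mean score before dropping rows with missing values. A small self-contained illustration (column names match the hunk above; the data is made up):

    import pandas as pd

    df = pd.DataFrame(
        {
            "Mean (Task)": [0.71, 0.0, None],
            "Number of Parameters": [1.1e8, 3.3e8, 7.0e9],
            "Embedding Dimensions": [768, 1024, 4096],
        }
    )
    # The boolean mask drops zero/negative (and missing) means; dropna() then
    # guards the remaining required columns.
    df = df[df["Mean (Task)"] > 0].dropna(
        subset=["Mean (Task)", "Number of Parameters", "Embedding Dimensions"]
    )
    print(df)  # only the 0.71 row survives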
mteb/models/model_implementations/kowshik24_models.py
ADDED
@@ -0,0 +1,22 @@
+from mteb.models import ModelMeta, sentence_transformers_loader
+
+kowshik24_bangla_embedding_model = ModelMeta(
+    loader=sentence_transformers_loader,
+    name="Kowshik24/bangla-sentence-transformer-ft-matryoshka-paraphrase-multilingual-mpnet-base-v2",
+    languages=["ben-Beng"],  # Bengali using Bengali script
+    open_weights=True,
+    revision="6689c21e69be5950596bad084457cbaa138728d8",
+    release_date="2025-11-10",
+    n_parameters=278_000_000,
+    memory_usage_mb=1061,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=128,
+    reference="https://huggingface.co/Kowshik24/bangla-sentence-transformer-ft-matryoshka-paraphrase-multilingual-mpnet-base-v2",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    public_training_code="https://github.com/kowshik24/Bangla-Embedding",
+    public_training_data="https://huggingface.co/datasets/sartajekram/BanglaRQA",
+    training_datasets=set(),
+)
mteb/models/model_implementations/ruri_models.py
ADDED
@@ -0,0 +1,312 @@
+from mteb.models.model_meta import ModelMeta
+from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
+
+RURI_V3_PROMPTS = {
+    "Retrieval-query": "検索クエリ: ",
+    "Retrieval-document": "検索文書: ",
+    "Reranking-query": "検索クエリ: ",
+    "Reranking-document": "検索文書: ",
+    "Classification": "トピック: ",
+    "Clustering": "トピック: ",
+}
+
+RURI_V1_V2_PROMPTS = {
+    "query": "クエリ: ",
+    "document": "文章: ",
+}
+
+
+RURI_CITATION = r"""@misc{Ruri,
+  title={{Ruri: Japanese General Text Embeddings}},
+  author={Hayato Tsukagoshi and Ryohei Sasano},
+  year={2024},
+  eprint={2409.07737},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2409.07737},
+}"""
+
+cl_nagoya_ruri_v3_30m = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V3_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-v3-30m",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="24899e5de370b56d179604a007c0d727bf144504",
+    release_date="2025-04-07",
+    n_parameters=36_705_536,
+    memory_usage_mb=140,
+    embed_dim=256,
+    license="apache-2.0",
+    max_tokens=8192,
+    reference="https://huggingface.co/cl-nagoya/ruri-v3-30m",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    superseded_by=None,
+    training_datasets={
+        "cl-nagoya/ruri-v3-dataset-ft",
+    },
+    adapted_from="sbintuitions/modernbert-ja-30m",
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-v3-dataset-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_v3_70m = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V3_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-v3-70m",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="07a8b0aba47d29d2ca21f89b915c1efe2c23d1cc",
+    release_date="2025-04-09",
+    n_parameters=36_705_536,
+    memory_usage_mb=140,
+    embed_dim=256,
+    license="apache-2.0",
+    max_tokens=8192,
+    reference="https://huggingface.co/cl-nagoya/ruri-v3-70m",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    superseded_by=None,
+    training_datasets={"MrTidyRetrieval", "MIRACLRetrieval"},
+    adapted_from="sbintuitions/modernbert-ja-70m",
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-v3-dataset-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_v3_130m = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V3_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-v3-130m",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="e3114c6ee10dbab8b4b235fbc6dcf9dd4d5ac1a6",
+    release_date="2025-04-09",
+    n_parameters=132_140_544,
+    memory_usage_mb=504,
+    embed_dim=512,
+    license="apache-2.0",
+    max_tokens=8192,
+    reference="https://huggingface.co/cl-nagoya/ruri-v3-130m",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    superseded_by=None,
+    training_datasets={"MrTidyRetrieval", "MIRACLRetrieval"},
+    adapted_from="sbintuitions/modernbert-ja-130m",
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-v3-dataset-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_v3_310m = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V3_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-v3-310m",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="18b60fb8c2b9df296fb4212bb7d23ef94e579cd3",
+    release_date="2025-04-09",
+    n_parameters=314_611_968,
+    memory_usage_mb=1200,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=8192,
+    reference="https://huggingface.co/cl-nagoya/ruri-v3-310m",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    superseded_by=None,
+    training_datasets={"MrTidyRetrieval", "MIRACLRetrieval"},
+    adapted_from="sbintuitions/modernbert-ja-310m",
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-v3-dataset-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_small_v2 = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V1_V2_PROMPTS,
+        trust_remote_code=True,
+    ),
+    name="cl-nagoya/ruri-small-v2",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="db18646e673b713cd0518a5bb0fefdce21e77cd9",
+    release_date="2024-12-05",
+    n_parameters=68_087_808,
+    memory_usage_mb=260,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/cl-nagoya/ruri-small-v2",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    adapted_from="line-corporation/line-distilbert-base-japanese",
+    superseded_by=None,
+    training_datasets={"MrTidyRetrieval", "MIRACLRetrieval"},
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-v2-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_base_v2 = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V1_V2_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-base-v2",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="8ce03882903668a01c83ca3b8111ac025a3bc734",
+    release_date="2024-12-05",
+    n_parameters=111_207_168,
+    memory_usage_mb=424,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/cl-nagoya/ruri-base-v2",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    adapted_from="tohoku-nlp/bert-base-japanese-v3",
+    superseded_by=None,
+    training_datasets=None,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-v2-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_large_v2 = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V1_V2_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-large-v2",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="42898ef34a5574977380ebf0dfd28cbfbd36438b",
+    release_date="2024-12-06",
+    n_parameters=337_441_792,
+    memory_usage_mb=1287,
+    embed_dim=1024,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/cl-nagoya/ruri-large-v2",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    adapted_from="tohoku-nlp/bert-large-japanese-v2",
+    superseded_by=None,
+    training_datasets=None,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-v2-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_small_v1 = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V1_V2_PROMPTS,
+        trust_remote_code=True,
+    ),
+    name="cl-nagoya/ruri-small",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="bc56ce90cd7a979f6eb199fc52dfe700bfd94bc3",
+    release_date="2024-08-28",
+    n_parameters=68_087_808,
+    memory_usage_mb=130,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/cl-nagoya/ruri-small",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    adapted_from="line-corporation/line-distilbert-base-japanese",
+    superseded_by="cl-nagoya/ruri-small-v2",
+    training_datasets=None,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_base_v1 = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V1_V2_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-base",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="1ae40b8b6c78518a499425086bab8fc16c2e4b0e",
+    release_date="2024-08-28",
+    n_parameters=111_207_168,
+    memory_usage_mb=212,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/cl-nagoya/ruri-base",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    adapted_from="tohoku-nlp/bert-base-japanese-v3",
+    superseded_by="cl-nagoya/ruri-base-v2",
+    training_datasets=None,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+
+cl_nagoya_ruri_large_v1 = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V1_V2_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-large",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="a011c39b13e8bc137ee13c6bc82191ece46c414c",
+    release_date="2024-08-28",
+    n_parameters=337_441_792,
+    memory_usage_mb=644,
+    embed_dim=1024,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/cl-nagoya/ruri-large",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    adapted_from="tohoku-nlp/bert-large-japanese-v2",
+    superseded_by="cl-nagoya/ruri-large-v2",
+    training_datasets=None,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
mteb/models/model_implementations/sarashina_embedding_models.py
ADDED
@@ -0,0 +1,166 @@
+from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
+from mteb.models.model_meta import ModelMeta
+from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
+from mteb.types import PromptType
+
+SARASHINA_V2_INSTRUCTIONS = {
+    "Retrieval": {
+        "query": "クエリを与えるので、もっともクエリに意味が似ている一節を探してください。",
+        "document": "text: ",
+    },
+    "Reranking": {
+        "query": "クエリを与えるので、もっともクエリに意味が似ている一節を探してください。",
+        "document": "text: ",
+    },
+    "Classification": "与えられたドキュメントを適切なカテゴリに分類してください。",
+    "Clustering": "与えられたドキュメントのトピックまたはテーマを特定してください。",
+    # optimization regarding JMTEB
+    "LivedoorNewsClustering.v2": "与えられたニュース記事のトピックを特定してください。",
+    "MewsC16JaClustering": "与えられたニュース記事のトピックを特定してください。",
+    "SIB200ClusteringS2S": "与えられたテキストのトピックを特定してください。",
+    "AmazonReviewsClassification": "与えられたAmazonレビューを適切な評価カテゴリに分類してください。",
+    "AmazonCounterfactualClassification": "与えられたAmazonのカスタマーレビューのテキストを反事実か反事実でないかに分類してください。",
+    "MassiveIntentClassification": "ユーザーの発話をクエリとして与えるので、ユーザーの意図を見つけてください。",
+    "MassiveScenarioClassification": "ユーザーの発話をクエリとして与えるので、ユーザーシナリオを見つけてください。",
+    "JapaneseSentimentClassification": "与えられたテキストの感情極性をポジティブ(1)かネガティブか(0)に分類してください。",
+    "SIB200Classification": "与えられたテキストのトピックを特定してください。",
+    "WRIMEClassification": "与えられたテキストの感情極性(-2:強いネガティブ、-1:ネガティブ、0:ニュートラル、1:ポジティブ、2:強いポジティブ)を分類してください。",
+    "JSTS": "クエリを与えるので,もっともクエリに意味が似ている一節を探してください。",
+    "JSICK": "クエリを与えるので,もっともクエリに意味が似ている一節を探してください。",
+    "JaqketRetrieval": {
+        "query": "質問を与えるので、その質問に答えるのに役立つWikipediaの文章を検索してください。",
+        "document": "text: ",
+    },
+    "MrTidyRetrieval": {
+        "query": "質問を与えるので、その質問に答えるWikipediaの文章を検索するしてください。",
+        "document": "text: ",
+    },
+    "JaGovFaqsRetrieval": {
+        "query": "質問を与えるので、その質問に答えるのに役立つ関連文書を検索してください。",
+        "document": "text: ",
+    },
+    "NLPJournalTitleAbsRetrieval.V2": {
+        "query": "論文のタイトルを与えるので、タイトルに対応する要約を検索してください。",
+        "document": "text: ",
+    },
+    "NLPJournalTitleIntroRetrieval.V2": {
+        "query": "論文のタイトルを与えるので、タイトルに対応する要約を検索してください。",
+        "document": "text: ",
+    },
+    "NLPJournalAbsIntroRetrieval.V2": {
+        "query": "論文の序論を与えるので、序論に対応する全文を検索してください。",
+        "document": "text: ",
+    },
+    "NLPJournalAbsArticleRetrieval.V2": {
+        "query": "論文の序論を与えるので、序論に対応する全文を検索してください。",
+        "document": "text: ",
+    },
+    "JaCWIRRetrieval": {
+        "query": "記事のタイトルを与えるので、そのタイトルと合っている記事の中身を検索してください。",
+        "document": "text: ",
+    },
+    "MIRACLRetrieval": {
+        "query": "質問を与えるので、その質問に答えるのに役立つ関連文書を検索してください。",
+        "document": "text: ",
+    },
+    "MintakaRetrieval": {
+        "query": "質問を与えるので、その質問に答えられるテキストを検索してください。",
+        "document": "text: ",
+    },
+    "MultiLongDocRetrieval": {
+        "query": "質問を与えるので、その質問に答えるのに役立つWikipediaの文章を検索してください。",
+        "document": "text: ",
+    },
+    "ESCIReranking": {
+        "query": "クエリを与えるので、与えられたWeb検索クエリに答える関連文章を検索してください。",
+        "document": "text: ",
+    },
+    "JQaRAReranking": {
+        "query": "質問を与えるので、その質問に答えるのに役立つWikipediaの文章を検索してください。",
+        "document": "text: ",
+    },
+    "JaCWIRReranking": {
+        "query": "記事のタイトルを与えるので、そのタイトルと合っている記事の中身を検索してください。",
+        "document": "text: ",
+    },
+    "MIRACLReranking": {
+        "query": "質問を与えるので、その質問に答えるのに役立つ関連文書を検索してください。",
+        "document": "text: ",
+    },
+    "MultiLongDocReranking": {
+        "query": "質問を与えるので、その質問に答えるのに役立つWikipediaの文章を検索してください。",
+        "document": "text: ",
+    },
+}
+
+
+def sarashina_instruction_template(
+    instruction: str, prompt_type: PromptType | None = None
+) -> str:
+    """Instruction template for Sarashina v2 model.
+
+    Returns the instruction as-is since the prompts already contain the full format.
+    For document prompts, returns the instruction directly (e.g., "text: ").
+    """
+    if not instruction:
+        return ""
+    if prompt_type == PromptType.document:
+        return "text: "
+    return f"task: {instruction}\nquery: "
+
+
+sbintuitions_sarashina_embedding_v2_1b = ModelMeta(
+    loader=InstructSentenceTransformerModel,
+    loader_kwargs=dict(
+        instruction_template=sarashina_instruction_template,
+        apply_instruction_to_passages=True,
+        prompts_dict=SARASHINA_V2_INSTRUCTIONS,
+        max_seq_length=8192,
+    ),
+    name="sbintuitions/sarashina-embedding-v2-1b",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="1f3408afaa7b617e3445d891310a9c26dd0c68a5",
+    release_date="2025-07-30",
+    n_parameters=1_224_038_144,
+    memory_usage_mb=4669,
+    embed_dim=1792,
+    license="https://huggingface.co/sbintuitions/sarashina-embedding-v2-1b/blob/main/LICENSE",
+    max_tokens=8192,
+    reference="https://huggingface.co/sbintuitions/sarashina-embedding-v2-1b",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    adapted_from="sbintuitions/sarashina2.2-1b",
+    superseded_by=None,
+    training_datasets={"NQ", "MrTidyRetrieval"},
+    public_training_code=None,
+    public_training_data="https://huggingface.co/sbintuitions/sarashina-embedding-v1-1b",
+    citation=None,
+    contacts=["Sraym1217", "akiFQC", "lsz05"],
+)
+
+sbintuitions_sarashina_embedding_v1_1b = ModelMeta(
+    loader=sentence_transformers_loader,
+    name="sbintuitions/sarashina-embedding-v1-1b",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="d060fcd8984075071e7fad81baff035cbb3b6c7e",
+    release_date="2024-11-22",
+    n_parameters=1_224_038_144,
+    memory_usage_mb=4669,
+    embed_dim=1792,
+    license="https://huggingface.co/sbintuitions/sarashina-embedding-v1-1b/blob/main/LICENSE",
+    max_tokens=8192,
+    reference="https://huggingface.co/sbintuitions/sarashina-embedding-v1-1b",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    adapted_from="sbintuitions/sarashina2.1-1b",
+    superseded_by="sbintuitions/sarashina-embedding-v2-1b",
+    training_datasets={"NQ", "MrTidyRetrieval"},
+    public_training_code=None,
+    public_training_data="https://huggingface.co/sbintuitions/sarashina-embedding-v1-1b",
+    citation=None,
+    contacts=["akiFQC", "lsz05"],
+)
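The three new modules above only register model metadata; nothing else needs to change for the entries to become discoverable. A rough sketch of looking one up (this assumes the public `mteb.get_model_meta` helper whose module is listed in the RECORD section below; attribute names follow the ModelMeta fields shown in the hunks, and signatures may vary between 2.x releases):

    import mteb

    # Resolve two of the newly registered entries and print a few fields.
    meta = mteb.get_model_meta("cl-nagoya/ruri-v3-30m")
    print(meta.n_parameters, meta.embed_dim, meta.max_tokens)

    meta = mteb.get_model_meta("sbintuitions/sarashina-embedding-v2-1b")
    print(meta.use_instructions)  # True, prompts come from SARASHINA_V2_INSTRUCTIONS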
mteb/models/search_wrappers.py
CHANGED
@@ -250,7 +250,7 @@ class SearchEncoderWrapper:
 
         # get top-k values
         cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
-            torch.
+            torch.as_tensor(scores),
             min(
                 top_k + 1,
                 len(scores[1]) if len(scores) > 1 else len(scores[-1]),
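The fix passes the raw score matrix through torch.as_tensor before topk. A standalone illustration (the scores here are made up, not the wrapper's real inputs):

    import torch

    # as_tensor accepts a nested list or NumPy array and avoids a copy when it can;
    # topk then returns the largest values per row together with their indices.
    scores = [[0.1, 0.9, 0.3], [0.2, 0.8, 0.5]]
    values, indices = torch.topk(torch.as_tensor(scores), k=2, dim=1, largest=True, sorted=True)
    print(values)   # tensor([[0.9000, 0.3000], [0.8000, 0.5000]])
    print(indices)  # tensor([[1, 2], [1, 2]])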
mteb/tasks/classification/heb/__init__.py
CHANGED
@@ -1,6 +1,11 @@
 from .hebrew_sentiment_analysis import (
     HebrewSentimentAnalysis,
     HebrewSentimentAnalysisV2,
+    HebrewSentimentAnalysisV3,
 )
 
-__all__ = [
+__all__ = [
+    "HebrewSentimentAnalysis",
+    "HebrewSentimentAnalysisV2",
+    "HebrewSentimentAnalysisV3",
+]
mteb/tasks/classification/heb/hebrew_sentiment_analysis.py
CHANGED
@@ -9,7 +9,12 @@ class HebrewSentimentAnalysis(AbsTaskClassification):
             "path": "mteb/HebrewSentimentAnalysis",
             "revision": "03eb0996c8234e0d8cd7206bf4763815deda12ed",
         },
-        description=
+        description=(
+            "HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel’s president, Mr. Reuven Rivlin. "
+            "In October 2015, we used the open software application Netvizz (Rieder, 2013) to scrape all the comments to all of the president’s posts in the period of June – August 2014, "
+            "the first three months of Rivlin’s presidency.2 While the president’s posts aimed at reconciling tensions and called for tolerance and empathy, "
+            "the sentiment expressed in the comments to the president’s posts was polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his policy. "
+        ),
         reference="https://huggingface.co/datasets/hebrew_sentiment",
         type="Classification",
         category="t2c",
@@ -37,7 +42,7 @@ class HebrewSentimentAnalysis(AbsTaskClassification):
     year = {2018},
 }
 """,
-        superseded_by="HebrewSentimentAnalysis.
+        superseded_by="HebrewSentimentAnalysis.v3",
     )
 
 
@@ -49,7 +54,61 @@ class HebrewSentimentAnalysisV2(AbsTaskClassification):
             "revision": "7ecd049fc8ac0d6f0a0121c8ff9fe44ea5bd935b",
             "name": "morph",
         },
-        description=
+        description=(
+            "HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel’s president, Mr. Reuven Rivlin. "
+            "In October 2015, we used the open software application Netvizz (Rieder, 2013) to scrape all the comments to all of the president’s posts in the period of June – August 2014, "
+            "the first three months of Rivlin’s presidency.2 While the president’s posts aimed at reconciling tensions and called for tolerance and empathy, "
+            "the sentiment expressed in the comments to the president’s posts was polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his policy. "
+            "This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)"
+        ),
+        reference="https://huggingface.co/datasets/hebrew_sentiment",
+        type="Classification",
+        category="t2c",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["heb-Hebr"],
+        main_score="accuracy",
+        date=("2015-10-01", "2015-10-31"),
+        domains=["Reviews", "Written"],
+        task_subtypes=["Sentiment/Hate speech"],
+        license="mit",
+        annotations_creators="expert-annotated",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@inproceedings{amram-etal-2018-representations,
+  address = {Santa Fe, New Mexico, USA},
+  author = {Amram, Adam and Ben David, Anat and Tsarfaty, Reut},
+  booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
+  month = aug,
+  pages = {2242--2252},
+  publisher = {Association for Computational Linguistics},
+  title = {Representations and Architectures in Neural Sentiment Analysis for Morphologically Rich Languages: A Case Study from {M}odern {H}ebrew},
+  url = {https://www.aclweb.org/anthology/C18-1190},
+  year = {2018},
+}
+""",
+        adapted_from=["HebrewSentimentAnalysis"],
+        superseded_by="HebrewSentimentAnalysis.v3",
+    )
+
+
+class HebrewSentimentAnalysisV3(AbsTaskClassification):
+    label_column_name = "labels"
+    metadata = TaskMetadata(
+        name="HebrewSentimentAnalysis.v3",
+        dataset={
+            "path": "mteb/HebrewSentimentAnalysisV4",
+            "revision": "aa0b83c4b16cd28daf7c41ef3402e3ffe9c70c59",
+        },
+        description=(
+            "HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel’s president, Mr. Reuven Rivlin. "
+            "In October 2015, we used the open software application Netvizz (Rieder, 2013) to scrape all the comments to all of the president’s posts in the period of June – August 2014, "
+            "the first three months of Rivlin’s presidency.2 While the president’s posts aimed at reconciling tensions and called for tolerance and empathy, "
+            "the sentiment expressed in the comments to the president’s posts was polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his policy. "
+            "This version corrects texts (took pre-tokenized) [more details in this thread](https://huggingface.co/datasets/mteb/HebrewSentimentAnalysis/discussions/2). "
+            "This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)"
+        ),
         reference="https://huggingface.co/datasets/hebrew_sentiment",
         type="Classification",
         category="t2c",
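After this change, the corrected v3 task can be selected like any other task. A small sketch using the same `get_tasks` helper the benchmark definition above relies on:

    import mteb

    tasks = mteb.get_tasks(tasks=["HebrewSentimentAnalysis.v3"])
    print(tasks[0].metadata.main_score)  # "accuracy"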
{mteb-2.3.4.dist-info → mteb-2.3.6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.3.4
+Version: 2.3.6
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
{mteb-2.3.4.dist-info → mteb-2.3.6.dist-info}/RECORD
CHANGED
@@ -5,7 +5,7 @@ mteb/_helpful_enum.py,sha256=jh73N1jlcpg7RGz4bj8UpctiMNvqvHpp9wrB7SYEzIU,510
 mteb/_log_once.py,sha256=-tUKzxGQzf2LZSuQXi97oYFXMta1B6GEYXd7BPqssvY,1095
 mteb/_requires_package.py,sha256=eHg_TD9BVZRzNCcQQrUP17d8M1DF_vOd_tVx54AmAnM,3017
 mteb/_set_seed.py,sha256=HPlPRl__Pe6IG-4UgJqTfplcivJ_wA2kaClbXoHQedM,1178
-mteb/cache.py,sha256=
+mteb/cache.py,sha256=XiFuhjZ2C-o0LgP1YM8g9As_vigJCUNfTrOb9-EiFlM,20177
 mteb/deprecated_evaluator.py,sha256=t13Eluvm5ByVIOqgT7fqiVfLb8Ud3A4bbF2djRfs8iA,26901
 mteb/evaluate.py,sha256=B60CkqRHzkI-3zIfHyocp-YUeWrzeoOvX_RN5vSlGqE,19363
 mteb/filter_tasks.py,sha256=5XE1OYmgDDoJYnXwFf4ma_PIT_Lekzs420sQF_kpCiY,7240
@@ -55,8 +55,8 @@ mteb/benchmarks/__init__.py,sha256=MQEVeli-zLaJ7Xg0z7RhXQwsdmm7Ht_W2Ln0rZo1Szc,2
 mteb/benchmarks/_create_table.py,sha256=OAiR44ynJ2fMzoBmVITQtOTYQzxIu9KUdS_HzlBlAck,20195
 mteb/benchmarks/benchmark.py,sha256=70RlMyyg_wkWTlU_IbfLl-KaqRWXGCKTd8fWe9X-AQE,4173
 mteb/benchmarks/get_benchmark.py,sha256=-n_O-gitRKZi48gJKNgGuI36hsP7yLVSiwulnMHN7Gw,3935
-mteb/benchmarks/benchmarks/__init__.py,sha256=
-mteb/benchmarks/benchmarks/benchmarks.py,sha256=
+mteb/benchmarks/benchmarks/__init__.py,sha256=Ig5dSFunzI-F-OamruuKJVSstbG3xQNkXCxRY3Bj_Ck,2180
+mteb/benchmarks/benchmarks/benchmarks.py,sha256=qHHmJfisT75VRVoZfPcHhShCG0jY6vSWZEx-D01XxKU,94757
 mteb/benchmarks/benchmarks/rteb_benchmarks.py,sha256=QnCSrTTaBfcRlAQp2Nu81tgv1idMXqiM16Fp2zKJ5Ys,10607
 mteb/cli/__init__.py,sha256=v-csUr3eUZElIvrGB6QGtaIdndDfNWEe9oZchsGsJpg,64
 mteb/cli/_display_tasks.py,sha256=7A06dT9sSoTz6shyMvskPxuc5eHY_H7PGPlROzMP0yw,2196
@@ -252,6 +252,7 @@ mteb/descriptive_stats/Classification/HeadlineClassification.json,sha256=VfTqah7
 mteb/descriptive_stats/Classification/HeadlineClassification.v2.json,sha256=n-KiCmlKXb5QOzNG9QTdjwYR-cRV3Qvn96KOF2so7Cs,2110
 mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.json,sha256=j517xsonPntbr4k5Pa9ftGGIZqzaOLDvPqeGjWngdyI,1659
 mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v2.json,sha256=FaxgCr_lbS-ppsd_cEp_87kbCRCGqFUDMZv375XGvdk,1658
+mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json,sha256=75u2ZWek2BT8JQElPqBHvCqTEXbeTwlrXcyQC5NK-wU,1658
 mteb/descriptive_stats/Classification/HinDialectClassification.json,sha256=HK13-QJaWc0uC4wOqhZHKzT6i05_wJhURdUJ6Upq8OQ,4745
 mteb/descriptive_stats/Classification/HindiDiscourseClassification.json,sha256=4XMoJL46RHYKYKdndxD5lZ2b3hGjL2CArx3eRalO8eg,1049
 mteb/descriptive_stats/Classification/HindiDiscourseClassification.v2.json,sha256=MGz2Ntd2JxyHtlP3GhkhbMQ_yH2Eladd4vZSFOme2K8,2088
@@ -1426,7 +1427,7 @@ mteb/languages/programming_languages.py,sha256=zxAakT3OSUnAuTnQ34VyeFIECnNXMlleZ
 mteb/leaderboard/__init__.py,sha256=991roXmtRwEQysV-37hWEzWpkvPgMCGRqZTHR-hm2io,88
 mteb/leaderboard/app.py,sha256=29MxFLKEVT-roULHG5boHmsQVhld1rDGNS94r7MWlz8,33118
 mteb/leaderboard/benchmark_selector.py,sha256=uH66SI0iT1J4_fnebViWa83dQwhPi7toBv7PRL_epDw,7784
-mteb/leaderboard/figures.py,sha256=
+mteb/leaderboard/figures.py,sha256=mPO0go_23QEhAm1RJdLiBxPFCoUiA74_ztyl6yimc7k,7553
 mteb/leaderboard/table.py,sha256=6SnrYC5GcBlvVSO6vOk6ObuqtoveBLv3JUuXqdKueG8,8333
 mteb/leaderboard/text_segments.py,sha256=iMIkS04QQjPbT-SkU0x6fOcS8xRbUYevryu9HydipKM,6570
 mteb/models/__init__.py,sha256=ABTuoqiBjBtBWW3LYY7ItBHdylR6jWoy06HH0g6j6fU,910
@@ -1435,7 +1436,7 @@ mteb/models/get_model_meta.py,sha256=GeofphZ8wFtwAHYQipgQlZzxNIFAVFGzo_E2sMzjZTc
 mteb/models/instruct_wrapper.py,sha256=Ty4nfEvioycL_uATkhd0PGuyeB5Xc9xrRd6HOGgb-tc,9005
 mteb/models/model_meta.py,sha256=b-Nel9nX5bJk4cgJnqkBzEKyMY7uXvxlCBSxmmH1Ios,14769
 mteb/models/models_protocols.py,sha256=D2hYWn_UBGMaKtRwBx3u0B0ni6lHJjSzTxX21XFNwIc,8917
-mteb/models/search_wrappers.py,sha256=
+mteb/models/search_wrappers.py,sha256=zpCvxUVNQWekyC4Fiz7mvlI0VPdSrFq41A0GrCDvBK4,20331
 mteb/models/sentence_transformer_wrapper.py,sha256=n5CMsM6Lpg_CFHH0NkpJusMsaLUTt-L9vRmFINQ961k,12338
 mteb/models/cache_wrappers/__init__.py,sha256=1w1TnMwulWJSzNkLXjbh5MY3sqgHWc6vUntYn49i9X8,169
 mteb/models/cache_wrappers/cache_backend_protocol.py,sha256=TR7kD7KbN1J4piszIecpegtLZYGy7sRHZt3SDWlImKk,1665
@@ -1494,6 +1495,7 @@ mteb/models/model_implementations/jina_clip.py,sha256=CfiIxbhKspjQajNtObCfGPHOWP
 mteb/models/model_implementations/jina_models.py,sha256=HrHm2Io3g9gHwxU5icAaudy_E8rAVkAAIFSzVYWF-dM,34859
 mteb/models/model_implementations/kalm_models.py,sha256=FmW7Z5Qs6WYBLuKvql3u4IJW36kj4k-Ypah8qTBEBkg,59837
 mteb/models/model_implementations/kennethenevoldsen_models.py,sha256=DF-9nmsewYO9ikZ0kV81ujKGr7Ot36-9iPoxN7KX2mY,2993
+mteb/models/model_implementations/kowshik24_models.py,sha256=HoQpybjhquK2XSnawlq0aiSWFI5M7l6N4DNY4MQ-P10,976
 mteb/models/model_implementations/lens_models.py,sha256=fC7_NB1F8vBAlXD0p0-hALf6eZTPFJwpz57dy71OlwI,1696
 mteb/models/model_implementations/lgai_embedding_models.py,sha256=S83pbfkMH3YUNl4skusgbK-Rn-uLuScQVxgXwegR_N4,2333
 mteb/models/model_implementations/linq_models.py,sha256=EtvUyiNbjU-GJd1kS0Z0gBACkP2pFOjk0KfGMZz4K9Y,1872
@@ -1534,8 +1536,10 @@ mteb/models/model_implementations/rerankers_custom.py,sha256=ro73A9-hHudy3_qIMrh
 mteb/models/model_implementations/rerankers_monot5_based.py,sha256=rxVwzapNnHl4gCw79XVCaTXj3-wbToyj7XVL97tpAF4,34302
 mteb/models/model_implementations/richinfoai_models.py,sha256=llvYa0JUjyOOMbuTgOYoJ2qeqZ5rLHX1ZjZIYlYbdvA,989
 mteb/models/model_implementations/ru_sentence_models.py,sha256=GuZFwbzaooufvSMGNjIsL0DDLrqHjhdSsAQHHZo5H08,40480
+mteb/models/model_implementations/ruri_models.py,sha256=-BTYkZ8dEWZUbGqx3YB5yFSrzMwZtXX7sMUHzrlB8ws,10043
 mteb/models/model_implementations/salesforce_models.py,sha256=KslTK-IKeLvNG-vQir9k6swkaOgjk6eyozm_BOVgTpY,5160
 mteb/models/model_implementations/samilpwc_models.py,sha256=oMwKNwCxoH1jZgCy04oo2oVlBZWu253QMpnEEC6emz8,2021
+mteb/models/model_implementations/sarashina_embedding_models.py,sha256=TSmr2FEX79mJTA9mbEV3meEZYSelGv58Veiw__TTGFM,8415
 mteb/models/model_implementations/searchmap_models.py,sha256=XvVl99emIgnNUCxkTuFQXW6py2R8vgsArfpyHveCugw,1904
 mteb/models/model_implementations/seed_1_6_embedding_models.py,sha256=8J3htEddltyGTydIbnMUudgAV97FdD43-SQKaSA_Iuc,18534
 mteb/models/model_implementations/seed_models.py,sha256=SgK4kPVO6V33G3F1zSq06zSkWarPLEwBt1SWp4TUoVw,14142
@@ -1722,8 +1726,8 @@ mteb/tasks/classification/fra/french_book_reviews.py,sha256=Fsx8UznQVNDNUhcdsTeN
 mteb/tasks/classification/fra/movie_review_sentiment_classification.py,sha256=ov-fbReWP9T_RqhxFtS-gjNaZmbM9J8gQdBQyci5yqU,3290
 mteb/tasks/classification/guj/__init__.py,sha256=HZfimpBCywBLi5VGof_A9Ua6bqtMUoWGRhM1eqAEWKE,186
 mteb/tasks/classification/guj/gujarati_news_classification.py,sha256=VEdbzqlw8b8N8R3TQc275iiCxqGLAMAM8Nf_N7FnUGA,2303
-mteb/tasks/classification/heb/__init__.py,sha256=
-mteb/tasks/classification/heb/hebrew_sentiment_analysis.py,sha256=
+mteb/tasks/classification/heb/__init__.py,sha256=xQNtDxjUsCXQhA_ZYO_4kpBtHWQSBhWhQwAuWcTo6GE,246
+mteb/tasks/classification/heb/hebrew_sentiment_analysis.py,sha256=2wmKwq4Z4YKTG3QieuxmUE56ofiSOV63kV-jh8Zomh4,7281
 mteb/tasks/classification/hin/__init__.py,sha256=KdScMtYYmjsali0InoHP0PKQ8yCbaD-j2tLqc7_lhlo,356
 mteb/tasks/classification/hin/hindi_discourse_classification.py,sha256=HXLJZFEpnI7UjwA-NuadhrCplqaLiSb8Npla_6oXC48,3717
 mteb/tasks/classification/hin/sentiment_analysis_hindi.py,sha256=YIeEuNa2UTr8Jwh_wx15broRuD3NHNansGq7Bl9Vjl0,3101
@@ -2569,9 +2573,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
 mteb/types/_result.py,sha256=CRAUc5IvqI3_9SyXDwv-PWLCXwXdZem9RePeYESRtuw,996
 mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
 mteb/types/statistics.py,sha256=YwJsxTf1eaCI_RE-J37a-gK5wDeGAsmkeZKoZCFihSo,3755
-mteb-2.3.
-mteb-2.3.
-mteb-2.3.
-mteb-2.3.
-mteb-2.3.
-mteb-2.3.
+mteb-2.3.6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mteb-2.3.6.dist-info/METADATA,sha256=urz0_67bNhVt17rvN3pZdvMFt_mvxI7MFvamWkNoNjM,13923
+mteb-2.3.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mteb-2.3.6.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
+mteb-2.3.6.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
+mteb-2.3.6.dist-info/RECORD,,
{mteb-2.3.4.dist-info → mteb-2.3.6.dist-info}/WHEEL
File without changes
{mteb-2.3.4.dist-info → mteb-2.3.6.dist-info}/entry_points.txt
File without changes
{mteb-2.3.4.dist-info → mteb-2.3.6.dist-info}/licenses/LICENSE
File without changes
{mteb-2.3.4.dist-info → mteb-2.3.6.dist-info}/top_level.txt
File without changes