mteb 2.3.4__py3-none-any.whl → 2.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +57 -0
- mteb/cache.py +5 -1
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/models/model_implementations/ruri_models.py +312 -0
- mteb/models/search_wrappers.py +1 -1
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -3
- {mteb-2.3.4.dist-info → mteb-2.3.5.dist-info}/METADATA +1 -1
- {mteb-2.3.4.dist-info → mteb-2.3.5.dist-info}/RECORD +14 -12
- {mteb-2.3.4.dist-info → mteb-2.3.5.dist-info}/WHEEL +0 -0
- {mteb-2.3.4.dist-info → mteb-2.3.5.dist-info}/entry_points.txt +0 -0
- {mteb-2.3.4.dist-info → mteb-2.3.5.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.3.4.dist-info → mteb-2.3.5.dist-info}/top_level.txt +0 -0
|
@@ -12,6 +12,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
|
|
|
12
12
|
FA_MTEB_2,
|
|
13
13
|
HUME,
|
|
14
14
|
JINA_VDR,
|
|
15
|
+
JMTEB_V2,
|
|
15
16
|
LONG_EMBED,
|
|
16
17
|
MIEB_ENG,
|
|
17
18
|
MIEB_IMG,
|
|
@@ -75,6 +76,7 @@ __all__ = [
|
|
|
75
76
|
"HUME",
|
|
76
77
|
"HUME",
|
|
77
78
|
"JINA_VDR",
|
|
79
|
+
"JMTEB_V2",
|
|
78
80
|
"LONG_EMBED",
|
|
79
81
|
"MIEB_ENG",
|
|
80
82
|
"MIEB_IMG",
|
|
@@ -2562,3 +2562,60 @@ HUME = HUMEBenchmark(
|
|
|
2562
2562
|
citation=None,
|
|
2563
2563
|
contacts=["AdnanElAssadi56", "KennethEnevoldsen", "isaac-chung", "Samoed"],
|
|
2564
2564
|
)
|
|
2565
|
+
|
|
2566
|
+
JMTEB_V2 = Benchmark(
|
|
2567
|
+
name="JMTEB(v2)",
|
|
2568
|
+
display_name="Japanese",
|
|
2569
|
+
icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg",
|
|
2570
|
+
tasks=get_tasks(
|
|
2571
|
+
languages=["jpn"],
|
|
2572
|
+
tasks=[
|
|
2573
|
+
# Clustering (3)
|
|
2574
|
+
"LivedoorNewsClustering.v2",
|
|
2575
|
+
"MewsC16JaClustering",
|
|
2576
|
+
"SIB200ClusteringS2S",
|
|
2577
|
+
# Classification (7)
|
|
2578
|
+
"AmazonReviewsClassification",
|
|
2579
|
+
"AmazonCounterfactualClassification",
|
|
2580
|
+
"MassiveIntentClassification",
|
|
2581
|
+
"MassiveScenarioClassification",
|
|
2582
|
+
"JapaneseSentimentClassification",
|
|
2583
|
+
"SIB200Classification",
|
|
2584
|
+
"WRIMEClassification",
|
|
2585
|
+
# STS (2)
|
|
2586
|
+
"JSTS",
|
|
2587
|
+
"JSICK",
|
|
2588
|
+
# Retrieval (11)
|
|
2589
|
+
"JaqketRetrieval",
|
|
2590
|
+
"MrTidyRetrieval",
|
|
2591
|
+
"JaGovFaqsRetrieval",
|
|
2592
|
+
"NLPJournalTitleAbsRetrieval.V2",
|
|
2593
|
+
"NLPJournalTitleIntroRetrieval.V2",
|
|
2594
|
+
"NLPJournalAbsIntroRetrieval.V2",
|
|
2595
|
+
"NLPJournalAbsArticleRetrieval.V2",
|
|
2596
|
+
"JaCWIRRetrieval",
|
|
2597
|
+
"MIRACLRetrieval",
|
|
2598
|
+
"MintakaRetrieval",
|
|
2599
|
+
"MultiLongDocRetrieval",
|
|
2600
|
+
# Reranking (5)
|
|
2601
|
+
"ESCIReranking",
|
|
2602
|
+
"JQaRAReranking",
|
|
2603
|
+
"JaCWIRReranking",
|
|
2604
|
+
"MIRACLReranking",
|
|
2605
|
+
"MultiLongDocReranking",
|
|
2606
|
+
],
|
|
2607
|
+
),
|
|
2608
|
+
description="JMTEB is a benchmark for evaluating Japanese text embedding models. In v2, we have extended the benchmark to 28 datasets, enabling more comprehensive evaluation compared with v1 (MTEB(jpn, v1)).",
|
|
2609
|
+
reference="https://github.com/sbintuitions/JMTEB",
|
|
2610
|
+
citation=r"""
|
|
2611
|
+
@article{li2025jmteb,
|
|
2612
|
+
author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan and Fukuchi, Akihiko and Shibata, Tomohide and Kawahara, Daisuke},
|
|
2613
|
+
issue = {3},
|
|
2614
|
+
journal = {Vol.2025-NL-265,No.3,1-15},
|
|
2615
|
+
month = {sep},
|
|
2616
|
+
title = {{JMTEB and JMTEB-lite: Japanese Massive Text Embedding Benchmark and Its Lightweight Version}},
|
|
2617
|
+
year = {2025},
|
|
2618
|
+
}
|
|
2619
|
+
""",
|
|
2620
|
+
contacts=["lsz05"],
|
|
2621
|
+
)
|
mteb/cache.py
CHANGED
|
@@ -243,7 +243,11 @@ class ResultCache:
|
|
|
243
243
|
f"No results repository found in {results_directory}, cloning it from {remote}"
|
|
244
244
|
)
|
|
245
245
|
|
|
246
|
-
subprocess.run(
|
|
246
|
+
subprocess.run(
|
|
247
|
+
["git", "clone", "--depth", "1", remote, "remote"],
|
|
248
|
+
cwd=self.cache_path,
|
|
249
|
+
check=True,
|
|
250
|
+
)
|
|
247
251
|
|
|
248
252
|
return results_directory
|
|
249
253
|
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 1085,
|
|
4
|
+
"number_texts_intersect_with_train": 0,
|
|
5
|
+
"text_statistics": {
|
|
6
|
+
"total_text_length": 115359,
|
|
7
|
+
"min_text_length": 8,
|
|
8
|
+
"average_text_length": 106.32165898617511,
|
|
9
|
+
"max_text_length": 2722,
|
|
10
|
+
"unique_texts": 1085
|
|
11
|
+
},
|
|
12
|
+
"image_statistics": null,
|
|
13
|
+
"label_statistics": {
|
|
14
|
+
"min_labels_per_text": 1,
|
|
15
|
+
"average_label_per_text": 1.0,
|
|
16
|
+
"max_labels_per_text": 1,
|
|
17
|
+
"unique_labels": 3,
|
|
18
|
+
"labels": {
|
|
19
|
+
"0": {
|
|
20
|
+
"count": 868
|
|
21
|
+
},
|
|
22
|
+
"1": {
|
|
23
|
+
"count": 190
|
|
24
|
+
},
|
|
25
|
+
"2": {
|
|
26
|
+
"count": 27
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
"train": {
|
|
32
|
+
"num_samples": 7176,
|
|
33
|
+
"number_texts_intersect_with_train": null,
|
|
34
|
+
"text_statistics": {
|
|
35
|
+
"total_text_length": 830248,
|
|
36
|
+
"min_text_length": 5,
|
|
37
|
+
"average_text_length": 115.69788182831661,
|
|
38
|
+
"max_text_length": 4759,
|
|
39
|
+
"unique_texts": 7176
|
|
40
|
+
},
|
|
41
|
+
"image_statistics": null,
|
|
42
|
+
"label_statistics": {
|
|
43
|
+
"min_labels_per_text": 1,
|
|
44
|
+
"average_label_per_text": 1.0,
|
|
45
|
+
"max_labels_per_text": 1,
|
|
46
|
+
"unique_labels": 3,
|
|
47
|
+
"labels": {
|
|
48
|
+
"0": {
|
|
49
|
+
"count": 4933
|
|
50
|
+
},
|
|
51
|
+
"1": {
|
|
52
|
+
"count": 2047
|
|
53
|
+
},
|
|
54
|
+
"2": {
|
|
55
|
+
"count": 196
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
}
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
from mteb.models.model_meta import ModelMeta
|
|
2
|
+
from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
|
|
3
|
+
|
|
4
|
+
RURI_V3_PROMPTS = {
|
|
5
|
+
"Retrieval-query": "検索クエリ: ",
|
|
6
|
+
"Retrieval-document": "検索文書: ",
|
|
7
|
+
"Reranking-query": "検索クエリ: ",
|
|
8
|
+
"Reranking-document": "検索文書: ",
|
|
9
|
+
"Classification": "トピック: ",
|
|
10
|
+
"Clustering": "トピック: ",
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
RURI_V1_V2_PROMPTS = {
|
|
14
|
+
"query": "クエリ: ",
|
|
15
|
+
"document": "文章: ",
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
RURI_CITATION = r"""@misc{Ruri,
|
|
20
|
+
title={{Ruri: Japanese General Text Embeddings}},
|
|
21
|
+
author={Hayato Tsukagoshi and Ryohei Sasano},
|
|
22
|
+
year={2024},
|
|
23
|
+
eprint={2409.07737},
|
|
24
|
+
archivePrefix={arXiv},
|
|
25
|
+
primaryClass={cs.CL},
|
|
26
|
+
url={https://arxiv.org/abs/2409.07737},
|
|
27
|
+
}"""
|
|
28
|
+
|
|
29
|
+
cl_nagoya_ruri_v3_30m = ModelMeta(
|
|
30
|
+
loader=sentence_transformers_loader,
|
|
31
|
+
loader_kwargs=dict(
|
|
32
|
+
model_prompts=RURI_V3_PROMPTS,
|
|
33
|
+
),
|
|
34
|
+
name="cl-nagoya/ruri-v3-30m",
|
|
35
|
+
languages=["jpn-Jpan"],
|
|
36
|
+
open_weights=True,
|
|
37
|
+
revision="24899e5de370b56d179604a007c0d727bf144504",
|
|
38
|
+
release_date="2025-04-07",
|
|
39
|
+
n_parameters=36_705_536,
|
|
40
|
+
memory_usage_mb=140,
|
|
41
|
+
embed_dim=256,
|
|
42
|
+
license="apache-2.0",
|
|
43
|
+
max_tokens=8192,
|
|
44
|
+
reference="https://huggingface.co/cl-nagoya/ruri-v3-30m",
|
|
45
|
+
similarity_fn_name="cosine",
|
|
46
|
+
framework=["PyTorch", "Sentence Transformers"],
|
|
47
|
+
use_instructions=True,
|
|
48
|
+
superseded_by=None,
|
|
49
|
+
training_datasets={
|
|
50
|
+
"cl-nagoya/ruri-v3-dataset-ft",
|
|
51
|
+
},
|
|
52
|
+
adapted_from="sbintuitions/modernbert-ja-30m",
|
|
53
|
+
public_training_code=None,
|
|
54
|
+
public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-v3-dataset-ft",
|
|
55
|
+
citation=RURI_CITATION,
|
|
56
|
+
contacts=["hpprc"],
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
cl_nagoya_ruri_v3_70m = ModelMeta(
|
|
60
|
+
loader=sentence_transformers_loader,
|
|
61
|
+
loader_kwargs=dict(
|
|
62
|
+
model_prompts=RURI_V3_PROMPTS,
|
|
63
|
+
),
|
|
64
|
+
name="cl-nagoya/ruri-v3-70m",
|
|
65
|
+
languages=["jpn-Jpan"],
|
|
66
|
+
open_weights=True,
|
|
67
|
+
revision="07a8b0aba47d29d2ca21f89b915c1efe2c23d1cc",
|
|
68
|
+
release_date="2025-04-09",
|
|
69
|
+
n_parameters=36_705_536,
|
|
70
|
+
memory_usage_mb=140,
|
|
71
|
+
embed_dim=256,
|
|
72
|
+
license="apache-2.0",
|
|
73
|
+
max_tokens=8192,
|
|
74
|
+
reference="https://huggingface.co/cl-nagoya/ruri-v3-70m",
|
|
75
|
+
similarity_fn_name="cosine",
|
|
76
|
+
framework=["PyTorch", "Sentence Transformers"],
|
|
77
|
+
use_instructions=True,
|
|
78
|
+
superseded_by=None,
|
|
79
|
+
training_datasets={"MrTidyRetrieval", "MIRACLRetrieval"},
|
|
80
|
+
adapted_from="sbintuitions/modernbert-ja-70m",
|
|
81
|
+
public_training_code=None,
|
|
82
|
+
public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-v3-dataset-ft",
|
|
83
|
+
citation=RURI_CITATION,
|
|
84
|
+
contacts=["hpprc"],
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
cl_nagoya_ruri_v3_130m = ModelMeta(
|
|
88
|
+
loader=sentence_transformers_loader,
|
|
89
|
+
loader_kwargs=dict(
|
|
90
|
+
model_prompts=RURI_V3_PROMPTS,
|
|
91
|
+
),
|
|
92
|
+
name="cl-nagoya/ruri-v3-130m",
|
|
93
|
+
languages=["jpn-Jpan"],
|
|
94
|
+
open_weights=True,
|
|
95
|
+
revision="e3114c6ee10dbab8b4b235fbc6dcf9dd4d5ac1a6",
|
|
96
|
+
release_date="2025-04-09",
|
|
97
|
+
n_parameters=132_140_544,
|
|
98
|
+
memory_usage_mb=504,
|
|
99
|
+
embed_dim=512,
|
|
100
|
+
license="apache-2.0",
|
|
101
|
+
max_tokens=8192,
|
|
102
|
+
reference="https://huggingface.co/cl-nagoya/ruri-v3-130m",
|
|
103
|
+
similarity_fn_name="cosine",
|
|
104
|
+
framework=["PyTorch", "Sentence Transformers"],
|
|
105
|
+
use_instructions=True,
|
|
106
|
+
superseded_by=None,
|
|
107
|
+
training_datasets={"MrTidyRetrieval", "MIRACLRetrieval"},
|
|
108
|
+
adapted_from="sbintuitions/modernbert-ja-130m",
|
|
109
|
+
public_training_code=None,
|
|
110
|
+
public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-v3-dataset-ft",
|
|
111
|
+
citation=RURI_CITATION,
|
|
112
|
+
contacts=["hpprc"],
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
cl_nagoya_ruri_v3_310m = ModelMeta(
|
|
116
|
+
loader=sentence_transformers_loader,
|
|
117
|
+
loader_kwargs=dict(
|
|
118
|
+
model_prompts=RURI_V3_PROMPTS,
|
|
119
|
+
),
|
|
120
|
+
name="cl-nagoya/ruri-v3-310m",
|
|
121
|
+
languages=["jpn-Jpan"],
|
|
122
|
+
open_weights=True,
|
|
123
|
+
revision="18b60fb8c2b9df296fb4212bb7d23ef94e579cd3",
|
|
124
|
+
release_date="2025-04-09",
|
|
125
|
+
n_parameters=314_611_968,
|
|
126
|
+
memory_usage_mb=1200,
|
|
127
|
+
embed_dim=768,
|
|
128
|
+
license="apache-2.0",
|
|
129
|
+
max_tokens=8192,
|
|
130
|
+
reference="https://huggingface.co/cl-nagoya/ruri-v3-310m",
|
|
131
|
+
similarity_fn_name="cosine",
|
|
132
|
+
framework=["PyTorch", "Sentence Transformers"],
|
|
133
|
+
use_instructions=True,
|
|
134
|
+
superseded_by=None,
|
|
135
|
+
training_datasets={"MrTidyRetrieval", "MIRACLRetrieval"},
|
|
136
|
+
adapted_from="sbintuitions/modernbert-ja-310m",
|
|
137
|
+
public_training_code=None,
|
|
138
|
+
public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-v3-dataset-ft",
|
|
139
|
+
citation=RURI_CITATION,
|
|
140
|
+
contacts=["hpprc"],
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
cl_nagoya_ruri_small_v2 = ModelMeta(
|
|
144
|
+
loader=sentence_transformers_loader,
|
|
145
|
+
loader_kwargs=dict(
|
|
146
|
+
model_prompts=RURI_V1_V2_PROMPTS,
|
|
147
|
+
trust_remote_code=True,
|
|
148
|
+
),
|
|
149
|
+
name="cl-nagoya/ruri-small-v2",
|
|
150
|
+
languages=["jpn-Jpan"],
|
|
151
|
+
open_weights=True,
|
|
152
|
+
revision="db18646e673b713cd0518a5bb0fefdce21e77cd9",
|
|
153
|
+
release_date="2024-12-05",
|
|
154
|
+
n_parameters=68_087_808,
|
|
155
|
+
memory_usage_mb=260,
|
|
156
|
+
embed_dim=768,
|
|
157
|
+
license="apache-2.0",
|
|
158
|
+
max_tokens=512,
|
|
159
|
+
reference="https://huggingface.co/cl-nagoya/ruri-small-v2",
|
|
160
|
+
similarity_fn_name="cosine",
|
|
161
|
+
framework=["PyTorch", "Sentence Transformers"],
|
|
162
|
+
use_instructions=True,
|
|
163
|
+
adapted_from="line-corporation/line-distilbert-base-japanese",
|
|
164
|
+
superseded_by=None,
|
|
165
|
+
training_datasets={"MrTidyRetrieval", "MIRACLRetrieval"},
|
|
166
|
+
public_training_code=None,
|
|
167
|
+
public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-v2-ft",
|
|
168
|
+
citation=RURI_CITATION,
|
|
169
|
+
contacts=["hpprc"],
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
cl_nagoya_ruri_base_v2 = ModelMeta(
|
|
173
|
+
loader=sentence_transformers_loader,
|
|
174
|
+
loader_kwargs=dict(
|
|
175
|
+
model_prompts=RURI_V1_V2_PROMPTS,
|
|
176
|
+
),
|
|
177
|
+
name="cl-nagoya/ruri-base-v2",
|
|
178
|
+
languages=["jpn-Jpan"],
|
|
179
|
+
open_weights=True,
|
|
180
|
+
revision="8ce03882903668a01c83ca3b8111ac025a3bc734",
|
|
181
|
+
release_date="2024-12-05",
|
|
182
|
+
n_parameters=111_207_168,
|
|
183
|
+
memory_usage_mb=424,
|
|
184
|
+
embed_dim=768,
|
|
185
|
+
license="apache-2.0",
|
|
186
|
+
max_tokens=512,
|
|
187
|
+
reference="https://huggingface.co/cl-nagoya/ruri-base-v2",
|
|
188
|
+
similarity_fn_name="cosine",
|
|
189
|
+
framework=["PyTorch", "Sentence Transformers"],
|
|
190
|
+
use_instructions=True,
|
|
191
|
+
adapted_from="tohoku-nlp/bert-base-japanese-v3",
|
|
192
|
+
superseded_by=None,
|
|
193
|
+
training_datasets=None,
|
|
194
|
+
public_training_code=None,
|
|
195
|
+
public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-v2-ft",
|
|
196
|
+
citation=RURI_CITATION,
|
|
197
|
+
contacts=["hpprc"],
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
cl_nagoya_ruri_large_v2 = ModelMeta(
|
|
201
|
+
loader=sentence_transformers_loader,
|
|
202
|
+
loader_kwargs=dict(
|
|
203
|
+
model_prompts=RURI_V1_V2_PROMPTS,
|
|
204
|
+
),
|
|
205
|
+
name="cl-nagoya/ruri-large-v2",
|
|
206
|
+
languages=["jpn-Jpan"],
|
|
207
|
+
open_weights=True,
|
|
208
|
+
revision="42898ef34a5574977380ebf0dfd28cbfbd36438b",
|
|
209
|
+
release_date="2024-12-06",
|
|
210
|
+
n_parameters=337_441_792,
|
|
211
|
+
memory_usage_mb=1287,
|
|
212
|
+
embed_dim=1024,
|
|
213
|
+
license="apache-2.0",
|
|
214
|
+
max_tokens=512,
|
|
215
|
+
reference="https://huggingface.co/cl-nagoya/ruri-large-v2",
|
|
216
|
+
similarity_fn_name="cosine",
|
|
217
|
+
framework=["PyTorch", "Sentence Transformers"],
|
|
218
|
+
use_instructions=True,
|
|
219
|
+
adapted_from="tohoku-nlp/bert-large-japanese-v2",
|
|
220
|
+
superseded_by=None,
|
|
221
|
+
training_datasets=None,
|
|
222
|
+
public_training_code=None,
|
|
223
|
+
public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-v2-ft",
|
|
224
|
+
citation=RURI_CITATION,
|
|
225
|
+
contacts=["hpprc"],
|
|
226
|
+
)
|
|
227
|
+
|
|
228
|
+
cl_nagoya_ruri_small_v1 = ModelMeta(
|
|
229
|
+
loader=sentence_transformers_loader,
|
|
230
|
+
loader_kwargs=dict(
|
|
231
|
+
model_prompts=RURI_V1_V2_PROMPTS,
|
|
232
|
+
trust_remote_code=True,
|
|
233
|
+
),
|
|
234
|
+
name="cl-nagoya/ruri-small",
|
|
235
|
+
languages=["jpn-Jpan"],
|
|
236
|
+
open_weights=True,
|
|
237
|
+
revision="bc56ce90cd7a979f6eb199fc52dfe700bfd94bc3",
|
|
238
|
+
release_date="2024-08-28",
|
|
239
|
+
n_parameters=68_087_808,
|
|
240
|
+
memory_usage_mb=130,
|
|
241
|
+
embed_dim=768,
|
|
242
|
+
license="apache-2.0",
|
|
243
|
+
max_tokens=512,
|
|
244
|
+
reference="https://huggingface.co/cl-nagoya/ruri-small",
|
|
245
|
+
similarity_fn_name="cosine",
|
|
246
|
+
framework=["PyTorch", "Sentence Transformers"],
|
|
247
|
+
use_instructions=True,
|
|
248
|
+
adapted_from="line-corporation/line-distilbert-base-japanese",
|
|
249
|
+
superseded_by="cl-nagoya/ruri-small-v2",
|
|
250
|
+
training_datasets=None,
|
|
251
|
+
public_training_code=None,
|
|
252
|
+
public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-ft",
|
|
253
|
+
citation=RURI_CITATION,
|
|
254
|
+
contacts=["hpprc"],
|
|
255
|
+
)
|
|
256
|
+
|
|
257
|
+
cl_nagoya_ruri_base_v1 = ModelMeta(
|
|
258
|
+
loader=sentence_transformers_loader,
|
|
259
|
+
loader_kwargs=dict(
|
|
260
|
+
model_prompts=RURI_V1_V2_PROMPTS,
|
|
261
|
+
),
|
|
262
|
+
name="cl-nagoya/ruri-base",
|
|
263
|
+
languages=["jpn-Jpan"],
|
|
264
|
+
open_weights=True,
|
|
265
|
+
revision="1ae40b8b6c78518a499425086bab8fc16c2e4b0e",
|
|
266
|
+
release_date="2024-08-28",
|
|
267
|
+
n_parameters=111_207_168,
|
|
268
|
+
memory_usage_mb=212,
|
|
269
|
+
embed_dim=768,
|
|
270
|
+
license="apache-2.0",
|
|
271
|
+
max_tokens=512,
|
|
272
|
+
reference="https://huggingface.co/cl-nagoya/ruri-base",
|
|
273
|
+
similarity_fn_name="cosine",
|
|
274
|
+
framework=["PyTorch", "Sentence Transformers"],
|
|
275
|
+
use_instructions=True,
|
|
276
|
+
adapted_from="tohoku-nlp/bert-base-japanese-v3",
|
|
277
|
+
superseded_by="cl-nagoya/ruri-base-v2",
|
|
278
|
+
training_datasets=None,
|
|
279
|
+
public_training_code=None,
|
|
280
|
+
public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-ft",
|
|
281
|
+
citation=RURI_CITATION,
|
|
282
|
+
contacts=["hpprc"],
|
|
283
|
+
)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
cl_nagoya_ruri_large_v1 = ModelMeta(
|
|
287
|
+
loader=sentence_transformers_loader,
|
|
288
|
+
loader_kwargs=dict(
|
|
289
|
+
model_prompts=RURI_V1_V2_PROMPTS,
|
|
290
|
+
),
|
|
291
|
+
name="cl-nagoya/ruri-large",
|
|
292
|
+
languages=["jpn-Jpan"],
|
|
293
|
+
open_weights=True,
|
|
294
|
+
revision="a011c39b13e8bc137ee13c6bc82191ece46c414c",
|
|
295
|
+
release_date="2024-08-28",
|
|
296
|
+
n_parameters=337_441_792,
|
|
297
|
+
memory_usage_mb=644,
|
|
298
|
+
embed_dim=1024,
|
|
299
|
+
license="apache-2.0",
|
|
300
|
+
max_tokens=512,
|
|
301
|
+
reference="https://huggingface.co/cl-nagoya/ruri-large",
|
|
302
|
+
similarity_fn_name="cosine",
|
|
303
|
+
framework=["PyTorch", "Sentence Transformers"],
|
|
304
|
+
use_instructions=True,
|
|
305
|
+
adapted_from="tohoku-nlp/bert-large-japanese-v2",
|
|
306
|
+
superseded_by="cl-nagoya/ruri-large-v2",
|
|
307
|
+
training_datasets=None,
|
|
308
|
+
public_training_code=None,
|
|
309
|
+
public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-ft",
|
|
310
|
+
citation=RURI_CITATION,
|
|
311
|
+
contacts=["hpprc"],
|
|
312
|
+
)
|
mteb/models/search_wrappers.py
CHANGED
|
@@ -250,7 +250,7 @@ class SearchEncoderWrapper:
|
|
|
250
250
|
|
|
251
251
|
# get top-k values
|
|
252
252
|
cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
|
|
253
|
-
torch.
|
|
253
|
+
torch.as_tensor(scores),
|
|
254
254
|
min(
|
|
255
255
|
top_k + 1,
|
|
256
256
|
len(scores[1]) if len(scores) > 1 else len(scores[-1]),
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
from .hebrew_sentiment_analysis import (
|
|
2
2
|
HebrewSentimentAnalysis,
|
|
3
3
|
HebrewSentimentAnalysisV2,
|
|
4
|
+
HebrewSentimentAnalysisV3,
|
|
4
5
|
)
|
|
5
6
|
|
|
6
|
-
__all__ = [
|
|
7
|
+
__all__ = [
|
|
8
|
+
"HebrewSentimentAnalysis",
|
|
9
|
+
"HebrewSentimentAnalysisV2",
|
|
10
|
+
"HebrewSentimentAnalysisV3",
|
|
11
|
+
]
|
|
@@ -9,7 +9,12 @@ class HebrewSentimentAnalysis(AbsTaskClassification):
|
|
|
9
9
|
"path": "mteb/HebrewSentimentAnalysis",
|
|
10
10
|
"revision": "03eb0996c8234e0d8cd7206bf4763815deda12ed",
|
|
11
11
|
},
|
|
12
|
-
description=
|
|
12
|
+
description=(
|
|
13
|
+
"HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel’s president, Mr. Reuven Rivlin. "
|
|
14
|
+
"In October 2015, we used the open software application Netvizz (Rieder, 2013) to scrape all the comments to all of the president’s posts in the period of June – August 2014, "
|
|
15
|
+
"the first three months of Rivlin’s presidency.2 While the president’s posts aimed at reconciling tensions and called for tolerance and empathy, "
|
|
16
|
+
"the sentiment expressed in the comments to the president’s posts was polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his policy. "
|
|
17
|
+
),
|
|
13
18
|
reference="https://huggingface.co/datasets/hebrew_sentiment",
|
|
14
19
|
type="Classification",
|
|
15
20
|
category="t2c",
|
|
@@ -37,7 +42,7 @@ class HebrewSentimentAnalysis(AbsTaskClassification):
|
|
|
37
42
|
year = {2018},
|
|
38
43
|
}
|
|
39
44
|
""",
|
|
40
|
-
superseded_by="HebrewSentimentAnalysis.v2",
|
|
45
|
+
superseded_by="HebrewSentimentAnalysis.v3",
|
|
41
46
|
)
|
|
42
47
|
|
|
43
48
|
|
|
@@ -49,7 +54,61 @@ class HebrewSentimentAnalysisV2(AbsTaskClassification):
|
|
|
49
54
|
"revision": "7ecd049fc8ac0d6f0a0121c8ff9fe44ea5bd935b",
|
|
50
55
|
"name": "morph",
|
|
51
56
|
},
|
|
52
|
-
description=
|
|
57
|
+
description=(
|
|
58
|
+
"HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel’s president, Mr. Reuven Rivlin. "
|
|
59
|
+
"In October 2015, we used the open software application Netvizz (Rieder, 2013) to scrape all the comments to all of the president’s posts in the period of June – August 2014, "
|
|
60
|
+
"the first three months of Rivlin’s presidency.2 While the president’s posts aimed at reconciling tensions and called for tolerance and empathy, "
|
|
61
|
+
"the sentiment expressed in the comments to the president’s posts was polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his policy. "
|
|
62
|
+
"This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)"
|
|
63
|
+
),
|
|
64
|
+
reference="https://huggingface.co/datasets/hebrew_sentiment",
|
|
65
|
+
type="Classification",
|
|
66
|
+
category="t2c",
|
|
67
|
+
modalities=["text"],
|
|
68
|
+
eval_splits=["test"],
|
|
69
|
+
eval_langs=["heb-Hebr"],
|
|
70
|
+
main_score="accuracy",
|
|
71
|
+
date=("2015-10-01", "2015-10-31"),
|
|
72
|
+
domains=["Reviews", "Written"],
|
|
73
|
+
task_subtypes=["Sentiment/Hate speech"],
|
|
74
|
+
license="mit",
|
|
75
|
+
annotations_creators="expert-annotated",
|
|
76
|
+
dialect=[],
|
|
77
|
+
sample_creation="found",
|
|
78
|
+
bibtex_citation=r"""
|
|
79
|
+
@inproceedings{amram-etal-2018-representations,
|
|
80
|
+
address = {Santa Fe, New Mexico, USA},
|
|
81
|
+
author = {Amram, Adam and Ben David, Anat and Tsarfaty, Reut},
|
|
82
|
+
booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
|
|
83
|
+
month = aug,
|
|
84
|
+
pages = {2242--2252},
|
|
85
|
+
publisher = {Association for Computational Linguistics},
|
|
86
|
+
title = {Representations and Architectures in Neural Sentiment Analysis for Morphologically Rich Languages: A Case Study from {M}odern {H}ebrew},
|
|
87
|
+
url = {https://www.aclweb.org/anthology/C18-1190},
|
|
88
|
+
year = {2018},
|
|
89
|
+
}
|
|
90
|
+
""",
|
|
91
|
+
adapted_from=["HebrewSentimentAnalysis"],
|
|
92
|
+
superseded_by="HebrewSentimentAnalysis.v3",
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class HebrewSentimentAnalysisV3(AbsTaskClassification):
|
|
97
|
+
label_column_name = "labels"
|
|
98
|
+
metadata = TaskMetadata(
|
|
99
|
+
name="HebrewSentimentAnalysis.v3",
|
|
100
|
+
dataset={
|
|
101
|
+
"path": "mteb/HebrewSentimentAnalysisV4",
|
|
102
|
+
"revision": "aa0b83c4b16cd28daf7c41ef3402e3ffe9c70c59",
|
|
103
|
+
},
|
|
104
|
+
description=(
|
|
105
|
+
"HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel’s president, Mr. Reuven Rivlin. "
|
|
106
|
+
"In October 2015, we used the open software application Netvizz (Rieder, 2013) to scrape all the comments to all of the president’s posts in the period of June – August 2014, "
|
|
107
|
+
"the first three months of Rivlin’s presidency.2 While the president’s posts aimed at reconciling tensions and called for tolerance and empathy, "
|
|
108
|
+
"the sentiment expressed in the comments to the president’s posts was polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his policy. "
|
|
109
|
+
"This version corrects texts (took pre-tokenized) [more details in this thread](https://huggingface.co/datasets/mteb/HebrewSentimentAnalysis/discussions/2). "
|
|
110
|
+
"This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)"
|
|
111
|
+
),
|
|
53
112
|
reference="https://huggingface.co/datasets/hebrew_sentiment",
|
|
54
113
|
type="Classification",
|
|
55
114
|
category="t2c",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mteb
|
|
3
|
-
Version: 2.3.4
|
|
3
|
+
Version: 2.3.5
|
|
4
4
|
Summary: Massive Text Embedding Benchmark
|
|
5
5
|
Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
|
|
6
6
|
Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
|
|
@@ -5,7 +5,7 @@ mteb/_helpful_enum.py,sha256=jh73N1jlcpg7RGz4bj8UpctiMNvqvHpp9wrB7SYEzIU,510
|
|
|
5
5
|
mteb/_log_once.py,sha256=-tUKzxGQzf2LZSuQXi97oYFXMta1B6GEYXd7BPqssvY,1095
|
|
6
6
|
mteb/_requires_package.py,sha256=eHg_TD9BVZRzNCcQQrUP17d8M1DF_vOd_tVx54AmAnM,3017
|
|
7
7
|
mteb/_set_seed.py,sha256=HPlPRl__Pe6IG-4UgJqTfplcivJ_wA2kaClbXoHQedM,1178
|
|
8
|
-
mteb/cache.py,sha256=
|
|
8
|
+
mteb/cache.py,sha256=XiFuhjZ2C-o0LgP1YM8g9As_vigJCUNfTrOb9-EiFlM,20177
|
|
9
9
|
mteb/deprecated_evaluator.py,sha256=t13Eluvm5ByVIOqgT7fqiVfLb8Ud3A4bbF2djRfs8iA,26901
|
|
10
10
|
mteb/evaluate.py,sha256=B60CkqRHzkI-3zIfHyocp-YUeWrzeoOvX_RN5vSlGqE,19363
|
|
11
11
|
mteb/filter_tasks.py,sha256=5XE1OYmgDDoJYnXwFf4ma_PIT_Lekzs420sQF_kpCiY,7240
|
|
@@ -55,8 +55,8 @@ mteb/benchmarks/__init__.py,sha256=MQEVeli-zLaJ7Xg0z7RhXQwsdmm7Ht_W2Ln0rZo1Szc,2
|
|
|
55
55
|
mteb/benchmarks/_create_table.py,sha256=OAiR44ynJ2fMzoBmVITQtOTYQzxIu9KUdS_HzlBlAck,20195
|
|
56
56
|
mteb/benchmarks/benchmark.py,sha256=70RlMyyg_wkWTlU_IbfLl-KaqRWXGCKTd8fWe9X-AQE,4173
|
|
57
57
|
mteb/benchmarks/get_benchmark.py,sha256=-n_O-gitRKZi48gJKNgGuI36hsP7yLVSiwulnMHN7Gw,3935
|
|
58
|
-
mteb/benchmarks/benchmarks/__init__.py,sha256=
|
|
59
|
-
mteb/benchmarks/benchmarks/benchmarks.py,sha256=
|
|
58
|
+
mteb/benchmarks/benchmarks/__init__.py,sha256=Ig5dSFunzI-F-OamruuKJVSstbG3xQNkXCxRY3Bj_Ck,2180
|
|
59
|
+
mteb/benchmarks/benchmarks/benchmarks.py,sha256=qHHmJfisT75VRVoZfPcHhShCG0jY6vSWZEx-D01XxKU,94757
|
|
60
60
|
mteb/benchmarks/benchmarks/rteb_benchmarks.py,sha256=QnCSrTTaBfcRlAQp2Nu81tgv1idMXqiM16Fp2zKJ5Ys,10607
|
|
61
61
|
mteb/cli/__init__.py,sha256=v-csUr3eUZElIvrGB6QGtaIdndDfNWEe9oZchsGsJpg,64
|
|
62
62
|
mteb/cli/_display_tasks.py,sha256=7A06dT9sSoTz6shyMvskPxuc5eHY_H7PGPlROzMP0yw,2196
|
|
@@ -252,6 +252,7 @@ mteb/descriptive_stats/Classification/HeadlineClassification.json,sha256=VfTqah7
|
|
|
252
252
|
mteb/descriptive_stats/Classification/HeadlineClassification.v2.json,sha256=n-KiCmlKXb5QOzNG9QTdjwYR-cRV3Qvn96KOF2so7Cs,2110
|
|
253
253
|
mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.json,sha256=j517xsonPntbr4k5Pa9ftGGIZqzaOLDvPqeGjWngdyI,1659
|
|
254
254
|
mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v2.json,sha256=FaxgCr_lbS-ppsd_cEp_87kbCRCGqFUDMZv375XGvdk,1658
|
|
255
|
+
mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json,sha256=75u2ZWek2BT8JQElPqBHvCqTEXbeTwlrXcyQC5NK-wU,1658
|
|
255
256
|
mteb/descriptive_stats/Classification/HinDialectClassification.json,sha256=HK13-QJaWc0uC4wOqhZHKzT6i05_wJhURdUJ6Upq8OQ,4745
|
|
256
257
|
mteb/descriptive_stats/Classification/HindiDiscourseClassification.json,sha256=4XMoJL46RHYKYKdndxD5lZ2b3hGjL2CArx3eRalO8eg,1049
|
|
257
258
|
mteb/descriptive_stats/Classification/HindiDiscourseClassification.v2.json,sha256=MGz2Ntd2JxyHtlP3GhkhbMQ_yH2Eladd4vZSFOme2K8,2088
|
|
@@ -1435,7 +1436,7 @@ mteb/models/get_model_meta.py,sha256=GeofphZ8wFtwAHYQipgQlZzxNIFAVFGzo_E2sMzjZTc
|
|
|
1435
1436
|
mteb/models/instruct_wrapper.py,sha256=Ty4nfEvioycL_uATkhd0PGuyeB5Xc9xrRd6HOGgb-tc,9005
|
|
1436
1437
|
mteb/models/model_meta.py,sha256=b-Nel9nX5bJk4cgJnqkBzEKyMY7uXvxlCBSxmmH1Ios,14769
|
|
1437
1438
|
mteb/models/models_protocols.py,sha256=D2hYWn_UBGMaKtRwBx3u0B0ni6lHJjSzTxX21XFNwIc,8917
|
|
1438
|
-
mteb/models/search_wrappers.py,sha256=
|
|
1439
|
+
mteb/models/search_wrappers.py,sha256=zpCvxUVNQWekyC4Fiz7mvlI0VPdSrFq41A0GrCDvBK4,20331
|
|
1439
1440
|
mteb/models/sentence_transformer_wrapper.py,sha256=n5CMsM6Lpg_CFHH0NkpJusMsaLUTt-L9vRmFINQ961k,12338
|
|
1440
1441
|
mteb/models/cache_wrappers/__init__.py,sha256=1w1TnMwulWJSzNkLXjbh5MY3sqgHWc6vUntYn49i9X8,169
|
|
1441
1442
|
mteb/models/cache_wrappers/cache_backend_protocol.py,sha256=TR7kD7KbN1J4piszIecpegtLZYGy7sRHZt3SDWlImKk,1665
|
|
@@ -1534,6 +1535,7 @@ mteb/models/model_implementations/rerankers_custom.py,sha256=ro73A9-hHudy3_qIMrh
|
|
|
1534
1535
|
mteb/models/model_implementations/rerankers_monot5_based.py,sha256=rxVwzapNnHl4gCw79XVCaTXj3-wbToyj7XVL97tpAF4,34302
|
|
1535
1536
|
mteb/models/model_implementations/richinfoai_models.py,sha256=llvYa0JUjyOOMbuTgOYoJ2qeqZ5rLHX1ZjZIYlYbdvA,989
|
|
1536
1537
|
mteb/models/model_implementations/ru_sentence_models.py,sha256=GuZFwbzaooufvSMGNjIsL0DDLrqHjhdSsAQHHZo5H08,40480
|
|
1538
|
+
mteb/models/model_implementations/ruri_models.py,sha256=-BTYkZ8dEWZUbGqx3YB5yFSrzMwZtXX7sMUHzrlB8ws,10043
|
|
1537
1539
|
mteb/models/model_implementations/salesforce_models.py,sha256=KslTK-IKeLvNG-vQir9k6swkaOgjk6eyozm_BOVgTpY,5160
|
|
1538
1540
|
mteb/models/model_implementations/samilpwc_models.py,sha256=oMwKNwCxoH1jZgCy04oo2oVlBZWu253QMpnEEC6emz8,2021
|
|
1539
1541
|
mteb/models/model_implementations/searchmap_models.py,sha256=XvVl99emIgnNUCxkTuFQXW6py2R8vgsArfpyHveCugw,1904
|
|
@@ -1722,8 +1724,8 @@ mteb/tasks/classification/fra/french_book_reviews.py,sha256=Fsx8UznQVNDNUhcdsTeN
|
|
|
1722
1724
|
mteb/tasks/classification/fra/movie_review_sentiment_classification.py,sha256=ov-fbReWP9T_RqhxFtS-gjNaZmbM9J8gQdBQyci5yqU,3290
|
|
1723
1725
|
mteb/tasks/classification/guj/__init__.py,sha256=HZfimpBCywBLi5VGof_A9Ua6bqtMUoWGRhM1eqAEWKE,186
|
|
1724
1726
|
mteb/tasks/classification/guj/gujarati_news_classification.py,sha256=VEdbzqlw8b8N8R3TQc275iiCxqGLAMAM8Nf_N7FnUGA,2303
|
|
1725
|
-
mteb/tasks/classification/heb/__init__.py,sha256=
|
|
1726
|
-
mteb/tasks/classification/heb/hebrew_sentiment_analysis.py,sha256=
|
|
1727
|
+
mteb/tasks/classification/heb/__init__.py,sha256=xQNtDxjUsCXQhA_ZYO_4kpBtHWQSBhWhQwAuWcTo6GE,246
|
|
1728
|
+
mteb/tasks/classification/heb/hebrew_sentiment_analysis.py,sha256=2wmKwq4Z4YKTG3QieuxmUE56ofiSOV63kV-jh8Zomh4,7281
|
|
1727
1729
|
mteb/tasks/classification/hin/__init__.py,sha256=KdScMtYYmjsali0InoHP0PKQ8yCbaD-j2tLqc7_lhlo,356
|
|
1728
1730
|
mteb/tasks/classification/hin/hindi_discourse_classification.py,sha256=HXLJZFEpnI7UjwA-NuadhrCplqaLiSb8Npla_6oXC48,3717
|
|
1729
1731
|
mteb/tasks/classification/hin/sentiment_analysis_hindi.py,sha256=YIeEuNa2UTr8Jwh_wx15broRuD3NHNansGq7Bl9Vjl0,3101
|
|
@@ -2569,9 +2571,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
|
|
|
2569
2571
|
mteb/types/_result.py,sha256=CRAUc5IvqI3_9SyXDwv-PWLCXwXdZem9RePeYESRtuw,996
|
|
2570
2572
|
mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
|
|
2571
2573
|
mteb/types/statistics.py,sha256=YwJsxTf1eaCI_RE-J37a-gK5wDeGAsmkeZKoZCFihSo,3755
|
|
2572
|
-
mteb-2.3.
|
|
2573
|
-
mteb-2.3.
|
|
2574
|
-
mteb-2.3.
|
|
2575
|
-
mteb-2.3.
|
|
2576
|
-
mteb-2.3.
|
|
2577
|
-
mteb-2.3.
|
|
2574
|
+
mteb-2.3.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
2575
|
+
mteb-2.3.5.dist-info/METADATA,sha256=Ud-HNDLgXkrYqVQczyt-TNpev3LR1rBhRDPKK3Dn_T0,13923
|
|
2576
|
+
mteb-2.3.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
2577
|
+
mteb-2.3.5.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
|
|
2578
|
+
mteb-2.3.5.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
|
|
2579
|
+
mteb-2.3.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|