mteb 2.3.3__py3-none-any.whl → 2.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
     FA_MTEB_2,
     HUME,
     JINA_VDR,
+    JMTEB_V2,
     LONG_EMBED,
     MIEB_ENG,
     MIEB_IMG,
@@ -75,6 +76,7 @@ __all__ = [
     "HUME",
     "HUME",
     "JINA_VDR",
+    "JMTEB_V2",
    "LONG_EMBED",
     "MIEB_ENG",
     "MIEB_IMG",
@@ -2562,3 +2562,60 @@ HUME = HUMEBenchmark(
     citation=None,
     contacts=["AdnanElAssadi56", "KennethEnevoldsen", "isaac-chung", "Samoed"],
 )
+
+JMTEB_V2 = Benchmark(
+    name="JMTEB(v2)",
+    display_name="Japanese",
+    icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg",
+    tasks=get_tasks(
+        languages=["jpn"],
+        tasks=[
+            # Clustering (3)
+            "LivedoorNewsClustering.v2",
+            "MewsC16JaClustering",
+            "SIB200ClusteringS2S",
+            # Classification (7)
+            "AmazonReviewsClassification",
+            "AmazonCounterfactualClassification",
+            "MassiveIntentClassification",
+            "MassiveScenarioClassification",
+            "JapaneseSentimentClassification",
+            "SIB200Classification",
+            "WRIMEClassification",
+            # STS (2)
+            "JSTS",
+            "JSICK",
+            # Retrieval (11)
+            "JaqketRetrieval",
+            "MrTidyRetrieval",
+            "JaGovFaqsRetrieval",
+            "NLPJournalTitleAbsRetrieval.V2",
+            "NLPJournalTitleIntroRetrieval.V2",
+            "NLPJournalAbsIntroRetrieval.V2",
+            "NLPJournalAbsArticleRetrieval.V2",
+            "JaCWIRRetrieval",
+            "MIRACLRetrieval",
+            "MintakaRetrieval",
+            "MultiLongDocRetrieval",
+            # Reranking (5)
+            "ESCIReranking",
+            "JQaRAReranking",
+            "JaCWIRReranking",
+            "MIRACLReranking",
+            "MultiLongDocReranking",
+        ],
+    ),
+    description="JMTEB is a benchmark for evaluating Japanese text embedding models. In v2, we have extended the benchmark to 28 datasets, enabling more comprehensive evaluation compared with v1 (MTEB(jpn, v1)).",
+    reference="https://github.com/sbintuitions/JMTEB",
+    citation=r"""
+@article{li2025jmteb,
+  author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan and Fukuchi, Akihiko and Shibata, Tomohide and Kawahara, Daisuke},
+  issue = {3},
+  journal = {Vol.2025-NL-265,No.3,1-15},
+  month = {sep},
+  title = {{JMTEB and JMTEB-lite: Japanese Massive Text Embedding Benchmark and Its Lightweight Version}},
+  year = {2025},
+}
+""",
+    contacts=["lsz05"],
+)
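For orientation, a minimal usage sketch (not part of the diff): selecting the new benchmark by name and running it on a model. get_benchmark and get_model are existing mteb entry points; the mteb.evaluate call and its exact signature are an assumption about the mteb 2.x API, so check the documentation before relying on it.

import mteb

benchmark = mteb.get_benchmark("JMTEB(v2)")  # resolves the Benchmark defined above
model = mteb.get_model("cl-nagoya/ruri-v3-310m")  # any registered model name
results = mteb.evaluate(model, tasks=benchmark.tasks)  # signature assumed, not verified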
mteb/cache.py CHANGED
@@ -243,7 +243,11 @@ class ResultCache:
             f"No results repository found in {results_directory}, cloning it from {remote}"
         )
 
-        subprocess.run(["git", "clone", remote, "remote"], cwd=self.cache_path)
+        subprocess.run(
+            ["git", "clone", "--depth", "1", remote, "remote"],
+            cwd=self.cache_path,
+            check=True,
+        )
 
         return results_directory
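The clone of the results repository is now shallow and failures are no longer silent. A standalone illustration of the two new arguments (hypothetical URL, not from the package):

import subprocess

try:
    # --depth 1 fetches only the latest commit, keeping the download small;
    # check=True raises CalledProcessError instead of ignoring a non-zero exit code.
    subprocess.run(
        ["git", "clone", "--depth", "1", "https://example.com/results.git", "remote"],
        check=True,
    )
except subprocess.CalledProcessError as err:
    print(f"clone failed with exit code {err.returncode}")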
 
@@ -0,0 +1,60 @@
+{
+  "test": {
+    "num_samples": 1085,
+    "number_texts_intersect_with_train": 0,
+    "text_statistics": {
+      "total_text_length": 115359,
+      "min_text_length": 8,
+      "average_text_length": 106.32165898617511,
+      "max_text_length": 2722,
+      "unique_texts": 1085
+    },
+    "image_statistics": null,
+    "label_statistics": {
+      "min_labels_per_text": 1,
+      "average_label_per_text": 1.0,
+      "max_labels_per_text": 1,
+      "unique_labels": 3,
+      "labels": {
+        "0": {
+          "count": 868
+        },
+        "1": {
+          "count": 190
+        },
+        "2": {
+          "count": 27
+        }
+      }
+    }
+  },
+  "train": {
+    "num_samples": 7176,
+    "number_texts_intersect_with_train": null,
+    "text_statistics": {
+      "total_text_length": 830248,
+      "min_text_length": 5,
+      "average_text_length": 115.69788182831661,
+      "max_text_length": 4759,
+      "unique_texts": 7176
+    },
+    "image_statistics": null,
+    "label_statistics": {
+      "min_labels_per_text": 1,
+      "average_label_per_text": 1.0,
+      "max_labels_per_text": 1,
+      "unique_labels": 3,
+      "labels": {
+        "0": {
+          "count": 4933
+        },
+        "1": {
+          "count": 2047
+        },
+        "2": {
+          "count": 196
+        }
+      }
+    }
+  }
+}
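This new file follows the per-split descriptive-statistics structure mteb ships with each task. As a rough sketch of how the text_statistics block is derived from a list of texts (illustrative code, not taken from the package):

texts = ["first comment", "another comment", "first comment"]
lengths = [len(t) for t in texts]
text_statistics = {
    "total_text_length": sum(lengths),
    "min_text_length": min(lengths),
    "average_text_length": sum(lengths) / len(lengths),
    "max_text_length": max(lengths),
    "unique_texts": len(set(texts)),
}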
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import difflib
 import logging
+import warnings
 from collections.abc import Iterable
 from typing import TYPE_CHECKING, Any
 
@@ -180,6 +181,14 @@ def _model_meta_from_hf_hub(model_name: str) -> ModelMeta:
     if card_data.get("library_name", None) == "sentence-transformers":
         frameworks.append("Sentence Transformers")
         loader = sentence_transformers_loader
+    else:
+        msg = (
+            "Model library not recognized, defaulting to Sentence Transformers loader."
+        )
+        logger.warning(msg)
+        warnings.warn(msg)
+        loader = sentence_transformers_loader
+
     revision = card_data.get("base_model_revision", None)
     license = card_data.get("license", None)
     return ModelMeta(
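With this change, a model card without a recognized library_name falls back to the Sentence Transformers loader and emits a warning instead of failing silently. A hedged sketch of escalating that warning to an error, for example in tests (the model id is hypothetical):

import warnings

import mteb

with warnings.catch_warnings():
    warnings.simplefilter("error")  # turn warnings.warn(...) into an exception
    meta = mteb.get_model_meta("some-org/model-without-library-name")  # hypothetical id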
@@ -0,0 +1,312 @@
+from mteb.models.model_meta import ModelMeta
+from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
+
+RURI_V3_PROMPTS = {
+    "Retrieval-query": "検索クエリ: ",
+    "Retrieval-document": "検索文書: ",
+    "Reranking-query": "検索クエリ: ",
+    "Reranking-document": "検索文書: ",
+    "Classification": "トピック: ",
+    "Clustering": "トピック: ",
+}
+
+RURI_V1_V2_PROMPTS = {
+    "query": "クエリ: ",
+    "document": "文章: ",
+}
+
+
+RURI_CITATION = r"""@misc{Ruri,
+  title={{Ruri: Japanese General Text Embeddings}},
+  author={Hayato Tsukagoshi and Ryohei Sasano},
+  year={2024},
+  eprint={2409.07737},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2409.07737},
+}"""
+
+cl_nagoya_ruri_v3_30m = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V3_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-v3-30m",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="24899e5de370b56d179604a007c0d727bf144504",
+    release_date="2025-04-07",
+    n_parameters=36_705_536,
+    memory_usage_mb=140,
+    embed_dim=256,
+    license="apache-2.0",
+    max_tokens=8192,
+    reference="https://huggingface.co/cl-nagoya/ruri-v3-30m",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    superseded_by=None,
+    training_datasets={
+        "cl-nagoya/ruri-v3-dataset-ft",
+    },
+    adapted_from="sbintuitions/modernbert-ja-30m",
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-v3-dataset-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_v3_70m = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V3_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-v3-70m",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="07a8b0aba47d29d2ca21f89b915c1efe2c23d1cc",
+    release_date="2025-04-09",
+    n_parameters=36_705_536,
+    memory_usage_mb=140,
+    embed_dim=256,
+    license="apache-2.0",
+    max_tokens=8192,
+    reference="https://huggingface.co/cl-nagoya/ruri-v3-70m",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    superseded_by=None,
+    training_datasets={"MrTidyRetrieval", "MIRACLRetrieval"},
+    adapted_from="sbintuitions/modernbert-ja-70m",
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-v3-dataset-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_v3_130m = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V3_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-v3-130m",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="e3114c6ee10dbab8b4b235fbc6dcf9dd4d5ac1a6",
+    release_date="2025-04-09",
+    n_parameters=132_140_544,
+    memory_usage_mb=504,
+    embed_dim=512,
+    license="apache-2.0",
+    max_tokens=8192,
+    reference="https://huggingface.co/cl-nagoya/ruri-v3-130m",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    superseded_by=None,
+    training_datasets={"MrTidyRetrieval", "MIRACLRetrieval"},
+    adapted_from="sbintuitions/modernbert-ja-130m",
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-v3-dataset-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_v3_310m = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V3_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-v3-310m",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="18b60fb8c2b9df296fb4212bb7d23ef94e579cd3",
+    release_date="2025-04-09",
+    n_parameters=314_611_968,
+    memory_usage_mb=1200,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=8192,
+    reference="https://huggingface.co/cl-nagoya/ruri-v3-310m",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    superseded_by=None,
+    training_datasets={"MrTidyRetrieval", "MIRACLRetrieval"},
+    adapted_from="sbintuitions/modernbert-ja-310m",
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-v3-dataset-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_small_v2 = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V1_V2_PROMPTS,
+        trust_remote_code=True,
+    ),
+    name="cl-nagoya/ruri-small-v2",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="db18646e673b713cd0518a5bb0fefdce21e77cd9",
+    release_date="2024-12-05",
+    n_parameters=68_087_808,
+    memory_usage_mb=260,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/cl-nagoya/ruri-small-v2",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    adapted_from="line-corporation/line-distilbert-base-japanese",
+    superseded_by=None,
+    training_datasets={"MrTidyRetrieval", "MIRACLRetrieval"},
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-v2-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_base_v2 = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V1_V2_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-base-v2",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="8ce03882903668a01c83ca3b8111ac025a3bc734",
+    release_date="2024-12-05",
+    n_parameters=111_207_168,
+    memory_usage_mb=424,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/cl-nagoya/ruri-base-v2",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    adapted_from="tohoku-nlp/bert-base-japanese-v3",
+    superseded_by=None,
+    training_datasets=None,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-v2-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_large_v2 = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V1_V2_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-large-v2",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="42898ef34a5574977380ebf0dfd28cbfbd36438b",
+    release_date="2024-12-06",
+    n_parameters=337_441_792,
+    memory_usage_mb=1287,
+    embed_dim=1024,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/cl-nagoya/ruri-large-v2",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    adapted_from="tohoku-nlp/bert-large-japanese-v2",
+    superseded_by=None,
+    training_datasets=None,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-v2-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_small_v1 = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V1_V2_PROMPTS,
+        trust_remote_code=True,
+    ),
+    name="cl-nagoya/ruri-small",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="bc56ce90cd7a979f6eb199fc52dfe700bfd94bc3",
+    release_date="2024-08-28",
+    n_parameters=68_087_808,
+    memory_usage_mb=130,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/cl-nagoya/ruri-small",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    adapted_from="line-corporation/line-distilbert-base-japanese",
+    superseded_by="cl-nagoya/ruri-small-v2",
+    training_datasets=None,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+cl_nagoya_ruri_base_v1 = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V1_V2_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-base",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="1ae40b8b6c78518a499425086bab8fc16c2e4b0e",
+    release_date="2024-08-28",
+    n_parameters=111_207_168,
+    memory_usage_mb=212,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/cl-nagoya/ruri-base",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    adapted_from="tohoku-nlp/bert-base-japanese-v3",
+    superseded_by="cl-nagoya/ruri-base-v2",
+    training_datasets=None,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
+
+
+cl_nagoya_ruri_large_v1 = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs=dict(
+        model_prompts=RURI_V1_V2_PROMPTS,
+    ),
+    name="cl-nagoya/ruri-large",
+    languages=["jpn-Jpan"],
+    open_weights=True,
+    revision="a011c39b13e8bc137ee13c6bc82191ece46c414c",
+    release_date="2024-08-28",
+    n_parameters=337_441_792,
+    memory_usage_mb=644,
+    embed_dim=1024,
+    license="apache-2.0",
+    max_tokens=512,
+    reference="https://huggingface.co/cl-nagoya/ruri-large",
+    similarity_fn_name="cosine",
+    framework=["PyTorch", "Sentence Transformers"],
+    use_instructions=True,
+    adapted_from="tohoku-nlp/bert-large-japanese-v2",
+    superseded_by="cl-nagoya/ruri-large-v2",
+    training_datasets=None,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cl-nagoya/ruri-dataset-ft",
+    citation=RURI_CITATION,
+    contacts=["hpprc"],
+)
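The prompt dictionaries at the top of this new file encode the literal Japanese prefixes the Ruri models expect; inside mteb they are applied via model_prompts, and outside mteb the same prefixes can be added by hand. A small sketch assuming sentence-transformers >= 3.0 (behavior taken from the model cards, not from this diff):

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("cl-nagoya/ruri-v3-30m")
query = "検索クエリ: " + "瑠璃色はどんな色?"
document = "検索文書: " + "瑠璃色は紫みを帯びた濃い青色のこと。"
embeddings = model.encode([query, document])
print(model.similarity(embeddings[0:1], embeddings[1:2]))  # cosine, per similarity_fn_name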
@@ -250,7 +250,7 @@ class SearchEncoderWrapper:
 
         # get top-k values
         cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
-            torch.tensor(scores),
+            torch.as_tensor(scores),
             min(
                 top_k + 1,
                 len(scores[1]) if len(scores) > 1 else len(scores[-1]),
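The one-token change above swaps a copying constructor for a non-copying one: torch.tensor() always copies its input (and warns when the input is already a tensor), while torch.as_tensor() reuses the existing buffer when dtype and device allow. Illustration (not from the package):

import numpy as np
import torch

scores = np.random.rand(4, 100).astype(np.float32)
copied = torch.tensor(scores)      # always allocates and copies
shared = torch.as_tensor(scores)   # wraps the existing NumPy buffer, no copy
values, indices = torch.topk(shared, k=10)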
@@ -1,6 +1,11 @@
 from .hebrew_sentiment_analysis import (
     HebrewSentimentAnalysis,
     HebrewSentimentAnalysisV2,
+    HebrewSentimentAnalysisV3,
 )
 
-__all__ = ["HebrewSentimentAnalysis", "HebrewSentimentAnalysisV2"]
+__all__ = [
+    "HebrewSentimentAnalysis",
+    "HebrewSentimentAnalysisV2",
+    "HebrewSentimentAnalysisV3",
+]
@@ -9,7 +9,12 @@ class HebrewSentimentAnalysis(AbsTaskClassification):
             "path": "mteb/HebrewSentimentAnalysis",
             "revision": "03eb0996c8234e0d8cd7206bf4763815deda12ed",
         },
-        description="HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel’s president, Mr. Reuven Rivlin. In October 2015, we used the open software application Netvizz (Rieder, 2013) to scrape all the comments to all of the president’s posts in the period of June – August 2014, the first three months of Rivlin’s presidency.2 While the president’s posts aimed at reconciling tensions and called for tolerance and empathy, the sentiment expressed in the comments to the president’s posts was polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his policy.",
+        description=(
+            "HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel’s president, Mr. Reuven Rivlin. "
+            "In October 2015, we used the open software application Netvizz (Rieder, 2013) to scrape all the comments to all of the president’s posts in the period of June – August 2014, "
+            "the first three months of Rivlin’s presidency.2 While the president’s posts aimed at reconciling tensions and called for tolerance and empathy, "
+            "the sentiment expressed in the comments to the president’s posts was polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his policy. "
+        ),
         reference="https://huggingface.co/datasets/hebrew_sentiment",
         type="Classification",
         category="t2c",
@@ -37,7 +42,7 @@ class HebrewSentimentAnalysis(AbsTaskClassification):
   year = {2018},
 }
 """,
-        superseded_by="HebrewSentimentAnalysis.v2",
+        superseded_by="HebrewSentimentAnalysis.v3",
     )
 
 
@@ -49,7 +54,61 @@ class HebrewSentimentAnalysisV2(AbsTaskClassification):
             "revision": "7ecd049fc8ac0d6f0a0121c8ff9fe44ea5bd935b",
             "name": "morph",
         },
-        description="HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel’s president, Mr. Reuven Rivlin. In October 2015, we used the open software application Netvizz (Rieder, 2013) to scrape all the comments to all of the president’s posts in the period of June – August 2014, the first three months of Rivlin’s presidency.2 While the president’s posts aimed at reconciling tensions and called for tolerance and empathy, the sentiment expressed in the comments to the president’s posts was polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his policy. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
+        description=(
+            "HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel’s president, Mr. Reuven Rivlin. "
+            "In October 2015, we used the open software application Netvizz (Rieder, 2013) to scrape all the comments to all of the president’s posts in the period of June – August 2014, "
+            "the first three months of Rivlin’s presidency.2 While the president’s posts aimed at reconciling tensions and called for tolerance and empathy, "
+            "the sentiment expressed in the comments to the president’s posts was polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his policy. "
+            "This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)"
+        ),
+        reference="https://huggingface.co/datasets/hebrew_sentiment",
+        type="Classification",
+        category="t2c",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["heb-Hebr"],
+        main_score="accuracy",
+        date=("2015-10-01", "2015-10-31"),
+        domains=["Reviews", "Written"],
+        task_subtypes=["Sentiment/Hate speech"],
+        license="mit",
+        annotations_creators="expert-annotated",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@inproceedings{amram-etal-2018-representations,
+  address = {Santa Fe, New Mexico, USA},
+  author = {Amram, Adam and Ben David, Anat and Tsarfaty, Reut},
+  booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
+  month = aug,
+  pages = {2242--2252},
+  publisher = {Association for Computational Linguistics},
+  title = {Representations and Architectures in Neural Sentiment Analysis for Morphologically Rich Languages: A Case Study from {M}odern {H}ebrew},
+  url = {https://www.aclweb.org/anthology/C18-1190},
+  year = {2018},
+}
+""",
+        adapted_from=["HebrewSentimentAnalysis"],
+        superseded_by="HebrewSentimentAnalysis.v3",
+    )
+
+
+class HebrewSentimentAnalysisV3(AbsTaskClassification):
+    label_column_name = "labels"
+    metadata = TaskMetadata(
+        name="HebrewSentimentAnalysis.v3",
+        dataset={
+            "path": "mteb/HebrewSentimentAnalysisV4",
+            "revision": "aa0b83c4b16cd28daf7c41ef3402e3ffe9c70c59",
+        },
+        description=(
+            "HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel’s president, Mr. Reuven Rivlin. "
+            "In October 2015, we used the open software application Netvizz (Rieder, 2013) to scrape all the comments to all of the president’s posts in the period of June – August 2014, "
+            "the first three months of Rivlin’s presidency.2 While the president’s posts aimed at reconciling tensions and called for tolerance and empathy, "
+            "the sentiment expressed in the comments to the president’s posts was polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his policy. "
+            "This version corrects texts (took pre-tokenized) [more details in this thread](https://huggingface.co/datasets/mteb/HebrewSentimentAnalysis/discussions/2). "
+            "This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)"
+        ),
         reference="https://huggingface.co/datasets/hebrew_sentiment",
         type="Classification",
         category="t2c",
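A short sketch (not part of the diff) of selecting the new task revision with get_tasks, the same helper the benchmark definitions above already use:

import mteb

tasks = mteb.get_tasks(tasks=["HebrewSentimentAnalysis.v3"])  # resolves the v3 task by name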
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.3.3
+Version: 2.3.5
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
@@ -5,7 +5,7 @@ mteb/_helpful_enum.py,sha256=jh73N1jlcpg7RGz4bj8UpctiMNvqvHpp9wrB7SYEzIU,510
 mteb/_log_once.py,sha256=-tUKzxGQzf2LZSuQXi97oYFXMta1B6GEYXd7BPqssvY,1095
 mteb/_requires_package.py,sha256=eHg_TD9BVZRzNCcQQrUP17d8M1DF_vOd_tVx54AmAnM,3017
 mteb/_set_seed.py,sha256=HPlPRl__Pe6IG-4UgJqTfplcivJ_wA2kaClbXoHQedM,1178
-mteb/cache.py,sha256=77GtlwqHZxqkoCYcu76KCFL4AnHHkH8w-JY3oglMrbc,20102
+mteb/cache.py,sha256=XiFuhjZ2C-o0LgP1YM8g9As_vigJCUNfTrOb9-EiFlM,20177
 mteb/deprecated_evaluator.py,sha256=t13Eluvm5ByVIOqgT7fqiVfLb8Ud3A4bbF2djRfs8iA,26901
 mteb/evaluate.py,sha256=B60CkqRHzkI-3zIfHyocp-YUeWrzeoOvX_RN5vSlGqE,19363
 mteb/filter_tasks.py,sha256=5XE1OYmgDDoJYnXwFf4ma_PIT_Lekzs420sQF_kpCiY,7240
@@ -55,8 +55,8 @@ mteb/benchmarks/__init__.py,sha256=MQEVeli-zLaJ7Xg0z7RhXQwsdmm7Ht_W2Ln0rZo1Szc,2
 mteb/benchmarks/_create_table.py,sha256=OAiR44ynJ2fMzoBmVITQtOTYQzxIu9KUdS_HzlBlAck,20195
 mteb/benchmarks/benchmark.py,sha256=70RlMyyg_wkWTlU_IbfLl-KaqRWXGCKTd8fWe9X-AQE,4173
 mteb/benchmarks/get_benchmark.py,sha256=-n_O-gitRKZi48gJKNgGuI36hsP7yLVSiwulnMHN7Gw,3935
-mteb/benchmarks/benchmarks/__init__.py,sha256=0ySgD14Mu3Y1nJzazR_eUir81ia3x6E23N57SzQNkF0,2150
-mteb/benchmarks/benchmarks/benchmarks.py,sha256=Ob2cHVXwFk328xbV-2ZmUibiVAMtT2RN1ygGgiP6UNQ,92662
+mteb/benchmarks/benchmarks/__init__.py,sha256=Ig5dSFunzI-F-OamruuKJVSstbG3xQNkXCxRY3Bj_Ck,2180
+mteb/benchmarks/benchmarks/benchmarks.py,sha256=qHHmJfisT75VRVoZfPcHhShCG0jY6vSWZEx-D01XxKU,94757
 mteb/benchmarks/benchmarks/rteb_benchmarks.py,sha256=QnCSrTTaBfcRlAQp2Nu81tgv1idMXqiM16Fp2zKJ5Ys,10607
 mteb/cli/__init__.py,sha256=v-csUr3eUZElIvrGB6QGtaIdndDfNWEe9oZchsGsJpg,64
 mteb/cli/_display_tasks.py,sha256=7A06dT9sSoTz6shyMvskPxuc5eHY_H7PGPlROzMP0yw,2196
@@ -252,6 +252,7 @@ mteb/descriptive_stats/Classification/HeadlineClassification.json,sha256=VfTqah7
 mteb/descriptive_stats/Classification/HeadlineClassification.v2.json,sha256=n-KiCmlKXb5QOzNG9QTdjwYR-cRV3Qvn96KOF2so7Cs,2110
 mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.json,sha256=j517xsonPntbr4k5Pa9ftGGIZqzaOLDvPqeGjWngdyI,1659
 mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v2.json,sha256=FaxgCr_lbS-ppsd_cEp_87kbCRCGqFUDMZv375XGvdk,1658
+mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json,sha256=75u2ZWek2BT8JQElPqBHvCqTEXbeTwlrXcyQC5NK-wU,1658
 mteb/descriptive_stats/Classification/HinDialectClassification.json,sha256=HK13-QJaWc0uC4wOqhZHKzT6i05_wJhURdUJ6Upq8OQ,4745
 mteb/descriptive_stats/Classification/HindiDiscourseClassification.json,sha256=4XMoJL46RHYKYKdndxD5lZ2b3hGjL2CArx3eRalO8eg,1049
 mteb/descriptive_stats/Classification/HindiDiscourseClassification.v2.json,sha256=MGz2Ntd2JxyHtlP3GhkhbMQ_yH2Eladd4vZSFOme2K8,2088
@@ -1431,11 +1432,11 @@ mteb/leaderboard/table.py,sha256=6SnrYC5GcBlvVSO6vOk6ObuqtoveBLv3JUuXqdKueG8,833
 mteb/leaderboard/text_segments.py,sha256=iMIkS04QQjPbT-SkU0x6fOcS8xRbUYevryu9HydipKM,6570
 mteb/models/__init__.py,sha256=ABTuoqiBjBtBWW3LYY7ItBHdylR6jWoy06HH0g6j6fU,910
 mteb/models/abs_encoder.py,sha256=m0JkRfRPMYadDgBR9eozRloI31ZSWkSzDFINpwbfLZk,16533
-mteb/models/get_model_meta.py,sha256=VpZZNINk-QrNeVpPZnlqzlLhtBs8G84eRwTzAb_gRD4,9108
+mteb/models/get_model_meta.py,sha256=GeofphZ8wFtwAHYQipgQlZzxNIFAVFGzo_E2sMzjZTc,9350
 mteb/models/instruct_wrapper.py,sha256=Ty4nfEvioycL_uATkhd0PGuyeB5Xc9xrRd6HOGgb-tc,9005
 mteb/models/model_meta.py,sha256=b-Nel9nX5bJk4cgJnqkBzEKyMY7uXvxlCBSxmmH1Ios,14769
 mteb/models/models_protocols.py,sha256=D2hYWn_UBGMaKtRwBx3u0B0ni6lHJjSzTxX21XFNwIc,8917
-mteb/models/search_wrappers.py,sha256=AcMhjQyKdeitUjnaqgnP3_zTeVSum8rz1sjBRddHUVQ,20328
+mteb/models/search_wrappers.py,sha256=zpCvxUVNQWekyC4Fiz7mvlI0VPdSrFq41A0GrCDvBK4,20331
 mteb/models/sentence_transformer_wrapper.py,sha256=n5CMsM6Lpg_CFHH0NkpJusMsaLUTt-L9vRmFINQ961k,12338
 mteb/models/cache_wrappers/__init__.py,sha256=1w1TnMwulWJSzNkLXjbh5MY3sqgHWc6vUntYn49i9X8,169
 mteb/models/cache_wrappers/cache_backend_protocol.py,sha256=TR7kD7KbN1J4piszIecpegtLZYGy7sRHZt3SDWlImKk,1665
@@ -1534,6 +1535,7 @@ mteb/models/model_implementations/rerankers_custom.py,sha256=ro73A9-hHudy3_qIMrh
 mteb/models/model_implementations/rerankers_monot5_based.py,sha256=rxVwzapNnHl4gCw79XVCaTXj3-wbToyj7XVL97tpAF4,34302
 mteb/models/model_implementations/richinfoai_models.py,sha256=llvYa0JUjyOOMbuTgOYoJ2qeqZ5rLHX1ZjZIYlYbdvA,989
 mteb/models/model_implementations/ru_sentence_models.py,sha256=GuZFwbzaooufvSMGNjIsL0DDLrqHjhdSsAQHHZo5H08,40480
+mteb/models/model_implementations/ruri_models.py,sha256=-BTYkZ8dEWZUbGqx3YB5yFSrzMwZtXX7sMUHzrlB8ws,10043
 mteb/models/model_implementations/salesforce_models.py,sha256=KslTK-IKeLvNG-vQir9k6swkaOgjk6eyozm_BOVgTpY,5160
 mteb/models/model_implementations/samilpwc_models.py,sha256=oMwKNwCxoH1jZgCy04oo2oVlBZWu253QMpnEEC6emz8,2021
 mteb/models/model_implementations/searchmap_models.py,sha256=XvVl99emIgnNUCxkTuFQXW6py2R8vgsArfpyHveCugw,1904
@@ -1722,8 +1724,8 @@ mteb/tasks/classification/fra/french_book_reviews.py,sha256=Fsx8UznQVNDNUhcdsTeN
 mteb/tasks/classification/fra/movie_review_sentiment_classification.py,sha256=ov-fbReWP9T_RqhxFtS-gjNaZmbM9J8gQdBQyci5yqU,3290
 mteb/tasks/classification/guj/__init__.py,sha256=HZfimpBCywBLi5VGof_A9Ua6bqtMUoWGRhM1eqAEWKE,186
 mteb/tasks/classification/guj/gujarati_news_classification.py,sha256=VEdbzqlw8b8N8R3TQc275iiCxqGLAMAM8Nf_N7FnUGA,2303
-mteb/tasks/classification/heb/__init__.py,sha256=Wa9-nATstuSXN2OFsO3tF0BZdlk3AzM0g9R1VzZUTc0,171
-mteb/tasks/classification/heb/hebrew_sentiment_analysis.py,sha256=1DPoxWdGAZJB7LwEQWJzpR8KEg2fpbgGXe_cGv1bo5M,4523
+mteb/tasks/classification/heb/__init__.py,sha256=xQNtDxjUsCXQhA_ZYO_4kpBtHWQSBhWhQwAuWcTo6GE,246
+mteb/tasks/classification/heb/hebrew_sentiment_analysis.py,sha256=2wmKwq4Z4YKTG3QieuxmUE56ofiSOV63kV-jh8Zomh4,7281
 mteb/tasks/classification/hin/__init__.py,sha256=KdScMtYYmjsali0InoHP0PKQ8yCbaD-j2tLqc7_lhlo,356
 mteb/tasks/classification/hin/hindi_discourse_classification.py,sha256=HXLJZFEpnI7UjwA-NuadhrCplqaLiSb8Npla_6oXC48,3717
 mteb/tasks/classification/hin/sentiment_analysis_hindi.py,sha256=YIeEuNa2UTr8Jwh_wx15broRuD3NHNansGq7Bl9Vjl0,3101
@@ -2569,9 +2571,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
 mteb/types/_result.py,sha256=CRAUc5IvqI3_9SyXDwv-PWLCXwXdZem9RePeYESRtuw,996
 mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
 mteb/types/statistics.py,sha256=YwJsxTf1eaCI_RE-J37a-gK5wDeGAsmkeZKoZCFihSo,3755
-mteb-2.3.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mteb-2.3.3.dist-info/METADATA,sha256=LbvRqywjhaqAK4910G8ueME52YrrqFzvm4NXl2M3MBA,13923
-mteb-2.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mteb-2.3.3.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
-mteb-2.3.3.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
-mteb-2.3.3.dist-info/RECORD,,
+mteb-2.3.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mteb-2.3.5.dist-info/METADATA,sha256=Ud-HNDLgXkrYqVQczyt-TNpev3LR1rBhRDPKK3Dn_T0,13923
+mteb-2.3.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mteb-2.3.5.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
+mteb-2.3.5.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
+mteb-2.3.5.dist-info/RECORD,,
File without changes