mteb 2.1.0__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. mteb/_create_dataloaders.py +2 -0
  2. mteb/abstasks/_stratification.py +1 -1
  3. mteb/abstasks/abstask.py +6 -1
  4. mteb/abstasks/dataset_card_template.md +1 -1
  5. mteb/abstasks/retrieval.py +2 -1
  6. mteb/abstasks/retrieval_dataset_loaders.py +1 -1
  7. mteb/abstasks/task_metadata.py +1 -1
  8. mteb/benchmarks/benchmarks/benchmarks.py +7 -11
  9. mteb/benchmarks/get_benchmark.py +1 -1
  10. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json +243 -153
  11. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XM3600T2IRetrieval.json +999 -629
  12. mteb/descriptive_stats/Image/Any2AnyRetrieval/OVENIT2TRetrieval.json +33 -17
  13. mteb/descriptive_stats/Image/DocumentUnderstanding/MIRACLVisionRetrieval.json +574 -0
  14. mteb/descriptive_stats/Retrieval/ClimateFEVERHardNegatives.v2.json +30 -0
  15. mteb/descriptive_stats/Retrieval/DBPediaHardNegatives.v2.json +30 -0
  16. mteb/descriptive_stats/Retrieval/FEVERHardNegatives.v2.json +30 -0
  17. mteb/descriptive_stats/Retrieval/HotpotQAHardNegatives.v2.json +30 -0
  18. mteb/descriptive_stats/Retrieval/QuoraRetrievalHardNegatives.v2.json +30 -0
  19. mteb/descriptive_stats/Retrieval/RiaNewsRetrievalHardNegatives.v2.json +30 -0
  20. mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json +184 -0
  21. mteb/languages/check_language_code.py +11 -3
  22. mteb/languages/language_scripts.py +4 -0
  23. mteb/leaderboard/text_segments.py +1 -1
  24. mteb/models/model_implementations/b1ade_models.py +1 -1
  25. mteb/models/model_implementations/bge_models.py +1 -3
  26. mteb/models/model_implementations/bmretriever_models.py +1 -1
  27. mteb/models/model_implementations/gme_v_models.py +2 -2
  28. mteb/models/model_implementations/ibm_granite_models.py +1 -1
  29. mteb/models/model_implementations/inf_models.py +3 -3
  30. mteb/models/model_implementations/jina_models.py +12 -2
  31. mteb/models/model_implementations/llm2vec_models.py +1 -1
  32. mteb/models/model_implementations/misc_models.py +2 -2
  33. mteb/models/model_implementations/mxbai_models.py +1 -1
  34. mteb/models/model_implementations/salesforce_models.py +1 -1
  35. mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -1
  36. mteb/models/model_implementations/voyage_v.py +9 -9
  37. mteb/results/task_result.py +6 -8
  38. mteb/tasks/classification/dan/angry_tweets_classification.py +2 -2
  39. mteb/tasks/classification/eng/legal_bench_classification.py +3 -3
  40. mteb/tasks/classification/mya/myanmar_news.py +2 -2
  41. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  42. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  43. mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +9 -8
  44. mteb/tasks/retrieval/code/code_rag.py +8 -8
  45. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  46. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  47. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  48. mteb/tasks/retrieval/eng/__init__.py +18 -4
  49. mteb/tasks/retrieval/eng/climate_fever_retrieval.py +68 -77
  50. mteb/tasks/retrieval/eng/dbpedia_retrieval.py +55 -50
  51. mteb/tasks/retrieval/eng/fever_retrieval.py +62 -67
  52. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +0 -4
  53. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +0 -4
  54. mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py +57 -67
  55. mteb/tasks/retrieval/eng/legal_summarization_retrieval.py +1 -1
  56. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +0 -3
  57. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +0 -2
  58. mteb/tasks/retrieval/eng/oven_it2t_retrieval.py +1 -1
  59. mteb/tasks/retrieval/eng/quora_retrieval.py +51 -46
  60. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +0 -4
  61. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +0 -4
  62. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +0 -2
  63. mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py +1 -1
  64. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +1 -1
  65. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +0 -2
  66. mteb/tasks/retrieval/multilingual/miracl_retrieval.py +1 -1
  67. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +2 -9
  68. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +0 -2
  69. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +0 -2
  70. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +6 -5
  71. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +3 -4
  72. mteb/tasks/retrieval/nob/norquad.py +2 -2
  73. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  74. mteb/tasks/retrieval/rus/__init__.py +11 -2
  75. mteb/tasks/retrieval/rus/ria_news_retrieval.py +48 -44
  76. mteb/tasks/retrieval/tur/tur_hist_quad.py +2 -2
  77. {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/METADATA +5 -5
  78. {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/RECORD +82 -87
  79. mteb/descriptive_stats/Classification/PersianTextTone.json +0 -56
  80. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json +0 -37
  81. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json +0 -25
  82. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json +0 -25
  83. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json +0 -25
  84. mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json +0 -20
  85. mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json +0 -20
  86. mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json +0 -20
  87. mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json +0 -20
  88. mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json +0 -20
  89. mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json +0 -220
  90. mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json +0 -402
  91. mteb/descriptive_stats/Reranking/InstructIR.json +0 -31
  92. {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/WHEEL +0 -0
  93. {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/entry_points.txt +0 -0
  94. {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/licenses/LICENSE +0 -0
  95. {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,30 @@
+{
+  "test": {
+    "num_samples": 90470,
+    "number_of_characters": 30600110,
+    "documents_text_statistics": {
+      "total_text_length": 30586476,
+      "min_text_length": 8,
+      "average_text_length": 339.58561119129564,
+      "max_text_length": 5857,
+      "unique_texts": 90070
+    },
+    "documents_image_statistics": null,
+    "queries_text_statistics": {
+      "total_text_length": 13634,
+      "min_text_length": 6,
+      "average_text_length": 34.085,
+      "max_text_length": 88,
+      "unique_texts": 399
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 15286,
+      "min_relevant_docs_per_query": 21,
+      "average_relevant_docs_per_query": 38.215,
+      "max_relevant_docs_per_query": 1499,
+      "unique_relevant_docs": 40724
+    },
+    "top_ranked_statistics": null
+  }
+}
@@ -0,0 +1,30 @@
+{
+  "test": {
+    "num_samples": 164698,
+    "number_of_characters": 114050514,
+    "documents_text_statistics": {
+      "total_text_length": 114000894,
+      "min_text_length": 1,
+      "average_text_length": 696.4098156361104,
+      "max_text_length": 29033,
+      "unique_texts": 163698
+    },
+    "documents_image_statistics": null,
+    "queries_text_statistics": {
+      "total_text_length": 49620,
+      "min_text_length": 15,
+      "average_text_length": 49.62,
+      "max_text_length": 172,
+      "unique_texts": 997
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 1171,
+      "min_relevant_docs_per_query": 1,
+      "average_relevant_docs_per_query": 1.171,
+      "max_relevant_docs_per_query": 15,
+      "unique_relevant_docs": 677
+    },
+    "top_ranked_statistics": null
+  }
+}
@@ -0,0 +1,30 @@
+{
+  "test": {
+    "num_samples": 226621,
+    "number_of_characters": 84600866,
+    "documents_text_statistics": {
+      "total_text_length": 84508282,
+      "min_text_length": 8,
+      "average_text_length": 374.55858275603777,
+      "max_text_length": 3463,
+      "unique_texts": 225621
+    },
+    "documents_image_statistics": null,
+    "queries_text_statistics": {
+      "total_text_length": 92584,
+      "min_text_length": 34,
+      "average_text_length": 92.584,
+      "max_text_length": 288,
+      "unique_texts": 1000
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 2000,
+      "min_relevant_docs_per_query": 2,
+      "average_relevant_docs_per_query": 2.0,
+      "max_relevant_docs_per_query": 2,
+      "unique_relevant_docs": 1975
+    },
+    "top_ranked_statistics": null
+  }
+}
@@ -0,0 +1,30 @@
+{
+  "test": {
+    "num_samples": 178163,
+    "number_of_characters": 10498457,
+    "documents_text_statistics": {
+      "total_text_length": 10447229,
+      "min_text_length": 1,
+      "average_text_length": 58.96958732918273,
+      "max_text_length": 581,
+      "unique_texts": 176849
+    },
+    "documents_image_statistics": null,
+    "queries_text_statistics": {
+      "total_text_length": 51228,
+      "min_text_length": 2,
+      "average_text_length": 51.228,
+      "max_text_length": 180,
+      "unique_texts": 1000
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 1641,
+      "min_relevant_docs_per_query": 1,
+      "average_relevant_docs_per_query": 1.641,
+      "max_relevant_docs_per_query": 34,
+      "unique_relevant_docs": 1641
+    },
+    "top_ranked_statistics": null
+  }
+}
@@ -0,0 +1,30 @@
+{
+  "test": {
+    "num_samples": 192237,
+    "number_of_characters": 234466370,
+    "documents_text_statistics": {
+      "total_text_length": 234404032,
+      "min_text_length": 0,
+      "average_text_length": 1225.7253146619116,
+      "max_text_length": 2000,
+      "unique_texts": 191237
+    },
+    "documents_image_statistics": null,
+    "queries_text_statistics": {
+      "total_text_length": 62338,
+      "min_text_length": 4,
+      "average_text_length": 62.338,
+      "max_text_length": 85,
+      "unique_texts": 1000
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 1000,
+      "min_relevant_docs_per_query": 1,
+      "average_relevant_docs_per_query": 1.0,
+      "max_relevant_docs_per_query": 1,
+      "unique_relevant_docs": 1000
+    },
+    "top_ranked_statistics": null
+  }
+}
@@ -0,0 +1,184 @@
+{
+  "train": {
+    "num_samples": 16500,
+    "number_of_characters": 118992,
+    "documents_text_statistics": null,
+    "documents_image_statistics": {
+      "min_image_width": 447,
+      "average_image_width": 1401.1196666666667,
+      "max_image_width": 2743,
+      "min_image_height": 376,
+      "average_image_height": 1685.2892,
+      "max_image_height": 5257,
+      "unique_images": 14981
+    },
+    "queries_text_statistics": {
+      "total_text_length": 118992,
+      "min_text_length": 13,
+      "average_text_length": 79.328,
+      "max_text_length": 204,
+      "unique_texts": 1499
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 1499,
+      "min_relevant_docs_per_query": 1,
+      "average_relevant_docs_per_query": 1.0,
+      "max_relevant_docs_per_query": 1,
+      "unique_relevant_docs": 1499
+    },
+    "top_ranked_statistics": null,
+    "hf_subset_descriptive_stats": {
+      "en": {
+        "num_samples": 3300,
+        "number_of_characters": 20947,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+          "min_image_width": 653,
+          "average_image_width": 1388.4603333333334,
+          "max_image_width": 2464,
+          "min_image_height": 878,
+          "average_image_height": 1691.6246666666666,
+          "max_image_height": 3533,
+          "unique_images": 2996
+        },
+        "queries_text_statistics": {
+          "total_text_length": 20947,
+          "min_text_length": 31,
+          "average_text_length": 69.82333333333334,
+          "max_text_length": 142,
+          "unique_texts": 300
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 300,
+          "min_relevant_docs_per_query": 1,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 1,
+          "unique_relevant_docs": 300
+        },
+        "top_ranked_statistics": null
+      },
+      "es": {
+        "num_samples": 3300,
+        "number_of_characters": 24935,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+          "min_image_width": 447,
+          "average_image_width": 1370.8263333333334,
+          "max_image_width": 2743,
+          "min_image_height": 376,
+          "average_image_height": 1709.195,
+          "max_image_height": 5257,
+          "unique_images": 2997
+        },
+        "queries_text_statistics": {
+          "total_text_length": 24935,
+          "min_text_length": 35,
+          "average_text_length": 83.11666666666666,
+          "max_text_length": 153,
+          "unique_texts": 300
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 300,
+          "min_relevant_docs_per_query": 1,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 1,
+          "unique_relevant_docs": 300
+        },
+        "top_ranked_statistics": null
+      },
+      "fr": {
+        "num_samples": 3300,
+        "number_of_characters": 25217,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+          "min_image_width": 780,
+          "average_image_width": 1402.3566666666666,
+          "max_image_width": 2579,
+          "min_image_height": 756,
+          "average_image_height": 1689.5696666666668,
+          "max_image_height": 2912,
+          "unique_images": 2998
+        },
+        "queries_text_statistics": {
+          "total_text_length": 25217,
+          "min_text_length": 37,
+          "average_text_length": 84.05666666666667,
+          "max_text_length": 152,
+          "unique_texts": 299
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 299,
+          "min_relevant_docs_per_query": 1,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 1,
+          "unique_relevant_docs": 299
+        },
+        "top_ranked_statistics": null
+      },
+      "de": {
+        "num_samples": 3300,
+        "number_of_characters": 23029,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+          "min_image_width": 828,
+          "average_image_width": 1394.5596666666668,
+          "max_image_width": 2366,
+          "min_image_height": 756,
+          "average_image_height": 1686.0596666666668,
+          "max_image_height": 2827,
+          "unique_images": 2994
+        },
+        "queries_text_statistics": {
+          "total_text_length": 23029,
+          "min_text_length": 35,
+          "average_text_length": 76.76333333333334,
+          "max_text_length": 143,
+          "unique_texts": 300
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 300,
+          "min_relevant_docs_per_query": 1,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 1,
+          "unique_relevant_docs": 300
+        },
+        "top_ranked_statistics": null
+      },
+      "it": {
+        "num_samples": 3300,
+        "number_of_characters": 24864,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+          "min_image_width": 788,
+          "average_image_width": 1449.3953333333334,
+          "max_image_width": 2583,
+          "min_image_height": 804,
+          "average_image_height": 1649.997,
+          "max_image_height": 2168,
+          "unique_images": 2996
+        },
+        "queries_text_statistics": {
+          "total_text_length": 24864,
+          "min_text_length": 13,
+          "average_text_length": 82.88,
+          "max_text_length": 204,
+          "unique_texts": 300
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 300,
+          "min_relevant_docs_per_query": 1,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 1,
+          "unique_relevant_docs": 300
+        },
+        "top_ranked_statistics": null
+      }
+    }
+  }
+}
@@ -13,7 +13,15 @@ def check_language_code(code: str) -> None:
     Args:
         code: The language code to check.
     """
-    lang, script = code.split("-")
+    lang = None
+    script = None
+    if "-" in code:
+        lang, script = code.split("-")
+    elif code[0].isupper():
+        script = code
+    else:
+        lang = code
+
     if script == "Code":
        if lang in PROGRAMMING_LANGS:
            return  # override for code
@@ -21,11 +29,11 @@ def check_language_code(code: str) -> None:
            raise ValueError(
                f"Programming language {lang} is not a valid programming language."
            )
-    if lang not in ISO_TO_LANGUAGE:
+    if lang is not None and lang not in ISO_TO_LANGUAGE:
        raise ValueError(
            f"Invalid language code: {lang}, you can find valid ISO 639-3 codes in {path_to_lang_codes}"
        )
-    if script not in ISO_TO_SCRIPT:
+    if script is not None and script not in ISO_TO_SCRIPT:
        raise ValueError(
            f"Invalid script code: {script}, you can find valid ISO 15924 codes in {path_to_lang_scripts}"
        )
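
With this change, check_language_code accepts three input shapes instead of only "lang-script" pairs. A minimal usage sketch of the new behavior (import path as added to language_scripts.py below):

    from mteb.languages import check_language_code

    check_language_code("eng-Latn")  # full pair: both halves validated
    check_language_code("eng")       # bare lowercase code: language only, script check skipped
    check_language_code("Latn")      # bare capitalized code: script only, language check skipped
    check_language_code("xx-Latn")   # still raises ValueError: not a valid ISO 639-3 code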
@@ -3,6 +3,8 @@ from dataclasses import dataclass

 from typing_extensions import Self

+from mteb.languages import check_language_code
+

 @dataclass
 class LanguageScripts:
@@ -46,8 +48,10 @@ class LanguageScripts:
            if len(lang_script) == 2:
                normalized_langs.add(lang_script[0])
                lang_script_codes.add(lang)
+                check_language_code(lang)
                script_codes.add(lang_script[1])
            else:
+                check_language_code(lang)
                normalized_langs.add(lang)

        return cls(
@@ -53,7 +53,7 @@ ACKNOWLEDGEMENT = """
    <img src="https://play-lh.googleusercontent.com/HdfHZ5jnfMM1Ep7XpPaVdFIVSRx82wKlRC_qmnHx9H1E4aWNp4WKoOcH0x95NAnuYg" width="60" height="55" style="padding: 10px;">
  </a>
  <a href="https://huggingface.co">
-    <img src="https://raw.githubusercontent.com/embeddings-benchmark/mteb/main/docs/images/hf_logo.png" width="60" height="55" style="padding: 10px;">
+    <img src="https://raw.githubusercontent.com/embeddings-benchmark/mteb/main/docs/images/logos/hf_logo.png" width="60" height="55" style="padding: 10px;">
  </a>
 </div>

@@ -2,7 +2,7 @@ from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader

 b1ade_training_data = {
-    # We are in teh process of submitting a paper outlining our process of creating b1ade using model merging and knowledge distillation.
+    # We are in the process of submitting a paper outlining our process of creating b1ade using model merging and knowledge distillation.
    # Similar to mixedbread models, we do not train on any data (except the MSMarco training split) of MTEB.
    "MSMARCO",
 }
@@ -62,7 +62,7 @@ bge_m3_training_data = {
    # mMARCO-ZH
    # LawGPT
    # NLI-zh2, LeCaRDv2,
-    # NLI, MultiLongDoc (their syntetic)
+    # NLI, MultiLongDoc (their synthetic)
    # + synthetic data
 }

@@ -141,7 +141,6 @@ bge_chinese_training_data = {
 # https://huggingface.co/BAAI/bge-m3/discussions/29
 bgem3_languages = [
    "afr-Latn",  # af
-    # als
    "amh-Ethi",  # am
    # an
    # ar
@@ -151,7 +150,6 @@ bgem3_languages = [
    # av
    # az
    "azj-Latn",  # azb
-    # ba
    # bar
    # bcl
    "ben-Beng",  # be
@@ -48,7 +48,7 @@ class BMRetrieverWrapper(InstructSentenceTransformerModel):
        if padding_side is not None:
            tokenizer_params["padding_side"] = padding_side
        kwargs.setdefault("tokenizer_args", {}).update(tokenizer_params)
-        kwargs.setdefault("config_args", {}).update(revison=revision)
+        kwargs.setdefault("config_args", {}).update(revision=revision)

        transformer = Transformer(
            model_name,
@@ -39,7 +39,7 @@ class Encoder(torch.nn.Module):
        self.max_length = max_length
        self.normalize = normalize
        self.processor.tokenizer.padding_side = "right"
-        self.defualt_instruction = "You are a helpful assistant."
+        self.default_instruction = "You are a helpful assistant."

    def forward(
        self,
@@ -103,7 +103,7 @@ class Encoder(torch.nn.Module):
        instruction=None,
        **kwargs,
    ):
-        instruction = instruction or self.defualt_instruction
+        instruction = instruction or self.default_instruction
        # Inputs must be batched
        input_texts, input_images = [], []
        for t, i in zip(texts, images):
@@ -79,7 +79,7 @@ granite_training_data = {
    "MIRACLReranking",
    # Multilingual MrTydi Triples
    "MrTidyRetrieval",
-    # Sadeeem Question Asnwering
+    # Sadeeem Question Answering
    # DBPedia Title-Body Pairs
    "DBPedia",
    "DBPedia-NL",  # translated from hotpotQA (not trained on)
@@ -4,7 +4,7 @@ from mteb.models.model_meta import (
 )
 from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader

-inf_retreiver_v1_training_data = {
+inf_retriever_v1_training_data = {
    # eng_Latn
    "ArguAna",
    "CQADupstackRetrieval",
@@ -66,7 +66,7 @@ inf_retriever_v1 = ModelMeta(
    adapted_from="Alibaba-NLP/gte-Qwen2-7B-instruct",
    public_training_code=None,
    public_training_data=None,
-    training_datasets=inf_retreiver_v1_training_data,
+    training_datasets=inf_retriever_v1_training_data,
    citation=INF_RETRIEVER_CITATION,
 )

@@ -92,6 +92,6 @@ inf_retriever_v1_1_5b = ModelMeta(
    adapted_from="Alibaba-NLP/gte-Qwen2-1.5B-instruct",
    public_training_code=None,
    public_training_data=None,
-    training_datasets=inf_retreiver_v1_training_data,
+    training_datasets=inf_retriever_v1_training_data,
    citation=INF_RETRIEVER_CITATION,
 )
@@ -310,9 +310,19 @@ class JinaV4Wrapper(AbsEncoder):
        text_embeddings = None
        image_embeddings = None
        if "text" in inputs.dataset.features:
-            text_embeddings = self.get_text_embeddings(inputs, **kwargs)
+            text_embeddings = self.get_text_embeddings(
+                inputs,
+                task_metadata=task_metadata,
+                prompt_type=prompt_type,
+                **kwargs,
+            )
        if "image" in inputs.dataset.features:
-            image_embeddings = self.get_image_embeddings(inputs, **kwargs)
+            image_embeddings = self.get_image_embeddings(
+                inputs,
+                task_metadata=task_metadata,
+                prompt_type=prompt_type,
+                **kwargs,
+            )

        if text_embeddings is not None and image_embeddings is not None:
            if len(text_embeddings) != len(image_embeddings):
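
The hunk above fixes a kwargs-forwarding bug: get_text_embeddings and get_image_embeddings were called without task_metadata and prompt_type, so instruction-dependent prompts silently fell back to a generic default. A self-contained toy illustration of the failure mode (names are illustrative, not the package's API):

    def encode(batch, task_metadata=None, prompt_type=None):
        # Stand-in encoder: prefixes each text with its task-specific prompt.
        prompt = f"{task_metadata}/{prompt_type}" if task_metadata else "generic"
        return [f"{prompt}|{text}" for text in batch]

    def dispatch_buggy(batch, **kwargs):
        return encode(batch)  # drops task_metadata/prompt_type on the floor

    def dispatch_fixed(batch, task_metadata=None, prompt_type=None, **kwargs):
        return encode(batch, task_metadata=task_metadata, prompt_type=prompt_type)

    assert dispatch_buggy(["q"], task_metadata="Retrieval", prompt_type="query") == ["generic|q"]
    assert dispatch_fixed(["q"], task_metadata="Retrieval", prompt_type="query") == ["Retrieval/query|q"]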
@@ -23,7 +23,7 @@ def llm2vec_instruction(instruction):

 llm2vec_supervised_training_data = {
    # source, section g1: https://arxiv.org/pdf/2404.05961
-    # splits assumed but unkown
+    # splits assumed but unknown
    "HotpotQA",
    "HotpotQA-PL",  # translation not trained on
    "HotpotQA-NL",  # translation not trained on
@@ -382,7 +382,7 @@ Mihaiii__Venusaur = ModelMeta(
    reference="https://huggingface.co/Mihaiii/Venusaur",
    similarity_fn_name=ScoringFunction.COSINE,
    use_instructions=None,
-    training_datasets=None,  # source model is unkown
+    training_datasets=None,  # source model is unknown
    # {"Mihaiii/qa-assistant"},
    adapted_from="Mihaiii/test14",
    superseded_by=None,
@@ -1516,7 +1516,7 @@ openbmb__minicpm_embedding = ModelMeta(
    superseded_by=None,
 )

-silma_ai__silma_embeddding_matryoshka_v0_1 = ModelMeta(
+silma_ai__silma_embedding_matryoshka_v0_1 = ModelMeta(
    name="silma-ai/silma-embeddding-matryoshka-v0.1",
    revision="a520977a9542ebdb8a7206df6b7ff6977f1886ea",
    release_date="2024-10-12",
@@ -5,7 +5,7 @@ from mteb.models.model_meta import (
 from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader

 mixedbread_training_data = {
-    # from correspondance:
+    # from correspondence:
    # as mentioned in our blog post
    # (https://www.mixedbread.com/blog/mxbai-embed-large-v1#built-for-rag-and-real-world-use-cases:~:text=During%20the%20whole,related%20use%20cases.)
    # We do not train on any data (except the MSMarco training split) of MTEB. We have a strong filtering process to ensure the OOD setting. That's true
@@ -27,7 +27,7 @@ SFR_TRAINING_DATA = {  # inherits from e5
    "HotpotQA-PL",  # translation not trained on
    "HotpotQA-NL",  # translation not trained on
    # source: https://github.com/embeddings-benchmark/leaderboard/issues/41
-    # qoute: In the realm of Semantic Textual Similarity (STS), it is trained on STS12, STS22, and STSBenchmark
+    # quote: In the realm of Semantic Textual Similarity (STS), it is trained on STS12, STS22, and STSBenchmark
    "STS12",
    "STS22",
    "STSBenchmark",
@@ -344,7 +344,7 @@ TASK_NAME_TO_INSTRUCTION = {
    "SprintDuplicateQuestions": "Retrieve semantically similar text\n{}",
    "TwitterSemEval2015": "Retrieve semantically similar text\n{}",
    "TwitterURLCorpus": "Retrieve semantically similar text\n{}",
-    "CQADupstackGamingRetrieval": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given questionn\n{}",
+    "CQADupstackGamingRetrieval": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question\n{}",
    "CQADupstackUnixRetrieval": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question\n{}",
    "DuRetrieval": "为这个句子生成表示以用于检索相关内容:{}",
    "T2Retrieval": "为这个句子生成表示以用于检索相关内容:{}",
@@ -51,7 +51,13 @@ def _downsample_image(
 def voyage_v_loader(model_name, **kwargs):
    requires_package(
        voyage_v_loader,
-        "voyageai and tenacity",
+        "voyageai",
+        model_name,
+        "pip install 'mteb[voyage_v]'",
+    )
+    requires_package(
+        voyage_v_loader,
+        "tenacity",
        model_name,
        "pip install 'mteb[voyage_v]'",
    )
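
Splitting the guard means a missing dependency is now reported by its installable name, rather than the literal string "voyageai and tenacity", which pip cannot resolve. A minimal sketch of what such a guard typically does (assumed behavior; mteb's actual requires_package may differ):

    import importlib.util

    def requires_package(caller, package_name, model_name, install_hint):
        # Fail fast with an actionable message when an optional dependency is absent.
        if importlib.util.find_spec(package_name) is None:
            raise ImportError(
                f"{model_name} (loaded via {caller.__name__}) requires the "
                f"'{package_name}' package. Install it with: {install_hint}"
            )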
@@ -65,11 +71,9 @@ def voyage_v_loader(model_name, **kwargs):
            **kwargs: Any,
        ):
            requires_image_dependencies()
-            from torchvision import transforms

            self.model_name = model_name.split("/")[-1]
            self.vo = voyageai.Client()
-            self.tensor_to_image = transforms.Compose([transforms.PILToTensor()])

        @retry(
            stop=stop_after_attempt(6),  # Stop after 6 attempts
@@ -126,10 +130,7 @@ def voyage_v_loader(model_name, **kwargs):
            for batch in tqdm(
                images, disable=not show_progress_bar, desc="Image Encoding"
            ):
-                batch_images = [
-                    [_downsample_image(self.tensor_to_image(image))]
-                    for image in batch["image"]
-                ]
+                batch_images = [[_downsample_image(image)] for image in batch["image"]]
                embeddings = self._multimodal_embed(
                    batch_images, model=self.model_name, input_type=input_type
                ).embeddings
@@ -163,8 +164,7 @@ def voyage_v_loader(model_name, **kwargs):
                inputs, disable=not show_progress_bar, desc="Interleaved Encoding"
            ):
                batch_images = [
-                    _downsample_image(self.tensor_to_image(image))
-                    for image in batch["image"]
+                    _downsample_image(image) for image in batch["image"]
                ]
                batch_texts = batch["text"]
                interleaved_inputs = [
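
With the torchvision PILToTensor round trip removed, batches are expected to yield PIL images that go straight into _downsample_image. An illustrative stand-in for such a downsampler, assuming a pixel budget (the actual limit used by the Voyage wrapper is not shown in this diff):

    from PIL import Image

    def downsample_image(img: Image.Image, max_pixels: int = 16_000_000) -> Image.Image:
        # Shrink proportionally so width * height stays within the pixel budget.
        width, height = img.size
        if width * height <= max_pixels:
            return img
        scale = (max_pixels / (width * height)) ** 0.5
        return img.resize((max(1, int(width * scale)), max(1, int(height * scale))))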