mteb 2.2.2__py3-none-any.whl → 2.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +4 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/evaluate.py +38 -7
- mteb/models/__init__.py +4 -1
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/model_implementations/colpali_models.py +4 -4
- mteb/models/model_implementations/colqwen_models.py +206 -2
- mteb/models/model_implementations/eagerworks_models.py +163 -0
- mteb/models/model_implementations/euler_models.py +25 -0
- mteb/models/model_implementations/google_models.py +1 -1
- mteb/models/model_implementations/jina_models.py +203 -5
- mteb/models/model_implementations/nb_sbert.py +1 -1
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +10 -11
- mteb/models/model_implementations/nvidia_models.py +1 -1
- mteb/models/model_implementations/ops_moa_models.py +2 -2
- mteb/models/model_implementations/promptriever_models.py +4 -4
- mteb/models/model_implementations/qwen3_models.py +3 -3
- mteb/models/model_implementations/qzhou_models.py +1 -1
- mteb/models/model_implementations/random_baseline.py +8 -18
- mteb/models/model_implementations/vdr_models.py +1 -0
- mteb/models/model_implementations/yuan_models_en.py +57 -0
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
- mteb/models/search_wrappers.py +157 -41
- mteb/results/model_result.py +2 -1
- mteb/results/task_result.py +12 -0
- mteb/similarity_functions.py +49 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +3 -3
- {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/METADATA +6 -1
- {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/RECORD +40 -31
- {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/WHEEL +0 -0
- {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/entry_points.txt +0 -0
- {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/top_level.txt +0 -0
mteb/__init__.py
CHANGED
|
@@ -9,8 +9,10 @@ from mteb.filter_tasks import filter_tasks
|
|
|
9
9
|
from mteb.get_tasks import get_task, get_tasks
|
|
10
10
|
from mteb.load_results import load_results
|
|
11
11
|
from mteb.models import (
|
|
12
|
+
CacheBackendProtocol,
|
|
12
13
|
CrossEncoderProtocol,
|
|
13
14
|
EncoderProtocol,
|
|
15
|
+
IndexEncoderSearchProtocol,
|
|
14
16
|
SearchProtocol,
|
|
15
17
|
SentenceTransformerEncoderWrapper,
|
|
16
18
|
)
|
|
@@ -27,8 +29,10 @@ __all__ = [
|
|
|
27
29
|
"AbsTask",
|
|
28
30
|
"Benchmark",
|
|
29
31
|
"BenchmarkResults",
|
|
32
|
+
"CacheBackendProtocol",
|
|
30
33
|
"CrossEncoderProtocol",
|
|
31
34
|
"EncoderProtocol",
|
|
35
|
+
"IndexEncoderSearchProtocol",
|
|
32
36
|
"SearchProtocol",
|
|
33
37
|
"SentenceTransformerEncoderWrapper",
|
|
34
38
|
"TaskMetadata",
|
|
@@ -0,0 +1,466 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 33489,
|
|
4
|
+
"number_of_characters": 478879013,
|
|
5
|
+
"documents_text_statistics": {
|
|
6
|
+
"total_text_length": 478570118,
|
|
7
|
+
"min_text_length": 37,
|
|
8
|
+
"average_text_length": 16119.442150291354,
|
|
9
|
+
"max_text_length": 287838,
|
|
10
|
+
"unique_texts": 29689
|
|
11
|
+
},
|
|
12
|
+
"documents_image_statistics": null,
|
|
13
|
+
"queries_text_statistics": {
|
|
14
|
+
"total_text_length": 308895,
|
|
15
|
+
"min_text_length": 3,
|
|
16
|
+
"average_text_length": 81.28815789473684,
|
|
17
|
+
"max_text_length": 2589,
|
|
18
|
+
"unique_texts": 3800
|
|
19
|
+
},
|
|
20
|
+
"queries_image_statistics": null,
|
|
21
|
+
"relevant_docs_statistics": {
|
|
22
|
+
"num_relevant_docs": 3800,
|
|
23
|
+
"min_relevant_docs_per_query": 8,
|
|
24
|
+
"average_relevant_docs_per_query": 1.0,
|
|
25
|
+
"max_relevant_docs_per_query": 8,
|
|
26
|
+
"unique_relevant_docs": 29689
|
|
27
|
+
},
|
|
28
|
+
"top_ranked_statistics": {
|
|
29
|
+
"num_top_ranked": 30400,
|
|
30
|
+
"min_top_ranked_per_query": 8,
|
|
31
|
+
"average_top_ranked_per_query": 8.0,
|
|
32
|
+
"max_top_ranked_per_query": 8
|
|
33
|
+
},
|
|
34
|
+
"hf_subset_descriptive_stats": {
|
|
35
|
+
"ar": {
|
|
36
|
+
"num_samples": 1759,
|
|
37
|
+
"number_of_characters": 17483509,
|
|
38
|
+
"documents_text_statistics": {
|
|
39
|
+
"total_text_length": 17468355,
|
|
40
|
+
"min_text_length": 2467,
|
|
41
|
+
"average_text_length": 11204.846055163567,
|
|
42
|
+
"max_text_length": 115382,
|
|
43
|
+
"unique_texts": 1559
|
|
44
|
+
},
|
|
45
|
+
"documents_image_statistics": null,
|
|
46
|
+
"queries_text_statistics": {
|
|
47
|
+
"total_text_length": 15154,
|
|
48
|
+
"min_text_length": 7,
|
|
49
|
+
"average_text_length": 75.77,
|
|
50
|
+
"max_text_length": 695,
|
|
51
|
+
"unique_texts": 200
|
|
52
|
+
},
|
|
53
|
+
"queries_image_statistics": null,
|
|
54
|
+
"relevant_docs_statistics": {
|
|
55
|
+
"num_relevant_docs": 200,
|
|
56
|
+
"min_relevant_docs_per_query": 8,
|
|
57
|
+
"average_relevant_docs_per_query": 1.0,
|
|
58
|
+
"max_relevant_docs_per_query": 8,
|
|
59
|
+
"unique_relevant_docs": 1559
|
|
60
|
+
},
|
|
61
|
+
"top_ranked_statistics": {
|
|
62
|
+
"num_top_ranked": 1600,
|
|
63
|
+
"min_top_ranked_per_query": 8,
|
|
64
|
+
"average_top_ranked_per_query": 8.0,
|
|
65
|
+
"max_top_ranked_per_query": 8
|
|
66
|
+
}
|
|
67
|
+
},
|
|
68
|
+
"de": {
|
|
69
|
+
"num_samples": 1800,
|
|
70
|
+
"number_of_characters": 9860028,
|
|
71
|
+
"documents_text_statistics": {
|
|
72
|
+
"total_text_length": 9835298,
|
|
73
|
+
"min_text_length": 107,
|
|
74
|
+
"average_text_length": 6147.06125,
|
|
75
|
+
"max_text_length": 92210,
|
|
76
|
+
"unique_texts": 1600
|
|
77
|
+
},
|
|
78
|
+
"documents_image_statistics": null,
|
|
79
|
+
"queries_text_statistics": {
|
|
80
|
+
"total_text_length": 24730,
|
|
81
|
+
"min_text_length": 10,
|
|
82
|
+
"average_text_length": 123.65,
|
|
83
|
+
"max_text_length": 957,
|
|
84
|
+
"unique_texts": 200
|
|
85
|
+
},
|
|
86
|
+
"queries_image_statistics": null,
|
|
87
|
+
"relevant_docs_statistics": {
|
|
88
|
+
"num_relevant_docs": 200,
|
|
89
|
+
"min_relevant_docs_per_query": 8,
|
|
90
|
+
"average_relevant_docs_per_query": 1.0,
|
|
91
|
+
"max_relevant_docs_per_query": 8,
|
|
92
|
+
"unique_relevant_docs": 1600
|
|
93
|
+
},
|
|
94
|
+
"top_ranked_statistics": {
|
|
95
|
+
"num_top_ranked": 1600,
|
|
96
|
+
"min_top_ranked_per_query": 8,
|
|
97
|
+
"average_top_ranked_per_query": 8.0,
|
|
98
|
+
"max_top_ranked_per_query": 8
|
|
99
|
+
}
|
|
100
|
+
},
|
|
101
|
+
"en": {
|
|
102
|
+
"num_samples": 6878,
|
|
103
|
+
"number_of_characters": 221164232,
|
|
104
|
+
"documents_text_statistics": {
|
|
105
|
+
"total_text_length": 221099168,
|
|
106
|
+
"min_text_length": 12147,
|
|
107
|
+
"average_text_length": 36376.96084238236,
|
|
108
|
+
"max_text_length": 287838,
|
|
109
|
+
"unique_texts": 6078
|
|
110
|
+
},
|
|
111
|
+
"documents_image_statistics": null,
|
|
112
|
+
"queries_text_statistics": {
|
|
113
|
+
"total_text_length": 65064,
|
|
114
|
+
"min_text_length": 18,
|
|
115
|
+
"average_text_length": 81.33,
|
|
116
|
+
"max_text_length": 255,
|
|
117
|
+
"unique_texts": 800
|
|
118
|
+
},
|
|
119
|
+
"queries_image_statistics": null,
|
|
120
|
+
"relevant_docs_statistics": {
|
|
121
|
+
"num_relevant_docs": 800,
|
|
122
|
+
"min_relevant_docs_per_query": 8,
|
|
123
|
+
"average_relevant_docs_per_query": 1.0,
|
|
124
|
+
"max_relevant_docs_per_query": 8,
|
|
125
|
+
"unique_relevant_docs": 6078
|
|
126
|
+
},
|
|
127
|
+
"top_ranked_statistics": {
|
|
128
|
+
"num_top_ranked": 6400,
|
|
129
|
+
"min_top_ranked_per_query": 8,
|
|
130
|
+
"average_top_ranked_per_query": 8.0,
|
|
131
|
+
"max_top_ranked_per_query": 8
|
|
132
|
+
}
|
|
133
|
+
},
|
|
134
|
+
"es": {
|
|
135
|
+
"num_samples": 1780,
|
|
136
|
+
"number_of_characters": 20852843,
|
|
137
|
+
"documents_text_statistics": {
|
|
138
|
+
"total_text_length": 20826446,
|
|
139
|
+
"min_text_length": 2657,
|
|
140
|
+
"average_text_length": 13181.29493670886,
|
|
141
|
+
"max_text_length": 270338,
|
|
142
|
+
"unique_texts": 1580
|
|
143
|
+
},
|
|
144
|
+
"documents_image_statistics": null,
|
|
145
|
+
"queries_text_statistics": {
|
|
146
|
+
"total_text_length": 26397,
|
|
147
|
+
"min_text_length": 40,
|
|
148
|
+
"average_text_length": 131.985,
|
|
149
|
+
"max_text_length": 480,
|
|
150
|
+
"unique_texts": 200
|
|
151
|
+
},
|
|
152
|
+
"queries_image_statistics": null,
|
|
153
|
+
"relevant_docs_statistics": {
|
|
154
|
+
"num_relevant_docs": 200,
|
|
155
|
+
"min_relevant_docs_per_query": 8,
|
|
156
|
+
"average_relevant_docs_per_query": 1.0,
|
|
157
|
+
"max_relevant_docs_per_query": 8,
|
|
158
|
+
"unique_relevant_docs": 1580
|
|
159
|
+
},
|
|
160
|
+
"top_ranked_statistics": {
|
|
161
|
+
"num_top_ranked": 1600,
|
|
162
|
+
"min_top_ranked_per_query": 8,
|
|
163
|
+
"average_top_ranked_per_query": 8.0,
|
|
164
|
+
"max_top_ranked_per_query": 8
|
|
165
|
+
}
|
|
166
|
+
},
|
|
167
|
+
"fr": {
|
|
168
|
+
"num_samples": 1762,
|
|
169
|
+
"number_of_characters": 17828712,
|
|
170
|
+
"documents_text_statistics": {
|
|
171
|
+
"total_text_length": 17798753,
|
|
172
|
+
"min_text_length": 2093,
|
|
173
|
+
"average_text_length": 11394.848271446863,
|
|
174
|
+
"max_text_length": 133854,
|
|
175
|
+
"unique_texts": 1562
|
|
176
|
+
},
|
|
177
|
+
"documents_image_statistics": null,
|
|
178
|
+
"queries_text_statistics": {
|
|
179
|
+
"total_text_length": 29959,
|
|
180
|
+
"min_text_length": 33,
|
|
181
|
+
"average_text_length": 149.795,
|
|
182
|
+
"max_text_length": 2589,
|
|
183
|
+
"unique_texts": 200
|
|
184
|
+
},
|
|
185
|
+
"queries_image_statistics": null,
|
|
186
|
+
"relevant_docs_statistics": {
|
|
187
|
+
"num_relevant_docs": 200,
|
|
188
|
+
"min_relevant_docs_per_query": 8,
|
|
189
|
+
"average_relevant_docs_per_query": 1.0,
|
|
190
|
+
"max_relevant_docs_per_query": 8,
|
|
191
|
+
"unique_relevant_docs": 1562
|
|
192
|
+
},
|
|
193
|
+
"top_ranked_statistics": {
|
|
194
|
+
"num_top_ranked": 1600,
|
|
195
|
+
"min_top_ranked_per_query": 8,
|
|
196
|
+
"average_top_ranked_per_query": 8.0,
|
|
197
|
+
"max_top_ranked_per_query": 8
|
|
198
|
+
}
|
|
199
|
+
},
|
|
200
|
+
"hi": {
|
|
201
|
+
"num_samples": 1715,
|
|
202
|
+
"number_of_characters": 18465376,
|
|
203
|
+
"documents_text_statistics": {
|
|
204
|
+
"total_text_length": 18444624,
|
|
205
|
+
"min_text_length": 2426,
|
|
206
|
+
"average_text_length": 12174.669306930693,
|
|
207
|
+
"max_text_length": 227264,
|
|
208
|
+
"unique_texts": 1515
|
|
209
|
+
},
|
|
210
|
+
"documents_image_statistics": null,
|
|
211
|
+
"queries_text_statistics": {
|
|
212
|
+
"total_text_length": 20752,
|
|
213
|
+
"min_text_length": 6,
|
|
214
|
+
"average_text_length": 103.76,
|
|
215
|
+
"max_text_length": 2022,
|
|
216
|
+
"unique_texts": 200
|
|
217
|
+
},
|
|
218
|
+
"queries_image_statistics": null,
|
|
219
|
+
"relevant_docs_statistics": {
|
|
220
|
+
"num_relevant_docs": 200,
|
|
221
|
+
"min_relevant_docs_per_query": 8,
|
|
222
|
+
"average_relevant_docs_per_query": 1.0,
|
|
223
|
+
"max_relevant_docs_per_query": 8,
|
|
224
|
+
"unique_relevant_docs": 1515
|
|
225
|
+
},
|
|
226
|
+
"top_ranked_statistics": {
|
|
227
|
+
"num_top_ranked": 1600,
|
|
228
|
+
"min_top_ranked_per_query": 8,
|
|
229
|
+
"average_top_ranked_per_query": 8.0,
|
|
230
|
+
"max_top_ranked_per_query": 8
|
|
231
|
+
}
|
|
232
|
+
},
|
|
233
|
+
"it": {
|
|
234
|
+
"num_samples": 1780,
|
|
235
|
+
"number_of_characters": 22616410,
|
|
236
|
+
"documents_text_statistics": {
|
|
237
|
+
"total_text_length": 22593491,
|
|
238
|
+
"min_text_length": 2518,
|
|
239
|
+
"average_text_length": 14299.677848101266,
|
|
240
|
+
"max_text_length": 117197,
|
|
241
|
+
"unique_texts": 1580
|
|
242
|
+
},
|
|
243
|
+
"documents_image_statistics": null,
|
|
244
|
+
"queries_text_statistics": {
|
|
245
|
+
"total_text_length": 22919,
|
|
246
|
+
"min_text_length": 12,
|
|
247
|
+
"average_text_length": 114.595,
|
|
248
|
+
"max_text_length": 1899,
|
|
249
|
+
"unique_texts": 200
|
|
250
|
+
},
|
|
251
|
+
"queries_image_statistics": null,
|
|
252
|
+
"relevant_docs_statistics": {
|
|
253
|
+
"num_relevant_docs": 200,
|
|
254
|
+
"min_relevant_docs_per_query": 8,
|
|
255
|
+
"average_relevant_docs_per_query": 1.0,
|
|
256
|
+
"max_relevant_docs_per_query": 8,
|
|
257
|
+
"unique_relevant_docs": 1580
|
|
258
|
+
},
|
|
259
|
+
"top_ranked_statistics": {
|
|
260
|
+
"num_top_ranked": 1600,
|
|
261
|
+
"min_top_ranked_per_query": 8,
|
|
262
|
+
"average_top_ranked_per_query": 8.0,
|
|
263
|
+
"max_top_ranked_per_query": 8
|
|
264
|
+
}
|
|
265
|
+
},
|
|
266
|
+
"ja": {
|
|
267
|
+
"num_samples": 1781,
|
|
268
|
+
"number_of_characters": 8562074,
|
|
269
|
+
"documents_text_statistics": {
|
|
270
|
+
"total_text_length": 8550928,
|
|
271
|
+
"min_text_length": 1244,
|
|
272
|
+
"average_text_length": 5408.556609740671,
|
|
273
|
+
"max_text_length": 97242,
|
|
274
|
+
"unique_texts": 1581
|
|
275
|
+
},
|
|
276
|
+
"documents_image_statistics": null,
|
|
277
|
+
"queries_text_statistics": {
|
|
278
|
+
"total_text_length": 11146,
|
|
279
|
+
"min_text_length": 6,
|
|
280
|
+
"average_text_length": 55.73,
|
|
281
|
+
"max_text_length": 416,
|
|
282
|
+
"unique_texts": 200
|
|
283
|
+
},
|
|
284
|
+
"queries_image_statistics": null,
|
|
285
|
+
"relevant_docs_statistics": {
|
|
286
|
+
"num_relevant_docs": 200,
|
|
287
|
+
"min_relevant_docs_per_query": 8,
|
|
288
|
+
"average_relevant_docs_per_query": 1.0,
|
|
289
|
+
"max_relevant_docs_per_query": 8,
|
|
290
|
+
"unique_relevant_docs": 1581
|
|
291
|
+
},
|
|
292
|
+
"top_ranked_statistics": {
|
|
293
|
+
"num_top_ranked": 1600,
|
|
294
|
+
"min_top_ranked_per_query": 8,
|
|
295
|
+
"average_top_ranked_per_query": 8.0,
|
|
296
|
+
"max_top_ranked_per_query": 8
|
|
297
|
+
}
|
|
298
|
+
},
|
|
299
|
+
"ko": {
|
|
300
|
+
"num_samples": 1770,
|
|
301
|
+
"number_of_characters": 9773349,
|
|
302
|
+
"documents_text_statistics": {
|
|
303
|
+
"total_text_length": 9761605,
|
|
304
|
+
"min_text_length": 1490,
|
|
305
|
+
"average_text_length": 6217.58280254777,
|
|
306
|
+
"max_text_length": 76949,
|
|
307
|
+
"unique_texts": 1570
|
|
308
|
+
},
|
|
309
|
+
"documents_image_statistics": null,
|
|
310
|
+
"queries_text_statistics": {
|
|
311
|
+
"total_text_length": 11744,
|
|
312
|
+
"min_text_length": 8,
|
|
313
|
+
"average_text_length": 58.72,
|
|
314
|
+
"max_text_length": 330,
|
|
315
|
+
"unique_texts": 200
|
|
316
|
+
},
|
|
317
|
+
"queries_image_statistics": null,
|
|
318
|
+
"relevant_docs_statistics": {
|
|
319
|
+
"num_relevant_docs": 200,
|
|
320
|
+
"min_relevant_docs_per_query": 8,
|
|
321
|
+
"average_relevant_docs_per_query": 1.0,
|
|
322
|
+
"max_relevant_docs_per_query": 8,
|
|
323
|
+
"unique_relevant_docs": 1570
|
|
324
|
+
},
|
|
325
|
+
"top_ranked_statistics": {
|
|
326
|
+
"num_top_ranked": 1600,
|
|
327
|
+
"min_top_ranked_per_query": 8,
|
|
328
|
+
"average_top_ranked_per_query": 8.0,
|
|
329
|
+
"max_top_ranked_per_query": 8
|
|
330
|
+
}
|
|
331
|
+
},
|
|
332
|
+
"pt": {
|
|
333
|
+
"num_samples": 1764,
|
|
334
|
+
"number_of_characters": 23152911,
|
|
335
|
+
"documents_text_statistics": {
|
|
336
|
+
"total_text_length": 23130220,
|
|
337
|
+
"min_text_length": 3473,
|
|
338
|
+
"average_text_length": 14789.143222506395,
|
|
339
|
+
"max_text_length": 108535,
|
|
340
|
+
"unique_texts": 1564
|
|
341
|
+
},
|
|
342
|
+
"documents_image_statistics": null,
|
|
343
|
+
"queries_text_statistics": {
|
|
344
|
+
"total_text_length": 22691,
|
|
345
|
+
"min_text_length": 4,
|
|
346
|
+
"average_text_length": 113.455,
|
|
347
|
+
"max_text_length": 511,
|
|
348
|
+
"unique_texts": 200
|
|
349
|
+
},
|
|
350
|
+
"queries_image_statistics": null,
|
|
351
|
+
"relevant_docs_statistics": {
|
|
352
|
+
"num_relevant_docs": 200,
|
|
353
|
+
"min_relevant_docs_per_query": 8,
|
|
354
|
+
"average_relevant_docs_per_query": 1.0,
|
|
355
|
+
"max_relevant_docs_per_query": 8,
|
|
356
|
+
"unique_relevant_docs": 1564
|
|
357
|
+
},
|
|
358
|
+
"top_ranked_statistics": {
|
|
359
|
+
"num_top_ranked": 1600,
|
|
360
|
+
"min_top_ranked_per_query": 8,
|
|
361
|
+
"average_top_ranked_per_query": 8.0,
|
|
362
|
+
"max_top_ranked_per_query": 8
|
|
363
|
+
}
|
|
364
|
+
},
|
|
365
|
+
"ru": {
|
|
366
|
+
"num_samples": 1779,
|
|
367
|
+
"number_of_characters": 22994826,
|
|
368
|
+
"documents_text_statistics": {
|
|
369
|
+
"total_text_length": 22975852,
|
|
370
|
+
"min_text_length": 2914,
|
|
371
|
+
"average_text_length": 14550.887903736542,
|
|
372
|
+
"max_text_length": 151133,
|
|
373
|
+
"unique_texts": 1579
|
|
374
|
+
},
|
|
375
|
+
"documents_image_statistics": null,
|
|
376
|
+
"queries_text_statistics": {
|
|
377
|
+
"total_text_length": 18974,
|
|
378
|
+
"min_text_length": 12,
|
|
379
|
+
"average_text_length": 94.87,
|
|
380
|
+
"max_text_length": 413,
|
|
381
|
+
"unique_texts": 200
|
|
382
|
+
},
|
|
383
|
+
"queries_image_statistics": null,
|
|
384
|
+
"relevant_docs_statistics": {
|
|
385
|
+
"num_relevant_docs": 200,
|
|
386
|
+
"min_relevant_docs_per_query": 8,
|
|
387
|
+
"average_relevant_docs_per_query": 1.0,
|
|
388
|
+
"max_relevant_docs_per_query": 8,
|
|
389
|
+
"unique_relevant_docs": 1579
|
|
390
|
+
},
|
|
391
|
+
"top_ranked_statistics": {
|
|
392
|
+
"num_top_ranked": 1600,
|
|
393
|
+
"min_top_ranked_per_query": 8,
|
|
394
|
+
"average_top_ranked_per_query": 8.0,
|
|
395
|
+
"max_top_ranked_per_query": 8
|
|
396
|
+
}
|
|
397
|
+
},
|
|
398
|
+
"th": {
|
|
399
|
+
"num_samples": 1800,
|
|
400
|
+
"number_of_characters": 8022609,
|
|
401
|
+
"documents_text_statistics": {
|
|
402
|
+
"total_text_length": 8003011,
|
|
403
|
+
"min_text_length": 37,
|
|
404
|
+
"average_text_length": 5001.881875,
|
|
405
|
+
"max_text_length": 44872,
|
|
406
|
+
"unique_texts": 1600
|
|
407
|
+
},
|
|
408
|
+
"documents_image_statistics": null,
|
|
409
|
+
"queries_text_statistics": {
|
|
410
|
+
"total_text_length": 19598,
|
|
411
|
+
"min_text_length": 11,
|
|
412
|
+
"average_text_length": 97.99,
|
|
413
|
+
"max_text_length": 309,
|
|
414
|
+
"unique_texts": 200
|
|
415
|
+
},
|
|
416
|
+
"queries_image_statistics": null,
|
|
417
|
+
"relevant_docs_statistics": {
|
|
418
|
+
"num_relevant_docs": 200,
|
|
419
|
+
"min_relevant_docs_per_query": 8,
|
|
420
|
+
"average_relevant_docs_per_query": 1.0,
|
|
421
|
+
"max_relevant_docs_per_query": 8,
|
|
422
|
+
"unique_relevant_docs": 1600
|
|
423
|
+
},
|
|
424
|
+
"top_ranked_statistics": {
|
|
425
|
+
"num_top_ranked": 1600,
|
|
426
|
+
"min_top_ranked_per_query": 8,
|
|
427
|
+
"average_top_ranked_per_query": 8.0,
|
|
428
|
+
"max_top_ranked_per_query": 8
|
|
429
|
+
}
|
|
430
|
+
},
|
|
431
|
+
"zh": {
|
|
432
|
+
"num_samples": 7121,
|
|
433
|
+
"number_of_characters": 78102134,
|
|
434
|
+
"documents_text_statistics": {
|
|
435
|
+
"total_text_length": 78082367,
|
|
436
|
+
"min_text_length": 6268,
|
|
437
|
+
"average_text_length": 12352.850340136054,
|
|
438
|
+
"max_text_length": 278468,
|
|
439
|
+
"unique_texts": 6321
|
|
440
|
+
},
|
|
441
|
+
"documents_image_statistics": null,
|
|
442
|
+
"queries_text_statistics": {
|
|
443
|
+
"total_text_length": 19767,
|
|
444
|
+
"min_text_length": 3,
|
|
445
|
+
"average_text_length": 24.70875,
|
|
446
|
+
"max_text_length": 646,
|
|
447
|
+
"unique_texts": 800
|
|
448
|
+
},
|
|
449
|
+
"queries_image_statistics": null,
|
|
450
|
+
"relevant_docs_statistics": {
|
|
451
|
+
"num_relevant_docs": 800,
|
|
452
|
+
"min_relevant_docs_per_query": 8,
|
|
453
|
+
"average_relevant_docs_per_query": 1.0,
|
|
454
|
+
"max_relevant_docs_per_query": 8,
|
|
455
|
+
"unique_relevant_docs": 6321
|
|
456
|
+
},
|
|
457
|
+
"top_ranked_statistics": {
|
|
458
|
+
"num_top_ranked": 6400,
|
|
459
|
+
"min_top_ranked_per_query": 8,
|
|
460
|
+
"average_top_ranked_per_query": 8.0,
|
|
461
|
+
"max_top_ranked_per_query": 8
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
}
|
|
466
|
+
}
|
mteb/evaluate.py
CHANGED
|
@@ -7,6 +7,7 @@ from pathlib import Path
|
|
|
7
7
|
from time import time
|
|
8
8
|
from typing import TYPE_CHECKING, Any, cast
|
|
9
9
|
|
|
10
|
+
from datasets.exceptions import DatasetNotFoundError
|
|
10
11
|
from tqdm.auto import tqdm
|
|
11
12
|
|
|
12
13
|
from mteb._helpful_enum import HelpfulStrEnum
|
|
@@ -25,6 +26,7 @@ from mteb.models.sentence_transformer_wrapper import (
|
|
|
25
26
|
SentenceTransformerEncoderWrapper,
|
|
26
27
|
)
|
|
27
28
|
from mteb.results import ModelResult, TaskResult
|
|
29
|
+
from mteb.results.task_result import TaskError
|
|
28
30
|
from mteb.types import HFSubset, PromptType, SplitName
|
|
29
31
|
from mteb.types._metadata import ModelName, Revision
|
|
30
32
|
|
|
@@ -117,7 +119,8 @@ def _evaluate_task(
|
|
|
117
119
|
co2_tracker: bool | None,
|
|
118
120
|
encode_kwargs: dict[str, Any],
|
|
119
121
|
prediction_folder: Path | None,
|
|
120
|
-
|
|
122
|
+
public_only: bool | None,
|
|
123
|
+
) -> TaskResult | TaskError:
|
|
121
124
|
"""The core logic to run a model on a given task. See `evaluate` for more details.
|
|
122
125
|
|
|
123
126
|
Returns:
|
|
@@ -149,6 +152,7 @@ def _evaluate_task(
|
|
|
149
152
|
encode_kwargs=encode_kwargs,
|
|
150
153
|
co2_tracker=False,
|
|
151
154
|
prediction_folder=prediction_folder,
|
|
155
|
+
public_only=public_only,
|
|
152
156
|
)
|
|
153
157
|
result.kg_co2_emissions = tracker.final_emissions
|
|
154
158
|
return result
|
|
@@ -159,7 +163,20 @@ def _evaluate_task(
|
|
|
159
163
|
|
|
160
164
|
data_loaded = task.data_loaded
|
|
161
165
|
if not data_loaded:
|
|
162
|
-
|
|
166
|
+
try:
|
|
167
|
+
task.load_data()
|
|
168
|
+
except DatasetNotFoundError as e:
|
|
169
|
+
if not task.metadata.is_public and public_only is None:
|
|
170
|
+
logger.warning(
|
|
171
|
+
f"Dataset for private task '{task.metadata.name}' not found. "
|
|
172
|
+
"Make sure you have access to the dataset and that you have set up the authentication correctly. To disable this warning set `public_only=False`"
|
|
173
|
+
)
|
|
174
|
+
return TaskError(
|
|
175
|
+
task_name=task.metadata.name,
|
|
176
|
+
exception=str(e),
|
|
177
|
+
)
|
|
178
|
+
if public_only is False:
|
|
179
|
+
raise e
|
|
163
180
|
|
|
164
181
|
evaluation_time = 0
|
|
165
182
|
|
|
@@ -281,6 +298,7 @@ def evaluate(
|
|
|
281
298
|
overwrite_strategy: str | OverwriteStrategy = "only-missing",
|
|
282
299
|
prediction_folder: Path | str | None = None,
|
|
283
300
|
show_progress_bar: bool = True,
|
|
301
|
+
public_only: bool | None = None,
|
|
284
302
|
) -> ModelResult:
|
|
285
303
|
"""This function runs a model on a given task and returns the results.
|
|
286
304
|
|
|
@@ -304,6 +322,7 @@ def evaluate(
|
|
|
304
322
|
prediction_folder: Optional folder in which to save model predictions for the task. Predictions of the tasks will be sabed in `prediction_folder/{task_name}_predictions.json`
|
|
305
323
|
show_progress_bar: Whether to show a progress bar when running the evaluation. Default is True. Setting this to False will also set the
|
|
306
324
|
`encode_kwargs['show_progress_bar']` to False if encode_kwargs is unspecified.
|
|
325
|
+
public_only: Run only public tasks. If None, it will attempt to run the private task.
|
|
307
326
|
|
|
308
327
|
Returns:
|
|
309
328
|
The results of the evaluation.
|
|
@@ -355,6 +374,7 @@ def evaluate(
|
|
|
355
374
|
overwrite_strategy=overwrite_strategy,
|
|
356
375
|
prediction_folder=prediction_folder,
|
|
357
376
|
show_progress_bar=show_progress_bar,
|
|
377
|
+
public_only=public_only,
|
|
358
378
|
)
|
|
359
379
|
result = task.combine_task_results(results.task_results)
|
|
360
380
|
return ModelResult(
|
|
@@ -367,6 +387,7 @@ def evaluate(
|
|
|
367
387
|
task = tasks
|
|
368
388
|
else:
|
|
369
389
|
results = []
|
|
390
|
+
exceptions = []
|
|
370
391
|
tasks_tqdm = tqdm(
|
|
371
392
|
tasks,
|
|
372
393
|
desc="Evaluating tasks",
|
|
@@ -384,12 +405,16 @@ def evaluate(
|
|
|
384
405
|
overwrite_strategy=overwrite_strategy,
|
|
385
406
|
prediction_folder=prediction_folder,
|
|
386
407
|
show_progress_bar=False,
|
|
408
|
+
public_only=public_only,
|
|
387
409
|
)
|
|
388
410
|
results.extend(_res.task_results)
|
|
411
|
+
if _res.exceptions:
|
|
412
|
+
exceptions.extend(_res.exceptions)
|
|
389
413
|
return ModelResult(
|
|
390
414
|
model_name=_res.model_name,
|
|
391
415
|
model_revision=_res.model_revision,
|
|
392
416
|
task_results=results,
|
|
417
|
+
exceptions=exceptions,
|
|
393
418
|
)
|
|
394
419
|
|
|
395
420
|
overwrite_strategy = OverwriteStrategy.from_str(overwrite_strategy)
|
|
@@ -459,16 +484,13 @@ def evaluate(
|
|
|
459
484
|
co2_tracker=co2_tracker,
|
|
460
485
|
encode_kwargs=encode_kwargs,
|
|
461
486
|
prediction_folder=prediction_folder,
|
|
487
|
+
public_only=public_only,
|
|
462
488
|
)
|
|
463
489
|
except Exception as e:
|
|
464
490
|
logger.error(
|
|
465
491
|
f"Error while running task {task.metadata.name} on splits {list(missing_eval.keys())}: {e}"
|
|
466
492
|
)
|
|
467
|
-
|
|
468
|
-
model_name=model_name,
|
|
469
|
-
model_revision=model_revision,
|
|
470
|
-
task_results=[],
|
|
471
|
-
)
|
|
493
|
+
result = TaskError(task_name=task.metadata.name, exception=str(e))
|
|
472
494
|
else:
|
|
473
495
|
result = _evaluate_task(
|
|
474
496
|
model=model,
|
|
@@ -477,9 +499,18 @@ def evaluate(
|
|
|
477
499
|
co2_tracker=False,
|
|
478
500
|
encode_kwargs=encode_kwargs,
|
|
479
501
|
prediction_folder=prediction_folder,
|
|
502
|
+
public_only=public_only,
|
|
480
503
|
)
|
|
481
504
|
logger.info(f"✓ Finished evaluation for {task.metadata.name}")
|
|
482
505
|
|
|
506
|
+
if isinstance(result, TaskError):
|
|
507
|
+
return ModelResult(
|
|
508
|
+
model_name=model_name,
|
|
509
|
+
model_revision=model_revision,
|
|
510
|
+
task_results=[],
|
|
511
|
+
exceptions=[result],
|
|
512
|
+
)
|
|
513
|
+
|
|
483
514
|
if existing_results:
|
|
484
515
|
result = result.merge(existing_results)
|
|
485
516
|
|
mteb/models/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from .cache_wrappers import CachedEmbeddingWrapper
|
|
1
|
+
from .cache_wrappers import CacheBackendProtocol, CachedEmbeddingWrapper
|
|
2
2
|
from .model_meta import ModelMeta
|
|
3
3
|
from .models_protocols import (
|
|
4
4
|
CrossEncoderProtocol,
|
|
@@ -6,6 +6,7 @@ from .models_protocols import (
|
|
|
6
6
|
MTEBModels,
|
|
7
7
|
SearchProtocol,
|
|
8
8
|
)
|
|
9
|
+
from .search_encoder_index.search_backend_protocol import IndexEncoderSearchProtocol
|
|
9
10
|
from .search_wrappers import SearchCrossEncoderWrapper, SearchEncoderWrapper
|
|
10
11
|
from .sentence_transformer_wrapper import (
|
|
11
12
|
CrossEncoderWrapper,
|
|
@@ -14,10 +15,12 @@ from .sentence_transformer_wrapper import (
|
|
|
14
15
|
)
|
|
15
16
|
|
|
16
17
|
__all__ = [
|
|
18
|
+
"CacheBackendProtocol",
|
|
17
19
|
"CachedEmbeddingWrapper",
|
|
18
20
|
"CrossEncoderProtocol",
|
|
19
21
|
"CrossEncoderWrapper",
|
|
20
22
|
"EncoderProtocol",
|
|
23
|
+
"IndexEncoderSearchProtocol",
|
|
21
24
|
"MTEBModels",
|
|
22
25
|
"ModelMeta",
|
|
23
26
|
"SearchCrossEncoderWrapper",
|
|
@@ -196,10 +196,10 @@ COLPALI_CITATION = """
|
|
|
196
196
|
|
|
197
197
|
COLPALI_TRAINING_DATA = {
|
|
198
198
|
# from https://huggingface.co/datasets/vidore/colpali_train_set
|
|
199
|
-
"
|
|
200
|
-
"
|
|
201
|
-
"
|
|
202
|
-
"
|
|
199
|
+
"VidoreDocVQARetrieval",
|
|
200
|
+
"VidoreInfoVQARetrieval",
|
|
201
|
+
"VidoreTatdqaRetrieval",
|
|
202
|
+
"VidoreArxivQARetrieval",
|
|
203
203
|
}
|
|
204
204
|
|
|
205
205
|
colpali_v1_1 = ModelMeta(
|