mteb-2.3.0-py3-none-any.whl → mteb-2.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,466 @@
+{
+  "test": {
+    "num_samples": 33489,
+    "number_of_characters": 478879013,
+    "documents_text_statistics": {
+      "total_text_length": 478570118,
+      "min_text_length": 37,
+      "average_text_length": 16119.442150291354,
+      "max_text_length": 287838,
+      "unique_texts": 29689
+    },
+    "documents_image_statistics": null,
+    "queries_text_statistics": {
+      "total_text_length": 308895,
+      "min_text_length": 3,
+      "average_text_length": 81.28815789473684,
+      "max_text_length": 2589,
+      "unique_texts": 3800
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 3800,
+      "min_relevant_docs_per_query": 8,
+      "average_relevant_docs_per_query": 1.0,
+      "max_relevant_docs_per_query": 8,
+      "unique_relevant_docs": 29689
+    },
+    "top_ranked_statistics": {
+      "num_top_ranked": 30400,
+      "min_top_ranked_per_query": 8,
+      "average_top_ranked_per_query": 8.0,
+      "max_top_ranked_per_query": 8
+    },
+    "hf_subset_descriptive_stats": {
+      "ar": {
+        "num_samples": 1759,
+        "number_of_characters": 17483509,
+        "documents_text_statistics": {
+          "total_text_length": 17468355,
+          "min_text_length": 2467,
+          "average_text_length": 11204.846055163567,
+          "max_text_length": 115382,
+          "unique_texts": 1559
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+          "total_text_length": 15154,
+          "min_text_length": 7,
+          "average_text_length": 75.77,
+          "max_text_length": 695,
+          "unique_texts": 200
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 200,
+          "min_relevant_docs_per_query": 8,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 8,
+          "unique_relevant_docs": 1559
+        },
+        "top_ranked_statistics": {
+          "num_top_ranked": 1600,
+          "min_top_ranked_per_query": 8,
+          "average_top_ranked_per_query": 8.0,
+          "max_top_ranked_per_query": 8
+        }
+      },
+      "de": {
+        "num_samples": 1800,
+        "number_of_characters": 9860028,
+        "documents_text_statistics": {
+          "total_text_length": 9835298,
+          "min_text_length": 107,
+          "average_text_length": 6147.06125,
+          "max_text_length": 92210,
+          "unique_texts": 1600
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+          "total_text_length": 24730,
+          "min_text_length": 10,
+          "average_text_length": 123.65,
+          "max_text_length": 957,
+          "unique_texts": 200
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 200,
+          "min_relevant_docs_per_query": 8,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 8,
+          "unique_relevant_docs": 1600
+        },
+        "top_ranked_statistics": {
+          "num_top_ranked": 1600,
+          "min_top_ranked_per_query": 8,
+          "average_top_ranked_per_query": 8.0,
+          "max_top_ranked_per_query": 8
+        }
+      },
+      "en": {
+        "num_samples": 6878,
+        "number_of_characters": 221164232,
+        "documents_text_statistics": {
+          "total_text_length": 221099168,
+          "min_text_length": 12147,
+          "average_text_length": 36376.96084238236,
+          "max_text_length": 287838,
+          "unique_texts": 6078
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+          "total_text_length": 65064,
+          "min_text_length": 18,
+          "average_text_length": 81.33,
+          "max_text_length": 255,
+          "unique_texts": 800
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 800,
+          "min_relevant_docs_per_query": 8,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 8,
+          "unique_relevant_docs": 6078
+        },
+        "top_ranked_statistics": {
+          "num_top_ranked": 6400,
+          "min_top_ranked_per_query": 8,
+          "average_top_ranked_per_query": 8.0,
+          "max_top_ranked_per_query": 8
+        }
+      },
+      "es": {
+        "num_samples": 1780,
+        "number_of_characters": 20852843,
+        "documents_text_statistics": {
+          "total_text_length": 20826446,
+          "min_text_length": 2657,
+          "average_text_length": 13181.29493670886,
+          "max_text_length": 270338,
+          "unique_texts": 1580
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+          "total_text_length": 26397,
+          "min_text_length": 40,
+          "average_text_length": 131.985,
+          "max_text_length": 480,
+          "unique_texts": 200
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 200,
+          "min_relevant_docs_per_query": 8,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 8,
+          "unique_relevant_docs": 1580
+        },
+        "top_ranked_statistics": {
+          "num_top_ranked": 1600,
+          "min_top_ranked_per_query": 8,
+          "average_top_ranked_per_query": 8.0,
+          "max_top_ranked_per_query": 8
+        }
+      },
+      "fr": {
+        "num_samples": 1762,
+        "number_of_characters": 17828712,
+        "documents_text_statistics": {
+          "total_text_length": 17798753,
+          "min_text_length": 2093,
+          "average_text_length": 11394.848271446863,
+          "max_text_length": 133854,
+          "unique_texts": 1562
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+          "total_text_length": 29959,
+          "min_text_length": 33,
+          "average_text_length": 149.795,
+          "max_text_length": 2589,
+          "unique_texts": 200
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 200,
+          "min_relevant_docs_per_query": 8,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 8,
+          "unique_relevant_docs": 1562
+        },
+        "top_ranked_statistics": {
+          "num_top_ranked": 1600,
+          "min_top_ranked_per_query": 8,
+          "average_top_ranked_per_query": 8.0,
+          "max_top_ranked_per_query": 8
+        }
+      },
+      "hi": {
+        "num_samples": 1715,
+        "number_of_characters": 18465376,
+        "documents_text_statistics": {
+          "total_text_length": 18444624,
+          "min_text_length": 2426,
+          "average_text_length": 12174.669306930693,
+          "max_text_length": 227264,
+          "unique_texts": 1515
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+          "total_text_length": 20752,
+          "min_text_length": 6,
+          "average_text_length": 103.76,
+          "max_text_length": 2022,
+          "unique_texts": 200
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 200,
+          "min_relevant_docs_per_query": 8,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 8,
+          "unique_relevant_docs": 1515
+        },
+        "top_ranked_statistics": {
+          "num_top_ranked": 1600,
+          "min_top_ranked_per_query": 8,
+          "average_top_ranked_per_query": 8.0,
+          "max_top_ranked_per_query": 8
+        }
+      },
+      "it": {
+        "num_samples": 1780,
+        "number_of_characters": 22616410,
+        "documents_text_statistics": {
+          "total_text_length": 22593491,
+          "min_text_length": 2518,
+          "average_text_length": 14299.677848101266,
+          "max_text_length": 117197,
+          "unique_texts": 1580
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+          "total_text_length": 22919,
+          "min_text_length": 12,
+          "average_text_length": 114.595,
+          "max_text_length": 1899,
+          "unique_texts": 200
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 200,
+          "min_relevant_docs_per_query": 8,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 8,
+          "unique_relevant_docs": 1580
+        },
+        "top_ranked_statistics": {
+          "num_top_ranked": 1600,
+          "min_top_ranked_per_query": 8,
+          "average_top_ranked_per_query": 8.0,
+          "max_top_ranked_per_query": 8
+        }
+      },
+      "ja": {
+        "num_samples": 1781,
+        "number_of_characters": 8562074,
+        "documents_text_statistics": {
+          "total_text_length": 8550928,
+          "min_text_length": 1244,
+          "average_text_length": 5408.556609740671,
+          "max_text_length": 97242,
+          "unique_texts": 1581
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+          "total_text_length": 11146,
+          "min_text_length": 6,
+          "average_text_length": 55.73,
+          "max_text_length": 416,
+          "unique_texts": 200
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 200,
+          "min_relevant_docs_per_query": 8,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 8,
+          "unique_relevant_docs": 1581
+        },
+        "top_ranked_statistics": {
+          "num_top_ranked": 1600,
+          "min_top_ranked_per_query": 8,
+          "average_top_ranked_per_query": 8.0,
+          "max_top_ranked_per_query": 8
+        }
+      },
+      "ko": {
+        "num_samples": 1770,
+        "number_of_characters": 9773349,
+        "documents_text_statistics": {
+          "total_text_length": 9761605,
+          "min_text_length": 1490,
+          "average_text_length": 6217.58280254777,
+          "max_text_length": 76949,
+          "unique_texts": 1570
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+          "total_text_length": 11744,
+          "min_text_length": 8,
+          "average_text_length": 58.72,
+          "max_text_length": 330,
+          "unique_texts": 200
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 200,
+          "min_relevant_docs_per_query": 8,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 8,
+          "unique_relevant_docs": 1570
+        },
+        "top_ranked_statistics": {
+          "num_top_ranked": 1600,
+          "min_top_ranked_per_query": 8,
+          "average_top_ranked_per_query": 8.0,
+          "max_top_ranked_per_query": 8
+        }
+      },
+      "pt": {
+        "num_samples": 1764,
+        "number_of_characters": 23152911,
+        "documents_text_statistics": {
+          "total_text_length": 23130220,
+          "min_text_length": 3473,
+          "average_text_length": 14789.143222506395,
+          "max_text_length": 108535,
+          "unique_texts": 1564
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+          "total_text_length": 22691,
+          "min_text_length": 4,
+          "average_text_length": 113.455,
+          "max_text_length": 511,
+          "unique_texts": 200
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 200,
+          "min_relevant_docs_per_query": 8,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 8,
+          "unique_relevant_docs": 1564
+        },
+        "top_ranked_statistics": {
+          "num_top_ranked": 1600,
+          "min_top_ranked_per_query": 8,
+          "average_top_ranked_per_query": 8.0,
+          "max_top_ranked_per_query": 8
+        }
+      },
+      "ru": {
+        "num_samples": 1779,
+        "number_of_characters": 22994826,
+        "documents_text_statistics": {
+          "total_text_length": 22975852,
+          "min_text_length": 2914,
+          "average_text_length": 14550.887903736542,
+          "max_text_length": 151133,
+          "unique_texts": 1579
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+          "total_text_length": 18974,
+          "min_text_length": 12,
+          "average_text_length": 94.87,
+          "max_text_length": 413,
+          "unique_texts": 200
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 200,
+          "min_relevant_docs_per_query": 8,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 8,
+          "unique_relevant_docs": 1579
+        },
+        "top_ranked_statistics": {
+          "num_top_ranked": 1600,
+          "min_top_ranked_per_query": 8,
+          "average_top_ranked_per_query": 8.0,
+          "max_top_ranked_per_query": 8
+        }
+      },
+      "th": {
+        "num_samples": 1800,
+        "number_of_characters": 8022609,
+        "documents_text_statistics": {
+          "total_text_length": 8003011,
+          "min_text_length": 37,
+          "average_text_length": 5001.881875,
+          "max_text_length": 44872,
+          "unique_texts": 1600
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+          "total_text_length": 19598,
+          "min_text_length": 11,
+          "average_text_length": 97.99,
+          "max_text_length": 309,
+          "unique_texts": 200
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 200,
+          "min_relevant_docs_per_query": 8,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 8,
+          "unique_relevant_docs": 1600
+        },
+        "top_ranked_statistics": {
+          "num_top_ranked": 1600,
+          "min_top_ranked_per_query": 8,
+          "average_top_ranked_per_query": 8.0,
+          "max_top_ranked_per_query": 8
+        }
+      },
+      "zh": {
+        "num_samples": 7121,
+        "number_of_characters": 78102134,
+        "documents_text_statistics": {
+          "total_text_length": 78082367,
+          "min_text_length": 6268,
+          "average_text_length": 12352.850340136054,
+          "max_text_length": 278468,
+          "unique_texts": 6321
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+          "total_text_length": 19767,
+          "min_text_length": 3,
+          "average_text_length": 24.70875,
+          "max_text_length": 646,
+          "unique_texts": 800
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 800,
+          "min_relevant_docs_per_query": 8,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 8,
+          "unique_relevant_docs": 6321
+        },
+        "top_ranked_statistics": {
+          "num_top_ranked": 6400,
+          "min_top_ranked_per_query": 8,
+          "average_top_ranked_per_query": 8.0,
+          "max_top_ranked_per_query": 8
+        }
+      }
+    }
+  }
+}
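The added file follows mteb's per-split descriptive-statistics layout: split-level totals plus a per-language breakdown under `hf_subset_descriptive_stats`, with separate blocks for documents, queries, relevant documents, and the `top_ranked` candidate lists (eight candidates per query here). A minimal sanity-check sketch for a file with this shape; the filename is hypothetical, since the diff does not show the path inside the wheel:

```python
import json

# Hypothetical path; the actual stats file name is not shown in this diff.
with open("descriptive_stats.json") as f:
    stats = json.load(f)["test"]

subsets = stats["hf_subset_descriptive_stats"]

# Per-subset sample counts should add up to the split-level total (33489 here).
assert sum(s["num_samples"] for s in subsets.values()) == stats["num_samples"]

# Each query comes with exactly 8 top-ranked candidate documents (30400 = 8 * 3800).
top = stats["top_ranked_statistics"]
assert top["num_top_ranked"] == 8 * stats["queries_text_statistics"]["unique_texts"]
```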
mteb/evaluate.py CHANGED
@@ -7,6 +7,7 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, cast
 
+from datasets.exceptions import DatasetNotFoundError
 from tqdm.auto import tqdm
 
 from mteb._helpful_enum import HelpfulStrEnum
@@ -25,6 +26,7 @@ from mteb.models.sentence_transformer_wrapper import (
     SentenceTransformerEncoderWrapper,
 )
 from mteb.results import ModelResult, TaskResult
+from mteb.results.task_result import TaskError
 from mteb.types import HFSubset, PromptType, SplitName
 from mteb.types._metadata import ModelName, Revision
 
@@ -117,7 +119,8 @@ def _evaluate_task(
     co2_tracker: bool | None,
     encode_kwargs: dict[str, Any],
     prediction_folder: Path | None,
-) -> TaskResult:
+    public_only: bool | None,
+) -> TaskResult | TaskError:
     """The core logic to run a model on a given task. See `evaluate` for more details.
 
     Returns:
@@ -149,6 +152,7 @@ def _evaluate_task(
                 encode_kwargs=encode_kwargs,
                 co2_tracker=False,
                 prediction_folder=prediction_folder,
+                public_only=public_only,
             )
         result.kg_co2_emissions = tracker.final_emissions
         return result
@@ -159,7 +163,20 @@ def _evaluate_task(
 
     data_loaded = task.data_loaded
     if not data_loaded:
-        task.load_data()
+        try:
+            task.load_data()
+        except DatasetNotFoundError as e:
+            if not task.metadata.is_public and public_only is None:
+                logger.warning(
+                    f"Dataset for private task '{task.metadata.name}' not found. "
+                    "Make sure you have access to the dataset and that you have set up the authentication correctly. To disable this warning set `public_only=False`"
+                )
+                return TaskError(
+                    task_name=task.metadata.name,
+                    exception=str(e),
+                )
+            if public_only is False:
+                raise e
 
     evaluation_time = 0
 
@@ -281,6 +298,7 @@ def evaluate(
     overwrite_strategy: str | OverwriteStrategy = "only-missing",
     prediction_folder: Path | str | None = None,
     show_progress_bar: bool = True,
+    public_only: bool | None = None,
 ) -> ModelResult:
     """This function runs a model on a given task and returns the results.
 
@@ -304,6 +322,7 @@ def evaluate(
         prediction_folder: Optional folder in which to save model predictions for the task. Predictions of the tasks will be sabed in `prediction_folder/{task_name}_predictions.json`
         show_progress_bar: Whether to show a progress bar when running the evaluation. Default is True. Setting this to False will also set the
             `encode_kwargs['show_progress_bar']` to False if encode_kwargs is unspecified.
+        public_only: Run only public tasks. If None, it will attempt to run the private task.
 
     Returns:
         The results of the evaluation.
@@ -355,6 +374,7 @@ def evaluate(
             overwrite_strategy=overwrite_strategy,
             prediction_folder=prediction_folder,
             show_progress_bar=show_progress_bar,
+            public_only=public_only,
         )
         result = task.combine_task_results(results.task_results)
         return ModelResult(
@@ -367,6 +387,7 @@ def evaluate(
         task = tasks
     else:
         results = []
+        exceptions = []
         tasks_tqdm = tqdm(
             tasks,
             desc="Evaluating tasks",
@@ -384,12 +405,16 @@ def evaluate(
                 overwrite_strategy=overwrite_strategy,
                 prediction_folder=prediction_folder,
                 show_progress_bar=False,
+                public_only=public_only,
             )
             results.extend(_res.task_results)
+            if _res.exceptions:
+                exceptions.extend(_res.exceptions)
         return ModelResult(
             model_name=_res.model_name,
             model_revision=_res.model_revision,
             task_results=results,
+            exceptions=exceptions,
         )
 
     overwrite_strategy = OverwriteStrategy.from_str(overwrite_strategy)
@@ -459,16 +484,13 @@ def evaluate(
                 co2_tracker=co2_tracker,
                 encode_kwargs=encode_kwargs,
                 prediction_folder=prediction_folder,
+                public_only=public_only,
             )
         except Exception as e:
             logger.error(
                 f"Error while running task {task.metadata.name} on splits {list(missing_eval.keys())}: {e}"
             )
-            return ModelResult(
-                model_name=model_name,
-                model_revision=model_revision,
-                task_results=[],
-            )
+            result = TaskError(task_name=task.metadata.name, exception=str(e))
     else:
         result = _evaluate_task(
             model=model,
@@ -477,9 +499,18 @@ def evaluate(
             co2_tracker=False,
             encode_kwargs=encode_kwargs,
             prediction_folder=prediction_folder,
+            public_only=public_only,
         )
     logger.info(f"✓ Finished evaluation for {task.metadata.name}")
 
+    if isinstance(result, TaskError):
+        return ModelResult(
+            model_name=model_name,
+            model_revision=model_revision,
+            task_results=[],
+            exceptions=[result],
+        )
+
     if existing_results:
         result = result.merge(existing_results)
 
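The `evaluate()` changes above make a missing private dataset non-fatal: instead of returning an empty `ModelResult`, the failure is recorded as a `TaskError` in `ModelResult.exceptions`, while `public_only=False` turns the `DatasetNotFoundError` back into a hard raise. A rough usage sketch, assuming the usual `mteb.get_model`/`mteb.get_tasks` entry points and that `evaluate` is exposed at package level; model and task names are illustrative:

```python
import mteb

# Illustrative model and task; any registered names would do.
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
tasks = mteb.get_tasks(tasks=["NFCorpus"])

# With public_only left as None (the default), a private task whose dataset
# cannot be loaded is reported as a TaskError instead of aborting the run.
result = mteb.evaluate(model, tasks, public_only=None)

for err in result.exceptions or []:
    print(f"{err.task_name} failed: {err.exception}")

# Passing public_only=False instead re-raises the DatasetNotFoundError.
```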
@@ -196,10 +196,10 @@ COLPALI_CITATION = """
 
 COLPALI_TRAINING_DATA = {
     # from https://huggingface.co/datasets/vidore/colpali_train_set
-    "DocVQA",
-    "InfoVQA",
-    "TATDQA",
-    "arXivQA",
+    "VidoreDocVQARetrieval",
+    "VidoreInfoVQARetrieval",
+    "VidoreTatdqaRetrieval",
+    "VidoreArxivQARetrieval",
 }
 
 colpali_v1_1 = ModelMeta(