mteb 2.1.0__py3-none-any.whl → 2.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. mteb/_create_dataloaders.py +2 -0
  2. mteb/_evaluators/retrieval_metrics.py +0 -9
  3. mteb/abstasks/_stratification.py +1 -1
  4. mteb/abstasks/abstask.py +6 -1
  5. mteb/abstasks/dataset_card_template.md +1 -1
  6. mteb/abstasks/retrieval.py +2 -1
  7. mteb/abstasks/retrieval_dataset_loaders.py +1 -1
  8. mteb/abstasks/task_metadata.py +1 -1
  9. mteb/benchmarks/benchmarks/benchmarks.py +9 -13
  10. mteb/benchmarks/get_benchmark.py +1 -1
  11. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json +243 -153
  12. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XM3600T2IRetrieval.json +999 -629
  13. mteb/descriptive_stats/Image/Any2AnyRetrieval/OVENIT2TRetrieval.json +33 -17
  14. mteb/descriptive_stats/Image/DocumentUnderstanding/MIRACLVisionRetrieval.json +574 -0
  15. mteb/descriptive_stats/Retrieval/ClimateFEVERHardNegatives.v2.json +30 -0
  16. mteb/descriptive_stats/Retrieval/DBPediaHardNegatives.v2.json +30 -0
  17. mteb/descriptive_stats/Retrieval/FEVERHardNegatives.v2.json +30 -0
  18. mteb/descriptive_stats/Retrieval/HotpotQAHardNegatives.v2.json +30 -0
  19. mteb/descriptive_stats/Retrieval/QuoraRetrievalHardNegatives.v2.json +30 -0
  20. mteb/descriptive_stats/Retrieval/RiaNewsRetrievalHardNegatives.v2.json +30 -0
  21. mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json +184 -0
  22. mteb/languages/check_language_code.py +11 -3
  23. mteb/languages/language_scripts.py +4 -0
  24. mteb/leaderboard/app.py +1 -1
  25. mteb/leaderboard/benchmark_selector.py +1 -0
  26. mteb/leaderboard/text_segments.py +1 -1
  27. mteb/models/model_implementations/b1ade_models.py +1 -1
  28. mteb/models/model_implementations/bge_models.py +1 -3
  29. mteb/models/model_implementations/bmretriever_models.py +1 -1
  30. mteb/models/model_implementations/gme_v_models.py +2 -2
  31. mteb/models/model_implementations/ibm_granite_models.py +1 -1
  32. mteb/models/model_implementations/inf_models.py +3 -3
  33. mteb/models/model_implementations/jina_models.py +12 -2
  34. mteb/models/model_implementations/llm2vec_models.py +1 -1
  35. mteb/models/model_implementations/misc_models.py +2 -2
  36. mteb/models/model_implementations/mxbai_models.py +1 -1
  37. mteb/models/model_implementations/reasonir_model.py +1 -1
  38. mteb/models/model_implementations/salesforce_models.py +1 -1
  39. mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -1
  40. mteb/models/model_implementations/voyage_v.py +9 -9
  41. mteb/results/task_result.py +6 -8
  42. mteb/tasks/classification/dan/angry_tweets_classification.py +2 -2
  43. mteb/tasks/classification/eng/legal_bench_classification.py +3 -3
  44. mteb/tasks/classification/mya/myanmar_news.py +2 -2
  45. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  46. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  47. mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +9 -8
  48. mteb/tasks/retrieval/code/code_rag.py +8 -8
  49. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  50. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  51. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  52. mteb/tasks/retrieval/eng/__init__.py +18 -4
  53. mteb/tasks/retrieval/eng/climate_fever_retrieval.py +68 -77
  54. mteb/tasks/retrieval/eng/dbpedia_retrieval.py +55 -50
  55. mteb/tasks/retrieval/eng/fever_retrieval.py +62 -67
  56. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +0 -4
  57. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +0 -4
  58. mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py +57 -67
  59. mteb/tasks/retrieval/eng/legal_summarization_retrieval.py +1 -1
  60. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +0 -3
  61. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +0 -2
  62. mteb/tasks/retrieval/eng/oven_it2t_retrieval.py +1 -1
  63. mteb/tasks/retrieval/eng/quora_retrieval.py +51 -46
  64. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +0 -4
  65. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +0 -4
  66. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +0 -2
  67. mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py +1 -1
  68. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +1 -1
  69. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +0 -2
  70. mteb/tasks/retrieval/multilingual/miracl_retrieval.py +1 -1
  71. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +2 -9
  72. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +0 -2
  73. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +0 -2
  74. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +6 -5
  75. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +3 -4
  76. mteb/tasks/retrieval/nob/norquad.py +2 -2
  77. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  78. mteb/tasks/retrieval/rus/__init__.py +11 -2
  79. mteb/tasks/retrieval/rus/ria_news_retrieval.py +48 -44
  80. mteb/tasks/retrieval/tur/tur_hist_quad.py +2 -2
  81. {mteb-2.1.0.dist-info → mteb-2.1.2.dist-info}/METADATA +5 -5
  82. {mteb-2.1.0.dist-info → mteb-2.1.2.dist-info}/RECORD +86 -91
  83. mteb/descriptive_stats/Classification/PersianTextTone.json +0 -56
  84. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json +0 -37
  85. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json +0 -25
  86. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json +0 -25
  87. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json +0 -25
  88. mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json +0 -20
  89. mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json +0 -20
  90. mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json +0 -20
  91. mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json +0 -20
  92. mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json +0 -20
  93. mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json +0 -220
  94. mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json +0 -402
  95. mteb/descriptive_stats/Reranking/InstructIR.json +0 -31
  96. {mteb-2.1.0.dist-info → mteb-2.1.2.dist-info}/WHEEL +0 -0
  97. {mteb-2.1.0.dist-info → mteb-2.1.2.dist-info}/entry_points.txt +0 -0
  98. {mteb-2.1.0.dist-info → mteb-2.1.2.dist-info}/licenses/LICENSE +0 -0
  99. {mteb-2.1.0.dist-info → mteb-2.1.2.dist-info}/top_level.txt +0 -0
mteb/_create_dataloaders.py CHANGED
@@ -277,6 +277,8 @@ def _custom_collate_fn(batch: list[dict[str, Any]]) -> dict[str, Any]:
             # Leave the images as a list to avoid stacking errors.
             collated[key] = [item[key] for item in batch]
         else:
+            if any(item[key] is None for item in batch):
+                raise ValueError(f"Found None in batch for key '{key}'")
             collated[key] = default_collate([item[key] for item in batch])
     return collated
 
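For orientation, a runnable toy version of the collate pattern this hunk hardens; the function and key names here are illustrative, not mteb's internals beyond what the diff shows:

from typing import Any

from torch.utils.data import default_collate


def collate_batch(batch: list[dict[str, Any]], image_key: str = "image") -> dict[str, Any]:
    # Toy stand-in for mteb's _custom_collate_fn, showing the new guard.
    collated: dict[str, Any] = {}
    for key in batch[0]:
        if key == image_key:
            # Leave the images as a list to avoid stacking errors.
            collated[key] = [item[key] for item in batch]
        else:
            # New in 2.1.2: fail fast on missing values instead of letting
            # default_collate raise an opaque TypeError deeper in the stack.
            if any(item[key] is None for item in batch):
                raise ValueError(f"Found None in batch for key '{key}'")
            collated[key] = default_collate([item[key] for item in batch])
    return collated


print(collate_batch([{"image": "img1", "label": 0}, {"image": "img2", "label": 1}]))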
mteb/_evaluators/retrieval_metrics.py CHANGED
@@ -5,7 +5,6 @@ from typing import Any
 import numpy as np
 import pandas as pd
 import pytrec_eval
-import torch
 from packaging.version import Version
 from sklearn.metrics import auc
 
@@ -14,14 +13,6 @@ from mteb.types import RelevantDocumentsType, RetrievalEvaluationResult
 logger = logging.getLogger(__name__)
 
 
-try:
-    # speeds up computation if available
-    torch.set_float32_matmul_precision("high")
-    logger.info("Setting torch float32 matmul precision to high for a speedup")
-except Exception:
-    pass
-
-
 def mrr(
     qrels: RelevantDocumentsType,
     results: dict[str, dict[str, float]],
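Importing this module no longer flips torch's global matmul precision as a side effect. Anyone who relied on that speedup can opt in explicitly in their own code with the standard PyTorch call:

import torch

# Equivalent of the removed import-time side effect, now opt-in:
# "high" permits TF32-class kernels for float32 matmuls on supported GPUs,
# trading a small amount of precision for throughput.
torch.set_float32_matmul_precision("high")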
mteb/abstasks/_stratification.py CHANGED
@@ -134,7 +134,7 @@ def _get_most_desired_combination(samples_with_combination: dict):
 class IterativeStratification(_BaseKFold):
     """Iteratively stratify a multi-label data set into folds
 
-    Construct an interative stratifier that splits the data set into folds trying to maintain balanced representation
+    Construct an iterative stratifier that splits the data set into folds trying to maintain balanced representation
     with respect to order-th label combinations.
     """
 
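For context, a sketch of how such a stratifier is typically driven. split() comes from the sklearn _BaseKFold interface; the constructor arguments (n_splits, order) are an assumption based on the scikit-multilearn implementation this class mirrors:

import numpy as np

from mteb.abstasks._stratification import IterativeStratification

X = np.zeros((8, 1))  # features are irrelevant to label stratification
y = np.array(
    [[1, 0], [1, 1], [0, 1], [1, 0], [0, 1], [1, 1], [1, 0], [0, 1]]
)  # multi-label targets

# n_splits/order are assumed parameters; order=1 balances per-label frequencies.
stratifier = IterativeStratification(n_splits=2, order=1)
for train_idx, test_idx in stratifier.split(X, y):
    print(sorted(test_idx))  # each fold keeps label proportions roughly even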
mteb/abstasks/abstask.py CHANGED
@@ -459,7 +459,7 @@ class AbsTask(ABC):
         """Filter the languages of the task.
 
         Args:
-            languages: list of languages to filter the task by can be either a 3-letter langauge code (e.g. "eng") or also include the script
+            languages: list of languages to filter the task by can be either a 3-letter language code (e.g. "eng") or also include the script
                 (e.g. "eng-Latn")
             script: A list of scripts to filter the task by. Will be ignored if language code specified the script. If None, all scripts are included.
                 If the language code does not specify the script the intersection of the language and script will be used.
@@ -491,6 +491,11 @@ class AbsTask(ABC):
             if lang_scripts.contains_languages(langs):
                 subsets_to_keep.append(hf_subset)
 
+        if len(subsets_to_keep) == 0:
+            raise ValueError(
+                f"No subsets were found for {self.metadata.name} with filters: language code {languages}, script {script}, hf subsets {hf_subsets}."
+            )
+
         self.hf_subsets = subsets_to_keep
         return self
 
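A sketch of the call this guards, using a task name that appears elsewhere in this diff (which subsets actually exist depends on the dataset):

import mteb

task = mteb.get_task("BelebeleRetrieval")

# Keep only subsets in English written in Latin script. As of 2.1.2 a
# filter that matches nothing raises ValueError instead of silently
# leaving the task with zero subsets.
task.filter_languages(languages=["eng"], script=["Latn"])
print(task.hf_subsets)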
mteb/abstasks/dataset_card_template.md CHANGED
@@ -85,7 +85,7 @@ desc_stats = task.metadata.descriptive_stats
 ```
 
 ```json
-{{ descritptive_stats | default("{}", true) }}
+{{ descriptive_stats | default("{}", true) }}
 ```
 
 </details>
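This template fix pairs with the matching keyword rename in task_metadata.py further below; the placeholder must agree with the name the renderer passes in. A small check of the default("{}", true) expression, assuming a plain jinja2 renderer:

from jinja2 import Template

tmpl = Template('{{ descriptive_stats | default("{}", true) }}')

# With the boolean flag set, `default` also substitutes for falsy values.
print(tmpl.render())                               # -> {}
print(tmpl.render(descriptive_stats=None))         # -> {}
print(tmpl.render(descriptive_stats='{"n": 10}'))  # -> {"n": 10}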
mteb/abstasks/retrieval.py CHANGED
@@ -653,6 +653,8 @@ class AbsTaskRetrieval(AbsTask):
             FileNotFoundError: If the specified path does not exist.
             ValueError: If the loaded top ranked results are not in the expected format.
         """
+        self._top_k = top_k
+
         top_ranked_path = Path(top_ranked_path)
         if top_ranked_path.is_dir():
             top_ranked_path = self._predictions_path(top_ranked_path)
@@ -682,7 +684,6 @@ class AbsTaskRetrieval(AbsTask):
                 top_k_sorted[query_id] = sorted_keys[: self._top_k]
 
             self.dataset[subset][split]["top_ranked"] = top_k_sorted
-        self._top_k = top_k
         return self
 
 
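These two hunks fix an ordering bug: `self._top_k` was read while truncating per-query rankings but only assigned after the loop, so the truncation could use a stale or unset value; the assignment now runs at the top of the method. The truncation step itself, as a self-contained sketch (the helper name is hypothetical):

def truncate_top_ranked(
    top_ranked: dict[str, dict[str, float]], top_k: int
) -> dict[str, list[str]]:
    # Keep the top_k highest-scoring document ids per query, best first.
    truncated: dict[str, list[str]] = {}
    for query_id, doc_scores in top_ranked.items():
        sorted_ids = sorted(doc_scores, key=doc_scores.get, reverse=True)
        truncated[query_id] = sorted_ids[:top_k]
    return truncated


print(truncate_top_ranked({"q1": {"d1": 0.2, "d2": 0.9, "d3": 0.5}}, top_k=2))
# -> {'q1': ['d2', 'd3']}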
mteb/abstasks/retrieval_dataset_loaders.py CHANGED
@@ -176,7 +176,7 @@ class RetrievalDatasetLoader:
                 {
                     "query-id": Value("string"),
                     "corpus-id": Value("string"),
-                    "score": Value("uint16"),
+                    "score": Value("int32"),
                 }
             )
         )
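Widening the score column matters because uint16 cannot represent negative relevance judgments, which appear in some qrels conventions, and caps scores at 65535. A quick check of the new schema with Hugging Face datasets:

from datasets import Dataset, Features, Value

features = Features(
    {
        "query-id": Value("string"),
        "corpus-id": Value("string"),
        "score": Value("int32"),  # was uint16, which rejects e.g. -1
    }
)

qrels = Dataset.from_dict(
    {"query-id": ["q1", "q1"], "corpus-id": ["d1", "d2"], "score": [2, -1]},
    features=features,
)
print(qrels[1])  # {'query-id': 'q1', 'corpus-id': 'd2', 'score': -1}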
mteb/abstasks/task_metadata.py CHANGED
@@ -532,7 +532,7 @@ class TaskMetadata(BaseModel):
             citation=self.bibtex_citation,
             dataset_description=self.description,
             dataset_reference=self.reference,
-            descritptive_stats=descriptive_stats,
+            descriptive_stats=descriptive_stats,
             dataset_task_name=self.name,
             category=self.category,
             domains=", ".join(self.domains) if self.domains else None,
mteb/benchmarks/benchmarks/benchmarks.py CHANGED
@@ -641,7 +641,7 @@ MTEB_KOR = Benchmark(
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/kr.svg",
     tasks=get_tasks(
         languages=["kor"],
-        tasks=[  # @KennethEnevoldsen: We could probably expand this to a more solid benchamrk, but for now I have left it as is.
+        tasks=[  # @KennethEnevoldsen: We could probably expand this to a more solid benchmark, but for now I have left it as is.
             # Classification
             "KLUE-TC",
             # Reranking
@@ -975,8 +975,6 @@ MTEB_INDIC = Benchmark(
             # Bitext
             "IN22ConvBitextMining",
             "IN22GenBitextMining",
-            "IndicGenBenchFloresBitextMining",
-            "LinceMTBitextMining",
             # clustering
             "SIB200ClusteringS2S",
             # classification
@@ -985,7 +983,6 @@ MTEB_INDIC = Benchmark(
             "HindiDiscourseClassification",
             "SentimentAnalysisHindi",
             "MalayalamNewsClassification",
-            "IndicLangClassification",
             "MTOPIntentClassification",
             "MultiHateClassification",
             "TweetSentimentClassification",
@@ -1008,7 +1005,7 @@ MTEB_INDIC = Benchmark(
         # STS
         (get_task("IndicCrosslingualSTS"),)
     ),
-    description="A regional geopolitical text embedding benchmark targetting embedding performance on Indic languages.",
+    description="A regional geopolitical text embedding benchmark targeting embedding performance on Indic languages.",
     reference=None,
     citation=MMTEB_CITATION,
     contacts=["KennethEnevoldsen", "isaac-chung"],
@@ -1016,7 +1013,7 @@
 
 
 eu_languages = [
-    # official EU languages (56) - we could include the whole economic area e.g. Norway - additioanlly we could include minority languages (probably a good idea?)
+    # official EU languages (56) - we could include the whole economic area e.g. Norway - additionally we could include minority languages (probably a good idea?)
     # germanic
     "dan",
     "eng",
@@ -1084,7 +1081,6 @@ MTEB_EU = Benchmark(
             "AmazonCounterfactualClassification",
             "MassiveScenarioClassification",
             "MultiHateClassification",
-            "NordicLangClassification",
             "ScalaClassification",
             "SwissJudgementClassification",
             "TweetSentimentClassification",
@@ -1142,7 +1138,7 @@ MTEB_EU = Benchmark(
         languages=eu_languages,
         exclusive_language_filter=True,
     ),
-    description="A regional geopolitical text embedding benchmark targetting embedding performance on European languages.",
+    description="A regional geopolitical text embedding benchmark targeting embedding performance on European languages.",
     reference=None,
     citation=MMTEB_CITATION,
     contacts=["KennethEnevoldsen", "isaac-chung"],
@@ -1638,7 +1634,7 @@ BEIR_NL = Benchmark(
 
 MTEB_NL = Benchmark(
     name="MTEB(nld, v1)",
-    display_name="MTEB-NL",
+    display_name="Dutch",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/nl.svg",
     tasks=MTEBTasks(
         get_tasks(
@@ -1682,7 +1678,7 @@ MTEB_NL = Benchmark(
             "SciFact-NL",
             "NFCorpus-NL",
             "BelebeleRetrieval",
-            # "WebFAQRetrieval",
+            "WebFAQRetrieval",
             "DutchNewsArticlesRetrieval",
             "bBSARDNLRetrieval",
             "LegalQANLRetrieval",
@@ -1858,7 +1854,7 @@ MIEB_ENG = MIEBBenchmark(
     ),
     description="""MIEB(eng) is a comprehensive image embeddings benchmark, spanning 8 task types, covering 125 tasks.
     In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation,
-    document undestanding, visual STS, and CV-centric tasks.""",
+    document understanding, visual STS, and CV-centric tasks.""",
     reference="https://arxiv.org/abs/2504.10471",
     contacts=["gowitheflow-1998", "isaac-chung"],
     citation=r"""
@@ -1892,7 +1888,7 @@ MIEB_MULTILINGUAL = MIEBBenchmark(
     ),
     description="""MIEB(Multilingual) is a comprehensive image embeddings benchmark, spanning 10 task types, covering 130 tasks and a total of 39 languages.
     In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation,
-    document undestanding, visual STS, and CV-centric tasks. This benchmark consists of MIEB(eng) + 3 multilingual retrieval
+    document understanding, visual STS, and CV-centric tasks. This benchmark consists of MIEB(eng) + 3 multilingual retrieval
     datasets + the multilingual parts of VisualSTS-b and VisualSTS-16.""",
     reference="https://arxiv.org/abs/2504.10471",
     contacts=["gowitheflow-1998", "isaac-chung"],
@@ -2113,7 +2109,7 @@ BUILT_MTEB = Benchmark(
             "BuiltBenchReranking",
         ],
     ),
-    description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various dicsiplines such as architeture, engineering, constrcution, and operations management of the built environment.',
+    description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various disciplines such as architecture, engineering, construction, and operations management of the built environment.',
     reference="https://arxiv.org/abs/2411.12056",
     citation=r"""
     @article{shahinmoghadam2024benchmarking,
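The benchmark edits above all flow through get_tasks; a minimal sketch of that selection call, with names taken from the MTEB(nld, v1) list in this diff:

import mteb

# WebFAQRetrieval is newly included in MTEB(nld, v1) as of this release.
tasks = mteb.get_tasks(
    languages=["nld"],
    tasks=["BelebeleRetrieval", "WebFAQRetrieval"],
)
print([t.metadata.name for t in tasks])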
mteb/benchmarks/get_benchmark.py CHANGED
@@ -14,7 +14,7 @@ def _build_registry() -> dict[str, Benchmark]:
 
     benchmark_registry = {
         inst.name: inst
-        for nam, inst in benchmark_module.__dict__.items()
+        for _, inst in benchmark_module.__dict__.items()
         if isinstance(inst, Benchmark)
     }
     return benchmark_registry
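The registry keys benchmarks by their canonical `name`, not the module-level variable, so lookups are unaffected by the rename from `nam` to `_`. A usage sketch, assuming get_benchmark is exported at the package root as in mteb's public API:

import mteb

benchmark = mteb.get_benchmark("MTEB(nld, v1)")
print(benchmark.display_name)  # "Dutch" as of 2.1.2
print(len(benchmark.tasks))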