mteb 2.1.0__py3-none-any.whl → 2.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. mteb/_create_dataloaders.py +2 -0
  2. mteb/_evaluators/retrieval_metrics.py +0 -9
  3. mteb/abstasks/_stratification.py +1 -1
  4. mteb/abstasks/abstask.py +6 -1
  5. mteb/abstasks/dataset_card_template.md +1 -1
  6. mteb/abstasks/retrieval.py +2 -1
  7. mteb/abstasks/retrieval_dataset_loaders.py +1 -1
  8. mteb/abstasks/task_metadata.py +1 -1
  9. mteb/benchmarks/benchmarks/benchmarks.py +9 -13
  10. mteb/benchmarks/get_benchmark.py +1 -1
  11. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json +243 -153
  12. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XM3600T2IRetrieval.json +999 -629
  13. mteb/descriptive_stats/Image/Any2AnyRetrieval/OVENIT2TRetrieval.json +33 -17
  14. mteb/descriptive_stats/Image/DocumentUnderstanding/MIRACLVisionRetrieval.json +574 -0
  15. mteb/descriptive_stats/Retrieval/ClimateFEVERHardNegatives.v2.json +30 -0
  16. mteb/descriptive_stats/Retrieval/DBPediaHardNegatives.v2.json +30 -0
  17. mteb/descriptive_stats/Retrieval/FEVERHardNegatives.v2.json +30 -0
  18. mteb/descriptive_stats/Retrieval/HotpotQAHardNegatives.v2.json +30 -0
  19. mteb/descriptive_stats/Retrieval/QuoraRetrievalHardNegatives.v2.json +30 -0
  20. mteb/descriptive_stats/Retrieval/RiaNewsRetrievalHardNegatives.v2.json +30 -0
  21. mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json +184 -0
  22. mteb/languages/check_language_code.py +11 -3
  23. mteb/languages/language_scripts.py +4 -0
  24. mteb/leaderboard/app.py +1 -1
  25. mteb/leaderboard/benchmark_selector.py +1 -0
  26. mteb/leaderboard/text_segments.py +1 -1
  27. mteb/models/model_implementations/b1ade_models.py +1 -1
  28. mteb/models/model_implementations/bge_models.py +1 -3
  29. mteb/models/model_implementations/bmretriever_models.py +1 -1
  30. mteb/models/model_implementations/gme_v_models.py +2 -2
  31. mteb/models/model_implementations/ibm_granite_models.py +1 -1
  32. mteb/models/model_implementations/inf_models.py +3 -3
  33. mteb/models/model_implementations/jina_models.py +12 -2
  34. mteb/models/model_implementations/llm2vec_models.py +1 -1
  35. mteb/models/model_implementations/misc_models.py +2 -2
  36. mteb/models/model_implementations/mxbai_models.py +1 -1
  37. mteb/models/model_implementations/reasonir_model.py +1 -1
  38. mteb/models/model_implementations/salesforce_models.py +1 -1
  39. mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -1
  40. mteb/models/model_implementations/voyage_v.py +9 -9
  41. mteb/results/task_result.py +6 -8
  42. mteb/tasks/classification/dan/angry_tweets_classification.py +2 -2
  43. mteb/tasks/classification/eng/legal_bench_classification.py +3 -3
  44. mteb/tasks/classification/mya/myanmar_news.py +2 -2
  45. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  46. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  47. mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +9 -8
  48. mteb/tasks/retrieval/code/code_rag.py +8 -8
  49. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  50. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  51. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  52. mteb/tasks/retrieval/eng/__init__.py +18 -4
  53. mteb/tasks/retrieval/eng/climate_fever_retrieval.py +68 -77
  54. mteb/tasks/retrieval/eng/dbpedia_retrieval.py +55 -50
  55. mteb/tasks/retrieval/eng/fever_retrieval.py +62 -67
  56. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +0 -4
  57. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +0 -4
  58. mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py +57 -67
  59. mteb/tasks/retrieval/eng/legal_summarization_retrieval.py +1 -1
  60. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +0 -3
  61. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +0 -2
  62. mteb/tasks/retrieval/eng/oven_it2t_retrieval.py +1 -1
  63. mteb/tasks/retrieval/eng/quora_retrieval.py +51 -46
  64. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +0 -4
  65. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +0 -4
  66. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +0 -2
  67. mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py +1 -1
  68. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +1 -1
  69. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +0 -2
  70. mteb/tasks/retrieval/multilingual/miracl_retrieval.py +1 -1
  71. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +2 -9
  72. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +0 -2
  73. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +0 -2
  74. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +6 -5
  75. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +3 -4
  76. mteb/tasks/retrieval/nob/norquad.py +2 -2
  77. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  78. mteb/tasks/retrieval/rus/__init__.py +11 -2
  79. mteb/tasks/retrieval/rus/ria_news_retrieval.py +48 -44
  80. mteb/tasks/retrieval/tur/tur_hist_quad.py +2 -2
  81. {mteb-2.1.0.dist-info → mteb-2.1.2.dist-info}/METADATA +5 -5
  82. {mteb-2.1.0.dist-info → mteb-2.1.2.dist-info}/RECORD +86 -91
  83. mteb/descriptive_stats/Classification/PersianTextTone.json +0 -56
  84. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json +0 -37
  85. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json +0 -25
  86. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json +0 -25
  87. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json +0 -25
  88. mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json +0 -20
  89. mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json +0 -20
  90. mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json +0 -20
  91. mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json +0 -20
  92. mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json +0 -20
  93. mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json +0 -220
  94. mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json +0 -402
  95. mteb/descriptive_stats/Reranking/InstructIR.json +0 -31
  96. {mteb-2.1.0.dist-info → mteb-2.1.2.dist-info}/WHEEL +0 -0
  97. {mteb-2.1.0.dist-info → mteb-2.1.2.dist-info}/entry_points.txt +0 -0
  98. {mteb-2.1.0.dist-info → mteb-2.1.2.dist-info}/licenses/LICENSE +0 -0
  99. {mteb-2.1.0.dist-info → mteb-2.1.2.dist-info}/top_level.txt +0 -0
@@ -51,7 +51,13 @@ def _downsample_image(
51
51
  def voyage_v_loader(model_name, **kwargs):
52
52
  requires_package(
53
53
  voyage_v_loader,
54
- "voyageai and tenacity",
54
+ "voyageai",
55
+ model_name,
56
+ "pip install 'mteb[voyage_v]'",
57
+ )
58
+ requires_package(
59
+ voyage_v_loader,
60
+ "tenacity",
55
61
  model_name,
56
62
  "pip install 'mteb[voyage_v]'",
57
63
  )
@@ -65,11 +71,9 @@ def voyage_v_loader(model_name, **kwargs):
65
71
  **kwargs: Any,
66
72
  ):
67
73
  requires_image_dependencies()
68
- from torchvision import transforms
69
74
 
70
75
  self.model_name = model_name.split("/")[-1]
71
76
  self.vo = voyageai.Client()
72
- self.tensor_to_image = transforms.Compose([transforms.PILToTensor()])
73
77
 
74
78
  @retry(
75
79
  stop=stop_after_attempt(6), # Stop after 6 attempts
@@ -126,10 +130,7 @@ def voyage_v_loader(model_name, **kwargs):
126
130
  for batch in tqdm(
127
131
  images, disable=not show_progress_bar, desc="Image Encoding"
128
132
  ):
129
- batch_images = [
130
- [_downsample_image(self.tensor_to_image(image))]
131
- for image in batch["image"]
132
- ]
133
+ batch_images = [[_downsample_image(image)] for image in batch["image"]]
133
134
  embeddings = self._multimodal_embed(
134
135
  batch_images, model=self.model_name, input_type=input_type
135
136
  ).embeddings
@@ -163,8 +164,7 @@ def voyage_v_loader(model_name, **kwargs):
163
164
  inputs, disable=not show_progress_bar, desc="Interleaved Encoding"
164
165
  ):
165
166
  batch_images = [
166
- _downsample_image(self.tensor_to_image(image))
167
- for image in batch["image"]
167
+ _downsample_image(image) for image in batch["image"]
168
168
  ]
169
169
  batch_texts = batch["text"]
170
170
  interleaved_inputs = [
@@ -32,7 +32,7 @@ from mteb.types import (
32
32
  logger = logging.getLogger(__name__)
33
33
 
34
34
 
35
- class Criterias(HelpfulStrEnum):
35
+ class Criteria(HelpfulStrEnum):
36
36
  """Enum for criteria to check when merging TaskResult objects."""
37
37
 
38
38
  MTEB_VERSION = "mteb_version"
@@ -671,7 +671,7 @@ class TaskResult(BaseModel):
671
671
  def is_mergeable(
672
672
  self,
673
673
  result: TaskResult | AbsTask,
674
- criteria: list[str] | list[Criterias] = [
674
+ criteria: list[str] | list[Criteria] = [
675
675
  "mteb_version",
676
676
  "dataset_revision",
677
677
  ],
@@ -688,9 +688,7 @@ class TaskResult(BaseModel):
688
688
  Returns:
689
689
  True if the TaskResult object can be merged with the other object, False otherwise.
690
690
  """
691
- criteria = [
692
- Criterias.from_str(c) if isinstance(c, str) else c for c in criteria
693
- ]
691
+ criteria = [Criteria.from_str(c) if isinstance(c, str) else c for c in criteria]
694
692
  if isinstance(result, TaskResult):
695
693
  name = result.task_name
696
694
  revision = result.dataset_revision
@@ -709,14 +707,14 @@ class TaskResult(BaseModel):
709
707
  )
710
708
  return False
711
709
 
712
- if Criterias.MTEB_VERSION in criteria and self.mteb_version != mteb_version:
710
+ if Criteria.MTEB_VERSION in criteria and self.mteb_version != mteb_version:
713
711
  if raise_error:
714
712
  raise ValueError(
715
713
  f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} and {mteb_version})"
716
714
  )
717
715
  return False
718
716
 
719
- if Criterias.DATASET_REVISION in criteria and self.dataset_revision != revision:
717
+ if Criteria.DATASET_REVISION in criteria and self.dataset_revision != revision:
720
718
  if raise_error:
721
719
  raise ValueError(
722
720
  f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
@@ -728,7 +726,7 @@ class TaskResult(BaseModel):
728
726
  def merge(
729
727
  self,
730
728
  new_results: TaskResult,
731
- criteria: list[str] | list[Criterias] = [
729
+ criteria: list[str] | list[Criteria] = [
732
730
  "mteb_version",
733
731
  "dataset_revision",
734
732
  ],
@@ -9,7 +9,7 @@ class AngryTweetsClassification(AbsTaskClassification):
9
9
  "path": "DDSC/angry-tweets",
10
10
  "revision": "20b0e6081892e78179356fada741b7afa381443d",
11
11
  },
12
- description="A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets",
12
+ description="A sentiment dataset with 3 classes (positive, negative, neutral) for Danish tweets",
13
13
  reference="https://aclanthology.org/2021.nodalida-main.53/",
14
14
  type="Classification",
15
15
  category="t2c",
@@ -47,7 +47,7 @@ class AngryTweetsClassificationV2(AbsTaskClassification):
47
47
  "path": "mteb/angry_tweets",
48
48
  "revision": "b9475fb66a13befda4fa9871cd92343bb2c0eb77",
49
49
  },
50
- description="""A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets
50
+ description="""A sentiment dataset with 3 classes (positive, negative, neutral) for Danish tweets
51
51
  This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
52
52
  reference="https://aclanthology.org/2021.nodalida-main.53/",
53
53
  type="Classification",
@@ -2641,7 +2641,7 @@ class InternationalCitizenshipQuestionsLegalBenchClassification(AbsTaskClassific
2641
2641
  class JCrewBlockerLegalBenchClassification(AbsTaskClassification):
2642
2642
  metadata = TaskMetadata(
2643
2643
  name="JCrewBlockerLegalBenchClassification",
2644
- description="The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of detemining whether the J.Crew Blocker is present in the document.",
2644
+ description="The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of determining whether the J.Crew Blocker is present in the document.",
2645
2645
  reference="https://huggingface.co/datasets/nguha/legalbench",
2646
2646
  dataset={
2647
2647
  "path": "mteb/JCrewBlockerLegalBenchClassification",
@@ -2677,7 +2677,7 @@ class JCrewBlockerLegalBenchClassification(AbsTaskClassification):
2677
2677
  class JCrewBlockerLegalBenchClassificationV2(AbsTaskClassification):
2678
2678
  metadata = TaskMetadata(
2679
2679
  name="JCrewBlockerLegalBenchClassification.v2",
2680
- description="""The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of detemining whether the J.Crew Blocker is present in the document.
2680
+ description="""The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of determining whether the J.Crew Blocker is present in the document.
2681
2681
  This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
2682
2682
  reference="https://huggingface.co/datasets/nguha/legalbench",
2683
2683
  dataset={
@@ -4500,7 +4500,7 @@ class OverrulingLegalBenchClassificationV2(AbsTaskClassification):
4500
4500
  class PersonalJurisdictionLegalBenchClassification(AbsTaskClassification):
4501
4501
  metadata = TaskMetadata(
4502
4502
  name="PersonalJurisdictionLegalBenchClassification",
4503
- description="""Given a fact pattern describing the set of contacts between a plaintiff, defendant, and forum, determine if a court in that forum could excercise personal jurisdiction over the defendant.""",
4503
+ description="""Given a fact pattern describing the set of contacts between a plaintiff, defendant, and forum, determine if a court in that forum could exercise personal jurisdiction over the defendant.""",
4504
4504
  reference="https://huggingface.co/datasets/nguha/legalbench",
4505
4505
  dataset={
4506
4506
  "path": "mteb/PersonalJurisdictionLegalBenchClassification",
@@ -9,7 +9,7 @@ class MyanmarNews(AbsTaskClassification):
9
9
  "path": "mteb/MyanmarNews",
10
10
  "revision": "644419f24bc820bbf8af24e0b4714a069812e0a3",
11
11
  },
12
- description="The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categorie, providing a rich resource for natural language processing applications involving Burmese which is a low resource language.",
12
+ description="The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categories, providing a rich resource for natural language processing applications involving Burmese which is a low resource language.",
13
13
  reference="https://huggingface.co/datasets/myanmar_news",
14
14
  type="Classification",
15
15
  category="t2c",
@@ -45,7 +45,7 @@ class MyanmarNewsV2(AbsTaskClassification):
45
45
  "path": "mteb/myanmar_news",
46
46
  "revision": "475b43ffbdb5138ad67a01a2c860bc7db502f3c5",
47
47
  },
48
- description="""The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categorie, providing a rich resource for natural language processing applications involving Burmese which is a low resource language.
48
+ description="""The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categories, providing a rich resource for natural language processing applications involving Burmese which is a low resource language.
49
49
  This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
50
50
  reference="https://huggingface.co/datasets/myanmar_news",
51
51
  type="Classification",
@@ -5,7 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
5
5
  class WongnaiReviewsClassification(AbsTaskClassification):
6
6
  metadata = TaskMetadata(
7
7
  name="WongnaiReviewsClassification",
8
- description="Wongnai features over 200,000 restaurants, beauty salons, and spas across Thailand on its platform, with detailed information about each merchant and user reviews. In this dataset there are 5 classes corressponding each star rating",
8
+ description="Wongnai features over 200,000 restaurants, beauty salons, and spas across Thailand on its platform, with detailed information about each merchant and user reviews. In this dataset there are 5 classes corresponding each star rating",
9
9
  reference="https://github.com/wongnai/wongnai-corpus",
10
10
  dataset={
11
11
  "path": "Wongnai/wongnai_reviews",
@@ -10,7 +10,7 @@ class UkrFormalityClassification(AbsTaskClassification):
10
10
  trainslating English GYAFC data.
11
11
  English data source: https://aclanthology.org/N18-1012/
12
12
  Translation into Ukrainian language using model: https://huggingface.co/facebook/nllb-200-distilled-600M
13
- Additionally, the dataset was balanced, witha labels: 0 - informal, 1 - formal.
13
+ Additionally, the dataset was balanced, with labels: 0 - informal, 1 - formal.
14
14
  """,
15
15
  dataset={
16
16
  "path": "ukr-detect/ukr-formality-dataset-translated-gyafc",
@@ -61,7 +61,7 @@ class UkrFormalityClassificationV2(AbsTaskClassification):
61
61
  trainslating English GYAFC data.
62
62
  English data source: https://aclanthology.org/N18-1012/
63
63
  Translation into Ukrainian language using model: https://huggingface.co/facebook/nllb-200-distilled-600M
64
- Additionally, the dataset was balanced, witha labels: 0 - informal, 1 - formal.
64
+ Additionally, the dataset was balanced, with labels: 0 - informal, 1 - formal.
65
65
 
66
66
  This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
67
67
  dataset={
@@ -23,14 +23,15 @@ class IndicXnliPairClassification(AbsTaskPairClassification):
23
23
  "path": "mteb/IndicXnliPairClassification",
24
24
  "revision": "027e97b9afe84ea3447b57b7705b8864bb2b3a83",
25
25
  },
26
- description="""INDICXNLI is similar to existing XNLI dataset in shape/form, but
27
- focusses on Indic language family.
28
- The train (392,702), validation (2,490), and evaluation sets (5,010) of English
29
- XNLI were translated from English into each of the eleven Indic languages. IndicTrans
30
- is a large Transformer-based sequence to sequence model. It is trained on Samanantar
31
- dataset (Ramesh et al., 2021), which is the largest parallel multi- lingual corpus
32
- over eleven Indic languages.
33
- """,
26
+ description=(
27
+ "INDICXNLI is similar to existing XNLI dataset in shape/form, but "
28
+ "focuses on Indic language family. "
29
+ "The train (392,702), validation (2,490), and evaluation sets (5,010) of English "
30
+ "XNLI were translated from English into each of the eleven Indic languages. IndicTrans "
31
+ "is a large Transformer-based sequence to sequence model. It is trained on Samanantar "
32
+ "dataset (Ramesh et al., 2021), which is the largest parallel multi- lingual corpus "
33
+ "over eleven Indic languages."
34
+ ),
34
35
  reference="https://gem-benchmark.com/data_cards/opusparcus",
35
36
  category="t2t",
36
37
  modalities=["text"],
@@ -60,9 +60,9 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
60
60
  self.data_loaded = True
61
61
 
62
62
  def dataset_transform(self) -> None:
63
- """And transform to a retrieval datset, which have the following attributes
63
+ """And transform to a retrieval dataset, which have the following attributes
64
64
 
65
- self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document datas like title and text
65
+ self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
66
66
  self.queries = Dict[query_id, str] #id => query
67
67
  self.relevant_docs = Dict[query_id, Dict[[doc_id, score]]
68
68
  """
@@ -117,9 +117,9 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
117
117
  self.data_loaded = True
118
118
 
119
119
  def dataset_transform(self) -> None:
120
- """And transform to a retrieval datset, which have the following attributes
120
+ """And transform to a retrieval dataset, which have the following attributes
121
121
 
122
- self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document datas like title and text
122
+ self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
123
123
  self.queries = Dict[query_id, str] #id => query
124
124
  self.relevant_docs = Dict[query_id, Dict[[doc_id, score]]
125
125
  """
@@ -177,9 +177,9 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
177
177
  self.data_loaded = True
178
178
 
179
179
  def dataset_transform(self) -> None:
180
- """And transform to a retrieval datset, which have the following attributes
180
+ """And transform to a retrieval dataset, which have the following attributes
181
181
 
182
- self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document datas like title and text
182
+ self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
183
183
  self.queries = Dict[query_id, str] #id => query
184
184
  self.relevant_docs = Dict[query_id, Dict[[doc_id, score]]
185
185
  """
@@ -234,9 +234,9 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval):
234
234
  self.data_loaded = True
235
235
 
236
236
  def dataset_transform(self) -> None:
237
- """And transform to a retrieval datset, which have the following attributes
237
+ """And transform to a retrieval dataset, which have the following attributes
238
238
 
239
- self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document datas like title and text
239
+ self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
240
240
  self.queries = Dict[query_id, str] #id => query
241
241
  self.relevant_docs = Dict[query_id, Dict[[doc_id, score]]
242
242
  """
@@ -56,7 +56,7 @@ Derczynski, Leon},
56
56
  self.data_loaded = True
57
57
 
58
58
  def dataset_transform(self) -> None:
59
- """And transform to a retrieval datset, which have the following attributes
59
+ """And transform to a retrieval dataset, which have the following attributes
60
60
 
61
61
  self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
62
62
  self.queries = dict[query_id, str] #id => query
@@ -69,9 +69,9 @@ Piperidis, Stelios},
69
69
  self.data_loaded = True
70
70
 
71
71
  def dataset_transform(self) -> None:
72
- """And transform to a retrieval datset, which have the following attributes
72
+ """And transform to a retrieval dataset, which have the following attributes
73
73
 
74
- self.corpus = dict[doc_id, dict[str, str]] #id => dict with document datas like title and text
74
+ self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
75
75
  self.queries = dict[query_id, str] #id => query
76
76
  self.relevant_docs = dict[query_id, dict[[doc_id, score]]
77
77
  """
@@ -45,9 +45,9 @@ class TwitterHjerneRetrieval(AbsTaskRetrieval):
45
45
  self.data_loaded = True
46
46
 
47
47
  def dataset_transform(self) -> None:
48
- """And transform to a retrieval datset, which have the following attributes
48
+ """And transform to a retrieval dataset, which have the following attributes
49
49
 
50
- self.corpus = dict[doc_id, dict[str, str]] #id => dict with document datas like title and text
50
+ self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
51
51
  self.queries = dict[query_id, str] #id => query
52
52
  self.relevant_docs = dict[query_id, dict[[doc_id, score]]
53
53
  """
@@ -22,6 +22,7 @@ from .cirr_it2i_retrieval import CIRRIT2IRetrieval
22
22
  from .climate_fever_retrieval import (
23
23
  ClimateFEVER,
24
24
  ClimateFEVERHardNegatives,
25
+ ClimateFEVERHardNegativesV2,
25
26
  ClimateFEVERRetrievalv2,
26
27
  )
27
28
  from .cqa_dupstack_android_retrieval import CQADupstackAndroidRetrieval
@@ -57,7 +58,7 @@ from .dapfam_patent_retrieval import (
57
58
  DAPFAMOutTitlAbsToTitlAbsClmRetrieval,
58
59
  DAPFAMOutTitlAbsToTitlAbsRetrieval,
59
60
  )
60
- from .dbpedia_retrieval import DBPedia, DBPediaHardNegatives
61
+ from .dbpedia_retrieval import DBPedia, DBPediaHardNegatives, DBPediaHardNegativesV2
61
62
  from .edis_t2it_retrieval import EDIST2ITRetrieval
62
63
  from .encyclopedia_vqa_it2it_retrieval import EncyclopediaVQAIT2ITRetrieval
63
64
  from .english_finance1_retrieval import EnglishFinance1Retrieval
@@ -70,7 +71,7 @@ from .fashion200k_i2t_retrieval import Fashion200kI2TRetrieval
70
71
  from .fashion200k_t2i_retrieval import Fashion200kT2IRetrieval
71
72
  from .fashion_iq_it2i_retrieval import FashionIQIT2IRetrieval
72
73
  from .feedback_qa_retrieval import FeedbackQARetrieval
73
- from .fever_retrieval import FEVER, FEVERHardNegatives
74
+ from .fever_retrieval import FEVER, FEVERHardNegatives, FEVERHardNegativesV2
74
75
  from .fi_qa2018_retrieval import FiQA2018
75
76
  from .fin_qa_retrieval import FinQARetrieval
76
77
  from .finance_bench_retrieval import FinanceBenchRetrieval
@@ -85,7 +86,11 @@ from .hateful_memes_i2t_retrieval import HatefulMemesI2TRetrieval
85
86
  from .hateful_memes_t2i_retrieval import HatefulMemesT2IRetrieval
86
87
  from .hc3_finance_retrieval import HC3FinanceRetrieval
87
88
  from .hella_swag_retrieval import HellaSwag
88
- from .hotpot_qa_retrieval import HotpotQA, HotpotQAHardNegatives
89
+ from .hotpot_qa_retrieval import (
90
+ HotpotQA,
91
+ HotpotQAHardNegatives,
92
+ HotpotQAHardNegativesV2,
93
+ )
89
94
  from .image_co_de_t2i_retrieval import ImageCoDeT2IRetrieval
90
95
  from .info_seek_it2it_retrieval import InfoSeekIT2ITRetrieval
91
96
  from .info_seek_it2t_retrieval import InfoSeekIT2TRetrieval
@@ -133,7 +138,11 @@ from .oven_it2it_retrieval import OVENIT2ITRetrieval
133
138
  from .oven_it2t_retrieval import OVENIT2TRetrieval
134
139
  from .piqa_retrieval import PIQA
135
140
  from .quail_retrieval import Quail
136
- from .quora_retrieval import QuoraRetrieval, QuoraRetrievalHardNegatives
141
+ from .quora_retrieval import (
142
+ QuoraRetrieval,
143
+ QuoraRetrievalHardNegatives,
144
+ QuoraRetrievalHardNegativesV2,
145
+ )
137
146
  from .r2_med_retrieval import (
138
147
  R2MEDBioinformaticsRetrieval,
139
148
  R2MEDBiologyRetrieval,
@@ -247,6 +256,7 @@ __all__ = [
247
256
  "ChemNQRetrieval",
248
257
  "ClimateFEVER",
249
258
  "ClimateFEVERHardNegatives",
259
+ "ClimateFEVERHardNegativesV2",
250
260
  "ClimateFEVERRetrievalv2",
251
261
  "DAPFAMAllTitlAbsClmToFullTextRetrieval",
252
262
  "DAPFAMAllTitlAbsClmToTitlAbsClmRetrieval",
@@ -268,6 +278,7 @@ __all__ = [
268
278
  "DAPFAMOutTitlAbsToTitlAbsRetrieval",
269
279
  "DBPedia",
270
280
  "DBPediaHardNegatives",
281
+ "DBPediaHardNegativesV2",
271
282
  "EDIST2ITRetrieval",
272
283
  "EncyclopediaVQAIT2ITRetrieval",
273
284
  "EnglishFinance1Retrieval",
@@ -276,6 +287,7 @@ __all__ = [
276
287
  "EnglishFinance4Retrieval",
277
288
  "EnglishHealthcare1Retrieval",
278
289
  "FEVERHardNegatives",
290
+ "FEVERHardNegativesV2",
279
291
  "FaithDialRetrieval",
280
292
  "Fashion200kI2TRetrieval",
281
293
  "Fashion200kT2IRetrieval",
@@ -296,6 +308,7 @@ __all__ = [
296
308
  "HellaSwag",
297
309
  "HotpotQA",
298
310
  "HotpotQAHardNegatives",
311
+ "HotpotQAHardNegativesV2",
299
312
  "ImageCoDeT2IRetrieval",
300
313
  "InfoSeekIT2ITRetrieval",
301
314
  "InfoSeekIT2TRetrieval",
@@ -345,6 +358,7 @@ __all__ = [
345
358
  "Quail",
346
359
  "QuoraRetrieval",
347
360
  "QuoraRetrievalHardNegatives",
361
+ "QuoraRetrievalHardNegativesV2",
348
362
  "R2MEDBioinformaticsRetrieval",
349
363
  "R2MEDBiologyRetrieval",
350
364
  "R2MEDIIYiClinicalRetrieval",
@@ -1,30 +1,21 @@
1
1
  from mteb.abstasks.retrieval import AbsTaskRetrieval
2
2
  from mteb.abstasks.task_metadata import TaskMetadata
3
3
 
4
-
5
- class ClimateFEVER(AbsTaskRetrieval):
6
- metadata = TaskMetadata(
7
- name="ClimateFEVER",
8
- description="CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims (queries) regarding climate-change. The underlying corpus is the same as FVER.",
9
- reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html",
10
- dataset={
11
- "path": "mteb/climate-fever",
12
- "revision": "47f2ac6acb640fc46020b02a5b59fdda04d39380",
13
- },
14
- type="Retrieval",
15
- category="t2t",
16
- modalities=["text"],
17
- eval_splits=["test"],
18
- eval_langs=["eng-Latn"],
19
- main_score="ndcg_at_10",
20
- date=("2001-01-01", "2020-12-31"), # launch of wiki -> paper publication
21
- domains=["Encyclopaedic", "Written"],
22
- task_subtypes=["Claim verification"],
23
- license="cc-by-sa-4.0",
24
- annotations_creators="human-annotated",
25
- dialect=[],
26
- sample_creation="found",
27
- bibtex_citation=r"""
4
+ _climate_fever_metadata = dict(
5
+ type="Retrieval",
6
+ category="t2t",
7
+ modalities=["text"],
8
+ eval_splits=["test"],
9
+ eval_langs=["eng-Latn"],
10
+ main_score="ndcg_at_10",
11
+ date=("2001-01-01", "2020-12-31"), # launch of wiki -> paper publication
12
+ domains=["Encyclopaedic", "Written"],
13
+ task_subtypes=["Claim verification"],
14
+ license="cc-by-sa-4.0",
15
+ annotations_creators="human-annotated",
16
+ dialect=[],
17
+ sample_creation="found",
18
+ bibtex_citation=r"""
28
19
  @misc{diggelmann2021climatefever,
29
20
  archiveprefix = {arXiv},
30
21
  author = {Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold},
@@ -34,82 +25,82 @@ class ClimateFEVER(AbsTaskRetrieval):
34
25
  year = {2021},
35
26
  }
36
27
  """,
28
+ )
29
+
30
+
31
+ class ClimateFEVER(AbsTaskRetrieval):
32
+ metadata = TaskMetadata(
33
+ name="ClimateFEVER",
34
+ description=(
35
+ "CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims "
36
+ "(queries) regarding climate-change. The underlying corpus is the same as FEVER."
37
+ ),
38
+ reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html",
39
+ dataset={
40
+ "path": "mteb/climate-fever",
41
+ "revision": "47f2ac6acb640fc46020b02a5b59fdda04d39380",
42
+ },
43
+ prompt={
44
+ "query": "Given a claim about climate change, retrieve documents that support or refute the claim"
45
+ },
46
+ **_climate_fever_metadata,
47
+ )
48
+
49
+
50
+ class ClimateFEVERRetrievalv2(AbsTaskRetrieval):
51
+ metadata = TaskMetadata(
52
+ name="ClimateFEVER.v2",
53
+ description=(
54
+ "CLIMATE-FEVER is a dataset following the FEVER methodology, containing 1,535 real-world climate change claims. "
55
+ "This updated version addresses corpus mismatches and qrel inconsistencies in MTEB, restoring labels while refining corpus-query alignment for better accuracy."
56
+ ),
57
+ reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html",
58
+ dataset={
59
+ "path": "mteb/climate-fever-v2",
60
+ "revision": "e438c9586767800aeb10dbe8a245c41dbea4e5f4",
61
+ },
37
62
  prompt={
38
63
  "query": "Given a claim about climate change, retrieve documents that support or refute the claim"
39
64
  },
65
+ adapted_from=["ClimateFEVER"],
66
+ **_climate_fever_metadata,
40
67
  )
41
68
 
42
69
 
43
70
  class ClimateFEVERHardNegatives(AbsTaskRetrieval):
44
71
  metadata = TaskMetadata(
45
72
  name="ClimateFEVERHardNegatives",
46
- description="CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct.",
73
+ description=(
74
+ "CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change. "
75
+ "The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
76
+ ),
47
77
  reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html",
48
78
  dataset={
49
79
  "path": "mteb/ClimateFEVER_test_top_250_only_w_correct-v2",
50
80
  "revision": "3a309e201f3c2c4b13bd4a367a8f37eee2ec1d21",
51
81
  },
52
- type="Retrieval",
53
- category="t2t",
54
- modalities=["text"],
55
- eval_splits=["test"],
56
- eval_langs=["eng-Latn"],
57
- main_score="ndcg_at_10",
58
- date=("2001-01-01", "2020-12-31"), # launch of wiki -> paper publication
59
- domains=["Encyclopaedic", "Written"],
60
- task_subtypes=["Claim verification"],
61
- license="cc-by-sa-4.0",
62
- annotations_creators="human-annotated",
63
- dialect=[],
64
- sample_creation="found",
65
- bibtex_citation=r"""
66
- @misc{diggelmann2021climatefever,
67
- archiveprefix = {arXiv},
68
- author = {Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold},
69
- eprint = {2012.00614},
70
- primaryclass = {cs.CL},
71
- title = {CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims},
72
- year = {2021},
73
- }
74
- """,
75
82
  adapted_from=["ClimateFEVER"],
83
+ superseded_by="ClimateFEVERHardNegatives.v2",
84
+ **_climate_fever_metadata,
76
85
  )
77
86
 
78
87
 
79
- class ClimateFEVERRetrievalv2(AbsTaskRetrieval):
88
+ class ClimateFEVERHardNegativesV2(AbsTaskRetrieval):
80
89
  metadata = TaskMetadata(
81
- name="ClimateFEVER.v2",
82
- description="CLIMATE-FEVER is a dataset following the FEVER methodology, containing 1,535 real-world climate change claims. This updated version addresses corpus mismatches and qrel inconsistencies in MTEB, restoring labels while refining corpus-query alignment for better accuracy. ",
90
+ name="ClimateFEVERHardNegatives.v2",
91
+ description=(
92
+ "CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change. "
93
+ "The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct. "
94
+ "V2 uses a more appropriate prompt rather than the default prompt for retrieval. You can get more information on the effect of different prompt in the [PR](https://github.com/embeddings-benchmark/mteb/pull/3469#issuecomment-3436467106)"
95
+ ),
83
96
  reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html",
84
97
  dataset={
85
- "path": "mteb/climate-fever-v2",
86
- "revision": "e438c9586767800aeb10dbe8a245c41dbea4e5f4",
98
+ "path": "mteb/ClimateFEVER_test_top_250_only_w_correct-v2",
99
+ "revision": "3a309e201f3c2c4b13bd4a367a8f37eee2ec1d21",
87
100
  },
88
- type="Retrieval",
89
- category="t2t",
90
- modalities=["text"],
91
- eval_splits=["test"],
92
- eval_langs=["eng-Latn"],
93
- main_score="ndcg_at_10",
94
- date=("2001-01-01", "2020-12-31"), # launch of wiki -> paper publication
95
- domains=["Academic", "Written"],
96
- task_subtypes=["Claim verification"],
97
- license="cc-by-sa-4.0",
98
- annotations_creators="human-annotated",
99
- dialect=[],
100
- sample_creation="found",
101
- bibtex_citation=r"""
102
- @misc{diggelmann2021climatefever,
103
- archiveprefix = {arXiv},
104
- author = {Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold},
105
- eprint = {2012.00614},
106
- primaryclass = {cs.CL},
107
- title = {CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims},
108
- year = {2021},
109
- }
110
- """,
101
+ adapted_from=["ClimateFEVER"],
111
102
  prompt={
112
103
  "query": "Given a claim about climate change, retrieve documents that support or refute the claim"
113
104
  },
114
- adapted_from=["ClimateFEVER"],
105
+ **_climate_fever_metadata,
115
106
  )