mteb 2.1.0__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +2 -0
- mteb/abstasks/_stratification.py +1 -1
- mteb/abstasks/abstask.py +6 -1
- mteb/abstasks/dataset_card_template.md +1 -1
- mteb/abstasks/retrieval.py +2 -1
- mteb/abstasks/retrieval_dataset_loaders.py +1 -1
- mteb/abstasks/task_metadata.py +1 -1
- mteb/benchmarks/benchmarks/benchmarks.py +7 -11
- mteb/benchmarks/get_benchmark.py +1 -1
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json +243 -153
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XM3600T2IRetrieval.json +999 -629
- mteb/descriptive_stats/Image/Any2AnyRetrieval/OVENIT2TRetrieval.json +33 -17
- mteb/descriptive_stats/Image/DocumentUnderstanding/MIRACLVisionRetrieval.json +574 -0
- mteb/descriptive_stats/Retrieval/ClimateFEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/DBPediaHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/FEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/HotpotQAHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/QuoraRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/RiaNewsRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json +184 -0
- mteb/languages/check_language_code.py +11 -3
- mteb/languages/language_scripts.py +4 -0
- mteb/leaderboard/text_segments.py +1 -1
- mteb/models/model_implementations/b1ade_models.py +1 -1
- mteb/models/model_implementations/bge_models.py +1 -3
- mteb/models/model_implementations/bmretriever_models.py +1 -1
- mteb/models/model_implementations/gme_v_models.py +2 -2
- mteb/models/model_implementations/ibm_granite_models.py +1 -1
- mteb/models/model_implementations/inf_models.py +3 -3
- mteb/models/model_implementations/jina_models.py +12 -2
- mteb/models/model_implementations/llm2vec_models.py +1 -1
- mteb/models/model_implementations/misc_models.py +2 -2
- mteb/models/model_implementations/mxbai_models.py +1 -1
- mteb/models/model_implementations/salesforce_models.py +1 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +9 -9
- mteb/results/task_result.py +6 -8
- mteb/tasks/classification/dan/angry_tweets_classification.py +2 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +3 -3
- mteb/tasks/classification/mya/myanmar_news.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +9 -8
- mteb/tasks/retrieval/code/code_rag.py +8 -8
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +18 -4
- mteb/tasks/retrieval/eng/climate_fever_retrieval.py +68 -77
- mteb/tasks/retrieval/eng/dbpedia_retrieval.py +55 -50
- mteb/tasks/retrieval/eng/fever_retrieval.py +62 -67
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py +57 -67
- mteb/tasks/retrieval/eng/legal_summarization_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +0 -3
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/eng/oven_it2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/quora_retrieval.py +51 -46
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/miracl_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +2 -9
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +6 -5
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +3 -4
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/rus/__init__.py +11 -2
- mteb/tasks/retrieval/rus/ria_news_retrieval.py +48 -44
- mteb/tasks/retrieval/tur/tur_hist_quad.py +2 -2
- {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/METADATA +5 -5
- {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/RECORD +82 -87
- mteb/descriptive_stats/Classification/PersianTextTone.json +0 -56
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json +0 -37
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json +0 -25
- mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json +0 -220
- mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json +0 -402
- mteb/descriptive_stats/Reranking/InstructIR.json +0 -31
- {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/WHEEL +0 -0
- {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/top_level.txt +0 -0
mteb/results/task_result.py
CHANGED
|
@@ -32,7 +32,7 @@ from mteb.types import (
|
|
|
32
32
|
logger = logging.getLogger(__name__)
|
|
33
33
|
|
|
34
34
|
|
|
35
|
-
class Criterias(HelpfulStrEnum):
|
|
35
|
+
class Criteria(HelpfulStrEnum):
|
|
36
36
|
"""Enum for criteria to check when merging TaskResult objects."""
|
|
37
37
|
|
|
38
38
|
MTEB_VERSION = "mteb_version"
|
|
@@ -671,7 +671,7 @@ class TaskResult(BaseModel):
|
|
|
671
671
|
def is_mergeable(
|
|
672
672
|
self,
|
|
673
673
|
result: TaskResult | AbsTask,
|
|
674
|
-
criteria: list[str] | list[Criterias] = [
|
|
674
|
+
criteria: list[str] | list[Criteria] = [
|
|
675
675
|
"mteb_version",
|
|
676
676
|
"dataset_revision",
|
|
677
677
|
],
|
|
@@ -688,9 +688,7 @@ class TaskResult(BaseModel):
|
|
|
688
688
|
Returns:
|
|
689
689
|
True if the TaskResult object can be merged with the other object, False otherwise.
|
|
690
690
|
"""
|
|
691
|
-
criteria = [
|
|
692
|
-
Criterias.from_str(c) if isinstance(c, str) else c for c in criteria
|
|
693
|
-
]
|
|
691
|
+
criteria = [Criteria.from_str(c) if isinstance(c, str) else c for c in criteria]
|
|
694
692
|
if isinstance(result, TaskResult):
|
|
695
693
|
name = result.task_name
|
|
696
694
|
revision = result.dataset_revision
|
|
@@ -709,14 +707,14 @@ class TaskResult(BaseModel):
|
|
|
709
707
|
)
|
|
710
708
|
return False
|
|
711
709
|
|
|
712
|
-
if Criterias.MTEB_VERSION in criteria and self.mteb_version != mteb_version:
|
|
710
|
+
if Criteria.MTEB_VERSION in criteria and self.mteb_version != mteb_version:
|
|
713
711
|
if raise_error:
|
|
714
712
|
raise ValueError(
|
|
715
713
|
f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} and {mteb_version})"
|
|
716
714
|
)
|
|
717
715
|
return False
|
|
718
716
|
|
|
719
|
-
if Criterias.DATASET_REVISION in criteria and self.dataset_revision != revision:
|
|
717
|
+
if Criteria.DATASET_REVISION in criteria and self.dataset_revision != revision:
|
|
720
718
|
if raise_error:
|
|
721
719
|
raise ValueError(
|
|
722
720
|
f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
|
|
@@ -728,7 +726,7 @@ class TaskResult(BaseModel):
|
|
|
728
726
|
def merge(
|
|
729
727
|
self,
|
|
730
728
|
new_results: TaskResult,
|
|
731
|
-
criteria: list[str] | list[Criterias] = [
|
|
729
|
+
criteria: list[str] | list[Criteria] = [
|
|
732
730
|
"mteb_version",
|
|
733
731
|
"dataset_revision",
|
|
734
732
|
],
|
|
@@ -9,7 +9,7 @@ class AngryTweetsClassification(AbsTaskClassification):
|
|
|
9
9
|
"path": "DDSC/angry-tweets",
|
|
10
10
|
"revision": "20b0e6081892e78179356fada741b7afa381443d",
|
|
11
11
|
},
|
|
12
|
-
description="A sentiment dataset with 3 classes (
|
|
12
|
+
description="A sentiment dataset with 3 classes (positive, negative, neutral) for Danish tweets",
|
|
13
13
|
reference="https://aclanthology.org/2021.nodalida-main.53/",
|
|
14
14
|
type="Classification",
|
|
15
15
|
category="t2c",
|
|
@@ -47,7 +47,7 @@ class AngryTweetsClassificationV2(AbsTaskClassification):
|
|
|
47
47
|
"path": "mteb/angry_tweets",
|
|
48
48
|
"revision": "b9475fb66a13befda4fa9871cd92343bb2c0eb77",
|
|
49
49
|
},
|
|
50
|
-
description="""A sentiment dataset with 3 classes (
|
|
50
|
+
description="""A sentiment dataset with 3 classes (positive, negative, neutral) for Danish tweets
|
|
51
51
|
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
52
52
|
reference="https://aclanthology.org/2021.nodalida-main.53/",
|
|
53
53
|
type="Classification",
|
|
@@ -2641,7 +2641,7 @@ class InternationalCitizenshipQuestionsLegalBenchClassification(AbsTaskClassific
|
|
|
2641
2641
|
class JCrewBlockerLegalBenchClassification(AbsTaskClassification):
|
|
2642
2642
|
metadata = TaskMetadata(
|
|
2643
2643
|
name="JCrewBlockerLegalBenchClassification",
|
|
2644
|
-
description="The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of
|
|
2644
|
+
description="The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of determining whether the J.Crew Blocker is present in the document.",
|
|
2645
2645
|
reference="https://huggingface.co/datasets/nguha/legalbench",
|
|
2646
2646
|
dataset={
|
|
2647
2647
|
"path": "mteb/JCrewBlockerLegalBenchClassification",
|
|
@@ -2677,7 +2677,7 @@ class JCrewBlockerLegalBenchClassification(AbsTaskClassification):
|
|
|
2677
2677
|
class JCrewBlockerLegalBenchClassificationV2(AbsTaskClassification):
|
|
2678
2678
|
metadata = TaskMetadata(
|
|
2679
2679
|
name="JCrewBlockerLegalBenchClassification.v2",
|
|
2680
|
-
description="""The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of
|
|
2680
|
+
description="""The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of determining whether the J.Crew Blocker is present in the document.
|
|
2681
2681
|
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
2682
2682
|
reference="https://huggingface.co/datasets/nguha/legalbench",
|
|
2683
2683
|
dataset={
|
|
@@ -4500,7 +4500,7 @@ class OverrulingLegalBenchClassificationV2(AbsTaskClassification):
|
|
|
4500
4500
|
class PersonalJurisdictionLegalBenchClassification(AbsTaskClassification):
|
|
4501
4501
|
metadata = TaskMetadata(
|
|
4502
4502
|
name="PersonalJurisdictionLegalBenchClassification",
|
|
4503
|
-
description="""Given a fact pattern describing the set of contacts between a plaintiff, defendant, and forum, determine if a court in that forum could
|
|
4503
|
+
description="""Given a fact pattern describing the set of contacts between a plaintiff, defendant, and forum, determine if a court in that forum could exercise personal jurisdiction over the defendant.""",
|
|
4504
4504
|
reference="https://huggingface.co/datasets/nguha/legalbench",
|
|
4505
4505
|
dataset={
|
|
4506
4506
|
"path": "mteb/PersonalJurisdictionLegalBenchClassification",
|
|
@@ -9,7 +9,7 @@ class MyanmarNews(AbsTaskClassification):
|
|
|
9
9
|
"path": "mteb/MyanmarNews",
|
|
10
10
|
"revision": "644419f24bc820bbf8af24e0b4714a069812e0a3",
|
|
11
11
|
},
|
|
12
|
-
description="The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4
|
|
12
|
+
description="The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categories, providing a rich resource for natural language processing applications involving Burmese which is a low resource language.",
|
|
13
13
|
reference="https://huggingface.co/datasets/myanmar_news",
|
|
14
14
|
type="Classification",
|
|
15
15
|
category="t2c",
|
|
@@ -45,7 +45,7 @@ class MyanmarNewsV2(AbsTaskClassification):
|
|
|
45
45
|
"path": "mteb/myanmar_news",
|
|
46
46
|
"revision": "475b43ffbdb5138ad67a01a2c860bc7db502f3c5",
|
|
47
47
|
},
|
|
48
|
-
description="""The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4
|
|
48
|
+
description="""The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categories, providing a rich resource for natural language processing applications involving Burmese which is a low resource language.
|
|
49
49
|
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
50
50
|
reference="https://huggingface.co/datasets/myanmar_news",
|
|
51
51
|
type="Classification",
|
|
@@ -5,7 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
|
|
|
5
5
|
class WongnaiReviewsClassification(AbsTaskClassification):
|
|
6
6
|
metadata = TaskMetadata(
|
|
7
7
|
name="WongnaiReviewsClassification",
|
|
8
|
-
description="Wongnai features over 200,000 restaurants, beauty salons, and spas across Thailand on its platform, with detailed information about each merchant and user reviews. In this dataset there are 5 classes
|
|
8
|
+
description="Wongnai features over 200,000 restaurants, beauty salons, and spas across Thailand on its platform, with detailed information about each merchant and user reviews. In this dataset there are 5 classes corresponding each star rating",
|
|
9
9
|
reference="https://github.com/wongnai/wongnai-corpus",
|
|
10
10
|
dataset={
|
|
11
11
|
"path": "Wongnai/wongnai_reviews",
|
|
@@ -10,7 +10,7 @@ class UkrFormalityClassification(AbsTaskClassification):
|
|
|
10
10
|
trainslating English GYAFC data.
|
|
11
11
|
English data source: https://aclanthology.org/N18-1012/
|
|
12
12
|
Translation into Ukrainian language using model: https://huggingface.co/facebook/nllb-200-distilled-600M
|
|
13
|
-
Additionally, the dataset was balanced,
|
|
13
|
+
Additionally, the dataset was balanced, with labels: 0 - informal, 1 - formal.
|
|
14
14
|
""",
|
|
15
15
|
dataset={
|
|
16
16
|
"path": "ukr-detect/ukr-formality-dataset-translated-gyafc",
|
|
@@ -61,7 +61,7 @@ class UkrFormalityClassificationV2(AbsTaskClassification):
|
|
|
61
61
|
trainslating English GYAFC data.
|
|
62
62
|
English data source: https://aclanthology.org/N18-1012/
|
|
63
63
|
Translation into Ukrainian language using model: https://huggingface.co/facebook/nllb-200-distilled-600M
|
|
64
|
-
Additionally, the dataset was balanced,
|
|
64
|
+
Additionally, the dataset was balanced, with labels: 0 - informal, 1 - formal.
|
|
65
65
|
|
|
66
66
|
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
67
67
|
dataset={
|
|
@@ -23,14 +23,15 @@ class IndicXnliPairClassification(AbsTaskPairClassification):
|
|
|
23
23
|
"path": "mteb/IndicXnliPairClassification",
|
|
24
24
|
"revision": "027e97b9afe84ea3447b57b7705b8864bb2b3a83",
|
|
25
25
|
},
|
|
26
|
-
description=
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
26
|
+
description=(
|
|
27
|
+
"INDICXNLI is similar to existing XNLI dataset in shape/form, but "
|
|
28
|
+
"focuses on Indic language family. "
|
|
29
|
+
"The train (392,702), validation (2,490), and evaluation sets (5,010) of English "
|
|
30
|
+
"XNLI were translated from English into each of the eleven Indic languages. IndicTrans "
|
|
31
|
+
"is a large Transformer-based sequence to sequence model. It is trained on Samanantar "
|
|
32
|
+
"dataset (Ramesh et al., 2021), which is the largest parallel multi- lingual corpus "
|
|
33
|
+
"over eleven Indic languages."
|
|
34
|
+
),
|
|
34
35
|
reference="https://gem-benchmark.com/data_cards/opusparcus",
|
|
35
36
|
category="t2t",
|
|
36
37
|
modalities=["text"],
|
|
@@ -60,9 +60,9 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
|
|
|
60
60
|
self.data_loaded = True
|
|
61
61
|
|
|
62
62
|
def dataset_transform(self) -> None:
|
|
63
|
-
"""And transform to a retrieval
|
|
63
|
+
"""And transform to a retrieval dataset, which have the following attributes
|
|
64
64
|
|
|
65
|
-
self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document
|
|
65
|
+
self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
|
|
66
66
|
self.queries = Dict[query_id, str] #id => query
|
|
67
67
|
self.relevant_docs = Dict[query_id, Dict[[doc_id, score]]
|
|
68
68
|
"""
|
|
@@ -117,9 +117,9 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
|
|
|
117
117
|
self.data_loaded = True
|
|
118
118
|
|
|
119
119
|
def dataset_transform(self) -> None:
|
|
120
|
-
"""And transform to a retrieval
|
|
120
|
+
"""And transform to a retrieval dataset, which have the following attributes
|
|
121
121
|
|
|
122
|
-
self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document
|
|
122
|
+
self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
|
|
123
123
|
self.queries = Dict[query_id, str] #id => query
|
|
124
124
|
self.relevant_docs = Dict[query_id, Dict[[doc_id, score]]
|
|
125
125
|
"""
|
|
@@ -177,9 +177,9 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
|
|
|
177
177
|
self.data_loaded = True
|
|
178
178
|
|
|
179
179
|
def dataset_transform(self) -> None:
|
|
180
|
-
"""And transform to a retrieval
|
|
180
|
+
"""And transform to a retrieval dataset, which have the following attributes
|
|
181
181
|
|
|
182
|
-
self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document
|
|
182
|
+
self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
|
|
183
183
|
self.queries = Dict[query_id, str] #id => query
|
|
184
184
|
self.relevant_docs = Dict[query_id, Dict[[doc_id, score]]
|
|
185
185
|
"""
|
|
@@ -234,9 +234,9 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval):
|
|
|
234
234
|
self.data_loaded = True
|
|
235
235
|
|
|
236
236
|
def dataset_transform(self) -> None:
|
|
237
|
-
"""And transform to a retrieval
|
|
237
|
+
"""And transform to a retrieval dataset, which have the following attributes
|
|
238
238
|
|
|
239
|
-
self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document
|
|
239
|
+
self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
|
|
240
240
|
self.queries = Dict[query_id, str] #id => query
|
|
241
241
|
self.relevant_docs = Dict[query_id, Dict[[doc_id, score]]
|
|
242
242
|
"""
|
|
@@ -56,7 +56,7 @@ Derczynski, Leon},
|
|
|
56
56
|
self.data_loaded = True
|
|
57
57
|
|
|
58
58
|
def dataset_transform(self) -> None:
|
|
59
|
-
"""And transform to a retrieval
|
|
59
|
+
"""And transform to a retrieval dataset, which have the following attributes
|
|
60
60
|
|
|
61
61
|
self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
|
|
62
62
|
self.queries = dict[query_id, str] #id => query
|
|
@@ -69,9 +69,9 @@ Piperidis, Stelios},
|
|
|
69
69
|
self.data_loaded = True
|
|
70
70
|
|
|
71
71
|
def dataset_transform(self) -> None:
|
|
72
|
-
"""And transform to a retrieval
|
|
72
|
+
"""And transform to a retrieval dataset, which have the following attributes
|
|
73
73
|
|
|
74
|
-
self.corpus = dict[doc_id, dict[str, str]] #id => dict with document
|
|
74
|
+
self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
|
|
75
75
|
self.queries = dict[query_id, str] #id => query
|
|
76
76
|
self.relevant_docs = dict[query_id, dict[[doc_id, score]]
|
|
77
77
|
"""
|
|
@@ -45,9 +45,9 @@ class TwitterHjerneRetrieval(AbsTaskRetrieval):
|
|
|
45
45
|
self.data_loaded = True
|
|
46
46
|
|
|
47
47
|
def dataset_transform(self) -> None:
|
|
48
|
-
"""And transform to a retrieval
|
|
48
|
+
"""And transform to a retrieval dataset, which have the following attributes
|
|
49
49
|
|
|
50
|
-
self.corpus = dict[doc_id, dict[str, str]] #id => dict with document
|
|
50
|
+
self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
|
|
51
51
|
self.queries = dict[query_id, str] #id => query
|
|
52
52
|
self.relevant_docs = dict[query_id, dict[[doc_id, score]]
|
|
53
53
|
"""
|
|
@@ -22,6 +22,7 @@ from .cirr_it2i_retrieval import CIRRIT2IRetrieval
|
|
|
22
22
|
from .climate_fever_retrieval import (
|
|
23
23
|
ClimateFEVER,
|
|
24
24
|
ClimateFEVERHardNegatives,
|
|
25
|
+
ClimateFEVERHardNegativesV2,
|
|
25
26
|
ClimateFEVERRetrievalv2,
|
|
26
27
|
)
|
|
27
28
|
from .cqa_dupstack_android_retrieval import CQADupstackAndroidRetrieval
|
|
@@ -57,7 +58,7 @@ from .dapfam_patent_retrieval import (
|
|
|
57
58
|
DAPFAMOutTitlAbsToTitlAbsClmRetrieval,
|
|
58
59
|
DAPFAMOutTitlAbsToTitlAbsRetrieval,
|
|
59
60
|
)
|
|
60
|
-
from .dbpedia_retrieval import DBPedia, DBPediaHardNegatives
|
|
61
|
+
from .dbpedia_retrieval import DBPedia, DBPediaHardNegatives, DBPediaHardNegativesV2
|
|
61
62
|
from .edis_t2it_retrieval import EDIST2ITRetrieval
|
|
62
63
|
from .encyclopedia_vqa_it2it_retrieval import EncyclopediaVQAIT2ITRetrieval
|
|
63
64
|
from .english_finance1_retrieval import EnglishFinance1Retrieval
|
|
@@ -70,7 +71,7 @@ from .fashion200k_i2t_retrieval import Fashion200kI2TRetrieval
|
|
|
70
71
|
from .fashion200k_t2i_retrieval import Fashion200kT2IRetrieval
|
|
71
72
|
from .fashion_iq_it2i_retrieval import FashionIQIT2IRetrieval
|
|
72
73
|
from .feedback_qa_retrieval import FeedbackQARetrieval
|
|
73
|
-
from .fever_retrieval import FEVER, FEVERHardNegatives
|
|
74
|
+
from .fever_retrieval import FEVER, FEVERHardNegatives, FEVERHardNegativesV2
|
|
74
75
|
from .fi_qa2018_retrieval import FiQA2018
|
|
75
76
|
from .fin_qa_retrieval import FinQARetrieval
|
|
76
77
|
from .finance_bench_retrieval import FinanceBenchRetrieval
|
|
@@ -85,7 +86,11 @@ from .hateful_memes_i2t_retrieval import HatefulMemesI2TRetrieval
|
|
|
85
86
|
from .hateful_memes_t2i_retrieval import HatefulMemesT2IRetrieval
|
|
86
87
|
from .hc3_finance_retrieval import HC3FinanceRetrieval
|
|
87
88
|
from .hella_swag_retrieval import HellaSwag
|
|
88
|
-
from .hotpot_qa_retrieval import HotpotQA, HotpotQAHardNegatives
|
|
89
|
+
from .hotpot_qa_retrieval import (
|
|
90
|
+
HotpotQA,
|
|
91
|
+
HotpotQAHardNegatives,
|
|
92
|
+
HotpotQAHardNegativesV2,
|
|
93
|
+
)
|
|
89
94
|
from .image_co_de_t2i_retrieval import ImageCoDeT2IRetrieval
|
|
90
95
|
from .info_seek_it2it_retrieval import InfoSeekIT2ITRetrieval
|
|
91
96
|
from .info_seek_it2t_retrieval import InfoSeekIT2TRetrieval
|
|
@@ -133,7 +138,11 @@ from .oven_it2it_retrieval import OVENIT2ITRetrieval
|
|
|
133
138
|
from .oven_it2t_retrieval import OVENIT2TRetrieval
|
|
134
139
|
from .piqa_retrieval import PIQA
|
|
135
140
|
from .quail_retrieval import Quail
|
|
136
|
-
from .quora_retrieval import QuoraRetrieval, QuoraRetrievalHardNegatives
|
|
141
|
+
from .quora_retrieval import (
|
|
142
|
+
QuoraRetrieval,
|
|
143
|
+
QuoraRetrievalHardNegatives,
|
|
144
|
+
QuoraRetrievalHardNegativesV2,
|
|
145
|
+
)
|
|
137
146
|
from .r2_med_retrieval import (
|
|
138
147
|
R2MEDBioinformaticsRetrieval,
|
|
139
148
|
R2MEDBiologyRetrieval,
|
|
@@ -247,6 +256,7 @@ __all__ = [
|
|
|
247
256
|
"ChemNQRetrieval",
|
|
248
257
|
"ClimateFEVER",
|
|
249
258
|
"ClimateFEVERHardNegatives",
|
|
259
|
+
"ClimateFEVERHardNegativesV2",
|
|
250
260
|
"ClimateFEVERRetrievalv2",
|
|
251
261
|
"DAPFAMAllTitlAbsClmToFullTextRetrieval",
|
|
252
262
|
"DAPFAMAllTitlAbsClmToTitlAbsClmRetrieval",
|
|
@@ -268,6 +278,7 @@ __all__ = [
|
|
|
268
278
|
"DAPFAMOutTitlAbsToTitlAbsRetrieval",
|
|
269
279
|
"DBPedia",
|
|
270
280
|
"DBPediaHardNegatives",
|
|
281
|
+
"DBPediaHardNegativesV2",
|
|
271
282
|
"EDIST2ITRetrieval",
|
|
272
283
|
"EncyclopediaVQAIT2ITRetrieval",
|
|
273
284
|
"EnglishFinance1Retrieval",
|
|
@@ -276,6 +287,7 @@ __all__ = [
|
|
|
276
287
|
"EnglishFinance4Retrieval",
|
|
277
288
|
"EnglishHealthcare1Retrieval",
|
|
278
289
|
"FEVERHardNegatives",
|
|
290
|
+
"FEVERHardNegativesV2",
|
|
279
291
|
"FaithDialRetrieval",
|
|
280
292
|
"Fashion200kI2TRetrieval",
|
|
281
293
|
"Fashion200kT2IRetrieval",
|
|
@@ -296,6 +308,7 @@ __all__ = [
|
|
|
296
308
|
"HellaSwag",
|
|
297
309
|
"HotpotQA",
|
|
298
310
|
"HotpotQAHardNegatives",
|
|
311
|
+
"HotpotQAHardNegativesV2",
|
|
299
312
|
"ImageCoDeT2IRetrieval",
|
|
300
313
|
"InfoSeekIT2ITRetrieval",
|
|
301
314
|
"InfoSeekIT2TRetrieval",
|
|
@@ -345,6 +358,7 @@ __all__ = [
|
|
|
345
358
|
"Quail",
|
|
346
359
|
"QuoraRetrieval",
|
|
347
360
|
"QuoraRetrievalHardNegatives",
|
|
361
|
+
"QuoraRetrievalHardNegativesV2",
|
|
348
362
|
"R2MEDBioinformaticsRetrieval",
|
|
349
363
|
"R2MEDBiologyRetrieval",
|
|
350
364
|
"R2MEDIIYiClinicalRetrieval",
|
|
@@ -1,30 +1,21 @@
|
|
|
1
1
|
from mteb.abstasks.retrieval import AbsTaskRetrieval
|
|
2
2
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
main_score="ndcg_at_10",
|
|
20
|
-
date=("2001-01-01", "2020-12-31"), # launch of wiki -> paper publication
|
|
21
|
-
domains=["Encyclopaedic", "Written"],
|
|
22
|
-
task_subtypes=["Claim verification"],
|
|
23
|
-
license="cc-by-sa-4.0",
|
|
24
|
-
annotations_creators="human-annotated",
|
|
25
|
-
dialect=[],
|
|
26
|
-
sample_creation="found",
|
|
27
|
-
bibtex_citation=r"""
|
|
4
|
+
_climate_fever_metadata = dict(
|
|
5
|
+
type="Retrieval",
|
|
6
|
+
category="t2t",
|
|
7
|
+
modalities=["text"],
|
|
8
|
+
eval_splits=["test"],
|
|
9
|
+
eval_langs=["eng-Latn"],
|
|
10
|
+
main_score="ndcg_at_10",
|
|
11
|
+
date=("2001-01-01", "2020-12-31"), # launch of wiki -> paper publication
|
|
12
|
+
domains=["Encyclopaedic", "Written"],
|
|
13
|
+
task_subtypes=["Claim verification"],
|
|
14
|
+
license="cc-by-sa-4.0",
|
|
15
|
+
annotations_creators="human-annotated",
|
|
16
|
+
dialect=[],
|
|
17
|
+
sample_creation="found",
|
|
18
|
+
bibtex_citation=r"""
|
|
28
19
|
@misc{diggelmann2021climatefever,
|
|
29
20
|
archiveprefix = {arXiv},
|
|
30
21
|
author = {Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold},
|
|
@@ -34,82 +25,82 @@ class ClimateFEVER(AbsTaskRetrieval):
|
|
|
34
25
|
year = {2021},
|
|
35
26
|
}
|
|
36
27
|
""",
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ClimateFEVER(AbsTaskRetrieval):
|
|
32
|
+
metadata = TaskMetadata(
|
|
33
|
+
name="ClimateFEVER",
|
|
34
|
+
description=(
|
|
35
|
+
"CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims "
|
|
36
|
+
"(queries) regarding climate-change. The underlying corpus is the same as FEVER."
|
|
37
|
+
),
|
|
38
|
+
reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html",
|
|
39
|
+
dataset={
|
|
40
|
+
"path": "mteb/climate-fever",
|
|
41
|
+
"revision": "47f2ac6acb640fc46020b02a5b59fdda04d39380",
|
|
42
|
+
},
|
|
43
|
+
prompt={
|
|
44
|
+
"query": "Given a claim about climate change, retrieve documents that support or refute the claim"
|
|
45
|
+
},
|
|
46
|
+
**_climate_fever_metadata,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class ClimateFEVERRetrievalv2(AbsTaskRetrieval):
|
|
51
|
+
metadata = TaskMetadata(
|
|
52
|
+
name="ClimateFEVER.v2",
|
|
53
|
+
description=(
|
|
54
|
+
"CLIMATE-FEVER is a dataset following the FEVER methodology, containing 1,535 real-world climate change claims. "
|
|
55
|
+
"This updated version addresses corpus mismatches and qrel inconsistencies in MTEB, restoring labels while refining corpus-query alignment for better accuracy."
|
|
56
|
+
),
|
|
57
|
+
reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html",
|
|
58
|
+
dataset={
|
|
59
|
+
"path": "mteb/climate-fever-v2",
|
|
60
|
+
"revision": "e438c9586767800aeb10dbe8a245c41dbea4e5f4",
|
|
61
|
+
},
|
|
37
62
|
prompt={
|
|
38
63
|
"query": "Given a claim about climate change, retrieve documents that support or refute the claim"
|
|
39
64
|
},
|
|
65
|
+
adapted_from=["ClimateFEVER"],
|
|
66
|
+
**_climate_fever_metadata,
|
|
40
67
|
)
|
|
41
68
|
|
|
42
69
|
|
|
43
70
|
class ClimateFEVERHardNegatives(AbsTaskRetrieval):
|
|
44
71
|
metadata = TaskMetadata(
|
|
45
72
|
name="ClimateFEVERHardNegatives",
|
|
46
|
-
description=
|
|
73
|
+
description=(
|
|
74
|
+
"CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change. "
|
|
75
|
+
"The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
|
|
76
|
+
),
|
|
47
77
|
reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html",
|
|
48
78
|
dataset={
|
|
49
79
|
"path": "mteb/ClimateFEVER_test_top_250_only_w_correct-v2",
|
|
50
80
|
"revision": "3a309e201f3c2c4b13bd4a367a8f37eee2ec1d21",
|
|
51
81
|
},
|
|
52
|
-
type="Retrieval",
|
|
53
|
-
category="t2t",
|
|
54
|
-
modalities=["text"],
|
|
55
|
-
eval_splits=["test"],
|
|
56
|
-
eval_langs=["eng-Latn"],
|
|
57
|
-
main_score="ndcg_at_10",
|
|
58
|
-
date=("2001-01-01", "2020-12-31"), # launch of wiki -> paper publication
|
|
59
|
-
domains=["Encyclopaedic", "Written"],
|
|
60
|
-
task_subtypes=["Claim verification"],
|
|
61
|
-
license="cc-by-sa-4.0",
|
|
62
|
-
annotations_creators="human-annotated",
|
|
63
|
-
dialect=[],
|
|
64
|
-
sample_creation="found",
|
|
65
|
-
bibtex_citation=r"""
|
|
66
|
-
@misc{diggelmann2021climatefever,
|
|
67
|
-
archiveprefix = {arXiv},
|
|
68
|
-
author = {Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold},
|
|
69
|
-
eprint = {2012.00614},
|
|
70
|
-
primaryclass = {cs.CL},
|
|
71
|
-
title = {CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims},
|
|
72
|
-
year = {2021},
|
|
73
|
-
}
|
|
74
|
-
""",
|
|
75
82
|
adapted_from=["ClimateFEVER"],
|
|
83
|
+
superseded_by="ClimateFEVERHardNegatives.v2",
|
|
84
|
+
**_climate_fever_metadata,
|
|
76
85
|
)
|
|
77
86
|
|
|
78
87
|
|
|
79
|
-
class
|
|
88
|
+
class ClimateFEVERHardNegativesV2(AbsTaskRetrieval):
|
|
80
89
|
metadata = TaskMetadata(
|
|
81
|
-
name="
|
|
82
|
-
description=
|
|
90
|
+
name="ClimateFEVERHardNegatives.v2",
|
|
91
|
+
description=(
|
|
92
|
+
"CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change. "
|
|
93
|
+
"The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct. "
|
|
94
|
+
"V2 uses a more appropriate prompt rather than the default prompt for retrieval. You can get more information on the effect of different prompt in the [PR](https://github.com/embeddings-benchmark/mteb/pull/3469#issuecomment-3436467106)"
|
|
95
|
+
),
|
|
83
96
|
reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html",
|
|
84
97
|
dataset={
|
|
85
|
-
"path": "mteb/
|
|
86
|
-
"revision": "
|
|
98
|
+
"path": "mteb/ClimateFEVER_test_top_250_only_w_correct-v2",
|
|
99
|
+
"revision": "3a309e201f3c2c4b13bd4a367a8f37eee2ec1d21",
|
|
87
100
|
},
|
|
88
|
-
|
|
89
|
-
category="t2t",
|
|
90
|
-
modalities=["text"],
|
|
91
|
-
eval_splits=["test"],
|
|
92
|
-
eval_langs=["eng-Latn"],
|
|
93
|
-
main_score="ndcg_at_10",
|
|
94
|
-
date=("2001-01-01", "2020-12-31"), # launch of wiki -> paper publication
|
|
95
|
-
domains=["Academic", "Written"],
|
|
96
|
-
task_subtypes=["Claim verification"],
|
|
97
|
-
license="cc-by-sa-4.0",
|
|
98
|
-
annotations_creators="human-annotated",
|
|
99
|
-
dialect=[],
|
|
100
|
-
sample_creation="found",
|
|
101
|
-
bibtex_citation=r"""
|
|
102
|
-
@misc{diggelmann2021climatefever,
|
|
103
|
-
archiveprefix = {arXiv},
|
|
104
|
-
author = {Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold},
|
|
105
|
-
eprint = {2012.00614},
|
|
106
|
-
primaryclass = {cs.CL},
|
|
107
|
-
title = {CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims},
|
|
108
|
-
year = {2021},
|
|
109
|
-
}
|
|
110
|
-
""",
|
|
101
|
+
adapted_from=["ClimateFEVER"],
|
|
111
102
|
prompt={
|
|
112
103
|
"query": "Given a claim about climate change, retrieve documents that support or refute the claim"
|
|
113
104
|
},
|
|
114
|
-
|
|
105
|
+
**_climate_fever_metadata,
|
|
115
106
|
)
|