mteb 2.0.5__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +10 -1
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +75 -0
- mteb/descriptive_stats/Classification/DutchColaClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json +90 -0
- mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json +54 -0
- mteb/descriptive_stats/Classification/IconclassClassification.json +96 -0
- mteb/descriptive_stats/Classification/OpenTenderClassification.json +222 -0
- mteb/descriptive_stats/Classification/VaccinChatNLClassification.json +1068 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringP2P.json +45 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringS2S.json +45 -0
- mteb/descriptive_stats/Clustering/IconclassClusteringS2S.json +48 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringP2P.json +111 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringS2S.json +111 -0
- mteb/descriptive_stats/Clustering/VABBClusteringP2P.json +60 -0
- mteb/descriptive_stats/Clustering/VABBClusteringS2S.json +60 -0
- mteb/descriptive_stats/MultilabelClassification/CovidDisinformationNLMultiLabelClassification.json +84 -0
- mteb/descriptive_stats/MultilabelClassification/VABBMultiLabelClassification.json +156 -0
- mteb/descriptive_stats/PairClassification/SICKNLPairClassification.json +35 -0
- mteb/descriptive_stats/PairClassification/XLWICNLPairClassification.json +35 -0
- mteb/descriptive_stats/Retrieval/DutchNewsArticlesRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/LegalQANLRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/OpenTenderRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/VABBRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/bBSARDNLRetrieval.json +30 -0
- mteb/descriptive_stats/STS/SICK-NL-STS.json +28 -0
- mteb/tasks/classification/nld/__init__.py +16 -0
- mteb/tasks/classification/nld/dutch_cola_classification.py +38 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +37 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +30 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +36 -0
- mteb/tasks/classification/nld/iconclass_classification.py +41 -0
- mteb/tasks/classification/nld/open_tender_classification.py +38 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +46 -0
- mteb/tasks/clustering/__init__.py +1 -0
- mteb/tasks/clustering/nld/__init__.py +17 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +37 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +37 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +47 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +51 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +41 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +51 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +51 -0
- mteb/tasks/multilabel_classification/__init__.py +1 -0
- mteb/tasks/multilabel_classification/nld/__init__.py +9 -0
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +88 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +44 -0
- mteb/tasks/pair_classification/__init__.py +1 -0
- mteb/tasks/pair_classification/nld/__init__.py +7 -0
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +36 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +41 -0
- mteb/tasks/retrieval/nld/__init__.py +10 -0
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +41 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +30 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +39 -0
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +38 -0
- mteb/tasks/retrieval/nld/vabb_retrieval.py +41 -0
- mteb/tasks/sts/__init__.py +1 -0
- mteb/tasks/sts/nld/__init__.py +5 -0
- mteb/tasks/sts/nld/sick_nl_sts.py +41 -0
- {mteb-2.0.5.dist-info → mteb-2.1.0.dist-info}/METADATA +2 -204
- {mteb-2.0.5.dist-info → mteb-2.1.0.dist-info}/RECORD +67 -15
- {mteb-2.0.5.dist-info → mteb-2.1.0.dist-info}/WHEEL +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.0.dist-info}/entry_points.txt +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.0.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from mteb.abstasks.classification import AbsTaskClassification
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class VaccinChatNLClassification(AbsTaskClassification):
    """Classification task built on the VaccinChatNL FAQ dataset."""

    metadata = TaskMetadata(
        name="VaccinChatNLClassification",
        type="Classification",
        category="t2c",
        modalities=["text"],
        description=(
            "VaccinChatNL is a Flemish Dutch FAQ dataset on the topic of "
            "COVID-19 vaccinations in Flanders."
        ),
        reference="https://huggingface.co/datasets/clips/VaccinChatNL",
        dataset=dict(
            path="clips/VaccinChatNL",
            revision="bd27d0058bea2ad52470d9072a3b5da6b97c1ac3",
        ),
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="f1",
        date=("2022-01-01", "2022-09-01"),
        domains=["Spoken", "Web"],
        task_subtypes=[],
        dialect=[],
        license="cc-by-4.0",
        annotations_creators="expert-annotated",
        sample_creation="created",
        bibtex_citation=r"""
@inproceedings{buhmann-etal-2022-domain,
address = {Gyeongju, Republic of Korea},
author = {Buhmann, Jeska and De Bruyn, Maxime and Lotfi, Ehsan and Daelemans, Walter},
booktitle = {Proceedings of the 29th International Conference on Computational Linguistics},
month = oct,
pages = {3539--3549},
publisher = {International Committee on Computational Linguistics},
title = {Domain- and Task-Adaptation for {V}accin{C}hat{NL}, a {D}utch {COVID}-19 {FAQ} Answering Corpus and Classification Model},
url = {https://aclanthology.org/2022.coling-1.312},
year = {2022},
}
""",
    )

    def dataset_transform(self):
        """Rename the ``sentence1`` column to ``text`` in every loaded split."""
        column_mapping = {"sentence1": "text"}
        for split_name in self.dataset:
            split_data = self.dataset[split_name]
            self.dataset[split_name] = split_data.rename_columns(column_mapping)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from .dutch_news_articles_clustering_p2p import DutchNewsArticlesClusteringP2P
|
|
2
|
+
from .dutch_news_articles_clustering_s2s import DutchNewsArticlesClusteringS2S
|
|
3
|
+
from .iconclass_clustering_s2s import IconclassClusteringS2S
|
|
4
|
+
from .open_tender_clustering_p2p import OpenTenderClusteringP2P
|
|
5
|
+
from .open_tender_clustering_s2s import OpenTenderClusteringS2S
|
|
6
|
+
from .vabb_clustering_p2p import VABBClusteringP2P
|
|
7
|
+
from .vabb_clustering_s2s import VABBClusteringS2S
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"DutchNewsArticlesClusteringP2P",
|
|
11
|
+
"DutchNewsArticlesClusteringS2S",
|
|
12
|
+
"IconclassClusteringS2S",
|
|
13
|
+
"OpenTenderClusteringP2P",
|
|
14
|
+
"OpenTenderClusteringS2S",
|
|
15
|
+
"VABBClusteringP2P",
|
|
16
|
+
"VABBClusteringS2S",
|
|
17
|
+
]
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from mteb.abstasks.clustering import AbsTaskClustering
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class DutchNewsArticlesClusteringP2P(AbsTaskClustering):
    """Clustering task over the ``text`` column of the Dutch NOS news articles."""

    # Embed every document (fraction of 1.0) instead of a subsample.
    max_fraction_of_documents_to_embed = 1.0

    metadata = TaskMetadata(
        name="DutchNewsArticlesClusteringP2P",
        type="Clustering",
        category="t2c",
        modalities=["text"],
        description=(
            "This dataset contains all the articles published by the NOS as of the "
            "1st of January 2010. The data is obtained by scraping the NOS website. "
            "The NOS is one of the biggest (online) news organizations in the "
            "Netherlands."
        ),
        reference="https://www.kaggle.com/datasets/maxscheijen/dutch-news-articles",
        dataset=dict(
            path="clips/mteb-nl-news-articles-cls",
            revision="0a7227d31f85c5676be92767f8df5405ea93de54",
        ),
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="v_measure",
        date=("2009-11-01", "2010-01-01"),
        domains=["Written", "News"],
        task_subtypes=[],
        dialect=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        sample_creation="found",
        bibtex_citation="",
    )

    def dataset_transform(self):
        """Rename ``label``/``text`` to ``labels``/``sentences`` in every split."""
        column_mapping = {"label": "labels", "text": "sentences"}
        for split_name in self.dataset:
            split_data = self.dataset[split_name]
            self.dataset[split_name] = split_data.rename_columns(column_mapping)
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from mteb.abstasks.clustering import AbsTaskClustering
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class DutchNewsArticlesClusteringS2S(AbsTaskClustering):
    """Clustering task over the ``title`` column of the Dutch NOS news articles."""

    # Embed every document (fraction of 1.0) instead of a subsample.
    max_fraction_of_documents_to_embed = 1.0

    metadata = TaskMetadata(
        name="DutchNewsArticlesClusteringS2S",
        type="Clustering",
        category="t2c",
        modalities=["text"],
        description=(
            "This dataset contains all the articles published by the NOS as of the "
            "1st of January 2010. The data is obtained by scraping the NOS website. "
            "The NOS is one of the biggest (online) news organizations in the "
            "Netherlands."
        ),
        reference="https://www.kaggle.com/datasets/maxscheijen/dutch-news-articles",
        dataset=dict(
            path="clips/mteb-nl-news-articles-cls",
            revision="0a7227d31f85c5676be92767f8df5405ea93de54",
        ),
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="v_measure",
        date=("2009-11-01", "2010-01-01"),
        domains=["Written", "News"],
        task_subtypes=[],
        dialect=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        sample_creation="found",
        bibtex_citation="",
    )

    def dataset_transform(self):
        """Rename ``label``/``title`` to ``labels``/``sentences`` in every split."""
        column_mapping = {"label": "labels", "title": "sentences"}
        for split_name in self.dataset:
            split_data = self.dataset[split_name]
            self.dataset[split_name] = split_data.rename_columns(column_mapping)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from mteb.abstasks.clustering import AbsTaskClustering
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class IconclassClusteringS2S(AbsTaskClustering):
    """Clustering task over the Dutch Iconclass dataset."""

    # Embed every document (fraction of 1.0) instead of a subsample.
    max_fraction_of_documents_to_embed = 1.0

    metadata = TaskMetadata(
        name="IconclassClusteringS2S",
        type="Clustering",
        category="t2c",
        modalities=["text"],
        description=(
            "Iconclass is an iconographic thesaurus, which is widely used in the "
            "digital heritage domain to describe subjects depicted in artworks. "
            "The task is to classify the first layer of Iconclass"
        ),
        reference="https://dl.acm.org/doi/pdf/10.1145/3575865",
        dataset=dict(
            path="clips/mteb-nl-iconclass-cls",
            revision="1cd02f1579dab39fedc95de8cc15fd620557a9f2",
        ),
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="v_measure",
        # NOTE(review): this range is identical to the one used by the NOS news
        # tasks while the cited paper is from 2023 -- possibly copy-pasted; confirm.
        date=("2009-11-01", "2010-01-01"),
        domains=["Written", "Fiction"],
        task_subtypes=[],
        dialect=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        sample_creation="found",
        bibtex_citation=r"""
@article{banar2023transfer,
author = {Banar, Nikolay and Daelemans, Walter and Kestemont, Mike},
journal = {ACM Journal on Computing and Cultural Heritage},
number = {2},
pages = {1--16},
publisher = {ACM New York, NY},
title = {Transfer learning for the visual arts: The multi-modal retrieval of iconclass codes},
volume = {16},
year = {2023},
}
""",
    )

    def dataset_transform(self):
        """Add ``labels`` and ``sentences`` columns to every split.

        The values are copied from the ``label`` and ``text`` columns of each
        example.
        """

        def _to_clustering_columns(example):
            return {"labels": example["label"], "sentences": example["text"]}

        for split_name in self.dataset:
            split_data = self.dataset[split_name]
            self.dataset[split_name] = split_data.map(_to_clustering_columns)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from mteb.abstasks.clustering import AbsTaskClustering
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class OpenTenderClusteringP2P(AbsTaskClustering):
    """Clustering task over the OpenTender dataset.

    The labels and sentences are taken from the ``label`` and ``text``
    columns of the classification dataset, which this task reuses.
    """

    # Embed every document (fraction of 1.0) instead of a subsample.
    max_fraction_of_documents_to_embed = 1.0

    metadata = TaskMetadata(
        name="OpenTenderClusteringP2P",
        dataset={
            "path": "clips/mteb-nl-opentender-cls-pr",
            "revision": "9af5657575a669dc18c7f897a67287ff7d1a0c65",
        },
        # The previous text was a verbatim copy of the NOS news-articles
        # description and did not describe this dataset.
        # NOTE(review): wording follows the OpenTender classification task --
        # confirm against the dataset card.
        description="This dataset contains Belgian and Dutch tender calls from OpenTender in Dutch",
        reference="https://arxiv.org/abs/2509.12340",
        type="Clustering",
        category="t2c",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="v_measure",
        date=("2025-08-01", "2025-08-10"),
        domains=["Government", "Written"],
        task_subtypes=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@misc{banar2025mtebnle5nlembeddingbenchmark,
archiveprefix = {arXiv},
author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
eprint = {2509.12340},
primaryclass = {cs.CL},
title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
url = {https://arxiv.org/abs/2509.12340},
year = {2025},
}
""",
    )

    def dataset_transform(self):
        """Add ``labels`` and ``sentences`` columns to every split.

        The values are copied from the ``label`` and ``text`` columns of each
        example.
        """
        # reuse the dataset for classification
        for split in self.dataset:
            self.dataset[split] = self.dataset[split].map(
                lambda ex: {
                    "labels": ex["label"],
                    "sentences": ex["text"],
                }
            )
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from mteb.abstasks.clustering import AbsTaskClustering
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class OpenTenderClusteringS2S(AbsTaskClustering):
    """Clustering task over the OpenTender S2S dataset.

    No ``dataset_transform`` is defined, so the dataset is used as loaded.
    """

    # Embed every document (fraction of 1.0) instead of a subsample.
    max_fraction_of_documents_to_embed = 1.0

    metadata = TaskMetadata(
        name="OpenTenderClusteringS2S",
        dataset={
            "path": "clips/mteb-nl-opentender-clst-s2s-pr",
            "revision": "ad86cf1813d130e17dda0092d1c4f2c664e68d0c",
        },
        # The previous text was a verbatim copy of the NOS news-articles
        # description and did not describe this dataset.
        # NOTE(review): wording follows the OpenTender classification task --
        # confirm against the dataset card.
        description="This dataset contains Belgian and Dutch tender calls from OpenTender in Dutch",
        reference="https://arxiv.org/abs/2509.12340",
        type="Clustering",
        category="t2c",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="v_measure",
        date=("2025-08-01", "2025-08-10"),
        domains=["Government", "Written"],
        task_subtypes=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@misc{banar2025mtebnle5nlembeddingbenchmark,
archiveprefix = {arXiv},
author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
eprint = {2509.12340},
primaryclass = {cs.CL},
title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
url = {https://arxiv.org/abs/2509.12340},
year = {2025},
}
""",
    )
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from mteb.abstasks.clustering import AbsTaskClustering
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class VABBClusteringP2P(AbsTaskClustering):
    """Clustering task over VABB-SHW titles joined with their abstracts."""

    # Embed every document (fraction of 1.0) instead of a subsample.
    max_fraction_of_documents_to_embed = 1.0

    metadata = TaskMetadata(
        name="VABBClusteringP2P",
        type="Clustering",
        category="t2c",
        modalities=["text"],
        description=(
            "This dataset contains the fourteenth edition of the Flemish Academic "
            "Bibliography for the Social Sciences and Humanities (VABB-SHW), a "
            "database of academic publications from the social sciences and "
            "humanities authored by researchers affiliated to Flemish universities "
            "(more information). Publications in the database are used as one of "
            "the parameters of the Flemish performance-based research funding "
            "system"
        ),
        reference="https://zenodo.org/records/14214806",
        dataset=dict(
            path="clips/mteb-nl-vabb-cls",
            revision="544acc2e46909eab2b49962b043a18b9c9772770",
        ),
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="v_measure",
        # NOTE(review): this range is identical to the one used by the NOS news
        # tasks while the cited dataset is from 2024 -- possibly copy-pasted; confirm.
        date=("2009-11-01", "2010-01-01"),
        domains=["Academic", "Written"],
        task_subtypes=[],
        dialect=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        sample_creation="found",
        bibtex_citation=r"""
@dataset{aspeslagh2024vabb,
author = {Aspeslagh, Pieter and Guns, Raf and Engels, Tim C. E.},
doi = {10.5281/zenodo.14214806},
publisher = {Zenodo},
title = {VABB-SHW: Dataset of Flemish Academic Bibliography for the Social Sciences and Humanities (edition 14)},
url = {https://doi.org/10.5281/zenodo.14214806},
year = {2024},
}
""",
    )

    def dataset_transform(self):
        """Add ``labels`` and ``sentences`` columns to every split.

        ``labels`` is copied from ``org_discipline``; ``sentences`` is the
        title and the abstract joined by a newline.
        """

        def _to_clustering_columns(example):
            return {
                "labels": example["org_discipline"],
                "sentences": f"{example['title']}\n{example['abstract']}",
            }

        for split_name in self.dataset:
            split_data = self.dataset[split_name]
            self.dataset[split_name] = split_data.map(_to_clustering_columns)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from mteb.abstasks.clustering import AbsTaskClustering
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class VABBClusteringS2S(AbsTaskClustering):
    """Clustering task over VABB-SHW publication titles."""

    # Embed every document (fraction of 1.0) instead of a subsample.
    max_fraction_of_documents_to_embed = 1.0

    metadata = TaskMetadata(
        name="VABBClusteringS2S",
        type="Clustering",
        category="t2c",
        modalities=["text"],
        description=(
            "This dataset contains the fourteenth edition of the Flemish Academic "
            "Bibliography for the Social Sciences and Humanities (VABB-SHW), a "
            "database of academic publications from the social sciences and "
            "humanities authored by researchers affiliated to Flemish universities "
            "(more information). Publications in the database are used as one of "
            "the parameters of the Flemish performance-based research funding "
            "system"
        ),
        reference="https://zenodo.org/records/14214806",
        dataset=dict(
            path="clips/mteb-nl-vabb-cls",
            revision="544acc2e46909eab2b49962b043a18b9c9772770",
        ),
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="v_measure",
        # NOTE(review): this range is identical to the one used by the NOS news
        # tasks while the cited dataset is from 2024 -- possibly copy-pasted; confirm.
        date=("2009-11-01", "2010-01-01"),
        domains=["Academic", "Written"],
        task_subtypes=[],
        dialect=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        sample_creation="found",
        bibtex_citation=r"""
@dataset{aspeslagh2024vabb,
author = {Aspeslagh, Pieter and Guns, Raf and Engels, Tim C. E.},
doi = {10.5281/zenodo.14214806},
publisher = {Zenodo},
title = {VABB-SHW: Dataset of Flemish Academic Bibliography for the Social Sciences and Humanities (edition 14)},
url = {https://doi.org/10.5281/zenodo.14214806},
year = {2024},
}
""",
    )

    def dataset_transform(self):
        """Rename ``title`` to ``sentences`` and add ``labels`` in every split.

        ``labels`` is copied from the ``org_discipline`` column.
        """

        def _add_labels(example):
            return {"labels": example["org_discipline"]}

        for split_name in self.dataset:
            # Rename first, then derive the label column, as two chained steps.
            self.dataset[split_name] = (
                self.dataset[split_name]
                .rename_columns({"title": "sentences"})
                .map(_add_labels)
            )
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
from .covid_disinformation_nl_multi_label_classification import (
|
|
2
|
+
CovidDisinformationNLMultiLabelClassification,
|
|
3
|
+
)
|
|
4
|
+
from .vabb_multi_label_classification import VABBMultiLabelClassification
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"CovidDisinformationNLMultiLabelClassification",
|
|
8
|
+
"VABBMultiLabelClassification",
|
|
9
|
+
]
|
mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
from mteb.abstasks.multilabel_classification import (
|
|
2
|
+
AbsTaskMultilabelClassification,
|
|
3
|
+
)
|
|
4
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class CovidDisinformationNLMultiLabelClassification(AbsTaskMultilabelClassification):
    """Multi-label classification task over the Dutch COVID-19 disinformation data.

    Each example's label set is built from the ``q2_label`` .. ``q7_label``
    columns; only examples whose ``q1_label`` is ``"yes"`` are kept.
    """

    metadata = TaskMetadata(
        name="CovidDisinformationNLMultiLabelClassification",
        dataset={
            "path": "clips/mteb-nl-COVID-19-disinformation",
            "revision": "7ad922bdef875db1f530847c6ffff05fc154f2e8",
        },
        description="The dataset is curated to address questions of interest to journalists, fact-checkers, "
        "social media platforms, policymakers, and the general public.",
        reference="https://aclanthology.org/2021.findings-emnlp.56.pdf",
        type="MultilabelClassification",
        category="t2c",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="f1",
        date=("2020-01-01", "2021-04-01"),
        domains=["Web", "Social", "Written"],
        task_subtypes=[],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@inproceedings{alam-etal-2021-fighting-covid,
address = {Punta Cana, Dominican Republic},
author = {Alam, Firoj and
Shaar, Shaden and
Dalvi, Fahim and
Sajjad, Hassan and
Nikolov, Alex and
Mubarak, Hamdy and
Da San Martino, Giovanni and
Abdelali, Ahmed and
Durrani, Nadir and
Darwish, Kareem and
Al-Homaid, Abdulaziz and
Zaghouani, Wajdi and
Caselli, Tommaso and
Danoe, Gijs and
Stolk, Friso and
Bruntink, Britt and
Nakov, Preslav},
booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2021},
doi = {10.18653/v1/2021.findings-emnlp.56},
editor = {Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau},
month = nov,
pages = {611--649},
publisher = {Association for Computational Linguistics},
title = {Fighting the {COVID}-19 Infodemic: Modeling the Perspective of Journalists, Fact-Checkers, Social Media Platforms, Policy Makers, and the Society},
url = {https://aclanthology.org/2021.findings-emnlp.56/},
year = {2021},
}
""",
    )

    def dataset_transform(self) -> None:
        """Filter every split on ``q1_label`` and build the ``label`` column.

        For each kept example, ``label`` becomes the list of positions (within
        the ``q2_label`` .. ``q7_label`` sequence) whose value is ``"yes"``.
        """
        label_columns = [
            "q2_label",
            "q3_label",
            "q4_label",
            "q5_label",
            "q6_label",
            "q7_label",
        ]

        def map_labels(example):
            # The index of a column in ``label_columns`` is its class id.
            return {
                "label": [
                    idx
                    for idx, column in enumerate(label_columns)
                    if example[column] == "yes"
                ]
            }

        for split in self.dataset:
            # Keep only examples answered "yes" on q1 before deriving labels.
            self.dataset[split] = self.dataset[split].filter(
                lambda ex: ex["q1_label"] == "yes"
            )
            self.dataset[split] = self.dataset[split].map(map_labels)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from mteb.abstasks.multilabel_classification import (
|
|
2
|
+
AbsTaskMultilabelClassification,
|
|
3
|
+
)
|
|
4
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class VABBMultiLabelClassification(AbsTaskMultilabelClassification):
    """Multi-label classification task over the VABB-SHW bibliography data."""

    samples_per_label = 128

    metadata = TaskMetadata(
        name="VABBMultiLabelClassification",
        type="MultilabelClassification",
        category="t2c",
        modalities=["text"],
        description=(
            "This dataset contains the fourteenth edition of the Flemish Academic "
            "Bibliography for the Social Sciences and Humanities (VABB-SHW), a "
            "database of academic publications from the social sciences and "
            "humanities authored by researchers affiliated to Flemish universities "
            "(more information). Publications in the database are used as one of "
            "the parameters of the Flemish performance-based research funding "
            "system"
        ),
        reference="https://zenodo.org/records/14214806",
        dataset=dict(
            path="clips/mteb-nl-vabb-mlcls-pr",
            revision="584c70f5104671772119f21e9f8a3c912ac07d4a",
        ),
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="f1",
        # NOTE(review): date range and license match the COVID disinformation
        # task rather than the other VABB tasks -- possibly copy-pasted; confirm.
        date=("2020-01-01", "2021-04-01"),
        domains=["Academic", "Written"],
        task_subtypes=[],
        dialect=[],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        sample_creation="found",
        bibtex_citation=r"""
@dataset{aspeslagh2024vabb,
author = {Aspeslagh, Pieter and Guns, Raf and Engels, Tim C. E.},
doi = {10.5281/zenodo.14214806},
publisher = {Zenodo},
title = {VABB-SHW: Dataset of Flemish Academic Bibliography for the Social Sciences and Humanities (edition 14)},
url = {https://doi.org/10.5281/zenodo.14214806},
year = {2024},
}
""",
    )
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from mteb.abstasks.pair_classification import AbsTaskPairClassification
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class SICKNLPairClassification(AbsTaskPairClassification):
    """Pair classification task over SICK-NL."""

    metadata = TaskMetadata(
        name="SICKNLPairClassification",
        type="PairClassification",
        category="t2t",
        modalities=["text"],
        # The trailing space is part of the published description string.
        description="SICK-NL is a Dutch translation of SICK ",
        reference="https://aclanthology.org/2021.eacl-main.126/",
        dataset=dict(
            path="clips/mteb-nl-sick-pcls-pr",
            revision="a13a1892bcb4c077dc416d390389223eea5f20f0",
        ),
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="max_ap",
        date=("2020-09-01", "2021-01-01"),
        domains=["Web", "Written"],
        task_subtypes=[],
        dialect=[],
        license="mit",
        annotations_creators="human-annotated",
        sample_creation="machine-translated and verified",
        bibtex_citation=r"""
@inproceedings{wijnholds2021sick,
author = {Wijnholds, Gijs and Moortgat, Michael},
booktitle = {Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume},
pages = {1474--1479},
title = {SICK-NL: A Dataset for Dutch Natural Language Inference},
year = {2021},
}
""",
    )
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from mteb.abstasks.pair_classification import AbsTaskPairClassification
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class XLWICNLPairClassification(AbsTaskPairClassification):
    """Pair classification task over the Dutch XL-WiC data."""

    metadata = TaskMetadata(
        name="XLWICNLPairClassification",
        type="PairClassification",
        category="t2t",
        modalities=["text"],
        # The trailing space is part of the published description string.
        description=(
            "The Word-in-Context dataset (WiC) addresses the dependence on sense "
            "inventories by reformulating the standard disambiguation task as a "
            "binary classification problem; but, it is limited to the English "
            "language. We put forward a large multilingual benchmark, XL-WiC, "
            "featuring gold standards in 12 new languages from varied language "
            "families and with different degrees of resource availability, opening "
            "room for evaluation scenarios such as zero-shot cross-lingual "
            "transfer. "
        ),
        reference="https://aclanthology.org/2020.emnlp-main.584.pdf",
        dataset=dict(
            path="clips/mteb-nl-xlwic",
            revision="0b33ce358b1b5d500ff3715ba3d777b4d2c21cb0",
        ),
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="max_ap",
        date=("2019-10-04", "2019-10-04"),
        domains=["Written"],
        task_subtypes=[],
        dialect=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        sample_creation="created",
        bibtex_citation=r"""
@inproceedings{raganato2020xl,
author = {Raganato, A and Pasini, T and Camacho-Collados, J and Pilehvar, M and others},
booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
organization = {Association for Computational Linguistics (ACL)},
pages = {7193--7206},
title = {XL-WiC: A multilingual benchmark for evaluating semantic contextualization},
year = {2020},
}
""",
    )
|