mteb 2.0.5__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +10 -1
- mteb/_create_dataloaders.py +2 -0
- mteb/abstasks/_stratification.py +1 -1
- mteb/abstasks/abstask.py +6 -1
- mteb/abstasks/dataset_card_template.md +1 -1
- mteb/abstasks/retrieval.py +2 -1
- mteb/abstasks/retrieval_dataset_loaders.py +1 -1
- mteb/abstasks/task_metadata.py +1 -1
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +82 -11
- mteb/benchmarks/get_benchmark.py +1 -1
- mteb/descriptive_stats/Classification/DutchColaClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json +90 -0
- mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json +54 -0
- mteb/descriptive_stats/Classification/IconclassClassification.json +96 -0
- mteb/descriptive_stats/Classification/OpenTenderClassification.json +222 -0
- mteb/descriptive_stats/Classification/VaccinChatNLClassification.json +1068 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringP2P.json +45 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringS2S.json +45 -0
- mteb/descriptive_stats/Clustering/IconclassClusteringS2S.json +48 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringP2P.json +111 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringS2S.json +111 -0
- mteb/descriptive_stats/Clustering/VABBClusteringP2P.json +60 -0
- mteb/descriptive_stats/Clustering/VABBClusteringS2S.json +60 -0
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json +243 -153
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XM3600T2IRetrieval.json +999 -629
- mteb/descriptive_stats/Image/Any2AnyRetrieval/OVENIT2TRetrieval.json +33 -17
- mteb/descriptive_stats/Image/DocumentUnderstanding/MIRACLVisionRetrieval.json +574 -0
- mteb/descriptive_stats/MultilabelClassification/CovidDisinformationNLMultiLabelClassification.json +84 -0
- mteb/descriptive_stats/MultilabelClassification/VABBMultiLabelClassification.json +156 -0
- mteb/descriptive_stats/PairClassification/SICKNLPairClassification.json +35 -0
- mteb/descriptive_stats/PairClassification/XLWICNLPairClassification.json +35 -0
- mteb/descriptive_stats/Retrieval/ClimateFEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/DBPediaHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/DutchNewsArticlesRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/FEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/HotpotQAHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/LegalQANLRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/OpenTenderRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/QuoraRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/RiaNewsRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/VABBRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json +184 -0
- mteb/descriptive_stats/Retrieval/bBSARDNLRetrieval.json +30 -0
- mteb/descriptive_stats/STS/SICK-NL-STS.json +28 -0
- mteb/languages/check_language_code.py +11 -3
- mteb/languages/language_scripts.py +4 -0
- mteb/leaderboard/text_segments.py +1 -1
- mteb/models/model_implementations/b1ade_models.py +1 -1
- mteb/models/model_implementations/bge_models.py +1 -3
- mteb/models/model_implementations/bmretriever_models.py +1 -1
- mteb/models/model_implementations/gme_v_models.py +2 -2
- mteb/models/model_implementations/ibm_granite_models.py +1 -1
- mteb/models/model_implementations/inf_models.py +3 -3
- mteb/models/model_implementations/jina_models.py +12 -2
- mteb/models/model_implementations/llm2vec_models.py +1 -1
- mteb/models/model_implementations/misc_models.py +2 -2
- mteb/models/model_implementations/mxbai_models.py +1 -1
- mteb/models/model_implementations/salesforce_models.py +1 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +9 -9
- mteb/results/task_result.py +6 -8
- mteb/tasks/classification/dan/angry_tweets_classification.py +2 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +3 -3
- mteb/tasks/classification/mya/myanmar_news.py +2 -2
- mteb/tasks/classification/nld/__init__.py +16 -0
- mteb/tasks/classification/nld/dutch_cola_classification.py +38 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +37 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +30 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +36 -0
- mteb/tasks/classification/nld/iconclass_classification.py +41 -0
- mteb/tasks/classification/nld/open_tender_classification.py +38 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +46 -0
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/clustering/__init__.py +1 -0
- mteb/tasks/clustering/nld/__init__.py +17 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +37 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +37 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +47 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +51 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +41 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +51 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +51 -0
- mteb/tasks/multilabel_classification/__init__.py +1 -0
- mteb/tasks/multilabel_classification/nld/__init__.py +9 -0
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +88 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +44 -0
- mteb/tasks/pair_classification/__init__.py +1 -0
- mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +9 -8
- mteb/tasks/pair_classification/nld/__init__.py +7 -0
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +36 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +41 -0
- mteb/tasks/retrieval/code/code_rag.py +8 -8
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +18 -4
- mteb/tasks/retrieval/eng/climate_fever_retrieval.py +68 -77
- mteb/tasks/retrieval/eng/dbpedia_retrieval.py +55 -50
- mteb/tasks/retrieval/eng/fever_retrieval.py +62 -67
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py +57 -67
- mteb/tasks/retrieval/eng/legal_summarization_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +0 -3
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/eng/oven_it2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/quora_retrieval.py +51 -46
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/miracl_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +2 -9
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +6 -5
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +3 -4
- mteb/tasks/retrieval/nld/__init__.py +10 -0
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +41 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +30 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +39 -0
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +38 -0
- mteb/tasks/retrieval/nld/vabb_retrieval.py +41 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/rus/__init__.py +11 -2
- mteb/tasks/retrieval/rus/ria_news_retrieval.py +48 -44
- mteb/tasks/retrieval/tur/tur_hist_quad.py +2 -2
- mteb/tasks/sts/__init__.py +1 -0
- mteb/tasks/sts/nld/__init__.py +5 -0
- mteb/tasks/sts/nld/sick_nl_sts.py +41 -0
- mteb-2.1.1.dist-info/METADATA +253 -0
- {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/RECORD +142 -95
- mteb/descriptive_stats/Classification/PersianTextTone.json +0 -56
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json +0 -37
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json +0 -25
- mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json +0 -220
- mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json +0 -402
- mteb/descriptive_stats/Reranking/InstructIR.json +0 -31
- mteb-2.0.5.dist-info/METADATA +0 -455
- {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/WHEEL +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/entry_points.txt +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/top_level.txt +0 -0
|
@@ -9,7 +9,7 @@ class MyanmarNews(AbsTaskClassification):
|
|
|
9
9
|
"path": "mteb/MyanmarNews",
|
|
10
10
|
"revision": "644419f24bc820bbf8af24e0b4714a069812e0a3",
|
|
11
11
|
},
|
|
12
|
-
description="The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4
|
|
12
|
+
description="The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categories, providing a rich resource for natural language processing applications involving Burmese which is a low resource language.",
|
|
13
13
|
reference="https://huggingface.co/datasets/myanmar_news",
|
|
14
14
|
type="Classification",
|
|
15
15
|
category="t2c",
|
|
@@ -45,7 +45,7 @@ class MyanmarNewsV2(AbsTaskClassification):
|
|
|
45
45
|
"path": "mteb/myanmar_news",
|
|
46
46
|
"revision": "475b43ffbdb5138ad67a01a2c860bc7db502f3c5",
|
|
47
47
|
},
|
|
48
|
-
description="""The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4
|
|
48
|
+
description="""The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categories, providing a rich resource for natural language processing applications involving Burmese which is a low resource language.
|
|
49
49
|
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
50
50
|
reference="https://huggingface.co/datasets/myanmar_news",
|
|
51
51
|
type="Classification",
|
|
@@ -2,8 +2,24 @@ from .dutch_book_review_sentiment_classification import (
|
|
|
2
2
|
DutchBookReviewSentimentClassification,
|
|
3
3
|
DutchBookReviewSentimentClassificationV2,
|
|
4
4
|
)
|
|
5
|
+
from .dutch_cola_classification import DutchColaClassification
|
|
6
|
+
from .dutch_government_bias_classification import DutchGovernmentBiasClassification
|
|
7
|
+
from .dutch_news_articles_classification import DutchNewsArticlesClassification
|
|
8
|
+
from .dutch_sarcastic_headlines_classification import (
|
|
9
|
+
DutchSarcasticHeadlinesClassification,
|
|
10
|
+
)
|
|
11
|
+
from .iconclass_classification import IconclassClassification
|
|
12
|
+
from .open_tender_classification import OpenTenderClassification
|
|
13
|
+
from .vaccin_chat_nl_classification import VaccinChatNLClassification
|
|
5
14
|
|
|
6
15
|
__all__ = [
|
|
7
16
|
"DutchBookReviewSentimentClassification",
|
|
8
17
|
"DutchBookReviewSentimentClassificationV2",
|
|
18
|
+
"DutchColaClassification",
|
|
19
|
+
"DutchGovernmentBiasClassification",
|
|
20
|
+
"DutchNewsArticlesClassification",
|
|
21
|
+
"DutchSarcasticHeadlinesClassification",
|
|
22
|
+
"IconclassClassification",
|
|
23
|
+
"OpenTenderClassification",
|
|
24
|
+
"VaccinChatNLClassification",
|
|
9
25
|
]
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from mteb.abstasks.classification import AbsTaskClassification
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class DutchColaClassification(AbsTaskClassification):
    """Classification task over the Dutch CoLA linguistic-acceptability corpus."""

    # Class-level override consumed by the base class.
    # NOTE(review): presumably the number of training samples drawn per
    # label -- confirm against AbsTaskClassification.
    samples_per_label = 128

    metadata = TaskMetadata(
        # --- identity -------------------------------------------------------
        name="DutchColaClassification",
        type="Classification",
        category="t2c",
        modalities=["text"],
        task_subtypes=["Linguistic acceptability"],
        description="Dutch CoLA is a corpus of linguistic acceptability for Dutch.",
        reference="https://huggingface.co/datasets/GroNLP/dutch-cola",
        # --- data source (pinned to a fixed revision) -----------------------
        dataset=dict(
            path="clips/mteb-nl-dutch-cola",
            revision="2269ed7d95d8abaab829f1592b4b2047372e9f81",
        ),
        # --- evaluation setup -----------------------------------------------
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="f1",
        # --- provenance -----------------------------------------------------
        date=("2024-03-01", "2024-05-01"),
        domains=["Written"],
        dialect=[],
        license="not specified",  # the source lists the license as unknown
        annotations_creators="expert-annotated",
        sample_creation="found",
        bibtex_citation=r"""
@misc{gronlp_2024,
author = {Bylinina, Lisa and Abdi, Silvana and Brouwer, Hylke and Elzinga, Martine and Gunput, Shenza and Huisman, Sem and Krooneman, Collin and Poot, David and Top, Jelmer and Weideman, Cain},
doi = { 10.57967/hf/3825 },
publisher = { Hugging Face },
title = { {Dutch-CoLA (Revision 5a4196c)} },
url = { https://huggingface.co/datasets/GroNLP/dutch-cola },
year = {2024},
}
""",
    )
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from mteb.abstasks.classification import AbsTaskClassification
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class DutchGovernmentBiasClassification(AbsTaskClassification):
    """Bias classification on Dutch government documents (DGDB dataset)."""

    # Class-level override consumed by the base class.
    # NOTE(review): presumably the number of training samples drawn per
    # label -- confirm against AbsTaskClassification.
    samples_per_label = 32

    metadata = TaskMetadata(
        # --- identity -------------------------------------------------------
        name="DutchGovernmentBiasClassification",
        type="Classification",
        category="t2c",
        modalities=["text"],
        task_subtypes=[],
        description=(
            "The Dutch Government Data for Bias Detection (DGDB) is a dataset "
            "sourced from the Dutch House of Representatives and annotated "
            "for bias by experts"
        ),
        reference="https://dl.acm.org/doi/pdf/10.1145/3696410.3714526",
        # --- data source (pinned to a fixed revision) -----------------------
        dataset=dict(
            path="clips/mteb-nl-dutch-government-bias-detection",
            revision="bf5e20ee2d3ce2e24e4de50f5dd8573e0e0e2fec",
        ),
        # --- evaluation setup -----------------------------------------------
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="f1",
        # --- provenance -----------------------------------------------------
        date=("2019-10-04", "2019-10-04"),
        domains=["Written", "Government"],
        dialect=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="expert-annotated",
        sample_creation="found",
        bibtex_citation=r"""
@inproceedings{de2025detecting,
author = {de Swart, Milena and Den Hengst, Floris and Chen, Jieying},
booktitle = {Proceedings of the ACM on Web Conference 2025},
pages = {5034--5044},
title = {Detecting Linguistic Bias in Government Documents Using Large language Models},
year = {2025},
}
""",
    )
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from mteb.abstasks.classification import AbsTaskClassification
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class DutchNewsArticlesClassification(AbsTaskClassification):
    """Topic classification of Dutch news articles scraped from the NOS website."""

    metadata = TaskMetadata(
        # --- identity -------------------------------------------------------
        name="DutchNewsArticlesClassification",
        type="Classification",
        category="t2c",
        modalities=["text"],
        task_subtypes=["Topic classification"],
        description=(
            "This dataset contains all the articles published by the NOS as of the 1st of "
            "January 2010. The data is obtained by scraping the NOS website. The NOS is one "
            "of the biggest (online) news organizations in the Netherlands."
        ),
        reference="https://www.kaggle.com/datasets/maxscheijen/dutch-news-articles",
        # --- data source (pinned to a fixed revision) -----------------------
        dataset=dict(
            path="clips/mteb-nl-news-articles-cls",
            revision="0a7227d31f85c5676be92767f8df5405ea93de54",
        ),
        # --- evaluation setup -----------------------------------------------
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="f1",
        # --- provenance -----------------------------------------------------
        date=("2009-11-01", "2010-01-01"),
        domains=["Written", "News"],
        dialect=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        sample_creation="found",
        bibtex_citation="",  # no citation is provided for this dataset
    )
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from mteb.abstasks.classification import AbsTaskClassification
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class DutchSarcasticHeadlinesClassification(AbsTaskClassification):
    """Sarcasm classification of Dutch news headlines (Speld.nl vs. Nu.nl)."""

    metadata = TaskMetadata(
        # --- identity -------------------------------------------------------
        name="DutchSarcasticHeadlinesClassification",
        type="Classification",
        category="t2c",
        modalities=["text"],
        task_subtypes=[],
        description=(
            "This dataset contains news headlines of two Dutch news websites. "
            "All sarcastic headlines were collected from the Speld.nl "
            "(the Dutch equivalent of The Onion) whereas all 'normal' headlines were "
            "collected from the news website Nu.nl."
        ),
        reference="https://www.kaggle.com/datasets/harrotuin/dutch-news-headlines",
        # --- data source (pinned to a fixed revision) -----------------------
        dataset=dict(
            path="clips/mteb-nl-sarcastic-headlines",
            revision="7e520e36394795859583f84f81fcb97de915d05a",
        ),
        # --- evaluation setup -----------------------------------------------
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="f1",
        # --- provenance -----------------------------------------------------
        date=("2019-01-01", "2020-01-01"),
        domains=["News", "Written", "Fiction"],
        dialect=[],
        license="cc0-1.0",
        annotations_creators="derived",
        sample_creation="found",
        bibtex_citation="",  # no citation is provided for this dataset
    )

    def dataset_transform(self):
        """Rename the raw columns in every split: headline -> text, is_sarcastic -> label."""
        column_mapping = {"headline": "text", "is_sarcastic": "label"}
        for split_name in self.dataset:
            split_data = self.dataset[split_name]
            self.dataset[split_name] = split_data.rename_columns(column_mapping)
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from mteb.abstasks.classification import AbsTaskClassification
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class IconclassClassification(AbsTaskClassification):
    """Classification of Dutch texts into the first layer of Iconclass."""

    # Class-level override consumed by the base class.
    # NOTE(review): presumably the number of training samples drawn per
    # label -- confirm against AbsTaskClassification.
    samples_per_label = 32

    metadata = TaskMetadata(
        # --- identity -------------------------------------------------------
        name="IconclassClassification",
        type="Classification",
        category="t2c",
        modalities=["text"],
        task_subtypes=[],
        description=(
            "Iconclass is an iconographic thesaurus, which is widely used in the digital "
            "heritage domain to describe subjects depicted in artworks. "
            "The task is to classify the first layer of Iconclass"
        ),
        reference="https://dl.acm.org/doi/pdf/10.1145/3575865",
        # --- data source (pinned to a fixed revision) -----------------------
        dataset=dict(
            path="clips/mteb-nl-iconclass-cls",
            revision="1cd02f1579dab39fedc95de8cc15fd620557a9f2",
        ),
        # --- evaluation setup -----------------------------------------------
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="f1",
        # --- provenance -----------------------------------------------------
        date=("2020-01-01", "2020-05-01"),
        domains=["Written", "Fiction"],
        dialect=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="expert-annotated",
        sample_creation="found",
        bibtex_citation=r"""
@article{banar2023transfer,
author = {Banar, Nikolay and Daelemans, Walter and Kestemont, Mike},
journal = {ACM Journal on Computing and Cultural Heritage},
number = {2},
pages = {1--16},
publisher = {ACM New York, NY},
title = {Transfer learning for the visual arts: The multi-modal retrieval of iconclass codes},
volume = {16},
year = {2023},
}
""",
    )
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from mteb.abstasks.classification import AbsTaskClassification
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class OpenTenderClassification(AbsTaskClassification):
    """Classification of Dutch-language tender calls from OpenTender."""

    metadata = TaskMetadata(
        # --- identity -------------------------------------------------------
        name="OpenTenderClassification",
        type="Classification",
        category="t2c",
        modalities=["text"],
        task_subtypes=[],
        description="This dataset contains Belgian and Dutch tender calls from OpenTender in Dutch",
        reference="https://arxiv.org/abs/2509.12340",
        # --- data source (pinned to a fixed revision) -----------------------
        dataset=dict(
            path="clips/mteb-nl-opentender-cls-pr",
            revision="9af5657575a669dc18c7f897a67287ff7d1a0c65",
        ),
        # --- evaluation setup -----------------------------------------------
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="f1",
        # --- provenance -----------------------------------------------------
        date=("2025-08-01", "2025-08-10"),
        domains=["Government", "Written"],
        dialect=[],
        license="cc-by-4.0",
        annotations_creators="human-annotated",
        sample_creation="found",
        bibtex_citation=r"""
@misc{banar2025mtebnle5nlembeddingbenchmark,
archiveprefix = {arXiv},
author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
eprint = {2509.12340},
primaryclass = {cs.CL},
title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
url = {https://arxiv.org/abs/2509.12340},
year = {2025},
}
""",
    )
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from mteb.abstasks.classification import AbsTaskClassification
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class VaccinChatNLClassification(AbsTaskClassification):
    """Classification over VaccinChatNL, a Flemish Dutch COVID-19 vaccination FAQ dataset."""

    metadata = TaskMetadata(
        # --- identity -------------------------------------------------------
        name="VaccinChatNLClassification",
        type="Classification",
        category="t2c",
        modalities=["text"],
        task_subtypes=[],
        description="VaccinChatNL is a Flemish Dutch FAQ dataset on the topic of COVID-19 vaccinations in Flanders.",
        reference="https://huggingface.co/datasets/clips/VaccinChatNL",
        # --- data source (pinned to a fixed revision) -----------------------
        dataset=dict(
            path="clips/VaccinChatNL",
            revision="bd27d0058bea2ad52470d9072a3b5da6b97c1ac3",
        ),
        # --- evaluation setup -----------------------------------------------
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="f1",
        # --- provenance -----------------------------------------------------
        date=("2022-01-01", "2022-09-01"),
        domains=["Spoken", "Web"],
        dialect=[],
        license="cc-by-4.0",
        annotations_creators="expert-annotated",
        sample_creation="created",
        bibtex_citation=r"""
@inproceedings{buhmann-etal-2022-domain,
address = {Gyeongju, Republic of Korea},
author = {Buhmann, Jeska and De Bruyn, Maxime and Lotfi, Ehsan and Daelemans, Walter},
booktitle = {Proceedings of the 29th International Conference on Computational Linguistics},
month = oct,
pages = {3539--3549},
publisher = {International Committee on Computational Linguistics},
title = {Domain- and Task-Adaptation for {V}accin{C}hat{NL}, a {D}utch {COVID}-19 {FAQ} Answering Corpus and Classification Model},
url = {https://aclanthology.org/2022.coling-1.312},
year = {2022},
}
""",
    )

    def dataset_transform(self):
        """Rename the raw `sentence1` column to `text` in every split."""
        column_mapping = {"sentence1": "text"}
        for split_name in self.dataset:
            split_data = self.dataset[split_name]
            self.dataset[split_name] = split_data.rename_columns(column_mapping)
|
|
@@ -5,7 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
|
|
|
5
5
|
class WongnaiReviewsClassification(AbsTaskClassification):
|
|
6
6
|
metadata = TaskMetadata(
|
|
7
7
|
name="WongnaiReviewsClassification",
|
|
8
|
-
description="Wongnai features over 200,000 restaurants, beauty salons, and spas across Thailand on its platform, with detailed information about each merchant and user reviews. In this dataset there are 5 classes
|
|
8
|
+
description="Wongnai features over 200,000 restaurants, beauty salons, and spas across Thailand on its platform, with detailed information about each merchant and user reviews. In this dataset there are 5 classes corresponding each star rating",
|
|
9
9
|
reference="https://github.com/wongnai/wongnai-corpus",
|
|
10
10
|
dataset={
|
|
11
11
|
"path": "Wongnai/wongnai_reviews",
|
|
@@ -10,7 +10,7 @@ class UkrFormalityClassification(AbsTaskClassification):
|
|
|
10
10
|
trainslating English GYAFC data.
|
|
11
11
|
English data source: https://aclanthology.org/N18-1012/
|
|
12
12
|
Translation into Ukrainian language using model: https://huggingface.co/facebook/nllb-200-distilled-600M
|
|
13
|
-
Additionally, the dataset was balanced,
|
|
13
|
+
Additionally, the dataset was balanced, with labels: 0 - informal, 1 - formal.
|
|
14
14
|
""",
|
|
15
15
|
dataset={
|
|
16
16
|
"path": "ukr-detect/ukr-formality-dataset-translated-gyafc",
|
|
@@ -61,7 +61,7 @@ class UkrFormalityClassificationV2(AbsTaskClassification):
|
|
|
61
61
|
trainslating English GYAFC data.
|
|
62
62
|
English data source: https://aclanthology.org/N18-1012/
|
|
63
63
|
Translation into Ukrainian language using model: https://huggingface.co/facebook/nllb-200-distilled-600M
|
|
64
|
-
Additionally, the dataset was balanced,
|
|
64
|
+
Additionally, the dataset was balanced, with labels: 0 - informal, 1 - formal.
|
|
65
65
|
|
|
66
66
|
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
67
67
|
dataset={
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from .dutch_news_articles_clustering_p2p import DutchNewsArticlesClusteringP2P
|
|
2
|
+
from .dutch_news_articles_clustering_s2s import DutchNewsArticlesClusteringS2S
|
|
3
|
+
from .iconclass_clustering_s2s import IconclassClusteringS2S
|
|
4
|
+
from .open_tender_clustering_p2p import OpenTenderClusteringP2P
|
|
5
|
+
from .open_tender_clustering_s2s import OpenTenderClusteringS2S
|
|
6
|
+
from .vabb_clustering_p2p import VABBClusteringP2P
|
|
7
|
+
from .vabb_clustering_s2s import VABBClusteringS2S
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"DutchNewsArticlesClusteringP2P",
|
|
11
|
+
"DutchNewsArticlesClusteringS2S",
|
|
12
|
+
"IconclassClusteringS2S",
|
|
13
|
+
"OpenTenderClusteringP2P",
|
|
14
|
+
"OpenTenderClusteringS2S",
|
|
15
|
+
"VABBClusteringP2P",
|
|
16
|
+
"VABBClusteringS2S",
|
|
17
|
+
]
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from mteb.abstasks.clustering import AbsTaskClustering
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class DutchNewsArticlesClusteringP2P(AbsTaskClustering):
    """Clustering of Dutch NOS news articles using the `text` column."""

    # Class-level override consumed by the base class.
    # NOTE(review): presumably 1.0 means no document down-sampling --
    # confirm against AbsTaskClustering.
    max_fraction_of_documents_to_embed = 1.0

    metadata = TaskMetadata(
        # --- identity -------------------------------------------------------
        name="DutchNewsArticlesClusteringP2P",
        type="Clustering",
        category="t2c",
        modalities=["text"],
        task_subtypes=[],
        description=(
            "This dataset contains all the articles published by the NOS as of the 1st of "
            "January 2010. The data is obtained by scraping the NOS website. The NOS is one "
            "of the biggest (online) news organizations in the Netherlands."
        ),
        reference="https://www.kaggle.com/datasets/maxscheijen/dutch-news-articles",
        # --- data source (pinned to a fixed revision) -----------------------
        dataset=dict(
            path="clips/mteb-nl-news-articles-cls",
            revision="0a7227d31f85c5676be92767f8df5405ea93de54",
        ),
        # --- evaluation setup -----------------------------------------------
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="v_measure",
        # --- provenance -----------------------------------------------------
        date=("2009-11-01", "2010-01-01"),
        domains=["Written", "News"],
        dialect=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        sample_creation="found",
        bibtex_citation="",  # no citation is provided for this dataset
    )

    def dataset_transform(self):
        """Rename columns in every split: label -> labels, text -> sentences."""
        column_mapping = {"label": "labels", "text": "sentences"}
        for split_name in self.dataset:
            split_data = self.dataset[split_name]
            self.dataset[split_name] = split_data.rename_columns(column_mapping)
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from mteb.abstasks.clustering import AbsTaskClustering
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class DutchNewsArticlesClusteringS2S(AbsTaskClustering):
    """Clustering of Dutch NOS news articles using the `title` column."""

    # Class-level override consumed by the base class.
    # NOTE(review): presumably 1.0 means no document down-sampling --
    # confirm against AbsTaskClustering.
    max_fraction_of_documents_to_embed = 1.0

    metadata = TaskMetadata(
        # --- identity -------------------------------------------------------
        name="DutchNewsArticlesClusteringS2S",
        type="Clustering",
        category="t2c",
        modalities=["text"],
        task_subtypes=[],
        description=(
            "This dataset contains all the articles published by the NOS as of the 1st of "
            "January 2010. The data is obtained by scraping the NOS website. The NOS is one "
            "of the biggest (online) news organizations in the Netherlands."
        ),
        reference="https://www.kaggle.com/datasets/maxscheijen/dutch-news-articles",
        # --- data source (pinned to a fixed revision) -----------------------
        dataset=dict(
            path="clips/mteb-nl-news-articles-cls",
            revision="0a7227d31f85c5676be92767f8df5405ea93de54",
        ),
        # --- evaluation setup -----------------------------------------------
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="v_measure",
        # --- provenance -----------------------------------------------------
        date=("2009-11-01", "2010-01-01"),
        domains=["Written", "News"],
        dialect=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        sample_creation="found",
        bibtex_citation="",  # no citation is provided for this dataset
    )

    def dataset_transform(self):
        """Rename columns in every split: label -> labels, title -> sentences."""
        column_mapping = {"label": "labels", "title": "sentences"}
        for split_name in self.dataset:
            split_data = self.dataset[split_name]
            self.dataset[split_name] = split_data.rename_columns(column_mapping)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from mteb.abstasks.clustering import AbsTaskClustering
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class IconclassClusteringS2S(AbsTaskClustering):
    """Clustering task built on the Dutch Iconclass dataset.

    ``dataset_transform`` derives the ``labels`` and ``sentences`` columns
    from the dataset's ``label`` and ``text`` columns.
    """

    # NOTE(review): presumably means "embed every document, no subsampling";
    # confirm against AbsTaskClustering.
    max_fraction_of_documents_to_embed = 1.0

    metadata = TaskMetadata(
        name="IconclassClusteringS2S",
        dataset={
            "path": "clips/mteb-nl-iconclass-cls",
            "revision": "1cd02f1579dab39fedc95de8cc15fd620557a9f2",
        },
        description=(
            "Iconclass is an iconographic thesaurus, which is widely used in the digital heritage domain "
            "to describe subjects depicted in artworks. "
            "The task is to classify the first layer of Iconclass"
        ),
        reference="https://dl.acm.org/doi/pdf/10.1145/3575865",
        type="Clustering",
        category="t2c",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="v_measure",
        # NOTE(review): this range is identical to the NOS news tasks and may
        # be a copy-paste leftover - confirm the real collection dates.
        date=("2009-11-01", "2010-01-01"),
        domains=["Written", "Fiction"],
        task_subtypes=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@article{banar2023transfer,
author = {Banar, Nikolay and Daelemans, Walter and Kestemont, Mike},
journal = {ACM Journal on Computing and Cultural Heritage},
number = {2},
pages = {1--16},
publisher = {ACM New York, NY},
title = {Transfer learning for the visual arts: The multi-modal retrieval of iconclass codes},
volume = {16},
year = {2023},
}
""",
    )

    def dataset_transform(self):
        """Populate ``labels`` and ``sentences`` for every split via ``map``."""

        def _to_clustering_columns(example):
            # Copy the source columns under the new names.
            return {"labels": example["label"], "sentences": example["text"]}

        for split_name in self.dataset:
            current_split = self.dataset[split_name]
            self.dataset[split_name] = current_split.map(_to_clustering_columns)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from mteb.abstasks.clustering import AbsTaskClustering
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class OpenTenderClusteringP2P(AbsTaskClustering):
    """Clustering task over OpenTender tender calls (Dutch).

    The task reuses the OpenTender classification dataset:
    ``dataset_transform`` derives the ``labels`` and ``sentences`` columns
    from that dataset's ``label`` and ``text`` columns.
    """

    # NOTE(review): presumably means "embed every document, no subsampling";
    # confirm against AbsTaskClustering.
    max_fraction_of_documents_to_embed = 1.0

    metadata = TaskMetadata(
        name="OpenTenderClusteringP2P",
        dataset={
            "path": "clips/mteb-nl-opentender-cls-pr",
            "revision": "9af5657575a669dc18c7f897a67287ff7d1a0c65",
        },
        # The description used to be a verbatim copy of the NOS news-articles
        # text, which does not describe this dataset.
        # NOTE(review): wording mirrors the sibling OpenTender classification
        # task - confirm against the dataset card.
        description="This dataset contains Belgian and Dutch tender calls from OpenTender in Dutch",
        reference="https://arxiv.org/abs/2509.12340",
        type="Clustering",
        category="t2c",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="v_measure",
        date=("2025-08-01", "2025-08-10"),
        domains=["Government", "Written"],
        task_subtypes=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@misc{banar2025mtebnle5nlembeddingbenchmark,
archiveprefix = {arXiv},
author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
eprint = {2509.12340},
primaryclass = {cs.CL},
title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
url = {https://arxiv.org/abs/2509.12340},
year = {2025},
}
""",
    )

    def dataset_transform(self):
        """Populate ``labels`` and ``sentences`` for every split via ``map``."""
        # reuse the dataset for classification
        for split in self.dataset:
            self.dataset[split] = self.dataset[split].map(
                lambda ex: {
                    "labels": ex["label"],
                    "sentences": ex["text"],
                }
            )
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from mteb.abstasks.clustering import AbsTaskClustering
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class OpenTenderClusteringS2S(AbsTaskClustering):
    """Clustering task over OpenTender tender calls (Dutch).

    No ``dataset_transform`` is defined here, so the dataset is used as
    loaded.
    """

    # NOTE(review): presumably means "embed every document, no subsampling";
    # confirm against AbsTaskClustering.
    max_fraction_of_documents_to_embed = 1.0

    metadata = TaskMetadata(
        name="OpenTenderClusteringS2S",
        dataset={
            "path": "clips/mteb-nl-opentender-clst-s2s-pr",
            "revision": "ad86cf1813d130e17dda0092d1c4f2c664e68d0c",
        },
        # The description used to be a verbatim copy of the NOS news-articles
        # text, which does not describe this dataset.
        # NOTE(review): wording mirrors the sibling OpenTender classification
        # task - confirm against the dataset card.
        description="This dataset contains Belgian and Dutch tender calls from OpenTender in Dutch",
        reference="https://arxiv.org/abs/2509.12340",
        type="Clustering",
        category="t2c",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="v_measure",
        date=("2025-08-01", "2025-08-10"),
        domains=["Government", "Written"],
        task_subtypes=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@misc{banar2025mtebnle5nlembeddingbenchmark,
archiveprefix = {arXiv},
author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
eprint = {2509.12340},
primaryclass = {cs.CL},
title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
url = {https://arxiv.org/abs/2509.12340},
year = {2025},
}
""",
    )
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from mteb.abstasks.clustering import AbsTaskClustering
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class VABBClusteringP2P(AbsTaskClustering):
    """Clustering task over the VABB-SHW academic bibliography.

    ``dataset_transform`` uses ``org_discipline`` as the cluster label and
    the title plus abstract (joined by a newline) as the text to embed.
    """

    # NOTE(review): presumably means "embed every document, no subsampling";
    # confirm against AbsTaskClustering.
    max_fraction_of_documents_to_embed = 1.0

    metadata = TaskMetadata(
        name="VABBClusteringP2P",
        dataset={
            "path": "clips/mteb-nl-vabb-cls",
            "revision": "544acc2e46909eab2b49962b043a18b9c9772770",
        },
        # NOTE(review): "(more information)" looks like a leftover link label
        # from the upstream page; kept verbatim here.
        description=(
            "This dataset contains the fourteenth edition of the Flemish Academic Bibliography "
            "for the Social Sciences and Humanities (VABB-SHW), a database of academic publications "
            "from the social sciences and humanities authored by researchers affiliated to Flemish "
            "universities (more information). Publications in the database are used as one of the "
            "parameters of the Flemish performance-based research funding system"
        ),
        reference="https://zenodo.org/records/14214806",
        type="Clustering",
        category="t2c",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["nld-Latn"],
        main_score="v_measure",
        # NOTE(review): this range is identical to the NOS news tasks and may
        # be a copy-paste leftover - confirm the real collection dates.
        date=("2009-11-01", "2010-01-01"),
        domains=["Academic", "Written"],
        task_subtypes=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@dataset{aspeslagh2024vabb,
author = {Aspeslagh, Pieter and Guns, Raf and Engels, Tim C. E.},
doi = {10.5281/zenodo.14214806},
publisher = {Zenodo},
title = {VABB-SHW: Dataset of Flemish Academic Bibliography for the Social Sciences and Humanities (edition 14)},
url = {https://doi.org/10.5281/zenodo.14214806},
year = {2024},
}
""",
    )

    def dataset_transform(self):
        """Populate ``labels`` and ``sentences`` for every split via ``map``."""

        def _to_clustering_columns(example):
            # Title first, abstract on the following line.
            title = example["title"]
            abstract = example["abstract"]
            return {
                "labels": example["org_discipline"],
                "sentences": f"{title}\n{abstract}",
            }

        for split_name in self.dataset:
            current_split = self.dataset[split_name]
            self.dataset[split_name] = current_split.map(_to_clustering_columns)
|