mteb 2.1.5__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (44)
  1. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  2. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  3. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  4. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  5. mteb/models/model_implementations/colqwen_models.py +57 -0
  6. mteb/models/model_implementations/kalm_models.py +159 -25
  7. mteb/models/model_implementations/rasgaard_models.py +27 -0
  8. mteb/models/model_implementations/tarka_models.py +1 -1
  9. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +5 -1
  10. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  11. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  12. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  13. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  14. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  15. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  16. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  17. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  18. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  19. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  20. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  21. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  22. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  23. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  24. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  25. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  26. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  27. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  28. mteb/tasks/retrieval/nld/__init__.py +8 -4
  29. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  30. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  31. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  32. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  33. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  34. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  35. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  36. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  37. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  38. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  39. {mteb-2.1.5.dist-info → mteb-2.1.7.dist-info}/METADATA +2 -2
  40. {mteb-2.1.5.dist-info → mteb-2.1.7.dist-info}/RECORD +44 -39
  41. {mteb-2.1.5.dist-info → mteb-2.1.7.dist-info}/WHEEL +0 -0
  42. {mteb-2.1.5.dist-info → mteb-2.1.7.dist-info}/entry_points.txt +0 -0
  43. {mteb-2.1.5.dist-info → mteb-2.1.7.dist-info}/licenses/LICENSE +0 -0
  44. {mteb-2.1.5.dist-info → mteb-2.1.7.dist-info}/top_level.txt +0 -0
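The hunks below add Dutch query prompts to the metadata of the Dutch (nld) tasks and introduce `.v2` variants of four BEIR-NL retrieval tasks. As a rough sketch that is not part of this diff, assuming mteb >= 2.1.7 is installed and using the standard `mteb.get_task` entry point, loading one of the new variants and inspecting its prompt could look like this:

import mteb

# Assumption: mteb >= 2.1.7 is installed, so the new ".v2" task is registered.
task = mteb.get_task("ArguAna-NL.v2")

print(task.metadata.name)          # "ArguAna-NL.v2"
print(task.metadata.prompt)        # {"query": "Gegeven een bewering, vind documenten die de bewering weerleggen"}
print(task.metadata.adapted_from)  # ["ArguAna-NL"]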
mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py
@@ -28,6 +28,9 @@ class DutchNewsArticlesClusteringP2P(AbsTaskClustering):
         dialect=[],
         sample_creation="found",
         bibtex_citation="",
+        prompt={
+            "query": "Identificeer de hoofdcategorie van nieuwsartikelen op basis van de titels en de inhoud"
+        },
     )
 
     def dataset_transform(self):
mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py
@@ -28,6 +28,9 @@ class DutchNewsArticlesClusteringS2S(AbsTaskClustering):
         dialect=[],
         sample_creation="found",
         bibtex_citation="",
+        prompt={
+            "query": "Identificeer de hoofdcategorie van nieuwsartikelen op basis van de titels"
+        },
     )
 
     def dataset_transform(self):
mteb/tasks/clustering/nld/iconclass_clustering_s2s.py
@@ -38,6 +38,9 @@ class IconclassClusteringS2S(AbsTaskClustering):
   year = {2023},
 }
 """,
+        prompt={
+            "query": "Identificeer het onderwerp of thema van kunstwerken op basis van de titels"
+        },
     )
 
     def dataset_transform(self):
mteb/tasks/clustering/nld/open_tender_clustering_p2p.py
@@ -38,6 +38,9 @@ class OpenTenderClusteringP2P(AbsTaskClustering):
   year = {2025},
 }
 """,
+        prompt={
+            "query": "Identificeer de hoofdcategorie van aanbestedingen op basis van de titels en beschrijvingen"
+        },
     )
 
     def dataset_transform(self):
mteb/tasks/clustering/nld/open_tender_clustering_s2s.py
@@ -38,4 +38,7 @@ class OpenTenderClusteringS2S(AbsTaskClustering):
   year = {2025},
 }
 """,
+        prompt={
+            "query": "Identificeer de hoofdcategorie van aanbestedingen op basis van de titels"
+        },
     )
mteb/tasks/clustering/nld/vabb_clustering_p2p.py
@@ -39,6 +39,9 @@ class VABBClusteringP2P(AbsTaskClustering):
   year = {2024},
 }
 """,
+        prompt={
+            "query": "Identificeer de hoofdcategorie van wetenschappelijke artikelen op basis van de titels en abstracts"
+        },
     )
 
     def dataset_transform(self):
mteb/tasks/clustering/nld/vabb_clustering_s2s.py
@@ -39,6 +39,9 @@ class VABBClusteringS2S(AbsTaskClustering):
   year = {2024},
 }
 """,
+        prompt={
+            "query": "Identificeer de hoofdcategorie van wetenschappelijke artikelen op basis van de titels"
+        },
     )
 
     def dataset_transform(self):
mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py
@@ -61,6 +61,9 @@ Yih, Scott Wen-tau},
   year = {2021},
 }
 """,
+        prompt={
+            "query": "Classificeer COVID-19-gerelateerde sociale media-berichten in alle toepasselijke desinformatiecategorieën"
+        },
     )
 
     def dataset_transform(self) -> None:
mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py
@@ -41,4 +41,7 @@ class VABBMultiLabelClassification(AbsTaskMultilabelClassification):
   year = {2024},
 }
 """,
+        prompt={
+            "query": "Classificeer de onderwerpen van een wetenschappelijk artikel op basis van de abstract"
+        },
     )
mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py
@@ -33,4 +33,7 @@ class SICKNLPairClassification(AbsTaskPairClassification):
   year = {2021},
 }
 """,
+        prompt={
+            "query": "Zoek tekst die semantisch vergelijkbaar is met de gegeven tekst."
+        },
     )
mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py
@@ -38,4 +38,7 @@ class XLWICNLPairClassification(AbsTaskPairClassification):
   year = {2020},
 }
 """,
+        prompt={
+            "query": "Zoek tekst die semantisch vergelijkbaar is met de gegeven tekst."
+        },
     )
mteb/tasks/retrieval/nld/__init__.py
@@ -1,4 +1,4 @@
-from .argu_ana_nl_retrieval import ArguAnaNL
+from .argu_ana_nl_retrieval import ArguAnaNL, ArguAnaNLv2
 from .bbsard_nl_retrieval import BBSARDNLRetrieval
 from .climate_fevernl_retrieval import ClimateFEVERNL
 from .cqa_dupstack_android_nl_retrieval import CQADupstackAndroidNLRetrieval
@@ -20,12 +20,12 @@ from .fi_qa2018_nl_retrieval import FiQA2018NL
 from .hotpot_qanl_retrieval import HotpotQANL
 from .legal_qa_nl_retrieval import LegalQANLRetrieval
 from .mmarconl_retrieval import MMMARCONL
-from .nf_corpus_nl_retrieval import NFCorpusNL
+from .nf_corpus_nl_retrieval import NFCorpusNL, NFCorpusNLv2
 from .nqnl_retrieval import NQNL
 from .open_tender_retrieval import OpenTenderRetrieval
 from .quora_nl_retrieval import QuoraNLRetrieval
-from .sci_fact_nl_retrieval import SciFactNL
-from .scidocsnl_retrieval import SCIDOCSNL
+from .sci_fact_nl_retrieval import SciFactNL, SciFactNLv2
+from .scidocsnl_retrieval import SCIDOCSNL, SCIDOCSNLv2
 from .touche2020_nl_retrieval import Touche2020NL
 from .treccovidnl_retrieval import TRECCOVIDNL
 from .vabb_retrieval import VABBRetrieval
@@ -37,6 +37,7 @@ __all__ = [
     "SCIDOCSNL",
     "TRECCOVIDNL",
     "ArguAnaNL",
+    "ArguAnaNLv2",
     "BBSARDNLRetrieval",
     "CQADupstackAndroidNLRetrieval",
     "CQADupstackEnglishNLRetrieval",
@@ -57,9 +58,12 @@ __all__ = [
     "HotpotQANL",
     "LegalQANLRetrieval",
     "NFCorpusNL",
+    "NFCorpusNLv2",
     "OpenTenderRetrieval",
     "QuoraNLRetrieval",
+    "SCIDOCSNLv2",
     "SciFactNL",
+    "SciFactNLv2",
     "Touche2020NL",
     "VABBRetrieval",
 ]
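The re-exported names above can be imported directly from the nld retrieval package. A minimal sketch, assuming mteb 2.1.7 is installed, that checks the new v2 classes sit alongside the originals and carry a query prompt:

from mteb.tasks.retrieval.nld import (
    ArguAnaNLv2,
    NFCorpusNLv2,
    SCIDOCSNLv2,
    SciFactNLv2,
)

# Each v2 task keeps the original dataset but adds a Dutch query prompt to its metadata.
for task_cls in (ArguAnaNLv2, NFCorpusNLv2, SCIDOCSNLv2, SciFactNLv2):
    print(task_cls.metadata.name, "->", task_cls.metadata.prompt["query"])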
mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py
@@ -1,33 +1,26 @@
 from mteb.abstasks.retrieval import AbsTaskRetrieval
 from mteb.abstasks.task_metadata import TaskMetadata
 
-
-class ArguAnaNL(AbsTaskRetrieval):
-    ignore_identical_ids = True
-
-    metadata = TaskMetadata(
-        name="ArguAna-NL",
-        description="ArguAna involves the task of retrieval of the best counterargument to an argument. ArguAna-NL is "
-        "a Dutch translation.",
-        reference="https://huggingface.co/datasets/clips/beir-nl-arguana",
-        dataset={
-            "path": "clips/beir-nl-arguana",
-            "revision": "4cd085d148fe2cac923bb7758d6ef585926170ba",
-        },
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["nld-Latn"],
-        main_score="ndcg_at_10",
-        date=("2016-03-01", "2016-03-01"),  # best guess: based on publication date
-        domains=["Written", "Non-fiction"],
-        task_subtypes=[],
-        license="cc-by-sa-4.0",
-        annotations_creators="derived",
-        dialect=[],
-        sample_creation="machine-translated and verified",  # manually checked a small subset
-        bibtex_citation=r"""
+_argu_ana_nl_metadata = dict(
+    reference="https://huggingface.co/datasets/clips/beir-nl-arguana",
+    dataset={
+        "path": "clips/beir-nl-arguana",
+        "revision": "4cd085d148fe2cac923bb7758d6ef585926170ba",
+    },
+    type="Retrieval",
+    category="t2t",
+    modalities=["text"],
+    eval_splits=["test"],
+    eval_langs=["nld-Latn"],
+    main_score="ndcg_at_10",
+    date=("2016-03-01", "2016-03-01"),  # best guess: based on publication date
+    domains=["Written", "Non-fiction"],
+    task_subtypes=[],
+    license="cc-by-sa-4.0",
+    annotations_creators="derived",
+    dialect=[],
+    sample_creation="machine-translated and verified",  # manually checked a small subset
+    bibtex_citation=r"""
 @misc{banar2024beirnlzeroshotinformationretrieval,
   archiveprefix = {arXiv},
   author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans},
@@ -38,5 +31,31 @@ class ArguAnaNL(AbsTaskRetrieval):
   year = {2024},
 }
 """,
+)
+
+
+class ArguAnaNL(AbsTaskRetrieval):
+    ignore_identical_ids = True
+
+    metadata = TaskMetadata(
+        name="ArguAna-NL",
+        description="ArguAna involves the task of retrieval of the best counterargument to an argument. ArguAna-NL is "
+        "a Dutch translation.",
         adapted_from=["ArguAna"],
+        **_argu_ana_nl_metadata,
+    )
+
+
+class ArguAnaNLv2(AbsTaskRetrieval):
+    ignore_identical_ids = True
+
+    metadata = TaskMetadata(
+        name="ArguAna-NL.v2",
+        description="ArguAna involves the task of retrieval of the best counterargument to an argument. ArguAna-NL is "
+        "a Dutch translation. This version adds a Dutch prompt to the dataset.",
+        prompt={
+            "query": "Gegeven een bewering, vind documenten die de bewering weerleggen"
+        },
+        adapted_from=["ArguAna-NL"],
+        **_argu_ana_nl_metadata,
     )
mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py
@@ -38,4 +38,7 @@ class BBSARDNLRetrieval(AbsTaskRetrieval):
   year = {2025},
 }
 """,
+        prompt={
+            "query": "Gegeven een juridische vraag, haal documenten op die kunnen helpen bij het beantwoorden van de vraag"
+        },
     )
mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py
@@ -27,4 +27,7 @@ class DutchNewsArticlesRetrieval(AbsTaskRetrieval):
         dialect=[],
         sample_creation="found",
         bibtex_citation="",
+        prompt={
+            "query": "Gegeven een titel, haal het nieuwsartikel op dat het beste bij de titel past"
+        },
     )
mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py
@@ -36,4 +36,7 @@ class LegalQANLRetrieval(AbsTaskRetrieval):
   year = {2024},
 }
 """,
+        prompt={
+            "query": "Gegeven een juridische vraag, haal documenten op die kunnen helpen bij het beantwoorden van de vraag"
+        },
     )
mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py
@@ -1,31 +1,26 @@
 from mteb.abstasks.retrieval import AbsTaskRetrieval
 from mteb.abstasks.task_metadata import TaskMetadata
 
-
-class NFCorpusNL(AbsTaskRetrieval):
-    metadata = TaskMetadata(
-        name="NFCorpus-NL",
-        dataset={
-            "path": "clips/beir-nl-nfcorpus",
-            "revision": "942953e674fd0f619ff89897abb806dc3df5dd39",
-        },
-        description="NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval. NFCorpus-NL is "
-        "a Dutch translation.",
-        reference="https://huggingface.co/datasets/clips/beir-nl-nfcorpus",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["nld-Latn"],
-        main_score="ndcg_at_10",
-        date=("2016-03-01", "2016-03-01"),  # best guess: based on publication date
-        domains=["Medical", "Academic", "Written"],
-        task_subtypes=[],
-        license="cc-by-4.0",
-        annotations_creators="derived",
-        dialect=[],
-        sample_creation="machine-translated and verified",  # manually checked a small subset
-        bibtex_citation=r"""
+_nf_corpus_metadata = dict(
+    dataset={
+        "path": "clips/beir-nl-nfcorpus",
+        "revision": "942953e674fd0f619ff89897abb806dc3df5dd39",
+    },
+    reference="https://huggingface.co/datasets/clips/beir-nl-nfcorpus",
+    type="Retrieval",
+    category="t2t",
+    modalities=["text"],
+    eval_splits=["test"],
+    eval_langs=["nld-Latn"],
+    main_score="ndcg_at_10",
+    date=("2016-03-01", "2016-03-01"),  # best guess: based on publication date
+    domains=["Medical", "Academic", "Written"],
+    task_subtypes=[],
+    license="cc-by-4.0",
+    annotations_creators="derived",
+    dialect=[],
+    sample_creation="machine-translated and verified",  # manually checked a small subset
+    bibtex_citation=r"""
 @misc{banar2024beirnlzeroshotinformationretrieval,
   archiveprefix = {arXiv},
   author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans},
@@ -36,5 +31,27 @@ class NFCorpusNL(AbsTaskRetrieval):
   year = {2024},
 }
 """,
+)
+
+
+class NFCorpusNL(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="NFCorpus-NL",
+        description="NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval. NFCorpus-NL is "
+        "a Dutch translation.",
         adapted_from=["NFCorpus"],
+        **_nf_corpus_metadata,
+    )
+
+
+class NFCorpusNLv2(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="NFCorpus-NL.v2",
+        description="NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval. NFCorpus-NL is "
+        "a Dutch translation. This version adds a Dutch prompt to the dataset.",
+        adapted_from=["NFCorpus-NL"],
+        prompt={
+            "query": "Gegeven een vraag, haal relevante documenten op die de vraag het beste beantwoorden"
+        },
+        **_nf_corpus_metadata,
     )
mteb/tasks/retrieval/nld/open_tender_retrieval.py
@@ -35,4 +35,7 @@ class OpenTenderRetrieval(AbsTaskRetrieval):
   year = {2025},
 }
 """,
+        prompt={
+            "query": "Gegeven een titel, haal de aanbestedingsbeschrijving op die het beste bij de titel past"
+        },
     )
mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py
@@ -1,30 +1,26 @@
 from mteb.abstasks.retrieval import AbsTaskRetrieval
 from mteb.abstasks.task_metadata import TaskMetadata
 
-
-class SciFactNL(AbsTaskRetrieval):
-    metadata = TaskMetadata(
-        name="SciFact-NL",
-        dataset={
-            "path": "clips/beir-nl-scifact",
-            "revision": "856d8dfc294b138856bbf3042450e3782321e44e",
-        },
-        description="SciFactNL verifies scientific claims in Dutch using evidence from the research literature containing scientific paper abstracts.",
-        reference="https://huggingface.co/datasets/clips/beir-nl-scifact",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["nld-Latn"],
-        main_score="ndcg_at_10",
-        date=("2020-05-01", "2020-05-01"),  # best guess: based on submission date
-        domains=["Academic", "Medical", "Written"],
-        task_subtypes=[],
-        license="cc-by-4.0",
-        annotations_creators="derived",
-        dialect=[],
-        sample_creation="machine-translated and verified",  # manually checked a small subset
-        bibtex_citation=r"""
+_sci_fact_nl_metadata = dict(
+    dataset={
+        "path": "clips/beir-nl-scifact",
+        "revision": "856d8dfc294b138856bbf3042450e3782321e44e",
+    },
+    reference="https://huggingface.co/datasets/clips/beir-nl-scifact",
+    type="Retrieval",
+    category="t2t",
+    modalities=["text"],
+    eval_splits=["test"],
+    eval_langs=["nld-Latn"],
+    main_score="ndcg_at_10",
+    date=("2020-05-01", "2020-05-01"),  # best guess: based on submission date
+    domains=["Academic", "Medical", "Written"],
+    task_subtypes=[],
+    license="cc-by-4.0",
+    annotations_creators="derived",
+    dialect=[],
+    sample_creation="machine-translated and verified",  # manually checked a small subset
+    bibtex_citation=r"""
 @misc{banar2024beirnlzeroshotinformationretrieval,
   archiveprefix = {arXiv},
   author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans},
@@ -35,5 +31,27 @@ class SciFactNL(AbsTaskRetrieval):
   year = {2024},
 }
 """,
+)
+
+
+class SciFactNL(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="SciFact-NL",
+        description="SciFactNL verifies scientific claims in Dutch using evidence from the research literature "
+        "containing scientific paper abstracts.",
         adapted_from=["SciFact"],
+        **_sci_fact_nl_metadata,
+    )
+
+
+class SciFactNLv2(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="SciFact-NL.v2",
+        description="SciFactNL verifies scientific claims in Dutch using evidence from the research literature "
+        "containing scientific paper abstracts. This version adds a Dutch prompt to the dataset.",
+        adapted_from=["SciFact-NL"],
+        prompt={
+            "query": "Given a scientific claim, retrieve documents that support or refute the claim"
+        },
+        **_sci_fact_nl_metadata,
     )
mteb/tasks/retrieval/nld/scidocsnl_retrieval.py
@@ -1,33 +1,26 @@
 from mteb.abstasks.retrieval import AbsTaskRetrieval
 from mteb.abstasks.task_metadata import TaskMetadata
 
-
-class SCIDOCSNL(AbsTaskRetrieval):
-    metadata = TaskMetadata(
-        name="SCIDOCS-NL",
-        dataset={
-            "path": "clips/beir-nl-scidocs",
-            "revision": "4e018aa220029f9d1bd5a31de3650e322e32ea38",
-        },
-        description=(
-            "SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation"
-            + " prediction, to document classification and recommendation. SciDocs-NL is a Dutch translation."
-        ),
-        reference="https://huggingface.co/datasets/clips/beir-nl-scidocs",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["nld-Latn"],
-        main_score="ndcg_at_10",
-        date=("2020-05-01", "2020-05-01"),  # best guess: based on submission date
-        domains=["Academic", "Written", "Non-fiction"],
-        task_subtypes=[],
-        license="cc-by-sa-4.0",
-        annotations_creators="derived",
-        dialect=[],
-        sample_creation="machine-translated and verified",  # manually checked a small subset
-        bibtex_citation=r"""
+_scidocsnl_metadata = dict(
+    dataset={
+        "path": "clips/beir-nl-scidocs",
+        "revision": "4e018aa220029f9d1bd5a31de3650e322e32ea38",
+    },
+    reference="https://huggingface.co/datasets/clips/beir-nl-scidocs",
+    type="Retrieval",
+    category="t2t",
+    modalities=["text"],
+    eval_splits=["test"],
+    eval_langs=["nld-Latn"],
+    main_score="ndcg_at_10",
+    date=("2020-05-01", "2020-05-01"),  # best guess: based on submission date
+    domains=["Academic", "Written", "Non-fiction"],
+    task_subtypes=[],
+    license="cc-by-sa-4.0",
+    annotations_creators="derived",
+    dialect=[],
+    sample_creation="machine-translated and verified",  # manually checked a small subset
+    bibtex_citation=r"""
 @misc{banar2024beirnlzeroshotinformationretrieval,
   archiveprefix = {arXiv},
   author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans},
@@ -38,5 +31,29 @@ class SCIDOCSNL(AbsTaskRetrieval):
   year = {2024},
 }
 """,
+)
+
+
+class SCIDOCSNL(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="SCIDOCS-NL",
+        description="SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from "
+        "citation prediction, to document classification and recommendation. SciDocs-NL is a Dutch "
+        "translation.",
         adapted_from=["SCIDOCS"],
+        **_scidocsnl_metadata,
+    )
+
+
+class SCIDOCSNLv2(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="SCIDOCS-NL.v2",
+        description="SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from "
+        "citation prediction, to document classification and recommendation. SciDocs-NL is a Dutch "
+        "translation. This version adds a Dutch prompt to the dataset.",
+        adapted_from=["SCIDOCS-NL"],
+        **_scidocsnl_metadata,
+        prompt={
+            "query": "Gegeven de titel van een wetenschappelijk artikel, haal de abstracts op van artikelen die door het gegeven artikel worden geciteerd"
+        },
     )
mteb/tasks/retrieval/nld/vabb_retrieval.py
@@ -38,4 +38,7 @@ class VABBRetrieval(AbsTaskRetrieval):
   year = {2024},
 }
 """,
+        prompt={
+            "query": "Gegeven een titel, haal de wetenschappelijke abstract op die het beste bij de titel past"
+        },
     )
mteb/tasks/sts/nld/sick_nl_sts.py
@@ -35,6 +35,7 @@ class SICKNLSTS(AbsTaskSTS):
   year = {2021},
 }
 """,
+        prompt={"query": "Haal semantisch vergelijkbare tekst op"},
     )
 
     min_score = 0
{mteb-2.1.5.dist-info → mteb-2.1.7.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.1.5
+Version: 2.1.7
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
@@ -16,7 +16,7 @@ Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Information Technology
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
-Requires-Python: <3.14,>=3.10
+Requires-Python: <3.15,>=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: datasets>=2.19.0