mteb 2.7.21__py3-none-any.whl → 2.7.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. mteb/abstasks/regression.py +0 -1
  2. mteb/tasks/clustering/deu/ten_k_gnad_clustering_p2p.py +9 -7
  3. mteb/tasks/clustering/deu/ten_k_gnad_clustering_s2s.py +13 -11
  4. mteb/tasks/clustering/fra/hal_clustering_s2s.py +1 -1
  5. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  6. mteb/tasks/clustering/nob/vg_clustering.py +1 -1
  7. mteb/tasks/clustering/rom/romani_bible_clustering.py +1 -1
  8. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  9. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  10. mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +1 -1
  11. mteb/tasks/pair_classification/pol/polish_pc.py +2 -2
  12. mteb/tasks/retrieval/eng/cqa_dupstack_android_retrieval.py +1 -1
  13. mteb/tasks/retrieval/eng/cqa_dupstack_english_retrieval.py +1 -1
  14. mteb/tasks/retrieval/eng/cqa_dupstack_gaming_retrieval.py +1 -1
  15. mteb/tasks/retrieval/eng/cqa_dupstack_gis_retrieval.py +1 -1
  16. mteb/tasks/retrieval/eng/cqa_dupstack_mathematica_retrieval.py +1 -1
  17. mteb/tasks/retrieval/eng/cqa_dupstack_physics_retrieval.py +1 -1
  18. mteb/tasks/retrieval/eng/cqa_dupstack_programmers_retrieval.py +1 -1
  19. mteb/tasks/retrieval/eng/cqa_dupstack_stats_retrieval.py +1 -1
  20. mteb/tasks/retrieval/eng/cqa_dupstack_tex_retrieval.py +1 -1
  21. mteb/tasks/retrieval/eng/cqa_dupstack_unix_retrieval.py +1 -1
  22. mteb/tasks/retrieval/eng/cqa_dupstack_webmasters_retrieval.py +1 -1
  23. mteb/tasks/retrieval/eng/cqa_dupstack_wordpress_retrieval.py +1 -1
  24. mteb/tasks/retrieval/eng/fever_retrieval.py +1 -1
  25. mteb/tasks/retrieval/eng/fi_qa2018_retrieval.py +1 -1
  26. mteb/tasks/retrieval/eng/msmarc_ov2_retrieval.py +1 -1
  27. mteb/tasks/retrieval/eng/msmarco_retrieval.py +2 -2
  28. mteb/tasks/retrieval/eng/nf_corpus_retrieval.py +6 -6
  29. mteb/tasks/retrieval/eng/nq_retrieval.py +8 -8
  30. mteb/tasks/retrieval/eng/quora_retrieval.py +1 -1
  31. mteb/tasks/retrieval/fas/beir_fa.py +43 -36
  32. mteb/tasks/retrieval/fas/fa_mteb_retrieval.py +12 -5
  33. mteb/tasks/retrieval/fra/alloprof_retrieval.py +1 -1
  34. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  35. mteb/tasks/retrieval/pol/fi_qapl_retrieval.py +1 -1
  36. mteb/tasks/retrieval/pol/nqpl_retrieval.py +5 -5
  37. mteb/tasks/sts/eng/biosses_sts.py +1 -1
  38. mteb/tasks/sts/eng/humests_benchmark.py +1 -1
  39. mteb/tasks/sts/eng/sts_benchmark_sts.py +1 -1
  40. mteb/tasks/sts/fin/fin_para_sts.py +1 -1
  41. mteb/tasks/sts/kor/klue_sts.py +1 -1
  42. mteb/tasks/sts/ron/ron_sts.py +1 -1
  43. {mteb-2.7.21.dist-info → mteb-2.7.22.dist-info}/METADATA +1 -1
  44. {mteb-2.7.21.dist-info → mteb-2.7.22.dist-info}/RECORD +48 -48
  45. {mteb-2.7.21.dist-info → mteb-2.7.22.dist-info}/WHEEL +0 -0
  46. {mteb-2.7.21.dist-info → mteb-2.7.22.dist-info}/entry_points.txt +0 -0
  47. {mteb-2.7.21.dist-info → mteb-2.7.22.dist-info}/licenses/LICENSE +0 -0
  48. {mteb-2.7.21.dist-info → mteb-2.7.22.dist-info}/top_level.txt +0 -0
@@ -93,7 +93,6 @@ class AbsTaskRegression(AbsTaskClassification):
93
93
  n_samples: Number of samples to use for training the regression model. If the dataset has fewer samples than n_samples, all samples are used.
94
94
  abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
95
95
  evaluator_model: The model to use for evaluation. Can be any sklearn compatible model. Default is `LinearRegression`.
96
-
97
96
  """
98
97
 
99
98
  evaluator: type[SklearnEvaluator] = SklearnEvaluator
@@ -18,14 +18,17 @@ class TenKGnadClusteringP2P(AbsTaskClusteringLegacy):
18
18
  eval_splits=["test"],
19
19
  eval_langs=["deu-Latn"],
20
20
  main_score="v_measure",
21
- date=None,
21
+ date=(
22
+ "2000-01-01",
23
+ "2020-12-31",
24
+ ), # since it is news it is guessed that it is from 2000 to 2020
22
25
  domains=["Web", "Written"],
23
26
  task_subtypes=[],
24
27
  license="cc-by-nc-sa-4.0",
25
- annotations_creators=None,
28
+ annotations_creators="derived",
26
29
  dialect=[],
27
30
  sample_creation="found",
28
- bibtex_citation=None,
31
+ bibtex_citation="", # none found
29
32
  superseded_by="TenKGnadClusteringP2P.v2",
30
33
  )
31
34
 
@@ -36,7 +39,7 @@ class TenKGnadClusteringP2PFast(AbsTaskClustering):
36
39
 
37
40
  metadata = TaskMetadata(
38
41
  name="TenKGnadClusteringP2P.v2",
39
- description="Clustering of news article titles+subheadings+texts. Clustering of 10 splits on the news article category.",
42
+ description="Clustering of news article titles+subheadings+texts. Clustering of 10 splits on the news article category. v2 uses a faster evaluation method used in the MMTEB paper, which allow for notably faster evaluation.",
40
43
  reference="https://tblock.github.io/10kGNAD/",
41
44
  dataset={
42
45
  "path": "slvnwhrl/tenkgnad-clustering-p2p",
@@ -53,13 +56,12 @@ class TenKGnadClusteringP2PFast(AbsTaskClustering):
53
56
  "2020-12-31",
54
57
  ), # since it is news it is guessed that it is from 2000 to 2020
55
58
  domains=["News", "Non-fiction", "Written"],
56
- task_subtypes=None,
59
+ task_subtypes=["Thematic clustering"],
57
60
  license="cc-by-sa-4.0",
58
61
  annotations_creators="derived",
59
62
  dialect=[],
60
63
  sample_creation="found",
61
- bibtex_citation=None, # none found
62
- # due to duplicates
64
+ bibtex_citation="", # none found
63
65
  adapted_from=["TenKGnadClusteringP2P"],
64
66
  )
65
67
 
@@ -18,14 +18,17 @@ class TenKGnadClusteringS2S(AbsTaskClusteringLegacy):
18
18
  eval_splits=["test"],
19
19
  eval_langs=["deu-Latn"],
20
20
  main_score="v_measure",
21
- date=None,
21
+ date=(
22
+ "2000-01-01",
23
+ "2020-12-31",
24
+ ), # since it is news it is guessed that it is from 2000 to 2020
22
25
  domains=["News", "Non-fiction", "Written"],
23
- task_subtypes=["Topic classification"],
24
- license=None,
25
- annotations_creators=None,
26
- dialect=None,
27
- sample_creation=None,
28
- bibtex_citation=None,
26
+ task_subtypes=["Thematic clustering"],
27
+ license="cc-by-nc-sa-4.0",
28
+ annotations_creators="derived",
29
+ dialect=[],
30
+ sample_creation="found",
31
+ bibtex_citation="", # none found
29
32
  superseded_by="TenKGnadClusteringS2S.v2",
30
33
  )
31
34
 
@@ -36,7 +39,7 @@ class TenKGnadClusteringS2SFast(AbsTaskClustering):
36
39
 
37
40
  metadata = TaskMetadata(
38
41
  name="TenKGnadClusteringS2S.v2",
39
- description="Clustering of news article titles. Clustering of 10 splits on the news article category.",
42
+ description="Clustering of news article titles. Clustering of 10 splits on the news article category. v2 uses a faster evaluation method used in the MMTEB paper, which allow for notably faster evaluation.",
40
43
  reference="https://tblock.github.io/10kGNAD/",
41
44
  dataset={
42
45
  "path": "slvnwhrl/tenkgnad-clustering-s2s",
@@ -53,13 +56,12 @@ class TenKGnadClusteringS2SFast(AbsTaskClustering):
53
56
  "2020-12-31",
54
57
  ), # since it is news it is guessed that it is from 2000 to 2020
55
58
  domains=["News", "Non-fiction", "Written"],
56
- task_subtypes=["Topic classification"],
59
+ task_subtypes=["Thematic clustering"],
57
60
  license="cc-by-sa-4.0",
58
61
  annotations_creators="derived",
59
62
  dialect=[],
60
63
  sample_creation="found",
61
- bibtex_citation=None, # none found
62
- # due to duplicates
64
+ bibtex_citation="", # none found
63
65
  adapted_from=["TenKGnadClusteringS2S"],
64
66
  )
65
67
 
@@ -33,7 +33,7 @@ class HALClusteringS2S(AbsTaskClusteringLegacy):
33
33
  task_subtypes=["Thematic clustering"],
34
34
  license="apache-2.0",
35
35
  annotations_creators="human-annotated",
36
- dialect=None,
36
+ dialect=[],
37
37
  sample_creation="found",
38
38
  bibtex_citation=r"""
39
39
  @misc{ciancone2024extending,
@@ -47,7 +47,7 @@ class WikiClusteringP2P(AbsTaskClusteringLegacy):
47
47
  annotations_creators="derived",
48
48
  dialect=[],
49
49
  sample_creation="created",
50
- bibtex_citation=None, # None exists
50
+ bibtex_citation="", # None exists
51
51
  superseded_by="WikiClusteringP2P.v2",
52
52
  )
53
53
 
@@ -42,7 +42,7 @@ class VGClustering(AbsTaskClusteringLegacy):
42
42
  main_score="v_measure",
43
43
  date=("2020-01-01", "2024-12-31"), # best guess
44
44
  domains=["News", "Non-fiction", "Written"],
45
- license=None,
45
+ license="not specified",
46
46
  annotations_creators="derived",
47
47
  dialect=[],
48
48
  task_subtypes=["Thematic clustering"],
@@ -24,5 +24,5 @@ class RomaniBibleClustering(AbsTaskClusteringLegacy):
24
24
  annotations_creators="derived",
25
25
  dialect=["Kalderash"],
26
26
  sample_creation="human-translated and localized",
27
- bibtex_citation=None,
27
+ bibtex_citation="",
28
28
  )
@@ -17,7 +17,7 @@ class TwitterSemEval2015PC(AbsTaskPairClassification):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="max_ap",
20
- date=None,
20
+ date=("2015-01-01", "2015-12-31"), # publication year
21
21
  domains=["Social", "Written"],
22
22
  task_subtypes=[],
23
23
  license="not specified",
@@ -17,7 +17,7 @@ class TwitterURLCorpus(AbsTaskPairClassification):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="max_ap",
20
- date=None,
20
+ date=("2017-01-01", "2017-12-31"), # publication year
21
21
  domains=["Social", "Written"],
22
22
  task_subtypes=[],
23
23
  license="not specified",
@@ -41,7 +41,7 @@ class IndicXnliPairClassification(AbsTaskPairClassification):
41
41
  main_score="max_ap",
42
42
  date=("2022-04-22", "2022-10-06"),
43
43
  domains=["Non-fiction", "Fiction", "Government", "Written"],
44
- task_subtypes=None,
44
+ task_subtypes=[],
45
45
  license="cc-by-4.0",
46
46
  annotations_creators="derived",
47
47
  dialect=[],
@@ -77,7 +77,7 @@ class PpcPC(AbsTaskPairClassification):
77
77
  eval_splits=["test"],
78
78
  eval_langs=["pol-Latn"],
79
79
  main_score="max_ap",
80
- date=None,
80
+ date=("2022-01-01", "2022-12-31"), # publication year
81
81
  domains=[
82
82
  "Fiction",
83
83
  "Non-fiction",
@@ -125,7 +125,7 @@ class CdscePC(AbsTaskPairClassification):
125
125
  eval_splits=["test"],
126
126
  eval_langs=["pol-Latn"],
127
127
  main_score="max_ap",
128
- date=None,
128
+ date=("2017-01-01", "2017-12-31"), # publication year
129
129
  domains=["Written"],
130
130
  task_subtypes=[],
131
131
  license="cc-by-nc-sa-4.0",
@@ -17,7 +17,7 @@ class CQADupstackAndroidRetrieval(AbsTaskRetrieval):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="ndcg_at_10",
20
- date=None,
20
+ date=("2015-01-01", "2015-12-31"), # publication year
21
21
  domains=["Programming", "Web", "Written", "Non-fiction"],
22
22
  task_subtypes=["Question answering", "Duplicate Detection"],
23
23
  license="apache-2.0",
@@ -17,7 +17,7 @@ class CQADupstackEnglishRetrieval(AbsTaskRetrieval):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="ndcg_at_10",
20
- date=None,
20
+ date=("2015-01-01", "2015-12-31"), # publication year
21
21
  domains=["Written"],
22
22
  task_subtypes=["Question answering", "Duplicate Detection"],
23
23
  license="apache-2.0",
@@ -17,7 +17,7 @@ class CQADupstackGamingRetrieval(AbsTaskRetrieval):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="ndcg_at_10",
20
- date=None,
20
+ date=("2015-01-01", "2015-12-31"), # publication year
21
21
  domains=["Web", "Written"],
22
22
  task_subtypes=["Question answering", "Duplicate Detection"],
23
23
  license="apache-2.0",
@@ -17,7 +17,7 @@ class CQADupstackGisRetrieval(AbsTaskRetrieval):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="ndcg_at_10",
20
- date=None,
20
+ date=("2015-01-01", "2015-12-31"), # publication year
21
21
  domains=["Written", "Non-fiction"],
22
22
  task_subtypes=["Question answering", "Duplicate Detection"],
23
23
  license="apache-2.0",
@@ -17,7 +17,7 @@ class CQADupstackMathematicaRetrieval(AbsTaskRetrieval):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="ndcg_at_10",
20
- date=None,
20
+ date=("2015-01-01", "2015-12-31"), # publication year
21
21
  domains=["Written", "Academic", "Non-fiction"],
22
22
  task_subtypes=["Question answering", "Duplicate Detection"],
23
23
  license="apache-2.0",
@@ -17,7 +17,7 @@ class CQADupstackPhysicsRetrieval(AbsTaskRetrieval):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="ndcg_at_10",
20
- date=None,
20
+ date=("2015-01-01", "2015-12-31"), # publication year
21
21
  domains=["Written", "Academic", "Non-fiction"],
22
22
  task_subtypes=["Question answering", "Duplicate Detection"],
23
23
  license="apache-2.0",
@@ -17,7 +17,7 @@ class CQADupstackProgrammersRetrieval(AbsTaskRetrieval):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="ndcg_at_10",
20
- date=None,
20
+ date=("2015-01-01", "2015-12-31"), # publication year
21
21
  domains=["Programming", "Written", "Non-fiction"],
22
22
  task_subtypes=[],
23
23
  license="apache-2.0",
@@ -17,7 +17,7 @@ class CQADupstackStatsRetrieval(AbsTaskRetrieval):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="ndcg_at_10",
20
- date=None,
20
+ date=("2015-01-01", "2015-12-31"), # publication year
21
21
  domains=["Written", "Academic", "Non-fiction"],
22
22
  task_subtypes=["Question answering", "Duplicate Detection"],
23
23
  license="apache-2.0",
@@ -17,7 +17,7 @@ class CQADupstackTexRetrieval(AbsTaskRetrieval):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="ndcg_at_10",
20
- date=None,
20
+ date=("2015-01-01", "2015-12-31"), # publication year
21
21
  domains=["Written", "Non-fiction"],
22
22
  task_subtypes=["Question answering", "Duplicate Detection"],
23
23
  license="apache-2.0",
@@ -17,7 +17,7 @@ class CQADupstackUnixRetrieval(AbsTaskRetrieval):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="ndcg_at_10",
20
- date=None,
20
+ date=("2015-01-01", "2015-12-31"), # publication year
21
21
  domains=["Written", "Web", "Programming"],
22
22
  task_subtypes=["Question answering", "Duplicate Detection"],
23
23
  license="apache-2.0",
@@ -17,7 +17,7 @@ class CQADupstackWebmastersRetrieval(AbsTaskRetrieval):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="ndcg_at_10",
20
- date=None,
20
+ date=("2015-01-01", "2015-12-31"), # publication year
21
21
  domains=["Written", "Web"],
22
22
  task_subtypes=["Question answering"],
23
23
  license="apache-2.0",
@@ -17,7 +17,7 @@ class CQADupstackWordpressRetrieval(AbsTaskRetrieval):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="ndcg_at_10",
20
- date=None,
20
+ date=("2015-01-01", "2015-12-31"), # publication year
21
21
  domains=["Written", "Web", "Programming"],
22
22
  task_subtypes=["Question answering"],
23
23
  license="apache-2.0",
@@ -9,7 +9,7 @@ _fever_metadata = dict(
9
9
  eval_splits=["test"],
10
10
  eval_langs=["eng-Latn"],
11
11
  main_score="ndcg_at_10",
12
- date=None,
12
+ date=("2018-01-01", "2018-12-31"), # publication year
13
13
  domains=["Encyclopaedic", "Written"],
14
14
  task_subtypes=["Claim verification"],
15
15
  license="cc-by-nc-sa-3.0",
@@ -19,7 +19,7 @@ class FiQA2018(AbsTaskRetrieval):
19
19
  eval_splits=["test"],
20
20
  eval_langs=["eng-Latn"],
21
21
  main_score="ndcg_at_10",
22
- date=None,
22
+ date=("2018-01-01", "2018-12-31"), # publication year
23
23
  domains=["Written", "Financial"],
24
24
  task_subtypes=["Question answering"],
25
25
  license="not specified",
@@ -17,7 +17,7 @@ class MSMARCOv2(AbsTaskRetrieval):
17
17
  eval_splits=["train", "dev", "dev2"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="ndcg_at_10",
20
- date=None,
20
+ date=("2016-01-01", "2016-12-31"), # publication year
21
21
  domains=[
22
22
  "Encyclopaedic",
23
23
  "Academic",
@@ -19,7 +19,7 @@ class MSMARCO(AbsTaskRetrieval):
19
19
  eval_splits=["dev"],
20
20
  eval_langs=["eng-Latn"],
21
21
  main_score="ndcg_at_10",
22
- date=None,
22
+ date=("2016-01-01", "2016-12-31"), # publication year
23
23
  domains=[
24
24
  "Encyclopaedic",
25
25
  "Academic",
@@ -81,7 +81,7 @@ class MSMARCOHardNegatives(AbsTaskRetrieval):
81
81
  eval_splits=["test"],
82
82
  eval_langs=["eng-Latn"],
83
83
  main_score="ndcg_at_10",
84
- date=None,
84
+ date=("2016-01-01", "2016-12-31"), # publication year
85
85
  domains=[
86
86
  "Encyclopaedic",
87
87
  "Academic",
@@ -17,13 +17,13 @@ class NFCorpus(AbsTaskRetrieval):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="ndcg_at_10",
20
- date=None,
20
+ date=("2016-01-01", "2016-12-31"), # publication year
21
21
  domains=["Medical", "Academic", "Written"],
22
- task_subtypes=None,
23
- license=None,
24
- annotations_creators=None,
25
- dialect=None,
26
- sample_creation=None,
22
+ task_subtypes=[],
23
+ license="not specified",
24
+ annotations_creators="derived",
25
+ dialect=[],
26
+ sample_creation="found",
27
27
  bibtex_citation=r"""
28
28
  @inproceedings{boteva2016,
29
29
  author = {Boteva, Vera and Gholipour, Demian and Sokolov, Artem and Riezler, Stefan},
@@ -17,7 +17,7 @@ class NQ(AbsTaskRetrieval):
17
17
  eval_splits=["test"],
18
18
  eval_langs=["eng-Latn"],
19
19
  main_score="ndcg_at_10",
20
- date=None,
20
+ date=("2019-01-01", "2019-12-31"), # publication year
21
21
  domains=["Written", "Encyclopaedic"],
22
22
  task_subtypes=["Question answering"],
23
23
  license="cc-by-nc-sa-3.0",
@@ -57,13 +57,13 @@ class NQHardNegatives(AbsTaskRetrieval):
57
57
  eval_splits=["test"],
58
58
  eval_langs=["eng-Latn"],
59
59
  main_score="ndcg_at_10",
60
- date=None,
61
- domains=None,
62
- task_subtypes=None,
63
- license=None,
64
- annotations_creators=None,
65
- dialect=None,
66
- sample_creation=None,
60
+ date=("2019-01-01", "2019-12-31"), # publication year
61
+ domains=["Written", "Encyclopaedic"],
62
+ task_subtypes=["Question answering"],
63
+ license="cc-by-nc-sa-3.0",
64
+ annotations_creators="human-annotated",
65
+ dialect=[],
66
+ sample_creation="found",
67
67
  bibtex_citation=r"""
68
68
  @article{47761,
69
69
  author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh
@@ -9,7 +9,7 @@ _quora_metadata = dict(
9
9
  eval_splits=["test"],
10
10
  eval_langs=["eng-Latn"],
11
11
  main_score="ndcg_at_10",
12
- date=None,
12
+ date=("2017-01-01", "2017-12-31"), # original publication year
13
13
  domains=["Written", "Web", "Blog"],
14
14
  task_subtypes=["Question answering"],
15
15
  license="not specified",