mteb 2.7.16__py3-none-any.whl → 2.7.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +16 -16
- mteb/_evaluators/any_sts_evaluator.py +1 -1
- mteb/_evaluators/classification_metrics.py +10 -1
- mteb/_evaluators/clustering_evaluator.py +1 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +2 -2
- mteb/_evaluators/pair_classification_evaluator.py +3 -2
- mteb/_evaluators/retrieval_evaluator.py +1 -1
- mteb/_evaluators/retrieval_metrics.py +9 -7
- mteb/_evaluators/sklearn_evaluator.py +13 -6
- mteb/_evaluators/text/bitext_mining_evaluator.py +1 -1
- mteb/_evaluators/text/summarization_evaluator.py +1 -1
- mteb/_evaluators/zeroshot_classification_evaluator.py +1 -1
- mteb/abstasks/_stratification.py +13 -8
- mteb/abstasks/abstask.py +4 -4
- mteb/abstasks/classification.py +6 -4
- mteb/abstasks/clustering.py +1 -1
- mteb/abstasks/clustering_legacy.py +1 -1
- mteb/abstasks/image/image_text_pair_classification.py +1 -1
- mteb/abstasks/multilabel_classification.py +7 -5
- mteb/abstasks/pair_classification.py +1 -1
- mteb/abstasks/regression.py +3 -2
- mteb/abstasks/retrieval.py +8 -5
- mteb/abstasks/retrieval_dataset_loaders.py +27 -8
- mteb/abstasks/sts.py +1 -1
- mteb/abstasks/text/bitext_mining.py +2 -2
- mteb/abstasks/text/reranking.py +1 -1
- mteb/abstasks/text/summarization.py +1 -1
- mteb/abstasks/zeroshot_classification.py +1 -1
- mteb/benchmarks/benchmark.py +131 -3
- mteb/evaluate.py +2 -2
- mteb/leaderboard/figures.py +2 -1
- mteb/leaderboard/table.py +10 -2
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -3
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +3 -3
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +8 -3
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/model_implementations/bedrock_models.py +4 -4
- mteb/models/model_implementations/bm25.py +2 -2
- mteb/models/model_implementations/mcinext_models.py +2 -2
- mteb/models/model_implementations/openai_models.py +2 -1
- mteb/models/model_implementations/pylate_models.py +4 -4
- mteb/models/model_implementations/random_baseline.py +4 -3
- mteb/models/model_implementations/seed_models.py +7 -2
- mteb/models/model_implementations/voyage_models.py +1 -1
- mteb/models/models_protocols.py +2 -2
- mteb/models/search_wrappers.py +4 -4
- mteb/tasks/bitext_mining/multilingual/bible_nlp_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +1 -1
- mteb/tasks/classification/ben/bengali_document_classification.py +2 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/hin_dialect_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/language_classification.py +1 -1
- mteb/tasks/classification/multilingual/south_african_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +2 -2
- mteb/tasks/clustering/deu/ten_k_gnad_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/ten_k_gnad_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/nob/vg_hierarchical_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +1 -1
- mteb/tasks/pair_classification/multilingual/pub_chem_wiki_pair_classification.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +1 -1
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +8 -8
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/bright_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -2
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +1 -0
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/sts/multilingual/sem_rel24_sts.py +1 -1
- mteb/tasks/sts/multilingual/sts_benchmark_multilingual_sts.py +1 -1
- mteb/tasks/sts/por/assin2_sts.py +1 -1
- mteb/types/_encoder_io.py +3 -2
- {mteb-2.7.16.dist-info → mteb-2.7.18.dist-info}/METADATA +1 -1
- {mteb-2.7.16.dist-info → mteb-2.7.18.dist-info}/RECORD +173 -173
- {mteb-2.7.16.dist-info → mteb-2.7.18.dist-info}/WHEEL +0 -0
- {mteb-2.7.16.dist-info → mteb-2.7.18.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.16.dist-info → mteb-2.7.18.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.16.dist-info → mteb-2.7.18.dist-info}/top_level.txt +0 -0
|
@@ -143,7 +143,7 @@ class MIRACLVisionRetrieval(AbsTaskRetrieval):
|
|
|
143
143
|
prompt={"query": "Find a screenshot that is relevant to the user's query."},
|
|
144
144
|
)
|
|
145
145
|
|
|
146
|
-
def load_data(self, num_proc: int =
|
|
146
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
147
147
|
if self.data_loaded:
|
|
148
148
|
return
|
|
149
149
|
|
|
@@ -103,7 +103,7 @@ class RuSciBenchCiteRetrieval(AbsTaskRetrieval):
|
|
|
103
103
|
},
|
|
104
104
|
)
|
|
105
105
|
|
|
106
|
-
def load_data(self, num_proc: int =
|
|
106
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
107
107
|
if self.data_loaded:
|
|
108
108
|
return
|
|
109
109
|
|
|
@@ -161,7 +161,7 @@ class RuSciBenchCociteRetrieval(AbsTaskRetrieval):
|
|
|
161
161
|
},
|
|
162
162
|
)
|
|
163
163
|
|
|
164
|
-
def load_data(self, num_proc: int =
|
|
164
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
165
165
|
if self.data_loaded:
|
|
166
166
|
return
|
|
167
167
|
|
|
@@ -16,7 +16,7 @@ def _load_data(
|
|
|
16
16
|
splits: list[str],
|
|
17
17
|
langs: list | None = None,
|
|
18
18
|
revision: str | None = None,
|
|
19
|
-
num_proc: int =
|
|
19
|
+
num_proc: int | None = None,
|
|
20
20
|
):
|
|
21
21
|
if langs is None:
|
|
22
22
|
corpus = {}
|
|
@@ -131,7 +131,7 @@ class Vidore2ESGReportsRetrieval(AbsTaskRetrieval):
|
|
|
131
131
|
prompt={"query": "Find a screenshot that relevant to the user's question."},
|
|
132
132
|
)
|
|
133
133
|
|
|
134
|
-
def load_data(self, num_proc: int =
|
|
134
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
135
135
|
if self.data_loaded:
|
|
136
136
|
return
|
|
137
137
|
|
|
@@ -179,7 +179,7 @@ class Vidore2EconomicsReportsRetrieval(AbsTaskRetrieval):
|
|
|
179
179
|
prompt={"query": "Find a screenshot that relevant to the user's question."},
|
|
180
180
|
)
|
|
181
181
|
|
|
182
|
-
def load_data(self, num_proc: int =
|
|
182
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
183
183
|
if self.data_loaded:
|
|
184
184
|
return
|
|
185
185
|
|
|
@@ -227,7 +227,7 @@ class Vidore2BioMedicalLecturesRetrieval(AbsTaskRetrieval):
|
|
|
227
227
|
prompt={"query": "Find a screenshot that relevant to the user's question."},
|
|
228
228
|
)
|
|
229
229
|
|
|
230
|
-
def load_data(self, num_proc: int =
|
|
230
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
231
231
|
if self.data_loaded:
|
|
232
232
|
return
|
|
233
233
|
|
|
@@ -275,7 +275,7 @@ class Vidore2ESGReportsHLRetrieval(AbsTaskRetrieval):
|
|
|
275
275
|
prompt={"query": "Find a screenshot that relevant to the user's question."},
|
|
276
276
|
)
|
|
277
277
|
|
|
278
|
-
def load_data(self, num_proc: int =
|
|
278
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
279
279
|
if self.data_loaded:
|
|
280
280
|
return
|
|
281
281
|
|
|
@@ -68,6 +68,7 @@ class Vidore3FinanceFrRetrieval(AbsTaskRetrieval):
|
|
|
68
68
|
license="cc-by-4.0",
|
|
69
69
|
annotations_creators="derived",
|
|
70
70
|
dialect=[],
|
|
71
|
+
modalities=["text", "image"],
|
|
71
72
|
sample_creation="created and machine-translated",
|
|
72
73
|
bibtex_citation=r"""
|
|
73
74
|
@article{loison2026vidorev3comprehensiveevaluation,
|
|
@@ -42,7 +42,7 @@ class CQADupstackAndroidNLRetrieval(AbsTaskRetrieval):
|
|
|
42
42
|
adapted_from=["CQADupstackAndroid"],
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
def load_data(self, num_proc: int =
|
|
45
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
46
46
|
if self.data_loaded:
|
|
47
47
|
return
|
|
48
48
|
|
|
@@ -42,7 +42,7 @@ class CQADupstackEnglishNLRetrieval(AbsTaskRetrieval):
|
|
|
42
42
|
adapted_from=["CQADupstackEnglish"],
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
def load_data(self, num_proc: int =
|
|
45
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
46
46
|
if self.data_loaded:
|
|
47
47
|
return
|
|
48
48
|
|
|
@@ -42,7 +42,7 @@ class CQADupstackGamingNLRetrieval(AbsTaskRetrieval):
|
|
|
42
42
|
adapted_from=["CQADupstackGamingRetrieval"],
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
def load_data(self, num_proc: int =
|
|
45
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
46
46
|
if self.data_loaded:
|
|
47
47
|
return
|
|
48
48
|
|
|
@@ -42,7 +42,7 @@ class CQADupstackGisNLRetrieval(AbsTaskRetrieval):
|
|
|
42
42
|
adapted_from=["CQADupstackGisRetrieval"],
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
def load_data(self, num_proc: int =
|
|
45
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
46
46
|
if self.data_loaded:
|
|
47
47
|
return
|
|
48
48
|
|
|
@@ -42,7 +42,7 @@ class CQADupstackMathematicaNLRetrieval(AbsTaskRetrieval):
|
|
|
42
42
|
adapted_from=["CQADupstackMathematicaRetrieval"],
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
def load_data(self, num_proc: int =
|
|
45
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
46
46
|
if self.data_loaded:
|
|
47
47
|
return
|
|
48
48
|
|
|
@@ -42,7 +42,7 @@ class CQADupstackPhysicsNLRetrieval(AbsTaskRetrieval):
|
|
|
42
42
|
adapted_from=["CQADupstackPhysicsRetrieval"],
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
def load_data(self, num_proc: int =
|
|
45
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
46
46
|
if self.data_loaded:
|
|
47
47
|
return
|
|
48
48
|
|
|
@@ -42,7 +42,7 @@ class CQADupstackProgrammersNLRetrieval(AbsTaskRetrieval):
|
|
|
42
42
|
adapted_from=["CQADupstackProgrammersRetrieval"],
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
def load_data(self, num_proc: int =
|
|
45
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
46
46
|
if self.data_loaded:
|
|
47
47
|
return
|
|
48
48
|
|
|
@@ -42,7 +42,7 @@ class CQADupstackStatsNLRetrieval(AbsTaskRetrieval):
|
|
|
42
42
|
adapted_from=["CQADupstackStatsRetrieval"],
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
def load_data(self, num_proc: int =
|
|
45
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
46
46
|
if self.data_loaded:
|
|
47
47
|
return
|
|
48
48
|
|
|
@@ -42,7 +42,7 @@ class CQADupstackTexNLRetrieval(AbsTaskRetrieval):
|
|
|
42
42
|
adapted_from=["CQADupstackTexRetrieval"],
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
def load_data(self, num_proc: int =
|
|
45
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
46
46
|
if self.data_loaded:
|
|
47
47
|
return
|
|
48
48
|
|
|
@@ -42,7 +42,7 @@ class CQADupstackUnixNLRetrieval(AbsTaskRetrieval):
|
|
|
42
42
|
adapted_from=["CQADupstackUnixRetrieval"],
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
def load_data(self, num_proc: int =
|
|
45
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
46
46
|
if self.data_loaded:
|
|
47
47
|
return
|
|
48
48
|
|
|
@@ -42,7 +42,7 @@ class CQADupstackWebmastersNLRetrieval(AbsTaskRetrieval):
|
|
|
42
42
|
adapted_from=["CQADupstackWebmastersRetrieval"],
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
def load_data(self, num_proc: int =
|
|
45
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
46
46
|
if self.data_loaded:
|
|
47
47
|
return
|
|
48
48
|
|
|
@@ -42,7 +42,7 @@ class CQADupstackWordpressNLRetrieval(AbsTaskRetrieval):
|
|
|
42
42
|
adapted_from=["CQADupstackWordpressRetrieval"],
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
def load_data(self, num_proc: int =
|
|
45
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
46
46
|
if self.data_loaded:
|
|
47
47
|
return
|
|
48
48
|
|
|
@@ -50,7 +50,7 @@ Fishel, Mark},
|
|
|
50
50
|
},
|
|
51
51
|
)
|
|
52
52
|
|
|
53
|
-
def load_data(self, num_proc: int =
|
|
53
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
54
54
|
"""Load dataset from HuggingFace hub"""
|
|
55
55
|
if self.data_loaded:
|
|
56
56
|
return
|
|
@@ -58,7 +58,7 @@ Fishel, Mark},
|
|
|
58
58
|
self.dataset_transform()
|
|
59
59
|
self.data_loaded = True
|
|
60
60
|
|
|
61
|
-
def dataset_transform(self, num_proc: int =
|
|
61
|
+
def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
|
|
62
62
|
"""And transform to a retrieval dataset, which have the following attributes
|
|
63
63
|
|
|
64
64
|
self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
|
|
@@ -37,7 +37,7 @@ class SNLRetrieval(AbsTaskRetrieval):
|
|
|
37
37
|
task_subtypes=["Article retrieval"],
|
|
38
38
|
)
|
|
39
39
|
|
|
40
|
-
def load_data(self, num_proc: int =
|
|
40
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
41
41
|
"""Load dataset from HuggingFace hub"""
|
|
42
42
|
if self.data_loaded:
|
|
43
43
|
return
|
|
@@ -45,7 +45,7 @@ class SNLRetrieval(AbsTaskRetrieval):
|
|
|
45
45
|
self.dataset_transform()
|
|
46
46
|
self.data_loaded = True
|
|
47
47
|
|
|
48
|
-
def dataset_transform(self, num_proc: int =
|
|
48
|
+
def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
|
|
49
49
|
"""And transform to a retrieval dataset, which have the following attributes
|
|
50
50
|
|
|
51
51
|
self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
|
|
@@ -36,7 +36,7 @@ class SlovakSumRetrieval(AbsTaskRetrieval):
|
|
|
36
36
|
""",
|
|
37
37
|
)
|
|
38
38
|
|
|
39
|
-
def load_data(self, num_proc: int =
|
|
39
|
+
def load_data(self, num_proc: int | None = None, **kwargs) -> None:
|
|
40
40
|
if self.data_loaded:
|
|
41
41
|
return
|
|
42
42
|
self.corpus, self.queries, self.relevant_docs = {}, {}, {}
|
|
@@ -66,6 +66,6 @@ Seid Muhie Yimam and Saif M. Mohammad},
|
|
|
66
66
|
min_score = 0
|
|
67
67
|
max_score = 1
|
|
68
68
|
|
|
69
|
-
def dataset_transform(self, num_proc: int =
|
|
69
|
+
def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
|
|
70
70
|
for lang, subset in self.dataset.items():
|
|
71
71
|
self.dataset[lang] = subset.rename_column("label", "score")
|
|
@@ -56,6 +56,6 @@ class STSBenchmarkMultilingualSTS(AbsTaskSTS):
|
|
|
56
56
|
min_score = 0
|
|
57
57
|
max_score = 5
|
|
58
58
|
|
|
59
|
-
def dataset_transform(self, num_proc: int =
|
|
59
|
+
def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
|
|
60
60
|
for lang, subset in self.dataset.items():
|
|
61
61
|
self.dataset[lang] = subset.rename_column("similarity_score", "score")
|
mteb/tasks/sts/por/assin2_sts.py
CHANGED
|
@@ -39,7 +39,7 @@ class Assin2STS(AbsTaskSTS):
|
|
|
39
39
|
min_score = 1
|
|
40
40
|
max_score = 5
|
|
41
41
|
|
|
42
|
-
def dataset_transform(self, num_proc: int =
|
|
42
|
+
def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
|
|
43
43
|
self.dataset = self.dataset.rename_columns(
|
|
44
44
|
{
|
|
45
45
|
"premise": "sentence1",
|
mteb/types/_encoder_io.py
CHANGED
|
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, TypedDict
|
|
|
7
7
|
import numpy as np
|
|
8
8
|
import torch
|
|
9
9
|
from datasets import Dataset
|
|
10
|
+
from numpy.typing import NDArray
|
|
10
11
|
|
|
11
12
|
if TYPE_CHECKING:
|
|
12
13
|
from PIL import Image
|
|
@@ -26,8 +27,8 @@ class EncodeKwargs(TypedDict):
|
|
|
26
27
|
|
|
27
28
|
|
|
28
29
|
# --- Output types ---
|
|
29
|
-
Array = np.ndarray | torch.Tensor
|
|
30
|
-
"""General array type, can be a numpy array or a torch tensor."""
|
|
30
|
+
Array = NDArray[np.floating | np.integer | np.bool] | torch.Tensor
|
|
31
|
+
"""General array type, can be a numpy array (float, int, or bool) or a torch tensor."""
|
|
31
32
|
|
|
32
33
|
|
|
33
34
|
# --- Input types ---
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mteb
|
|
3
|
-
Version: 2.7.16
|
|
3
|
+
Version: 2.7.18
|
|
4
4
|
Summary: Massive Text Embedding Benchmark
|
|
5
5
|
Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
|
|
6
6
|
Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
|