mteb 2.7.17__py3-none-any.whl → 2.7.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +16 -16
- mteb/_evaluators/any_sts_evaluator.py +1 -1
- mteb/_evaluators/clustering_evaluator.py +1 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +2 -2
- mteb/_evaluators/pair_classification_evaluator.py +1 -1
- mteb/_evaluators/retrieval_evaluator.py +1 -1
- mteb/_evaluators/sklearn_evaluator.py +4 -2
- mteb/_evaluators/text/bitext_mining_evaluator.py +1 -1
- mteb/_evaluators/text/summarization_evaluator.py +1 -1
- mteb/_evaluators/zeroshot_classification_evaluator.py +1 -1
- mteb/abstasks/abstask.py +4 -4
- mteb/abstasks/classification.py +2 -2
- mteb/abstasks/clustering.py +1 -1
- mteb/abstasks/clustering_legacy.py +1 -1
- mteb/abstasks/image/image_text_pair_classification.py +1 -1
- mteb/abstasks/multilabel_classification.py +1 -1
- mteb/abstasks/pair_classification.py +1 -1
- mteb/abstasks/retrieval.py +8 -5
- mteb/abstasks/retrieval_dataset_loaders.py +27 -8
- mteb/abstasks/sts.py +1 -1
- mteb/abstasks/text/bitext_mining.py +2 -2
- mteb/abstasks/text/reranking.py +1 -1
- mteb/abstasks/text/summarization.py +1 -1
- mteb/abstasks/zeroshot_classification.py +1 -1
- mteb/evaluate.py +2 -2
- mteb/models/model_implementations/bm25.py +2 -2
- mteb/models/model_implementations/ict_time_and_querit_models.py +115 -0
- mteb/models/model_implementations/pylate_models.py +4 -4
- mteb/models/models_protocols.py +2 -2
- mteb/models/search_wrappers.py +4 -4
- mteb/tasks/bitext_mining/multilingual/bible_nlp_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +1 -1
- mteb/tasks/classification/ben/bengali_document_classification.py +2 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/hin_dialect_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/language_classification.py +1 -1
- mteb/tasks/classification/multilingual/south_african_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +2 -2
- mteb/tasks/clustering/deu/ten_k_gnad_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/ten_k_gnad_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/nob/vg_hierarchical_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +1 -1
- mteb/tasks/pair_classification/multilingual/pub_chem_wiki_pair_classification.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +1 -1
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +8 -8
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/bright_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -2
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/sts/multilingual/sem_rel24_sts.py +1 -1
- mteb/tasks/sts/multilingual/sts_benchmark_multilingual_sts.py +1 -1
- mteb/tasks/sts/por/assin2_sts.py +1 -1
- mteb/types/_encoder_io.py +1 -1
- {mteb-2.7.17.dist-info → mteb-2.7.19.dist-info}/METADATA +1 -1
- {mteb-2.7.17.dist-info → mteb-2.7.19.dist-info}/RECORD +156 -155
- {mteb-2.7.17.dist-info → mteb-2.7.19.dist-info}/WHEEL +0 -0
- {mteb-2.7.17.dist-info → mteb-2.7.19.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.17.dist-info → mteb-2.7.19.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.17.dist-info → mteb-2.7.19.dist-info}/top_level.txt +0 -0
mteb/models/model_implementations/ict_time_and_querit_models.py
ADDED
@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
+from mteb.models.model_meta import ModelMeta
+from mteb.types import PromptType
+
+
+def instruction_template(
+    instruction: str | dict, prompt_type: PromptType | None = None
+) -> str:
+    """Format instruction for the model."""
+    if isinstance(instruction, dict):
+        instruction = instruction.get(prompt_type.value if prompt_type else "", "")
+    elif prompt_type == PromptType.document:
+        return ""
+
+    if not instruction:
+        return ""
+    return f"Instruct: {instruction}\nQuery:"
+
+
+multilingual_langs = [
+    "deu-Latn",
+    "ita-Latn",
+    "ara-Arab",
+    "fas-Arab",
+    "fra-Latn",
+    "hin-Deva",
+    "spa-Latn",
+    "zho-Hans",
+    "ben-Beng",
+    "eng-Latn",
+    "fin-Latn",
+    "ind-Latn",
+    "jpn-Jpan",
+    "kor-Hang",
+    "rus-Cyrl",
+    "swh-Latn",
+    "tel-Telu",
+    "tha-Thai",
+]
+
+training_data = [
+    "FEVER",
+    "DuRetrieval",
+    "HotpotQA",
+    "MSMARCO",
+    "T2Retrieval",
+    "NQ",
+    "MIRACLRetrieval",
+    "MrTidyRetrieval",
+    "AmazonCounterfactualClassification",
+    "Banking77Classification",
+    "ImdbClassification",
+    "MTOPDomainClassification",
+    "ToxicConversationsClassification",
+    "TweetSentimentExtractionClassification",
+]
+
+boom_4b_instructions = {
+    "AmazonCounterfactualClassification": "Classify a given Amazon customer review text as either counterfactual or not-counterfactual.",
+    "AmazonPolarityClassification": "Classify Amazon reviews into positive or negative sentiment.",
+    "AmazonReviewsClassification": "Classify the given Amazon review into its appropriate rating category.",
+    "Banking77Classification": "Given a online banking query, find the corresponding intents.",
+    "EmotionClassification": "Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise.",
+    "ImdbClassification": "Classify the sentiment expressed in the given movie review text from the IMDB dataset.",
+    "MassiveIntentClassification": "Given a user utterance as query, find the user intents.",
+    "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios.",
+    "MTOPDomainClassification": "Classify the intent domain of the given utterance in task-oriented conversation.",
+    "MTOPIntentClassification": "Classify the intent of the given utterance in task-oriented conversation.",
+    "ToxicConversationsClassification": "Classify the given comments as either toxic or not toxic.",
+    "TweetSentimentExtractionClassification": "Classify the sentiment of a given tweet as either positive, negative, or neutral.",
+    "TNews": "Classify the fine-grained category of the given news title.",
+    "ClimateFEVER": "Given a claim about climate change, retrieve documents that support or refute the claim.",
+    "ClimateFEVERHardNegatives": "Given a claim about climate change, retrieve documents that support or refute the claim.",
+    "DBPedia": "Given a query, retrieve relevant entity descriptions from DBPedia.",
+    "FEVER": "Given a claim, retrieve documents that support or refute the claim.",
+    "FEVERHardNegatives": "Given a claim, retrieve documents that support or refute the claim.",
+    "FiQA2018": "Given a financial question, retrieve user replies that best answer the question.",
+    "HotpotQA": "Given a multi-hop question, retrieve documents that can help answer the question.",
+    "HotpotQAHardNegatives": "Given a multi-hop question, retrieve documents that can help answer the question.",
+    "MSMARCO": "Given a web search query, retrieve relevant passages that answer the query.",
+    "NFCorpus": "Given a question, retrieve relevant documents that best answer the question.",
+    "NQ": "Given a question, retrieve Wikipedia passages that answer the question.",
+}
+# How the template actually renders each one at inference time:
+# instruction_template(boom_4b_instructions["Banking77Classification"], PromptType.query)
+# -> "Instruct: Given a online banking query, find the corresponding intents.\nQuery:"
+
+boom_4b_v1 = ModelMeta(
+    loader=InstructSentenceTransformerModel,
+    loader_kwargs=dict(
+        instruction_template=instruction_template,
+    ),
+    name="ICT-TIME-and-Querit/BOOM_4B_v1",
+    model_type=["dense"],
+    languages=multilingual_langs,
+    open_weights=True,
+    adapted_from="Qwen/Qwen3-4B",
+    revision="447ab88574d27e67c428acc2b429d7d4580a4ea7",
+    release_date="2026-01-31",
+    n_parameters=4021774336,
+    n_embedding_parameters=None,
+    memory_usage_mb=7671,
+    embed_dim=2560,
+    max_tokens=32768,
+    license="apache-2.0",
+    reference="https://huggingface.co/ICT-TIME-and-Querit/BOOM_4B_v1",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
+    use_instructions=True,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=training_data,
+)
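The template above is query-side only: dict instructions are resolved by prompt type, while plain-string instructions are applied to queries and dropped entirely for documents. A minimal standalone sketch of that behavior; the local PromptType Enum is a stub standing in for mteb.types.PromptType so the snippet runs without mteb installed:

# Sketch: mirrors the instruction_template added above; PromptType is stubbed.
from enum import Enum


class PromptType(Enum):
    query = "query"
    document = "document"


def instruction_template(instruction, prompt_type=None):
    if isinstance(instruction, dict):
        # Dict instructions are keyed by prompt type ("query" / "document").
        instruction = instruction.get(prompt_type.value if prompt_type else "", "")
    elif prompt_type == PromptType.document:
        # Plain-string instructions are never applied to documents.
        return ""
    if not instruction:
        return ""
    return f"Instruct: {instruction}\nQuery:"


print(instruction_template("Given a claim, retrieve documents that support or refute the claim.", PromptType.query))
# Instruct: Given a claim, retrieve documents that support or refute the claim.
# Query:
print(repr(instruction_template("anything", PromptType.document)))  # ''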
mteb/models/model_implementations/pylate_models.py
CHANGED
@@ -53,7 +53,7 @@ class PylateSearchEncoder:
         hf_split: str,
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
-        num_proc: int,
+        num_proc: int | None,
     ) -> None:
         """Index the corpus for retrieval.

@@ -89,7 +89,7 @@ class PylateSearchEncoder:
         top_k: int,
         encode_kwargs: EncodeKwargs,
         top_ranked: TopRankedDocumentsType | None = None,
-        num_proc: int,
+        num_proc: int | None,
    ) -> RetrievalOutputType:
         queries_dataloader = create_dataloader(
             queries,
@@ -150,7 +150,7 @@ class PylateSearchEncoder:
         hf_split: str,
         top_k: int,
         encode_kwargs: EncodeKwargs,
-        num_proc: int =
+        num_proc: int | None = None,
     ) -> dict[str, list[tuple[float, str]]]:
         from pylate import indexes, retrieve

@@ -216,7 +216,7 @@ class PylateSearchEncoder:
         hf_subset: str,
         hf_split: str,
         encode_kwargs: EncodeKwargs,
-        num_proc: int =
+        num_proc: int | None = None,
     ) -> dict[str, list[tuple[float, str]]]:
         """Rerank with PyLate's rank.rerank using per-query candidates.
mteb/models/models_protocols.py
CHANGED
@@ -32,7 +32,7 @@ class SearchProtocol(Protocol):
         hf_split: str,
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
-        num_proc: int,
+        num_proc: int | None,
     ) -> None:
         """Index the corpus for retrieval.

@@ -56,7 +56,7 @@ class SearchProtocol(Protocol):
         top_k: int,
         encode_kwargs: EncodeKwargs,
         top_ranked: TopRankedDocumentsType | None = None,
-        num_proc: int,
+        num_proc: int | None,
     ) -> RetrievalOutputType:
         """Search the corpus using the given queries.
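Every remaining hunk in this release is the same mechanical change: num_proc widens from a required int to int | None, with the concrete wrappers and tasks defaulting it to None. Custom searchers written against SearchProtocol should adopt the widened signature. A sketch of a conforming method; the corpus parameter name, the keyword-only marker, and the Any stubs for the EncodeKwargs type are illustrative assumptions, not mteb's exact definitions:

from __future__ import annotations

from typing import Any, Protocol


class IndexLike(Protocol):
    # Shape of the updated index() signature; mteb-specific types stubbed as Any.
    def index(
        self,
        corpus: Any,
        *,
        hf_split: str,
        hf_subset: str,
        encode_kwargs: Any,
        num_proc: int | None,  # was: num_proc: int
    ) -> None: ...


class MySearcher:
    def index(
        self,
        corpus: Any,
        *,
        hf_split: str,
        hf_subset: str,
        encode_kwargs: Any,
        num_proc: int | None = None,  # None defers the worker count to downstream code
    ) -> None:
        pass


searcher: IndexLike = MySearcher()  # structurally compatible; a type checker verifies it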
mteb/models/search_wrappers.py
CHANGED
@@ -59,7 +59,7 @@ class SearchEncoderWrapper:
         hf_split: str,
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
-        num_proc: int =
+        num_proc: int | None = None,
     ) -> None:
         """Index the corpus for retrieval.

@@ -101,7 +101,7 @@ class SearchEncoderWrapper:
         top_k: int,
         encode_kwargs: EncodeKwargs,
         top_ranked: TopRankedDocumentsType | None = None,
-        num_proc: int =
+        num_proc: int | None = None,
     ) -> RetrievalOutputType:
         """Search the corpus for the given queries.

@@ -485,7 +485,7 @@ class SearchCrossEncoderWrapper:
         hf_split: str,
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
-        num_proc: int =
+        num_proc: int | None = None,
     ) -> None:
         """Index the corpus for retrieval.

@@ -509,7 +509,7 @@ class SearchCrossEncoderWrapper:
         top_k: int,
         encode_kwargs: EncodeKwargs,
         top_ranked: TopRankedDocumentsType | None = None,
-        num_proc: int =
+        num_proc: int | None = None,
     ) -> RetrievalOutputType:
         """Search the corpus using the given queries.
mteb/tasks/bitext_mining/multilingual/bible_nlp_bitext_mining.py
CHANGED
@@ -914,7 +914,7 @@ class BibleNLPBitextMining(AbsTaskBitextMining):
         self.dataset_transform()
         self.data_loaded = True

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         # Convert to standard format
         for lang in self.hf_subsets:
             l1, l2 = (l.split("_")[0] for l in lang.split("-"))

mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py
CHANGED
@@ -32,7 +32,7 @@ class RomaTalesBitextMining(AbsTaskBitextMining):
         bibtex_citation="",
     )

-    def load_data(self, num_proc: int =
+    def load_data(self, num_proc: int | None = None, **kwargs) -> None:
         """Load dataset from HuggingFace hub and convert it to the standard format."""
         if self.data_loaded:
             return

mteb/tasks/classification/ben/bengali_document_classification.py
CHANGED
@@ -43,7 +43,7 @@ Islam, Tanvir},
         superseded_by="BengaliDocumentClassification.v2",
     )

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         self.dataset = self.dataset.rename_columns(
             {"article": "text", "category": "label"}
         )
@@ -92,7 +92,7 @@ Islam, Tanvir},
         """,
     )

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"]
         )
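The task-level changes that follow are the same widening applied to load_data and dataset_transform. Hugging Face datasets uses the identical convention (num_proc: int | None, where None means single-process), so a transform can forward the value directly. A free-standing sketch under that assumption; the real tasks mutate self.dataset in place, and the .map() call here is purely illustrative:

from datasets import Dataset


def dataset_transform(dataset: Dataset, num_proc: int | None = None) -> Dataset:
    # Rename to the standard columns, as in the hunks above, then forward
    # num_proc to Dataset.map(); None falls back to single-process mapping.
    dataset = dataset.rename_columns({"article": "text", "category": "label"})
    return dataset.map(lambda row: {"text": row["text"].strip()}, num_proc=num_proc)


ds = Dataset.from_dict({"article": [" a ", " b "], "category": [0, 1]})
print(dataset_transform(ds, num_proc=None)["text"])  # ['a', 'b']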
mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py
CHANGED
@@ -46,7 +46,7 @@ Montoyo, Andres},
     )
     samples_per_label = 16

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         self.dataset = self.dataset.rename_columns(
             {"comment": "text", "rating_str": "label"}
         )
@@ -99,7 +99,7 @@ Montoyo, Andres},
     )
     samples_per_label = 16

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"]
         )

mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py
CHANGED
@@ -46,7 +46,7 @@ Montoyo, Andres},
     )
     samples_per_label = 16

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         self.dataset = self.dataset.rename_columns(
             {"comment": "text", "sentiment_int": "label"}
         )

mteb/tasks/classification/multilingual/hin_dialect_classification.py
CHANGED
@@ -60,7 +60,7 @@ class HinDialectClassification(AbsTaskClassification):
         """,
     )

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         self.dataset = self.dataset.rename_columns(
             {"folksong": "text", "language": "label"}
         )

mteb/tasks/classification/multilingual/indic_lang_classification.py
CHANGED
@@ -137,6 +137,6 @@ Okazaki, Naoaki},
         self.dataset_transform()
         self.data_loaded = True

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         self.dataset = self.dataset.remove_columns(["language", "script"])
         self.dataset = self.dataset.rename_columns({"native sentence": "text"})

mteb/tasks/classification/multilingual/indic_sentiment_classification.py
CHANGED
@@ -52,7 +52,7 @@ class IndicSentimentClassification(AbsTaskClassification):
         """,
     )

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         label_map = {"Negative": 0, "Positive": 1}
         # Convert to standard format
         for lang in self.hf_subsets:

mteb/tasks/classification/multilingual/language_classification.py
CHANGED
@@ -66,7 +66,7 @@ in Natural Language Processing},
         """,
     )

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         self.dataset = self.dataset.rename_columns({"labels": "label"})
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"]

mteb/tasks/classification/multilingual/south_african_lang_classification.py
CHANGED
@@ -49,7 +49,7 @@ class SouthAfricanLangClassification(AbsTaskClassification):
         """,
     )

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         self.dataset = self.dataset.rename_columns(
             {" text": "text", "lang_id": "label"}
         )

mteb/tasks/classification/multilingual/turkic_classification.py
CHANGED
@@ -49,7 +49,7 @@ class TurkicClassification(AbsTaskClassification):
         )
         return dataset_lang["train"]

-    def load_data(self, num_proc: int =
+    def load_data(self, num_proc: int | None = None, **kwargs) -> None:
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return

mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py
CHANGED
@@ -35,7 +35,7 @@ class SlovakMovieReviewSentimentClassification(AbsTaskClassification):
         superseded_by="SlovakMovieReviewSentimentClassification.v2",
     )

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         self.dataset = self.dataset.rename_columns({"comment": "text"})

         self.dataset = self.stratified_subsampling(
@@ -76,7 +76,7 @@ class SlovakMovieReviewSentimentClassificationV2(AbsTaskClassification):
         adapted_from=["SlovakMovieReviewSentimentClassification"],
     )

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"]
         )

mteb/tasks/classification/swa/swahili_news_classification.py
CHANGED
@@ -37,7 +37,7 @@ class SwahiliNewsClassification(AbsTaskClassification):
         superseded_by="SwahiliNewsClassification.v2",
     )

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         self.dataset = self.dataset.rename_columns(
             {"content": "text", "category": "label"}
         )
@@ -81,7 +81,7 @@ class SwahiliNewsClassificationV2(AbsTaskClassification):
         adapted_from=["SwahiliNewsClassification"],
     )

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["train"]
         )

mteb/tasks/clustering/deu/ten_k_gnad_clustering_p2p.py
CHANGED
@@ -63,7 +63,7 @@ class TenKGnadClusteringP2PFast(AbsTaskClustering):
         adapted_from=["TenKGnadClusteringP2P"],
     )

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         ds = _convert_to_fast(
             self.dataset, self.input_column_name, self.label_column_name, self.seed
         )

mteb/tasks/clustering/deu/ten_k_gnad_clustering_s2s.py
CHANGED
@@ -63,7 +63,7 @@ class TenKGnadClusteringS2SFast(AbsTaskClustering):
         adapted_from=["TenKGnadClusteringS2S"],
     )

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         ds = _convert_to_fast(
             self.dataset, self.input_column_name, self.label_column_name, self.seed
         )

mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py
CHANGED
@@ -51,7 +51,7 @@ class MLSUMClusteringP2P(AbsTaskClusteringLegacy):
         superseded_by="MLSUMClusteringP2P.v2",
     )

-    def load_data(self, num_proc: int =
+    def load_data(self, num_proc: int | None = None, **kwargs) -> None:
         """Load dataset from HuggingFace hub and convert it to the standard format."""
         if self.data_loaded:
             return
@@ -124,7 +124,7 @@ class MLSUMClusteringP2PFast(AbsTaskClustering):
         adapted_from=["MLSUMClusteringP2P"],
     )

-    def load_data(self, num_proc: int =
+    def load_data(self, num_proc: int | None = None, **kwargs) -> None:
         """Load dataset from HuggingFace hub and convert it to the standard format."""
         if self.data_loaded:
             return

mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py
CHANGED
@@ -51,7 +51,7 @@ class MLSUMClusteringS2S(AbsTaskClusteringLegacy):
         superseded_by="MLSUMClusteringS2S.v2",
     )

-    def load_data(self, num_proc: int =
+    def load_data(self, num_proc: int | None = None, **kwargs) -> None:
         """Load dataset from HuggingFace hub and convert it to the standard format."""
         if self.data_loaded:
             return
@@ -119,7 +119,7 @@ class MLSUMClusteringS2SFast(AbsTaskClustering):
         adapted_from=["MLSUMClusteringS2S"],
     )

-    def load_data(self, num_proc: int =
+    def load_data(self, num_proc: int | None = None, **kwargs) -> None:
         """Load dataset from HuggingFace hub and convert it to the standard format."""
         if self.data_loaded:
             return

mteb/tasks/clustering/nob/vg_hierarchical_clustering.py
CHANGED
@@ -45,7 +45,7 @@ class VGHierarchicalClusteringP2P(AbsTaskClustering):
         prompt="Identify the categories (e.g. sports) of given articles in Norwegian",
     )

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         self.dataset = self.dataset.rename_columns(
             {"article": "sentences", "classes": "labels"}
         )
@@ -92,7 +92,7 @@ class VGHierarchicalClusteringS2S(AbsTaskClustering):
         prompt="Identify the categories (e.g. sports) of given articles in Norwegian",
     )

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         self.dataset = self.dataset.rename_columns(
             {"ingress": "sentences", "classes": "labels"}
         )

mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py
CHANGED
@@ -45,7 +45,7 @@ class SugarCrepe(AbsTaskImageTextPairClassification):
         """,
     )

-    def load_data(self, num_proc: int =
+    def load_data(self, num_proc: int | None = None, **kwargs) -> None:
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return

mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py
CHANGED
@@ -175,7 +175,7 @@ class mFollowIRCrossLingual(AbsTaskRetrieval):  # noqa: N801
         """,
     )

-    def load_data(self, num_proc: int =
+    def load_data(self, num_proc: int | None = None, **kwargs) -> None:
         if self.data_loaded:
             return

@@ -243,7 +243,7 @@ class mFollowIR(AbsTaskRetrieval):  # noqa: N801
         """,
     )

-    def load_data(self, num_proc: int =
+    def load_data(self, num_proc: int | None = None, **kwargs) -> None:
         if self.data_loaded:
             return

mteb/tasks/multichoice/eng/cv_bench.py
CHANGED
@@ -123,7 +123,7 @@ class CVBenchCount(AbsTaskRetrieval):
         """,
     )

-    def load_data(self, num_proc: int =
+    def load_data(self, num_proc: int | None = None, **kwargs) -> None:
         self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
             path=self.metadata.dataset["path"],
             splits=self.metadata.eval_splits,
@@ -165,7 +165,7 @@ class CVBenchRelation(AbsTaskRetrieval):
         """,
     )

-    def load_data(self, num_proc: int =
+    def load_data(self, num_proc: int | None = None, **kwargs) -> None:
         self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
             path=self.metadata.dataset["path"],
             splits=self.metadata.eval_splits,
@@ -207,7 +207,7 @@ class CVBenchDepth(AbsTaskRetrieval):
         """,
     )

-    def load_data(self, num_proc: int =
+    def load_data(self, num_proc: int | None = None, **kwargs) -> None:
         self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
             path=self.metadata.dataset["path"],
             splits=self.metadata.eval_splits,
@@ -249,7 +249,7 @@ class CVBenchDistance(AbsTaskRetrieval):
         """,
     )

-    def load_data(self, num_proc: int =
+    def load_data(self, num_proc: int | None = None, **kwargs) -> None:
         self.corpus, self.queries, self.relevant_docs, self.top_ranked = _load_data(
             path=self.metadata.dataset["path"],
             splits=self.metadata.eval_splits,

mteb/tasks/pair_classification/multilingual/pub_chem_wiki_pair_classification.py
CHANGED
@@ -60,7 +60,7 @@ class PubChemWikiPairClassification(AbsTaskPairClassification):
         """,
     )

-    def dataset_transform(self, num_proc: int =
+    def dataset_transform(self, num_proc: int | None = None, **kwargs) -> None:
         _dataset = {}
         for lang in self.hf_subsets:
             _dataset[lang] = {}

mteb/tasks/pair_classification/multilingual/rte3.py
CHANGED
@@ -52,7 +52,7 @@ Dolan, Bill},
         # sum of 4 languages after neutral filtering
     )

-    def load_data(self, num_proc: int =
+    def load_data(self, num_proc: int | None = None, **kwargs) -> None:
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return