mteb 2.7.3__py3-none-any.whl → 2.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +47 -5
- mteb/_evaluators/any_sts_evaluator.py +2 -0
- mteb/_evaluators/clustering_evaluator.py +2 -0
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +8 -1
- mteb/_evaluators/pair_classification_evaluator.py +3 -0
- mteb/_evaluators/retrieval_evaluator.py +3 -0
- mteb/_evaluators/sklearn_evaluator.py +6 -1
- mteb/_evaluators/text/bitext_mining_evaluator.py +2 -0
- mteb/_evaluators/text/summarization_evaluator.py +2 -0
- mteb/_evaluators/zeroshot_classification_evaluator.py +2 -0
- mteb/abstasks/abstask.py +31 -12
- mteb/abstasks/classification.py +10 -3
- mteb/abstasks/clustering.py +6 -2
- mteb/abstasks/clustering_legacy.py +8 -2
- mteb/abstasks/image/image_text_pair_classification.py +6 -2
- mteb/abstasks/multilabel_classification.py +2 -0
- mteb/abstasks/pair_classification.py +8 -2
- mteb/abstasks/retrieval.py +27 -12
- mteb/abstasks/retrieval_dataset_loaders.py +29 -19
- mteb/abstasks/sts.py +10 -3
- mteb/abstasks/text/bitext_mining.py +9 -5
- mteb/abstasks/text/reranking.py +2 -2
- mteb/abstasks/text/summarization.py +2 -1
- mteb/abstasks/zeroshot_classification.py +8 -2
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +41 -2
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/evaluate.py +10 -2
- mteb/models/model_implementations/align_models.py +1 -0
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +2 -0
- mteb/models/model_implementations/ara_models.py +1 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +40 -1
- mteb/models/model_implementations/bica_model.py +1 -0
- mteb/models/model_implementations/blip2_models.py +2 -0
- mteb/models/model_implementations/blip_models.py +8 -0
- mteb/models/model_implementations/bm25.py +10 -5
- mteb/models/model_implementations/bmretriever_models.py +4 -0
- mteb/models/model_implementations/cadet_models.py +1 -0
- mteb/models/model_implementations/cde_models.py +2 -0
- mteb/models/model_implementations/clip_models.py +3 -0
- mteb/models/model_implementations/clips_models.py +3 -0
- mteb/models/model_implementations/codefuse_models.py +5 -0
- mteb/models/model_implementations/codesage_models.py +3 -0
- mteb/models/model_implementations/cohere_models.py +4 -0
- mteb/models/model_implementations/cohere_v.py +5 -0
- mteb/models/model_implementations/colpali_models.py +3 -0
- mteb/models/model_implementations/colqwen_models.py +7 -0
- mteb/models/model_implementations/colsmol_models.py +2 -0
- mteb/models/model_implementations/conan_models.py +1 -0
- mteb/models/model_implementations/dino_models.py +19 -0
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +9 -0
- mteb/models/model_implementations/e5_v.py +1 -0
- mteb/models/model_implementations/eagerworks_models.py +1 -0
- mteb/models/model_implementations/emillykkejensen_models.py +3 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +1 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +9 -0
- mteb/models/model_implementations/facebookai.py +2 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +2 -0
- mteb/models/model_implementations/google_models.py +5 -0
- mteb/models/model_implementations/granite_vision_embedding_models.py +1 -0
- mteb/models/model_implementations/gritlm_models.py +2 -0
- mteb/models/model_implementations/gte_models.py +9 -0
- mteb/models/model_implementations/hinvec_models.py +1 -0
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +2 -0
- mteb/models/model_implementations/jina_clip.py +1 -0
- mteb/models/model_implementations/jina_models.py +7 -0
- mteb/models/model_implementations/kalm_models.py +6 -0
- mteb/models/model_implementations/kblab.py +1 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
- mteb/models/model_implementations/kfst.py +1 -0
- mteb/models/model_implementations/kowshik24_models.py +1 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +1 -0
- mteb/models/model_implementations/listconranker.py +1 -0
- mteb/models/model_implementations/llm2clip_models.py +3 -0
- mteb/models/model_implementations/llm2vec_models.py +8 -0
- mteb/models/model_implementations/mcinext_models.py +3 -0
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +63 -0
- mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +2 -0
- mteb/models/model_implementations/mod_models.py +1 -0
- mteb/models/model_implementations/model2vec_models.py +13 -0
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/nbailab.py +3 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
- mteb/models/model_implementations/nomic_models.py +6 -0
- mteb/models/model_implementations/nomic_models_vision.py +1 -0
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +2 -0
- mteb/models/model_implementations/nvidia_models.py +3 -0
- mteb/models/model_implementations/octen_models.py +2 -0
- mteb/models/model_implementations/openai_models.py +5 -0
- mteb/models/model_implementations/openclip_models.py +8 -0
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
- mteb/models/model_implementations/ops_moa_models.py +2 -0
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
- mteb/models/model_implementations/pawan_models.py +1 -0
- mteb/models/model_implementations/piccolo_models.py +2 -0
- mteb/models/model_implementations/promptriever_models.py +4 -0
- mteb/models/model_implementations/pylate_models.py +13 -0
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +3 -0
- mteb/models/model_implementations/qzhou_models.py +2 -0
- mteb/models/model_implementations/rasgaard_models.py +1 -0
- mteb/models/model_implementations/reasonir_model.py +65 -0
- mteb/models/model_implementations/repllama_models.py +2 -0
- mteb/models/model_implementations/rerankers_custom.py +3 -0
- mteb/models/model_implementations/rerankers_monot5_based.py +14 -0
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +20 -0
- mteb/models/model_implementations/ruri_models.py +10 -0
- mteb/models/model_implementations/salesforce_models.py +3 -0
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +1 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +18 -0
- mteb/models/model_implementations/shuu_model.py +1 -0
- mteb/models/model_implementations/siglip_models.py +10 -0
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +2 -0
- mteb/models/model_implementations/text2vec_models.py +3 -0
- mteb/models/model_implementations/ua_sentence_models.py +1 -0
- mteb/models/model_implementations/uae_models.py +1 -0
- mteb/models/model_implementations/vdr_models.py +1 -0
- mteb/models/model_implementations/vi_vn_models.py +6 -0
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +2 -0
- mteb/models/model_implementations/voyage_models.py +15 -0
- mteb/models/model_implementations/voyage_v.py +1 -0
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +1 -0
- mteb/models/model_implementations/yuan_models_en.py +1 -0
- mteb/models/model_meta.py +35 -2
- mteb/models/models_protocols.py +4 -0
- mteb/models/search_wrappers.py +12 -0
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +1 -1
- mteb/tasks/clustering/nob/vg_clustering.py +1 -1
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +4 -4
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +42 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -2
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +1 -1
- mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/METADATA +1 -1
- {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/RECORD +434 -413
- {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/WHEEL +0 -0
- {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.3.dist-info → mteb-2.7.5.dist-info}/top_level.txt +0 -0
mteb/models/model_meta.py
CHANGED

@@ -10,6 +10,7 @@ from functools import partial
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal, cast
 
+import numpy as np
 from huggingface_hub import (
     ModelCard,
     get_safetensors_metadata,
@@ -27,6 +28,8 @@ from huggingface_hub.errors import (
     SafetensorsParsingError,
 )
 from pydantic import BaseModel, ConfigDict, field_validator, model_validator
+from sentence_transformers.models import Transformer
+from torch import nn
 from transformers import AutoConfig
 
 from mteb._helpful_enum import HelpfulStrEnum
@@ -99,8 +102,9 @@ class ModelMeta(BaseModel):
         loader: The function that loads the model. If None it assumes that the model is not implemented.
         loader_kwargs: The keyword arguments to pass to the loader function.
         name: The name of the model, ideally the name on huggingface. It should be in the format "organization/model_name".
-        n_parameters: The number of parameters in the model, e.g. 7_000_000 for a 7M parameter model. Can be
-
+        n_parameters: The total number of parameters in the model, e.g. `7_000_000` for a 7M parameter model. Can be none in case the number of parameters is unknown.
+        n_embedding_parameters: The number of parameters used for the embedding layer. Can be None if the number of embedding parameters is not known (e.g. for proprietary models).
+        n_active_parameters_override: The number of active parameters used bu model. Should be used **only** for Mixture of Experts models.
         memory_usage_mb: The memory usage of the model in MB. Can be None if the memory usage is not known (e.g. for proprietary models). To calculate it use the `calculate_memory_usage_mb` method.
         max_tokens: The maximum number of tokens the model can handle. Can be None if the maximum number of tokens is not known (e.g. for proprietary
             models).
@@ -139,6 +143,8 @@ class ModelMeta(BaseModel):
     release_date: StrDate | None
     languages: list[ISOLanguageScript] | None
     n_parameters: int | None
+    n_active_parameters_override: int | None = None
+    n_embedding_parameters: int | None = None
     memory_usage_mb: float | None
     max_tokens: float | None
     embed_dim: int | None
@@ -197,6 +203,16 @@
         """
         return "cross-encoder" in self.model_type
 
+    @property
+    def n_active_parameters(self):
+        """Number of active parameters. Assumed to be `n_parameters - n_embedding_parameters`. Can be overwritten using `n_active_parameters_override` e.g. for MoE models."""
+        if self.n_active_parameters_override is not None:
+            return self.n_active_parameters_override
+
+        if self.n_parameters is not None and self.n_embedding_parameters is not None:
+            return self.n_parameters - self.n_embedding_parameters
+        return None
+
     @field_validator("similarity_fn_name", mode="before")
     @classmethod
     def _validate_similarity_fn_name(cls, value: str) -> ScoringFunction | None:
@@ -389,6 +405,14 @@
             else model.model_card_data.base_model
         )
         meta = cls._from_hub(name, revision, compute_metadata)
+        try:
+            first = model[0]
+
+            if isinstance(first, Transformer):
+                emb = first.auto_model.get_input_embeddings()
+                meta.n_embedding_parameters = int(np.prod(emb.weight.shape))
+        except Exception as e:
+            logger.warning(f"Could not calculate embedding parameters for {name}: {e}")
         meta.revision = model.model_card_data.base_model_revision or meta.revision
         meta.max_tokens = model.max_seq_length
         meta.embed_dim = model.get_sentence_embedding_dimension()
@@ -460,6 +484,15 @@
         from mteb.models import CrossEncoderWrapper
 
         meta = cls._from_hub(model.model.name_or_path, revision, compute_metadata)
+        try:
+            emb = model.model.get_input_embeddings()
+
+            if isinstance(emb, nn.Embedding):
+                meta.n_embedding_parameters = int(np.prod(emb.weight.shape))
+        except Exception as e:
+            logger.warning(
+                f"Could not calculate embedding parameters for {model.model.name_or_path}: {e}"
+            )
         meta.revision = model.config._commit_hash or meta.revision
         meta.loader = CrossEncoderWrapper
         meta.embed_dim = None

mteb/models/models_protocols.py
CHANGED

@@ -32,6 +32,7 @@ class SearchProtocol(Protocol):
         hf_split: str,
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
+        num_proc: int,
     ) -> None:
         """Index the corpus for retrieval.
 
@@ -41,6 +42,7 @@ class SearchProtocol(Protocol):
            hf_split: Split of current task, allows to know some additional information about current split.
            hf_subset: Subset of current task. Similar to `hf_split` to get more information
            encode_kwargs: Additional arguments to pass to the encoder during indexing.
+           num_proc: Number of processes to use for dataloading.
        """
        ...
 
@@ -54,6 +56,7 @@ class SearchProtocol(Protocol):
         top_k: int,
         encode_kwargs: EncodeKwargs,
         top_ranked: TopRankedDocumentsType | None = None,
+        num_proc: int,
     ) -> RetrievalOutputType:
         """Search the corpus using the given queries.
 
@@ -66,6 +69,7 @@ class SearchProtocol(Protocol):
                Passed only from Reranking tasks.
            top_k: Number of top documents to return for each query.
            encode_kwargs: Additional arguments to pass to the encoder during indexing.
+           num_proc: Number of processes to use for dataloading.
 
        Returns:
            Dictionary with query IDs as keys with dict as values, where each value is a mapping of document IDs to their relevance scores.

mteb/models/search_wrappers.py
CHANGED

@@ -59,6 +59,7 @@ class SearchEncoderWrapper:
         hf_split: str,
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
+        num_proc: int = 1,
     ) -> None:
         """Index the corpus for retrieval.
 
@@ -68,6 +69,7 @@ class SearchEncoderWrapper:
            hf_split: Split of current task, allows to know some additional information about current split.
            hf_subset: Subset of current task. Similar to `hf_split` to get more information
            encode_kwargs: Additional arguments to pass to the encoder during indexing.
+           num_proc: Number of processes to use for dataloading.
        """
        # Always retain corpus for potential reranking or fallback flows
        self.task_corpus = corpus
@@ -77,6 +79,7 @@ class SearchEncoderWrapper:
                corpus,
                task_metadata,
                prompt_type=PromptType.document,
+               num_proc=num_proc,
                **encode_kwargs,
            ),
            task_metadata=task_metadata,
@@ -98,6 +101,7 @@ class SearchEncoderWrapper:
         top_k: int,
         encode_kwargs: EncodeKwargs,
         top_ranked: TopRankedDocumentsType | None = None,
+        num_proc: int = 1,
     ) -> RetrievalOutputType:
         """Search the corpus for the given queries.
 
@@ -110,6 +114,7 @@ class SearchEncoderWrapper:
                Passed only from Reranking tasks.
            top_k: Number of top documents to return for each query.
            encode_kwargs: Additional arguments to pass to the encoder during indexing.
+           num_proc: Number of processes to use for dataloading.
 
        Returns:
            Dictionary with query IDs as keys with dict as values, where each value is a mapping of document IDs to their relevance scores.
@@ -121,6 +126,7 @@ class SearchEncoderWrapper:
            queries,
            task_metadata,
            prompt_type=PromptType.query,
+           num_proc=num_proc,
            **encode_kwargs,
        )
 
@@ -479,6 +485,7 @@ class SearchCrossEncoderWrapper:
         hf_split: str,
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
+        num_proc: int = 1,
     ) -> None:
         """Index the corpus for retrieval.
 
@@ -488,6 +495,7 @@ class SearchCrossEncoderWrapper:
            hf_split: Split of current task, allows to know some additional information about current split.
            hf_subset: Subset of current task. Similar to `hf_split` to get more information
            encode_kwargs: Additional arguments to pass to the encoder during indexing.
+           num_proc: Number of processes to use.
        """
        self.task_corpus = corpus
 
@@ -501,6 +509,7 @@ class SearchCrossEncoderWrapper:
         top_k: int,
         encode_kwargs: EncodeKwargs,
         top_ranked: TopRankedDocumentsType | None = None,
+        num_proc: int = 1,
     ) -> RetrievalOutputType:
         """Search the corpus using the given queries.
 
@@ -513,6 +522,7 @@ class SearchCrossEncoderWrapper:
                Passed only from Reranking tasks.
            top_k: Number of top documents to return for each query.
            encode_kwargs: Additional arguments to pass to the encoder during indexing.
+           num_proc: Number of processes to use.
 
        Returns:
            Dictionary with query IDs as keys with dict as values, where each value is a mapping of document IDs to their relevance scores.
@@ -546,12 +556,14 @@ class SearchCrossEncoderWrapper:
            Dataset.from_list(total_queries),
            task_metadata,
            prompt_type=PromptType.document,
+           num_proc=num_proc,
            **encode_kwargs,
        )
        corpus_loader = create_dataloader(
            Dataset.from_list(total_docs),
            task_metadata,
            prompt_type=PromptType.document,
+           num_proc=num_proc,
            **encode_kwargs,
        )
        predictions = self.model.predict(

mteb/tasks/* (bitext mining and classification task files)
CHANGED

@@ -59,7 +59,7 @@ class PubChemSMILESBitextMining(AbsTaskBitextMining):
         """,
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         for subset in self.hf_subsets:
             self.dataset[subset] = self.dataset[subset].rename_columns(
                 COL_MAPPING[subset]
@@ -27,7 +27,7 @@ class SAMSumFa(AbsTaskBitextMining):
         bibtex_citation="",
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_columns(
             {"text": "sentence1", "summary": "sentence2"}
         )
@@ -58,7 +58,7 @@ class SynPerChatbotSumSRetrieval(AbsTaskBitextMining):
         bibtex_citation=""" """,
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_columns(
             {"text": "sentence1", "summary": "sentence2"}
         )
@@ -89,7 +89,7 @@ class SynPerChatbotRAGSumSRetrieval(AbsTaskBitextMining):
         bibtex_citation=""" """,
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_columns(
             {"text": "sentence1", "summary": "sentence2"}
         )
@@ -35,7 +35,7 @@ class NorwegianCourtsBitextMining(AbsTaskBitextMining):
         prompt="Retrieve parallel sentences in Norwegian Bokmål and Nynorsk",
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         # Convert to standard format
         self.dataset = self.dataset.rename_column("nb", "sentence1")
         self.dataset = self.dataset.rename_column("nn", "sentence2")
@@ -32,7 +32,7 @@ class RomaTalesBitextMining(AbsTaskBitextMining):
         bibtex_citation="",
     )
 
-    def load_data(self) -> None:
+    def load_data(self, num_proc: int = 1, **kwargs) -> None:
         """Load dataset from HuggingFace hub and convert it to the standard format."""
         if self.data_loaded:
             return
@@ -44,7 +44,7 @@ class RomaTalesBitextMining(AbsTaskBitextMining):
         self.dataset_transform()
         self.data_loaded = True
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         for lang in self.hf_subsets:
             self.dataset[lang] = self.dataset[lang].rename_columns(
                 {"romani": "sentence1", "hungarian": "sentence2"}
@@ -230,7 +230,7 @@ class WebFAQBitextMiningQuestions(AbsTaskBitextMining):
         """,
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         dataset = {}
         for langs in self.dataset:
             dataset[langs] = {}
@@ -284,7 +284,7 @@ class WebFAQBitextMiningQAs(AbsTaskBitextMining):
         """,
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         dataset = {}
         for langs in self.dataset:
             dataset[langs] = {}
@@ -28,7 +28,7 @@ class OnlineStoreReviewSentimentClassification(AbsTaskClassification):
         superseded_by="OnlineStoreReviewSentimentClassification.v2",
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["train"]
         )
@@ -37,7 +37,7 @@ class RestaurantReviewSentimentClassification(AbsTaskClassification):
         superseded_by="RestaurantReviewSentimentClassification.v2",
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         # labels: 0 negative, 1 positive
         self.dataset = self.dataset.rename_column("polarity", "label")
         self.dataset = self.stratified_subsampling(
@@ -48,7 +48,7 @@ Mubarak, Hamdy},
         superseded_by="TweetSarcasmClassification.v2",
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         # labels: 0 non-sarcastic, 1 sarcastic
         self.dataset = self.dataset.rename_columns(
             {"tweet": "text", "sarcasm": "label"}
@@ -36,7 +36,7 @@ class BengaliHateSpeechClassification(AbsTaskClassification):
         superseded_by="BengaliHateSpeechClassification.v2",
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["train"]
         )
@@ -36,7 +36,7 @@ class BengaliSentimentAnalysis(AbsTaskClassification):
         superseded_by="BengaliSentimentAnalysis.v2",
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["train"]
         )
@@ -37,7 +37,7 @@ class BulgarianStoreReviewSentimentClassfication(AbsTaskClassification):
         """,
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_columns(
             {"Review": "text", "Category": "label"}
         )
@@ -39,7 +39,7 @@ class CSFDCZMovieReviewSentimentClassification(AbsTaskClassification):
     # Increase the samples_per_label in order to improve baseline performance
     samples_per_label = 20
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_columns(
             {"comment": "text", "rating_int": "label"}
         )
@@ -85,7 +85,7 @@ class CSFDCZMovieReviewSentimentClassificationV2(AbsTaskClassification):
     # Increase the samples_per_label in order to improve baseline performance
     samples_per_label = 20
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"], n_samples=2048
         )
@@ -60,7 +60,7 @@ Piperidis, Stelios},
 
     samples_per_label = 16
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         # convert label to a 0/1 label
         labels = self.dataset["train"]["label"]
         lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
@@ -49,7 +49,7 @@ Zesch, Torsten},
         superseded_by="GermanPoliticiansTwitterSentimentClassification.v2",
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_column("majority_sentiment", "label")
 
 
@@ -40,7 +40,7 @@ class DBpediaClassification(AbsTaskClassification):
         superseded_by="DBpediaClassification.v2",
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_column("content", "text")
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["train", "test"]
@@ -85,7 +85,7 @@ class DBpediaClassificationV2(AbsTaskClassification):
         adapted_from=["DBpediaClassification"],
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["train", "test"]
         )
@@ -40,7 +40,7 @@ class ToxicChatClassification(AbsTaskClassification):
         superseded_by="ToxicChatClassification.v2",
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         keep_cols = ["user_input", "toxicity"]
         rename_dict = dict(zip(keep_cols, ["text", "label"]))
         remove_cols = [
@@ -93,7 +93,7 @@ class ToxicChatClassificationV2(AbsTaskClassification):
         adapted_from=["ToxicChatClassification"],
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"]
         )
@@ -42,7 +42,7 @@ class ToxicConversationsClassification(AbsTaskClassification):
 
     samples_per_label = 16
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"]
         )
@@ -88,7 +88,7 @@ class ToxicConversationsClassificationV2(AbsTaskClassification):
 
     samples_per_label = 16
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"]
         )
@@ -83,7 +83,7 @@ class YahooAnswersTopicsClassificationV2(AbsTaskClassification):
 
     samples_per_label = 32
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["train", "test"]
         )
@@ -42,7 +42,7 @@ class YelpReviewFullClassification(AbsTaskClassification):
 
     samples_per_label = 128
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"]
         )
@@ -88,7 +88,7 @@ class YelpReviewFullClassificationV2(AbsTaskClassification):
 
     samples_per_label = 128
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"]
         )
@@ -40,7 +40,7 @@ class EstonianValenceClassification(AbsTaskClassification):
         superseded_by="EstonianValenceClassification.v2",
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_column("paragraph", "text").rename_column(
             "valence", "label"
         )
@@ -602,7 +602,7 @@ class DeepSentiPers(AbsTaskClassification):
     )
     samples_per_label = 32
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_column("review", "text")
 
 
@@ -773,7 +773,7 @@ class NLPTwitterAnalysisClassification(AbsTaskClassification):
     )
     samples_per_label = 32
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_column("tweet", "text")
 
 
@@ -858,7 +858,7 @@ class FaIntentClassification(AbsTaskClassification):
     )
     samples_per_label = 32
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_column("words", "text")
         self.dataset = self.dataset.rename_column("intent_label", "label")
 
@@ -889,7 +889,7 @@ class StyleClassification(AbsTaskClassification):
     )
     samples_per_label = 32
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         mapping = {"formal": 1, "informal": 0}
         self.dataset = self.dataset.map(
             lambda example: {"label": mapping[example["label"]]}
@@ -927,7 +927,7 @@ class PerShopDomainClassification(AbsTaskClassification):
     )
     samples_per_label = 32
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_column("domain", "label")
 
 
@@ -962,5 +962,5 @@ class PerShopIntentClassification(AbsTaskClassification):
     )
     samples_per_label = 32
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_column("Intents & Actions", "label")
@@ -37,7 +37,7 @@ class PersianFoodSentimentClassification(AbsTaskClassification):
         """,
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["validation", "test"]
         )
@@ -36,7 +36,7 @@ class FilipinoShopeeReviewsClassification(AbsTaskClassification):
         """,
     )
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["validation", "test"]
         )