PyPI - mteb - Versions diffs - 2.7.4__py3-none-any.whl → 2.7.5__py3-none-any.whl - Mend

mteb 2.7.4__py3-none-any.whl → 2.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (287) hide show

mteb/abstasks/zeroshot_classification.py CHANGED Viewed

@@ -127,6 +127,7 @@ class AbsTaskZeroShotClassification(AbsTask):
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs,
     ) -> ZeroShotClassificationMetrics:
         if not isinstance(model, EncoderProtocol):
@@ -145,7 +146,11 @@ class AbsTaskZeroShotClassification(AbsTask):
             hf_subset=hf_subset,
             **kwargs,
         )
-        probs = evaluator(model, encode_kwargs=encode_kwargs)
+        probs = evaluator(
+            model,
+            encode_kwargs=encode_kwargs,
+            num_proc=num_proc,
+        )
         if prediction_folder:
             self._save_task_predictions(
@@ -170,13 +175,14 @@ class AbsTaskZeroShotClassification(AbsTask):
             accuracy=metrics.accuracy_score(labels, predictions),
         )
-    def _push_dataset_to_hub(self, repo_name: str) -> None:
+    def _push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
         self._upload_dataset_to_hub(
             repo_name,
             [
                 self.input_column_name,
                 self.label_column_name,
             ],
+            num_proc=num_proc,
         )
         labels_dataset = Dataset.from_dict({"labels": self.get_candidate_labels()})
         labels_dataset.push_to_hub(repo_name, config_name="labels")

mteb/evaluate.py CHANGED Viewed

@@ -125,6 +125,7 @@ def _evaluate_task(
                 co2_tracker=False,
                 prediction_folder=prediction_folder,
                 public_only=public_only,
+                num_proc=num_proc,
             )
         if isinstance(result, TaskResult):
             result.kg_co2_emissions = tracker.final_emissions
@@ -137,7 +138,7 @@ def _evaluate_task(
     data_preloaded = task.data_loaded
     if not data_preloaded:
         try:
-            task.load_data()
+            task.load_data(num_proc=num_proc)
         except DatasetNotFoundError as e:
             if not task.metadata.is_public and public_only is None:
                 msg = (
@@ -163,6 +164,7 @@ def _evaluate_task(
             subsets_to_run=hf_subsets,
             encode_kwargs=encode_kwargs,
             prediction_folder=prediction_folder,
+            num_proc=num_proc,
         )
         tock = time()
@@ -280,6 +282,7 @@ def evaluate(
     prediction_folder: Path | str | None = None,
     show_progress_bar: bool = True,
     public_only: bool | None = None,
+    num_proc: int = 1,
 ) -> ModelResult:
     """This function runs a model on a given task and returns the results.
@@ -288,7 +291,7 @@ def evaluate(
         tasks: A task to run.
         co2_tracker: If True, track the CO₂ emissions of the evaluation, required codecarbon to be installed, which can be installed using
             `pip install mteb[codecarbon]`. If none is passed co2 tracking will only be run if codecarbon is installed.
-        encode_kwargs: Additional keyword arguments passed to the models `encode` method.
+        encode_kwargs: Additional keyword arguments passed to the models `encode` and `load_data` methods;
         raise_error: If True, raise an error if the task fails. If False, return an empty list.
         cache: The cache to use for loading the results. If None, then no cache will be used. The default cache saved the cache in the
             `~/.cache/mteb` directory. It can be overridden by setting the `MTEB_CACHE` environment variable to a different directory or by directly
@@ -304,6 +307,7 @@ def evaluate(
         show_progress_bar: Whether to show a progress bar when running the evaluation. Default is True. Setting this to False will also set the
             `encode_kwargs['show_progress_bar']` to False if encode_kwargs is unspecified.
         public_only: Run only public tasks. If None, it will attempt to run the private task.
+        num_proc: Number of processes to use during data loading and transformation. Defaults to 1.
     Returns:
         The results of the evaluation.
@@ -356,6 +360,7 @@ def evaluate(
             prediction_folder=prediction_folder,
             show_progress_bar=show_progress_bar,
             public_only=public_only,
+            num_proc=num_proc,
         )
         combined_results = aggregated_task.combine_task_results(results.task_results)
         return ModelResult(
@@ -388,6 +393,7 @@ def evaluate(
                 prediction_folder=prediction_folder,
                 show_progress_bar=False,
                 public_only=public_only,
+                num_proc=num_proc,
             )
             evaluate_results.extend(_res.task_results)
             if _res.exceptions:
@@ -467,6 +473,7 @@ def evaluate(
                 encode_kwargs=encode_kwargs,
                 prediction_folder=prediction_folder,
                 public_only=public_only,
+                num_proc=num_proc,
             )
         except Exception as e:
             logger.error(
@@ -482,6 +489,7 @@ def evaluate(
             encode_kwargs=encode_kwargs,
             prediction_folder=prediction_folder,
             public_only=public_only,
+            num_proc=num_proc,
         )
     logger.info(f"✓ Finished evaluation for {task.metadata.name}")

mteb/models/model_implementations/bm25.py CHANGED Viewed

@@ -54,6 +54,7 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
             hf_split: str,
             hf_subset: str,
             encode_kwargs: EncodeKwargs,
+            num_proc: int = 1,
         ) -> None:
             logger.info("Encoding Corpus...")
             corpus_texts = [
@@ -80,6 +81,7 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
             top_k: int,
             encode_kwargs: EncodeKwargs,
             top_ranked: TopRankedDocumentsType | None = None,
+            num_proc: int = 1,
         ) -> RetrievalOutputType:
             logger.info("Encoding Queries...")
             query_ids = list(queries["id"])

mteb/models/model_implementations/pylate_models.py CHANGED Viewed

@@ -53,6 +53,7 @@ class PylateSearchEncoder:
         hf_split: str,
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
+        num_proc: int,
     ) -> None:
         """Index the corpus for retrieval.
@@ -62,6 +63,7 @@ class PylateSearchEncoder:
             hf_split: Split of current task, allows to know some additional information about current split.
             hf_subset: Subset of current task. Similar to `hf_split` to get more information
             encode_kwargs: Additional arguments to pass to the encoder during indexing.
+            num_proc: Number of processes to use for indexing.
         """
         self.task_corpus = corpus
@@ -87,12 +89,14 @@ class PylateSearchEncoder:
         top_k: int,
         encode_kwargs: EncodeKwargs,
         top_ranked: TopRankedDocumentsType | None = None,
+        num_proc: int,
     ) -> RetrievalOutputType:
         queries_dataloader = create_dataloader(
             queries,
             task_metadata,
             prompt_type=PromptType.query,
             batch_size=encode_kwargs.get("batch_size", 32),
+            num_proc=num_proc,
         )
         query_embeddings = self.encode(
@@ -116,6 +120,7 @@ class PylateSearchEncoder:
                 hf_subset=hf_subset,
                 hf_split=hf_split,
                 encode_kwargs=encode_kwargs,
+                num_proc=num_proc,
             )
         else:
             result_heaps = self._pylate_full_corpus_search(
@@ -126,6 +131,7 @@ class PylateSearchEncoder:
                 hf_subset=hf_subset,
                 hf_split=hf_split,
                 encode_kwargs=encode_kwargs,
+                num_proc=num_proc,
             )
         results = {qid: {} for qid in query_idx_to_id.values()}
@@ -144,6 +150,7 @@ class PylateSearchEncoder:
         hf_split: str,
         top_k: int,
         encode_kwargs: EncodeKwargs,
+        num_proc: int,
     ) -> dict[str, list[tuple[float, str]]]:
         from pylate import indexes, retrieve
@@ -170,6 +177,7 @@ class PylateSearchEncoder:
             task_metadata,
             prompt_type=PromptType.document,
             batch_size=encode_kwargs.get("batch_size", 32),
+            num_proc=num_proc,
         )
         documents_embeddings = self.encode(
             documents_loader,
@@ -208,6 +216,7 @@ class PylateSearchEncoder:
         hf_subset: str,
         hf_split: str,
         encode_kwargs: EncodeKwargs,
+        num_proc: int = 1,
     ) -> dict[str, list[tuple[float, str]]]:
         """Rerank with PyLate's rank.rerank using per-query candidates.
@@ -230,6 +239,7 @@ class PylateSearchEncoder:
                 task_metadata,
                 prompt_type=PromptType.document,
                 batch_size=encode_kwargs.get("batch_size", 32),
+                num_proc=num_proc,
             ),
             task_metadata=task_metadata,
             hf_split=hf_split,

mteb/models/models_protocols.py CHANGED Viewed

@@ -32,6 +32,7 @@ class SearchProtocol(Protocol):
         hf_split: str,
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
+        num_proc: int,
     ) -> None:
         """Index the corpus for retrieval.
@@ -41,6 +42,7 @@ class SearchProtocol(Protocol):
             hf_split: Split of current task, allows to know some additional information about current split.
             hf_subset: Subset of current task. Similar to `hf_split` to get more information
             encode_kwargs: Additional arguments to pass to the encoder during indexing.
+            num_proc: Number of processes to use for dataloading.
         """
         ...
@@ -54,6 +56,7 @@ class SearchProtocol(Protocol):
         top_k: int,
         encode_kwargs: EncodeKwargs,
         top_ranked: TopRankedDocumentsType | None = None,
+        num_proc: int,
     ) -> RetrievalOutputType:
         """Search the corpus using the given queries.
@@ -66,6 +69,7 @@ class SearchProtocol(Protocol):
                 Passed only from Reranking tasks.
             top_k: Number of top documents to return for each query.
             encode_kwargs: Additional arguments to pass to the encoder during indexing.
+            num_proc: Number of processes to use for dataloading.
         Returns:
             Dictionary with query IDs as keys with dict as values, where each value is a mapping of document IDs to their relevance scores.

mteb/models/search_wrappers.py CHANGED Viewed

@@ -59,6 +59,7 @@ class SearchEncoderWrapper:
         hf_split: str,
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
+        num_proc: int = 1,
     ) -> None:
         """Index the corpus for retrieval.
@@ -68,6 +69,7 @@ class SearchEncoderWrapper:
             hf_split: Split of current task, allows to know some additional information about current split.
             hf_subset: Subset of current task. Similar to `hf_split` to get more information
             encode_kwargs: Additional arguments to pass to the encoder during indexing.
+            num_proc: Number of processes to use for dataloading.
         """
         # Always retain corpus for potential reranking or fallback flows
         self.task_corpus = corpus
@@ -77,6 +79,7 @@ class SearchEncoderWrapper:
                     corpus,
                     task_metadata,
                     prompt_type=PromptType.document,
+                    num_proc=num_proc,
                     **encode_kwargs,
                 ),
                 task_metadata=task_metadata,
@@ -98,6 +101,7 @@ class SearchEncoderWrapper:
         top_k: int,
         encode_kwargs: EncodeKwargs,
         top_ranked: TopRankedDocumentsType | None = None,
+        num_proc: int = 1,
     ) -> RetrievalOutputType:
         """Search the corpus for the given queries.
@@ -110,6 +114,7 @@ class SearchEncoderWrapper:
                 Passed only from Reranking tasks.
             top_k: Number of top documents to return for each query.
             encode_kwargs: Additional arguments to pass to the encoder during indexing.
+            num_proc: Number of processes to use for dataloading.
         Returns:
             Dictionary with query IDs as keys with dict as values, where each value is a mapping of document IDs to their relevance scores.
@@ -121,6 +126,7 @@ class SearchEncoderWrapper:
             queries,
             task_metadata,
             prompt_type=PromptType.query,
+            num_proc=num_proc,
             **encode_kwargs,
         )
@@ -479,6 +485,7 @@ class SearchCrossEncoderWrapper:
         hf_split: str,
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
+        num_proc: int = 1,
     ) -> None:
         """Index the corpus for retrieval.
@@ -488,6 +495,7 @@ class SearchCrossEncoderWrapper:
             hf_split: Split of current task, allows to know some additional information about current split.
             hf_subset: Subset of current task. Similar to `hf_split` to get more information
             encode_kwargs: Additional arguments to pass to the encoder during indexing.
+            num_proc: Number of processes to use.
         """
         self.task_corpus = corpus
@@ -501,6 +509,7 @@ class SearchCrossEncoderWrapper:
         top_k: int,
         encode_kwargs: EncodeKwargs,
         top_ranked: TopRankedDocumentsType | None = None,
+        num_proc: int = 1,
     ) -> RetrievalOutputType:
         """Search the corpus using the given queries.
@@ -513,6 +522,7 @@ class SearchCrossEncoderWrapper:
                 Passed only from Reranking tasks.
             top_k: Number of top documents to return for each query.
             encode_kwargs: Additional arguments to pass to the encoder during indexing.
+            num_proc: Number of processes to use.
         Returns:
             Dictionary with query IDs as keys with dict as values, where each value is a mapping of document IDs to their relevance scores.
@@ -546,12 +556,14 @@ class SearchCrossEncoderWrapper:
             Dataset.from_list(total_queries),
             task_metadata,
             prompt_type=PromptType.document,
+            num_proc=num_proc,
             **encode_kwargs,
         )
         corpus_loader = create_dataloader(
             Dataset.from_list(total_docs),
             task_metadata,
             prompt_type=PromptType.document,
+            num_proc=num_proc,
             **encode_kwargs,
         )
         predictions = self.model.predict(

mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py CHANGED Viewed

@@ -59,7 +59,7 @@ class PubChemSMILESBitextMining(AbsTaskBitextMining):
 """,
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         for subset in self.hf_subsets:
             self.dataset[subset] = self.dataset[subset].rename_columns(
                 COL_MAPPING[subset]

mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py CHANGED Viewed

@@ -27,7 +27,7 @@ class SAMSumFa(AbsTaskBitextMining):
         bibtex_citation="",
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_columns(
             {"text": "sentence1", "summary": "sentence2"}
         )
@@ -58,7 +58,7 @@ class SynPerChatbotSumSRetrieval(AbsTaskBitextMining):
         bibtex_citation=""" """,
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_columns(
             {"text": "sentence1", "summary": "sentence2"}
         )
@@ -89,7 +89,7 @@ class SynPerChatbotRAGSumSRetrieval(AbsTaskBitextMining):
         bibtex_citation=""" """,
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_columns(
             {"text": "sentence1", "summary": "sentence2"}
         )

mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py CHANGED Viewed

@@ -60,7 +60,7 @@ Rapp, Reinhard},
         superseded_by="BUCC.v2",
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         dataset = {}
         for lang in self.dataset:
             dataset[lang] = {}

mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py CHANGED Viewed

@@ -265,7 +265,7 @@ class FloresBitextMining(AbsTaskBitextMining):
 """,
     )
-    def load_data(self) -> None:
+    def load_data(self, num_proc: int = 1, **kwargs) -> None:
         if self.data_loaded:
             return

mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py CHANGED Viewed

@@ -99,7 +99,7 @@ class IN22ConvBitextMining(AbsTaskBitextMining):
 """,
     )
-    def load_data(self) -> None:
+    def load_data(self, num_proc: int = 1, **kwargs) -> None:
         if self.data_loaded:
             return

mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py CHANGED Viewed

@@ -93,7 +93,7 @@ class IN22GenBitextMining(AbsTaskBitextMining):
 """,
     )
-    def load_data(self) -> None:
+    def load_data(self, num_proc: int = 1, **kwargs) -> None:
         if self.data_loaded:
             return

mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py CHANGED Viewed

@@ -35,7 +35,7 @@ class NorwegianCourtsBitextMining(AbsTaskBitextMining):
         prompt="Retrieve parallel sentences in Norwegian Bokmål and Nynorsk",
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         # Convert to standard format
         self.dataset = self.dataset.rename_column("nb", "sentence1")
         self.dataset = self.dataset.rename_column("nn", "sentence2")

mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py CHANGED Viewed

@@ -280,7 +280,7 @@ class NTREXBitextMining(AbsTaskBitextMining):
 """,
     )
-    def load_data(self) -> None:
+    def load_data(self, num_proc: int = 1, **kwargs) -> None:
         if self.data_loaded:
             return

mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py CHANGED Viewed

@@ -32,7 +32,7 @@ class RomaTalesBitextMining(AbsTaskBitextMining):
         bibtex_citation="",
     )
-    def load_data(self) -> None:
+    def load_data(self, num_proc: int = 1, **kwargs) -> None:
         """Load dataset from HuggingFace hub and convert it to the standard format."""
         if self.data_loaded:
             return
@@ -44,7 +44,7 @@ class RomaTalesBitextMining(AbsTaskBitextMining):
         self.dataset_transform()
         self.data_loaded = True
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         for lang in self.hf_subsets:
             self.dataset[lang] = self.dataset[lang].rename_columns(
                 {"romani": "sentence1", "hungarian": "sentence2"}

mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py CHANGED Viewed

@@ -230,7 +230,7 @@ class WebFAQBitextMiningQuestions(AbsTaskBitextMining):
 """,
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         dataset = {}
         for langs in self.dataset:
             dataset[langs] = {}
@@ -284,7 +284,7 @@ class WebFAQBitextMiningQAs(AbsTaskBitextMining):
 """,
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         dataset = {}
         for langs in self.dataset:
             dataset[langs] = {}

mteb/tasks/classification/ara/online_store_review_sentiment_classification.py CHANGED Viewed

@@ -28,7 +28,7 @@ class OnlineStoreReviewSentimentClassification(AbsTaskClassification):
         superseded_by="OnlineStoreReviewSentimentClassification.v2",
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["train"]
         )

mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py CHANGED Viewed

@@ -37,7 +37,7 @@ class RestaurantReviewSentimentClassification(AbsTaskClassification):
         superseded_by="RestaurantReviewSentimentClassification.v2",
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         # labels: 0 negative, 1 positive
         self.dataset = self.dataset.rename_column("polarity", "label")
         self.dataset = self.stratified_subsampling(

mteb/tasks/classification/ara/tweet_sarcasm_classification.py CHANGED Viewed

@@ -48,7 +48,7 @@ Mubarak, Hamdy},
         superseded_by="TweetSarcasmClassification.v2",
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         # labels: 0 non-sarcastic, 1 sarcastic
         self.dataset = self.dataset.rename_columns(
             {"tweet": "text", "sarcasm": "label"}

mteb/tasks/classification/ben/bengali_hate_speech_classification.py CHANGED Viewed

@@ -36,7 +36,7 @@ class BengaliHateSpeechClassification(AbsTaskClassification):
         superseded_by="BengaliHateSpeechClassification.v2",
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["train"]
         )

mteb/tasks/classification/ben/bengali_sentiment_analysis.py CHANGED Viewed

@@ -36,7 +36,7 @@ class BengaliSentimentAnalysis(AbsTaskClassification):
         superseded_by="BengaliSentimentAnalysis.v2",
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["train"]
         )

mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py CHANGED Viewed

@@ -37,7 +37,7 @@ class BulgarianStoreReviewSentimentClassfication(AbsTaskClassification):
 """,
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_columns(
             {"Review": "text", "Category": "label"}
         )

mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py CHANGED Viewed

@@ -39,7 +39,7 @@ class CSFDCZMovieReviewSentimentClassification(AbsTaskClassification):
     # Increase the samples_per_label in order to improve baseline performance
     samples_per_label = 20
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_columns(
             {"comment": "text", "rating_int": "label"}
         )
@@ -85,7 +85,7 @@ class CSFDCZMovieReviewSentimentClassificationV2(AbsTaskClassification):
     # Increase the samples_per_label in order to improve baseline performance
     samples_per_label = 20
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"], n_samples=2048
         )

mteb/tasks/classification/dan/ddisco_cohesion_classification.py CHANGED Viewed

@@ -56,7 +56,7 @@ Piperidis, Stelios},
         superseded_by="Ddisco.v2",
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_columns({"rating": "label"}).remove_columns(
             ["domain"]
         )

mteb/tasks/classification/dan/dk_hate_classification.py CHANGED Viewed

@@ -60,7 +60,7 @@ Piperidis, Stelios},
     samples_per_label = 16
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         # convert label to a 0/1 label
         labels = self.dataset["train"]["label"]
         lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}

mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py CHANGED Viewed

@@ -49,7 +49,7 @@ Zesch, Torsten},
         superseded_by="GermanPoliticiansTwitterSentimentClassification.v2",
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_column("majority_sentiment", "label")

mteb/tasks/classification/ell/greek_legal_code_classification.py CHANGED Viewed

@@ -42,7 +42,7 @@ class GreekLegalCodeClassification(AbsTaskClassification):
 """,
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset["validation"] = (
             self.dataset["validation"]
             .shuffle(seed=self.seed)

mteb/tasks/classification/eng/dbpedia_classification.py CHANGED Viewed

@@ -40,7 +40,7 @@ class DBpediaClassification(AbsTaskClassification):
         superseded_by="DBpediaClassification.v2",
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.dataset.rename_column("content", "text")
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["train", "test"]
@@ -85,7 +85,7 @@ class DBpediaClassificationV2(AbsTaskClassification):
         adapted_from=["DBpediaClassification"],
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["train", "test"]
         )

mteb/tasks/classification/eng/toxic_chat_classification.py CHANGED Viewed

@@ -40,7 +40,7 @@ class ToxicChatClassification(AbsTaskClassification):
         superseded_by="ToxicChatClassification.v2",
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         keep_cols = ["user_input", "toxicity"]
         rename_dict = dict(zip(keep_cols, ["text", "label"]))
         remove_cols = [
@@ -93,7 +93,7 @@ class ToxicChatClassificationV2(AbsTaskClassification):
         adapted_from=["ToxicChatClassification"],
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"]
         )

mteb/tasks/classification/eng/toxic_conversations_classification.py CHANGED Viewed

@@ -42,7 +42,7 @@ class ToxicConversationsClassification(AbsTaskClassification):
     samples_per_label = 16
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"]
         )
@@ -88,7 +88,7 @@ class ToxicConversationsClassificationV2(AbsTaskClassification):
     samples_per_label = 16
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"]
         )

mteb/tasks/classification/eng/tweet_topic_single_classification.py CHANGED Viewed

@@ -43,7 +43,7 @@ Barbieri, Francesco},
         superseded_by="TweetTopicSingleClassification.v2",
     )
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         self.dataset["train"] = self.dataset["train_2021"]

mteb 2.7.4__py3-none-any.whl → 2.7.5__py3-none-any.whl

mteb 2.7.4__py3-none-any.whl → 2.7.5__py3-none-any.whl