PyPI - XspecT - Versions diffs - 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

XspecT 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of XspecT might be problematic. Click here for more details.

Files changed (33) hide show

xspect/classify.py +61 -13
xspect/definitions.py +61 -13
xspect/download_models.py +10 -2
xspect/file_io.py +115 -48
xspect/filter_sequences.py +81 -29
xspect/main.py +90 -39
xspect/mlst_feature/mlst_helper.py +3 -0
xspect/mlst_feature/pub_mlst_handler.py +43 -1
xspect/model_management.py +84 -14
xspect/models/probabilistic_filter_mlst_model.py +75 -37
xspect/models/probabilistic_filter_model.py +201 -19
xspect/models/probabilistic_filter_svm_model.py +106 -13
xspect/models/probabilistic_single_filter_model.py +73 -9
xspect/models/result.py +77 -10
xspect/ncbi.py +48 -12
xspect/train.py +19 -11
xspect/web.py +68 -12
xspect/xspect-web/dist/assets/index-Ceo58xui.css +1 -0
xspect/xspect-web/dist/assets/{index-CMG4V7fZ.js → index-Dt_UlbgE.js} +82 -77
xspect/xspect-web/dist/index.html +2 -2
xspect/xspect-web/src/App.tsx +4 -2
xspect/xspect-web/src/api.tsx +23 -1
xspect/xspect-web/src/components/filter-form.tsx +16 -3
xspect/xspect-web/src/components/filtering-result.tsx +65 -0
xspect/xspect-web/src/components/result.tsx +2 -2
xspect/xspect-web/src/types.tsx +5 -0
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/METADATA +11 -5
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/RECORD +32 -31
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/WHEEL +1 -1
xspect/xspect-web/dist/assets/index-jIKg1HIy.css +0 -1
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/entry_points.txt +0 -0
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/licenses/LICENSE +0 -0
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/top_level.txt +0 -0

xspect/models/probabilistic_filter_svm_model.py CHANGED Viewed

@@ -15,23 +15,51 @@ from xspect.models.result import ModelResult
 class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
-    """Probabilistic filter SVM model for sequence data"""
+    """
+    Probabilistic filter SVM model for sequence data
+    In addition to the standard probabilistic filter model, this model uses an SVM to predict
+    labels based on their scores and training data. It requires the `scikit-learn` library
+    to be installed.
+    """
     def __init__(
         self,
         k: int,
         model_display_name: str,
-        author: str,
-        author_email: str,
+        author: str | None,
+        author_email: str | None,
         model_type: str,
         base_path: Path,
         kernel: str,
         c: float,
         fpr: float = 0.01,
         num_hashes: int = 7,
-        training_accessions: dict[str, list[str]] = None,
-        svm_accessions: dict[str, list[str]] = None,
+        training_accessions: dict[str, list[str]] | None = None,
+        svm_accessions: dict[str, list[str]] | None = None,
     ) -> None:
+        """
+        Initialize the SVM model with the given parameters.
+        In addition to the standard parameters, this model uses an SVM.
+        Therefore, it requires the `kernel` and `C` parameters to be set.
+        Furthermore, the `svm_accessions` parameter is used to store which accessions
+        are used for training the SVM.
+        Args:
+            k (int): The k-mer size for the probabilistic filter.
+            model_display_name (str): The display name of the model.
+            author (str | None): The author of the model.
+            author_email (str | None): The author's email address.
+            model_type (str): The type of the model.
+            base_path (Path): The base path where the model will be stored.
+            kernel (str): The kernel type for the SVM (e.g., 'linear', 'rbf').
+            c (float): Regularization parameter for the SVM.
+            fpr (float, optional): False positive rate for the probabilistic filter. Defaults to 0.01.
+            num_hashes (int, optional): Number of hashes for the probabilistic filter. Defaults to 7.
+            training_accessions (dict[str, list[str]] | None, optional): Accessions used for training the probabilistic filter. Defaults to None.
+            svm_accessions (dict[str, list[str]] | None, optional): Accessions used for training the SVM. Defaults to None.
+        """
         super().__init__(
             k=k,
             model_display_name=model_display_name,
@@ -48,6 +76,12 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         self.svm_accessions = svm_accessions
     def to_dict(self) -> dict:
+        """
+        Convert the model to a dictionary representation
+        Returns:
+            dict: A dictionary containing the model's parameters and state.
+        """
         return super().to_dict() | {
             "kernel": self.kernel,
             "C": self.c,
@@ -55,7 +89,13 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         }
     def set_svm_params(self, kernel: str, c: float) -> None:
-        """Set the parameters for the SVM"""
+        """
+        Set the parameters for the SVM
+        Args:
+            kernel (str): The kernel type for the SVM (e.g., 'linear', 'rbf').
+            c (float): Regularization parameter for the SVM.
+        """
         self.kernel = kernel
         self.c = c
         self.save()
@@ -64,12 +104,27 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         self,
         dir_path: Path,
         svm_path: Path,
-        display_names: dict = None,
+        display_names: dict[str, str] | None = None,
         svm_step: int = 1,
-        training_accessions: list[str] = None,
-        svm_accessions: list[str] = None,
+        training_accessions: dict[str, list[str]] | None = None,
+        svm_accessions: dict[str, list[str]] | None = None,
     ) -> None:
-        """Fit the SVM to the sequences and labels"""
+        """
+        Fit the SVM to the sequences and labels.
+        This method first trains the probabilistic filter model and then
+        calculates scores for the SVM training. It expects the sequences to be in
+        the specified directory and the SVM training sequences to be in the
+        specified SVM path. The scores are saved in a CSV file for later use.
+        Args:
+            dir_path (Path): The directory containing the training sequences.
+            svm_path (Path): The directory containing the SVM training sequences.
+            display_names (dict[str, str] | None): A mapping of accession IDs to display names.
+            svm_step (int): Step size for sparse sampling in SVM training.
+            training_accessions (dict[str, list[str]] | None): Accessions used for training the probabilistic filter.
+            svm_accessions (dict[str, list[str]] | None): Accessions used for training the SVM.
+        """
         # Since the SVM works with score data, we need to train
         # the underlying data structure for score generation first
@@ -124,7 +179,21 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         filter_ids: list[str] = None,
         step: int = 1,
     ) -> ModelResult:
-        """Predict the labels of the sequences"""
+        """
+        Predict the labels of the sequences.
+        This method uses the SVM to predict labels based on the scores generated
+        from the sequences. It expects the sequences to be in a format compatible
+        with the probabilistic filter model, and it will return a `ModelResult`.
+        Args:
+            sequence_input (SeqRecord | list[SeqRecord] | SeqIO.FastaIO.FastaIterator | SeqIO.QualityIO.FastqPhredIterator | Path): The input sequences to predict.
+            filter_ids (list[str], optional): A list of IDs to filter the predictions. Defaults to None.
+            step (int, optional): Step size for sparse sampling. Defaults to 1.
+        Returns:
+            ModelResult: The result of the prediction containing hits, number of kmers, and the predicted label.
+        """
         # get scores and format them for the SVM
         res = super().predict(sequence_input, filter_ids, step=step)
         svm_scores = dict(sorted(res.get_scores()["total"].items()))
@@ -140,7 +209,19 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         )
     def _get_svm(self, id_keys) -> SVC:
-        """Get the SVM for the given id keys"""
+        """
+        Get the SVM for the given id keys.
+        This method loads the SVM model from the scores CSV file and trains it
+        using the scores from the CSV. If `id_keys` is provided, it filters the
+        training data to only include those keys.
+        Args:
+            id_keys (list[str] | None): A list of IDs to filter the training data. If None, all data is used.
+        Returns:
+            SVC: The trained SVM model.
+        """
         svm = SVC(kernel=self.kernel, C=self.c)
         # parse csv
         with open(
@@ -160,7 +241,19 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
     @staticmethod
     def load(path: Path) -> "ProbabilisticFilterSVMModel":
-        """Load the model from disk"""
+        """
+        Load the model from disk
+        Loads the model from the specified path. The path should point to a JSON file
+        containing the model's parameters and state. It also checks for the existence of
+        the COBS index file.
+        Args:
+            path (Path): The path to the model JSON file.
+        Returns:
+            ProbabilisticFilterSVMModel: The loaded model instance.
+        """
         with open(path, "r", encoding="utf-8") as file:
             json_object = file.read()
             model_json = json.loads(json_object)

xspect/models/probabilistic_single_filter_model.py CHANGED Viewed

@@ -14,19 +14,39 @@ from xspect.file_io import get_record_iterator
 class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
-    """Base probabilistic filter model for sequence data"""
+    """
+    Probabilistic filter model for sequence data, with a single filter
+    This model uses a Bloom filter to store k-mers from the training sequences. It is designed to
+    be used with a single filter, which is suitable e. g. for genus-level classification.
+    """
     def __init__(
         self,
         k: int,
         model_display_name: str,
-        author: str,
-        author_email: str,
+        author: str | None,
+        author_email: str | None,
         model_type: str,
         base_path: Path,
         fpr: float = 0.01,
-        training_accessions: list[str] = None,
+        training_accessions: list[str] | None = None,
     ) -> None:
+        """Initialize probabilistic single filter model.
+        This model uses a Bloom filter to store k-mers from the training sequences. It is designed to
+        be used with a single filter, which is suitable e.g. for genus-level classification.
+        Args:
+            k (int): Length of the k-mers to use for filtering
+            model_display_name (str): Display name of the model
+            author (str | None): Author of the model
+            author_email (str | None): Email of the author
+            model_type (str): Type of the model, e.g. "probabilistic_single_filter"
+            base_path (Path): Base path where the model will be saved
+            fpr (float): False positive rate for the Bloom filter, default is 0.01
+            training_accessions (list[str] | None): List of accessions used for training, default is None
+        """
         super().__init__(
             k=k,
             model_display_name=model_display_name,
@@ -41,9 +61,22 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
         self.bf = None
     def fit(
-        self, file_path: Path, display_name: str, training_accessions: list[str] = None
+        self,
+        file_path: Path,
+        display_name: str,
+        training_accessions: list[str] | None = None,
     ) -> None:
-        """Fit the cobs classic index to the sequences and labels"""
+        """
+        Fit the bloom filter to the sequences.
+        Trains the model by reading sequences from the provided file path,
+        generating k-mers, and adding them to the Bloom filter.
+        Args:
+            file_path (Path): Path to the file containing sequences in FASTA format
+            display_name (str): Display name for the model
+            training_accessions (list[str] | None): List of accessions used for training, default is None
+        """
         self.training_accessions = training_accessions
         # estimate number of kmers
@@ -65,7 +98,18 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
     def calculate_hits(
         self, sequence: Seq | SeqRecord, filter_ids=None, step: int = 1
     ) -> dict:
-        """Calculate the hits for the sequence"""
+        """
+        Calculate the hits for the sequence
+        Calculates the number of k-mers in the sequence that are present in the Bloom filter.
+        Args:
+            sequence (Seq | SeqRecord): Sequence to calculate hits for, can be a Bio.Seq or Bio.SeqRecord object
+            filter_ids (list[str] | None): List of filter IDs to use, default is None
+            step (int): Step size for generating k-mers, default is 1
+        Returns:
+            dict: Dictionary with the display name as key and the number of hits as value
+        """
         if isinstance(sequence, SeqRecord):
             sequence = sequence.seq
@@ -82,7 +126,17 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
     @staticmethod
     def load(path: Path) -> "ProbabilisticSingleFilterModel":
-        """Load the model from disk"""
+        """
+        Load the model from disk
+        This method reads the model's JSON file and the associated Bloom filter file,
+        reconstructing the model instance.
+        Args:
+            path (Path): Path to the model directory containing the JSON file
+        Returns:
+            ProbabilisticSingleFilterModel: An instance of the model loaded from disk
+        """
         with open(path, "r", encoding="utf-8") as file:
             json_object = file.read()
             model_json = json.loads(json_object)
@@ -105,7 +159,17 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
             return model
     def _generate_kmers(self, sequence: Seq, step: int = 1):
-        """Generate kmers from the sequence"""
+        """
+        Generate kmers from the sequence
+        Generates k-mers from the sequence, considering both the forward and reverse complement strands.
+        Args:
+            sequence (Seq): Sequence to generate k-mers from
+            step (int): Step size for generating k-mers, default is 1
+        Yields:
+            str: The minimizer k-mer (the lexicographically smallest k-mer between the forward and reverse complement)
+        """
         num_kmers = ceil((len(sequence) - self.k + 1) / step)
         for i in range(num_kmers):
             start_pos = i * step

xspect/models/result.py CHANGED Viewed

@@ -14,9 +14,22 @@ class ModelResult:
         hits: dict[str, dict[str, int]],
         num_kmers: dict[str, int],
         sparse_sampling_step: int = 1,
-        prediction: str = None,
-        input_source: str = None,
+        prediction: str | None = None,
+        input_source: str | None = None,
     ):
+        """
+        Initialize the ModelResult object.
+        Args:
+            model_slug (str): The slug of the model.
+            hits (dict[str, dict[str, int]]): A dictionary where keys are subsequence names
+                and values are dictionaries with labels as keys and hit counts as values.
+            num_kmers (dict[str, int]): A dictionary where keys are subsequence names
+                and values are the total number of k-mers for that subsequence.
+            sparse_sampling_step (int): The step size for sparse sampling, default is 1.
+            prediction (str | None): The prediction made by the model, default is None.
+            input_source (str | None): The source of the input data, default is None.
+        """
         if "total" in hits:
             raise ValueError(
                 "'total' is a reserved key and cannot be used as a subsequence"
@@ -29,7 +42,16 @@ class ModelResult:
         self.input_source = input_source
     def get_scores(self) -> dict:
-        """Return the scores of the model."""
+        """
+        Return the scores of the model.
+        The scores are calculated as the number of hits divided by the total number of k-mers
+        for each subsequence and label. The scores are rounded to two decimal places.
+        Returns:
+            dict: A dictionary where keys are subsequence names and values are dictionaries
+                with labels as keys and scores as values. Also includes a 'total' key for overall scores.
+        """
         scores = {
             subsequence: {
                 label: round(hits / self.num_kmers[subsequence], 2)
@@ -50,20 +72,37 @@ class ModelResult:
         return scores
     def get_total_hits(self) -> dict[str, int]:
-        """Return the total hits of the model."""
+        """
+        Return the total hits of the model.
+        The total hits are calculated by summing the hits for each label across all subsequences.
+        Returns:
+            dict: A dictionary where keys are labels and values are the total number of hits for that label.
+        """
         total_hits = {label: 0 for label in list(self.hits.values())[0]}
-        for _, subseuqence_hits in self.hits.items():
-            for label, hits in subseuqence_hits.items():
+        for _, subsequence_hits in self.hits.items():
+            for label, hits in subsequence_hits.items():
                 total_hits[label] += hits
         return total_hits
     def get_filter_mask(self, label: str, filter_threshold: float) -> dict[str, bool]:
-        """Return a mask for filtered subsequences.
+        """
+        Return a mask for filtered subsequences.
         The mask is a dictionary with subsequence names as keys and boolean values
         indicating whether the subsequence is above the filter threshold for the given label.
         A value of -1 for filter_threshold indicates that the subsequence with the maximum score
         for the given label should be returned.
+        Args:
+            label (str): The label for which to filter the subsequences.
+            filter_threshold (float): The threshold for filtering subsequences. Must be between 0 and 1,
+                or -1 to return the subsequence with the maximum score for the label.
+        Returns:
+            dict[str, bool]: A dictionary where keys are subsequence names and values are booleans
+                indicating whether the subsequence meets the filter criteria for the given label.
         """
         if filter_threshold < 0 and not filter_threshold == -1 or filter_threshold > 1:
             raise ValueError("The filter threshold must be between 0 and 1.")
@@ -84,7 +123,19 @@ class ModelResult:
     def get_filtered_subsequence_labels(
         self, label: str, filter_threshold: float = 0.7
     ) -> list[str]:
-        """Return the labels of filtered subsequences."""
+        """
+        Return the labels of filtered subsequences.
+        This method filters subsequences based on the scores for a given label and a filter threshold.
+        Args:
+            label (str): The label for which to filter the subsequences.
+            filter_threshold (float): The threshold for filtering subsequences. Must be between 0 and 1,
+                or -1 to return the subsequence with the maximum score for the label.
+        Returns:
+            list[str]: A list of subsequence names that meet the filter criteria for the given label.
+        """
         return [
             subsequence
             for subsequence, mask in self.get_filter_mask(
@@ -94,7 +145,15 @@ class ModelResult:
         ]
     def to_dict(self) -> dict:
-        """Return the result as a dictionary."""
+        """
+        Return the result as a dictionary.
+        This method converts the ModelResult object into a dictionary format suitable for serialization.
+        Returns:
+            dict: A dictionary representation of the ModelResult object, including model slug,
+            sparse sampling step, hits, scores, number of k-mers, input source, and prediction if available.
+        """
         res = {
             "model_slug": self.model_slug,
             "sparse_sampling_step": self.sparse_sampling_step,
@@ -110,6 +169,14 @@ class ModelResult:
         return res
     def save(self, path: Path) -> None:
-        """Save the result as a JSON file."""
+        """
+        Save the result as a JSON file.
+        This method serializes the ModelResult object to a JSON file at the specified path.
+        Args:
+            path (Path): The path where the JSON file will be saved.
+        """
+        path.parent.mkdir(exist_ok=True, parents=True)
         with open(path, "w", encoding="utf-8") as f:
             f.write(dumps(self.to_dict(), indent=4))

xspect/ncbi.py CHANGED Viewed

@@ -3,6 +3,7 @@
 from enum import Enum
 from pathlib import Path
 import time
+from loguru import logger
 import requests
 # pylint: disable=line-too-long
@@ -26,26 +27,35 @@ class AssemblySource(Enum):
 class NCBIHandler:
-    """This class uses the NCBI Datasets API to get the taxonomy tree of a given Taxon.
+    """
+    This class uses the NCBI Datasets API to get data about taxa and their assemblies.
-    The taxonomy tree consists of only the next children to the parent taxon.
-    The children are only of the next lower rank of the parent taxon.
+    It provides methods to get taxon IDs, species, names, accessions, and download assemblies.
+    It also enforces rate limiting to comply with NCBI's API usage policies.
     """
     def __init__(
         self,
-        api_key: str = None,
+        api_key: str | None = None,
     ):
-        """Initialise the NCBI handler."""
+        """
+        Initialise the NCBI handler.
+        This method sets up the base URL for the NCBI Datasets API and initializes the rate limiting parameters.
+        Args:
+            api_key (str | None): The NCBI API key. If None, the handler will use the public API without an API key.
+        """
         self.api_key = api_key
         self.base_url = "https://api.ncbi.nlm.nih.gov/datasets/v2"
-        self.last_request_time = 0
+        self.last_request_time = 0.0
         self.min_interval = (
             1 / 10 if api_key else 1 / 5
         )  # NCBI allows 10 requests per second with if an API key, otherwise 5 requests per second
-    def _enforce_rate_limit(self):
-        """Enforce rate limiting for the NCBI Datasets API.
+    def _enforce_rate_limit(self) -> None:
+        """
+        Enforce rate limiting for the NCBI Datasets API.
         This method ensures that the requests to the API are limited to 5 requests per second
         without an API key and 10 requests per second with an API key.
@@ -58,7 +68,11 @@ class NCBIHandler:
         self.last_request_time = now
     def _make_request(self, endpoint: str, timeout: int = 15) -> dict:
-        """Make a request to the NCBI Datasets API.
+        """
+        Make a request to the NCBI Datasets API.
+        This method constructs the full URL for the API endpoint, adds the necessary headers (including the API key if provided),
+        and makes a GET request to the API. It also enforces rate limiting before making the request.
         Args:
             endpoint (str): The endpoint to make the request to.
@@ -229,7 +243,7 @@ class NCBIHandler:
                 == "OK"
             ]
         except (IndexError, KeyError, TypeError):
-            print(
+            logger.debug(
                 f"Could not get {assembly_level.value} accessions for taxon with ID: {taxon_id}. Skipping."
             )
             return []
@@ -238,7 +252,20 @@ class NCBIHandler:
     def get_highest_quality_accessions(
         self, taxon_id: int, assembly_source: AssemblySource, count: int
     ) -> list[str]:
-        """Get the highest quality accessions for a given taxon id (based on the assembly level)."""
+        """
+        Get the highest quality accessions for a given taxon id (based on the assembly level).
+        This function iterates through the assembly levels in order of quality and retrieves accessions
+        until the specified count is reached. It ensures that the accessions are unique and sorted by quality.
+        Args:
+            taxon_id (int): The taxon id to get the accessions for.
+            assembly_source (AssemblySource): The assembly source to get the accessions for.
+            count (int): The number of accessions to get.
+        Returns:
+            list[str]: A list containing the highest quality accessions.
+        """
         accessions = []
         for assembly_level in list(AssemblyLevel):
             accessions += self.get_accessions(
@@ -252,7 +279,16 @@ class NCBIHandler:
         return list(set(accessions))[:count]  # Remove duplicates and limit to count
     def download_assemblies(self, accessions: list[str], output_dir: Path) -> None:
-        """Download assemblies for a list of accessions."""
+        """
+        Download assemblies for a list of accessions.
+        This function makes a request to the NCBI Datasets API to download the assemblies for the given accessions.
+        It saves the downloaded assemblies as a zip file in the specified output directory.
+        Args:
+            accessions (list[str]): A list of accessions to download.
+            output_dir (Path): The directory where the downloaded assemblies will be saved.
+        """
         endpoint = f"/genome/accession/{','.join(accessions)}/download?include_annotation_type=GENOME_FASTA"
         self._enforce_rate_limit()

xspect/train.py CHANGED Viewed

@@ -25,12 +25,12 @@ def train_from_directory(
     display_name: str,
     dir_path: Path,
     meta: bool = False,
-    training_accessions: dict[str, list[str]] = None,
-    svm_accessions: list[str] = None,
+    training_accessions: dict[str, list[str]] | None = None,
+    svm_accessions: dict[str, list[str]] | None = None,
     svm_step: int = 1,
-    translation_dict: dict[str, str] = None,
-    author: str = None,
-    author_email: str = None,
+    translation_dict: dict[str, str] | None = None,
+    author: str | None = None,
+    author_email: str | None = None,
 ):
     """
     Train a model from a directory containing training data.
@@ -113,10 +113,11 @@ def train_from_directory(
         species_dir = tmp_dir / "species"
         species_dir.mkdir(parents=True, exist_ok=True)
-        # concatenate files in cobs_training_data for each species
+        logger.info("Concatenating genomes for species training...")
         concatenate_species_fasta_files(cobs_folders, species_dir)
         if svm_path.exists():
+            logger.info("Training species SVM model...")
             species_model = ProbabilisticFilterSVMModel(
                 k=21,
                 model_display_name=display_name,
@@ -136,6 +137,7 @@ def train_from_directory(
                 svm_accessions=svm_accessions,
             )
         else:
+            logger.info("Training species model...")
             species_model = ProbabilisticFilterModel(
                 k=21,
                 model_display_name=display_name,
@@ -153,9 +155,11 @@ def train_from_directory(
         species_model.save()
         if meta:
+            logger.info("Concatenating genomes for metagenome training...")
             meta_fasta = tmp_dir / f"{display_name}.fasta"
             concatenate_metagenome(species_dir, meta_fasta)
+            logger.info("Training metagenome model...")
             genus_model = ProbabilisticSingleFilterModel(
                 k=21,
                 model_display_name=display_name,
@@ -179,10 +183,12 @@ def train_from_directory(
 def train_from_ncbi(
     genus: str,
     svm_step: int = 1,
-    author: str = None,
-    author_email: str = None,
+    author: str | None = None,
+    author_email: str | None = None,
+    ncbi_api_key: str | None = None,
 ):
-    """Train a model using NCBI assembly data for a given genus.
+    """
+    Train a model using NCBI assembly data for a given genus.
     This function trains a probabilistic filter model using the assembly data from NCBI.
     The training data is downloaded and processed, and the model is saved to the
@@ -193,6 +199,7 @@ def train_from_ncbi(
         svm_step (int, optional): Step size for SVM training. Defaults to 1.
         author (str, optional): Author of the model. Defaults to None.
         author_email (str, optional): Author's email. Defaults to None.
+        ncbi_api_key (str, optional): NCBI API key for accessing NCBI resources. Defaults to None.
     Raises:
         TypeError: If `genus` is not a string.
@@ -205,7 +212,8 @@ def train_from_ncbi(
     if not isinstance(genus, str):
         raise TypeError("genus must be a string")
-    ncbi_handler = NCBIHandler()
+    logger.info("Getting NCBI metadata...")
+    ncbi_handler = NCBIHandler(api_key=ncbi_api_key)
     genus_tax_id = ncbi_handler.get_genus_taxon_id(genus)
     species_ids = ncbi_handler.get_species(genus_tax_id)
     species_names = ncbi_handler.get_taxon_names(species_ids)
@@ -243,7 +251,7 @@ def train_from_ncbi(
         cobs_dir.mkdir(parents=True, exist_ok=True)
         svm_dir.mkdir(parents=True, exist_ok=True)
-        # download assemblies
+        logger.info("Downloading genomes from NCBI...")
         all_accessions = sum(accessions.values(), [])
         batch_size = 100
         accession_paths = {}

XspecT 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

Potentially problematic release.

XspecT 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl