PyPI - XspecT - Versions diffs - 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

XspecT 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of XspecT might be problematic. Click here for more details.

Files changed (33) hide show

xspect/classify.py +51 -38
xspect/definitions.py +50 -10
xspect/download_models.py +10 -2
xspect/file_io.py +115 -48
xspect/filter_sequences.py +36 -66
xspect/main.py +41 -10
xspect/mlst_feature/mlst_helper.py +3 -0
xspect/mlst_feature/pub_mlst_handler.py +43 -1
xspect/model_management.py +84 -14
xspect/models/probabilistic_filter_mlst_model.py +75 -37
xspect/models/probabilistic_filter_model.py +194 -12
xspect/models/probabilistic_filter_svm_model.py +99 -6
xspect/models/probabilistic_single_filter_model.py +66 -5
xspect/models/result.py +77 -10
xspect/ncbi.py +45 -10
xspect/train.py +2 -1
xspect/web.py +68 -12
xspect/xspect-web/dist/assets/index-Ceo58xui.css +1 -0
xspect/xspect-web/dist/assets/{index-CMG4V7fZ.js → index-Dt_UlbgE.js} +82 -77
xspect/xspect-web/dist/index.html +2 -2
xspect/xspect-web/src/App.tsx +4 -2
xspect/xspect-web/src/api.tsx +23 -1
xspect/xspect-web/src/components/filter-form.tsx +16 -3
xspect/xspect-web/src/components/filtering-result.tsx +65 -0
xspect/xspect-web/src/components/result.tsx +2 -2
xspect/xspect-web/src/types.tsx +5 -0
{xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/METADATA +1 -1
{xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/RECORD +32 -31
{xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/WHEEL +1 -1
xspect/xspect-web/dist/assets/index-jIKg1HIy.css +0 -1
{xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/entry_points.txt +0 -0
{xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/licenses/LICENSE +0 -0
{xspect-0.5.1.dist-info → xspect-0.5.2.dist-info}/top_level.txt +0 -0

xspect/models/probabilistic_filter_model.py CHANGED Viewed

@@ -3,6 +3,7 @@
 import json
 from math import ceil
 from pathlib import Path
+from typing import Any
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 from Bio import SeqIO
@@ -28,6 +29,25 @@ class ProbabilisticFilterModel:
         num_hashes: int = 7,
         training_accessions: dict[str, list[str]] | None = None,
     ) -> None:
+        """
+        Initializes the probabilistic filter model.
+        This method sets up the model with the specified parameters, including the k-mer size,
+        display name, author information, model type, base path for storage, false positive rate,
+        number of hashes, and training accessions.
+        Args:
+            k (int): The size of the k-mers to be used in the model.
+            model_display_name (str): The display name of the model.
+            author (str | None): The name of the author of the model.
+            author_email (str | None): The email of the author of the model.
+            model_type (str): The type of the model.
+            base_path (Path): The base path where the model will be stored.
+            fpr (float): The false positive rate for the model. Default is 0.01.
+            num_hashes (int): The number of hashes to use in the model. Default is 7.
+            training_accessions (dict[str, list[str]] | None): A dictionary mapping filter IDs to
+                lists of accession numbers used for training the model. Default is None.
+        """
         if k < 1:
             raise ValueError("Invalid k value, must be greater than 0")
         if not model_display_name:
@@ -50,11 +70,27 @@ class ProbabilisticFilterModel:
         self.training_accessions = training_accessions
     def get_cobs_index_path(self) -> str:
-        """Returns the path to the cobs index"""
+        """
+        Returns the path to the cobs index
+        This method constructs the path where the cobs index file will be stored,
+        based on the model's slug and the base path.
+        Returns:
+            str: The path to the cobs index file.
+        """
         return str(self.base_path / self.slug() / "index.cobs_classic")
     def to_dict(self) -> dict:
-        """Returns a dictionary representation of the model"""
+        """
+        Returns a dictionary representation of the model
+        This method includes all relevant attributes of the model, such as k-mer size,
+        display name, author information, model type, and other parameters.
+        Returns:
+            dict: A dictionary containing the model's attributes.
+        """
         return {
             "model_slug": self.slug(),
             "k": self.k,
@@ -70,7 +106,15 @@ class ProbabilisticFilterModel:
         }
     def slug(self) -> str:
-        """Returns a slug representation of the model"""
+        """
+        Returns a slug representation of the model
+        This method generates a slug based on the model's display name and type,
+        which can be used for file naming or identification purposes.
+        Returns:
+            str: A slug representation of the model.
+        """
         return slugify(self.model_display_name + "-" + str(self.model_type))
     def fit(
@@ -79,7 +123,23 @@ class ProbabilisticFilterModel:
         display_names: dict | None = None,
         training_accessions: dict[str, list[str]] | None = None,
     ) -> None:
-        """Adds filters to the model"""
+        """
+        Adds filters to the model
+        This method constructs the model's index from sequence files in the specified directory.
+        It reads files with specified extensions (fasta and fastq), constructs a document list,
+        and builds a cobs index for efficient searching.
+        Args:
+            dir_path (Path): The directory containing sequence files to be indexed.
+            display_names (dict | None): A dictionary mapping file names to display names.
+                If None, uses file names as display names.
+            training_accessions (dict[str, list[str]] | None): A dictionary mapping filter IDs to
+                lists of accession numbers used for training the model. If None, no training accessions
+                are set.
+        Raises:
+            ValueError: If the directory path is invalid, does not exist, or is not a directory.
+        """
         if display_names is None:
             display_names = {}
@@ -125,8 +185,26 @@ class ProbabilisticFilterModel:
     def calculate_hits(
         self, sequence: Seq, filter_ids: list[str] | None = None, step: int = 1
     ) -> dict:
-        """Calculates the hits for a sequence"""
+        """
+        Calculates the hits for a sequence
+        This method searches the model's index for the given sequence and returns a dictionary
+        of filter IDs and their corresponding scores. If filter_ids is provided, it filters the
+        results to only include those IDs.
+        Args:
+            sequence (Seq): The sequence to search for in the model's index.
+            filter_ids (list[str] | None): A list of filter IDs to filter the results. If None,
+                all results are returned.
+            step (int): The step size for the k-mer search. Default is 1.
+        Returns:
+            dict: A dictionary where keys are filter IDs and values are scores for the sequence.
+        Raises:
+            ValueError: If the sequence is not a Bio.Seq object,
+                        if the sequence length is not greater than k, or if the input is invalid.
+        """
         if not isinstance(sequence, (Seq)):
             raise ValueError(
                 "Invalid sequence, must be a Bio.Seq or a Bio.SeqRecord object"
@@ -153,7 +231,30 @@ class ProbabilisticFilterModel:
         filter_ids: list[str] = None,
         step: int = 1,
     ) -> ModelResult:
-        """Returns scores for the sequence(s) based on the filters in the model"""
+        """
+        Returns a model result object for the sequence(s) based on the filters in the model
+        This method processes the input sequence(s) and calculates hits against the model's index.
+        It supports various input types, including single sequences, lists of sequences,
+        SeqIO iterators, and file paths. The results are returned as a ModelResult object.
+        Args:
+            sequence_input (SeqRecord | list[SeqRecord] | SeqIO.FastaIO.FastaIterator |
+                            SeqIO.QualityIO.FastqPhredIterator | Path):
+                The input sequence(s) to be processed. Can be a single SeqRecord, a list of
+                SeqRecords, a SeqIO iterator, or a Path to a fasta/fastq file.
+            filter_ids (list[str]): A list of filter IDs to filter the results. If None,
+                all results are returned.
+            step (int): The step size for the k-mer search. Default is 1.
+        Returns:
+            ModelResult: An object containing the hits for each sequence, the number of kmers,
+                         and the sparse sampling step.
+        Raises:
+            ValueError: If the input sequence is not valid, or if it is not a SeqRecord,
+                        a list of SeqRecords, a SeqIO iterator, or a Path object to a fasta/fastq file.
+        """
         if isinstance(sequence_input, (SeqRecord)):
             return ProbabilisticFilterModel.predict(
                 self, [sequence_input], filter_ids, step=step
@@ -186,7 +287,14 @@ class ProbabilisticFilterModel:
         )
     def save(self) -> None:
-        """Saves the model to disk"""
+        """
+        Saves the model to disk
+        This method serializes the model's attributes to a JSON file and creates a directory
+        for the model based on its slug. The JSON file contains all relevant information about
+        the model, including k-mer size, display name, author information, model type, and
+        other parameters. The directory structure is created if it does not already exist.
+        """
         json_path = self.base_path / f"{self.slug()}.json"
         filter_path = self.base_path / self.slug()
         filter_path.mkdir(exist_ok=True, parents=True)
@@ -198,7 +306,23 @@ class ProbabilisticFilterModel:
     @staticmethod
     def load(path: Path) -> "ProbabilisticFilterModel":
-        """Loads the model from a file"""
+        """
+        Loads the model from a file
+        This static method reads a JSON file containing the model's attributes and constructs
+        a ProbabilisticFilterModel object. It also checks for the existence of the cobs index file
+        and initializes the index if it exists.
+        Args:
+            path (Path): The path to the JSON file containing the model's attributes.
+        Returns:
+            ProbabilisticFilterModel: An instance of the ProbabilisticFilterModel class
+            initialized with the attributes from the JSON file.
+        Raises:
+            FileNotFoundError: If the JSON file or the cobs index file does not exist.
+        """
         with open(path, "r", encoding="utf-8") as file:
             json_object = file.read()
             model_json = json.loads(json_object)
@@ -223,6 +347,18 @@ class ProbabilisticFilterModel:
             return model
     def _convert_cobs_result_to_dict(self, cobs_result: cobs.SearchResult) -> dict:
+        """
+        Converts a cobs SearchResult to a dictionary
+        This method takes a cobs SearchResult object and converts it into a dictionary
+        where the keys are document names and the values are their corresponding scores.
+        Args:
+            cobs_result (cobs.SearchResult): The result object from a cobs search.
+        Returns:
+            dict: A dictionary mapping document names to their scores.
+        """
         return {
             individual_result.doc_name: individual_result.score
             for individual_result in cobs_result
@@ -239,7 +375,27 @@ class ProbabilisticFilterModel:
         ),
         step: int = 1,
     ) -> int:
-        """Counts the number of kmers in the sequence(s)"""
+        """
+        Counts the number of kmers in the sequence(s)
+        This method calculates the number of k-mers in a given sequence or list of sequences.
+        It supports various input types, including single sequences, SeqRecords, lists of sequences,
+        and SeqIO iterators. The step size for the k-mer search can be specified.
+        Args:
+            sequence_input (Seq | SeqRecord | list[Seq] | SeqIO.FastaIO.FastaIterator |
+                            SeqIO.QualityIO.FastqPhredIterator):
+                The input sequence(s) to count k-mers in. Can be a single Seq, a SeqRecord,
+                a list of Seq objects, or a SeqIO iterator.
+            step (int): The step size for the k-mer search. Default is 1.
+        Returns:
+            int: The total number of k-mers in the input sequence(s).
+        Raises:
+            ValueError: If the input sequence is not valid, or if it is not a Seq object,
+                        a SeqRecord, a list of Seq objects, or a SeqIO iterator.
+        """
         if isinstance(sequence_input, Seq):
             return self._count_kmers([sequence_input], step=step)
@@ -268,12 +424,38 @@ class ProbabilisticFilterModel:
             " SeqIO FastaIterator, or a SeqIO FastqPhredIterator"
         )
-    def _is_sequence_list(self, sequence_input):
+    def _is_sequence_list(self, sequence_input: Any) -> bool:
+        """
+        Checks if the input is a list of SeqRecord objects
+        This method verifies if the input is a list and that all elements in the list
+        are instances of SeqRecord. This is useful for ensuring that the input is a valid
+        collection of sequence records.
+        Args:
+            sequence_input (Any): The input to check.
+        Returns:
+            bool: True if the input is a list of SeqRecord objects, False otherwise.
+        """
         return isinstance(sequence_input, list) and all(
             isinstance(seq, (SeqRecord)) for seq in sequence_input
         )
-    def _is_sequence_iterator(self, sequence_input):
+    def _is_sequence_iterator(self, sequence_input: Any) -> bool:
+        """
+        Checks if the input is a SeqIO iterator
+        This method verifies if the input is an instance of a SeqIO iterator, such as
+        FastaIterator or FastqPhredIterator. This is useful for ensuring that the input
+        is a valid sequence iterator that can be processed by the model.
+        Args:
+            sequence_input (Any): The input to check.
+        Returns:
+            bool: True if the input is a SeqIO iterator, False otherwise.
+        """
         return isinstance(
             sequence_input,
             (SeqIO.FastaIO.FastaIterator, SeqIO.QualityIO.FastqPhredIterator),

xspect/models/probabilistic_filter_svm_model.py CHANGED Viewed

@@ -15,7 +15,13 @@ from xspect.models.result import ModelResult
 class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
-    """Probabilistic filter SVM model for sequence data"""
+    """
+    Probabilistic filter SVM model for sequence data
+    In addition to the standard probabilistic filter model, this model uses an SVM to predict
+    labels based on their scores and training data. It requires the `scikit-learn` library
+    to be installed.
+    """
     def __init__(
         self,
@@ -32,6 +38,28 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         training_accessions: dict[str, list[str]] | None = None,
         svm_accessions: dict[str, list[str]] | None = None,
     ) -> None:
+        """
+        Initialize the SVM model with the given parameters.
+        In addition to the standard parameters, this model uses an SVM.
+        Therefore, it requires the `kernel` and `C` parameters to be set.
+        Furthermore, the `svm_accessions` parameter is used to store which accessions
+        are used for training the SVM.
+        Args:
+            k (int): The k-mer size for the probabilistic filter.
+            model_display_name (str): The display name of the model.
+            author (str | None): The author of the model.
+            author_email (str | None): The author's email address.
+            model_type (str): The type of the model.
+            base_path (Path): The base path where the model will be stored.
+            kernel (str): The kernel type for the SVM (e.g., 'linear', 'rbf').
+            c (float): Regularization parameter for the SVM.
+            fpr (float, optional): False positive rate for the probabilistic filter. Defaults to 0.01.
+            num_hashes (int, optional): Number of hashes for the probabilistic filter. Defaults to 7.
+            training_accessions (dict[str, list[str]] | None, optional): Accessions used for training the probabilistic filter. Defaults to None.
+            svm_accessions (dict[str, list[str]] | None, optional): Accessions used for training the SVM. Defaults to None.
+        """
         super().__init__(
             k=k,
             model_display_name=model_display_name,
@@ -48,6 +76,12 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         self.svm_accessions = svm_accessions
     def to_dict(self) -> dict:
+        """
+        Convert the model to a dictionary representation
+        Returns:
+            dict: A dictionary containing the model's parameters and state.
+        """
         return super().to_dict() | {
             "kernel": self.kernel,
             "C": self.c,
@@ -55,7 +89,13 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         }
     def set_svm_params(self, kernel: str, c: float) -> None:
-        """Set the parameters for the SVM"""
+        """
+        Set the parameters for the SVM
+        Args:
+            kernel (str): The kernel type for the SVM (e.g., 'linear', 'rbf').
+            c (float): Regularization parameter for the SVM.
+        """
         self.kernel = kernel
         self.c = c
         self.save()
@@ -69,7 +109,22 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         training_accessions: dict[str, list[str]] | None = None,
         svm_accessions: dict[str, list[str]] | None = None,
     ) -> None:
-        """Fit the SVM to the sequences and labels"""
+        """
+        Fit the SVM to the sequences and labels.
+        This method first trains the probabilistic filter model and then
+        calculates scores for the SVM training. It expects the sequences to be in
+        the specified directory and the SVM training sequences to be in the
+        specified SVM path. The scores are saved in a CSV file for later use.
+        Args:
+            dir_path (Path): The directory containing the training sequences.
+            svm_path (Path): The directory containing the SVM training sequences.
+            display_names (dict[str, str] | None): A mapping of accession IDs to display names.
+            svm_step (int): Step size for sparse sampling in SVM training.
+            training_accessions (dict[str, list[str]] | None): Accessions used for training the probabilistic filter.
+            svm_accessions (dict[str, list[str]] | None): Accessions used for training the SVM.
+        """
         # Since the SVM works with score data, we need to train
         # the underlying data structure for score generation first
@@ -124,7 +179,21 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         filter_ids: list[str] = None,
         step: int = 1,
     ) -> ModelResult:
-        """Predict the labels of the sequences"""
+        """
+        Predict the labels of the sequences.
+        This method uses the SVM to predict labels based on the scores generated
+        from the sequences. It expects the sequences to be in a format compatible
+        with the probabilistic filter model, and it will return a `ModelResult`.
+        Args:
+            sequence_input (SeqRecord | list[SeqRecord] | SeqIO.FastaIO.FastaIterator | SeqIO.QualityIO.FastqPhredIterator | Path): The input sequences to predict.
+            filter_ids (list[str], optional): A list of IDs to filter the predictions. Defaults to None.
+            step (int, optional): Step size for sparse sampling. Defaults to 1.
+        Returns:
+            ModelResult: The result of the prediction containing hits, number of kmers, and the predicted label.
+        """
         # get scores and format them for the SVM
         res = super().predict(sequence_input, filter_ids, step=step)
         svm_scores = dict(sorted(res.get_scores()["total"].items()))
@@ -140,7 +209,19 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         )
     def _get_svm(self, id_keys) -> SVC:
-        """Get the SVM for the given id keys"""
+        """
+        Get the SVM for the given id keys.
+        This method loads the SVM model from the scores CSV file and trains it
+        using the scores from the CSV. If `id_keys` is provided, it filters the
+        training data to only include those keys.
+        Args:
+            id_keys (list[str] | None): A list of IDs to filter the training data. If None, all data is used.
+        Returns:
+            SVC: The trained SVM model.
+        """
         svm = SVC(kernel=self.kernel, C=self.c)
         # parse csv
         with open(
@@ -160,7 +241,19 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
     @staticmethod
     def load(path: Path) -> "ProbabilisticFilterSVMModel":
-        """Load the model from disk"""
+        """
+        Load the model from disk
+        Loads the model from the specified path. The path should point to a JSON file
+        containing the model's parameters and state. It also checks for the existence of
+        the COBS index file.
+        Args:
+            path (Path): The path to the model JSON file.
+        Returns:
+            ProbabilisticFilterSVMModel: The loaded model instance.
+        """
         with open(path, "r", encoding="utf-8") as file:
             json_object = file.read()
             model_json = json.loads(json_object)

xspect/models/probabilistic_single_filter_model.py CHANGED Viewed

@@ -14,7 +14,12 @@ from xspect.file_io import get_record_iterator
 class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
-    """Base probabilistic filter model for sequence data"""
+    """
+    Probabilistic filter model for sequence data, with a single filter
+    This model uses a Bloom filter to store k-mers from the training sequences. It is designed to
+    be used with a single filter, which is suitable e.g. for genus-level classification.
+    """
     def __init__(
         self,
@@ -27,6 +32,21 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
         fpr: float = 0.01,
         training_accessions: list[str] | None = None,
     ) -> None:
+        """Initialize probabilistic single filter model.
+        This model uses a Bloom filter to store k-mers from the training sequences. It is designed to
+        be used with a single filter, which is suitable e.g. for genus-level classification.
+        Args:
+            k (int): Length of the k-mers to use for filtering
+            model_display_name (str): Display name of the model
+            author (str | None): Author of the model
+            author_email (str | None): Email of the author
+            model_type (str): Type of the model, e.g. "probabilistic_single_filter"
+            base_path (Path): Base path where the model will be saved
+            fpr (float): False positive rate for the Bloom filter, default is 0.01
+            training_accessions (list[str] | None): List of accessions used for training, default is None
+        """
         super().__init__(
             k=k,
             model_display_name=model_display_name,
@@ -46,7 +66,17 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
         display_name: str,
         training_accessions: list[str] | None = None,
     ) -> None:
-        """Fit the cobs classic index to the sequences and labels"""
+        """
+        Fit the Bloom filter to the sequences.
+        Trains the model by reading sequences from the provided file path,
+        generating k-mers, and adding them to the Bloom filter.
+        Args:
+            file_path (Path): Path to the file containing sequences in FASTA format
+            display_name (str): Display name for the model
+            training_accessions (list[str] | None): List of accessions used for training, default is None
+        """
         self.training_accessions = training_accessions
         # estimate number of kmers
@@ -68,7 +98,18 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
     def calculate_hits(
         self, sequence: Seq | SeqRecord, filter_ids=None, step: int = 1
     ) -> dict:
-        """Calculate the hits for the sequence"""
+        """
+        Calculate the hits for the sequence
+        Calculates the number of k-mers in the sequence that are present in the Bloom filter.
+        Args:
+            sequence (Seq | SeqRecord): Sequence to calculate hits for, can be a Bio.Seq or Bio.SeqRecord object
+            filter_ids (list[str] | None): List of filter IDs to use, default is None
+            step (int): Step size for generating k-mers, default is 1
+        Returns:
+            dict: Dictionary with the display name as key and the number of hits as value
+        """
         if isinstance(sequence, SeqRecord):
             sequence = sequence.seq
@@ -85,7 +126,17 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
     @staticmethod
     def load(path: Path) -> "ProbabilisticSingleFilterModel":
-        """Load the model from disk"""
+        """
+        Load the model from disk
+        This method reads the model's JSON file and the associated Bloom filter file,
+        reconstructing the model instance.
+        Args:
+            path (Path): Path to the model's JSON file
+        Returns:
+            ProbabilisticSingleFilterModel: An instance of the model loaded from disk
+        """
         with open(path, "r", encoding="utf-8") as file:
             json_object = file.read()
             model_json = json.loads(json_object)
@@ -108,7 +159,17 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
             return model
     def _generate_kmers(self, sequence: Seq, step: int = 1):
-        """Generate kmers from the sequence"""
+        """
+        Generate kmers from the sequence
+        Generates k-mers from the sequence, considering both the forward and reverse complement strands.
+        Args:
+            sequence (Seq): Sequence to generate k-mers from
+            step (int): Step size for generating k-mers, default is 1
+        Yields:
+            str: The minimizer k-mer (the lexicographically smallest k-mer between the forward and reverse complement)
+        """
         num_kmers = ceil((len(sequence) - self.k + 1) / step)
         for i in range(num_kmers):
             start_pos = i * step

XspecT 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

Potentially problematic release.

XspecT 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl