PyPI - XspecT - Versions diffs - 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

XspecT 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of XspecT might be problematic. Click here for more details.

Files changed (33) hide show

xspect/classify.py +61 -13
xspect/definitions.py +61 -13
xspect/download_models.py +10 -2
xspect/file_io.py +115 -48
xspect/filter_sequences.py +81 -29
xspect/main.py +90 -39
xspect/mlst_feature/mlst_helper.py +3 -0
xspect/mlst_feature/pub_mlst_handler.py +43 -1
xspect/model_management.py +84 -14
xspect/models/probabilistic_filter_mlst_model.py +75 -37
xspect/models/probabilistic_filter_model.py +201 -19
xspect/models/probabilistic_filter_svm_model.py +106 -13
xspect/models/probabilistic_single_filter_model.py +73 -9
xspect/models/result.py +77 -10
xspect/ncbi.py +48 -12
xspect/train.py +19 -11
xspect/web.py +68 -12
xspect/xspect-web/dist/assets/index-Ceo58xui.css +1 -0
xspect/xspect-web/dist/assets/{index-CMG4V7fZ.js → index-Dt_UlbgE.js} +82 -77
xspect/xspect-web/dist/index.html +2 -2
xspect/xspect-web/src/App.tsx +4 -2
xspect/xspect-web/src/api.tsx +23 -1
xspect/xspect-web/src/components/filter-form.tsx +16 -3
xspect/xspect-web/src/components/filtering-result.tsx +65 -0
xspect/xspect-web/src/components/result.tsx +2 -2
xspect/xspect-web/src/types.tsx +5 -0
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/METADATA +11 -5
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/RECORD +32 -31
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/WHEEL +1 -1
xspect/xspect-web/dist/assets/index-jIKg1HIy.css +0 -1
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/entry_points.txt +0 -0
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/licenses/LICENSE +0 -0
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/top_level.txt +0 -0

xspect/models/probabilistic_filter_mlst_model.py CHANGED Viewed

@@ -12,6 +12,7 @@ from cobs_index import DocumentList
 from collections import defaultdict
 from xspect.file_io import get_record_iterator
 from xspect.mlst_feature.mlst_helper import MlstResult
+from xspect.mlst_feature.pub_mlst_handler import PubMLSTHandler
 class ProbabilisticFilterMlstSchemeModel:
@@ -19,20 +20,22 @@ class ProbabilisticFilterMlstSchemeModel:
     def __init__(
         self,
-        k: int,
-        model_display_name: str,
+        k_value: int,
+        model_name: str,
         base_path: Path,
+        scheme_url: str,
         fpr: float = 0.001,
     ) -> None:
         """Initialise a ProbabilisticFilterMlstSchemeModel object."""
-        if k < 1:
+        if k_value < 1:
             raise ValueError("Invalid k value, must be greater than 0")
         if not isinstance(base_path, Path):
             raise ValueError("Invalid base path, must be a pathlib.Path object")
-        self.k = k
-        self.model_display_name = model_display_name
+        self.k_value = k_value
+        self.model_name = model_name
         self.base_path = base_path / "MLST"
+        self.scheme_url = scheme_url
         self.fpr = fpr
         self.model_type = "Strain"
         self.loci = {}
@@ -49,9 +52,10 @@ class ProbabilisticFilterMlstSchemeModel:
             dict: The dictionary containing all metadata of an object.
         """
         return {
-            "k": self.k,
-            "model_display_name": self.model_display_name,
+            "k_value": self.k_value,
+            "model_name": self.model_name,
             "model_type": self.model_type,
+            "scheme_url": str(self.scheme_url),
             "fpr": self.fpr,
             "scheme_path": str(self.scheme_path),
             "cobs_path": str(self.cobs_path),
@@ -115,7 +119,7 @@ class ProbabilisticFilterMlstSchemeModel:
             # COBS only accepts strings as paths
             doclist = DocumentList(str(locus_path))
             index_params = cobs_index.CompactIndexParameters()
-            index_params.term_size = self.k  # k-mer size
+            index_params.term_size = self.k_value  # k-mer size
             index_params.clobber = True  # overwrite output and temporary files
             index_params.false_positive_rate = self.fpr
@@ -130,9 +134,7 @@ class ProbabilisticFilterMlstSchemeModel:
     def save(self) -> None:
         """Saves the model to disk"""
-        scheme = str(self.scheme_path).split("/")[
-            -1
-        ]  # [-1] -> contains the scheme name
+        scheme = str(self.scheme_path).split("/")[-1]  # [-1] contains the scheme name
         json_path = self.base_path / scheme / f"{scheme}.json"
         json_object = json.dumps(self.to_dict(), indent=4)
@@ -156,9 +158,10 @@ class ProbabilisticFilterMlstSchemeModel:
             json_object = file.read()
             model_json = json.loads(json_object)
             model = ProbabilisticFilterMlstSchemeModel(
-                model_json["k"],
-                model_json["model_display_name"],
+                model_json["k_value"],
+                model_json["model_name"],
                 json_path.parent,
+                model_json["scheme_url"],
                 model_json["fpr"],
             )
             model.scheme_path = model_json["scheme_path"]
@@ -175,7 +178,12 @@ class ProbabilisticFilterMlstSchemeModel:
             return model
     def calculate_hits(
-        self, cobs_path: Path, sequence: Seq, step: int = 1
+        self,
+        cobs_path: Path,
+        sequence: Seq,
+        step: int = 1,
+        limit: bool = False,
+        limit_number: int = 5,
     ) -> list[dict]:
         """
         Calculates the hits for a sequence.
@@ -189,6 +197,8 @@ class ProbabilisticFilterMlstSchemeModel:
             cobs_path (Path): The path of the COBS-structure directory.
             sequence (Seq): The input sequence for classification.
             step (int, optional): The amount of kmers that are passed; defaults to one.
+            limit (bool): Applying a filter that limits the best result.
+            limit_number (int): The amount of results when the filter is set to true.
         Returns:
             list[dict]: The results of the prediction.
@@ -201,7 +211,7 @@ class ProbabilisticFilterMlstSchemeModel:
         if not isinstance(sequence, Seq):
             raise ValueError("Invalid sequence, must be a Bio.Seq object")
-        if not len(sequence) > self.k:
+        if not len(sequence) > self.k_value:
             raise ValueError("Invalid sequence, must be longer than k")
         if not self.indices:
@@ -239,6 +249,10 @@ class ProbabilisticFilterMlstSchemeModel:
                 sorted_counts = dict(
                     sorted(all_counts.items(), key=lambda item: -item[1])
                 )
+                if limit:
+                    sorted_counts = dict(list(sorted_counts.items())[:limit_number])
                 if not sorted_counts:
                     result_dict = "A Strain type could not be detected because of no kmer matches!"
                     highest_results[scheme_path_list[counter]] = {"N/A": 0}
@@ -250,25 +264,37 @@ class ProbabilisticFilterMlstSchemeModel:
                         first_key: highest_result
                     }
                 counter += 1
-        else:
+        else:  # No split procedure is needed, when the sequence is short
             for index in self.indices:
-                res = index.search(
+                res = index.search(  # COBS can't handle Seq-Objects
                     str(sequence), step=step
-                )  # COBS can't handle Seq-Objects
-                result_dict[scheme_path_list[counter]] = self.get_cobs_result(
-                    res, False
                 )
-                first_key, highest_result = next(
-                    iter(result_dict[scheme_path_list[counter]].items())
+                result = self.get_cobs_result(res, False)
+                result = (
+                    dict(sorted(result.items(), key=lambda x: -x[1])[:limit_number])
+                    if limit
+                    else result
                 )
+                result_dict[scheme_path_list[counter]] = result
+                first_key, highest_result = next(iter(result.items()))
                 highest_results[scheme_path_list[counter]] = {first_key: highest_result}
                 counter += 1
         # check if the strain type has sufficient amount of kmer hits
         is_valid = self.has_sufficient_score(highest_results, self.avg_locus_bp_size)
         if not is_valid:
             highest_results["Attention:"] = (
                 "This strain type is not reliable due to low kmer hit rates!"
             )
+        else:
+            handler = PubMLSTHandler()
+            # allele_id is of type dict
+            flattened = {
+                locus: int(list(allele_id.keys())[0].split("_")[-1])
+                for locus, allele_id in highest_results.items()
+            }
+            strain_type_name = handler.get_strain_type_name(flattened, self.scheme_url)
+            highest_results["ST_Name"] = strain_type_name
         return [{"Strain type": highest_results}, {"All results": result_dict}]
     def predict(
@@ -282,6 +308,7 @@ class ProbabilisticFilterMlstSchemeModel:
             | Path
         ),
         step: int = 1,
+        limit: bool = False,
     ) -> MlstResult:
         """
         Get scores for the sequence(s) based on the filters in the model.
@@ -290,6 +317,7 @@ class ProbabilisticFilterMlstSchemeModel:
             cobs_path (Path): The path of the COBS-structure directory.
             sequence_input (Seq): The input sequence for classification
             step (int, optional): The amount of kmers that are passed; defaults to one
+            limit (bool, optional): Applying a filter that limits the best result.
         Returns:
             MlstResult: The results of the prediction.
@@ -301,13 +329,19 @@ class ProbabilisticFilterMlstSchemeModel:
             if sequence_input.id == "<unknown id>":
                 sequence_input.id = "test"
             hits = {
-                sequence_input.id: self.calculate_hits(cobs_path, sequence_input.seq)
+                sequence_input.id: self.calculate_hits(
+                    cobs_path, sequence_input.seq, step, limit
+                )
             }
-            return MlstResult(self.model_display_name, step, hits)
+            return MlstResult(self.model_name, step, hits, None)
         if isinstance(sequence_input, Path):
             return ProbabilisticFilterMlstSchemeModel.predict(
-                self, cobs_path, get_record_iterator(sequence_input), step=step
+                self,
+                cobs_path,
+                get_record_iterator(sequence_input),
+                step=step,
+                limit=limit,
             )
         if isinstance(
@@ -317,33 +351,35 @@ class ProbabilisticFilterMlstSchemeModel:
             hits = {}
             # individual_seq is a SeqRecord-Object
             for individual_seq in sequence_input:
-                individual_hits = self.calculate_hits(cobs_path, individual_seq.seq)
+                individual_hits = self.calculate_hits(
+                    cobs_path, individual_seq.seq, step, limit
+                )
                 hits[individual_seq.id] = individual_hits
-            return MlstResult(self.model_display_name, step, hits)
+            return MlstResult(self.model_name, step, hits, None)
         raise ValueError(
             "Invalid sequence input, must be a Seq object, a list of Seq objects, a"
             " SeqIO FastaIterator, or a SeqIO FastqPhredIterator"
         )
     def get_cobs_result(
-        self, cobs_result: cobs_index.SearchResult, kmer_threshold: bool
+        self,
+        cobs_result: cobs_index.SearchResult,
+        kmer_threshold: bool,
     ) -> dict:
         """
         Get every entry in a COBS search result.
         Args:
             cobs_result (SearchResult): The result of the prediction.
-            kmer_threshold (bool): Applying a kmer threshold to mitigate false positives
+            kmer_threshold (bool): Applying a kmer threshold to mitigate false positives.
         Returns:
             dict: A dictionary storing the allele id of locus as key and the score as value.
         """
-        return {
-            individual_result.doc_name: individual_result.score
-            for individual_result in cobs_result
-            if not kmer_threshold or individual_result.score > 50
-        }
+        hits = [
+            result for result in cobs_result if not kmer_threshold or result.score > 50
+        ]
+        return {result.doc_name: result.score for result in hits}
     def sequence_splitter(self, input_sequence: str, allele_len: int) -> list[str]:
         """
@@ -379,13 +415,15 @@ class ProbabilisticFilterMlstSchemeModel:
         while start + substring_length <= sequence_len:
             substring_list.append(input_sequence[start : start + substring_length])
-            start += substring_length - self.k + 1  # To not lose kmers when dividing
+            start += (
+                substring_length - self.k_value + 1
+            )  # To not lose kmers when dividing
         # The remaining string is either appended to the list or added to the last entry.
         if start < len(input_sequence):
             remaining_substring = input_sequence[start:]
             # A substring needs to be at least of size k for COBS.
-            if len(remaining_substring) < self.k:
+            if len(remaining_substring) < self.k_value:
                 substring_list[-1] += remaining_substring
             else:
                 substring_list.append(remaining_substring)

xspect/models/probabilistic_filter_model.py CHANGED Viewed

@@ -3,6 +3,7 @@
 import json
 from math import ceil
 from pathlib import Path
+from typing import Any
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 from Bio import SeqIO
@@ -20,14 +21,33 @@ class ProbabilisticFilterModel:
         self,
         k: int,
         model_display_name: str,
-        author: str,
-        author_email: str,
+        author: str | None,
+        author_email: str | None,
         model_type: str,
         base_path: Path,
         fpr: float = 0.01,
         num_hashes: int = 7,
-        training_accessions: dict[str, list[str]] = None,
+        training_accessions: dict[str, list[str]] | None = None,
     ) -> None:
+        """
+        Initializes the probabilistic filter model.
+        This method sets up the model with the specified parameters, including the k-mer size,
+        display name, author information, model type, base path for storage, false positive rate,
+        number of hashes, and training accessions.
+        Args:
+            k (int): The size of the k-mers to be used in the model.
+            model_display_name (str): The display name of the model.
+            author (str | None): The name of the author of the model.
+            author_email (str | None): The email of the author of the model.
+            model_type (str): The type of the model.
+            base_path (Path): The base path where the model will be stored.
+            fpr (float): The false positive rate for the model. Default is 0.01.
+            num_hashes (int): The number of hashes to use in the model. Default is 7.
+            training_accessions (dict[str, list[str]] | None): A dictionary mapping filter IDs to
+                lists of accession numbers used for training the model. Default is None.
+        """
         if k < 1:
             raise ValueError("Invalid k value, must be greater than 0")
         if not model_display_name:
@@ -49,12 +69,28 @@ class ProbabilisticFilterModel:
         self.index = None
         self.training_accessions = training_accessions
-    def get_cobs_index_path(self) -> Path:
-        """Returns the path to the cobs index"""
+    def get_cobs_index_path(self) -> str:
+        """
+        Returns the path to the cobs inde
+        This method constructs the path where the cobs index file will be stored,
+        based on the model's slug and the base path.
+        Returns:
+            str: The path to the cobs index file.
+        """
         return str(self.base_path / self.slug() / "index.cobs_classic")
     def to_dict(self) -> dict:
-        """Returns a dictionary representation of the model"""
+        """
+        Returns a dictionary representation of the model
+        This method includes all relevant attributes of the model, such as k-mer size,
+        display name, author information, model type, and other parameters.
+        Returns:
+            dict: A dictionary containing the model's attributes.
+        """
         return {
             "model_slug": self.slug(),
             "k": self.k,
@@ -70,16 +106,40 @@ class ProbabilisticFilterModel:
         }
     def slug(self) -> str:
-        """Returns a slug representation of the model"""
+        """
+        Returns a slug representation of the model
+        This method generates a slug based on the model's display name and type,
+        which can be used for file naming or identification purposes.
+        Returns:
+            str: A slug representation of the model.
+        """
         return slugify(self.model_display_name + "-" + str(self.model_type))
     def fit(
         self,
         dir_path: Path,
-        display_names: dict = None,
-        training_accessions: dict[str, list[str]] = None,
+        display_names: dict | None = None,
+        training_accessions: dict[str, list[str]] | None = None,
     ) -> None:
-        """Adds filters to the model"""
+        """
+        Adds filters to the model
+        This method constructs the model's index from sequence files in the specified directory.
+        It reads files with specified extensions (fasta and fastq), constructs a document list,
+        and builds a cobs index for efficient searching.
+        Args:
+            dir_path (Path): The directory containing sequence files to be indexed.
+            display_names (dict | None): A dictionary mapping file names to display names.
+                If None, uses file names as display names.
+            training_accessions (dict[str, list[str]] | None): A dictionary mapping filter IDs to
+                lists of accession numbers used for training the model. If None, no training accessions
+                are set.
+        Raises:
+            ValueError: If the directory path is invalid, does not exist, or is not a directory.
+        """
         if display_names is None:
             display_names = {}
@@ -123,10 +183,28 @@ class ProbabilisticFilterModel:
         self.index = cobs.Search(self.get_cobs_index_path(), True)
     def calculate_hits(
-        self, sequence: Seq, filter_ids: list[str] = None, step: int = 1
+        self, sequence: Seq, filter_ids: list[str] | None = None, step: int = 1
     ) -> dict:
-        """Calculates the hits for a sequence"""
+        """
+        Calculates the hits for a sequence
+        This method searches the model's index for the given sequence and returns a dictionary
+        of filter IDs and their corresponding scores. If filter_ids is provided, it filters the
+        results to only include those IDs.
+        Args:
+            sequence (Seq): The sequence to search for in the model's index.
+            filter_ids (list[str] | None): A list of filter IDs to filter the results. If None,
+                all results are returned.
+            step (int): The step size for the k-mer search. Default is 1.
+        Returns:
+            dict: A dictionary where keys are filter IDs and values are scores for the sequence.
+        Raises:
+            ValueError: If the sequence is not a valid Bio.Seq or Bio.SeqRecord object,
+                        if the sequence length is not greater than k, or if the input is invalid.
+        """
         if not isinstance(sequence, (Seq)):
             raise ValueError(
                 "Invalid sequence, must be a Bio.Seq or a Bio.SeqRecord object"
@@ -153,7 +231,30 @@ class ProbabilisticFilterModel:
         filter_ids: list[str] = None,
         step: int = 1,
     ) -> ModelResult:
-        """Returns scores for the sequence(s) based on the filters in the model"""
+        """
+        Returns a model result object for the sequence(s) based on the filters in the model
+        This method processes the input sequence(s) and calculates hits against the model's index.
+        It supports various input types, including single sequences, lists of sequences,
+        SeqIO iterators, and file paths. The results are returned as a ModelResult object.
+        Args:
+            sequence_input (SeqRecord | list[SeqRecord] | SeqIO.FastaIO.FastaIterator |
+                            SeqIO.QualityIO.FastqPhredIterator | Path):
+                The input sequence(s) to be processed. Can be a single SeqRecord, a list of
+                SeqRecords, a SeqIO iterator, or a Path to a fasta/fastq file.
+            filter_ids (list[str]): A list of filter IDs to filter the results. If None,
+                all results are returned.
+            step (int): The step size for the k-mer search. Default is 1.
+        Returns:
+            ModelResult: An object containing the hits for each sequence, the number of kmers,
+                         and the sparse sampling step.
+        Raises:
+            ValueError: If the input sequence is not valid, or if it is not a Seq object,
+                        a list of Seq objects, a SeqIO iterator, or a Path object to a fasta/fastq file.
+        """
         if isinstance(sequence_input, (SeqRecord)):
             return ProbabilisticFilterModel.predict(
                 self, [sequence_input], filter_ids, step=step
@@ -186,7 +287,14 @@ class ProbabilisticFilterModel:
         )
     def save(self) -> None:
-        """Saves the model to disk"""
+        """
+        Saves the model to disk
+        This method serializes the model's attributes to a JSON file and creates a directory
+        for the model based on its slug. The JSON file contains all relevant information about
+        the model, including k-mer size, display name, author information, model type, and
+        other parameters. The directory structure is created if it does not already exist.
+        """
         json_path = self.base_path / f"{self.slug()}.json"
         filter_path = self.base_path / self.slug()
         filter_path.mkdir(exist_ok=True, parents=True)
@@ -198,7 +306,23 @@ class ProbabilisticFilterModel:
     @staticmethod
     def load(path: Path) -> "ProbabilisticFilterModel":
-        """Loads the model from a file"""
+        """
+        Loads the model from a file
+        This static method reads a JSON file containing the model's attributes and constructs
+        a ProbabilisticFilterModel object. It also checks for the existence of the cobs index file
+        and initializes the index if it exists.
+        Args:
+            path (Path): The path to the JSON file containing the model's attributes.
+        Returns:
+            ProbabilisticFilterModel: An instance of the ProbabilisticFilterModel class
+            initialized with the attributes from the JSON file.
+        Raises:
+            FileNotFoundError: If the JSON file or the cobs index file does not exist.
+        """
         with open(path, "r", encoding="utf-8") as file:
             json_object = file.read()
             model_json = json.loads(json_object)
@@ -223,6 +347,18 @@ class ProbabilisticFilterModel:
             return model
     def _convert_cobs_result_to_dict(self, cobs_result: cobs.SearchResult) -> dict:
+        """
+        Converts a cobs SearchResult to a dictionary
+        This method takes a cobs SearchResult object and converts it into a dictionary
+        where the keys are document names and the values are their corresponding scores.
+        Args:
+            cobs_result (cobs.SearchResult): The result object from a cobs search.
+        Returns:
+            dict: A dictionary mapping document names to their scores.
+        """
         return {
             individual_result.doc_name: individual_result.score
             for individual_result in cobs_result
@@ -239,7 +375,27 @@ class ProbabilisticFilterModel:
         ),
         step: int = 1,
     ) -> int:
-        """Counts the number of kmers in the sequence(s)"""
+        """
+        Counts the number of kmers in the sequence(s)
+        This method calculates the number of k-mers in a given sequence or list of sequences.
+        It supports various input types, including single sequences, SeqRecords, lists of sequences,
+        and SeqIO iterators. The step size for the k-mer search can be specified.
+        Args:
+            sequence_input (Seq | SeqRecord | list[Seq] | SeqIO.FastaIO.FastaIterator |
+                            SeqIO.QualityIO.FastqPhredIterator):
+                The input sequence(s) to count k-mers in. Can be a single Seq, a SeqRecord,
+                a list of Seq objects, or a SeqIO iterator.
+            step (int): The step size for the k-mer search. Default is 1.
+        Returns:
+            int: The total number of k-mers in the input sequence(s).
+        Raises:
+            ValueError: If the input sequence is not valid, or if it is not a Seq object,
+                        a SeqRecord, a list of Seq objects, or a SeqIO iterator.
+        """
         if isinstance(sequence_input, Seq):
             return self._count_kmers([sequence_input], step=step)
@@ -268,12 +424,38 @@ class ProbabilisticFilterModel:
             " SeqIO FastaIterator, or a SeqIO FastqPhredIterator"
         )
-    def _is_sequence_list(self, sequence_input):
+    def _is_sequence_list(self, sequence_input: Any) -> bool:
+        """
+        Checks if the input is a list of SeqRecord objects
+        This method verifies if the input is a list and that all elements in the list
+        are instances of SeqRecord. This is useful for ensuring that the input is a valid
+        collection of sequence records.
+        Args:
+            sequence_input (Any): The input to check.
+        Returns:
+            bool: True if the input is a list of SeqRecord objects, False otherwise.
+        """
         return isinstance(sequence_input, list) and all(
             isinstance(seq, (SeqRecord)) for seq in sequence_input
         )
-    def _is_sequence_iterator(self, sequence_input):
+    def _is_sequence_iterator(self, sequence_input: Any) -> bool:
+        """
+        Checks if the input is a SeqIO iterator
+        This method verifies if the input is an instance of a SeqIO iterator, such as
+        FastaIterator or FastqPhredIterator. This is useful for ensuring that the input
+        is a valid sequence iterator that can be processed by the model.
+        Args:
+            sequence_input (Any): The input to check.
+        Returns:
+            bool: True if the input is a SeqIO iterator, False otherwise.
+        """
         return isinstance(
             sequence_input,
             (SeqIO.FastaIO.FastaIterator, SeqIO.QualityIO.FastqPhredIterator),

XspecT 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

Potentially problematic release.

XspecT 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl