XspecT 0.5.3-py3-none-any.whl → 0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of XspecT might be problematic.
- xspect/classify.py +38 -8
- xspect/definitions.py +30 -10
- xspect/file_io.py +2 -1
- xspect/filter_sequences.py +20 -4
- xspect/main.py +126 -28
- xspect/misclassification_detection/__init__.py +0 -0
- xspect/misclassification_detection/mapping.py +168 -0
- xspect/misclassification_detection/point_pattern_analysis.py +102 -0
- xspect/misclassification_detection/simulate_reads.py +55 -0
- xspect/mlst_feature/mlst_helper.py +15 -19
- xspect/mlst_feature/pub_mlst_handler.py +16 -19
- xspect/model_management.py +14 -17
- xspect/models/probabilistic_filter_mlst_model.py +11 -10
- xspect/models/probabilistic_filter_model.py +142 -8
- xspect/models/probabilistic_filter_svm_model.py +29 -14
- xspect/models/probabilistic_single_filter_model.py +9 -7
- xspect/models/result.py +22 -15
- xspect/ncbi.py +82 -7
- xspect/train.py +21 -4
- xspect/web.py +13 -4
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/METADATA +4 -1
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/RECORD +26 -22
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/WHEEL +0 -0
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/entry_points.txt +0 -0
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/top_level.txt +0 -0
xspect/misclassification_detection/point_pattern_analysis.py
ADDED
@@ -0,0 +1,102 @@
+"""
+Point pattern density analysis tool for the alignment-based misclassification detection.
+
+Notes:
+    Developed by Oemer Cetin as part of a BSc thesis (2025), Goethe University Frankfurt am Main.
+    (An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial Taxon Assignment)
+"""
+
+import numpy
+
+__author__ = "Cetin, Oemer"
+
+
+class PointPatternAnalysis:
+    """Class for all point pattern density analysis procedures."""
+
+    def __init__(self, points: list[int], length: int):
+        """
+        Initialise the class for point pattern analysis.
+
+        This method sets up the required list with data points (sorted) and the length of the reference genome.
+        All required intensity for the statistics is also calculated.
+
+        Args:
+            points (list): The start coordinates of mapped regions on the genome.
+            length (int): The length of the reference genome.
+        """
+        if len(points) < 2:
+            raise ValueError("Need at least 2 points.")
+        self.sorted_points = numpy.sort(numpy.asarray(points, dtype=float))
+        self.n = len(points)
+        self.length = float(length)
+
+    def ripleys_k(self) -> tuple[bool, float, float]:
+        """
+        Calculates the K-function for the given point distribution.
+
+        This method calculates the K-function to describe the point distribution.
+        The result is then compared with what would be expected under a completely random distribution.
+        (Under complete randomness the K-function result is 2*r)
+
+        Returns:
+            tuple: A tuple containing the information whether points are clustered or not.
+        """
+        r = 0.01 * self.length
+        left = 0
+        right = 0
+        total_neighbors = 0
+
+        for i in range(self.n):
+            while self.sorted_points[i] - self.sorted_points[left] > r:
+                left += 1
+            if right < i:
+                right = i
+            while (
+                right + 1 < self.n
+                and self.sorted_points[right + 1] - self.sorted_points[i] <= r
+            ):
+                right += 1
+            total_neighbors += right - left
+        k = (self.length / (self.n * (self.n - 1))) * total_neighbors
+        return (k > 2 * r), k, 2 * r
+
+    def ripleys_k_edge_corrected(self) -> tuple[bool, float, float]:
+        """
+        Calculates the K-function for the given point distribution with an edge correction factor.
+
+        This method calculates the K-function to describe the point distribution.
+        This time an additional factor is multiplied for each data point to account for edge effects.
+        The result is then compared with what would be expected under a completely random distribution.
+        (Under complete randomness the K-function result is 2*r)
+
+        Returns:
+            tuple: A tuple containing the information whether the points are clustered or not.
+        """
+        r = 0.01 * self.length
+        left = 0
+        right = 0
+        total_weighted = 0
+
+        for i in range(self.n):
+            while self.sorted_points[i] - self.sorted_points[left] > r:
+                left += 1
+            if right < i:
+                right = i
+            while (
+                right + 1 < self.n
+                and self.sorted_points[right + 1] - self.sorted_points[i] <= r
+            ):
+                right += 1
+
+            neighbors = right - left
+            if neighbors > 0:
+                a = max(0, self.sorted_points[i] - r)
+                b = min(self.length, self.sorted_points[i] + r)
+                overlap = b - a
+                weight = (2 * r) / overlap if overlap > 0 else 0
+
+                total_weighted += weight * neighbors
+
+        k = (self.length / (self.n * (self.n - 1))) * total_weighted
+        return (bool(k > 2 * r)), float(k), 2 * r
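For orientation, here is a minimal usage sketch of the new class; the import path follows the file list above, and the coordinates are invented for illustration. With length=10_000 the search radius is r = 100 and the CSR expectation is 2*r = 200, so tightly grouped start coordinates drive K far above 200 while evenly spaced ones keep it at 0.

from xspect.misclassification_detection.point_pattern_analysis import (
    PointPatternAnalysis,
)

genome_length = 10_000
clustered = [100, 120, 135, 150, 160, 9_800]  # most points within one r-window
spread = list(range(0, 10_000, 500))  # evenly spaced points, gaps > r

for label, points in (("clustered", clustered), ("spread", spread)):
    is_clustered, k, expectation = PointPatternAnalysis(points, genome_length).ripleys_k()
    # clustered: K ~ 6666.7 > 200 -> True; spread: K = 0 < 200 -> False
    print(f"{label}: K={k:.1f}, 2r={expectation:.1f}, clustered={is_clustered}")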
xspect/misclassification_detection/simulate_reads.py
ADDED
@@ -0,0 +1,55 @@
+"""
+Read simulation for the alignment-based misclassification detection (Used for testing purposes).
+
+Notes:
+    Developed by Oemer Cetin as part of a BSc thesis at Goethe University Frankfurt am Main (2025).
+    (An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial Taxon Assignment)
+"""
+
+import random
+from Bio import SeqIO
+
+__author__ = "Cetin, Oemer"
+
+
+def extract_random_reads(
+    fasta_file, output_fasta, read_length=150, num_reads=1000, seed=42
+) -> None:
+    """
+    Uniformly extracts reads from a genome and writes them to a FASTA file.
+
+    Args:
+        fasta_file (str): Path to input FASTA file.
+        output_fasta (str): Output FASTA file to write simulated reads.
+        read_length (int): Length of each read to extract.
+        num_reads (int): Total number of reads to extract.
+        seed (int): A seed for reproducibility.
+
+    Raises:
+        ValueError: If the sequences are shorter than the chosen read length.
+    """
+    random.seed(seed)
+    sequences = [
+        record
+        for record in SeqIO.parse(fasta_file, "fasta")
+        if len(record.seq) >= read_length
+    ]
+    if not sequences:
+        raise ValueError("No sequences long enough for the desired read length.")
+
+    # Probability to extract reads from large contigs is higher
+    seq_lengths = [len(rec.seq) for rec in sequences]
+    total_length = sum(seq_lengths)
+    weights = [single_length / total_length for single_length in seq_lengths]
+
+    with open(output_fasta, "w") as o:
+        for i in range(num_reads):
+            # random.choices() provides a list!
+            selected = random.choices(sequences, weights=weights, k=1)[0]
+            seq_length = len(selected.seq)
+            start = random.randint(0, seq_length - read_length)
+            read_seq = selected.seq[start : start + read_length]
+            o.write(
+                f">read_{i}_{selected.id}_{start}-{start + read_length}\n{read_seq}\n"
+            )
+    print("The reads have been simulated successfully.")
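A hypothetical invocation of the new simulator (file names are placeholders). Note that contigs shorter than read_length are filtered out before sampling, and longer contigs are proportionally more likely to be drawn:

from xspect.misclassification_detection.simulate_reads import extract_random_reads

extract_random_reads(
    "assembly.fasta",  # placeholder input genome/assembly
    "simulated_reads.fasta",  # placeholder output file
    read_length=150,
    num_reads=1000,
    seed=42,  # fixed seed keeps the sampled reads reproducible
)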
xspect/mlst_feature/mlst_helper.py
CHANGED
@@ -2,10 +2,10 @@
 
 __author__ = "Cetin, Oemer"
 
-import requests
 import json
-from io import StringIO
 from pathlib import Path
+from io import StringIO
+import requests
 from Bio import SeqIO
 from xspect.definitions import get_xspect_model_path
 
@@ -29,7 +29,7 @@ def create_fasta_files(locus_path: Path, fasta_batch: str) -> None:
         output_fasta_file = locus_path / f"Allele_ID_{number}.fasta"
         if output_fasta_file.exists():
            continue  # Ignore existing ones
-        with open(output_fasta_file, "w") as allele:
+        with open(output_fasta_file, "w", encoding="utf-8") as allele:
             SeqIO.write(record, allele, "fasta")
 
 
@@ -59,10 +59,9 @@ def pick_species_number_from_db(available_species: dict) -> str:
             if int(choice) in available_species.keys():
                 chosen_species = available_species.get(int(choice))
                 return chosen_species
-
-
-
-            )
+            print(
+                "Wrong input! Try again with a number that is available in the list above."
+            )
         except ValueError:
             print(
                 "Wrong input! Try again with a number that is available in the list above."
@@ -95,10 +94,9 @@ def pick_scheme_number_from_db(available_schemes: dict) -> str:
             if int(choice) in available_schemes.keys():
                 chosen_scheme = available_schemes.get(int(choice))[1]
                 return chosen_scheme
-
-
-
-            )
+            print(
+                "Wrong input! Try again with a number that is available in the above list."
+            )
         except ValueError:
             print(
                 "Wrong input! Try again with a number that is available in the above list."
@@ -162,12 +160,12 @@ def pick_scheme(available_schemes: dict) -> Path:
     for counter, scheme in available_schemes.items():
         # For Strain Typing with an API-POST Request to the db
         if str(scheme).startswith("http"):
-            scheme_json = requests.get(scheme).json()
+            scheme_json = requests.get(scheme, timeout=10).json()
             print(str(counter) + ":" + scheme_json["description"])
 
         # To pick a scheme after download for fitting
         else:
-            print(str(counter) + ":" + str(scheme).
+            print(str(counter) + ":" + str(scheme).rsplit("/", maxsplit=1)[-1])
 
     print("\nPick a scheme for strain type prediction")
     while True:
@@ -176,10 +174,9 @@ def pick_scheme(available_schemes: dict) -> Path:
             if int(choice) in available_schemes.keys():
                 chosen_scheme = available_schemes.get(int(choice))
                 return chosen_scheme
-
-
-
-            )
+            print(
+                "Wrong input! Try again with a number that is available in the above list."
+            )
         except ValueError:
             print(
                 "Wrong input! Try again with a number that is available in the above list."
@@ -209,8 +206,7 @@ class MlstResult:
         Returns:
             dict: The result dictionary with a sequence ID as key and the Strain type as value.
         """
-
-        return results
+        return dict(self.hits.items())
 
     def to_dict(self) -> dict:
         """
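The rewritten return value is a shallow copy of the internal mapping, so callers can modify the result without mutating the model's state. A small illustration with invented values:

hits = {"sequence_1": "ST2"}  # stands in for MlstResult.hits
result = dict(hits.items())  # what the method now returns
result["sequence_1"] = "ST404"
print(hits["sequence_1"])  # still "ST2"; the copy shields the original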
xspect/mlst_feature/pub_mlst_handler.py
CHANGED
@@ -2,8 +2,8 @@
 
 __author__ = "Cetin, Oemer"
 
-import requests
 import json
+import requests
 from xspect.mlst_feature.mlst_helper import (
     create_fasta_files,
     pick_species_number_from_db,
@@ -51,7 +51,7 @@ class PubMLSTHandler:
         counter = 1
         # retrieve all available species
         species_url = PubMLSTHandler.base_url
-        for species_databases in requests.get(species_url).json():
+        for species_databases in requests.get(species_url, timeout=10).json():
             for database in species_databases["databases"]:
                 if database["name"].endswith("seqdef"):
                     available_species[counter] = database["name"]
@@ -61,7 +61,7 @@ class PubMLSTHandler:
 
         counter = 1
         scheme_url = f"{species_url}/{chosen_species}/schemes"
-        for scheme in requests.get(scheme_url).json()["schemes"]:
+        for scheme in requests.get(scheme_url, timeout=10).json()["schemes"]:
             # scheme["description"] stores the name of a scheme.
             # scheme["scheme"] stores the URL that is needed for downloading all loci.
             available_schemes[counter] = [scheme["description"], scheme["scheme"]]
@@ -70,11 +70,8 @@ class PubMLSTHandler:
         # Selection process of available scheme from a species for download (doubles are caught!)
         while True:
             chosen_scheme = pick_scheme_number_from_db(available_schemes)
-
+            if chosen_scheme not in chosen_schemes:
                 chosen_schemes.append(chosen_scheme)
-                if chosen_scheme not in chosen_schemes
-                else None
-            )
             choice = input(
                 "Do you want to pick another scheme to download? (y/n):"
             ).lower()
@@ -97,7 +94,7 @@ class PubMLSTHandler:
         self.choose_schemes()  # changes the scheme_list attribute
 
         for scheme in self.scheme_list:
-            scheme_json = requests.get(scheme).json()
+            scheme_json = requests.get(scheme, timeout=10).json()
             # We only want the name and the respective featured loci of a scheme
             scheme_name = scheme_json["description"]
             locus_list = scheme_json["loci"]
@@ -117,7 +114,7 @@ class PubMLSTHandler:
                 if not locus_path.exists():
                     locus_path.mkdir(exist_ok=True, parents=True)
 
                alleles = requests.get(f"{locus_url}/alleles_fasta").text
-                alleles = requests.get(f"{locus_url}/alleles_fasta").text
+                alleles = requests.get(f"{locus_url}/alleles_fasta", timeout=10).text
                 create_fasta_files(locus_path, alleles)
 
     def assign_strain_type_by_db(self) -> None:
@@ -132,13 +129,15 @@ class PubMLSTHandler:
             str(pick_scheme(scheme_list_to_dict(self.scheme_list))) + "/sequence"
         )
         fasta_file = get_xspect_upload_path() / "Test.fna"
-        with open(fasta_file, "r") as file:
+        with open(fasta_file, "r", encoding="utf-8") as file:
             data = file.read()
         payload = {  # Essential API-POST-Body
             "sequence": data,
             "filetype": "fasta",
         }
-        response = requests.post(
+        response = requests.post(
+            scheme_url, data=json.dumps(payload), timeout=10
+        ).json()
 
         for locus, meta_data in response["exact_matches"].items():
             # meta_data is a list containing a dictionary, therefore [0] and then key value.
@@ -170,18 +169,16 @@ class PubMLSTHandler:
             }
         }
 
-        response = requests.post(post_url + "/designations", json=payload)
+        response = requests.post(post_url + "/designations", json=payload, timeout=10)
 
         if response.status_code == 200:
             data = response.json()
             if "fields" in data:
                 post_response = data["fields"]
                 return post_response
-
-
-            post_response += "Possibly a novel Strain Type."
-            return post_response
-        else:
-            post_response = "Error:" + str(response.status_code)
-            post_response += response.text
+            post_response = "No matching Strain Type found in the database. "
+            post_response += "Possibly a novel Strain Type."
             return post_response
+        post_response = "Error:" + str(response.status_code)
+        post_response += response.text
+        return post_response
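The timeout=10 added to every requests call above bounds how long XspecT waits for PubMLST before raising, instead of blocking indefinitely on a hung connection. A sketch of the failure mode callers may now want to handle (the URL is assumed to be the standard PubMLST REST root; the error handling is illustrative, not part of the diff):

import requests

try:
    response = requests.get("https://rest.pubmlst.org", timeout=10)  # seconds until Timeout
    response.raise_for_status()
except requests.exceptions.Timeout:
    print("PubMLST did not respond within 10 seconds.")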
xspect/model_management.py
CHANGED
@@ -2,45 +2,41 @@
 
 from json import loads, dumps
 from pathlib import Path
-from xspect.models.probabilistic_single_filter_model import (
-    ProbabilisticSingleFilterModel,
-)
-from xspect.models.probabilistic_filter_svm_model import ProbabilisticFilterSVMModel
 from xspect.definitions import get_xspect_model_path
 
 
-def
+def get_genus_model_path(genus) -> Path:
     """
-    Get a genus model for the specified genus.
+    Get a genus model path for the specified genus.
 
-    This function retrieves a pre-trained genus classification model based on the
+    This function retrieves the path of a pre-trained genus classification model based on the
+    provided genus name.
 
     Args:
         genus (str): The genus name for which the model is to be retrieved.
 
     Returns:
-
+        Path: The file path of the genus classification model.
     """
     genus_model_path = get_xspect_model_path() / (genus.lower() + "-genus.json")
-
-    return genus_filter_model
+    return genus_model_path
 
 
-def
+def get_species_model_path(genus) -> Path:
     """
-    Get a species
+    Get a species model path for the specified genus.
 
-    This function retrieves a pre-trained species classification model based on the
+    This function retrieves the path of a pre-trained species classification model based on the
+    provided genus name.
 
     Args:
         genus (str): The genus name for which the species model is to be retrieved.
 
     Returns:
-
+        Path: The file path of the species classification model.
     """
     species_model_path = get_xspect_model_path() / (genus.lower() + "-species.json")
-
-    return species_filter_model
+    return species_model_path
 
 
 def get_model_metadata(model: str | Path) -> dict:
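A short sketch of the renamed helpers, which now return Path objects built from the "<genus>-genus.json" / "<genus>-species.json" naming shown above ("Acinetobacter" is an example genus; the files must already exist under the model directory):

from xspect.model_management import get_genus_model_path, get_species_model_path

print(get_genus_model_path("Acinetobacter"))  # .../acinetobacter-genus.json
print(get_species_model_path("Acinetobacter"))  # .../acinetobacter-species.json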
@@ -121,7 +117,8 @@ def get_models() -> dict[str, list[dict]]:
     This function scans the model directory for JSON files and organizes them by their model type.
 
     Returns:
-        dict[str, list[dict]]: A dictionary where keys are model types and values are lists of
+        dict[str, list[dict]]: A dictionary where keys are model types and values are lists of
+        model display names.
     """
     model_dict = {}
     for model_file in get_xspect_model_path().glob("*.json"):
xspect/models/probabilistic_filter_mlst_model.py
CHANGED
@@ -2,14 +2,14 @@
 
 __author__ = "Cetin, Oemer"
 
-import cobs_index
 import json
 from pathlib import Path
+from collections import defaultdict
+import cobs_index
+from cobs_index import DocumentList
 from Bio import SeqIO
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
-from cobs_index import DocumentList
-from collections import defaultdict
 from xspect.file_io import get_record_iterator
 from xspect.mlst_feature.mlst_helper import MlstResult
 from xspect.mlst_feature.pub_mlst_handler import PubMLSTHandler
@@ -100,11 +100,11 @@ class ProbabilisticFilterMlstSchemeModel:
                 "Scheme not found. Please make sure to download the schemes prior!"
             )
 
-        scheme = str(scheme_path).
+        scheme = str(scheme_path).rsplit("/", maxsplit=1)[-1]
         cobs_path = ""
         # COBS structure for every locus (default = 7 for Oxford or Pasteur scheme)
         for locus_path in sorted(scheme_path.iterdir()):
-            locus = str(locus_path).
+            locus = str(locus_path).rsplit("/", maxsplit=1)[-1]
             # counts all fasta files that belong to a locus
             self.loci[locus] = sum(
                 (1 for _ in locus_path.iterdir() if not str(_).endswith("cache"))
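The recurring str(path).rsplit("/", maxsplit=1)[-1] idiom that completes the truncated calls above extracts the final path component; pathlib's Path.name yields the same result without assuming "/" as the separator. A quick illustration with a placeholder path:

from pathlib import Path

scheme_path = Path("mlst/schemes/Oxford")  # placeholder
print(str(scheme_path).rsplit("/", maxsplit=1)[-1])  # "Oxford"
print(scheme_path.name)  # "Oxford", separator-agnostic equivalent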
@@ -112,7 +112,7 @@ class ProbabilisticFilterMlstSchemeModel:
 
             # determine the avg base pair size of alleles
             fasta_file = next(locus_path.glob("*.fasta"), None)
-            with open(fasta_file, "r") as handle:
+            with open(fasta_file, "r", encoding="utf-8") as handle:
                 record = next(SeqIO.parse(handle, "fasta"))
                 self.avg_locus_bp_size.append(len(record.seq))
 
@@ -134,7 +134,8 @@ class ProbabilisticFilterMlstSchemeModel:
 
     def save(self) -> None:
         """Saves the model to disk"""
-
+        # [-1] contains the scheme name
+        scheme = str(self.scheme_path).rsplit("/", maxsplit=1)[-1]
         json_path = self.base_path / scheme / f"{scheme}.json"
         json_object = json.dumps(self.to_dict(), indent=4)
 
@@ -152,7 +153,7 @@ class ProbabilisticFilterMlstSchemeModel:
         Returns:
             ProbabilisticFilterMlstSchemeModel: A trained model from the disk in JSON format.
         """
-        scheme_name = str(scheme_path).
+        scheme_name = str(scheme_path).rsplit("/", maxsplit=1)[-1]
         json_path = scheme_path / f"{scheme_name}.json"
         with open(json_path, "r", encoding="utf-8") as file:
             json_object = file.read()
@@ -221,7 +222,7 @@ class ProbabilisticFilterMlstSchemeModel:
         for entry in sorted(cobs_path.iterdir()):
             if str(entry).endswith(".json"):
                 continue
-            file_name = str(entry).
+            file_name = str(entry).rsplit("/", maxsplit=1)[-1]  # file_name = locus
             scheme_path_list.append(file_name.split(".")[0])  # without the file ending
 
         result_dict = {}
@@ -442,7 +443,7 @@ class ProbabilisticFilterMlstSchemeModel:
         Returns:
             bool: True if any locus score >= 0.5 * its avg base pair size, False otherwise.
         """
-        for i, (
+        for i, (_, allele_score_dict) in enumerate(highest_results.items()):
             if not allele_score_dict:
                 continue  # skip empty values
 
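A sketch of the acceptance rule this loop implements per its docstring: the scheme counts as a hit as soon as any locus' best allele score reaches half of that locus' average allele length. All values below are invented, and the indexing of avg_locus_bp_size in step with highest_results is inferred from the enumerate() call, not shown in the diff:

avg_locus_bp_size = [450, 480]
highest_results = {"Oxf_gltA": {"Allele_ID_3": 240}, "Oxf_gyrB": {}}

for i, (_, allele_score_dict) in enumerate(highest_results.items()):
    if not allele_score_dict:
        continue  # skip empty values
    if max(allele_score_dict.values()) >= 0.5 * avg_locus_bp_size[i]:
        print("scheme hit")  # 240 >= 225, so this prints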