XspecT 0.2.7__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -4,7 +4,6 @@
 
 import csv
 import json
-from linecache import getline
 from pathlib import Path
 from sklearn.svm import SVC
 from Bio.SeqRecord import SeqRecord
@@ -30,6 +29,8 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         c: float,
         fpr: float = 0.01,
         num_hashes: int = 7,
+        training_accessions: dict[str, list[str]] = None,
+        svm_accessions: dict[str, list[str]] = None,
     ) -> None:
         super().__init__(
             k=k,
@@ -40,14 +41,17 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
             base_path=base_path,
             fpr=fpr,
             num_hashes=num_hashes,
+            training_accessions=training_accessions,
         )
         self.kernel = kernel
         self.c = c
+        self.svm_accessions = svm_accessions
 
     def to_dict(self) -> dict:
         return super().to_dict() | {
             "kernel": self.kernel,
             "C": self.c,
+            "svm_accessions": self.svm_accessions,
         }
 
     def set_svm_params(self, kernel: str, c: float) -> None:
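
The `to_dict` change above means the accessions used for SVM training are now persisted with the model metadata. A minimal sketch of the resulting entry, assuming values of the shape the new constructor accepts (the species name and accession are illustrative, not taken from the package):

    # Shape of the dict returned by to_dict() after this change; the
    # "kernel" and "C" keys are unchanged, "svm_accessions" is new.
    entry = {
        "kernel": "linear",
        "C": 1.0,
        "svm_accessions": {"Acinetobacter baumannii": ["GCF_000069245.1"]},
    }
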
@@ -62,32 +66,41 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         svm_path: Path,
         display_names: dict = None,
         svm_step: int = 1,
+        training_accessions: list[str] = None,
+        svm_accessions: list[str] = None,
     ) -> None:
         """Fit the SVM to the sequences and labels"""
 
         # Since the SVM works with score data, we need to train
         # the underlying data structure for score generation first
-        super().fit(dir_path, display_names=display_names)
+        super().fit(
+            dir_path,
+            display_names=display_names,
+            training_accessions=training_accessions,
+        )
+
+        self.svm_accessions = svm_accessions
 
         # calculate scores for SVM training
         score_list = []
-        for file in svm_path.iterdir():
-            if not file.is_file():
-                continue
-            if file.suffix[1:] not in fasta_endings + fastq_endings:
+
+        for species_folder in svm_path.iterdir():
+            if not species_folder.is_dir():
                 continue
-            print(f"Calculating {file.name} scores for SVM training...")
-            res = super().predict(file, step=svm_step)
-            scores = res.get_scores()["total"]
-            accession = "".join(file.name.split("_")[:2])
-            file_header = getline(str(file), 1)
-            label_id = file_header.replace("\n", "").replace(">", "")
-
-            # format scores for csv
-            scores = dict(sorted(scores.items()))
-            scores = ",".join([str(score) for score in scores.values()])
-            scores = f"{accession},{scores},{label_id}"
-            score_list.append(scores)
+            for file in species_folder.iterdir():
+                if file.suffix[1:] not in fasta_endings + fastq_endings:
+                    continue
+                print(f"Calculating {file.name} scores for SVM training...")
+                res = super().predict(file, step=svm_step)
+                scores = res.get_scores()["total"]
+                accession = file.stem
+                label_id = species_folder.name
+
+                # format scores for csv
+                scores = dict(sorted(scores.items()))
+                scores = ",".join([str(score) for score in scores.values()])
+                scores = f"{accession},{scores},{label_id}"
+                score_list.append(scores)
 
         # csv header
         keys = list(self.display_names.keys())
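
Note the restructuring here: previously `fit` read a flat directory of FASTA/FASTQ files and recovered each label from the file's first header line via `linecache.getline` (hence the dropped import in the first hunk); now it expects one sub-directory per species under `svm_path`, using the folder name as the label and the file stem as the accession. A minimal sketch of how the new loop derives both values, with a hypothetical layout:

    from pathlib import Path

    # Hypothetical training layout assumed by the reworked fit():
    # svm_path/<species label>/<accession>.fasta
    file = Path("svm_path/Acinetobacter baumannii/GCF_000069245.1.fasta")
    accession = file.stem        # "GCF_000069245.1"
    label_id = file.parent.name  # "Acinetobacter baumannii"
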
@@ -162,6 +175,8 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
             model_json["C"],
             fpr=model_json["fpr"],
             num_hashes=model_json["num_hashes"],
+            training_accessions=model_json["training_accessions"],
+            svm_accessions=model_json["svm_accessions"],
         )
         model.display_names = model_json["display_names"]
 
@@ -25,6 +25,7 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
         model_type: str,
         base_path: Path,
         fpr: float = 0.01,
+        training_accessions: list[str] = None,
     ) -> None:
         super().__init__(
             k=k,
@@ -35,11 +36,16 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
             base_path=base_path,
             fpr=fpr,
             num_hashes=1,
+            training_accessions=training_accessions,
         )
         self.bf = None
 
-    def fit(self, file_path: Path, display_name: str) -> None:
+    def fit(
+        self, file_path: Path, display_name: str, training_accessions: list[str] = None
+    ) -> None:
         """Fit the cobs classic index to the sequences and labels"""
+        self.training_accessions = training_accessions
+
         # estimate number of kmers
         total_length = 0
         for record in get_record_iterator(file_path):
@@ -88,6 +94,7 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
             model_json["model_type"],
             path.parent,
             fpr=model_json["fpr"],
+            training_accessions=model_json["training_accessions"],
         )
         model.display_names = model_json["display_names"]
         bloom_path = model.base_path / model.slug() / "filter.bloom"
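
Taken together, these hunks thread `training_accessions` through the single-filter model the same way: accepted by the constructor and by `fit`, and restored from the model JSON on load. A hedged usage sketch, assuming an already-constructed `ProbabilisticSingleFilterModel` named `model` (path, display name, and accession are illustrative):

    from pathlib import Path

    model.fit(
        Path("genomes/GCF_000069245.1.fasta"),
        display_name="Acinetobacter baumannii",
        training_accessions=["GCF_000069245.1"],
    )
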
xspect/models/result.py CHANGED
@@ -1,50 +1,7 @@
 """Module for storing the results of XspecT models."""
 
-from enum import Enum
-
-
-def get_last_processing_step(result: "ModelResult") -> "ModelResult":
-    """Get the last subprocessing step of the result. First path only."""
-
-    # traverse result tree to get last step
-    while result.subprocessing_steps:
-        result = result.subprocessing_steps[-1].result
-    return result
-
-
-class StepType(Enum):
-    """Enum for defining the type of a subprocessing step."""
-
-    PREDICTION = 1
-    FILTERING = 2
-
-    def __str__(self) -> str:
-        return self.name.lower()
-
-
-class SubprocessingStep:
-    """Class for storing a subprocessing step of an XspecT model."""
-
-    def __init__(
-        self,
-        subprocessing_type: StepType,
-        label: str,
-        treshold: float,
-        result: "ModelResult",
-    ):
-        self.subprocessing_type = subprocessing_type
-        self.label = label
-        self.treshold = treshold
-        self.result = result
-
-    def to_dict(self) -> dict:
-        """Return the subprocessing step as a dictionary."""
-        return {
-            "subprocessing_type": str(self.subprocessing_type),
-            "label": self.label,
-            "treshold": self.treshold,
-            "result": self.result.to_dict() if self.result else {},
-        }
+from json import dumps
+from pathlib import Path
 
 
 class ModelResult:
@@ -58,6 +15,7 @@ class ModelResult:
         num_kmers: dict[str, int],
         sparse_sampling_step: int = 1,
         prediction: str = None,
+        input_source: str = None,
     ):
         if "total" in hits:
             raise ValueError(
@@ -68,15 +26,7 @@ class ModelResult:
         self.num_kmers = num_kmers
         self.sparse_sampling_step = sparse_sampling_step
         self.prediction = prediction
-        self.subprocessing_steps = []
-
-    def add_subprocessing_step(self, subprocessing_step: SubprocessingStep) -> None:
-        """Add a subprocessing step to the result."""
-        if subprocessing_step.label in self.subprocessing_steps:
-            raise ValueError(
-                f"Subprocessing step {subprocessing_step.label} already exists in the result"
-            )
-        self.subprocessing_steps.append(subprocessing_step)
+        self.input_source = input_source
 
     def get_scores(self) -> dict:
         """Return the scores of the model."""
@@ -108,19 +58,33 @@ class ModelResult:
         return total_hits
 
     def get_filter_mask(self, label: str, filter_threshold: float) -> dict[str, bool]:
-        """Return a mask for filtered subsequences."""
-        if filter_threshold < 0 or filter_threshold > 1:
+        """Return a mask for filtered subsequences.
+
+        The mask is a dictionary with subsequence names as keys and boolean values
+        indicating whether the subsequence is above the filter threshold for the given label.
+        A value of -1 for filter_threshold indicates that the subsequence with the maximum score
+        for the given label should be returned.
+        """
+        if filter_threshold < 0 and not filter_threshold == -1 or filter_threshold > 1:
             raise ValueError("The filter threshold must be between 0 and 1.")
 
         scores = self.get_scores()
         scores.pop("total")
-        return {
-            subsequence: score[label] >= filter_threshold
-            for subsequence, score in scores.items()
-        }
+        if not filter_threshold == -1:
+            return {
+                subsequence: score[label] >= filter_threshold
+                for subsequence, score in scores.items()
+            }
+        else:
+            return {
+                subsequence: score[label] == max(score.values())
+                for subsequence, score in scores.items()
+            }
 
-    def get_filtered_subsequences(self, label: str, filter_threshold: 0.7) -> list[str]:
-        """Return the filtered subsequences."""
+    def get_filtered_subsequence_labels(
+        self, label: str, filter_threshold: float = 0.7
+    ) -> list[str]:
+        """Return the labels of filtered subsequences."""
         return [
             subsequence
             for subsequence, mask in self.get_filter_mask(
@@ -137,13 +101,15 @@ class ModelResult:
             "hits": self.hits,
             "scores": self.get_scores(),
             "num_kmers": self.num_kmers,
-            "subprocessing_steps": [
-                subprocessing_step.to_dict()
-                for subprocessing_step in self.subprocessing_steps
-            ],
+            "input_source": self.input_source,
         }
 
         if self.prediction is not None:
             res["prediction"] = self.prediction
 
         return res
+
+    def save(self, path: Path) -> None:
+        """Save the result as a JSON file."""
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(dumps(self.to_dict(), indent=4))
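
The net effect on `ModelResult`: the subprocessing-step tree is gone, replaced by a flat result with an optional `input_source`, a -1 sentinel in `get_filter_mask` that selects the best-scoring label per subsequence, and a `save` method. A short sketch of the new surface (constructor arguments are illustrative; -1 passes the revised bounds check because of the explicit `not filter_threshold == -1` clause):

    from pathlib import Path
    from xspect.models.result import ModelResult

    res = ModelResult(
        hits={"contig_1": {"species_a": 48, "species_b": 3}},  # hypothetical hits
        num_kmers={"contig_1": 50},
        input_source="sample.fasta",
    )
    mask = res.get_filter_mask("species_a", -1)  # True where species_a has the max score
    res.save(Path("result.json"))                # writes to_dict() as indented JSON
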
xspect/ncbi.py ADDED
@@ -0,0 +1,265 @@
+"""NCBI handler for the NCBI Datasets API."""
+
+from enum import Enum
+from pathlib import Path
+import requests
+import time
+
+# pylint: disable=line-too-long
+
+
+class AssemblyLevel(Enum):
+    """Enum for the assembly level."""
+
+    REFERENCE = "reference"
+    COMPLETE_GENOME = "complete_genome"
+    CHROMOSOME = "chromosome"
+    SCAFFOLD = "scaffold"
+    CONTIG = "contig"
+
+
+class AssemblySource(Enum):
+    """Enum for the assembly source."""
+
+    REFSEQ = "refseq"
+    GENBANK = "genbank"
+
+
+class NCBIHandler:
+    """This class uses the NCBI Datasets API to get the taxonomy tree of a given Taxon.
+
+    The taxonomy tree consists of only the next children to the parent taxon.
+    The children are only of the next lower rank of the parent taxon.
+    """
+
+    def __init__(
+        self,
+        api_key: str = None,
+    ):
+        """Initialise the NCBI handler."""
+        self.api_key = api_key
+        self.base_url = "https://api.ncbi.nlm.nih.gov/datasets/v2"
+        self.last_request_time = 0
+        self.min_interval = (
+            1 / 10 if api_key else 1 / 5
+        )  # NCBI allows 10 requests per second with an API key, otherwise 5 requests per second
+
+    def _enforce_rate_limit(self):
+        """Enforce rate limiting for the NCBI Datasets API.
+
+        This method ensures that the requests to the API are limited to 5 requests per second
+        without an API key and 10 requests per second with an API key.
+        It uses a simple time-based approach to enforce the rate limit.
+        """
+        now = time.time()
+        elapsed_time = now - self.last_request_time
+        if elapsed_time < self.min_interval:
+            time.sleep(self.min_interval - elapsed_time)
+        self.last_request_time = now  # Update last request time
+
+    def _make_request(self, endpoint: str, timeout: int = 5) -> dict:
+        """Make a request to the NCBI Datasets API.
+
+        Args:
+            endpoint (str): The endpoint to make the request to.
+            timeout (int, optional): The timeout for the request in seconds. Defaults to 5.
+
+        Returns:
+            dict: The response from the API.
+        """
+        self._enforce_rate_limit()
+
+        endpoint = endpoint if endpoint.startswith("/") else "/" + endpoint
+        headers = {}
+        if self.api_key:
+            headers["api-key"] = self.api_key
+        response = requests.get(
+            self.base_url + endpoint, headers=headers, timeout=timeout
+        )
+        if response.status_code != 200:
+            response.raise_for_status()
+
+        return response.json()
+
+    def get_genus_taxon_id(self, genus: str) -> int:
+        """
+        Get the taxon id for a given genus name.
+
+        This function checks if the genus name is valid by making a request to the NCBI Datasets API.
+        If the genus name is valid, it returns the taxon id.
+        If the genus name is not valid, it raises an exception.
+
+        Args:
+            genus (str): The genus name to validate.
+
+        Returns:
+            int: The taxon id for the given genus name.
+
+        Raises:
+            ValueError: If the genus name is not valid.
+        """
+        endpoint = f"/taxonomy/taxon/{genus}"
+        response = self._make_request(endpoint)
+
+        try:
+            taxonomy = response["taxonomy_nodes"][0]["taxonomy"]
+
+            taxon_id = taxonomy["tax_id"]
+            rank = taxonomy["rank"]
+            lineage = taxonomy["lineage"]
+
+            if rank != "GENUS":
+                raise ValueError(f"Genus name {genus} is not a genus.")
+            if lineage[2] != 2:
+                raise ValueError(f"Genus name {genus} does not belong to bacteria.")
+
+            return taxon_id
+        except (IndexError, KeyError, TypeError) as e:
+            raise ValueError(f"Invalid genus name: {genus}") from e
+
+    def get_species(self, genus_id: int) -> list[int]:
+        """
+        Get the species for a given genus id.
+
+        This function makes a request to the NCBI Datasets API to get the species for a given genus id.
+        It returns a list of species taxonomy ids.
+
+        Args:
+            genus_id (int): The genus id to get the species for.
+
+        Returns:
+            list[int]: A list containing the species taxonomy ids.
+        """
+        endpoint = f"/taxonomy/taxon/{genus_id}/filtered_subtree"
+        response = self._make_request(endpoint)
+
+        try:
+            species_ids = response["edges"][str(genus_id)]["visible_children"]
+        except (IndexError, KeyError, TypeError) as e:
+            raise ValueError(f"Invalid genus id: {genus_id}") from e
+        return species_ids
+
+    def get_taxon_names(self, taxon_ids: list[int]) -> dict[int, str]:
+        """
+        Get the names for a given list of taxon ids.
+
+        This function makes a request to the NCBI Datasets API to get the names for a given list of taxon ids.
+        It returns a dictionary with the taxon ids as keys and the names as values.
+
+        Args:
+            taxon_ids (list[int]): The list of taxon ids to get the names for.
+
+        Returns:
+            dict[int, str]: A dictionary containing the taxon ids and their corresponding names.
+        """
+        if len(taxon_ids) > 1000:
+            raise ValueError("Maximum number of taxon ids is 1000.")
+        if len(taxon_ids) < 1:
+            raise ValueError("At least one taxon id is required.")
+
+        endpoint = f"/taxonomy/taxon/{','.join(map(str, taxon_ids))}?page_size=1000"
+        response = self._make_request(endpoint)
+
+        try:
+            taxon_names = {
+                int(taxonomy_node["taxonomy"]["tax_id"]): taxonomy_node["taxonomy"][
+                    "organism_name"
+                ]
+                for taxonomy_node in response["taxonomy_nodes"]
+            }
+            if len(taxon_names) != len(taxon_ids):
+                raise ValueError("Not all taxon ids were found.")
+        except (IndexError, KeyError, TypeError) as e:
+            raise ValueError(f"Invalid taxon ids: {taxon_ids}") from e
+
+        return taxon_names
+
+    def get_accessions(
+        self,
+        taxon_id: int,
+        assembly_level: AssemblyLevel,
+        assembly_source: AssemblySource,
+        count: int,
+        min_n50: int = 10000,
+        exclude_atypical: bool = True,
+        exclude_paired_reports: bool = True,
+        current_version_only: bool = True,
+    ) -> list[str]:
+        """
+        Get the accessions for a given taxon id.
+
+        This function makes a request to the NCBI Datasets API to get the accessions for a given taxon id.
+        It filters the accessions based on the assembly level, assembly source, and other parameters.
+        It returns a list with the respective accessions.
+
+        Args:
+            taxon_id (int): The taxon id to get the accessions for.
+            assembly_level (AssemblyLevel): The assembly level to get the accessions for.
+            assembly_source (AssemblySource): The assembly source to get the accessions for.
+            count (int): The number of accessions to get.
+            min_n50 (int, optional): The minimum contig n50 to filter the accessions. Defaults to 10000.
+            exclude_atypical (bool, optional): Whether to exclude atypical accessions. Defaults to True.
+            exclude_paired_reports (bool, optional): Whether to exclude paired reports. Defaults to True.
+            current_version_only (bool, optional): Whether to get only the current version of the accessions. Defaults to True.
+
+        Returns:
+            list[str]: A list containing the accessions.
+        """
+        endpoint = (
+            f"/genome/taxon/{taxon_id}/dataset_report?"
+            f"filters.assembly_source={assembly_source.value}&"
+            f"filters.exclude_atypical={exclude_atypical}&"
+            f"filters.exclude_paired_reports={exclude_paired_reports}&"
+            f"filters.current_version_only={current_version_only}&"
+            f"page_size={count * 2}&"  # to avoid having less than count if n50 or ANI is not met
+        )
+        endpoint += (
+            "&filters.reference_only=true"
+            if assembly_level == AssemblyLevel.REFERENCE
+            else f"&filters.assembly_level={assembly_level.value}"
+        )
+
+        response = self._make_request(endpoint)
+        try:
+            accessions = [
+                report["accession"]
+                for report in response["reports"]
+                if report["assembly_stats"]["contig_n50"] >= min_n50
+                and report["average_nucleotide_identity"]["taxonomy_check_status"]
+                == "OK"
+            ]
+        except (IndexError, KeyError, TypeError):
+            print(f"Could not get accessions for taxon with ID: {taxon_id}. Skipping.")
+            return []
+        return accessions[:count]  # Limit to count
+
+    def get_highest_quality_accessions(
+        self, taxon_id: int, assembly_source: AssemblySource, count: int
+    ) -> list[str]:
+        """Get the highest quality accessions for a given taxon id (based on the assembly level)."""
+        accessions = []
+        for assembly_level in list(AssemblyLevel):
+            accessions += self.get_accessions(
+                taxon_id,
+                assembly_level,
+                assembly_source,
+                count,
+            )
+            if len(set(accessions)) >= count:
+                break
+        return list(set(accessions))[:count]  # Remove duplicates and limit to count
+
+    def download_assemblies(self, accessions: list[str], output_dir: Path) -> None:
+        """Download assemblies for a list of accessions."""
+        endpoint = f"/genome/accession/{','.join(accessions)}/download?include_annotation_type=GENOME_FASTA"
+
+        self._enforce_rate_limit()
+
+        response = requests.get(self.base_url + endpoint, stream=True, timeout=5)
+        if response.status_code != 200:
+            response.raise_for_status()
+
+        output_dir.mkdir(parents=True, exist_ok=True)
+        with open(output_dir / "ncbi_dataset.zip", "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
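
A hedged end-to-end sketch of the new handler (genus, counts, and output path are illustrative; without an API key the handler throttles itself to 5 requests per second, per `min_interval` above):

    from pathlib import Path
    from xspect.ncbi import AssemblySource, NCBIHandler

    handler = NCBIHandler()  # an api_key raises the limit to 10 requests/second
    genus_id = handler.get_genus_taxon_id("Acinetobacter")
    species_ids = handler.get_species(genus_id)
    names = handler.get_taxon_names(species_ids[:10])
    accessions = handler.get_highest_quality_accessions(
        genus_id, AssemblySource.REFSEQ, count=8
    )
    handler.download_assemblies(accessions, Path("downloads"))
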