XspecT 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -1,6 +1,7 @@
 """Probabilistic filter model for sequence data"""
 
 import json
+import shutil
 from math import ceil
 from pathlib import Path
 from typing import Any
@@ -9,9 +10,19 @@ from Bio.SeqRecord import SeqRecord
 from Bio import SeqIO
 from slugify import slugify
 import cobs_index as cobs
-from xspect.definitions import fasta_endings, fastq_endings
+from xspect.definitions import (
+    fasta_endings,
+    fastq_endings,
+    get_xspect_misclassification_path,
+)
 from xspect.file_io import get_record_iterator
+from xspect.misclassification_detection.mapping import MappingHandler
 from xspect.models.result import ModelResult
+from collections import defaultdict
+from xspect.ncbi import NCBIHandler
+from xspect.misclassification_detection.point_pattern_analysis import (
+    PointPatternAnalysis,
+)
 
 
 class ProbabilisticFilterModel:
@@ -135,8 +146,8 @@ class ProbabilisticFilterModel:
             display_names (dict | None): A dictionary mapping file names to display names.
                 If None, uses file names as display names.
             training_accessions (dict[str, list[str]] | None): A dictionary mapping filter IDs to
-                lists of accession numbers used for training the model. If None, no training accessions
-                are set.
+                lists of accession numbers used for training the model. If None, no training
+                accessions are set.
         Raises:
             ValueError: If the directory path is invalid, does not exist, or is not a directory.
         """
@@ -230,6 +241,8 @@ class ProbabilisticFilterModel:
         ),
         filter_ids: list[str] = None,
         step: int = 1,
+        display_name: bool = False,
+        validation: bool = False,
     ) -> ModelResult:
         """
         Returns a model result object for the sequence(s) based on the filters in the model
@@ -246,6 +259,8 @@ class ProbabilisticFilterModel:
             filter_ids (list[str]): A list of filter IDs to filter the results. If None,
                 all results are returned.
             step (int): The step size for the k-mer search. Default is 1.
+            display_name (bool): Includes a display name for each tax_ID.
+            validation (bool): Sorts out misclassified reads.
 
         Returns:
             ModelResult: An object containing the hits for each sequence, the number of kmers,
@@ -253,11 +268,12 @@
 
         Raises:
             ValueError: If the input sequence is not valid, or if it is not a Seq object,
-                a list of Seq objects, a SeqIO iterator, or a Path object to a fasta/fastq file.
+                a list of Seq objects, a SeqIO iterator, or a Path object to a fasta/fastq
+                file.
         """
         if isinstance(sequence_input, (SeqRecord)):
             return ProbabilisticFilterModel.predict(
-                self, [sequence_input], filter_ids, step=step
+                self, [sequence_input], filter_ids, step, display_name, validation
             )
 
         if self._is_sequence_list(sequence_input) | self._is_sequence_iterator(
@@ -265,19 +281,42 @@
         ):
             hits = {}
             num_kmers = {}
+            if validation and self._is_sequence_iterator(sequence_input):
+                sequence_input = list(sequence_input)
+
             for individual_sequence in sequence_input:
                 individual_hits = self.calculate_hits(
-                    individual_sequence.seq, filter_ids, step=step
+                    individual_sequence.seq, filter_ids, step
                 )
                 num_kmers[individual_sequence.id] = self._count_kmers(
-                    individual_sequence, step=step
+                    individual_sequence, step
                 )
+
+                if display_name:
+                    individual_hits.update(
+                        {
+                            f"{key} -{self.display_names.get(key, 'Unknown').replace(
+                                self.model_display_name, '', 1)}": individual_hits.pop(
+                                key
+                            )
+                            for key in list(individual_hits.keys())
+                        }
+                    )
+
                 hits[individual_sequence.id] = individual_hits
+
+            if validation:
+                hits = self.detecting_misclassification(hits, sequence_input)
+
             return ModelResult(self.slug(), hits, num_kmers, sparse_sampling_step=step)
 
         if isinstance(sequence_input, Path):
             return ProbabilisticFilterModel.predict(
-                self, get_record_iterator(sequence_input), step=step
+                self,
+                get_record_iterator(sequence_input),
+                step=step,
+                display_name=display_name,
+                validation=validation,
             )
 
         raise ValueError(
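
For orientation, this is how the extended predict() call might look from user code. A minimal sketch, assuming the class lives in xspect.models.probabilistic_filter_model (this file's header is not shown in the diff) and that a trained model instance is available:

```python
from pathlib import Path

# Import path assumed; the diff does not show this module's file header.
from xspect.models.probabilistic_filter_model import ProbabilisticFilterModel


def classify_reads(model: ProbabilisticFilterModel, reads: Path):
    """Sketch of a predict() call using the two new keyword flags."""
    return model.predict(
        reads,  # Path input is dispatched via get_record_iterator
        filter_ids=None,  # keep all filters
        step=1,  # dense k-mer sampling
        display_name=True,  # new: append display names to the tax IDs in hit keys
        validation=True,  # new: sort out misclassified reads after scoring
    )
```
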
@@ -460,3 +499,98 @@ class ProbabilisticFilterModel:
             sequence_input,
             (SeqIO.FastaIO.FastaIterator, SeqIO.QualityIO.FastqPhredIterator),
         )
+
+    def detecting_misclassification(
+        self,
+        hits: dict[str, dict[str, int]],
+        seq_records: list[SeqRecord],
+        min_reads: int = 10,
+    ) -> dict[str, dict[str, int]]:
+        """
+        Notes:
+            Developed by Oemer Cetin as part of a BSc thesis at Goethe University Frankfurt am Main (2025):
+            "An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial Taxon Assignment".
+
+        Detects misclassification for short sequences.
+
+        This function is an alignment-based procedure that groups species by highest XspecT scores.
+        Each species group is mapped against the respective reference genome.
+        Start coordinates are extracted and scanned for local clustering.
+        When local clustering is detected, all sequences belonging to the species are sorted out.
+
+        Args:
+            hits (dict): The species annotations from the prediction step.
+            seq_records (list): The provided sequences.
+            min_reads (int): Minimum number of reads that a species group must have.
+
+        Returns:
+            dict: hits where misclassified sequences have been sorted out.
+        """
+        rec_by_id = {record.id: record for record in seq_records}
+        grouped: dict[int, list[SeqRecord]] = defaultdict(list)
+        misclassified = {}
+
+        # group by species annotation
+        for record, score_dict in hits.items():
+            if record == "misclassified":
+                continue
+            sorted_hits = sorted(
+                score_dict.items(), key=lambda entry: entry[1], reverse=True
+            )
+            if sorted_hits[0][1] > sorted_hits[1][1]:  # unique highest score
+                highest_tax_id = int(sorted_hits[0][0])  # tax_id
+                if record in rec_by_id:
+                    # groups all reads with the highest score by tax_id
+                    grouped[highest_tax_id].append(rec_by_id[record])
+        filtered_grouped = {
+            tax_id: seq for tax_id, seq in grouped.items() if len(seq) > min_reads
+        }
+        largest_group = max(
+            filtered_grouped,
+            key=lambda tax_id: len(filtered_grouped[tax_id]),
+            default=None,
+        )
+
+        # mapping procedure
+        handler = NCBIHandler()
+        out_dir = get_xspect_misclassification_path()
+        out_dir.mkdir(parents=True, exist_ok=True)
+        for tax_id, reads in filtered_grouped.items():
+            if tax_id == largest_group:
+                continue
+
+            tax_dir = out_dir / str(tax_id)
+            tax_dir.mkdir(parents=True, exist_ok=True)
+            fasta_path = tax_dir / f"{tax_id}.fasta"
+            SeqIO.write(reads, fasta_path, "fasta")
+            reference_path = tax_dir / f"{tax_id}.fna"
+            # download reference once
+            if not (reference_path.exists() and reference_path.stat().st_size > 0):
+                handler.download_reference_genome(tax_id, tax_dir)
+            if not reference_path.exists():
+                shutil.rmtree(tax_dir)
+                continue
+
+            mapping_handler = MappingHandler(str(reference_path), str(fasta_path))
+            mapping_handler.map_reads_onto_reference()
+            mapping_handler.extract_starting_coordinates()
+            genome_length = mapping_handler.get_total_genome_length()
+            start_coordinates = mapping_handler.get_start_coordinates()
+
+            if len(start_coordinates) < min_reads:
+                continue
+
+            # cluster analysis
+            analysis = PointPatternAnalysis(start_coordinates, genome_length)
+            clustered = analysis.ripleys_k_edge_corrected()
+
+            if clustered[0]:  # True or False
+                bucket = misclassified.setdefault(tax_id, {})
+                for read in reads:
+                    data = hits.pop(read.id, None)  # remove false reads from main hits
+                    if data is not None:
+                        bucket[read.id] = data
+
+        if misclassified:
+            hits["misclassified"] = misclassified
+        return hits
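
The clustering decision itself is delegated to PointPatternAnalysis.ripleys_k_edge_corrected(), whose implementation is not part of this diff. As intuition for what it tests, here is a minimal, uncorrected 1-D Ripley's K over read start coordinates: under complete spatial randomness on a genome of length L, K(t) is approximately 2t, and values far above that indicate the local clustering this method screens for. All names in this sketch are illustrative:

```python
def ripleys_k_1d(starts: list[int], genome_length: int, t: int) -> float:
    """Naive 1-D Ripley's K estimate without edge correction (illustration only)."""
    n = len(starts)
    if n < 2:
        return 0.0
    # Count ordered pairs of distinct reads whose start coordinates lie within t.
    close_pairs = sum(
        1
        for i in range(n)
        for j in range(n)
        if i != j and abs(starts[i] - starts[j]) <= t
    )
    return genome_length * close_pairs / (n * (n - 1))


# Reads piled into a ~100 bp window of a 4 Mbp genome: K(200) is far above
# 2 * 200, the expectation under randomness, so these starts count as clustered.
print(ripleys_k_1d([100, 120, 150, 160, 170, 190], 4_000_000, 200))
```
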
@@ -55,10 +55,14 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
             base_path (Path): The base path where the model will be stored.
             kernel (str): The kernel type for the SVM (e.g., 'linear', 'rbf').
             c (float): Regularization parameter for the SVM.
-            fpr (float, optional): False positive rate for the probabilistic filter. Defaults to 0.01.
-            num_hashes (int, optional): Number of hashes for the probabilistic filter. Defaults to 7.
-            training_accessions (dict[str, list[str]] | None, optional): Accessions used for training the probabilistic filter. Defaults to None.
-            svm_accessions (dict[str, list[str]] | None, optional): Accessions used for training the SVM. Defaults to None.
+            fpr (float, optional): False positive rate for the probabilistic filter.
+                Defaults to 0.01.
+            num_hashes (int, optional): Number of hashes for the probabilistic filter.
+                Defaults to 7.
+            training_accessions (dict[str, list[str]] | None, optional): Accessions used for
+                training the probabilistic filter. Defaults to None.
+            svm_accessions (dict[str, list[str]] | None, optional): Accessions used for
+                training the SVM. Defaults to None.
         """
         super().__init__(
             k=k,
@@ -112,17 +116,18 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         """
         Fit the SVM to the sequences and labels.
 
-        This method first trains the probabilistic filter model and then
-        calculates scores for the SVM training. It expects the sequences to be in
-        the specified directory and the SVM training sequences to be in the
-        specified SVM path. The scores are saved in a CSV file for later use.
+        This method first trains the probabilistic filter model and then calculates scores for
+        the SVM training. It expects the sequences to be in the specified directory and the SVM
+        training sequences to be in the specified SVM path. The scores are saved in a CSV file
+        for later use.
 
         Args:
             dir_path (Path): The directory containing the training sequences.
             svm_path (Path): The directory containing the SVM training sequences.
             display_names (dict[str, str] | None): A mapping of accession IDs to display names.
             svm_step (int): Step size for sparse sampling in SVM training.
-            training_accessions (dict[str, list[str]] | None): Accessions used for training the probabilistic filter.
+            training_accessions (dict[str, list[str]] | None): Accessions used for training the
+                probabilistic filter.
             svm_accessions (dict[str, list[str]] | None): Accessions used for training the SVM.
         """
 
@@ -178,6 +183,8 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         ),
         filter_ids: list[str] = None,
         step: int = 1,
+        display_name: bool = False,
+        validation: bool = False,
     ) -> ModelResult:
         """
         Predict the labels of the sequences.
@@ -187,19 +194,26 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         with the probabilistic filter model, and it will return a `ModelResult`.
 
         Args:
-            sequence_input (SeqRecord | list[SeqRecord] | SeqIO.FastaIO.FastaIterator | SeqIO.QualityIO.FastqPhredIterator | Path): The input sequences to predict.
-            filter_ids (list[str], optional): A list of IDs to filter the predictions. Defaults to None.
+            sequence_input (SeqRecord | list[SeqRecord] | SeqIO.FastaIO.FastaIterator |
+                SeqIO.QualityIO.FastqPhredIterator | Path): The input sequences to predict.
+            filter_ids (list[str], optional): A list of IDs to filter the predictions.
             step (int, optional): Step size for sparse sampling. Defaults to 1.
+            display_name (bool): Includes a display name for each tax_ID.
+            validation (bool): Sorts out misclassified reads.
 
         Returns:
-            ModelResult: The result of the prediction containing hits, number of kmers, and the predicted label.
+            ModelResult: The result of the prediction containing hits, number of kmers, and the
+                predicted label.
         """
         # get scores and format them for the SVM
-        res = super().predict(sequence_input, filter_ids, step=step)
+        res = super().predict(
+            sequence_input, filter_ids, step, display_name, validation
+        )
         svm_scores = dict(sorted(res.get_scores()["total"].items()))
         svm_scores = [list(svm_scores.values())]
 
         svm = self._get_svm(filter_ids)
+        res.hits["misclassified"] = res.misclassified
         return ModelResult(
             self.slug(),
             res.hits,
@@ -217,7 +231,8 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         training data to only include those keys.
 
         Args:
-            id_keys (list[str] | None): A list of IDs to filter the training data. If None, all data is used.
+            id_keys (list[str] | None): A list of IDs to filter the training data.
+                If None, all data is used.
 
         Returns:
             SVC: The trained SVM model.
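
For context on the score handling in predict() above: sorting the total-score dictionary by key before taking its values guarantees a stable feature-column order for the SVM. A hedged sketch with scikit-learn (the docstring confirms the model is an SVC; the training data and labels here are invented):

```python
from sklearn.svm import SVC

# Toy per-filter score rows for three training genomes (values invented).
X_train = [[0.95, 0.10, 0.20], [0.15, 0.90, 0.25], [0.10, 0.20, 0.85]]
y_train = ["470", "471", "48296"]
svm = SVC(kernel="linear", C=1.0).fit(X_train, y_train)

# At prediction time, sorting by key fixes the column order of the feature vector.
scores = {"471": 0.18, "470": 0.91, "48296": 0.22}
features = [list(dict(sorted(scores.items())).values())]  # [[0.91, 0.18, 0.22]]
print(svm.predict(features)[0])  # expected: "470"
```
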
@@ -34,8 +34,8 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
     ) -> None:
         """Initialize probabilistic single filter model.
 
-        This model uses a Bloom filter to store k-mers from the training sequences. It is designed to
-        be used with a single filter, which is suitable e.g. for genus-level classification.
+        This model uses a Bloom filter to store k-mers from the training sequences. It is designed
+        to be used with a single filter, which is suitable e.g. for genus-level classification.
 
         Args:
             k (int): Length of the k-mers to use for filtering
@@ -45,7 +45,7 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
             model_type (str): Type of the model, e.g. "probabilistic_single_filter"
             base_path (Path): Base path where the model will be saved
             fpr (float): False positive rate for the Bloom filter, default is 0.01
-            training_accessions (list[str] | None): List of accessions used for training, default is None
+            training_accessions (list[str] | None): List of accessions used for training
         """
         super().__init__(
             k=k,
@@ -75,7 +75,7 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
         Args:
             file_path (Path): Path to the file containing sequences in FASTA format
             display_name (str): Display name for the model
-            training_accessions (list[str] | None): List of accessions used for training, default is None
+            training_accessions (list[str] | None): List of accessions used for training
         """
         self.training_accessions = training_accessions
 
@@ -104,7 +104,7 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
         Calculates the number of k-mers in the sequence that are present in the Bloom filter.
 
         Args:
-            sequence (Seq | SeqRecord): Sequence to calculate hits for, can be a Bio.Seq or Bio.SeqRecord object
+            sequence (Seq | SeqRecord): Sequence to calculate hits for
             filter_ids (list[str] | None): List of filter IDs to use, default is None
             step (int): Step size for generating k-mers, default is 1
         Returns:
@@ -162,13 +162,15 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
         """
         Generate kmers from the sequence
 
-        Generates k-mers from the sequence, considering both the forward and reverse complement strands.
+        Generates k-mers from the sequence, considering both the forward and reverse complement
+        strands.
 
         Args:
             sequence (Seq): Sequence to generate k-mers from
             step (int): Step size for generating k-mers, default is 1
         Yields:
-            str: The minimizer k-mer (the lexicographically smallest k-mer between the forward and reverse complement)
+            str: The minimizer k-mer (the lexicographically smallest k-mer between the forward and
+                reverse complement)
         """
         num_kmers = ceil((len(sequence) - self.k + 1) / step)
         for i in range(num_kmers):
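
The minimizer described in the docstring above, the lexicographically smaller of a k-mer and its reverse complement, can be illustrated with Biopython. This sketch is illustrative and not the model's own implementation:

```python
from Bio.Seq import Seq


def canonical_kmer(kmer: str) -> str:
    """Return the lexicographically smaller of a k-mer and its reverse complement."""
    reverse_complement = str(Seq(kmer).reverse_complement())
    return min(kmer, reverse_complement)


assert canonical_kmer("TTTT") == "AAAA"  # the reverse complement sorts first
assert canonical_kmer("ACGT") == "ACGT"  # palindromic under reverse complement
```
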
xspect/models/result.py CHANGED
@@ -40,6 +40,7 @@ class ModelResult:
         self.sparse_sampling_step = sparse_sampling_step
         self.prediction = prediction
         self.input_source = input_source
+        self.misclassified = self.hits.pop("misclassified", None)
 
     def get_scores(self) -> dict:
         """
@@ -50,7 +51,8 @@ class ModelResult:
 
         Returns:
             dict: A dictionary where keys are subsequence names and values are dictionaries
-                with labels as keys and scores as values. Also includes a 'total' key for overall scores.
+                with labels as keys and scores as values. Also includes a 'total' key for
+                overall scores.
         """
         scores = {
             subsequence: {
@@ -78,7 +80,8 @@ class ModelResult:
         The total hits are calculated by summing the hits for each label across all subsequences.
 
         Returns:
-            dict: A dictionary where keys are labels and values are the total number of hits for that label.
+            dict: A dictionary where keys are labels and values are the total number of hits for
+                that label.
         """
         total_hits = {label: 0 for label in list(self.hits.values())[0]}
         for _, subsequence_hits in self.hits.items():
@@ -97,8 +100,8 @@ class ModelResult:
 
         Args:
             label (str): The label for which to filter the subsequences.
-            filter_threshold (float): The threshold for filtering subsequences. Must be between 0 and 1,
-                or -1 to return the subsequence with the maximum score for the label.
+            filter_threshold (float): The threshold for filtering subsequences. Must be between 0
+                and 1, or -1 to return the subsequence with the maximum score for the label.
 
         Returns:
             dict[str, bool]: A dictionary where keys are subsequence names and values are booleans
@@ -114,11 +117,10 @@ class ModelResult:
                 subsequence: score[label] >= filter_threshold
                 for subsequence, score in scores.items()
             }
-        else:
-            return {
-                subsequence: score[label] == max(score.values())
-                for subsequence, score in scores.items()
-            }
+        return {
+            subsequence: score[label] == max(score.values())
+            for subsequence, score in scores.items()
+        }
 
     def get_filtered_subsequence_labels(
         self, label: str, filter_threshold: float = 0.7
@@ -126,15 +128,17 @@ class ModelResult:
         """
         Return the labels of filtered subsequences.
 
-        This method filters subsequences based on the scores for a given label and a filter threshold.
+        This method filters subsequences based on the scores for a given label and a filter
+        threshold.
 
         Args:
             label (str): The label for which to filter the subsequences.
-            filter_threshold (float): The threshold for filtering subsequences. Must be between 0 and 1,
-                or -1 to return the subsequence with the maximum score for the label.
+            filter_threshold (float): The threshold for filtering subsequences. Must be between 0
+                and 1, or -1 to return the subsequence with the maximum score for the label.
 
         Returns:
-            list[str]: A list of subsequence names that meet the filter criteria for the given label.
+            list[str]: A list of subsequence names that meet the filter criteria for the given
+                label.
         """
         return [
             subsequence
@@ -148,11 +152,13 @@ class ModelResult:
         """
         Return the result as a dictionary.
 
-        This method converts the ModelResult object into a dictionary format suitable for serialization.
+        This method converts the ModelResult object into a dictionary format suitable for
+        serialization.
 
         Returns:
             dict: A dictionary representation of the ModelResult object, including model slug,
-                sparse sampling step, hits, scores, number of k-mers, input source, and prediction if available.
+                sparse sampling step, hits, scores, number of k-mers, input source, and prediction
+                if available.
         """
         res = {
             "model_slug": self.model_slug,
@@ -160,6 +166,7 @@ class ModelResult:
             "hits": self.hits,
             "scores": self.get_scores(),
             "num_kmers": self.num_kmers,
+            "misclassified": self.misclassified,
             "input_source": self.input_source,
         }
 
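Taken together, the result.py changes pop any "misclassified" bucket out of hits in the constructor, keep it as a separate attribute, and serialize it alongside the other result fields. A small sketch of the expected behavior, with constructor arguments matching the calls visible in this diff and invented values:

```python
from xspect.models.result import ModelResult

hits = {
    "read1": {"470": 42, "471": 3},
    "misclassified": {471: {"read9": {"471": 40, "470": 2}}},
}
result = ModelResult("demo-model", hits, {"read1": 50}, sparse_sampling_step=1)

assert "misclassified" not in result.hits  # popped in __init__ ...
assert result.misclassified is not None  # ... and kept as its own attribute
```
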
xspect/ncbi.py CHANGED
@@ -1,10 +1,12 @@
 """NCBI handler for the NCBI Datasets API."""
 
+import shutil
 from enum import Enum
 from pathlib import Path
 import time
 from loguru import logger
 import requests
+import zipfile
 
 # pylint: disable=line-too-long
 
@@ -194,8 +196,9 @@ class NCBIHandler:
         assembly_level: AssemblyLevel,
         assembly_source: AssemblySource,
         count: int,
-        min_n50: int = 10000,
-        exclude_atypical: bool = True,
+        min_n50: int,
+        exclude_atypical: bool,
+        allow_inconclusive: bool,
         exclude_paired_reports: bool = True,
         current_version_only: bool = True,
     ) -> list[str]:
@@ -211,11 +214,13 @@ class NCBIHandler:
             assembly_level (AssemblyLevel): The assembly level to get the accessions for.
             assembly_source (AssemblySource): The assembly source to get the accessions for.
             count (int): The number of accessions to get.
-            min_n50 (int, optional): The minimum contig n50 to filter the accessions. Defaults to 10000.
-            exclude_atypical (bool, optional): Whether to exclude atypical accessions. Defaults to True.
+            min_n50 (int): The minimum contig n50 to filter the accessions.
+            exclude_atypical (bool): Whether to exclude atypical accessions.
+            allow_inconclusive (bool): Whether to allow accessions with an inconclusive taxonomy check status.
             exclude_paired_reports (bool, optional): Whether to exclude paired reports. Defaults to True.
             current_version_only (bool, optional): Whether to get only the current version of the accessions. Defaults to True.
 
+
         Returns:
             list[str]: A list containing the accessions.
         """
@@ -240,8 +245,11 @@ class NCBIHandler:
                     report["accession"]
                     for report in response["reports"]
                     if report["assembly_stats"]["contig_n50"] >= min_n50
-                    and report["average_nucleotide_identity"]["taxonomy_check_status"]
-                    == "OK"
+                    and (
+                        allow_inconclusive
+                        or report["average_nucleotide_identity"]["taxonomy_check_status"]
+                        == "OK"
+                    )
                 ]
             except (IndexError, KeyError, TypeError):
                 logger.debug(
@@ -251,7 +259,13 @@ class NCBIHandler:
         return accessions[:count]  # Limit to count
 
     def get_highest_quality_accessions(
-        self, taxon_id: int, assembly_source: AssemblySource, count: int
+        self,
+        taxon_id: int,
+        assembly_source: AssemblySource,
+        count: int,
+        min_n50: int,
+        exclude_atypical: bool,
+        allow_inconclusive: bool,
     ) -> list[str]:
         """
         Get the highest quality accessions for a given taxon id (based on the assembly level).
@@ -263,6 +277,9 @@ class NCBIHandler:
             taxon_id (int): The taxon id to get the accessions for.
             assembly_source (AssemblySource): The assembly source to get the accessions for.
             count (int): The number of accessions to get.
+            min_n50 (int): The minimum contig n50 to filter the accessions.
+            exclude_atypical (bool): Whether to exclude atypical accessions.
+            allow_inconclusive (bool): Whether to allow accessions with an inconclusive taxonomy check status.
 
         Returns:
             list[str]: A list containing the highest quality accessions.
@@ -274,6 +291,9 @@ class NCBIHandler:
                 assembly_level,
                 assembly_source,
                 count,
+                min_n50=min_n50,
+                exclude_atypical=exclude_atypical,
+                allow_inconclusive=allow_inconclusive,
             )
             if len(set(accessions)) >= count:
                 break
@@ -302,3 +322,58 @@ class NCBIHandler:
         with open(output_dir / "ncbi_dataset.zip", "wb") as f:
             for chunk in response.iter_content(chunk_size=8192):
                 f.write(chunk)
+
+    def download_reference_genome(self, taxon_id: int, output_dir: Path) -> Path | None:
+        """
+        Notes:
+            Developed by Oemer Cetin as part of a BSc thesis at Goethe University Frankfurt am Main (2025):
+            "An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial Taxon Assignment".
+
+        Downloads the reference genome from the RefSeq DB for a given taxon ID.
+
+        This function queries the NCBI Datasets API for the reference genome and downloads it.
+
+        Args:
+            taxon_id (int): The taxonomy ID of the species.
+            output_dir (Path): Directory where the genome will be saved.
+
+        Returns:
+            Path | None: Path to the extracted reference genome (.fna), or None if no
+                reference accession was found.
+        """
+        accessions = self.get_accessions(
+            taxon_id=taxon_id,
+            assembly_level=AssemblyLevel.REFERENCE,
+            assembly_source=AssemblySource.REFSEQ,
+            count=1,  # only one reference exists
+            min_n50=0,
+            exclude_atypical=True,
+            allow_inconclusive=False,
+        )
+
+        if not accessions:
+            return None
+
+        logger.info(
+            f"Downloading reference genome for taxon {taxon_id}: {accessions[0]}"
+        )
+        self.download_assemblies(accessions, output_dir)
+
+        zip_path = output_dir / "ncbi_dataset.zip"
+
+        fna_file = ""
+        with zipfile.ZipFile(zip_path, "r") as zip_ref:
+            for file in zip_ref.namelist():
+                if file.endswith(".fna"):
+                    extracted_path = zip_ref.extract(file, path=output_dir)
+                    fna_file = output_dir / f"{taxon_id}.fna"
+                    Path(extracted_path).rename(
+                        fna_file
+                    )  # consistent file name (tax_id)
+                    logger.info(f"Extracted reference genome to {fna_file}")
+                    break
+
+        # clean up
+        zip_path.unlink()
+        shutil.rmtree(output_dir / "ncbi_dataset")
+
+        return fna_file
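
A hypothetical call of the new helper; taxon ID 470 (Acinetobacter baumannii) and the output directory are illustrative, and the no-argument NCBIHandler() constructor matches its use in detecting_misclassification above:

```python
from pathlib import Path
from xspect.ncbi import NCBIHandler

out_dir = Path("refs") / "470"
out_dir.mkdir(parents=True, exist_ok=True)  # download_assemblies writes into it

reference = NCBIHandler().download_reference_genome(470, out_dir)
if reference:
    print(f"Reference genome extracted to {reference}")
else:
    print("No RefSeq reference accession found for this taxon")
```
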
xspect/train.py CHANGED
@@ -186,6 +186,11 @@ def train_from_ncbi(
     author: str | None = None,
     author_email: str | None = None,
     ncbi_api_key: str | None = None,
+    min_n50: int = 10000,
+    exclude_atypical: bool = True,
+    allow_inconclusive: bool = False,
+    allow_candidatus: bool = False,
+    allow_sp: bool = False,
 ):
     """
     Train a model using NCBI assembly data for a given genus.
@@ -200,6 +205,11 @@ def train_from_ncbi(
         author (str, optional): Author of the model. Defaults to None.
         author_email (str, optional): Author's email. Defaults to None.
         ncbi_api_key (str, optional): NCBI API key for accessing NCBI resources. Defaults to None.
+        min_n50 (int, optional): Minimum N50 value for assemblies. Defaults to 10000.
+        exclude_atypical (bool, optional): Exclude atypical assemblies. Defaults to True.
+        allow_inconclusive (bool, optional): Allow use of accessions with inconclusive taxonomy check status. Defaults to False.
+        allow_candidatus (bool, optional): Allow use of Candidatus species for training. Defaults to False.
+        allow_sp (bool, optional): Allow use of species with "sp." in their names. Defaults to False.
 
     Raises:
         TypeError: If `genus` is not a string.
@@ -221,8 +231,8 @@ def train_from_ncbi(
     filtered_species_ids = [
         tax_id
         for tax_id in species_ids
-        if "candidatus" not in species_names[tax_id].lower()
-        and " sp." not in species_names[tax_id].lower()
+        if (allow_candidatus or "candidatus" not in species_names[tax_id].lower())
+        and (allow_sp or " sp." not in species_names[tax_id].lower())
     ]
     filtered_species_names = {
         str(tax_id): species_names[tax_id] for tax_id in filtered_species_ids
@@ -231,7 +241,12 @@ def train_from_ncbi(
     accessions = {}
     for tax_id in filtered_species_ids:
         taxon_accessions = ncbi_handler.get_highest_quality_accessions(
-            tax_id, AssemblySource.REFSEQ, 8
+            tax_id,
+            AssemblySource.REFSEQ,
+            8,
+            min_n50,
+            exclude_atypical,
+            allow_inconclusive,
         )
         if not taxon_accessions:
             logger.warning(f"No assemblies found for tax_id {tax_id}. Skipping.")
@@ -241,7 +256,9 @@ def train_from_ncbi(
 
     if not accessions:
         raise ValueError(
-            "No species with accessions found. Please check the genus name."
+            "No species with accessions found. "
+            "Please check if the genus name is correct or if there are any data quality issues "
+            "(e.g. inconclusive taxonomy check status, atypical assemblies, low N50 values)."
         )
 
     with TemporaryDirectory() as tmp_dir:
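
Finally, a hypothetical train_from_ncbi call exercising the new quality knobs; the genus-first positional signature is inferred from the docstring, and every keyword below restates its default from this diff except the stricter min_n50:

```python
from xspect.train import train_from_ncbi

train_from_ncbi(
    "Acinetobacter",
    min_n50=50_000,  # demand more contiguous assemblies than the 10000 default
    exclude_atypical=True,  # default: drop assemblies NCBI flags as atypical
    allow_inconclusive=False,  # default: keep only taxonomy_check_status == "OK"
    allow_candidatus=False,  # default: drop "Candidatus ..." species
    allow_sp=False,  # default: drop "... sp." placeholder species
)
```
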