PyPI - XspecT - Versions diffs - 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

XspecT 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of XspecT might be problematic. Click here for more details.

Files changed (33)

xspect/classify.py +61 -13
xspect/definitions.py +61 -13
xspect/download_models.py +10 -2
xspect/file_io.py +115 -48
xspect/filter_sequences.py +81 -29
xspect/main.py +90 -39
xspect/mlst_feature/mlst_helper.py +3 -0
xspect/mlst_feature/pub_mlst_handler.py +43 -1
xspect/model_management.py +84 -14
xspect/models/probabilistic_filter_mlst_model.py +75 -37
xspect/models/probabilistic_filter_model.py +201 -19
xspect/models/probabilistic_filter_svm_model.py +106 -13
xspect/models/probabilistic_single_filter_model.py +73 -9
xspect/models/result.py +77 -10
xspect/ncbi.py +48 -12
xspect/train.py +19 -11
xspect/web.py +68 -12
xspect/xspect-web/dist/assets/index-Ceo58xui.css +1 -0
xspect/xspect-web/dist/assets/{index-CMG4V7fZ.js → index-Dt_UlbgE.js} +82 -77
xspect/xspect-web/dist/index.html +2 -2
xspect/xspect-web/src/App.tsx +4 -2
xspect/xspect-web/src/api.tsx +23 -1
xspect/xspect-web/src/components/filter-form.tsx +16 -3
xspect/xspect-web/src/components/filtering-result.tsx +65 -0
xspect/xspect-web/src/components/result.tsx +2 -2
xspect/xspect-web/src/types.tsx +5 -0
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/METADATA +11 -5
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/RECORD +32 -31
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/WHEEL +1 -1
xspect/xspect-web/dist/assets/index-jIKg1HIy.css +0 -1
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/entry_points.txt +0 -0
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/licenses/LICENSE +0 -0
{xspect-0.5.0.dist-info → xspect-0.5.2.dist-info}/top_level.txt +0 -0

xspect/classify.py CHANGED Viewed

@@ -4,29 +4,77 @@ import xspect.model_management as mm
 from xspect.models.probabilistic_filter_mlst_model import (
     ProbabilisticFilterMlstSchemeModel,
 )
+from xspect.file_io import prepare_input_output_paths
 def classify_genus(
     model_genus: str, input_path: Path, output_path: Path, step: int = 1
 ):
-    """Classify the input file using the genus model."""
+    """
+    Classify the genus of sequences.
+    This function classifies input files using the genus model.
+    The input path can be a file or directory
+    Args:
+        model_genus (str): The genus model slug.
+        input_path (Path): The path to the input file/directory containing sequences.
+        output_path (Path): The path to the output file where results will be saved.
+        step (int): The amount of kmers to be skipped.
+    """
     model = mm.get_genus_model(model_genus)
-    result = model.predict(input_path, step=step)
-    result.input_source = input_path.name
-    result.save(output_path)
+    input_paths, get_output_path = prepare_input_output_paths(input_path)
+    for idx, current_path in enumerate(input_paths):
+        result = model.predict(current_path, step=step)
+        result.input_source = current_path.name
+        cls_path = get_output_path(idx, output_path)
+        result.save(cls_path)
+        print(f"Saved result as {cls_path.name}")
+def classify_species(
+    model_genus: str, input_path: Path, output_path: Path, step: int = 1
+):
+    """
+    Classify the species of sequences.
+    This function classifies input files using the species model.
+    The input path can be a file or directory
-def classify_species(model_genus, input_path, output_path, step=1):
-    """Classify the input file using the species model."""
+    Args:
+        model_genus (str): The genus model slug.
+        input_path (Path): The path to the input file/directory containing sequences.
+        output_path (Path): The path to the output file where results will be saved.
+        step (int): The amount of kmers to be skipped.
+    """
     model = mm.get_species_model(model_genus)
-    result = model.predict(input_path, step=step)
-    result.input_source = input_path.name
-    result.save(output_path)
+    input_paths, get_output_path = prepare_input_output_paths(input_path)
+    for idx, current_path in enumerate(input_paths):
+        result = model.predict(current_path, step=step)
+        result.input_source = current_path.name
+        cls_path = get_output_path(idx, output_path)
+        result.save(cls_path)
+        print(f"Saved result as {cls_path.name}")
+def classify_mlst(input_path: Path, output_path: Path, limit: bool):
+    """
+    Classify the strain type using the specific MLST model.
+    Args:
+        input_path (Path): The path to the input file/directory containing sequences.
+        output_path (Path): The path to the output file where results will be saved.
+        limit (bool): A limit for the highest allele_id results that are shown.
+    """
-def classify_mlst(input_path, output_path):
-    """Classify the input file using the MLST model."""
     scheme_path = pick_scheme_from_models_dir()
     model = ProbabilisticFilterMlstSchemeModel.load(scheme_path)
-    result = model.predict(scheme_path, input_path)
-    result.save(output_path)
+    input_paths, get_output_path = prepare_input_output_paths(input_path)
+    for idx, current_path in enumerate(input_paths):
+        result = model.predict(scheme_path, current_path, step=1, limit=limit)
+        result.input_source = current_path.name
+        cls_path = get_output_path(idx, output_path)
+        result.save(cls_path)
+        print(f"Saved result as {cls_path.name}")

xspect/definitions.py CHANGED Viewed

@@ -7,36 +7,84 @@ fasta_endings = ["fasta", "fna", "fa", "ffn", "frn"]
 fastq_endings = ["fastq", "fq"]
-def get_xspect_root_path():
-    """Return the root path for XspecT data."""
-    root_path = Path(getcwd()) / "xspect-data"
-    root_path.mkdir(exist_ok=True, parents=True)
-    return root_path
+def get_xspect_root_path() -> Path:
+    """
+    Return the root path for XspecT data.
+    Returns the path to the XspecT data directory, which can be located either in the user's home directory or in the current working directory.
+    If neither exists, it creates the directory in the user's home directory.
-def get_xspect_model_path():
-    """Return the path to the XspecT models."""
+    Returns:
+        Path: The path to the XspecT data directory.
+    """
+    home_based_dir = Path.home() / "xspect-data"
+    if home_based_dir.exists():
+        return home_based_dir
+    cwd_based_dir = Path(getcwd()) / "xspect-data"
+    if cwd_based_dir.exists():
+        return cwd_based_dir
+    home_based_dir.mkdir(exist_ok=True, parents=True)
+    return home_based_dir
+def get_xspect_model_path() -> Path:
+    """
+    Return the path to the XspecT models.
+    Returns the path to the XspecT models directory, which is located within the XspecT data directory.
+    If the directory does not exist, it creates the directory.
+    Returns:
+        Path: The path to the XspecT models directory.
+    """
     model_path = get_xspect_root_path() / "models"
     model_path.mkdir(exist_ok=True, parents=True)
     return model_path
-def get_xspect_upload_path():
-    """Return the path to the XspecT upload directory."""
+def get_xspect_upload_path() -> Path:
+    """
+    Return the path to the XspecT upload directory.
+    Returns the path to the XspecT uploads directory, which is located within the XspecT data directory.
+    If the directory does not exist, it creates the directory.
+    Returns:
+        Path: The path to the XspecT uploads directory.
+    """
     upload_path = get_xspect_root_path() / "uploads"
     upload_path.mkdir(exist_ok=True, parents=True)
     return upload_path
-def get_xspect_runs_path():
-    """Return the path to the XspecT runs directory."""
+def get_xspect_runs_path() -> Path:
+    """
+    Return the path to the XspecT runs directory.
+    Returns the path to the XspecT runs directory, which is located within the XspecT data directory.
+    If the directory does not exist, it creates the directory.
+    Returns:
+        Path: The path to the XspecT runs directory.
+    """
     runs_path = get_xspect_root_path() / "runs"
     runs_path.mkdir(exist_ok=True, parents=True)
     return runs_path
-def get_xspect_mlst_path():
-    """Return the path to the XspecT runs directory."""
+def get_xspect_mlst_path() -> Path:
+    """
+    Return the path to the XspecT MLST directory.
+    Returns the path to the XspecT MLST directory, which is located within the XspecT data directory.
+    If the directory does not exist, it creates the directory.
+    Returns:
+        Path: The path to the XspecT MLST directory.
+    """
     mlst_path = get_xspect_root_path() / "mlst"
     mlst_path.mkdir(exist_ok=True, parents=True)
     return mlst_path

xspect/download_models.py CHANGED Viewed

@@ -8,8 +8,16 @@ import requests
 from xspect.definitions import get_xspect_model_path
-def download_test_models(url):
-    """Download models."""
+def download_test_models(url: str) -> None:
+    """
+    Download models from the specified URL.
+    This function downloads a zip file from the given URL, extracts its contents,
+    and copies the extracted files to the XspecT model directory.
+    Args:
+        url (str): The URL from which to download the models.
+    """
     with TemporaryDirectory() as tmp_dir:
         tmp_dir = Path(tmp_dir)
         download_path = tmp_dir / "models.zip"

xspect/file_io.py CHANGED Viewed

@@ -6,12 +6,20 @@ from json import loads
 import os
 from pathlib import Path
 import zipfile
+from typing import Callable, Iterator
 from Bio import SeqIO
 from xspect.definitions import fasta_endings, fastq_endings
-def delete_zip_files(dir_path):
-    """Delete all zip files in the given directory."""
+def delete_zip_files(dir_path) -> None:
+    """
+    Delete all zip files in the given directory.
+    This function checks each file in the specified directory and removes it if it is a zip file.
+    Args:
+        dir_path (Path): Path to the directory where zip files should be deleted.
+    """
     files = os.listdir(dir_path)
     for file in files:
         if zipfile.is_zipfile(file):
@@ -19,45 +27,39 @@ def delete_zip_files(dir_path):
             os.remove(file_path)
-def extract_zip(zip_path: Path, unzipped_path: Path):
-    """Extracts all files from a zip file."""
+def extract_zip(zip_path: Path, unzipped_path: Path) -> None:
+    """
+    Extracts all files from a zip file.
+    Extracts the contents of the specified zip file to the given directory.
+    Args:
+        zip_path (Path): Path to the zip file to be extracted.
+        unzipped_path (Path): Path to the directory where the contents will be extracted.
+    """
     unzipped_path.mkdir(parents=True, exist_ok=True)
     with zipfile.ZipFile(zip_path) as item:
         item.extractall(unzipped_path)
-def concatenate_meta(path: Path, genus: str):
-    """Concatenates all species files to one fasta file.
-    :param path: Path to the directory with the concatenated fasta files.
-    :type path: Path
-    :param genus: Genus name.
-    :type genus: str
+def get_record_iterator(file_path: Path) -> Iterator:
     """
-    files_path = path / "concatenate"
-    meta_path = path / (genus + ".fasta")
-    files = os.listdir(files_path)
+    Returns a record iterator for a fasta or fastq file.
-    with open(meta_path, "w", encoding="utf-8") as meta_file:
-        # Write the header.
-        meta_header = f">{genus} metagenome\n"
-        meta_file.write(meta_header)
-        # Open each concatenated species file and write the sequence in the meta file.
-        for file in files:
-            file_ending = str(file).rsplit(".", maxsplit=1)[-1]
-            if file_ending in fasta_endings:
-                with open(
-                    (files_path / str(file)), "r", encoding="utf-8"
-                ) as species_file:
-                    for line in species_file:
-                        if line[0] != ">":
-                            meta_file.write(line.replace("\n", ""))
-def get_record_iterator(file_path: Path):
-    """Returns a record iterator for a fasta or fastq file."""
+    This function checks the file extension to determine if the file is in fasta or fastq format
+    and returns an iterator over the records in the file using Biopython's SeqIO module.
+    Args:
+        file_path (Path): Path to the fasta or fastq file.
+    Returns:
+        Iterator: An iterator over the records in the file.
+    Raises:
+        ValueError: If the file path is not a Path object, does not exist, is not a file,
+                    or has an invalid file format.
+    """
     if not isinstance(file_path, Path):
         raise ValueError("Path must be a Path object")
@@ -76,17 +78,18 @@ def get_record_iterator(file_path: Path):
     raise ValueError("Invalid file format, must be a fasta or fastq file")
-def get_records_by_id(file: Path, ids: list[str]):
-    """Return records with the specified ids."""
-    records = get_record_iterator(file)
-    return [record for record in records if record.id in ids]
+def concatenate_species_fasta_files(
+    input_folders: list[Path], output_directory: Path
+) -> None:
+    """
+    Concatenate fasta files from different species into one file per species.
-def concatenate_species_fasta_files(input_folders: list[Path], output_directory: Path):
-    """Concatenate fasta files from different species into one file per species.
+    This function iterates through each species folder within the given input folder,
+    collects all fasta files, and concatenates their contents into a single fasta file
+    named after the species.
     Args:
-        input_species_folders (list[Path]): List of paths to species folders.
+        input_folders (list[Path]): List of paths to species folders.
         output_directory (Path): Path to the output directory.
     """
     for species_folder in input_folders:
@@ -105,15 +108,22 @@ def concatenate_species_fasta_files(input_folders: list[Path], output_directory:
                     f.write(f_in.read())
-def concatenate_metagenome(fasta_dir: Path, meta_path: Path):
-    """Concatenate all fasta files in a directory into one file.
+def concatenate_metagenome(fasta_dir: Path, meta_path: Path) -> None:
+    """
+    Concatenate all fasta files in a directory into one file.
+    This function searches for all fasta files in the specified directory and writes their contents
+    into a single output file. The output file will contain the concatenated sequences from all fasta files.
     Args:
         fasta_dir (Path): Path to the directory with the fasta files.
         meta_path (Path): Path to the output file.
     """
+    fasta_files = [
+        file for ending in fasta_endings for file in fasta_dir.glob(f"*.{ending}")
+    ]
     with open(meta_path, "w", encoding="utf-8") as meta_file:
-        for fasta_file in fasta_dir.glob("*.fasta"):
+        for fasta_file in fasta_files:
             with open(fasta_file, "r", encoding="utf-8") as f_in:
                 meta_file.write(f_in.read())
@@ -121,13 +131,21 @@ def concatenate_metagenome(fasta_dir: Path, meta_path: Path):
 def get_ncbi_dataset_accession_paths(
     ncbi_dataset_path: Path,
 ) -> dict[str, Path]:
-    """Get the paths of the NCBI dataset accessions.
+    """
+    Get the paths of the NCBI dataset accessions.
+    This function reads the dataset catalog from the NCBI dataset directory and returns a dictionary
+    mapping each accession to its corresponding file path. The first item in the dataset catalog is
+    assumed to be a data report, and is skipped.
     Args:
         ncbi_dataset_path (Path): Path to the NCBI dataset directory.
     Returns:
         dict[str, Path]: Dictionary with the accession as key and the path as value.
+    Raises:
+        ValueError: If the dataset path does not exist or is invalid.
     """
     data_path = ncbi_dataset_path / "ncbi_dataset" / "data"
     if not data_path.exists():
@@ -147,13 +165,19 @@ def filter_sequences(
     input_file: Path,
     output_file: Path,
     included_ids: list[str],
-):
-    """Filter sequences by IDs from an input file and save them to an output file.
+) -> None:
+    """
+    Filter sequences by IDs from an input file and save them to an output file.
+    This function reads a fasta or fastq file, filters the sequences based on the provided IDs,
+    and writes the matching sequences to an output file. If no IDs are provided, no output file
+    is created.
     Args:
         input_file (Path): Path to the input file.
         output_file (Path): Path to the output file.
-        included_ids (list[str], optional): List of IDs to include. If None, no output file is created.
+        included_ids (list[str], optional): List of IDs to include. If None, no output file
+            is created.
     """
     if not included_ids:
         print("No IDs provided, no output file will be created.")
@@ -163,3 +187,46 @@ def filter_sequences(
         for record in get_record_iterator(input_file):
             if record.id in included_ids:
                 SeqIO.write(record, out_f, "fasta")
+def prepare_input_output_paths(
+    input_path: Path,
+) -> tuple[list[Path], Callable[[int, Path], Path]]:
+    """
+    Processes the input path into a list of input paths and a function generating output paths.
+    This function checks if the input path is a directory or a file. If it is a directory,
+    it collects all files with specified fasta and fastq endings. If it is a file, it uses that file
+    as the input path. It then returns a list of input file paths and a function that generates
+    output paths based on the index of the input file and a specified output path.
+    Args:
+        input_path (Path): Path to the directory or file.
+    Returns:
+        tuple[list[Path], Callable[[int, Path], Path]]: A tuple containing:
+            - A list of input file paths
+            - A function that takes an index and the output path,
+              and returns the processed output path.
+    Raises:
+        ValueError: If the input path is invalid.
+    """
+    input_is_dir = input_path.is_dir()
+    ending_wildcards = [f"*.{ending}" for ending in fasta_endings + fastq_endings]
+    if input_is_dir:
+        input_paths = [p for e in ending_wildcards for p in input_path.glob(e)]
+    elif input_path.is_file():
+        input_paths = [input_path]
+    else:
+        raise ValueError("Invalid input path")
+    def get_output_path(idx: int, output_path: Path) -> Path:
+        return (
+            output_path.parent / f"{output_path.stem}_{idx+1}{output_path.suffix}"
+            if input_is_dir
+            else output_path
+        )
+    return input_paths, get_output_path

xspect/filter_sequences.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from pathlib import Path
 from xspect.model_management import get_genus_model, get_species_model
-from xspect.file_io import filter_sequences
+from xspect.file_io import filter_sequences, prepare_input_output_paths
 def filter_species(
@@ -9,31 +9,52 @@ def filter_species(
     input_path: Path,
     output_path: Path,
     threshold: float,
+    classification_output_path: Path | None = None,
+    sparse_sampling_step: int = 1,
 ):
-    """Filter sequences by species.
+    """
+    Filter sequences by species.
     This function filters sequences from the input file based on the species model.
-    It uses the genus model to identify the genus of the sequences and then applies
-    the species model to filter the sequences.
+    It uses the species model to identify the species of individual sequences and then applies
+    a threshold filter the sequences.
     Args:
-        model_genus (str): The genus model slug.
-        model_species (str): The species model slug.
+        model_genus (str): The genus of the species model.
+        model_species (str): The species to filter by.
         input_path (Path): The path to the input file containing sequences.
         output_path (Path): The path to the output file where filtered sequences will be saved.
+        classification_output_path (Path): Optional path to save the classification results.
         threshold (float): The threshold for filtering sequences. Only sequences with a score
-            above this threshold will be included in the output file.
+            above this threshold will be included in the output file. A threshold of -1 will
+            include only sequences if the species score is the highest among the
+            available species scores.
+        sparse_sampling_step (int): The step size for sparse sampling. Defaults to 1.
     """
     species_model = get_species_model(model_genus)
-    result = species_model.predict(input_path)
-    included_ids = result.get_filtered_subsequence_labels(model_species, threshold)
-    if not included_ids:
-        print("No sequences found for the given species.")
-        return
-    filter_sequences(
-        input_path,
-        output_path,
-        included_ids,
-    )
+    input_paths, get_output_path = prepare_input_output_paths(input_path)
+    for idx, current_path in enumerate(input_paths):
+        result = species_model.predict(current_path, step=sparse_sampling_step)
+        result.input_source = current_path.name
+        if classification_output_path:
+            cls_out = get_output_path(idx, classification_output_path)
+            result.save(cls_out)
+            print(
+                f"Saved classification results from {current_path.name} as {cls_out.name}"
+            )
+        included_ids = result.get_filtered_subsequence_labels(model_species, threshold)
+        if not included_ids:
+            print(f"No sequences found for the given species in {current_path.name}.")
+            continue
+        filter_output_path = get_output_path(idx, output_path)
+        filter_sequences(current_path, filter_output_path, included_ids)
+        print(
+            f"Saved filtered sequences from {current_path.name} as {filter_output_path.name}"
+        )
 def filter_genus(
@@ -41,16 +62,47 @@ def filter_genus(
     input_path: Path,
     output_path: Path,
     threshold: float,
+    classification_output_path: Path | None = None,
+    sparse_sampling_step: int = 1,
 ):
-    genus_model = get_genus_model(model_genus)
-    result = genus_model.predict(Path(input_path))
-    included_ids = result.get_filtered_subsequence_labels(model_genus, threshold)
-    if not included_ids:
-        print("No sequences found for the given genus.")
-        return
-    filter_sequences(
-        input_path,
-        output_path,
-        included_ids,
-    )
+    """
+    Filter sequences by genus.
+    This function filters sequences from the input file based on the genus model.
+    It uses the genus model to identify the genus of the sequences and then applies
+    the filtering based on the provided threshold.
+    Args:
+        model_genus (str): The genus model slug.
+        input_path (Path): The path to the input file containing sequences.
+        output_path (Path): The path to the output file where filtered sequences will be saved.
+        threshold (float): The threshold for filtering sequences. Only sequences with a score
+            above this threshold will be included in the output file.
+        classification_output_path (Path): Optional path to save the classification results.
+        sparse_sampling_step (int): The step size for sparse sampling. Defaults to 1.
+    """
+    model = get_genus_model(model_genus)
+    input_paths, get_output_path = prepare_input_output_paths(input_path)
+    for idx, current_path in enumerate(input_paths):
+        result = model.predict(current_path, step=sparse_sampling_step)
+        result.input_source = current_path.name
+        if classification_output_path:
+            cls_out = get_output_path(idx, classification_output_path)
+            result.save(cls_out)
+            print(
+                f"Saved classification results from {current_path.name} as {cls_out.name}"
+            )
+        included_ids = result.get_filtered_subsequence_labels(model_genus, threshold)
+        if not included_ids:
+            print(f"No sequences found for the given genus in {current_path.name}.")
+            continue
+        filter_output_path = get_output_path(idx, output_path)
+        filter_sequences(current_path, filter_output_path, included_ids)
+        print(
+            f"Saved filtered sequences from {current_path.name} as {filter_output_path.name}"
+        )

XspecT 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

Potentially problematic release.

XspecT 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl