PyPI - XspecT - Versions diffs - 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

XspecT 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of XspecT might be problematic. Click here for more details.

Files changed (57) hide show

{XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
XspecT-0.2.0.dist-info/RECORD +30 -0
{XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
xspect/definitions.py +42 -0
xspect/download_filters.py +11 -26
xspect/fastapi.py +101 -0
xspect/file_io.py +34 -103
xspect/main.py +70 -66
xspect/model_management.py +88 -0
xspect/models/__init__.py +0 -0
xspect/models/probabilistic_filter_model.py +277 -0
xspect/models/probabilistic_filter_svm_model.py +169 -0
xspect/models/probabilistic_single_filter_model.py +109 -0
xspect/models/result.py +148 -0
xspect/pipeline.py +201 -0
xspect/run.py +38 -0
xspect/train.py +304 -0
xspect/train_filter/create_svm.py +6 -183
xspect/train_filter/extract_and_concatenate.py +117 -121
xspect/train_filter/html_scrap.py +16 -28
xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
XspecT-0.1.2.dist-info/RECORD +0 -48
xspect/BF_v2.py +0 -648
xspect/Bootstrap.py +0 -29
xspect/Classifier.py +0 -142
xspect/OXA_Table.py +0 -53
xspect/WebApp.py +0 -737
xspect/XspecT_mini.py +0 -1377
xspect/XspecT_trainer.py +0 -611
xspect/map_kmers.py +0 -155
xspect/search_filter.py +0 -504
xspect/static/How-To.png +0 -0
xspect/static/Logo.png +0 -0
xspect/static/Logo2.png +0 -0
xspect/static/Workflow_AspecT.png +0 -0
xspect/static/Workflow_ClAssT.png +0 -0
xspect/static/js.js +0 -615
xspect/static/main.css +0 -280
xspect/templates/400.html +0 -64
xspect/templates/401.html +0 -62
xspect/templates/404.html +0 -62
xspect/templates/500.html +0 -62
xspect/templates/about.html +0 -544
xspect/templates/home.html +0 -51
xspect/templates/layoutabout.html +0 -87
xspect/templates/layouthome.html +0 -63
xspect/templates/layoutspecies.html +0 -468
xspect/templates/species.html +0 -33
xspect/train_filter/get_paths.py +0 -35
xspect/train_filter/interface_XspecT.py +0 -204
xspect/train_filter/k_mer_count.py +0 -162
{XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
{XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
{XspecT-0.1.2.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0

xspect/main.py CHANGED Viewed

@@ -1,11 +1,17 @@
 """Project CLI"""
-import webbrowser
+from pathlib import Path
+import datetime
 import click
-from xspect.XspecT_mini import xspecT_mini
+import uvicorn
+from xspect import fastapi
 from xspect.download_filters import download_test_filters
-from xspect.XspecT_trainer import train as x_train
-from xspect.WebApp import app
+from xspect.train import train_ncbi
+from xspect.models.result import (
+    StepType,
+)
+from xspect.definitions import get_xspect_runs_path, fasta_endings, fastq_endings
+from xspect.pipeline import ModelExecution, Pipeline, PipelineStep
 @click.group()
@@ -18,57 +24,60 @@ def cli():
 def download_filters():
     """Download filters."""
     click.echo("Downloading filters, this may take a while...")
-    download_test_filters(
-        "https://applbio.biologie.uni-frankfurt.de/download/xspect/filters.zip"
-    )
+    download_test_filters("https://xspect2.s3.eu-central-1.amazonaws.com/models.zip")
-# todo: add read amount option -> why 342480?
 @cli.command()
 @click.argument("genus")
-@click.argument("path", type=click.Path(exists=True, dir_okay=True, file_okay=False))
-@click.option(
-    "-s", "--species/--no-species", help="Species classification.", default=True
-)
-@click.option("-i", "--ic/--no-ic", help="IC strain typing.", default=False)
-@click.option("-o", "--oxa/--no-oxa", help="OXA gene family detection.", default=False)
+@click.argument("path", type=click.Path(exists=True, dir_okay=True, file_okay=True))
 @click.option(
     "-m",
-    "--metagenome/--no-metagenome",
+    "--meta/--no-meta",
     help="Metagenome classification.",
     default=False,
 )
 @click.option(
-    "-c",
-    "--complete",
-    help="Use every single k-mer as input for classification.",
-    is_flag=True,
-    default=False,
+    "-s",
+    "--step",
+    help="Sparse sampling step size (e. g. only every 500th kmer for step=500).",
+    default=1,
 )
-@click.option(
-    "-s", "--save", help="Save results to csv file.", is_flag=True, default=False
-)
-def classify(genus, path, species, ic, oxa, metagenome, complete, save):
-    """Classify sample(s) from directory PATH."""
-    click.echo("Classifying sample...")
-    mode = 500
-    if complete:
-        mode = 1
-    file_format = "fasta"
-    read_amount = 342480
-    xspecT_mini(
-        path,
-        species,
-        ic,
-        oxa,
-        file_format,
-        read_amount,
-        save,
-        metagenome,
-        genus,
-        mode,
-    )
+def classify(genus, path, meta, step):
+    """Classify sample(s) from file or directory PATH."""
+    click.echo("Classifying...")
+    click.echo(f"Step: {step}")
+    # Collect the inputs: for a directory, keep only regular files whose
+    # extension (without the leading dot) is listed in fasta_endings or
+    # fastq_endings; a single file path is taken as-is, without that check.
+    file_paths = []
+    if Path(path).is_dir():
+        file_paths = [
+            f
+            for f in Path(path).iterdir()
+            if f.is_file() and f.suffix[1:] in fasta_endings + fastq_endings
+        ]
+    else:
+        file_paths = [Path(path)]
+    # define pipeline
+    # NOTE(review): the pipeline author name and e-mail are hard-coded
+    # placeholder values.
+    pipeline = Pipeline(genus + " classification", "Test Author", "test@example.com")
+    species_execution = ModelExecution(genus + "-species", sparse_sampling_step=step)
+    # With --meta, the species execution is wrapped in a FILTERING step that is
+    # attached to the "<genus>-genus" execution, and only that genus execution
+    # is added to the pipeline. Without --meta the species execution is added
+    # directly. The 0.7 is presumably the filtering threshold - TODO confirm
+    # against PipelineStep.
+    if meta:
+        species_filtering_step = PipelineStep(
+            StepType.FILTERING, genus, 0.7, species_execution
+        )
+        genus_execution = ModelExecution(genus + "-genus", sparse_sampling_step=step)
+        genus_execution.add_pipeline_step(species_filtering_step)
+        pipeline.add_pipeline_step(genus_execution)
+    else:
+        pipeline.add_pipeline_step(species_execution)
+    # One pipeline run and one JSON result file per input file.
+    for idx, file_path in enumerate(file_paths):
+        run = pipeline.run(file_path)
+        # NOTE(review): the timestamp has one-second resolution, so two runs
+        # that finish within the same second get the same save_path. Whether
+        # run.save() then overwrites the earlier file is not visible here.
+        time_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+        save_path = get_xspect_runs_path() / f"run_{time_str}.json"
+        run.save(save_path)
+        print(
+            f"[{idx+1}/{len(file_paths)}] Run finished. Results saved to '{save_path}'."
+        )
 @cli.command()
@@ -86,33 +95,28 @@ def classify(genus, path, species, ic, oxa, metagenome, complete, save):
     type=click.Path(exists=True, dir_okay=True, file_okay=False),
 )
 @click.option(
-    "-c",
-    "--complete",
-    help="Train filter on every single k-mer.",
-    is_flag=True,
-    default=False,
+    "-s",
+    "--svm-step",
+    help="SVM Sparse sampling step size (e. g. only every 500th kmer for step=500).",
+    default=1,
 )
-@click.option(
-    "--check",
-    help="Check if metagenome file was correctly created.",
-    is_flag=True,
-    default=False,
-)
-def train(genus, bf_assembly_path, svm_assembly_path, complete, check):
+def train(genus, bf_assembly_path, svm_assembly_path, svm_step):
     """Train model."""
-    mode = "1"
-    if bf_assembly_path and svm_assembly_path:
-        mode = "2"
-    if check:
-        mode = "3"
-    x_train(genus, mode, complete, bf_assembly_path, svm_assembly_path, "")
+    if bf_assembly_path or svm_assembly_path:
+        raise NotImplementedError(
+            "Training with specific assembly paths is not yet implemented."
+        )
+    try:
+        train_ncbi(genus, svm_step=svm_step)
+    except ValueError as e:
+        raise click.ClickException(str(e)) from e
 @cli.command()
-def web():
-    """Open the XspecT web app."""
-    webbrowser.open("http://localhost:8000")
-    app.run(host="0.0.0.0", port=8000, debug=True, threaded=True)
+def api():
+    """Open the XspecT FastAPI."""
+    # Serves fastapi.app with uvicorn on port 8000. Host "0.0.0.0" listens on
+    # all network interfaces, not only on localhost.
+    uvicorn.run(fastapi.app, host="0.0.0.0", port=8000)
 if __name__ == "__main__":

xspect/model_management.py ADDED Viewed

@@ -0,0 +1,88 @@
+"""This module contains functions to manage models."""
+from json import loads, dumps
+from pathlib import Path
+from xspect.models.probabilistic_filter_model import ProbabilisticFilterModel
+from xspect.models.probabilistic_single_filter_model import (
+    ProbabilisticSingleFilterModel,
+)
+from xspect.models.probabilistic_filter_svm_model import ProbabilisticFilterSVMModel
+from xspect.definitions import get_xspect_model_path
+def get_genus_model(genus):
+    """Get a metagenomic model for the specified genus."""
+    genus_model_path = get_xspect_model_path() / (genus.lower() + "-genus.json")
+    genus_filter_model = ProbabilisticSingleFilterModel.load(genus_model_path)
+    return genus_filter_model
+def get_species_model(genus):
+    """Get a species classification model for the specified genus."""
+    species_model_path = get_xspect_model_path() / (genus.lower() + "-species.json")
+    species_filter_model = ProbabilisticFilterSVMModel.load(species_model_path)
+    return species_filter_model
+def get_model_by_slug(model_slug: str):
+    """Get a model by its slug."""
+    model_path = get_xspect_model_path() / (model_slug + ".json")
+    model_metadata = get_model_metadata(model_path)
+    if model_metadata["model_class"] == "ProbabilisticSingleFilterModel":
+        return ProbabilisticSingleFilterModel.load(model_path)
+    elif model_metadata["model_class"] == "ProbabilisticFilterSVMModel":
+        return ProbabilisticFilterSVMModel.load(model_path)
+    elif model_metadata["model_class"] == "ProbabilisticFilterModel":
+        return ProbabilisticFilterModel.load(model_path)
+    else:
+        raise ValueError(f"Model class {model_metadata['model_class']} not recognized.")
+def get_model_metadata(model: str | Path):
+    """Get the metadata of a model."""
+    if isinstance(model, str):
+        model_path = get_xspect_model_path() / (model + ".json")
+    elif isinstance(model, Path):
+        model_path = model
+    else:
+        raise ValueError("Model must be a string (slug) or a Path object.")
+    if not model_path.exists() or not model_path.is_file():
+        raise ValueError(f"Model at {model_path} does not exist.")
+    with open(model_path, "r", encoding="utf-8") as file:
+        model_json = loads(file.read())
+        return model_json
+def update_model_metadata(model_slug: str, author: str, author_email: str):
+    """Update the metadata of a model."""
+    model_metadata = get_model_metadata(model_slug)
+    model_metadata["author"] = author
+    model_metadata["author_email"] = author_email
+    model_path = get_xspect_model_path() / (model_slug + ".json")
+    with open(model_path, "w", encoding="utf-8") as file:
+        file.write(dumps(model_metadata, indent=4))
+def update_model_display_name(model_slug: str, filter_id: str, display_name: str):
+    """Update the display name of a filter in a model."""
+    model_metadata = get_model_metadata(model_slug)
+    model_metadata["display_names"][filter_id] = display_name
+    model_path = get_xspect_model_path() / (model_slug + ".json")
+    with open(model_path, "w", encoding="utf-8") as file:
+        file.write(dumps(model_metadata, indent=4))
+def get_models():
+    """Get a list of all available models in a dictionary by type."""
+    model_dict = {}
+    for model_file in get_xspect_model_path().glob("*.json"):
+        model_metadata = get_model_metadata(model_file)
+        model_type = model_metadata["model_type"]
+        model_dict.setdefault(model_type, []).append(
+            model_metadata["model_display_name"]
+        )
+    return model_dict

xspect/models/__init__.py ADDED Viewed

File without changes

xspect/models/probabilistic_filter_model.py ADDED Viewed

@@ -0,0 +1,277 @@
+"""Probabilistic filter model for sequence data"""
+import json
+from math import ceil
+from pathlib import Path
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO
+from slugify import slugify
+import cobs_index as cobs
+from xspect.file_io import get_record_iterator
+from xspect.models.result import ModelResult
+class ProbabilisticFilterModel:
+    """Probabilistic filter model for sequence data"""
+    def __init__(
+        self,
+        k: int,
+        model_display_name: str,
+        author: str,
+        author_email: str,
+        model_type: str,
+        base_path: Path,
+        fpr: float = 0.01,
+        num_hashes: int = 7,
+    ) -> None:
+        if k < 1:
+            raise ValueError("Invalid k value, must be greater than 0")
+        if not model_display_name:
+            raise ValueError("Invalid filter display name, must be a non-empty string")
+        if not model_type:
+            raise ValueError("Invalid filter type, must be a non-empty string")
+        if not isinstance(base_path, Path):
+            raise ValueError("Invalid base path, must be a pathlib.Path object")
+        self.k = k
+        self.model_display_name = model_display_name
+        self.author = author
+        self.author_email = author_email
+        self.model_type = model_type
+        self.base_path = base_path
+        self.display_names = {}
+        self.fpr = fpr
+        self.num_hashes = num_hashes
+        self.index = None
+    def get_cobs_index_path(self) -> Path:
+        """Returns the path to the cobs index"""
+        return str(self.base_path / self.slug() / "index.cobs_classic")
+    def to_dict(self) -> dict:
+        """Returns a dictionary representation of the model"""
+        return {
+            "k": self.k,
+            "model_display_name": self.model_display_name,
+            "author": self.author,
+            "author_email": self.author_email,
+            "model_type": self.model_type,
+            "model_class": self.__class__.__name__,
+            "display_names": self.display_names,
+            "fpr": self.fpr,
+            "num_hashes": self.num_hashes,
+        }
+    def __dict__(self) -> dict:
+        """Returns a dictionary representation of the model"""
+        return self.to_dict()
+    def slug(self) -> str:
+        """Returns a slug representation of the model"""
+        return slugify(self.model_display_name + "-" + str(self.model_type))
+    def fit(self, dir_path: Path, display_names: dict = None) -> None:
+        """Adds filters to the model"""
+        if display_names is None:
+            display_names = {}
+        if not isinstance(dir_path, Path):
+            raise ValueError("Invalid directory path, must be a pathlib.Path object")
+        if not dir_path.exists():
+            raise ValueError("Directory path does not exist")
+        if not dir_path.is_dir():
+            raise ValueError("Directory path must be a directory")
+        doclist = cobs.DocumentList()
+        for file in dir_path.iterdir():
+            if file.is_file() and file.suffix in [
+                ".fasta",
+                ".fna",
+                ".fa",
+                ".fastq",
+                ".fq",
+            ]:
+                # cobs only uses the file name to the first "." as the document name
+                if file.name in display_names:
+                    self.display_names[file.name.split(".")[0]] = display_names[
+                        file.name
+                    ]
+                else:
+                    self.display_names[file.name.split(".")[0]] = file.stem
+                doclist.add(str(file))
+        if len(doclist) == 0:
+            raise ValueError(
+                "No valid files found in directory. Must be fasta or fastq"
+            )
+        index_params = cobs.ClassicIndexParameters()
+        index_params.term_size = self.k
+        index_params.num_hashes = self.num_hashes
+        index_params.false_positive_rate = self.fpr
+        index_params.clobber = True
+        cobs.classic_construct_list(doclist, self.get_cobs_index_path(), index_params)
+        self.index = cobs.Search(self.get_cobs_index_path(), True)
+    def calculate_hits(
+        self, sequence: Seq, filter_ids: list[str] = None, step: int = 1
+    ) -> dict:
+        """Calculates the hits for a sequence"""
+        if not isinstance(sequence, (Seq)):
+            raise ValueError(
+                "Invalid sequence, must be a Bio.Seq or a Bio.SeqRecord object"
+            )
+        if not len(sequence) > self.k:
+            raise ValueError("Invalid sequence, must be longer than k")
+        r = self.index.search(str(sequence), step=step)
+        result_dict = self._convert_cobs_result_to_dict(r)
+        if filter_ids:
+            return {doc: result_dict[doc] for doc in filter_ids}
+        return result_dict
+    def predict(
+        self,
+        sequence_input: (
+            SeqRecord
+            | list[SeqRecord]
+            | SeqIO.FastaIO.FastaIterator
+            | SeqIO.QualityIO.FastqPhredIterator
+            | Path
+        ),
+        filter_ids: list[str] = None,
+        step: int = 1,
+    ) -> ModelResult:
+        """Returns scores for the sequence(s) based on the filters in the model"""
+        if isinstance(sequence_input, (SeqRecord)):
+            return ProbabilisticFilterModel.predict(
+                self, [sequence_input], filter_ids, step=step
+            )
+        if self._is_sequence_list(sequence_input) | self._is_sequence_iterator(
+            sequence_input
+        ):
+            hits = {}
+            num_kmers = {}
+            for individual_sequence in sequence_input:
+                individual_hits = self.calculate_hits(
+                    individual_sequence.seq, filter_ids, step=step
+                )
+                num_kmers[individual_sequence.id] = self._count_kmers(
+                    individual_sequence, step=step
+                )
+                hits[individual_sequence.id] = individual_hits
+            return ModelResult(self.slug(), hits, num_kmers, sparse_sampling_step=step)
+        if isinstance(sequence_input, Path):
+            return ProbabilisticFilterModel.predict(
+                self, get_record_iterator(sequence_input), step=step
+            )
+        raise ValueError(
+            "Invalid sequence input, must be a Seq object, a list of Seq objects, a"
+            " SeqIO FastaIterator, a SeqIO FastqPhredIterator, or a Path object to a"
+            " fasta/fastq file"
+        )
+    def save(self) -> None:
+        """Saves the model to disk"""
+        json_path = self.base_path / f"{self.slug()}.json"
+        filter_path = self.base_path / self.slug()
+        filter_path.mkdir(exist_ok=True, parents=True)
+        json_object = json.dumps(self.to_dict(), indent=4)
+        with open(json_path, "w", encoding="utf-8") as file:
+            file.write(json_object)
+    @staticmethod
+    def load(path: Path) -> "ProbabilisticFilterModel":
+        """Loads the model from a file"""
+        with open(path, "r", encoding="utf-8") as file:
+            json_object = file.read()
+            model_json = json.loads(json_object)
+            model = ProbabilisticFilterModel(
+                model_json["k"],
+                model_json["model_display_name"],
+                model_json["author"],
+                model_json["author_email"],
+                model_json["model_type"],
+                path.parent,
+                model_json["fpr"],
+                model_json["num_hashes"],
+            )
+            model.display_names = model_json["display_names"]
+            p = model.get_cobs_index_path()
+            if not Path(p).exists():
+                raise FileNotFoundError(f"Index file not found at {p}")
+            model.index = cobs.Search(p, True)
+            return model
+    def _convert_cobs_result_to_dict(self, cobs_result: cobs.SearchResult) -> dict:
+        return {
+            individual_result.doc_name: individual_result.score
+            for individual_result in cobs_result
+        }
+    def _count_kmers(
+        self,
+        sequence_input: (
+            Seq
+            | SeqRecord
+            | list[Seq]
+            | SeqIO.FastaIO.FastaIterator
+            | SeqIO.QualityIO.FastqPhredIterator
+        ),
+        step: int = 1,
+    ) -> int:
+        """Counts the number of kmers in the sequence(s)"""
+        if isinstance(sequence_input, Seq):
+            return self._count_kmers([sequence_input], step=step)
+        if isinstance(sequence_input, SeqRecord):
+            return self._count_kmers(sequence_input.seq, step=step)
+        is_sequence_list = isinstance(sequence_input, list) and all(
+            isinstance(seq, Seq) for seq in sequence_input
+        )
+        is_iterator = isinstance(
+            sequence_input,
+            (SeqIO.FastaIO.FastaIterator, SeqIO.QualityIO.FastqPhredIterator),
+        )
+        if is_sequence_list | is_iterator:
+            kmer_sum = 0
+            for individual_sequence in sequence_input:
+                # we need to look specifically at .seq for SeqIO iterators
+                seq = individual_sequence.seq if is_iterator else individual_sequence
+                num_kmers = ceil((len(seq) - self.k + 1) / step)
+                kmer_sum += num_kmers
+            return kmer_sum
+        raise ValueError(
+            "Invalid sequence input, must be a Seq object, a list of Seq objects, a"
+            " SeqIO FastaIterator, or a SeqIO FastqPhredIterator"
+        )
+    def _is_sequence_list(self, sequence_input):
+        return isinstance(sequence_input, list) and all(
+            isinstance(seq, (SeqRecord)) for seq in sequence_input
+        )
+    def _is_sequence_iterator(self, sequence_input):
+        return isinstance(
+            sequence_input,
+            (SeqIO.FastaIO.FastaIterator, SeqIO.QualityIO.FastqPhredIterator),
+        )

XspecT 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

Potentially problematic release.

XspecT 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl