PyPI - XspecT - Versions diffs - 0.2.7__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

XspecT 0.2.7__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of XspecT might be problematic. Click here for more details.

Files changed (30)

xspect/definitions.py +0 -7
xspect/download_models.py +25 -24
xspect/fastapi.py +23 -26
xspect/file_io.py +86 -2
xspect/main.py +333 -98
xspect/mlst_feature/mlst_helper.py +4 -6
xspect/model_management.py +6 -0
xspect/models/probabilistic_filter_model.py +16 -5
xspect/models/probabilistic_filter_svm_model.py +33 -18
xspect/models/probabilistic_single_filter_model.py +8 -1
xspect/models/result.py +14 -60
xspect/ncbi.py +265 -0
xspect/train.py +258 -242
{xspect-0.2.7.dist-info → xspect-0.4.0.dist-info}/METADATA +14 -21
xspect-0.4.0.dist-info/RECORD +24 -0
{xspect-0.2.7.dist-info → xspect-0.4.0.dist-info}/WHEEL +1 -1
xspect/pipeline.py +0 -201
xspect/run.py +0 -38
xspect/train_filter/__init__.py +0 -0
xspect/train_filter/create_svm.py +0 -45
xspect/train_filter/extract_and_concatenate.py +0 -124
xspect/train_filter/ncbi_api/__init__.py +0 -0
xspect/train_filter/ncbi_api/download_assemblies.py +0 -31
xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +0 -110
xspect/train_filter/ncbi_api/ncbi_children_tree.py +0 -53
xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +0 -55
xspect-0.2.7.dist-info/RECORD +0 -33
{xspect-0.2.7.dist-info → xspect-0.4.0.dist-info}/entry_points.txt +0 -0
{xspect-0.2.7.dist-info → xspect-0.4.0.dist-info/licenses}/LICENSE +0 -0
{xspect-0.2.7.dist-info → xspect-0.4.0.dist-info}/top_level.txt +0 -0

xspect/main.py CHANGED Viewed

@@ -1,28 +1,26 @@
 """Project CLI"""
 from pathlib import Path
-import datetime
-import uuid
+from uuid import uuid4
 import click
 import uvicorn
 from xspect import fastapi
 from xspect.download_models import download_test_models
-from xspect.train import train_ncbi
-from xspect.models.result import (
-    StepType,
-)
+from xspect.file_io import filter_sequences
+from xspect.train import train_from_directory, train_from_ncbi
 from xspect.definitions import (
-    get_xspect_runs_path,
-    fasta_endings,
-    fastq_endings,
     get_xspect_model_path,
 )
-from xspect.pipeline import ModelExecution, Pipeline, PipelineStep
 from xspect.mlst_feature.mlst_helper import pick_scheme, pick_scheme_from_models_dir
 from xspect.mlst_feature.pub_mlst_handler import PubMLSTHandler
 from xspect.models.probabilistic_filter_mlst_model import (
     ProbabilisticFilterMlstSchemeModel,
 )
+from xspect.model_management import (
+    get_genus_model,
+    get_models,
+    get_species_model,
+)
 @click.group()
@@ -32,103 +30,133 @@ def cli():
 @cli.command()
-def download_models():
+def web():
+    """Open the XspecT web application."""
+    uvicorn.run(fastapi.app, host="0.0.0.0", port=8000)
+# # # # # # # # # # # # # # #
+# Model management commands #
+# # # # # # # # # # # # # # #
+@cli.group()
+def models():
+    """Model management commands."""
+    pass
+@models.command(
+    help="Download models from the internet.",
+)
+def download():
     """Download models."""
     click.echo("Downloading models, this may take a while...")
-    download_test_models("https://xspect2.s3.eu-central-1.amazonaws.com/models.zip")
+    download_test_models("http://assets.adrianromberg.com/xspect-models.zip")
-@cli.command()
-@click.argument("genus")
-@click.argument("path", type=click.Path(exists=True, dir_okay=True, file_okay=True))
+@models.command(
+    name="list",
+    help="List all models in the model directory.",
+)
+def list_models():
+    """List models."""
+    available_models = get_models()
+    if not available_models:
+        click.echo("No models found.")
+        return
+    # todo: make this machine readable
+    click.echo("Models found:")
+    click.echo("--------------")
+    for model_type, names in available_models.items():
+        if not names:
+            continue
+        click.echo(f"  {model_type}:")
+        for name in names:
+            click.echo(f"    - {name}")
+@models.group()
+def train():
+    """Train models."""
+    pass
+@train.command(
+    name="ncbi",
+    help="Train a species and a genus model based on NCBI data.",
+)
+@click.option("-g", "--genus", "model_genus", prompt=True)
+@click.option("--svm_steps", type=int, default=1)
 @click.option(
-    "-m",
-    "--meta/--no-meta",
-    help="Metagenome classification.",
-    default=False,
+    "--author",
+    help="Author of the model.",
+    default=None,
 )
 @click.option(
-    "-s",
-    "--step",
-    help="Sparse sampling step size (e. g. only every 500th kmer for step=500).",
-    default=1,
+    "--author-email",
+    help="Email of the author.",
+    default=None,
 )
-def classify_species(genus, path, meta, step):
-    """Classify sample(s) from file or directory PATH."""
-    click.echo("Classifying...")
-    click.echo(f"Step: {step}")
-    file_paths = []
-    if Path(path).is_dir():
-        file_paths = [
-            f
-            for f in Path(path).iterdir()
-            if f.is_file() and f.suffix[1:] in fasta_endings + fastq_endings
-        ]
-    else:
-        file_paths = [Path(path)]
-    # define pipeline
-    pipeline = Pipeline(genus + " classification", "Test Author", "test@example.com")
-    species_execution = ModelExecution(
-        genus.lower() + "-species", sparse_sampling_step=step
-    )
-    if meta:
-        species_filtering_step = PipelineStep(
-            StepType.FILTERING, genus, 0.7, species_execution
-        )
-        genus_execution = ModelExecution(
-            genus.lower() + "-genus", sparse_sampling_step=step
-        )
-        genus_execution.add_pipeline_step(species_filtering_step)
-        pipeline.add_pipeline_step(genus_execution)
-    else:
-        pipeline.add_pipeline_step(species_execution)
-    for idx, file_path in enumerate(file_paths):
-        run = pipeline.run(file_path)
-        time_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
-        save_path = get_xspect_runs_path() / f"run_{time_str}_{uuid.uuid4()}.json"
-        run.save(save_path)
-        print(
-            f"[{idx+1}/{len(file_paths)}] Run finished. Results saved to '{save_path}'."
-        )
+def train_ncbi(model_genus, svm_steps, author, author_email):
+    """Train a species and a genus model based on NCBI data."""
+    click.echo(f"Training {model_genus} species and genus metagenome model.")
+    try:
+        train_from_ncbi(model_genus, svm_steps, author, author_email)
+    except ValueError as e:
+        click.echo(f"Error: {e}")
+        return
+    click.echo(f"Training of {model_genus} model finished.")
-@cli.command()
-@click.argument("genus")
+@train.command(
+    name="directory",
+    help="Train a species (and possibly a genus) model based on local data.",
+)
+@click.option("-g", "--genus", "model_genus", prompt=True)
 @click.option(
-    "-bf-path",
-    "--bf-assembly-path",
-    help="Path to assembly directory for Bloom filter training.",
-    type=click.Path(exists=True, dir_okay=True, file_okay=False),
+    "-i",
+    "--input-path",
+    type=click.Path(exists=True, dir_okay=True, file_okay=True),
+    prompt=True,
 )
 @click.option(
-    "-svm-path",
-    "--svm-assembly-path",
-    help="Path to assembly directory for SVM training.",
-    type=click.Path(exists=True, dir_okay=True, file_okay=False),
+    "--meta",
+    is_flag=True,
+    help="Train a metagenome model for the genus.",
+    default=True,
 )
 @click.option(
-    "-s",
-    "--svm-step",
+    "--svm-steps",
+    type=int,
     help="SVM Sparse sampling step size (e. g. only every 500th kmer for step=500).",
     default=1,
 )
-def train_species(genus, bf_assembly_path, svm_assembly_path, svm_step):
-    """Train model."""
-    if bf_assembly_path or svm_assembly_path:
-        raise NotImplementedError(
-            "Training with specific assembly paths is not yet implemented."
-        )
-    try:
-        train_ncbi(genus, svm_step=svm_step)
-    except ValueError as e:
-        raise click.ClickException(str(e)) from e
+@click.option(
+    "--author",
+    help="Author of the model.",
+    default=None,
+)
+@click.option(
+    "--author-email",
+    help="Email of the author.",
+    default=None,
+)
+def train_directory(model_genus, input_path, svm_steps, meta, author, author_email):
+    """Train a model based on data from a directory for a given genus."""
+    click.echo(f"Training {model_genus} model with {svm_steps} SVM steps.")
+    train_from_directory(
+        model_genus,
+        Path(input_path),
+        svm_step=svm_steps,
+        meta=meta,
+        author=author,
+        author_email=author_email,
+    )
-@cli.command()
+@train.command(
+    name="mlst",
+    help="Train a MLST model based on PubMLST data.",
+)
 @click.option(
     "-c",
     "--choose_schemes",
@@ -154,27 +182,234 @@ def train_mlst(choose_schemes):
     click.echo(f"Saved at {model.cobs_path}")
-@cli.command()
+# # # # # # # # # # # # # # #
+# Classification commands   #
+# # # # # # # # # # # # # # #
+@cli.group(
+    name="classify",
+    help="Classify sequences using XspecT models.",
+)
+def classify_seqs():
+    """Classification commands."""
+    pass
+@classify_seqs.command()
+@click.option(
+    "-g",
+    "--genus",
+    "model_genus",
+    help="Genus of the model to classify.",
+    type=click.Choice(get_models().get("Genus"), None),
+    prompt=True,
+)
+@click.option(
+    "-i",
+    "--input-path",
+    help="Path to FASTA or FASTQ file for classification.",
+    type=click.Path(exists=True, dir_okay=True, file_okay=True),
+    prompt=True,
+)
+@click.option(
+    "-o",
+    "--output-path",
+    help="Path to the output file.",
+    type=click.Path(dir_okay=True, file_okay=True),
+    default=Path(".") / f"result_{uuid4()}.json",
+)
+def genus(model_genus, input_path, output_path):
+    """Classify samples using a genus model."""
+    click.echo("Classifying...")
+    genus_model = get_genus_model(model_genus)
+    result = genus_model.predict(Path(input_path))
+    result.save(output_path)
+    click.echo(f"Result saved as {output_path}.")
+@classify_seqs.command()
 @click.option(
-    "-p",
-    "--path",
+    "-g",
+    "--genus",
+    "model_genus",
+    help="Genus of the model to classify.",
+    type=click.Choice(get_models().get("Species"), None),
+    prompt=True,
+)
+@click.option(
+    "-i",
+    "--input-path",
+    help="Path to FASTA or FASTQ file for classification.",
+    type=click.Path(exists=True, dir_okay=True, file_okay=True),
+    prompt=True,
+)
+@click.option(
+    "-o",
+    "--output-path",
+    help="Path to the output file.",
+    type=click.Path(dir_okay=True, file_okay=True),
+    default=Path(".") / f"result_{uuid4()}.json",
+)
+@click.option(
+    "--sparse-sampling-step",
+    type=int,
+    help="Sparse sampling step size (e. g. only every 500th kmer for '--sparse-sampling-step 500').",
+    default=1,
+)
+def species(model_genus, input_path, output_path, sparse_sampling_step):
+    """Classify samples using a species model."""
+    click.echo("Classifying...")
+    species_model = get_species_model(model_genus)
+    result = species_model.predict(Path(input_path), step=sparse_sampling_step)
+    result.save(output_path)
+    click.echo(f"Result saved as {output_path}.")
+@classify_seqs.command(
+    name="mlst",
+    help="Classify samples using a MLST model.",
+)
+@click.option(
+    "-i",
+    "--input-path",
     help="Path to FASTA-file for mlst identification.",
     type=click.Path(exists=True, dir_okay=True, file_okay=True),
+    prompt=True,
+)
+@click.option(
+    "-o",
+    "--output-path",
+    help="Path to the output file.",
+    type=click.Path(dir_okay=True, file_okay=True),
+    default=Path(".") / f"result_{uuid4()}.json",
 )
-def classify_mlst(path):
+def classify_mlst(input_path, output_path):
     """MLST classify a sample."""
     click.echo("Classifying...")
-    path = Path(path)
+    input_path = Path(input_path)
     scheme_path = pick_scheme_from_models_dir()
     model = ProbabilisticFilterMlstSchemeModel.load(scheme_path)
-    model.predict(scheme_path, path).save(model.model_display_name, path)
-    click.echo(f"Run saved at {get_xspect_runs_path()}.")
+    result = model.predict(scheme_path, input_path)
+    result.save(output_path)
+    click.echo(f"Result saved as {output_path}.")
-@cli.command()
-def api():
-    """Open the XspecT FastAPI."""
-    uvicorn.run(fastapi.app, host="0.0.0.0", port=8000)
+# # # # # # # # # # # # # # #
+# Filtering commands        #
+# # # # # # # # # # # # # # #
+@cli.group(
+    name="filter",
+    help="Filter sequences using XspecT models.",
+)
+def filter_seqs():
+    """Filter commands."""
+    pass
+@filter_seqs.command(
+    name="genus",
+    help="Filter sequences using a genus model.",
+)
+@click.option(
+    "-g",
+    "--genus",
+    "model_genus",
+    help="Genus of the model to use for filtering.",
+    type=click.Choice(get_models().get("Species"), None),
+    prompt=True,
+)
+@click.option(
+    "-i",
+    "--input-path",
+    help="Path to FASTA or FASTQ file for classification.",
+    type=click.Path(exists=True, dir_okay=True, file_okay=True),
+    prompt=True,
+)
+@click.option(
+    "-o",
+    "--output-path",
+    help="Path to the output file.",
+    type=click.Path(dir_okay=True, file_okay=True),
+    prompt=True,
+)
+@click.option(
+    "--threshold",
+    type=float,
+    help="Threshold for filtering (default: 0.7).",
+    default=0.7,
+)
+def filter_genus(model_genus, input_path, output_path, threshold):
+    """Filter samples using a genus model."""
+    click.echo("Filtering...")
+    genus_model = get_genus_model(model_genus)
+    result = genus_model.predict(Path(input_path))
+    included_ids = result.get_filtered_subsequence_labels(model_genus, threshold)
+    if not included_ids:
+        click.echo("No sequences found for the given genus.")
+        return
+    filter_sequences(
+        Path(input_path),
+        Path(output_path),
+        included_ids=included_ids,
+    )
+    click.echo(f"Filtered sequences saved at {output_path}.")
+@filter_seqs.command(
+    name="species",
+    help="Filter sequences using a species model.",
+)
+@click.option(
+    "-g",
+    "--genus",
+    "model_genus",
+    help="Genus of the model to use for filtering.",
+    type=click.Choice(get_models().get("Species"), None),
+    prompt=True,
+)
+@click.option(
+    # todo: this should be a choice of the species in the model w/ display names
+    "-s",
+    "--species",
+    "model_species",
+    help="Species of the model to filter for.",
+    prompt=True,
+)
+@click.option(
+    "-i",
+    "--input-path",
+    help="Path to FASTA or FASTQ file for classification.",
+    type=click.Path(exists=True, dir_okay=True, file_okay=True),
+    prompt=True,
+)
+@click.option(
+    "-o",
+    "--output-path",
+    help="Path to the output file.",
+    type=click.Path(dir_okay=True, file_okay=True),
+    prompt=True,
+)
+@click.option(
+    "--threshold",
+    type=float,
+    help="Threshold for filtering (default: 0.7).",
+    default=0.7,
+)
+def filter_species(model_genus, model_species, input_path, output_path, threshold):
+    """Filter a sample using the species model."""
+    click.echo("Filtering...")
+    species_model = get_species_model(model_genus)
+    result = species_model.predict(Path(input_path))
+    included_ids = result.get_filtered_subsequence_labels(model_species, threshold)
+    if not included_ids:
+        click.echo("No sequences found for the given species.")
+        return
+    filter_sequences(
+        Path(input_path),
+        Path(output_path),
+        included_ids=included_ids,
+    )
+    click.echo(f"Filtered sequences saved at {output_path}.")
 if __name__ == "__main__":

xspect/mlst_feature/mlst_helper.py CHANGED Viewed

@@ -144,12 +144,10 @@ class MlstResult:
         }
         return result
-    def save(self, display: str, file_path: Path) -> None:
-        """Saves the result inside the "runs" directory"""
-        file_name = str(file_path).split("/")[-1]
-        json_path = get_xspect_runs_path() / "MLST" / f"{file_name}-{display}.json"
-        json_path.parent.mkdir(exist_ok=True, parents=True)
+    def save(self, output_path: Path) -> None:
+        """Saves the result as a JSON file."""
+        output_path.parent.mkdir(exist_ok=True, parents=True)
         json_object = json.dumps(self.to_dict(), indent=4)
-        with open(json_path, "w", encoding="utf-8") as file:
+        with open(output_path, "w", encoding="utf-8") as file:
             file.write(json_object)

xspect/model_management.py CHANGED Viewed

@@ -85,3 +85,9 @@ def get_models():
             model_metadata["model_display_name"]
         )
     return model_dict
+def get_model_display_names(model_slug: str):
+    """Get the display names included in a model."""
+    model_metadata = get_model_metadata(model_slug)
+    return list(model_metadata["display_names"].values())

xspect/models/probabilistic_filter_model.py CHANGED Viewed

@@ -26,6 +26,7 @@ class ProbabilisticFilterModel:
         base_path: Path,
         fpr: float = 0.01,
         num_hashes: int = 7,
+        training_accessions: dict[str, list[str]] = None,
     ) -> None:
         if k < 1:
             raise ValueError("Invalid k value, must be greater than 0")
@@ -46,6 +47,7 @@ class ProbabilisticFilterModel:
         self.fpr = fpr
         self.num_hashes = num_hashes
         self.index = None
+        self.training_accessions = training_accessions
     def get_cobs_index_path(self) -> Path:
         """Returns the path to the cobs index"""
@@ -63,13 +65,19 @@ class ProbabilisticFilterModel:
             "display_names": self.display_names,
             "fpr": self.fpr,
             "num_hashes": self.num_hashes,
+            "training_accessions": self.training_accessions,
         }
     def slug(self) -> str:
         """Returns a slug representation of the model"""
         return slugify(self.model_display_name + "-" + str(self.model_type))
-    def fit(self, dir_path: Path, display_names: dict = None) -> None:
+    def fit(
+        self,
+        dir_path: Path,
+        display_names: dict = None,
+        training_accessions: dict[str, list[str]] = None,
+    ) -> None:
         """Adds filters to the model"""
         if display_names is None:
@@ -84,16 +92,18 @@ class ProbabilisticFilterModel:
         if not dir_path.is_dir():
             raise ValueError("Directory path must be a directory")
+        self.training_accessions = training_accessions
         doclist = cobs.DocumentList()
         for file in dir_path.iterdir():
             if file.is_file() and file.suffix[1:] in fasta_endings + fastq_endings:
                 # cobs only uses the file name to the first "." as the document name
-                if file.name in display_names:
-                    self.display_names[file.name.split(".")[0]] = display_names[
-                        file.name
+                if file.stem in display_names:
+                    self.display_names[file.stem.split(".")[0]] = display_names[
+                        file.stem
                     ]
                 else:
-                    self.display_names[file.name.split(".")[0]] = file.stem
+                    self.display_names[file.stem.split(".")[0]] = file.stem
                 doclist.add(str(file))
         if len(doclist) == 0:
@@ -200,6 +210,7 @@ class ProbabilisticFilterModel:
                 path.parent,
                 model_json["fpr"],
                 model_json["num_hashes"],
+                model_json["training_accessions"],
             )
             model.display_names = model_json["display_names"]

xspect/models/probabilistic_filter_svm_model.py CHANGED Viewed

@@ -4,7 +4,6 @@
 import csv
 import json
-from linecache import getline
 from pathlib import Path
 from sklearn.svm import SVC
 from Bio.SeqRecord import SeqRecord
@@ -30,6 +29,8 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         c: float,
         fpr: float = 0.01,
         num_hashes: int = 7,
+        training_accessions: dict[str, list[str]] = None,
+        svm_accessions: dict[str, list[str]] = None,
     ) -> None:
         super().__init__(
             k=k,
@@ -40,14 +41,17 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
             base_path=base_path,
             fpr=fpr,
             num_hashes=num_hashes,
+            training_accessions=training_accessions,
         )
         self.kernel = kernel
         self.c = c
+        self.svm_accessions = svm_accessions
     def to_dict(self) -> dict:
         return super().to_dict() | {
             "kernel": self.kernel,
             "C": self.c,
+            "svm_accessions": self.svm_accessions,
         }
     def set_svm_params(self, kernel: str, c: float) -> None:
@@ -62,32 +66,41 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
         svm_path: Path,
         display_names: dict = None,
         svm_step: int = 1,
+        training_accessions: list[str] = None,
+        svm_accessions: list[str] = None,
     ) -> None:
         """Fit the SVM to the sequences and labels"""
         # Since the SVM works with score data, we need to train
         # the underlying data structure for score generation first
-        super().fit(dir_path, display_names=display_names)
+        super().fit(
+            dir_path,
+            display_names=display_names,
+            training_accessions=training_accessions,
+        )
+        self.svm_accessions = svm_accessions
         # calculate scores for SVM training
         score_list = []
-        for file in svm_path.iterdir():
-            if not file.is_file():
-                continue
-            if file.suffix[1:] not in fasta_endings + fastq_endings:
+        for species_folder in svm_path.iterdir():
+            if not species_folder.is_dir():
                 continue
-            print(f"Calculating {file.name} scores for SVM training...")
-            res = super().predict(file, step=svm_step)
-            scores = res.get_scores()["total"]
-            accession = "".join(file.name.split("_")[:2])
-            file_header = getline(str(file), 1)
-            label_id = file_header.replace("\n", "").replace(">", "")
-            # format scores for csv
-            scores = dict(sorted(scores.items()))
-            scores = ",".join([str(score) for score in scores.values()])
-            scores = f"{accession},{scores},{label_id}"
-            score_list.append(scores)
+            for file in species_folder.iterdir():
+                if file.suffix[1:] not in fasta_endings + fastq_endings:
+                    continue
+                print(f"Calculating {file.name} scores for SVM training...")
+                res = super().predict(file, step=svm_step)
+                scores = res.get_scores()["total"]
+                accession = file.stem
+                label_id = species_folder.name
+                # format scores for csv
+                scores = dict(sorted(scores.items()))
+                scores = ",".join([str(score) for score in scores.values()])
+                scores = f"{accession},{scores},{label_id}"
+                score_list.append(scores)
         # csv header
         keys = list(self.display_names.keys())
@@ -162,6 +175,8 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
                 model_json["C"],
                 fpr=model_json["fpr"],
                 num_hashes=model_json["num_hashes"],
+                training_accessions=model_json["training_accessions"],
+                svm_accessions=model_json["svm_accessions"],
             )
             model.display_names = model_json["display_names"]

XspecT 0.2.7__py3-none-any.whl → 0.4.0__py3-none-any.whl

Potentially problematic release.

XspecT 0.2.7__py3-none-any.whl → 0.4.0__py3-none-any.whl