PyPI - XspecT - Versions diffs - 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

XspecT 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of XspecT might be problematic. Click here for more details.

Files changed (58) hide show

{XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
XspecT-0.2.0.dist-info/RECORD +30 -0
{XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
xspect/definitions.py +42 -0
xspect/download_filters.py +11 -26
xspect/fastapi.py +101 -0
xspect/file_io.py +34 -103
xspect/main.py +70 -66
xspect/model_management.py +88 -0
xspect/models/__init__.py +0 -0
xspect/models/probabilistic_filter_model.py +277 -0
xspect/models/probabilistic_filter_svm_model.py +169 -0
xspect/models/probabilistic_single_filter_model.py +109 -0
xspect/models/result.py +148 -0
xspect/pipeline.py +201 -0
xspect/run.py +38 -0
xspect/train.py +304 -0
xspect/train_filter/create_svm.py +6 -183
xspect/train_filter/extract_and_concatenate.py +117 -121
xspect/train_filter/html_scrap.py +16 -28
xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
XspecT-0.1.3.dist-info/RECORD +0 -49
xspect/BF_v2.py +0 -637
xspect/Bootstrap.py +0 -29
xspect/Classifier.py +0 -142
xspect/OXA_Table.py +0 -53
xspect/WebApp.py +0 -724
xspect/XspecT_mini.py +0 -1363
xspect/XspecT_trainer.py +0 -611
xspect/map_kmers.py +0 -155
xspect/search_filter.py +0 -504
xspect/static/How-To.png +0 -0
xspect/static/Logo.png +0 -0
xspect/static/Logo2.png +0 -0
xspect/static/Workflow_AspecT.png +0 -0
xspect/static/Workflow_ClAssT.png +0 -0
xspect/static/js.js +0 -615
xspect/static/main.css +0 -280
xspect/templates/400.html +0 -64
xspect/templates/401.html +0 -62
xspect/templates/404.html +0 -62
xspect/templates/500.html +0 -62
xspect/templates/about.html +0 -544
xspect/templates/home.html +0 -51
xspect/templates/layoutabout.html +0 -87
xspect/templates/layouthome.html +0 -63
xspect/templates/layoutspecies.html +0 -468
xspect/templates/species.html +0 -33
xspect/train_filter/README_XspecT_Erweiterung.md +0 -119
xspect/train_filter/get_paths.py +0 -35
xspect/train_filter/interface_XspecT.py +0 -204
xspect/train_filter/k_mer_count.py +0 -162
{XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
{XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
{XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0

xspect/models/probabilistic_filter_svm_model.py ADDED Viewed

@@ -0,0 +1,169 @@
+"""Probabilistic filter SVM model for sequence data"""
+# pylint: disable=no-name-in-module, too-many-instance-attributes, arguments-renamed
+import csv
+import json
+from linecache import getline
+from pathlib import Path
+from sklearn.svm import SVC
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO
+import cobs_index as cobs
+from xspect.models.probabilistic_filter_model import ProbabilisticFilterModel
+from xspect.definitions import fasta_endings, fastq_endings
+from xspect.models.result import ModelResult
+class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
+    """Probabilistic filter model with an SVM on top of the filter scores.
+
+    The total scores computed by the base class serve as feature vectors.
+    ``fit`` stores the training scores in ``scores.csv`` inside the model
+    directory; ``predict`` trains an SVM from that file on every call.
+    """
+    def __init__(
+        self,
+        k: int,
+        model_display_name: str,
+        author: str,
+        author_email: str,
+        model_type: str,
+        base_path: Path,
+        kernel: str,
+        c: float,
+        fpr: float = 0.01,
+        num_hashes: int = 7,
+    ) -> None:
+        """Create the model.
+
+        ``kernel`` and ``c`` are handed to ``sklearn.svm.SVC`` as ``kernel``
+        and ``C``; every other argument is forwarded to the base class.
+        """
+        super().__init__(
+            k=k,
+            model_display_name=model_display_name,
+            author=author,
+            author_email=author_email,
+            model_type=model_type,
+            base_path=base_path,
+            fpr=fpr,
+            num_hashes=num_hashes,
+        )
+        self.kernel = kernel
+        self.c = c
+    def to_dict(self) -> dict:
+        """Return the base class dictionary extended by the SVM parameters."""
+        return super().to_dict() | {
+            "kernel": self.kernel,
+            "C": self.c,
+        }
+    def set_svm_params(self, kernel: str, c: float) -> None:
+        """Set the parameters for the SVM and save the model."""
+        self.kernel = kernel
+        self.c = c
+        self.save()
+    def fit(
+        self,
+        dir_path: Path,
+        svm_path: Path,
+        display_names: dict | None = None,
+        svm_step: int = 1,
+    ) -> None:
+        """Fit the filters and write the SVM training scores to ``scores.csv``.
+
+        The base class is fitted on ``dir_path`` first. Afterwards every file
+        in ``svm_path`` with a FASTA/FASTQ ending is scored with the base
+        class ``predict`` (using ``svm_step``) and its total scores are
+        stored as one CSV row: ``file,<one score per label>,label_id``.
+        """
+        super().fit(dir_path, display_names=display_names)
+        sequence_endings = fasta_endings + fastq_endings
+        rows = []
+        for training_file in svm_path.iterdir():
+            if not training_file.is_file():
+                continue
+            if training_file.suffix[1:] not in sequence_endings:
+                continue
+            print(f"Calculating {training_file.name} scores for SVM training...")
+            res = super().predict(training_file, step=svm_step)
+            scores = res.get_scores()["total"]
+            # NOTE(review): the first two name parts are joined without a
+            # separator ("GCF_0001.1_x.fasta" -> "GCF0001.1") - confirm that
+            # this is intended.
+            accession = "".join(training_file.name.split("_")[:2])
+            # the label is the first line of the file without newline and ">"
+            # NOTE(review): FASTQ headers presumably start with "@", which is
+            # not stripped here - confirm.
+            file_header = getline(str(training_file), 1)
+            label_id = file_header.replace("\n", "").replace(">", "")
+            # order the scores by label so that they line up with the header
+            score_columns = ",".join(str(scores[label]) for label in sorted(scores))
+            rows.append(f"{accession},{score_columns},{label_id}")
+        header = f"file,{','.join(sorted(self.display_names))},label_id"
+        scores_path = self.base_path / self.slug() / "scores.csv"
+        with open(scores_path, "w", encoding="utf-8") as scores_file:
+            scores_file.write("\n".join([header, *rows]))
+    def predict(
+        self,
+        sequence_input: (
+            SeqRecord
+            | list[SeqRecord]
+            | SeqIO.FastaIO.FastaIterator
+            | SeqIO.QualityIO.FastqPhredIterator
+            | Path
+        ),
+        filter_ids: list[str] | None = None,
+        step: int = 1,
+    ) -> ModelResult:
+        """Predict the label of the sequences.
+
+        The base class computes the filter hits; the total scores, ordered by
+        label, are then classified by the SVM. The returned result carries
+        the hits and k-mer counts of the base prediction, the sampling
+        ``step`` that was used and the SVM prediction as a string.
+        """
+        res = super().predict(sequence_input, filter_ids, step=step)
+        # features must be in the same (sorted) label order as in scores.csv
+        total_scores = res.get_scores()["total"]
+        svm_scores = [[total_scores[label] for label in sorted(total_scores)]]
+        svm = self._get_svm(filter_ids)
+        return ModelResult(
+            self.slug(),
+            res.hits,
+            res.num_kmers,
+            # report the step actually used instead of the default of 1
+            sparse_sampling_step=step,
+            prediction=str(svm.predict(svm_scores)[0]),
+        )
+    def _get_svm(self, id_keys) -> SVC:
+        """Train and return an SVM from the stored training scores.
+
+        A row of ``scores.csv`` is used if ``id_keys`` is ``None`` or contains
+        the label in the row's last column.
+
+        Raises:
+            ValueError: If no training row is left.
+        """
+        x_train = []
+        y_train = []
+        scores_path = self.base_path / self.slug() / "scores.csv"
+        with open(scores_path, "r", encoding="utf-8") as scores_file:
+            reader = csv.reader(scores_file)
+            next(reader, None)  # skip the header row
+            for row in reader:
+                if not row:
+                    # blank lines would otherwise fail on row[-1]
+                    continue
+                if id_keys is None or row[-1] in id_keys:
+                    # NOTE(review): rows are filtered by label, the score
+                    # columns are not - presumably the scores passed to the
+                    # SVM in predict() must cover the same labels; confirm
+                    # for calls with filter_ids.
+                    x_train.append([float(score) for score in row[1:-1]])
+                    y_train.append(row[-1])
+        if not x_train:
+            raise ValueError("No SVM training data found for the given filter ids")
+        svm = SVC(kernel=self.kernel, C=self.c)
+        svm.fit(x_train, y_train)
+        return svm
+    @staticmethod
+    def load(path: Path) -> "ProbabilisticFilterSVMModel":
+        """Load the model from disk.
+
+        ``path`` points to the model JSON file; its parent directory becomes
+        the base path of the model.
+
+        Raises:
+            FileNotFoundError: If the COBS index of the model does not exist.
+        """
+        with open(path, "r", encoding="utf-8") as file:
+            model_json = json.load(file)
+        model = ProbabilisticFilterSVMModel(
+            model_json["k"],
+            model_json["model_display_name"],
+            model_json["author"],
+            model_json["author_email"],
+            model_json["model_type"],
+            path.parent,
+            model_json["kernel"],
+            model_json["C"],
+            fpr=model_json["fpr"],
+            num_hashes=model_json["num_hashes"],
+        )
+        model.display_names = model_json["display_names"]
+        p = model.get_cobs_index_path()
+        if not Path(p).exists():
+            raise FileNotFoundError(f"Index file not found at {p}")
+        model.index = cobs.Search(p, True)
+        return model

xspect/models/probabilistic_single_filter_model.py ADDED Viewed

@@ -0,0 +1,109 @@
+"""Probabilistic single filter model for sequence data"""
+# pylint: disable=no-name-in-module, too-many-instance-attributes
+import json
+from math import ceil
+from pathlib import Path
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from rbloom import Bloom
+from xxhash import xxh3_64_intdigest
+from xspect.models.probabilistic_filter_model import ProbabilisticFilterModel
+from xspect.file_io import get_record_iterator
+class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
+    """Probabilistic filter model backed by a single Bloom filter."""
+    def __init__(
+        self,
+        k: int,
+        model_display_name: str,
+        author: str,
+        author_email: str,
+        model_type: str,
+        base_path: Path,
+        fpr: float = 0.01,
+        num_hashes: int = 7,
+    ) -> None:
+        """Create the model; the Bloom filter is set by ``fit`` or ``load``."""
+        super().__init__(
+            k=k,
+            model_display_name=model_display_name,
+            author=author,
+            author_email=author_email,
+            model_type=model_type,
+            base_path=base_path,
+            fpr=fpr,
+            num_hashes=num_hashes,
+        )
+        self.bf = None
+    def fit(self, file_path: Path, display_name: str) -> None:
+        """Build the Bloom filter from the sequences in ``file_path``.
+
+        The file is read twice: once to size the filter and once to add the
+        k-mers. The filter is stored as ``filter.bloom`` in the model
+        directory and ``display_name`` is registered under the file stem.
+
+        Raises:
+            ValueError: If the file holds fewer than ``k`` bases in total.
+        """
+        # upper bound for the number of k-mers, used to size the filter
+        total_length = sum(
+            len(record.seq) for record in get_record_iterator(file_path)
+        )
+        num_kmers = total_length - self.k + 1
+        if num_kmers < 1:
+            raise ValueError("Invalid file, sequences must contain at least k bases")
+        self.bf = Bloom(num_kmers, self.fpr, hash_func=xxh3_64_intdigest)
+        for record in get_record_iterator(file_path):
+            for kmer in self._generate_kmers(record.seq):
+                self.bf.add(kmer)
+        self.display_names[file_path.stem] = display_name
+        bloom_path = self.base_path / self.slug() / "filter.bloom"
+        bloom_path.parent.mkdir(parents=True, exist_ok=True)
+        self.bf.save(str(bloom_path))
+    def calculate_hits(
+        self, sequence: Seq | SeqRecord, filter_ids=None, step: int = 1
+    ) -> dict:
+        """Calculate the hits for the sequence.
+
+        Returns a dictionary with one entry: the first key of
+        ``display_names`` mapped to the number of sampled k-mers found in the
+        Bloom filter. ``filter_ids`` is not used by this model.
+
+        Raises:
+            ValueError: If the model has no filter yet, or the sequence is
+                not a ``Seq``/``SeqRecord`` or not longer than ``k``.
+        """
+        if self.bf is None:
+            raise ValueError("Model has no filter, call fit() or load() first")
+        if isinstance(sequence, SeqRecord):
+            sequence = sequence.seq
+        if not isinstance(sequence, Seq):
+            raise ValueError("Invalid sequence, must be a Bio.Seq object")
+        if not len(sequence) > self.k:
+            raise ValueError("Invalid sequence, must be longer than k")
+        num_hits = sum(
+            1 for kmer in self._generate_kmers(sequence, step=step) if kmer in self.bf
+        )
+        return {next(iter(self.display_names)): num_hits}
+    @staticmethod
+    def load(path: Path) -> "ProbabilisticSingleFilterModel":
+        """Load the model from disk.
+
+        ``path`` points to the model JSON file; its parent directory becomes
+        the base path from which ``filter.bloom`` is loaded.
+        """
+        with open(path, "r", encoding="utf-8") as file:
+            model_json = json.load(file)
+        model = ProbabilisticSingleFilterModel(
+            model_json["k"],
+            model_json["model_display_name"],
+            model_json["author"],
+            model_json["author_email"],
+            model_json["model_type"],
+            path.parent,
+            fpr=model_json["fpr"],
+            num_hashes=model_json["num_hashes"],
+        )
+        model.display_names = model_json["display_names"]
+        bloom_path = model.base_path / model.slug() / "filter.bloom"
+        model.bf = Bloom.load(
+            str(bloom_path),
+            hash_func=xxh3_64_intdigest,
+        )
+        return model
+    def _generate_kmers(self, sequence: Seq, step: int = 1):
+        """Yield the canonical k-mers of the sequence as strings.
+
+        Every ``step``-th window of length ``k`` is taken; for each window
+        the lexicographically smaller of the k-mer and its reverse
+        complement is yielded, so both strands give the same string.
+        """
+        num_kmers = ceil((len(sequence) - self.k + 1) / step)
+        for i in range(num_kmers):
+            start_pos = i * step
+            kmer = sequence[start_pos : start_pos + self.k]
+            # compare plain strings rather than a Seq with a str
+            forward = str(kmer)
+            reverse = str(kmer.reverse_complement())
+            yield min(forward, reverse)

xspect/models/result.py ADDED Viewed

@@ -0,0 +1,148 @@
+""" Module for storing the results of XspecT models. """
+from enum import Enum
+def get_last_processing_step(result: "ModelResult") -> "ModelResult":
+    """Return the deepest result reachable via the last subprocessing steps.
+
+    Starting at ``result``, the ``result`` of the last entry in
+    ``subprocessing_steps`` is followed repeatedly. Only this one path is
+    walked; sibling steps are ignored. A step without a result (``None``)
+    ends the walk and the last existing result is returned.
+    """
+    last_step = result
+    while last_step.subprocessing_steps:
+        next_step = last_step.subprocessing_steps[-1].result
+        if next_step is None:
+            # steps may carry no result; stop instead of failing on None
+            break
+        last_step = next_step
+    return last_step
+class StepType(Enum):
+    """Enum for defining the type of a subprocessing step.
+
+    ``str()`` of a member is its lower-case name. That string, in any case,
+    can be passed back to the constructor, e.g. ``StepType("prediction")``.
+    """
+    PREDICTION = 1
+    FILTERING = 2
+    @classmethod
+    def _missing_(cls, value):
+        """Resolve member names so that the ``str()`` form can be parsed back."""
+        if isinstance(value, str):
+            # returning None lets Enum raise its usual ValueError
+            return cls.__members__.get(value.upper())
+        return None
+    def __str__(self) -> str:
+        return self.name.lower()
+class SubprocessingStep:
+    """A sub-step of a model run together with the result it produced."""
+    def __init__(
+        self,
+        subprocessing_type: StepType,
+        label: str,
+        treshold: float,
+        result: "ModelResult",
+    ):
+        """Store type, label, threshold and result of the step."""
+        self.result = result
+        self.treshold = treshold
+        self.label = label
+        self.subprocessing_type = subprocessing_type
+    def to_dict(self) -> dict:
+        """Serialize the step; a falsy result is written as an empty dict."""
+        type_name = str(self.subprocessing_type)
+        if self.result:
+            nested_result = self.result.to_dict()
+        else:
+            nested_result = {}
+        return dict(
+            subprocessing_type=type_name,
+            label=self.label,
+            treshold=self.treshold,
+            result=nested_result,
+        )
+class ModelResult:
+    """Class for storing an XspecT model result."""
+    def __init__(
+        self,
+        model_slug: str,
+        # we store hits depending on the subsequence as well as on the label
+        hits: dict[str, dict[str, int]],
+        num_kmers: dict[str, int],
+        sparse_sampling_step: int = 1,
+        prediction: str | None = None,
+    ):
+        """Create a result.
+
+        ``hits`` maps subsequence -> label -> hit count and ``num_kmers``
+        maps subsequence -> number of k-mers.
+
+        Raises:
+            ValueError: If ``hits`` uses the reserved key ``"total"``.
+        """
+        if "total" in hits:
+            raise ValueError(
+                "'total' is a reserved key and cannot be used as a subsequence"
+            )
+        self.model_slug = model_slug
+        self.hits = hits
+        self.num_kmers = num_kmers
+        self.sparse_sampling_step = sparse_sampling_step
+        self.prediction = prediction
+        self.subprocessing_steps = []
+    def add_subprocessing_step(self, subprocessing_step: SubprocessingStep) -> None:
+        """Add a subprocessing step to the result.
+
+        Raises:
+            ValueError: If a step with the same label was already added.
+        """
+        # compare labels with labels; the steps themselves never equal a label
+        existing_labels = {step.label for step in self.subprocessing_steps}
+        if subprocessing_step.label in existing_labels:
+            raise ValueError(
+                f"Subprocessing step {subprocessing_step.label} already exists in the result"
+            )
+        self.subprocessing_steps.append(subprocessing_step)
+    def get_scores(self) -> dict:
+        """Return the scores of the model.
+
+        A score is the hit count divided by the number of k-mers, rounded to
+        two decimals. The result maps every subsequence to its label scores
+        and additionally holds the key ``"total"`` with the scores over all
+        subsequences.
+        """
+        scores = {
+            subsequence: {
+                label: round(hits / self.num_kmers[subsequence], 2)
+                for label, hits in subsequence_hits.items()
+            }
+            for subsequence, subsequence_hits in self.hits.items()
+        }
+        # calculate total scores
+        total_num_kmers = sum(self.num_kmers.values())
+        scores["total"] = {
+            label: round(hits / total_num_kmers, 2)
+            for label, hits in self.get_total_hits().items()
+        }
+        return scores
+    def get_total_hits(self) -> dict[str, int]:
+        """Return the hits per label, summed over all subsequences.
+
+        Returns an empty dictionary if there are no hits at all.
+        """
+        total_hits: dict[str, int] = {}
+        for subsequence_hits in self.hits.values():
+            for label, hits in subsequence_hits.items():
+                # labels do not have to occur in the first subsequence
+                total_hits[label] = total_hits.get(label, 0) + hits
+        return total_hits
+    def get_filter_mask(self, label: str, filter_threshold: float) -> dict[str, bool]:
+        """Return a mask for filtered subsequences.
+
+        The mask maps every subsequence to whether its score for ``label``
+        is at least ``filter_threshold``.
+
+        Raises:
+            ValueError: If the threshold is not between 0 and 1.
+        """
+        if filter_threshold < 0 or filter_threshold > 1:
+            raise ValueError("The filter threshold must be between 0 and 1.")
+        scores = self.get_scores()
+        # "total" is not a subsequence
+        scores.pop("total")
+        return {
+            subsequence: score[label] >= filter_threshold
+            for subsequence, score in scores.items()
+        }
+    def get_filtered_subsequences(
+        self, label: str, filter_threshold: float = 0.7
+    ) -> list[str]:
+        """Return the subsequences whose score for ``label`` reaches the threshold."""
+        mask = self.get_filter_mask(label, filter_threshold)
+        return [subsequence for subsequence, keep in mask.items() if keep]
+    def to_dict(self) -> dict:
+        """Return the result as a dictionary.
+
+        The key ``"prediction"`` is only present if a prediction was set.
+        """
+        res = {
+            "model_slug": self.model_slug,
+            "sparse_sampling_step": self.sparse_sampling_step,
+            "hits": self.hits,
+            "scores": self.get_scores(),
+            "num_kmers": self.num_kmers,
+            "subprocessing_steps": [
+                subprocessing_step.to_dict()
+                for subprocessing_step in self.subprocessing_steps
+            ],
+        }
+        if self.prediction is not None:
+            res["prediction"] = self.prediction
+        return res

xspect/pipeline.py ADDED Viewed

@@ -0,0 +1,201 @@
+""" Module for defining the Pipeline class. """
+import json
+from pathlib import Path
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO
+from xspect.file_io import get_records_by_id
+from xspect.models.result import StepType, SubprocessingStep
+from xspect.run import Run
+from xspect.models.result import ModelResult
+from xspect.model_management import get_model_by_slug
+class ModelExecution:
+    """Class for storing a processing step of an XspecT pipeline.
+
+    A model execution runs one model and, depending on its result, the
+    model executions of the attached pipeline steps.
+    """
+    def __init__(
+        self,
+        model_slug: str,
+        sparse_sampling_step: int = 1,
+    ):
+        """Store the model slug and the sampling step passed to ``predict``."""
+        self.model_slug = model_slug
+        self.sparse_sampling_step = sparse_sampling_step
+        self.pipeline_steps = []
+    def add_pipeline_step(
+        self,
+        pipeline_step: "PipelineStep",
+    ):
+        """Add a pipeline step that is evaluated after this model has run."""
+        self.pipeline_steps.append(pipeline_step)
+    def to_dict(self) -> dict:
+        """Return the processing step as a dictionary."""
+        return {
+            "model_slug": self.model_slug,
+            "sparse_sampling_step": self.sparse_sampling_step,
+            "pipeline_steps": [
+                pipeline_step.to_dict() for pipeline_step in self.pipeline_steps
+            ],
+        }
+    def run(
+        self,
+        sequence_input: (
+            SeqRecord
+            | list[SeqRecord]
+            | SeqIO.FastaIO.FastaIterator
+            | SeqIO.QualityIO.FastqPhredIterator
+            | Path
+        ),
+    ) -> ModelResult:
+        """Run the model on a given input.
+
+        After the prediction every pipeline step is evaluated against the
+        result of this model:
+
+        * prediction steps run their model execution on the same input if
+          the total score of the step label reaches the threshold; otherwise
+          no subprocessing step is recorded,
+        * filtering steps run their model execution on the records whose
+          score for the step label reaches the threshold; the step is
+          recorded with a result of ``None`` if no record is left.
+
+        Raises:
+            ValueError: If a pipeline step has an unknown type.
+        """
+        # NOTE(review): a one-shot iterator is presumably exhausted by the
+        # first predict call - confirm that callers pass a Path or a list
+        # when pipeline steps are configured.
+        model = get_model_by_slug(self.model_slug)
+        model_result = model.predict(sequence_input, step=self.sparse_sampling_step)
+        for pipeline_step in self.pipeline_steps:
+            if pipeline_step.subprocessing_type == StepType.PREDICTION:
+                score = model_result.get_scores()["total"][pipeline_step.label]
+                if score < pipeline_step.treshold:
+                    continue
+                step_result = pipeline_step.model_execution.run(sequence_input)
+            elif pipeline_step.subprocessing_type == StepType.FILTERING:
+                filtered_sequence_ids = model_result.get_filtered_subsequences(
+                    pipeline_step.label, pipeline_step.treshold
+                )
+                # keep sequence_input untouched so that later steps of this
+                # model still see the full input
+                filtered_input = get_records_by_id(
+                    sequence_input, filtered_sequence_ids
+                )
+                step_result = None
+                if filtered_input:
+                    step_result = pipeline_step.model_execution.run(filtered_input)
+            else:
+                raise ValueError(
+                    f"Invalid subprocessing type {pipeline_step.subprocessing_type}"
+                )
+            model_result.add_subprocessing_step(
+                SubprocessingStep(
+                    pipeline_step.subprocessing_type,
+                    pipeline_step.label,
+                    pipeline_step.treshold,
+                    step_result,
+                )
+            )
+        return model_result
+class PipelineStep:
+    """A conditional follow-up step of a model execution in a pipeline."""
+    def __init__(
+        self,
+        subprocessing_type: StepType,
+        label: str,
+        treshold: float,
+        model_execution: ModelExecution,
+    ):
+        """Store type, label, threshold and the model execution of the step."""
+        self.model_execution = model_execution
+        self.treshold = treshold
+        self.label = label
+        self.subprocessing_type = subprocessing_type
+    def to_dict(self) -> dict:
+        """Serialize the step including its nested model execution."""
+        type_name = str(self.subprocessing_type)
+        nested_execution = self.model_execution.to_dict()
+        return dict(
+            subprocessing_type=type_name,
+            label=self.label,
+            treshold=self.treshold,
+            model_execution=nested_execution,
+        )
+class Pipeline:
+    """Class for storing an XspecT pipeline consisting of multiple model processing steps."""
+    def __init__(self, display_name: str, author: str, author_email: str):
+        """Create an empty pipeline with the given metadata."""
+        self.display_name = display_name
+        self.author = author
+        self.author_email = author_email
+        self.model_executions = []
+    def add_pipeline_step(
+        self,
+        pipeline_step: ModelExecution,
+    ):
+        """Add a processing step (model execution) to the pipeline."""
+        self.model_executions.append(pipeline_step)
+    def to_dict(self) -> dict:
+        """Return the pipeline as a dictionary."""
+        return {
+            "display_name": self.display_name,
+            "author": self.author,
+            "author_email": self.author_email,
+            "model_executions": [
+                model_execution.to_dict() for model_execution in self.model_executions
+            ],
+        }
+    def to_json(self) -> str:
+        """Return the pipeline as a JSON string."""
+        return json.dumps(self.to_dict())
+    def save(self, path: Path) -> None:
+        """Save the pipeline as a JSON file."""
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(self.to_json())
+    @staticmethod
+    def _model_execution_from_dict(execution_json: dict) -> ModelExecution:
+        """Rebuild a model execution including all nested pipeline steps."""
+        model_execution = ModelExecution(
+            execution_json["model_slug"],
+            execution_json["sparse_sampling_step"],
+        )
+        for step_json in execution_json.get("pipeline_steps", []):
+            raw_type = step_json["subprocessing_type"]
+            # to_dict() stores the lower-case member name ("prediction");
+            # plain enum values are accepted as well
+            if isinstance(raw_type, str):
+                step_type = StepType[raw_type.upper()]
+            else:
+                step_type = StepType(raw_type)
+            model_execution.add_pipeline_step(
+                PipelineStep(
+                    step_type,
+                    step_json["label"],
+                    step_json["treshold"],
+                    Pipeline._model_execution_from_dict(step_json["model_execution"]),
+                )
+            )
+        return model_execution
+    @staticmethod
+    def from_file(path: Path) -> "Pipeline":
+        """Load the pipeline from a JSON file written by ``save``.
+
+        Pipeline steps are restored on every nesting level.
+        """
+        with open(path, "r", encoding="utf-8") as f:
+            pipeline_json = json.load(f)
+        pipeline = Pipeline(
+            pipeline_json["display_name"],
+            pipeline_json["author"],
+            pipeline_json["author_email"],
+        )
+        for execution_json in pipeline_json["model_executions"]:
+            pipeline.add_pipeline_step(
+                Pipeline._model_execution_from_dict(execution_json)
+            )
+        return pipeline
+    def run(self, input_file: Path) -> Run:
+        """Run the pipeline on a given input.
+
+        Every model execution is run on ``input_file`` and its result is
+        added to the returned run.
+        """
+        run = Run(self.display_name, input_file)
+        for model_execution in self.model_executions:
+            result = model_execution.run(input_file)
+            run.add_result(result)
+        return run

xspect/run.py ADDED Viewed

@@ -0,0 +1,38 @@
+""" Module with XspecT global run class, which summarizes individual model results. """
+import json
+from pathlib import Path
+from xspect.models.result import ModelResult
+class Run:
+    """Collects the model results produced for one input file."""
+    def __init__(self, display_name: str, input_file: str):
+        """Start an empty run for the given display name and input file."""
+        self.results = []
+        self.input_file = input_file
+        self.display_name = display_name
+    def add_result(self, result: ModelResult):
+        """Append a model result to the run."""
+        self.results.append(result)
+    def to_dict(self) -> dict:
+        """Serialize the run; the input file is written as a string."""
+        serialized_results = []
+        for result in self.results:
+            serialized_results.append(result.to_dict())
+        return dict(
+            display_name=self.display_name,
+            input_file=str(self.input_file),
+            results=serialized_results,
+        )
+    def to_json(self) -> str:
+        """Serialize the run as JSON indented by four spaces."""
+        return json.dumps(self.to_dict(), indent=4)
+    def save(self, path: Path) -> None:
+        """Write the JSON form of the run to ``path`` (UTF-8)."""
+        serialized = self.to_json()
+        with open(path, mode="w", encoding="utf-8") as json_file:
+            json_file.write(serialized)

XspecT 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

Potentially problematic release.

XspecT 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl