XspecT 0.2.6__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the differences between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of XspecT might be problematic.
- xspect/definitions.py +0 -7
- xspect/download_models.py +25 -24
- xspect/fastapi.py +23 -26
- xspect/file_io.py +86 -2
- xspect/main.py +333 -98
- xspect/mlst_feature/mlst_helper.py +5 -7
- xspect/model_management.py +6 -0
- xspect/models/probabilistic_filter_model.py +16 -5
- xspect/models/probabilistic_filter_svm_model.py +33 -18
- xspect/models/probabilistic_single_filter_model.py +8 -1
- xspect/models/result.py +15 -61
- xspect/ncbi.py +265 -0
- xspect/train.py +258 -247
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/METADATA +14 -21
- xspect-0.4.0.dist-info/RECORD +24 -0
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/WHEEL +1 -1
- XspecT-0.2.6.dist-info/RECORD +0 -34
- xspect/pipeline.py +0 -201
- xspect/run.py +0 -38
- xspect/train_filter/__init__.py +0 -0
- xspect/train_filter/create_svm.py +0 -45
- xspect/train_filter/extract_and_concatenate.py +0 -124
- xspect/train_filter/html_scrap.py +0 -114
- xspect/train_filter/ncbi_api/__init__.py +0 -0
- xspect/train_filter/ncbi_api/download_assemblies.py +0 -31
- xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +0 -110
- xspect/train_filter/ncbi_api/ncbi_children_tree.py +0 -53
- xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +0 -55
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/entry_points.txt +0 -0
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info/licenses}/LICENSE +0 -0
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/top_level.txt +0 -0
xspect/definitions.py
CHANGED
@@ -21,13 +21,6 @@ def get_xspect_model_path():
     return model_path
 
 
-def get_xspect_tmp_path():
-    """Return the path to the XspecT temporary files."""
-    tmp_path = get_xspect_root_path() / "tmp"
-    tmp_path.mkdir(exist_ok=True, parents=True)
-    return tmp_path
-
-
 def get_xspect_upload_path():
     """Return the path to the XspecT upload directory."""
     upload_path = get_xspect_root_path() / "uploads"
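With get_xspect_tmp_path() removed, temporary files are handled with the standard library instead (the rewritten download_models.py below uses tempfile.TemporaryDirectory). A minimal sketch of that pattern, using only the standard library; the file name is illustrative:

# Minimal sketch of the replacement pattern (standard library only);
# "scratch.txt" is an illustrative file name, not something XspecT creates.
from pathlib import Path
from tempfile import TemporaryDirectory

with TemporaryDirectory() as tmp_dir:
    work_dir = Path(tmp_dir)  # short-lived scratch space
    (work_dir / "scratch.txt").write_text("intermediate data")
    # ... work with files under work_dir ...
# the directory and everything in it is removed automatically on exit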
xspect/download_models.py
CHANGED
@@ -1,33 +1,34 @@
 """Download filters from public repository."""
 
-import os
 import shutil
+from tempfile import TemporaryDirectory
+from pathlib import Path
 import requests
 
-from xspect.definitions import get_xspect_model_path
+from xspect.definitions import get_xspect_model_path
 
 
 def download_test_models(url):
     """Download models."""
-    [old function body (lines 12-33) not captured in the extracted diff]
+    with TemporaryDirectory() as tmp_dir:
+        tmp_dir = Path(tmp_dir)
+        download_path = tmp_dir / "models.zip"
+        extract_path = tmp_dir / "extracted_models"
+
+        r = requests.get(url, allow_redirects=True, timeout=10)
+        with open(download_path, "wb") as f:
+            f.write(r.content)
+
+        shutil.unpack_archive(
+            download_path,
+            extract_path,
+            "zip",
+        )
+
+        shutil.copytree(
+            extract_path,
+            get_xspect_model_path(),
+            dirs_exist_ok=True,
+        )
+
+        shutil.rmtree(extract_path)
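For reference, a hedged usage sketch of the rewritten downloader: it fetches a zip archive of models from the given URL, unpacks it in a temporary directory, and copies the result into the local model directory. The URL is the one hard-coded in the /download-filters FastAPI route of this release; running this will actually download data.

# Illustrative call of the new downloader; the URL is taken from the
# /download-filters route below. Models end up under get_xspect_model_path().
from xspect.download_models import download_test_models

download_test_models("http://assets.adrianromberg.com/xspect-models.zip")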
xspect/fastapi.py
CHANGED
@@ -1,15 +1,14 @@
 """FastAPI application for XspecT."""
 
-import
+from uuid import uuid4
 from pathlib import Path
 from shutil import copyfileobj
 from fastapi import FastAPI, UploadFile, BackgroundTasks
 from xspect.definitions import get_xspect_runs_path, get_xspect_upload_path
 from xspect.download_models import download_test_models
+from xspect.file_io import filter_sequences
 import xspect.model_management as mm
-from xspect.
-from xspect.pipeline import ModelExecution, Pipeline, PipelineStep
-from xspect.train import train_ncbi
+from xspect.train import train_from_ncbi
 
 app = FastAPI()
 
@@ -17,43 +16,41 @@ app = FastAPI()
 @app.get("/download-filters")
 def download_filters():
     """Download filters."""
-    download_test_models("
+    download_test_models("http://assets.adrianromberg.com/xspect-models.zip")
 
 
 @app.get("/classify")
 def classify(genus: str, file: str, meta: bool = False, step: int = 500):
     """Classify uploaded sample."""
 
-
+    input_path = get_xspect_upload_path() / file
+
+    uuid = str(uuid4())
 
-    pipeline = Pipeline(genus + " classification", "Test Author", "test@example.com")
-    species_execution = ModelExecution(
-        genus.lower() + "-species", sparse_sampling_step=step
-    )
     if meta:
-
-
-        )
-
-
+        genus_model = mm.get_genus_model(genus)
+        genus_result = genus_model.predict(input_path, step=step)
+        included_ids = genus_result.get_filtered_subsequence_labels(genus)
+        if not included_ids:
+            return {"message": "No sequences found for the given genus."}
+        filtered_path = get_xspect_runs_path() / f"filtered_{uuid}.fasta"
+        filter_sequences(
+            Path(input_path),
+            Path(filtered_path),
+            included_ids=included_ids,
         )
-
-        pipeline.add_pipeline_step(genus_execution)
-    else:
-        pipeline.add_pipeline_step(species_execution)
-
-    run = pipeline.run(Path(path))
-    time_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
-    save_path = get_xspect_runs_path() / f"run_{time_str}.json"
-    run.save(save_path)
+        input_path = filtered_path
 
-
+    species_model = mm.get_species_model(genus)
+    species_result = species_model.predict(input_path, step=step)
+    species_result.save(get_xspect_runs_path() / f"result_{uuid}.json")
+    return species_result.to_dict()
 
 
 @app.post("/train")
 def train(genus: str, background_tasks: BackgroundTasks, svm_steps: int = 1):
     """Train NCBI model."""
-    background_tasks.add_task(
+    background_tasks.add_task(train_from_ncbi, genus, svm_steps)
 
     return {"message": "Training started."}
 
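A sketch of how the reworked /classify route might be called over HTTP. The query parameters (genus, file, meta, step) come from the route signature above; the host, port, genus value, and uploaded file name are assumptions for illustration.

# Hypothetical client call against a locally running instance; host, port,
# genus value and file name are assumptions, not part of the package.
import requests

resp = requests.get(
    "http://127.0.0.1:8000/classify",
    params={"genus": "Acinetobacter", "file": "sample.fasta", "meta": True, "step": 500},
    timeout=300,
)
print(resp.json())  # species result dict, or a "no sequences found" message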
xspect/file_io.py
CHANGED
@@ -2,6 +2,7 @@
 File IO module.
 """
 
+from json import loads
 import os
 from pathlib import Path
 import zipfile
@@ -18,10 +19,10 @@ def delete_zip_files(dir_path):
         os.remove(file_path)
 
 
-def extract_zip(zip_path, unzipped_path):
+def extract_zip(zip_path: Path, unzipped_path: Path):
     """Extracts all files from a directory with zip files."""
     # Make new directory.
-
+    unzipped_path.mkdir(parents=True, exist_ok=True)
 
     file_names = os.listdir(zip_path)
     for file in file_names:
@@ -85,3 +86,86 @@ def get_records_by_id(file: Path, ids: list[str]):
     """Return records with the specified ids."""
     records = get_record_iterator(file)
     return [record for record in records if record.id in ids]
+
+
+def concatenate_species_fasta_files(input_folders: list[Path], output_directory: Path):
+    """Concatenate fasta files from different species into one file per species.
+
+    Args:
+        input_species_folders (list[Path]): List of paths to species folders.
+        output_directory (Path): Path to the output directory.
+    """
+    for species_folder in input_folders:
+        species_name = species_folder.name
+        fasta_files = [
+            f for ending in fasta_endings for f in species_folder.glob(f"*.{ending}")
+        ]
+        if len(fasta_files) == 0:
+            raise ValueError(f"no fasta files found in {species_folder}")
+
+        # concatenate fasta files
+        concatenated_fasta = output_directory / f"{species_name}.fasta"
+        with open(concatenated_fasta, "w", encoding="utf-8") as f:
+            for fasta_file in fasta_files:
+                with open(fasta_file, "r", encoding="utf-8") as f_in:
+                    f.write(f_in.read())
+
+
+def concatenate_metagenome(fasta_dir: Path, meta_path: Path):
+    """Concatenate all fasta files in a directory into one file.
+
+    Args:
+        fasta_dir (Path): Path to the directory with the fasta files.
+        meta_path (Path): Path to the output file.
+    """
+    with open(meta_path, "w", encoding="utf-8") as meta_file:
+        for fasta_file in fasta_dir.glob("*.fasta"):
+            with open(fasta_file, "r", encoding="utf-8") as f_in:
+                meta_file.write(f_in.read())
+
+
+def get_ncbi_dataset_accession_paths(
+    ncbi_dataset_path: Path,
+) -> dict[str, Path]:
+    """Get the paths of the NCBI dataset accessions.
+
+    Args:
+        ncbi_dataset_path (Path): Path to the NCBI dataset directory.
+
+    Returns:
+        dict[str, Path]: Dictionary with the accession as key and the path as value.
+    """
+    data_path = ncbi_dataset_path / "ncbi_dataset" / "data"
+    if not data_path.exists():
+        raise ValueError(f"Path {data_path} does not exist.")
+
+    accession_paths = {}
+    with open(data_path / "dataset_catalog.json", "r", encoding="utf-8") as f:
+        res = loads(f.read())
+        for assembly in res["assemblies"][1:]:  # the first item is the data report
+            accession = assembly["accession"]
+            assembly_path = data_path / assembly["files"][0]["filePath"]
+            accession_paths[accession] = assembly_path
+    return accession_paths
+
+
+def filter_sequences(
+    input_file: Path,
+    output_file: Path,
+    included_ids: list[str],
+):
+    """Filter sequences by IDs from an input file and save them to an output file.
+
+    Args:
+        input_file (Path): Path to the input file.
+        output_file (Path): Path to the output file.
+        included_ids (list[str], optional): List of IDs to include. If None, no output file is created.
+    """
+    if not included_ids:
+        print("No IDs provided, no output file will be created.")
+        return
+
+    with open(output_file, "w", encoding="utf-8") as out_f:
+        for record in get_record_iterator(input_file):
+            if record.id in included_ids:
+                SeqIO.write(record, out_f, "fasta")