XspecT 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of XspecT might be problematic. Click here for more details.
- {XspecT-0.2.5.dist-info → XspecT-0.2.6.dist-info}/METADATA +16 -16
- XspecT-0.2.6.dist-info/RECORD +34 -0
- {XspecT-0.2.5.dist-info → XspecT-0.2.6.dist-info}/WHEEL +1 -1
- xspect/definitions.py +7 -0
- xspect/{download_filters.py → download_models.py} +2 -2
- xspect/fastapi.py +2 -2
- xspect/main.py +61 -8
- xspect/mlst_feature/__init__.py +0 -0
- xspect/mlst_feature/mlst_helper.py +155 -0
- xspect/mlst_feature/pub_mlst_handler.py +119 -0
- xspect/model_management.py +3 -4
- xspect/models/probabilistic_filter_mlst_model.py +287 -0
- xspect/models/probabilistic_filter_model.py +2 -11
- xspect/models/probabilistic_filter_svm_model.py +3 -0
- xspect/models/probabilistic_single_filter_model.py +4 -6
- xspect/models/result.py +7 -6
- xspect/train.py +1 -33
- XspecT-0.2.5.dist-info/RECORD +0 -30
- {XspecT-0.2.5.dist-info → XspecT-0.2.6.dist-info}/LICENSE +0 -0
- {XspecT-0.2.5.dist-info → XspecT-0.2.6.dist-info}/entry_points.txt +0 -0
- {XspecT-0.2.5.dist-info → XspecT-0.2.6.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: XspecT
|
|
3
|
-
Version: 0.2.5
|
|
3
|
+
Version: 0.2.6
|
|
4
4
|
Summary: Tool to monitor and characterize pathogens using Bloom filters.
|
|
5
5
|
License: MIT License
|
|
6
6
|
|
|
@@ -46,14 +46,14 @@ Requires-Dist: fastapi
|
|
|
46
46
|
Requires-Dist: uvicorn
|
|
47
47
|
Requires-Dist: python-multipart
|
|
48
48
|
Provides-Extra: docs
|
|
49
|
-
Requires-Dist: sphinx
|
|
50
|
-
Requires-Dist: furo
|
|
51
|
-
Requires-Dist: myst-parser
|
|
52
|
-
Requires-Dist: sphinx-copybutton
|
|
53
|
-
Requires-Dist: sphinx-autobuild
|
|
49
|
+
Requires-Dist: sphinx; extra == "docs"
|
|
50
|
+
Requires-Dist: furo; extra == "docs"
|
|
51
|
+
Requires-Dist: myst-parser; extra == "docs"
|
|
52
|
+
Requires-Dist: sphinx-copybutton; extra == "docs"
|
|
53
|
+
Requires-Dist: sphinx-autobuild; extra == "docs"
|
|
54
54
|
Provides-Extra: test
|
|
55
|
-
Requires-Dist: pytest
|
|
56
|
-
Requires-Dist: pytest-cov
|
|
55
|
+
Requires-Dist: pytest; extra == "test"
|
|
56
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
57
57
|
|
|
58
58
|
# XspecT - Acinetobacter Species Assignment Tool
|
|
59
59
|

|
|
@@ -63,7 +63,7 @@ Requires-Dist: pytest-cov ; extra == 'test'
|
|
|
63
63
|
<img src="/docs/img/logo.png" height="50%" width="50%">
|
|
64
64
|
|
|
65
65
|
<!-- start intro -->
|
|
66
|
-
XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or
|
|
66
|
+
XspecT is a Python-based tool to taxonomically classify sequence-reads (or assembled genomes) on the species and/or MLST level using [Bloom Filters] and a [Support Vector Machine].
|
|
67
67
|
<br/><br/>
|
|
68
68
|
|
|
69
69
|
XspecT utilizes the uniqueness of kmers and compares extracted kmers from the input data to a reference database. Bloom filters ensure a fast lookup in this process. For the final prediction, the results are classified using a Support Vector Machine.
|
|
@@ -88,14 +88,14 @@ pip install xspect
|
|
|
88
88
|
Please note that Windows and Alpine Linux are currently not supported.
|
|
89
89
|
|
|
90
90
|
## Usage
|
|
91
|
-
### Get the
|
|
92
|
-
To download basic pre-trained
|
|
91
|
+
### Get the models
|
|
92
|
+
To download basic pre-trained models, you can use the built-in command:
|
|
93
93
|
```
|
|
94
|
-
xspect download-
|
|
94
|
+
xspect download-models
|
|
95
95
|
```
|
|
96
|
-
Additional species
|
|
96
|
+
Additional species models can be trained using:
|
|
97
97
|
```
|
|
98
|
-
xspect train you-ncbi-genus-name
|
|
98
|
+
xspect train-species your-ncbi-genus-name
|
|
99
99
|
```
|
|
100
100
|
|
|
101
101
|
### How to run the web app
|
|
@@ -107,7 +107,7 @@ xspect api
|
|
|
107
107
|
### How to use the XspecT command line interface
|
|
108
108
|
Run xspect and pass the desired configuration as arguments.
|
|
109
109
|
```
|
|
110
|
-
xspect classify your-genus path/to/your/input-set
|
|
110
|
+
xspect classify-species your-genus path/to/your/input-set
|
|
111
111
|
```
|
|
112
112
|
For further instructions on how to use the command line interface, please refer to the [documentation] or execute:
|
|
113
113
|
```
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
xspect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
xspect/definitions.py,sha256=Z6RiCOQnsA_i8DPIq-7SUhrKo7KYf1Azp36UZZpcOX4,1419
|
|
3
|
+
xspect/download_models.py,sha256=lml8pSyM0pF-MxghgSRC9noDT4pkUcBZraaTTImVfbA,739
|
|
4
|
+
xspect/fastapi.py,sha256=FDiGXJmLEeTLD83Hem8yV5aoPJ-GhSG5WmDIQRAA_w4,3257
|
|
5
|
+
xspect/file_io.py,sha256=zKhl6Fd9KZAYiD8YgIyje5TbDYk5lxMp1WUrNkGSBo8,2779
|
|
6
|
+
xspect/main.py,sha256=3HqmnMowjkLNwhaZWtY4aeJCyCyT6h_nZWzYIunHfKg,5325
|
|
7
|
+
xspect/model_management.py,sha256=xF-wjVNJbXYv64RajsIcpLfZUvicDyalJEdSeCx3nQI,3542
|
|
8
|
+
xspect/pipeline.py,sha256=h7duhVZ-hupwO_KQPstzFo8KMfMI2yleb9HmtTiMjic,7219
|
|
9
|
+
xspect/run.py,sha256=OJ7pCFqva3AhIYklKjVnqWGooVRO7S3b56kIAy-xabY,1189
|
|
10
|
+
xspect/train.py,sha256=p_5BPh7XNA7R2h8MwpN0-AwzjbNIxdmeMKztP7RU4g8,9499
|
|
11
|
+
xspect/mlst_feature/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
xspect/mlst_feature/mlst_helper.py,sha256=omqKmreah5qaspnJ5qKp_9oZsdHxi0tUJnEzZbpWPEw,5916
|
|
13
|
+
xspect/mlst_feature/pub_mlst_handler.py,sha256=oss3CkJNt6041p3qnMdOfoX8ZgUfpB93CUim-Yakc9A,5031
|
|
14
|
+
xspect/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
+
xspect/models/probabilistic_filter_mlst_model.py,sha256=JMc0yBJPo7J9b-GpvhDmzhwWPIKOwatAq0edDgM72PE,11735
|
|
16
|
+
xspect/models/probabilistic_filter_model.py,sha256=zCn5dcuq5Z4pvmsV9igS0lQ1plUi9-Kky_zRflfrIkI,9659
|
|
17
|
+
xspect/models/probabilistic_filter_svm_model.py,sha256=uabDrF1_CSuIWf9wWyQAkqjAuRUBzEZLkv3J6YHfJsM,5641
|
|
18
|
+
xspect/models/probabilistic_single_filter_model.py,sha256=TdGbQp8ylOif7dD13OSWaS-zFNJo8mXOb6BaQ0mcPdo,3810
|
|
19
|
+
xspect/models/result.py,sha256=RuYqagyG5QbFlW408haXzDbPYJ6yS2flcWcJ8Dy6UjM,4834
|
|
20
|
+
xspect/train_filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
|
+
xspect/train_filter/create_svm.py,sha256=w6gq40yHINVfNzLhJfYFykUaNCwpU9AEDcbkUfis3DY,1504
|
|
22
|
+
xspect/train_filter/extract_and_concatenate.py,sha256=lLrczGgfZi2vAGqxq8fcEmJi5pvqyK33JkB_ZoCNYG8,4840
|
|
23
|
+
xspect/train_filter/html_scrap.py,sha256=76VV_ZbvD2I3IxRb62SiQwRPu2tr4fwn1HkfJQYaosM,3809
|
|
24
|
+
xspect/train_filter/ncbi_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
+
xspect/train_filter/ncbi_api/download_assemblies.py,sha256=MB_mxSjCTL05DqIt1WQem8AGU3PjtJnzPndeI9J-AOI,1285
|
|
26
|
+
xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py,sha256=puzDIws-yyBAEHwSAIYUM7g8FpLFmvOKh5xH1EsY8ZE,3830
|
|
27
|
+
xspect/train_filter/ncbi_api/ncbi_children_tree.py,sha256=_8puOsnsKp5lsMV2gZY1ijkfD_BZKG9eXZCX09qph5E,1819
|
|
28
|
+
xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py,sha256=O6JDXC4E6AYaf7NPnb34eSJyZhMB8r--bjoVF_ZsEdA,1868
|
|
29
|
+
XspecT-0.2.6.dist-info/LICENSE,sha256=bhBGDKIRUVwYIHGOGO5hshzuVHyqFJajvSOA3XXOLKI,1094
|
|
30
|
+
XspecT-0.2.6.dist-info/METADATA,sha256=cO5cpS3zD45dpARRO92XG9bCfCbJ1LE-bAaIxkb0bwQ,4714
|
|
31
|
+
XspecT-0.2.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
32
|
+
XspecT-0.2.6.dist-info/entry_points.txt,sha256=L7qliX3pIuwupQxpuOSsrBJCSHYPOPNEzH8KZKQGGUw,43
|
|
33
|
+
XspecT-0.2.6.dist-info/top_level.txt,sha256=hdoa4cnBv6OVzpyhMmyxpJxEydH5n2lDciy8urc1paE,7
|
|
34
|
+
XspecT-0.2.6.dist-info/RECORD,,
|
xspect/definitions.py
CHANGED
|
@@ -40,3 +40,10 @@ def get_xspect_runs_path():
|
|
|
40
40
|
runs_path = get_xspect_root_path() / "runs"
|
|
41
41
|
runs_path.mkdir(exist_ok=True, parents=True)
|
|
42
42
|
return runs_path
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def get_xspect_mlst_path():
    """Return the path to the XspecT MLST directory.

    The directory is the ``mlst`` sub-directory of the XspecT root path and is
    created (including missing parents) if it does not exist yet.
    """
    mlst_path = get_xspect_root_path() / "mlst"
    mlst_path.mkdir(exist_ok=True, parents=True)
    return mlst_path
|
|
@@ -7,8 +7,8 @@ import requests
|
|
|
7
7
|
from xspect.definitions import get_xspect_model_path, get_xspect_tmp_path
|
|
8
8
|
|
|
9
9
|
|
|
10
|
-
def
|
|
11
|
-
"""Download
|
|
10
|
+
def download_test_models(url):
|
|
11
|
+
"""Download models."""
|
|
12
12
|
|
|
13
13
|
download_path = get_xspect_tmp_path() / "models.zip"
|
|
14
14
|
extract_path = get_xspect_tmp_path() / "extracted_models"
|
xspect/fastapi.py
CHANGED
|
@@ -5,7 +5,7 @@ from pathlib import Path
|
|
|
5
5
|
from shutil import copyfileobj
|
|
6
6
|
from fastapi import FastAPI, UploadFile, BackgroundTasks
|
|
7
7
|
from xspect.definitions import get_xspect_runs_path, get_xspect_upload_path
|
|
8
|
-
from xspect.
|
|
8
|
+
from xspect.download_models import download_test_models
|
|
9
9
|
import xspect.model_management as mm
|
|
10
10
|
from xspect.models.result import StepType
|
|
11
11
|
from xspect.pipeline import ModelExecution, Pipeline, PipelineStep
|
|
@@ -17,7 +17,7 @@ app = FastAPI()
|
|
|
17
17
|
@app.get("/download-filters")
def download_filters():
    """Download filters."""
    # The route and function still carry the "filters" name, but the work is
    # delegated to download_test_models, which fetches the models archive
    # from the fixed URL below.
    download_test_models("https://xspect2.s3.eu-central-1.amazonaws.com/models.zip")
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
@app.get("/classify")
|
xspect/main.py
CHANGED
|
@@ -6,13 +6,23 @@ import uuid
|
|
|
6
6
|
import click
|
|
7
7
|
import uvicorn
|
|
8
8
|
from xspect import fastapi
|
|
9
|
-
from xspect.
|
|
9
|
+
from xspect.download_models import download_test_models
|
|
10
10
|
from xspect.train import train_ncbi
|
|
11
11
|
from xspect.models.result import (
|
|
12
12
|
StepType,
|
|
13
13
|
)
|
|
14
|
-
from xspect.definitions import
|
|
14
|
+
from xspect.definitions import (
|
|
15
|
+
get_xspect_runs_path,
|
|
16
|
+
fasta_endings,
|
|
17
|
+
fastq_endings,
|
|
18
|
+
get_xspect_model_path,
|
|
19
|
+
)
|
|
15
20
|
from xspect.pipeline import ModelExecution, Pipeline, PipelineStep
|
|
21
|
+
from xspect.mlst_feature.mlst_helper import pick_scheme, pick_scheme_from_models_dir
|
|
22
|
+
from xspect.mlst_feature.pub_mlst_handler import PubMLSTHandler
|
|
23
|
+
from xspect.models.probabilistic_filter_mlst_model import (
|
|
24
|
+
ProbabilisticFilterMlstSchemeModel,
|
|
25
|
+
)
|
|
16
26
|
|
|
17
27
|
|
|
18
28
|
@click.group()
|
|
@@ -22,10 +32,10 @@ def cli():
|
|
|
22
32
|
|
|
23
33
|
|
|
24
34
|
@cli.command()
def download_models():
    """Download models."""
    # The docstring above doubles as the click help text of this command.
    click.echo("Downloading models, this may take a while...")
    # Fetches the models archive from the fixed URL via the shared helper.
    download_test_models("https://xspect2.s3.eu-central-1.amazonaws.com/models.zip")
|
|
29
39
|
|
|
30
40
|
|
|
31
41
|
@cli.command()
|
|
@@ -43,7 +53,7 @@ def download_filters():
|
|
|
43
53
|
help="Sparse sampling step size (e. g. only every 500th kmer for step=500).",
|
|
44
54
|
default=1,
|
|
45
55
|
)
|
|
46
|
-
def
|
|
56
|
+
def classify_species(genus, path, meta, step):
|
|
47
57
|
"""Classify sample(s) from file or directory PATH."""
|
|
48
58
|
click.echo("Classifying...")
|
|
49
59
|
click.echo(f"Step: {step}")
|
|
@@ -105,7 +115,7 @@ def classify(genus, path, meta, step):
|
|
|
105
115
|
help="SVM Sparse sampling step size (e. g. only every 500th kmer for step=500).",
|
|
106
116
|
default=1,
|
|
107
117
|
)
|
|
108
|
-
def
|
|
118
|
+
def train_species(genus, bf_assembly_path, svm_assembly_path, svm_step):
|
|
109
119
|
"""Train model."""
|
|
110
120
|
|
|
111
121
|
if bf_assembly_path or svm_assembly_path:
|
|
@@ -118,6 +128,49 @@ def train(genus, bf_assembly_path, svm_assembly_path, svm_step):
|
|
|
118
128
|
raise click.ClickException(str(e)) from e
|
|
119
129
|
|
|
120
130
|
|
|
131
|
+
@cli.command()
@click.option(
    "-c",
    "--choose_schemes",
    is_flag=True,
    # The two literals are concatenated, so the separating space is required.
    help="Choose your own schemes. "
    "Default setting is Oxford and Pasteur scheme of A.baumannii.",
)
def train_mlst(choose_schemes):
    """Download alleles and train bloom filters."""
    # choose_schemes: when set, the handler asks interactively which schemes to
    # download instead of using its default scheme list.
    click.echo("Updating alleles")
    handler = PubMLSTHandler()
    handler.download_alleles(choose_schemes)
    click.echo("Download finished")

    # The handler stores scheme directories as <mlst dir>/<species>/<scheme>,
    # so the last two path components name the species and the scheme.
    scheme_path = pick_scheme(handler.get_scheme_paths())
    species_name = scheme_path.parent.name
    scheme_name = scheme_path.name

    model = ProbabilisticFilterMlstSchemeModel(
        31, f"{species_name}:{scheme_name}", get_xspect_model_path()
    )
    click.echo("Creating mlst model")
    model.fit(scheme_path)
    model.save()
    click.echo(f"Saved at {model.cobs_path}")
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
@cli.command()
@click.option(
    "-p",
    "--path",
    help="Path to FASTA-file for mlst identification.",
    type=click.Path(exists=True, dir_okay=True, file_okay=True),
    # Without a path the command cannot run; let click report the missing
    # option instead of failing later in Path(None).
    required=True,
)
def classify_mlst(path):
    """MLST classify a sample."""
    click.echo("Classifying...")
    path = Path(path)
    # The user picks one of the previously fitted scheme models.
    scheme_path = pick_scheme_from_models_dir()
    model = ProbabilisticFilterMlstSchemeModel.load(scheme_path)
    # The prediction result is saved under the model's display name.
    model.predict(scheme_path, path).save(model.model_display_name, path)
    click.echo(f"Run saved at {get_xspect_runs_path()}.")
|
|
172
|
+
|
|
173
|
+
|
|
121
174
|
@cli.command()
|
|
122
175
|
def api():
|
|
123
176
|
"""Open the XspecT FastAPI."""
|
|
File without changes
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
""" Module for utility functions used in other modules regarding MLST. """
|
|
2
|
+
|
|
3
|
+
__author__ = "Cetin, Oemer"
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
import json
|
|
7
|
+
from io import StringIO
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from Bio import SeqIO
|
|
10
|
+
from xspect.definitions import get_xspect_model_path, get_xspect_runs_path
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def create_fasta_files(locus_path: Path, fasta_batch: str):
    """Create Fasta-Files for every allele of a locus.

    Args:
        locus_path: Directory that receives one ``Allele_ID_<number>.fasta``
            file per record.
        fasta_batch: Full text of a FASTA file containing every allele
            sequence of a locus.

    Files that already exist are left untouched.
    """
    for record in SeqIO.parse(StringIO(fasta_batch), "fasta"):
        # The allele number is the part after the last underscore,
        # example id = Oxf_cpn60_263
        number = record.id.split("_")[-1]
        output_fasta_file = locus_path / f"Allele_ID_{number}.fasta"
        if output_fasta_file.exists():
            continue  # Ignore existing ones
        # Explicit encoding keeps the output independent of the platform default.
        with open(output_fasta_file, "w", encoding="utf-8") as allele:
            SeqIO.write(record, allele, "fasta")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def pick_species_number_from_db(available_species: dict) -> str:
    """Returns the chosen species from all available ones in the database.

    Args:
        available_species: Maps the selectable number to a database name.
            The "database" string can look like this: pubmlst_abaumannii_seqdef

    Returns:
        The database name stored under the number the user entered.

    Raises:
        ValueError: If there is nothing to choose from; prompting would
            otherwise never terminate.
    """
    if not available_species:
        raise ValueError("No species database is available to choose from!")

    for counter, database in available_species.items():
        print(str(counter) + ":" + database.split("_")[1])
    print("\nPick one of the above databases")

    while True:
        choice = input("Choose a species by selecting the corresponding number:")
        try:
            # int() rejects non-numeric input, the lookup rejects unknown numbers.
            return available_species[int(choice)]
        except (ValueError, KeyError):
            print(
                "Wrong input! Try again with a number that is available in the list above."
            )
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def pick_scheme_number_from_db(available_schemes: dict) -> str:
    """Returns the chosen schemes from all available ones of a species.

    Args:
        available_schemes: Maps the selectable number to a two-element
            sequence; element 0 is printed as the label and element 1 is
            returned when that number is chosen.

    Returns:
        Element 1 of the entry stored under the number the user entered.

    Raises:
        ValueError: If there is nothing to choose from; prompting would
            otherwise never terminate.
    """
    if not available_schemes:
        raise ValueError("No scheme is available to choose from!")

    # List all available schemes of a species database
    for counter, scheme in available_schemes.items():
        print(str(counter) + ":" + scheme[0])
    print("\nPick any available scheme that is listed for download")

    while True:
        choice = input("Choose a scheme by selecting the corresponding number:")
        try:
            # int() rejects non-numeric input, the lookup rejects unknown numbers.
            entry = available_schemes[int(choice)]
        except (ValueError, KeyError):
            print(
                "Wrong input! Try again with a number that is available in the above list."
            )
        else:
            return entry[1]
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def scheme_list_to_dict(scheme_list: list[str]):
    """Converts the scheme list attribute into a dictionary with a number as the key.

    Numbering starts at 1 and follows the order of ``scheme_list``. Any
    iterable is accepted, not only sized sequences.
    """
    return dict(enumerate(scheme_list, start=1))
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def pick_scheme_from_models_dir() -> Path:
    """Let the user pick one of the entries in the "MLST" models directory.

    The entries of ``<model path>/MLST`` are sorted, numbered from 1 and
    handed to ``pick_scheme`` for selection.
    """
    mlst_models_dir = get_xspect_model_path() / "MLST"
    numbered_entries = dict(enumerate(sorted(mlst_models_dir.iterdir()), start=1))
    return pick_scheme(numbered_entries)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def pick_scheme(available_schemes: dict) -> Path:
    """Returns the chosen scheme from the scheme list.

    Args:
        available_schemes: Maps the selectable number to a scheme. Values
            starting with "http" are treated as scheme URLs whose description
            is fetched for display; all other values are shown by their last
            path component.

    Returns:
        The value stored under the chosen number. A single available scheme
        is returned without prompting.

    Raises:
        ValueError: If ``available_schemes`` is empty.
    """
    if not available_schemes:
        raise ValueError("No scheme has been chosen for download yet!")

    if len(available_schemes) == 1:
        return next(iter(available_schemes.values()))

    # Seconds to wait for the database so a stalled connection cannot hang the prompt.
    request_timeout = 60

    # List available schemes
    for counter, scheme in available_schemes.items():
        # For Strain Typing with an API-POST Request to the db
        if str(scheme).startswith("http"):
            scheme_json = requests.get(scheme, timeout=request_timeout).json()
            print(str(counter) + ":" + scheme_json["description"])

        # To pick a scheme after download for fitting
        else:
            print(str(counter) + ":" + Path(scheme).name)

    print("\nPick a scheme for strain type prediction")
    while True:
        choice = input("Choose a scheme by selecting the corresponding number:")
        try:
            # int() rejects non-numeric input, the lookup rejects unknown numbers.
            return available_schemes[int(choice)]
        except (ValueError, KeyError):
            print(
                "Wrong input! Try again with a number that is available in the above list."
            )
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class MlstResult:
    """Class for storing mlst results.

    Attributes:
        scheme_model: Name of the scheme model that produced the result.
        steps: Step value that is stored alongside the result.
        hits: Maps a sequence id to its list of hit dictionaries.
    """

    def __init__(
        self,
        scheme_model: str,
        steps: int,
        hits: dict[str, list[dict]],
    ):
        self.scheme_model = scheme_model
        self.steps = steps
        self.hits = hits

    def get_results(self) -> dict:
        """Stores the result of a prediction in a dictionary."""
        # Shallow copy so callers cannot add or remove keys of self.hits.
        return dict(self.hits)

    def to_dict(self) -> dict:
        """Converts all attributes into one dictionary."""
        return {
            "Scheme": self.scheme_model,
            "Steps": self.steps,
            "Results": self.get_results(),
        }

    def save(self, display: str, file_path: Path) -> None:
        """Saves the result inside the "runs" directory

        The file is written to ``<runs path>/MLST/<input name>-<display>.json``.

        Args:
            display: Name that is appended to the input file name.
            file_path: Path of the classified input; only its last component
                is used for the result file name.
        """
        file_name = Path(file_path).name
        json_path = get_xspect_runs_path() / "MLST" / f"{file_name}-{display}.json"
        json_path.parent.mkdir(exist_ok=True, parents=True)

        with open(json_path, "w", encoding="utf-8") as file:
            json.dump(self.to_dict(), file, indent=4)
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Module for connecting with the PubMLST database via API requests and downloading allele files."""
|
|
2
|
+
|
|
3
|
+
__author__ = "Cetin, Oemer"
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
import json
|
|
7
|
+
from xspect.mlst_feature.mlst_helper import (
|
|
8
|
+
create_fasta_files,
|
|
9
|
+
pick_species_number_from_db,
|
|
10
|
+
pick_scheme_number_from_db,
|
|
11
|
+
pick_scheme,
|
|
12
|
+
scheme_list_to_dict,
|
|
13
|
+
)
|
|
14
|
+
from xspect.definitions import get_xspect_mlst_path, get_xspect_upload_path
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PubMLSTHandler:
    """Class for communicating with PubMLST and downloading alleles (FASTA-Format) from all loci.

    Attributes:
        scheme_list: URLs of the schemes whose alleles get downloaded.
        scheme_paths: Local directories of the schemes downloaded so far.
    """

    base_url = "http://rest.pubmlst.org/db"
    # Seconds to wait for a response so a stalled connection cannot block forever.
    request_timeout = 60

    def __init__(self):
        # Default values: Oxford (1) and Pasteur (2) schemes of A.baumannii species
        self.scheme_list = [
            self.base_url + "/pubmlst_abaumannii_seqdef/schemes/1",
            self.base_url + "/pubmlst_abaumannii_seqdef/schemes/2",
        ]
        self.scheme_paths = []

    def get_scheme_paths(self) -> dict:
        """Returns the scheme paths in a dictionary"""
        return scheme_list_to_dict(self.scheme_paths)

    def choose_schemes(self) -> None:
        """Changes the scheme list attribute to feature other schemes from some species"""
        available_species = {}
        available_schemes = {}
        chosen_schemes = []

        # retrieve all available species
        species_url = PubMLSTHandler.base_url
        species_response = requests.get(species_url, timeout=self.request_timeout)
        counter = 1
        for species_databases in species_response.json():
            for database in species_databases["databases"]:
                # Only databases whose name ends with "seqdef" are offered.
                if database["name"].endswith("seqdef"):
                    available_species[counter] = database["name"]
                    counter += 1
        # pick a species out of the available ones
        chosen_species = pick_species_number_from_db(available_species)

        scheme_url = f"{species_url}/{chosen_species}/schemes"
        scheme_response = requests.get(scheme_url, timeout=self.request_timeout)
        counter = 1
        for scheme in scheme_response.json()["schemes"]:
            # scheme["description"] stores the name of a scheme.
            # scheme["scheme"] stores the URL that is needed for downloading all loci.
            available_schemes[counter] = [scheme["description"], scheme["scheme"]]
            counter += 1

        # Selection process of available scheme from a species for download (doubles are caught!)
        while True:
            chosen_scheme = pick_scheme_number_from_db(available_schemes)
            if chosen_scheme not in chosen_schemes:
                chosen_schemes.append(chosen_scheme)
            choice = input(
                "Do you want to pick another scheme to download? (y/n):"
            ).lower()
            if choice != "y":
                break
        self.scheme_list = chosen_schemes

    def download_alleles(self, choice: bool = False):
        """Downloads every allele FASTA-file from all loci of the scheme list attribute

        Args:
            choice: If true, the user first selects the schemes interactively
                instead of using the current scheme list.
        """
        if choice:  # pick an own scheme if not Oxford or Pasteur
            self.choose_schemes()  # changes the scheme_list attribute

        for scheme in self.scheme_list:
            scheme_json = requests.get(scheme, timeout=self.request_timeout).json()
            # We only want the name and the respective featured loci of a scheme
            scheme_name = scheme_json["description"]
            locus_list = scheme_json["loci"]

            species_name = scheme.split("_")[1]  # name = pubmlst_abaumannii_seqdef
            scheme_path = get_xspect_mlst_path() / species_name / scheme_name
            # Repeated downloads must not list the same scheme twice.
            if scheme_path not in self.scheme_paths:
                self.scheme_paths.append(scheme_path)

            for locus_url in locus_list:
                # After using split the last part ([-1]) of the url is the locus name
                locus_name = locus_url.split("/")[-1]
                locus_path = scheme_path / locus_name
                locus_path.mkdir(exist_ok=True, parents=True)

                alleles = requests.get(
                    f"{locus_url}/alleles_fasta", timeout=self.request_timeout
                ).text
                create_fasta_files(locus_path, alleles)

    def assign_strain_type_by_db(self):
        """Sends an API-POST-Request to the database for MLST without bloom filters"""
        scheme_url = (
            str(pick_scheme(scheme_list_to_dict(self.scheme_list))) + "/sequence"
        )
        # NOTE(review): the input file name is fixed to "Test.fna" in the upload
        # directory - confirm whether this should become a parameter.
        fasta_file = get_xspect_upload_path() / "Test.fna"
        with open(fasta_file, "r", encoding="utf-8") as file:
            data = file.read()
        payload = {  # Essential API-POST-Body
            "sequence": data,
            "filetype": "fasta",
        }
        response = requests.post(
            scheme_url, data=json.dumps(payload), timeout=self.request_timeout
        ).json()

        for locus, meta_data in response["exact_matches"].items():
            # meta_data is a list containing a dictionary, therefore [0] and then key value.
            # Example: 'Pas_fusA': [{'href': some URL, 'allele_id': '2'}]
            print(locus + ":" + meta_data[0]["allele_id"], end="; ")
        print("\nStrain Type:", response["fields"])
|
xspect/model_management.py
CHANGED
|
@@ -30,12 +30,11 @@ def get_model_by_slug(model_slug: str):
|
|
|
30
30
|
model_metadata = get_model_metadata(model_path)
|
|
31
31
|
if model_metadata["model_class"] == "ProbabilisticSingleFilterModel":
|
|
32
32
|
return ProbabilisticSingleFilterModel.load(model_path)
|
|
33
|
-
|
|
33
|
+
if model_metadata["model_class"] == "ProbabilisticFilterSVMModel":
|
|
34
34
|
return ProbabilisticFilterSVMModel.load(model_path)
|
|
35
|
-
|
|
35
|
+
if model_metadata["model_class"] == "ProbabilisticFilterModel":
|
|
36
36
|
return ProbabilisticFilterModel.load(model_path)
|
|
37
|
-
|
|
38
|
-
raise ValueError(f"Model class {model_metadata['model_class']} not recognized.")
|
|
37
|
+
raise ValueError(f"Model class {model_metadata['model_class']} not recognized.")
|
|
39
38
|
|
|
40
39
|
|
|
41
40
|
def get_model_metadata(model: str | Path):
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
"""Probabilistic filter MLST model for sequence data"""
|
|
2
|
+
|
|
3
|
+
__author__ = "Cetin, Oemer"
|
|
4
|
+
|
|
5
|
+
import cobs_index
|
|
6
|
+
import json
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from Bio import SeqIO
|
|
9
|
+
from Bio.Seq import Seq
|
|
10
|
+
from Bio.SeqRecord import SeqRecord
|
|
11
|
+
from cobs_index import DocumentList
|
|
12
|
+
from collections import defaultdict
|
|
13
|
+
from xspect.file_io import get_record_iterator
|
|
14
|
+
from xspect.mlst_feature.mlst_helper import MlstResult
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ProbabilisticFilterMlstSchemeModel:
|
|
18
|
+
"""Probabilistic filter MLST scheme model for sequence data"""
|
|
19
|
+
|
|
20
|
+
def __init__(
|
|
21
|
+
self,
|
|
22
|
+
k: int,
|
|
23
|
+
model_display_name: str,
|
|
24
|
+
base_path: Path,
|
|
25
|
+
fpr: float = 0.001,
|
|
26
|
+
) -> None:
|
|
27
|
+
if k < 1:
|
|
28
|
+
raise ValueError("Invalid k value, must be greater than 0")
|
|
29
|
+
if not isinstance(base_path, Path):
|
|
30
|
+
raise ValueError("Invalid base path, must be a pathlib.Path object")
|
|
31
|
+
|
|
32
|
+
self.k = k
|
|
33
|
+
self.model_display_name = model_display_name
|
|
34
|
+
self.base_path = base_path / "MLST"
|
|
35
|
+
self.fpr = fpr
|
|
36
|
+
self.model_type = "Strain"
|
|
37
|
+
self.loci = {}
|
|
38
|
+
self.scheme_path = ""
|
|
39
|
+
self.cobs_path = ""
|
|
40
|
+
self.avg_locus_bp_size = []
|
|
41
|
+
self.indices = []
|
|
42
|
+
|
|
43
|
+
def to_dict(self) -> dict:
|
|
44
|
+
"""Returns a dictionary representation of the model"""
|
|
45
|
+
return {
|
|
46
|
+
"k": self.k,
|
|
47
|
+
"model_display_name": self.model_display_name,
|
|
48
|
+
"model_type": self.model_type,
|
|
49
|
+
"fpr": self.fpr,
|
|
50
|
+
"scheme_path": str(self.scheme_path),
|
|
51
|
+
"cobs_path": str(self.cobs_path),
|
|
52
|
+
"average_locus_base_pair_size": self.avg_locus_bp_size,
|
|
53
|
+
"loci": self.loci,
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
def get_cobs_index_path(self, scheme: str, locus: str) -> Path:
|
|
57
|
+
"""Returns the path to the cobs index"""
|
|
58
|
+
# To differentiate from genus and species models
|
|
59
|
+
cobs_path = self.base_path / f"{scheme}"
|
|
60
|
+
cobs_path.mkdir(exist_ok=True, parents=True)
|
|
61
|
+
return cobs_path / f"{locus}.cobs_compact"
|
|
62
|
+
|
|
63
|
+
def fit(self, scheme_path: Path) -> None:
|
|
64
|
+
"""Trains a COBS structure for every locus with all its alleles"""
|
|
65
|
+
if not scheme_path.exists():
|
|
66
|
+
raise ValueError(
|
|
67
|
+
"Scheme not found. Please make sure to download the schemes prior!"
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
scheme = str(scheme_path).split("/")[-1]
|
|
71
|
+
cobs_path = ""
|
|
72
|
+
# COBS structure for every locus (default = 7 for Oxford or Pasteur scheme)
|
|
73
|
+
for locus_path in sorted(scheme_path.iterdir()):
|
|
74
|
+
locus = str(locus_path).split("/")[-1]
|
|
75
|
+
# counts all fasta files that belong to a locus
|
|
76
|
+
self.loci[locus] = sum(
|
|
77
|
+
(1 for _ in locus_path.iterdir() if not str(_).endswith("cache"))
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
# determine the avg base pair size of alleles
|
|
81
|
+
fasta_file = next(locus_path.glob("*.fasta"), None)
|
|
82
|
+
with open(fasta_file, "r") as handle:
|
|
83
|
+
record = next(SeqIO.parse(handle, "fasta"))
|
|
84
|
+
self.avg_locus_bp_size.append(len(record.seq))
|
|
85
|
+
|
|
86
|
+
# COBS only accepts strings as paths
|
|
87
|
+
doclist = DocumentList(str(locus_path))
|
|
88
|
+
index_params = cobs_index.CompactIndexParameters()
|
|
89
|
+
index_params.term_size = self.k # k-mer size
|
|
90
|
+
index_params.clobber = True # overwrite output and temporary files
|
|
91
|
+
index_params.false_positive_rate = self.fpr
|
|
92
|
+
|
|
93
|
+
# Creates COBS data structure for each locus
|
|
94
|
+
cobs_path = self.get_cobs_index_path(scheme, locus)
|
|
95
|
+
cobs_index.compact_construct_list(doclist, str(cobs_path), index_params)
|
|
96
|
+
# Saves COBS-file inside the "indices" attribute
|
|
97
|
+
self.indices.append(cobs_index.Search(str(cobs_path)))
|
|
98
|
+
|
|
99
|
+
self.scheme_path = scheme_path
|
|
100
|
+
self.cobs_path = cobs_path.parent
|
|
101
|
+
|
|
102
|
+
def save(self) -> None:
|
|
103
|
+
"""Saves the model to disk"""
|
|
104
|
+
scheme = str(self.scheme_path).split("/")[
|
|
105
|
+
-1
|
|
106
|
+
] # [-1] -> contains the scheme name
|
|
107
|
+
json_path = self.base_path / scheme / f"{scheme}.json"
|
|
108
|
+
json_object = json.dumps(self.to_dict(), indent=4)
|
|
109
|
+
|
|
110
|
+
with open(json_path, "w", encoding="utf-8") as file:
|
|
111
|
+
file.write(json_object)
|
|
112
|
+
|
|
113
|
+
@staticmethod
def load(scheme_path: Path) -> "ProbabilisticFilterMlstSchemeModel":
    """Rebuild a model from the JSON file stored in ``scheme_path``.

    The file ``<scheme_path>/<name>.json`` is read, where ``<name>`` is the
    last component of ``scheme_path``. The model is constructed from the
    stored ``k``, ``model_display_name`` and ``fpr`` with the scheme directory
    as its base path; ``scheme_path``, ``cobs_path``, the average locus sizes
    and the loci are restored from the JSON. Every entry of the directory
    that is not a ``.json`` file is then opened as a COBS index, in sorted
    order.

    Raises:
        FileNotFoundError: if the JSON file is missing, or if a directory
            entry does not exist when it is checked (e.g. a dangling link).
    """
    # Path(...).name instead of str.split("/") so that the scheme name is
    # extracted correctly regardless of the platform's path separator.
    scheme_path = Path(scheme_path)
    json_path = scheme_path / f"{scheme_path.name}.json"
    with open(json_path, "r", encoding="utf-8") as file:
        model_json = json.load(file)

    model = ProbabilisticFilterMlstSchemeModel(
        model_json["k"],
        model_json["model_display_name"],
        json_path.parent,
        model_json["fpr"],
    )
    model.scheme_path = model_json["scheme_path"]
    model.cobs_path = model_json["cobs_path"]
    model.avg_locus_bp_size = model_json["average_locus_base_pair_size"]
    model.loci = model_json["loci"]

    # Sorted so that the index order is deterministic between runs.
    for entry in sorted(json_path.parent.iterdir()):
        if not entry.exists():
            raise FileNotFoundError(f"Index file not found at {entry}")
        if entry.name.endswith(".json"):  # only COBS-files
            continue
        # NOTE(review): the meaning of the second positional argument (False)
        # is not visible here - confirm against the cobs_index documentation.
        model.indices.append(cobs_index.Search(str(entry), False))
    return model
|
|
139
|
+
|
|
140
|
+
def calculate_hits(self, path: Path, sequence: Seq, step: int = 1) -> list[dict]:
    """Query every locus index with ``sequence`` and collect the hits.

    Locus names are derived from the non-JSON entries of ``path`` (file name
    up to the first "."), taken in sorted order and paired by position with
    ``self.indices``.

    Sequences shorter than 10000 characters are searched in one piece.
    Longer ones are cut into chunks by ``sequence_splitter`` (using the
    locus' entry in ``self.avg_locus_bp_size``) and the per-chunk scores
    returned by ``get_cobs_result`` are summed per document.

    Args:
        path: Directory containing the COBS index files of the scheme.
        sequence: The query sequence.
        step: Passed through to the index search.

    Returns:
        ``[{"Strain type": best}, {"All results": everything}]`` where both
        inner dictionaries are keyed by locus name.

    Raises:
        ValueError: if ``sequence`` is not a ``Seq``, is not longer than
            ``k``, or if the model holds no indices.
    """
    if not isinstance(sequence, Seq):
        raise ValueError("Invalid sequence, must be a Bio.Seq object")

    if not len(sequence) > self.k:
        raise ValueError("Invalid sequence, must be longer than k")

    if not self.indices:
        raise ValueError("The Model has not been trained yet")

    # entry.name instead of splitting on "/" keeps this separator-agnostic.
    locus_names = [
        entry.name.split(".")[0]  # without the file ending
        for entry in sorted(path.iterdir())
        if not entry.name.endswith(".json")
    ]

    query = str(sequence)  # COBS can't handle Seq-Objects
    split_query = len(query) >= 10000

    result_dict = {}
    highest_results = {}
    for counter, index in enumerate(self.indices):
        locus = locus_names[counter]

        if not split_query:
            res = index.search(query, step=step)
            result_dict[locus] = self.get_cobs_result(res)
            highest_results[locus] = self.get_highest_cobs_result(res)
            continue

        # Long sequence: search chunk by chunk and add up the scores.
        all_counts = defaultdict(int)
        allele_len = self.avg_locus_bp_size[counter]
        for split in self.sequence_splitter(query, allele_len):
            split_result = self.get_cobs_result(index.search(split, step=step))
            for name, value in split_result.items():
                all_counts[name] += value

        sorted_counts = dict(sorted(all_counts.items(), key=lambda item: -item[1]))
        result_dict[locus] = sorted_counts

        # No chunk may have produced a hit for this locus; previously this
        # case raised StopIteration from next(iter(...)).
        if sorted_counts:
            first_key = next(iter(sorted_counts))
            highest_results[locus] = {first_key: sorted_counts[first_key]}
        else:
            highest_results[locus] = {}

    return [{"Strain type": highest_results}, {"All results": result_dict}]
|
|
198
|
+
|
|
199
|
+
def predict(
    self,
    cobs_path: Path,
    sequence_input: (
        SeqRecord
        | list[SeqRecord]
        | SeqIO.FastaIO.FastaIterator
        | SeqIO.QualityIO.FastqPhredIterator
        | Path
    ),
    step: int = 1,
) -> MlstResult:
    """Return the MLST hits for one or several sequences.

    Args:
        cobs_path: Directory with the COBS index files, forwarded to
            ``calculate_hits``.
        sequence_input: A single record, a list of records, a FASTA/FASTQ
            record iterator, or a path that ``get_record_iterator`` turns
            into such an iterator.
        step: Forwarded to ``calculate_hits`` and stored in the result.

    Returns:
        An ``MlstResult`` built from the model display name, ``step`` and a
        dictionary mapping each record id to its ``calculate_hits`` output.

    Raises:
        ValueError: if ``sequence_input`` is of none of the supported types.
    """
    if isinstance(sequence_input, SeqRecord):
        # Records without an id get a placeholder so they can be used as key.
        if sequence_input.id == "<unknown id>":
            sequence_input.id = "test"
        hits = {
            # step was previously dropped here, so the search always ran with
            # the default although the result claimed otherwise.
            sequence_input.id: self.calculate_hits(
                cobs_path, sequence_input.seq, step
            )
        }
        return MlstResult(self.model_display_name, step, hits)

    if isinstance(sequence_input, Path):
        return ProbabilisticFilterMlstSchemeModel.predict(
            self, cobs_path, get_record_iterator(sequence_input), step=step
        )

    # Lists are part of the annotated contract and are handled like iterators.
    if isinstance(
        sequence_input,
        (list, SeqIO.FastaIO.FastaIterator, SeqIO.QualityIO.FastqPhredIterator),
    ):
        hits = {}
        # individual_seq is a SeqRecord-Object
        for individual_seq in sequence_input:
            hits[individual_seq.id] = self.calculate_hits(
                cobs_path, individual_seq.seq, step
            )
        return MlstResult(self.model_display_name, step, hits)

    raise ValueError(
        "Invalid sequence input, must be a Seq object, a list of Seq objects, a"
        " SeqIO FastaIterator, or a SeqIO FastqPhredIterator"
    )
|
|
240
|
+
|
|
241
|
+
def get_highest_cobs_result(self, cobs_result: cobs_index.SearchResult) -> dict:
|
|
242
|
+
"""Returns the first entry in a COBS search result."""
|
|
243
|
+
# counter = 1
|
|
244
|
+
# dictio = {}
|
|
245
|
+
for individual_result in cobs_result:
|
|
246
|
+
# COBS already sorts the result in descending order
|
|
247
|
+
# The first doc_name has the highest result which is needed to determine the allele
|
|
248
|
+
return {individual_result.doc_name: individual_result.score}
|
|
249
|
+
|
|
250
|
+
def get_cobs_result(
    self, cobs_result: "cobs_index.SearchResult", min_score: int = 50
) -> dict:
    """Return the entries of a COBS search result as ``{doc_name: score}``.

    Args:
        cobs_result: Iterable of results exposing ``doc_name`` and ``score``.
        min_score: Exclusive lower bound; only entries with a score strictly
            greater than this value are kept. Defaults to 50, the threshold
            that used to be hard-coded.

    Returns:
        The filtered entries in the order in which they were yielded.
    """
    return {
        individual_result.doc_name: individual_result.score
        for individual_result in cobs_result
        if individual_result.score > min_score
    }
|
|
257
|
+
|
|
258
|
+
def sequence_splitter(self, input_sequence: str, allele_len: int) -> list[str]:
    """Cut a sequence into chunks that overlap by ``k - 1`` characters.

    The chunk length depends on the length of the input:

    * below 100000 characters: ``allele_len // 10``
    * below 1000000 characters: ``allele_len``
    * below 10000000 characters: ``allele_len * 10``
    * otherwise: ``allele_len * 100``

    and is never smaller than ``self.k``. Consecutive chunks start
    ``chunk length - k + 1`` characters apart, so every k-mer of the input
    lies in exactly one chunk.

    Args:
        input_sequence: The sequence to split.
        allele_len: Reference allele length used to size the chunks.

    Returns:
        The chunks in input order; empty if the input is shorter than ``k``.
    """
    sequence_len = len(input_sequence)

    if sequence_len < 100000:
        substring_length = allele_len // 10
    elif sequence_len < 1000000:
        substring_length = allele_len
    elif sequence_len < 10000000:
        substring_length = allele_len * 10
    else:
        substring_length = allele_len * 100

    # A chunk shorter than k holds no k-mer, and it would make the stride
    # below zero or negative, i.e. the loop would never terminate.
    substring_length = max(substring_length, self.k)
    stride = substring_length - self.k + 1  # To not lose kmers when dividing

    substring_list = []
    start = 0
    while start + substring_length <= sequence_len:
        substring_list.append(input_sequence[start : start + substring_length])
        start += stride

    # Whatever is left forms a final, shorter chunk - but only if it can hold
    # a k-mer. A leftover shorter than k is exactly the k - 1 overlap that is
    # already part of the previous chunk; gluing it onto that chunk (as was
    # done before) fabricated k-mers that do not occur in the sequence.
    remaining_substring = input_sequence[start:]
    if len(remaining_substring) >= self.k:
        substring_list.append(remaining_substring)
    return substring_list
|
|
@@ -8,6 +8,7 @@ from Bio.SeqRecord import SeqRecord
|
|
|
8
8
|
from Bio import SeqIO
|
|
9
9
|
from slugify import slugify
|
|
10
10
|
import cobs_index as cobs
|
|
11
|
+
from xspect.definitions import fasta_endings, fastq_endings
|
|
11
12
|
from xspect.file_io import get_record_iterator
|
|
12
13
|
from xspect.models.result import ModelResult
|
|
13
14
|
|
|
@@ -64,10 +65,6 @@ class ProbabilisticFilterModel:
|
|
|
64
65
|
"num_hashes": self.num_hashes,
|
|
65
66
|
}
|
|
66
67
|
|
|
67
|
-
def __dict__(self) -> dict:
|
|
68
|
-
"""Returns a dictionary representation of the model"""
|
|
69
|
-
return self.to_dict()
|
|
70
|
-
|
|
71
68
|
def slug(self) -> str:
|
|
72
69
|
"""Returns a slug representation of the model"""
|
|
73
70
|
return slugify(self.model_display_name + "-" + str(self.model_type))
|
|
@@ -89,13 +86,7 @@ class ProbabilisticFilterModel:
|
|
|
89
86
|
|
|
90
87
|
doclist = cobs.DocumentList()
|
|
91
88
|
for file in dir_path.iterdir():
|
|
92
|
-
if file.is_file() and file.suffix in
|
|
93
|
-
".fasta",
|
|
94
|
-
".fna",
|
|
95
|
-
".fa",
|
|
96
|
-
".fastq",
|
|
97
|
-
".fq",
|
|
98
|
-
]:
|
|
89
|
+
if file.is_file() and file.suffix[1:] in fasta_endings + fastq_endings:
|
|
99
90
|
# cobs only uses the file name to the first "." as the document name
|
|
100
91
|
if file.name in display_names:
|
|
101
92
|
self.display_names[file.name.split(".")[0]] = display_names[
|
|
@@ -65,8 +65,11 @@ class ProbabilisticFilterSVMModel(ProbabilisticFilterModel):
|
|
|
65
65
|
) -> None:
|
|
66
66
|
"""Fit the SVM to the sequences and labels"""
|
|
67
67
|
|
|
68
|
+
# Since the SVM works with score data, we need to train
|
|
69
|
+
# the underlying data structure for score generation first
|
|
68
70
|
super().fit(dir_path, display_names=display_names)
|
|
69
71
|
|
|
72
|
+
# calculate scores for SVM training
|
|
70
73
|
score_list = []
|
|
71
74
|
for file in svm_path.iterdir():
|
|
72
75
|
if not file.is_file():
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Base probabilistic filter model for sequence data"""
|
|
2
2
|
|
|
3
3
|
# pylint: disable=no-name-in-module, too-many-instance-attributes
|
|
4
4
|
|
|
@@ -14,7 +14,7 @@ from xspect.file_io import get_record_iterator
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
17
|
-
"""
|
|
17
|
+
"""Base probabilistic filter model for sequence data"""
|
|
18
18
|
|
|
19
19
|
def __init__(
|
|
20
20
|
self,
|
|
@@ -25,7 +25,6 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
|
25
25
|
model_type: str,
|
|
26
26
|
base_path: Path,
|
|
27
27
|
fpr: float = 0.01,
|
|
28
|
-
num_hashes: int = 7,
|
|
29
28
|
) -> None:
|
|
30
29
|
super().__init__(
|
|
31
30
|
k=k,
|
|
@@ -35,12 +34,12 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
|
35
34
|
model_type=model_type,
|
|
36
35
|
base_path=base_path,
|
|
37
36
|
fpr=fpr,
|
|
38
|
-
num_hashes=
|
|
37
|
+
num_hashes=1,
|
|
39
38
|
)
|
|
40
39
|
self.bf = None
|
|
41
40
|
|
|
42
41
|
def fit(self, file_path: Path, display_name: str) -> None:
|
|
43
|
-
"""Fit the
|
|
42
|
+
"""Fit the cobs classic index to the sequences and labels"""
|
|
44
43
|
# estimate number of kmers
|
|
45
44
|
total_length = 0
|
|
46
45
|
for record in get_record_iterator(file_path):
|
|
@@ -89,7 +88,6 @@ class ProbabilisticSingleFilterModel(ProbabilisticFilterModel):
|
|
|
89
88
|
model_json["model_type"],
|
|
90
89
|
path.parent,
|
|
91
90
|
fpr=model_json["fpr"],
|
|
92
|
-
num_hashes=model_json["num_hashes"],
|
|
93
91
|
)
|
|
94
92
|
model.display_names = model_json["display_names"]
|
|
95
93
|
bloom_path = model.base_path / model.slug() / "filter.bloom"
|
xspect/models/result.py
CHANGED
|
@@ -5,10 +5,11 @@ from enum import Enum
|
|
|
5
5
|
|
|
6
6
|
def get_last_processing_step(result: "ModelResult") -> "ModelResult":
    """Return the deepest result reached via the last subprocessing steps.

    Starting at ``result``, the function repeatedly descends into the
    ``result`` of the final entry of ``subprocessing_steps`` until it reaches
    a result that has no subprocessing steps. Only this single path is
    followed; all other entries of ``subprocessing_steps`` are ignored.
    """
    # traverse result tree to get last step
    while result.subprocessing_steps:
        result = result.subprocessing_steps[-1].result
    return result
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
class StepType(Enum):
|
|
@@ -82,9 +83,9 @@ class ModelResult:
|
|
|
82
83
|
scores = {
|
|
83
84
|
subsequence: {
|
|
84
85
|
label: round(hits / self.num_kmers[subsequence], 2)
|
|
85
|
-
for label, hits in
|
|
86
|
+
for label, hits in subsequence_hits.items()
|
|
86
87
|
}
|
|
87
|
-
for subsequence,
|
|
88
|
+
for subsequence, subsequence_hits in self.hits.items()
|
|
88
89
|
}
|
|
89
90
|
|
|
90
91
|
# calculate total scores
|
xspect/train.py
CHANGED
|
@@ -40,7 +40,7 @@ def check_user_input(user_input: str):
|
|
|
40
40
|
rank = metadata["rank"]
|
|
41
41
|
lineage = metadata["lineage"]
|
|
42
42
|
bacteria_id = 2
|
|
43
|
-
if
|
|
43
|
+
if user_input not in (sci_name, tax_id):
|
|
44
44
|
print(
|
|
45
45
|
f"{get_current_time()}| The given genus: {user_input} was found as"
|
|
46
46
|
f" genus: {sci_name} ID: {tax_id}"
|
|
@@ -60,38 +60,6 @@ def check_user_input(user_input: str):
|
|
|
60
60
|
sys.exit()
|
|
61
61
|
|
|
62
62
|
|
|
63
|
-
def copy_custom_data(bf_path: str, svm_path: str, dir_name: str):
|
|
64
|
-
"""
|
|
65
|
-
|
|
66
|
-
:param bf_path:
|
|
67
|
-
:param svm_path:
|
|
68
|
-
:param dir_name:
|
|
69
|
-
:return:
|
|
70
|
-
"""
|
|
71
|
-
path = Path(os.getcwd()) / "genus_metadata" / dir_name
|
|
72
|
-
new_bf_path = path / "concatenate"
|
|
73
|
-
new_svm_path = path / "training_data"
|
|
74
|
-
|
|
75
|
-
# Make the new directories.
|
|
76
|
-
path.mkdir(exist_ok=True)
|
|
77
|
-
new_bf_path.mkdir(exist_ok=True)
|
|
78
|
-
new_svm_path.mkdir(exist_ok=True)
|
|
79
|
-
|
|
80
|
-
# Move bloomfilter files.
|
|
81
|
-
bf_files = os.listdir(bf_path)
|
|
82
|
-
for file in bf_files:
|
|
83
|
-
file_path = Path(bf_path) / file
|
|
84
|
-
new_file_path = new_bf_path / file
|
|
85
|
-
shutil.copy2(file_path, new_file_path)
|
|
86
|
-
|
|
87
|
-
# Move svm files.
|
|
88
|
-
svm_files = os.listdir(svm_path)
|
|
89
|
-
for file in svm_files:
|
|
90
|
-
file_path = Path(svm_path) / file
|
|
91
|
-
new_file_path = new_svm_path / file
|
|
92
|
-
shutil.copy2(file_path, new_file_path)
|
|
93
|
-
|
|
94
|
-
|
|
95
63
|
def set_logger(dir_name: str):
|
|
96
64
|
"""Sets the logger parameters.
|
|
97
65
|
|
XspecT-0.2.5.dist-info/RECORD
DELETED
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
xspect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
xspect/definitions.py,sha256=gg6NvT8ypNzlnJvMMo3nHsyh8DHFFu41lOfnILkRDpE,1215
|
|
3
|
-
xspect/download_filters.py,sha256=ByE7Oggx-AyJ02Wirk_wcJHNdRDrJMfjwhmUe5tgWbE,741
|
|
4
|
-
xspect/fastapi.py,sha256=C8pBBiqM6UdedLZgzfL_YYRuy98aPj8dcw_CLFrtMMc,3260
|
|
5
|
-
xspect/file_io.py,sha256=zKhl6Fd9KZAYiD8YgIyje5TbDYk5lxMp1WUrNkGSBo8,2779
|
|
6
|
-
xspect/main.py,sha256=eOA9PAeq3VvPWWoOZxXFErvPNW-ANzOxqMsbQJPCvDw,3651
|
|
7
|
-
xspect/model_management.py,sha256=w0aqjLUoixCokyKTYrcN1vih5IoLYLJG9p8aeYdVc8Y,3560
|
|
8
|
-
xspect/pipeline.py,sha256=h7duhVZ-hupwO_KQPstzFo8KMfMI2yleb9HmtTiMjic,7219
|
|
9
|
-
xspect/run.py,sha256=OJ7pCFqva3AhIYklKjVnqWGooVRO7S3b56kIAy-xabY,1189
|
|
10
|
-
xspect/train.py,sha256=khC1lldqfr4NvzLUiSJjSlh7DBG1ePielvQMiB29Hl8,10399
|
|
11
|
-
xspect/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
-
xspect/models/probabilistic_filter_model.py,sha256=ImyNRzR7jf2CBPGI65ItG0_eYmrQjo9soQYlsM0r-P0,9829
|
|
13
|
-
xspect/models/probabilistic_filter_svm_model.py,sha256=Z_aAigE_fC_gm80hRfxvROHGs6LuBqZnATHPpAkQGQE,5466
|
|
14
|
-
xspect/models/probabilistic_single_filter_model.py,sha256=nDAd_-_Ci2eH0KOJtf4wA-w63FMq9rGSR1LGiIA-gdw,3884
|
|
15
|
-
xspect/models/result.py,sha256=vHUEFXvbFyB8WmasXp99IrztjwaxH1f9QMFiRUPe40Q,4824
|
|
16
|
-
xspect/train_filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
-
xspect/train_filter/create_svm.py,sha256=w6gq40yHINVfNzLhJfYFykUaNCwpU9AEDcbkUfis3DY,1504
|
|
18
|
-
xspect/train_filter/extract_and_concatenate.py,sha256=lLrczGgfZi2vAGqxq8fcEmJi5pvqyK33JkB_ZoCNYG8,4840
|
|
19
|
-
xspect/train_filter/html_scrap.py,sha256=76VV_ZbvD2I3IxRb62SiQwRPu2tr4fwn1HkfJQYaosM,3809
|
|
20
|
-
xspect/train_filter/ncbi_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
|
-
xspect/train_filter/ncbi_api/download_assemblies.py,sha256=MB_mxSjCTL05DqIt1WQem8AGU3PjtJnzPndeI9J-AOI,1285
|
|
22
|
-
xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py,sha256=puzDIws-yyBAEHwSAIYUM7g8FpLFmvOKh5xH1EsY8ZE,3830
|
|
23
|
-
xspect/train_filter/ncbi_api/ncbi_children_tree.py,sha256=_8puOsnsKp5lsMV2gZY1ijkfD_BZKG9eXZCX09qph5E,1819
|
|
24
|
-
xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py,sha256=O6JDXC4E6AYaf7NPnb34eSJyZhMB8r--bjoVF_ZsEdA,1868
|
|
25
|
-
XspecT-0.2.5.dist-info/LICENSE,sha256=bhBGDKIRUVwYIHGOGO5hshzuVHyqFJajvSOA3XXOLKI,1094
|
|
26
|
-
XspecT-0.2.5.dist-info/METADATA,sha256=NDw2i1MawAjAkybDXzaQfIIGFI4sw86MSlQJ8z6vkWs,4834
|
|
27
|
-
XspecT-0.2.5.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
|
|
28
|
-
XspecT-0.2.5.dist-info/entry_points.txt,sha256=L7qliX3pIuwupQxpuOSsrBJCSHYPOPNEzH8KZKQGGUw,43
|
|
29
|
-
XspecT-0.2.5.dist-info/top_level.txt,sha256=hdoa4cnBv6OVzpyhMmyxpJxEydH5n2lDciy8urc1paE,7
|
|
30
|
-
XspecT-0.2.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|