XspecT 0.5.3-py3-none-any.whl → 0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of XspecT might be problematic.
- xspect/classify.py +38 -8
- xspect/definitions.py +30 -10
- xspect/file_io.py +2 -1
- xspect/filter_sequences.py +20 -4
- xspect/main.py +126 -28
- xspect/misclassification_detection/__init__.py +0 -0
- xspect/misclassification_detection/mapping.py +168 -0
- xspect/misclassification_detection/point_pattern_analysis.py +102 -0
- xspect/misclassification_detection/simulate_reads.py +55 -0
- xspect/mlst_feature/mlst_helper.py +15 -19
- xspect/mlst_feature/pub_mlst_handler.py +16 -19
- xspect/model_management.py +14 -17
- xspect/models/probabilistic_filter_mlst_model.py +11 -10
- xspect/models/probabilistic_filter_model.py +142 -8
- xspect/models/probabilistic_filter_svm_model.py +29 -14
- xspect/models/probabilistic_single_filter_model.py +9 -7
- xspect/models/result.py +22 -15
- xspect/ncbi.py +82 -7
- xspect/train.py +21 -4
- xspect/web.py +13 -4
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/METADATA +4 -1
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/RECORD +26 -22
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/WHEEL +0 -0
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/entry_points.txt +0 -0
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {xspect-0.5.3.dist-info → xspect-0.6.0.dist-info}/top_level.txt +0 -0
xspect/misclassification_detection/point_pattern_analysis.py
ADDED
@@ -0,0 +1,102 @@
+"""
+Point pattern density analysis tool for the alignment-based misclassification detection.
+
+Notes:
+    Developed by Oemer Cetin as part of a BSc thesis (2025), Goethe University Frankfurt am Main.
+    (An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial Taxon Assignment)
+"""
+
+import numpy
+
+__author__ = "Cetin, Oemer"
+
+
+class PointPatternAnalysis:
+    """Class for all point pattern density analysis procedures."""
+
+    def __init__(self, points: list[int], length: int):
+        """
+        Initialise the class for point pattern analysis.
+
+        This method sets up the required list with data points (sorted) and the length of the reference genome.
+        All required intensity for the statistics is also calculated.
+
+        Args:
+            points (list): The start coordinates of mapped regions on the genome.
+            length (int): The length of the reference genome.
+        """
+        if len(points) < 2:
+            raise ValueError("Need at least 2 points.")
+        self.sorted_points = numpy.sort(numpy.asarray(points, dtype=float))
+        self.n = len(points)
+        self.length = float(length)
+
+    def ripleys_k(self) -> tuple[bool, float, float]:
+        """
+        Calculates the K-function for the given point distribution.
+
+        This method calculates the K-function to describe the point distribution.
+        The result is then compared with what would be expected under a completely random distribution.
+        (Under complete randomness the K-function result is 2*r)
+
+        Returns:
+            tuple: A tuple containing the information whether points are clustered or not.
+        """
+        r = 0.01 * self.length
+        left = 0
+        right = 0
+        total_neighbors = 0
+
+        for i in range(self.n):
+            while self.sorted_points[i] - self.sorted_points[left] > r:
+                left += 1
+            if right < i:
+                right = i
+            while (
+                right + 1 < self.n
+                and self.sorted_points[right + 1] - self.sorted_points[i] <= r
+            ):
+                right += 1
+            total_neighbors += right - left
+        k = (self.length / (self.n * (self.n - 1))) * total_neighbors
+        return (k > 2 * r), k, 2 * r
+
+    def ripleys_k_edge_corrected(self) -> tuple[bool, float, float]:
+        """
+        Calculates the K-function for the given point distribution with an edge correction factor.
+
+        This method calculates the K-function to describe the point distribution.
+        This time an additional factor is multiplied for each data point to account for edge effects.
+        The result is then compared with what would be expected under a completely random distribution.
+        (Under complete randomness the K-function result is 2*r)
+
+        Returns:
+            tuple: A tuple containing the information whether the points are clustered or not.
+        """
+        r = 0.01 * self.length
+        left = 0
+        right = 0
+        total_weighted = 0
+
+        for i in range(self.n):
+            while self.sorted_points[i] - self.sorted_points[left] > r:
+                left += 1
+            if right < i:
+                right = i
+            while (
+                right + 1 < self.n
+                and self.sorted_points[right + 1] - self.sorted_points[i] <= r
+            ):
+                right += 1
+
+            neighbors = right - left
+            if neighbors > 0:
+                a = max(0, self.sorted_points[i] - r)
+                b = min(self.length, self.sorted_points[i] + r)
+                overlap = b - a
+                weight = (2 * r) / overlap if overlap > 0 else 0
+
+                total_weighted += weight * neighbors
+
+        k = (self.length / (self.n * (self.n - 1))) * total_weighted
+        return (bool(k > 2 * r)), float(k), 2 * r
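For orientation, here is a minimal usage sketch of the new class; the import path follows the file list above, and the coordinates are invented for illustration. With length=10_000 the search radius is r = 100 and the CSR expectation is 2*r = 200, so tightly grouped start coordinates drive K far above 200 while evenly spaced ones keep it at 0.

from xspect.misclassification_detection.point_pattern_analysis import (
    PointPatternAnalysis,
)

genome_length = 10_000
clustered = [100, 120, 135, 150, 160, 9_800]  # most points within one r-window
spread = list(range(0, 10_000, 500))  # evenly spaced points, gaps > r

for label, points in (("clustered", clustered), ("spread", spread)):
    is_clustered, k, expectation = PointPatternAnalysis(points, genome_length).ripleys_k()
    # clustered: K ~ 6666.7 > 200 -> True; spread: K = 0 < 200 -> False
    print(f"{label}: K={k:.1f}, 2r={expectation:.1f}, clustered={is_clustered}")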
xspect/misclassification_detection/simulate_reads.py
ADDED
@@ -0,0 +1,55 @@
+"""
+Read simulation for the alignment-based misclassification detection (Used for testing purposes).
+
+Notes:
+    Developed by Oemer Cetin as part of a BSc thesis at Goethe University Frankfurt am Main (2025).
+    (An Integration of Alignment-Free and Alignment-Based Approaches for Bacterial Taxon Assignment)
+"""
+
+import random
+from Bio import SeqIO
+
+__author__ = "Cetin, Oemer"
+
+
+def extract_random_reads(
+    fasta_file, output_fasta, read_length=150, num_reads=1000, seed=42
+) -> None:
+    """
+    Uniformly extracts reads from a genome and writes them to a FASTA file.
+
+    Args:
+        fasta_file (str): Path to input FASTA file.
+        output_fasta (str): Output FASTA file to write simulated reads.
+        read_length (int): Length of each read to extract.
+        num_reads (int): Total number of reads to extract.
+        seed (int): A seed for reproducibility.
+
+    Raises:
+        ValueError: If the sequences are shorter than the chosen read length.
+    """
+    random.seed(seed)
+    sequences = [
+        record
+        for record in SeqIO.parse(fasta_file, "fasta")
+        if len(record.seq) >= read_length
+    ]
+    if not sequences:
+        raise ValueError("No sequences long enough for the desired read length.")
+
+    # Probability to extract reads from large contigs is higher
+    seq_lengths = [len(rec.seq) for rec in sequences]
+    total_length = sum(seq_lengths)
+    weights = [single_length / total_length for single_length in seq_lengths]
+
+    with open(output_fasta, "w") as o:
+        for i in range(num_reads):
+            # random.choices() provides a list!
+            selected = random.choices(sequences, weights=weights, k=1)[0]
+            seq_length = len(selected.seq)
+            start = random.randint(0, seq_length - read_length)
+            read_seq = selected.seq[start : start + read_length]
+            o.write(
+                f">read_{i}_{selected.id}_{start}-{start + read_length}\n{read_seq}\n"
+            )
+    print("The reads have been simulated successfully.")
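A hypothetical invocation of the new simulator (file names are placeholders). Note that contigs shorter than read_length are filtered out before sampling, and longer contigs are proportionally more likely to be drawn:

from xspect.misclassification_detection.simulate_reads import extract_random_reads

extract_random_reads(
    "assembly.fasta",  # placeholder input genome/assembly
    "simulated_reads.fasta",  # placeholder output file
    read_length=150,
    num_reads=1000,
    seed=42,  # fixed seed keeps the sampled reads reproducible
)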
xspect/mlst_feature/mlst_helper.py
CHANGED
@@ -2,10 +2,10 @@
 
 __author__ = "Cetin, Oemer"
 
-import requests
 import json
-from io import StringIO
 from pathlib import Path
+from io import StringIO
+import requests
 from Bio import SeqIO
 from xspect.definitions import get_xspect_model_path
 
@@ -29,7 +29,7 @@ def create_fasta_files(locus_path: Path, fasta_batch: str) -> None:
         output_fasta_file = locus_path / f"Allele_ID_{number}.fasta"
         if output_fasta_file.exists():
            continue  # Ignore existing ones
-        with open(output_fasta_file, "w") as allele:
+        with open(output_fasta_file, "w", encoding="utf-8") as allele:
             SeqIO.write(record, allele, "fasta")
 
 
@@ -59,10 +59,9 @@ def pick_species_number_from_db(available_species: dict) -> str:
             if int(choice) in available_species.keys():
                 chosen_species = available_species.get(int(choice))
                 return chosen_species
-
-
-
-            )
+            print(
+                "Wrong input! Try again with a number that is available in the list above."
+            )
         except ValueError:
             print(
                 "Wrong input! Try again with a number that is available in the list above."
@@ -95,10 +94,9 @@ def pick_scheme_number_from_db(available_schemes: dict) -> str:
             if int(choice) in available_schemes.keys():
                 chosen_scheme = available_schemes.get(int(choice))[1]
                 return chosen_scheme
-
-
-
-            )
+            print(
+                "Wrong input! Try again with a number that is available in the above list."
+            )
         except ValueError:
             print(
                 "Wrong input! Try again with a number that is available in the above list."
@@ -162,12 +160,12 @@ def pick_scheme(available_schemes: dict) -> Path:
     for counter, scheme in available_schemes.items():
         # For Strain Typing with an API-POST Request to the db
         if str(scheme).startswith("http"):
-            scheme_json = requests.get(scheme).json()
+            scheme_json = requests.get(scheme, timeout=10).json()
             print(str(counter) + ":" + scheme_json["description"])
 
         # To pick a scheme after download for fitting
         else:
-            print(str(counter) + ":" + str(scheme).
+            print(str(counter) + ":" + str(scheme).rsplit("/", maxsplit=1)[-1])
 
     print("\nPick a scheme for strain type prediction")
     while True:
@@ -176,10 +174,9 @@ def pick_scheme(available_schemes: dict) -> Path:
             if int(choice) in available_schemes.keys():
                 chosen_scheme = available_schemes.get(int(choice))
                 return chosen_scheme
-
-
-
-            )
+            print(
+                "Wrong input! Try again with a number that is available in the above list."
+            )
         except ValueError:
             print(
                 "Wrong input! Try again with a number that is available in the above list."
@@ -209,8 +206,7 @@ class MlstResult:
         Returns:
             dict: The result dictionary with a sequence ID as key and the Strain type as value.
         """
-
-        return results
+        return dict(self.hits.items())
 
     def to_dict(self) -> dict:
         """
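The rewritten return value is a shallow copy of the internal mapping, so callers can modify the result without mutating the model's state. A small illustration with invented values:

hits = {"sequence_1": "ST2"}  # stands in for MlstResult.hits
result = dict(hits.items())  # what the method now returns
result["sequence_1"] = "ST404"
print(hits["sequence_1"])  # still "ST2"; the copy shields the original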
xspect/mlst_feature/pub_mlst_handler.py
CHANGED
@@ -2,8 +2,8 @@
 
 __author__ = "Cetin, Oemer"
 
-import requests
 import json
+import requests
 from xspect.mlst_feature.mlst_helper import (
     create_fasta_files,
     pick_species_number_from_db,
@@ -51,7 +51,7 @@ class PubMLSTHandler:
         counter = 1
         # retrieve all available species
         species_url = PubMLSTHandler.base_url
-        for species_databases in requests.get(species_url).json():
+        for species_databases in requests.get(species_url, timeout=10).json():
             for database in species_databases["databases"]:
                 if database["name"].endswith("seqdef"):
                     available_species[counter] = database["name"]
@@ -61,7 +61,7 @@ class PubMLSTHandler:
 
         counter = 1
         scheme_url = f"{species_url}/{chosen_species}/schemes"
-        for scheme in requests.get(scheme_url).json()["schemes"]:
+        for scheme in requests.get(scheme_url, timeout=10).json()["schemes"]:
             # scheme["description"] stores the name of a scheme.
             # scheme["scheme"] stores the URL that is needed for downloading all loci.
             available_schemes[counter] = [scheme["description"], scheme["scheme"]]
@@ -70,11 +70,8 @@ class PubMLSTHandler:
         # Selection process of available scheme from a species for download (doubles are caught!)
         while True:
             chosen_scheme = pick_scheme_number_from_db(available_schemes)
-
+            if chosen_scheme not in chosen_schemes:
                 chosen_schemes.append(chosen_scheme)
-                if chosen_scheme not in chosen_schemes
-                else None
-            )
             choice = input(
                 "Do you want to pick another scheme to download? (y/n):"
             ).lower()
@@ -97,7 +94,7 @@ class PubMLSTHandler:
         self.choose_schemes()  # changes the scheme_list attribute
 
         for scheme in self.scheme_list:
-            scheme_json = requests.get(scheme).json()
+            scheme_json = requests.get(scheme, timeout=10).json()
             # We only want the name and the respective featured loci of a scheme
             scheme_name = scheme_json["description"]
             locus_list = scheme_json["loci"]
@@ -117,7 +114,7 @@ class PubMLSTHandler:
                 if not locus_path.exists():
                     locus_path.mkdir(exist_ok=True, parents=True)
 
                alleles = requests.get(f"{locus_url}/alleles_fasta").text
-                alleles = requests.get(f"{locus_url}/alleles_fasta").text
+                alleles = requests.get(f"{locus_url}/alleles_fasta", timeout=10).text
                 create_fasta_files(locus_path, alleles)
 
     def assign_strain_type_by_db(self) -> None:
@@ -132,13 +129,15 @@ class PubMLSTHandler:
             str(pick_scheme(scheme_list_to_dict(self.scheme_list))) + "/sequence"
         )
         fasta_file = get_xspect_upload_path() / "Test.fna"
-        with open(fasta_file, "r") as file:
+        with open(fasta_file, "r", encoding="utf-8") as file:
             data = file.read()
         payload = {  # Essential API-POST-Body
             "sequence": data,
             "filetype": "fasta",
         }
-        response = requests.post(
+        response = requests.post(
+            scheme_url, data=json.dumps(payload), timeout=10
+        ).json()
 
         for locus, meta_data in response["exact_matches"].items():
             # meta_data is a list containing a dictionary, therefore [0] and then key value.
@@ -170,18 +169,16 @@ class PubMLSTHandler:
             }
         }
 
-        response = requests.post(post_url + "/designations", json=payload)
+        response = requests.post(post_url + "/designations", json=payload, timeout=10)
 
         if response.status_code == 200:
             data = response.json()
             if "fields" in data:
                 post_response = data["fields"]
                 return post_response
-
-
-            post_response += "Possibly a novel Strain Type."
-            return post_response
-        else:
-            post_response = "Error:" + str(response.status_code)
-            post_response += response.text
+            post_response = "No matching Strain Type found in the database. "
+            post_response += "Possibly a novel Strain Type."
             return post_response
+        post_response = "Error:" + str(response.status_code)
+        post_response += response.text
+        return post_response
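The timeout=10 added to every requests call above bounds how long XspecT waits for PubMLST before raising, instead of blocking indefinitely on a hung connection. A sketch of the failure mode callers may now want to handle (the URL is assumed to be the standard PubMLST REST root; the error handling is illustrative, not part of the diff):

import requests

try:
    response = requests.get("https://rest.pubmlst.org", timeout=10)  # seconds until Timeout
    response.raise_for_status()
except requests.exceptions.Timeout:
    print("PubMLST did not respond within 10 seconds.")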
xspect/model_management.py
CHANGED
@@ -2,45 +2,41 @@
 
 from json import loads, dumps
 from pathlib import Path
-from xspect.models.probabilistic_single_filter_model import (
-    ProbabilisticSingleFilterModel,
-)
-from xspect.models.probabilistic_filter_svm_model import ProbabilisticFilterSVMModel
 from xspect.definitions import get_xspect_model_path
 
 
-def
+def get_genus_model_path(genus) -> Path:
     """
-    Get a genus model for the specified genus.
+    Get a genus model path for the specified genus.
 
-    This function retrieves a pre-trained genus classification model based on the
+    This function retrieves the path of a pre-trained genus classification model based on the
+    provided genus name.
 
     Args:
         genus (str): The genus name for which the model is to be retrieved.
 
     Returns:
-
+        Path: The file path of the genus classification model.
     """
     genus_model_path = get_xspect_model_path() / (genus.lower() + "-genus.json")
-
-    return genus_filter_model
+    return genus_model_path
 
 
-def
+def get_species_model_path(genus) -> Path:
     """
-    Get a species
+    Get a species model path for the specified genus.
 
-    This function retrieves a pre-trained species classification model based on the
+    This function retrieves the path of a pre-trained species classification model based on the
+    provided genus name.
 
     Args:
         genus (str): The genus name for which the species model is to be retrieved.
 
     Returns:
-
+        Path: The file path of the species classification model.
     """
     species_model_path = get_xspect_model_path() / (genus.lower() + "-species.json")
-
-    return species_filter_model
+    return species_model_path
 
 
 def get_model_metadata(model: str | Path) -> dict:
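A short sketch of the renamed helpers, which now return Path objects built from the "<genus>-genus.json" / "<genus>-species.json" naming shown above ("Acinetobacter" is an example genus; the files must already exist under the model directory):

from xspect.model_management import get_genus_model_path, get_species_model_path

print(get_genus_model_path("Acinetobacter"))  # .../acinetobacter-genus.json
print(get_species_model_path("Acinetobacter"))  # .../acinetobacter-species.json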
@@ -121,7 +117,8 @@ def get_models() -> dict[str, list[dict]]:
     This function scans the model directory for JSON files and organizes them by their model type.
 
     Returns:
-        dict[str, list[dict]]: A dictionary where keys are model types and values are lists of
+        dict[str, list[dict]]: A dictionary where keys are model types and values are lists of
+        model display names.
     """
     model_dict = {}
     for model_file in get_xspect_model_path().glob("*.json"):
xspect/models/probabilistic_filter_mlst_model.py
CHANGED
@@ -2,14 +2,14 @@
 
 __author__ = "Cetin, Oemer"
 
-import cobs_index
 import json
 from pathlib import Path
+from collections import defaultdict
+import cobs_index
+from cobs_index import DocumentList
 from Bio import SeqIO
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
-from cobs_index import DocumentList
-from collections import defaultdict
 from xspect.file_io import get_record_iterator
 from xspect.mlst_feature.mlst_helper import MlstResult
 from xspect.mlst_feature.pub_mlst_handler import PubMLSTHandler
@@ -100,11 +100,11 @@ class ProbabilisticFilterMlstSchemeModel:
                 "Scheme not found. Please make sure to download the schemes prior!"
             )
 
-        scheme = str(scheme_path).
+        scheme = str(scheme_path).rsplit("/", maxsplit=1)[-1]
         cobs_path = ""
         # COBS structure for every locus (default = 7 for Oxford or Pasteur scheme)
         for locus_path in sorted(scheme_path.iterdir()):
-            locus = str(locus_path).
+            locus = str(locus_path).rsplit("/", maxsplit=1)[-1]
             # counts all fasta files that belong to a locus
             self.loci[locus] = sum(
                 (1 for _ in locus_path.iterdir() if not str(_).endswith("cache"))
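The recurring str(path).rsplit("/", maxsplit=1)[-1] idiom that completes the truncated calls above extracts the final path component; pathlib's Path.name yields the same result without assuming "/" as the separator. A quick illustration with a placeholder path:

from pathlib import Path

scheme_path = Path("mlst/schemes/Oxford")  # placeholder
print(str(scheme_path).rsplit("/", maxsplit=1)[-1])  # "Oxford"
print(scheme_path.name)  # "Oxford", separator-agnostic equivalent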
@@ -112,7 +112,7 @@ class ProbabilisticFilterMlstSchemeModel:
 
             # determine the avg base pair size of alleles
             fasta_file = next(locus_path.glob("*.fasta"), None)
-            with open(fasta_file, "r") as handle:
+            with open(fasta_file, "r", encoding="utf-8") as handle:
                 record = next(SeqIO.parse(handle, "fasta"))
                 self.avg_locus_bp_size.append(len(record.seq))
 
@@ -134,7 +134,8 @@ class ProbabilisticFilterMlstSchemeModel:
 
     def save(self) -> None:
         """Saves the model to disk"""
-
+        # [-1] contains the scheme name
+        scheme = str(self.scheme_path).rsplit("/", maxsplit=1)[-1]
         json_path = self.base_path / scheme / f"{scheme}.json"
         json_object = json.dumps(self.to_dict(), indent=4)
 
@@ -152,7 +153,7 @@ class ProbabilisticFilterMlstSchemeModel:
         Returns:
             ProbabilisticFilterMlstSchemeModel: A trained model from the disk in JSON format.
         """
-        scheme_name = str(scheme_path).
+        scheme_name = str(scheme_path).rsplit("/", maxsplit=1)[-1]
         json_path = scheme_path / f"{scheme_name}.json"
         with open(json_path, "r", encoding="utf-8") as file:
             json_object = file.read()
@@ -221,7 +222,7 @@ class ProbabilisticFilterMlstSchemeModel:
         for entry in sorted(cobs_path.iterdir()):
             if str(entry).endswith(".json"):
                 continue
-            file_name = str(entry).
+            file_name = str(entry).rsplit("/", maxsplit=1)[-1]  # file_name = locus
             scheme_path_list.append(file_name.split(".")[0])  # without the file ending
 
         result_dict = {}
@@ -442,7 +443,7 @@ class ProbabilisticFilterMlstSchemeModel:
         Returns:
             bool: True if any locus score >= 0.5 * its avg base pair size, False otherwise.
         """
-        for i, (
+        for i, (_, allele_score_dict) in enumerate(highest_results.items()):
             if not allele_score_dict:
                 continue  # skip empty values
 
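A sketch of the acceptance rule this loop implements per its docstring: the scheme counts as a hit as soon as any locus' best allele score reaches half of that locus' average allele length. All values below are invented, and the indexing of avg_locus_bp_size in step with highest_results is inferred from the enumerate() call, not shown in the diff:

avg_locus_bp_size = [450, 480]
highest_results = {"Oxf_gltA": {"Allele_ID_3": 240}, "Oxf_gyrB": {}}

for i, (_, allele_score_dict) in enumerate(highest_results.items()):
    if not allele_score_dict:
        continue  # skip empty values
    if max(allele_score_dict.values()) >= 0.5 * avg_locus_bp_size[i]:
        print("scheme hit")  # 240 >= 225, so this prints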