XspecT 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of XspecT might be problematic. Click here for more details.
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
- XspecT-0.2.0.dist-info/RECORD +30 -0
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
- xspect/definitions.py +42 -0
- xspect/download_filters.py +11 -26
- xspect/fastapi.py +101 -0
- xspect/file_io.py +34 -103
- xspect/main.py +70 -66
- xspect/model_management.py +88 -0
- xspect/models/__init__.py +0 -0
- xspect/models/probabilistic_filter_model.py +277 -0
- xspect/models/probabilistic_filter_svm_model.py +169 -0
- xspect/models/probabilistic_single_filter_model.py +109 -0
- xspect/models/result.py +148 -0
- xspect/pipeline.py +201 -0
- xspect/run.py +38 -0
- xspect/train.py +304 -0
- xspect/train_filter/create_svm.py +6 -183
- xspect/train_filter/extract_and_concatenate.py +117 -121
- xspect/train_filter/html_scrap.py +16 -28
- xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
- xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
- xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
- xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
- XspecT-0.1.3.dist-info/RECORD +0 -49
- xspect/BF_v2.py +0 -637
- xspect/Bootstrap.py +0 -29
- xspect/Classifier.py +0 -142
- xspect/OXA_Table.py +0 -53
- xspect/WebApp.py +0 -724
- xspect/XspecT_mini.py +0 -1363
- xspect/XspecT_trainer.py +0 -611
- xspect/map_kmers.py +0 -155
- xspect/search_filter.py +0 -504
- xspect/static/How-To.png +0 -0
- xspect/static/Logo.png +0 -0
- xspect/static/Logo2.png +0 -0
- xspect/static/Workflow_AspecT.png +0 -0
- xspect/static/Workflow_ClAssT.png +0 -0
- xspect/static/js.js +0 -615
- xspect/static/main.css +0 -280
- xspect/templates/400.html +0 -64
- xspect/templates/401.html +0 -62
- xspect/templates/404.html +0 -62
- xspect/templates/500.html +0 -62
- xspect/templates/about.html +0 -544
- xspect/templates/home.html +0 -51
- xspect/templates/layoutabout.html +0 -87
- xspect/templates/layouthome.html +0 -63
- xspect/templates/layoutspecies.html +0 -468
- xspect/templates/species.html +0 -33
- xspect/train_filter/README_XspecT_Erweiterung.md +0 -119
- xspect/train_filter/get_paths.py +0 -35
- xspect/train_filter/interface_XspecT.py +0 -204
- xspect/train_filter/k_mer_count.py +0 -162
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
- {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0
|
@@ -1,119 +0,0 @@
|
|
|
1
|
-
# XspecT-Erweiterung
|
|
2
|
-
|
|
3
|
-
Expands XspecT so that new filters for a genus can be trained automatically. Its main
|
|
4
|
-
script is XspecT_trainer.py. The rest of the scripts are inside the python module
|
|
5
|
-
train_filter.
|
|
6
|
-
|
|
7
|
-
## Training new filter
|
|
8
|
-
|
|
9
|
-
XspecT_trainer.py uses command line arguments. The examples for using XspecT_trainer.py
|
|
10
|
-
are using Salmonella since this genus only has two defined species in the NCBI
|
|
11
|
-
databases.
|
|
12
|
-
|
|
13
|
-
### Jellyfish
|
|
14
|
-
|
|
15
|
-
The program jellyfish is used to count distinct k-mers in the assemblies. For XspecT_
|
|
16
|
-
trainer.py to work jellyfish needs to be installed. It can be installed using bioconda:
|
|
17
|
-
|
|
18
|
-
`
|
|
19
|
-
conda install -c bioconda jellyfish
|
|
20
|
-
`
|
|
21
|
-
|
|
22
|
-
### Training examples
|
|
23
|
-
|
|
24
|
-
New filters with assemblies from NCBI RefSeq can be trained with the following line. The
|
|
25
|
-
python libraries from [requirements.txt](..%2Frequirements.txt) need to be installed.
|
|
26
|
-
|
|
27
|
-
`
|
|
28
|
-
python XspecT_trainer.py Salmonella 1
|
|
29
|
-
`
|
|
30
|
-
|
|
31
|
-
Training filters with custom data can be done using the following line.
|
|
32
|
-
|
|
33
|
-
`
|
|
34
|
-
python XspecT_trainer.py Salmonella 2 -bf /path/to/concate_assemblies -svm
|
|
35
|
-
/path/to/assemblies
|
|
36
|
-
`
|
|
37
|
-
|
|
38
|
-
All command line arguments are explained using the following line.
|
|
39
|
-
|
|
40
|
-
`
|
|
41
|
-
python XspecT_trainer.py -h
|
|
42
|
-
`
|
|
43
|
-
|
|
44
|
-
# Explanation of the scripts
|
|
45
|
-
|
|
46
|
-
## backup_filter.py
|
|
47
|
-
|
|
48
|
-
Creates a backup of all files needed for the species assignment by XspecT for a specific
|
|
49
|
-
genus. The backup is made when new filters are created for a genus that
|
|
50
|
-
already has trained filters.
|
|
51
|
-
|
|
52
|
-
## create_svm.py
|
|
53
|
-
|
|
54
|
-
Downloads the needed assemblies and trains a support-vector-machine for the genus.
|
|
55
|
-
|
|
56
|
-
## extract_and_concatenate.py
|
|
57
|
-
|
|
58
|
-
Unzips the downloaded assemblies. Concatenates assemblies per species that will be used
|
|
59
|
-
to train the bloomfilters.
|
|
60
|
-
|
|
61
|
-
## get_paths.py
|
|
62
|
-
|
|
63
|
-
Functions that get specific paths.
|
|
64
|
-
|
|
65
|
-
## html_scrap.py
|
|
66
|
-
|
|
67
|
-
Updates a list of all NCBI RefSeq assembly accessions that have a taxonomy check result
|
|
68
|
-
of OK. The taxonomy check from NCBI RefSeq uses the ANI (average-nucleotide-
|
|
69
|
-
identity) to compute a result.
|
|
70
|
-
|
|
71
|
-
## interface_XspecT.py
|
|
72
|
-
|
|
73
|
-
Mostly functions that train new bloomfilters automatically. The functions were
|
|
74
|
-
originally written for XspecT in a non-automatic way and were updated.
|
|
75
|
-
|
|
76
|
-
## k_mer_count.py
|
|
77
|
-
|
|
78
|
-
Uses jellyfish to count distinct k-mers in every concatenated assembly. The highest
|
|
79
|
-
count will be used to compute the size of the bloomfilters.
|
|
80
|
-
|
|
81
|
-
## ncbi_api
|
|
82
|
-
|
|
83
|
-
A module which makes requests to the NCBI Datasets API.
|
|
84
|
-
|
|
85
|
-
### download_assemblies.py
|
|
86
|
-
|
|
87
|
-
The specific function that downloads assemblies from NCBI RefSeq using NCBI
|
|
88
|
-
datasets.
|
|
89
|
-
|
|
90
|
-
### ncbi_assembly_metadata.py
|
|
91
|
-
|
|
92
|
-
Takes a dictionary with species and their taxon ID and asks NCBI for assemblies of
|
|
93
|
-
the species. Saves the collected accessions of the found and selected assemblies.
|
|
94
|
-
|
|
95
|
-
### ncbi_children_tree.py
|
|
96
|
-
|
|
97
|
-
Takes the name or ID of a genus and gives a list with all its species.
|
|
98
|
-
|
|
99
|
-
### ncbi_taxon_metadata.py
|
|
100
|
-
|
|
101
|
-
Takes a list of taxa and collects metadata such as their scientific name and rank.
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
xspect/train_filter/get_paths.py
DELETED
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
import os
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
def get_concatenate_file_path(dir_name):
|
|
6
|
-
"""Returns str to file path of the concatenate directory.
|
|
7
|
-
|
|
8
|
-
:param dir_name: Name of the current genus_metadata directory.
|
|
9
|
-
:type dir_name: str
|
|
10
|
-
:return: File path to the concatenated species assemblies.
|
|
11
|
-
"""
|
|
12
|
-
return Path(os.getcwd()) / "genus_metadata" / dir_name / "concatenate"
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def get_current_dir_file_path(dir_name):
|
|
16
|
-
"""Returns str of file path to the directory with the currently needed metagenome assembly.
|
|
17
|
-
|
|
18
|
-
:param dir_name: Name of the current genus_metadata directory.
|
|
19
|
-
:type dir_name: str
|
|
20
|
-
:return: File path to the metagenome assembly.
|
|
21
|
-
"""
|
|
22
|
-
return Path(os.getcwd()) / "genus_metadata" / dir_name
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def get_metagenome_filter_path():
|
|
26
|
-
"""Returns the file path to the metagenome filters."""
|
|
27
|
-
return Path(os.getcwd()) / "filter" / "Metagenomes"
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def main():
|
|
31
|
-
pass
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
if __name__ == "__main__":
|
|
35
|
-
main()
|
|
@@ -1,204 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import pickle
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from shutil import rmtree
|
|
5
|
-
|
|
6
|
-
from loguru import logger
|
|
7
|
-
from numpy import log, square
|
|
8
|
-
|
|
9
|
-
import xspect.BF_v2 as BF_v2
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def compute_array_size(n, p=0.01):
|
|
13
|
-
"""Computes the Bit-Array-Size for the bloomfilters.
|
|
14
|
-
|
|
15
|
-
:param n: Highest k-mer count of a species.
|
|
16
|
-
:type n: int
|
|
17
|
-
:param p: Rate of mistakes.
|
|
18
|
-
:type p: float
|
|
19
|
-
:return: Bit-Array-Size for the bloomfilters.
|
|
20
|
-
"""
|
|
21
|
-
return -((n * log(p)) / (square(log(2))))
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def make_paths(dir_name, genus):
|
|
25
|
-
"""Create paths to the concatenated sequences and to where the new bloomfilters will be saved.
|
|
26
|
-
|
|
27
|
-
:param dir_name: Name of the parent directory.
|
|
28
|
-
:type dir_name: str
|
|
29
|
-
:param genus: Name of the genus.
|
|
30
|
-
:type genus: str
|
|
31
|
-
:return: The path to the sequence files and the bloomfilter directory.
|
|
32
|
-
"""
|
|
33
|
-
# Path to concatenated sequences
|
|
34
|
-
files_path = Path(os.getcwd()) / "genus_metadata" / dir_name / "concatenate"
|
|
35
|
-
|
|
36
|
-
# Path for results.
|
|
37
|
-
result_path = Path(os.getcwd()) / "filter" / genus
|
|
38
|
-
# Try to create the directory for the bloomfilters.
|
|
39
|
-
try:
|
|
40
|
-
os.mkdir(result_path)
|
|
41
|
-
except FileExistsError:
|
|
42
|
-
# Delete the old directory with bloomfilters if already existed.
|
|
43
|
-
rmtree(result_path, ignore_errors=False, onerror=None)
|
|
44
|
-
os.mkdir(result_path)
|
|
45
|
-
|
|
46
|
-
return str(files_path), str(result_path)
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def init_bf(array_size, clonetypes=1, hashes=7, k=21):
|
|
50
|
-
"""Initiates an bloomfilter object with given parameters.
|
|
51
|
-
|
|
52
|
-
:param array_size: The size for the byte-array.
|
|
53
|
-
:type array_size: int
|
|
54
|
-
:param clonetypes: Number of clonetypes.
|
|
55
|
-
:type clonetypes: int
|
|
56
|
-
:param hashes: Number of hash functions used.
|
|
57
|
-
:type hashes: int
|
|
58
|
-
:param k: Length of k-mers.
|
|
59
|
-
:type k: int
|
|
60
|
-
:return: The initiated bloomfilter object.
|
|
61
|
-
"""
|
|
62
|
-
BF = BF_v2.AbaumanniiBloomfilter(array_size)
|
|
63
|
-
BF.set_arraysize(array_size)
|
|
64
|
-
BF.set_clonetypes(clonetypes)
|
|
65
|
-
BF.set_hashes(hashes)
|
|
66
|
-
BF.set_k(k)
|
|
67
|
-
return BF
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
def new_train_core(files_path, result_path, array_size, k=21):
|
|
71
|
-
"""Trains concatenated genomes into Bloomfilter and saves them.
|
|
72
|
-
|
|
73
|
-
:param files_path: Path to where the concatenated sequences are stored.
|
|
74
|
-
:type files_path: str
|
|
75
|
-
:param result_path: Path where the generated Bloomfilter will be saved.
|
|
76
|
-
:type result_path: str
|
|
77
|
-
:param array_size: Array-size for the Bloomfilter.
|
|
78
|
-
:type array_size: int
|
|
79
|
-
:param k: Length of substring.
|
|
80
|
-
:type k: int
|
|
81
|
-
"""
|
|
82
|
-
files = os.listdir(files_path)
|
|
83
|
-
# Iterate the files backwards to delete all non fasta files from the list.
|
|
84
|
-
for i in range(len(files) - 1, -1, -1):
|
|
85
|
-
if "fna" in files[i] or "fasta" in files[i]:
|
|
86
|
-
continue
|
|
87
|
-
else:
|
|
88
|
-
del files[i]
|
|
89
|
-
|
|
90
|
-
# Train a bloomfilter for each species.
|
|
91
|
-
for i in range(len(files)):
|
|
92
|
-
BF = init_bf(array_size=array_size, clonetypes=1, hashes=7, k=k)
|
|
93
|
-
path = Path(files_path) / files[i]
|
|
94
|
-
species_name = files[i].split(".")[0]
|
|
95
|
-
file_name = species_name + ".txt"
|
|
96
|
-
logger.info("Training {name}", name=species_name)
|
|
97
|
-
result = Path(result_path) / file_name
|
|
98
|
-
BF.train_sequence(path, 0)
|
|
99
|
-
BF.save_clonetypes(result)
|
|
100
|
-
BF.cleanup()
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
def new_write_file_dyn(bf_path, genus, meta_mode=False):
|
|
104
|
-
"""Write file with pickled list of all names for the bloomfilters.
|
|
105
|
-
|
|
106
|
-
:param bf_path: Path to the bloomfilters.
|
|
107
|
-
:type bf_path: str
|
|
108
|
-
:param genus: Name of the genus.
|
|
109
|
-
:type genus: str
|
|
110
|
-
:param meta_mode: Declare to which bloomfilters the path leads.
|
|
111
|
-
:type meta_mode: bool
|
|
112
|
-
"""
|
|
113
|
-
files = os.listdir(bf_path)
|
|
114
|
-
# If the Bloomfilter path leads to Bloomfilter for the metagenome mode.
|
|
115
|
-
if meta_mode:
|
|
116
|
-
for i in range(len(files) - 1, -1, -1):
|
|
117
|
-
if genus not in files[i]:
|
|
118
|
-
del files[i]
|
|
119
|
-
else:
|
|
120
|
-
files[i] = files[i][:-4]
|
|
121
|
-
file_name = "Filter" + genus + "Complete.txt"
|
|
122
|
-
|
|
123
|
-
# If the path leads to bloomfilters for the species.
|
|
124
|
-
else:
|
|
125
|
-
for i in range(len(files) - 1, -1, -1):
|
|
126
|
-
if "txt" not in files[i]:
|
|
127
|
-
del files[i]
|
|
128
|
-
else:
|
|
129
|
-
files[i] = files[i][:-4]
|
|
130
|
-
file_name = "Filter" + genus + ".txt"
|
|
131
|
-
|
|
132
|
-
# Make path for the txt file.
|
|
133
|
-
file_path = Path(os.getcwd()) / "filter" / "species_names" / file_name
|
|
134
|
-
with open(file_path, "wb") as fp:
|
|
135
|
-
pickle.dump(sorted(files), fp)
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
def save_array_sizes(genus, array_sizes):
|
|
139
|
-
"""Saves the array sizes of the bytearray for the bloomfilters in a txt file.
|
|
140
|
-
|
|
141
|
-
:param genus: The current genus.
|
|
142
|
-
:type genus: str
|
|
143
|
-
:param array_sizes: List of all computed array sizes for this genus.
|
|
144
|
-
:type array_sizes: list[str]
|
|
145
|
-
"""
|
|
146
|
-
file_name = genus + ".txt"
|
|
147
|
-
path = Path(os.getcwd()) / "filter" / "array_sizes" / file_name
|
|
148
|
-
|
|
149
|
-
# Save both array sizes as a string in the format: 'size1 size2' as a txt file.
|
|
150
|
-
# The first size is of the species level filters and the second of the meta-mode filter.
|
|
151
|
-
text = " ".join(array_sizes)
|
|
152
|
-
with open(path, "w", encoding="utf-8") as f:
|
|
153
|
-
f.write(text)
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
def save_name_dict(genus, name_dict: dict):
|
|
157
|
-
"""Saves the names and taxon IDs of all species for which filter were trained. XspecT uses this dict to switch
|
|
158
|
-
between the species names and it's ID. The dict is saved as a csv file.
|
|
159
|
-
|
|
160
|
-
:param genus: The genus for which filters were trained.
|
|
161
|
-
:param name_dict: The dictionary with all species names and taxon IDs
|
|
162
|
-
"""
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
def save_time_stats(time_stats, dir_name):
|
|
166
|
-
"""Saves the collected time measurements as a txt file.
|
|
167
|
-
|
|
168
|
-
:param time_stats: The collected time measurements as a formatted string.
|
|
169
|
-
:type time_stats: str
|
|
170
|
-
:param dir_name: Name of the parent directory.
|
|
171
|
-
:type dir_name: str
|
|
172
|
-
"""
|
|
173
|
-
time_file = Path(os.getcwd()) / "genus_metadata" / dir_name / "time.txt"
|
|
174
|
-
with open(str(time_file), "w+", encoding="utf-8") as f:
|
|
175
|
-
f.write(time_stats)
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
def load_translation_dict(genus: str) -> dict[str, str]:
|
|
179
|
-
"""Loads the translation dict for the given genus. The key is the taxon ID and its value the scientific name.
|
|
180
|
-
|
|
181
|
-
:param genus: The name of the genus.
|
|
182
|
-
:return: The translation dict for the genus.
|
|
183
|
-
"""
|
|
184
|
-
file_name = f"{genus}.pickle"
|
|
185
|
-
path = Path(os.getcwd()) / "filter" / "translation_dicts" / file_name
|
|
186
|
-
with open(path, "rb") as f:
|
|
187
|
-
translation_dict = pickle.load(f)
|
|
188
|
-
|
|
189
|
-
return translation_dict
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
def main():
|
|
193
|
-
a = 28858023
|
|
194
|
-
b = compute_array_size(a)
|
|
195
|
-
print(int(round(b + 1000000, -6)))
|
|
196
|
-
# genera = get_genera_array_sizes()
|
|
197
|
-
# print(f"Species: ")
|
|
198
|
-
# print(pre_process_all(genera, meta_mode=False))
|
|
199
|
-
# print(f"\nMeta: ")
|
|
200
|
-
# print(pre_process_all(genera, meta_mode=True))
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
if __name__ == "__main__":
|
|
204
|
-
main()
|
|
@@ -1,162 +0,0 @@
|
|
|
1
|
-
import subprocess as sp
|
|
2
|
-
from linecache import getline
|
|
3
|
-
from os import listdir, remove, getcwd
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from time import perf_counter, localtime, asctime
|
|
6
|
-
|
|
7
|
-
from loguru import logger
|
|
8
|
-
import numpy as np
|
|
9
|
-
|
|
10
|
-
import xspect.XspecT_trainer
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def get_seq_paths(dir_name: str):
|
|
14
|
-
"""Stores the sequence paths, with the species name as key, in a dictionary. The sequences are DNA-Assemblies which
|
|
15
|
-
were concatenated.
|
|
16
|
-
|
|
17
|
-
:param dir_name: Name of the directory.
|
|
18
|
-
:return: Dictionary with species names and sequences.
|
|
19
|
-
"""
|
|
20
|
-
dir_path = Path(getcwd()) / "genus_metadata" / dir_name / "concatenate"
|
|
21
|
-
sequence_dict = dict()
|
|
22
|
-
files = listdir(dir_path)
|
|
23
|
-
# Go through all files backwards to delete all non fasta files.
|
|
24
|
-
for i in range(len(files) - 1, -1, -1):
|
|
25
|
-
curr_file = str(files[i])
|
|
26
|
-
file_parts = curr_file.split(".")
|
|
27
|
-
if file_parts[-1] != "fasta":
|
|
28
|
-
del files[i]
|
|
29
|
-
else:
|
|
30
|
-
species_name = file_parts[0]
|
|
31
|
-
# Save the species name with the path to its fasta file.
|
|
32
|
-
sequence_dict[species_name] = str(dir_path / curr_file)
|
|
33
|
-
|
|
34
|
-
return sequence_dict
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def jellyfish_count(command: str):
|
|
38
|
-
"""A jellyfish command to count the k-mers of an fasta file using the linux bash.
|
|
39
|
-
|
|
40
|
-
:param command: The jellyfish command with all chosen parameters.
|
|
41
|
-
"""
|
|
42
|
-
sp.run(command.split(" "))
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
def jellyfish_stats(command: str) -> int:
|
|
46
|
-
"""A jellyfish command to get the count which was
|
|
47
|
-
|
|
48
|
-
:param command: The jellyfish command with all chosen parameters
|
|
49
|
-
:return: The count of all distinct k-mers.
|
|
50
|
-
"""
|
|
51
|
-
result = sp.run(command.split(" "), stdout=sp.PIPE, text=True)
|
|
52
|
-
return int(result.stdout.split("\n")[1].replace(" ", "").split(":")[1])
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
def count_k_meres(sequence_dict, k=21):
|
|
56
|
-
"""Counts all k-meres in the sequences using jellyfish.
|
|
57
|
-
|
|
58
|
-
:param sequence_dict: Dictionary with all sequence paths.
|
|
59
|
-
:type sequence_dict: dict[str, str]
|
|
60
|
-
:param k: K-mer length.
|
|
61
|
-
:type k: int
|
|
62
|
-
:return: Species names and number of distinct k-mere.
|
|
63
|
-
"""
|
|
64
|
-
k_mer_of_species = list()
|
|
65
|
-
count = 0
|
|
66
|
-
|
|
67
|
-
# Iterate through all species.
|
|
68
|
-
for species_name, file_path in sequence_dict.items():
|
|
69
|
-
num_files_to_count = len(sequence_dict) - count
|
|
70
|
-
logger.info(
|
|
71
|
-
"{num} files left to count. Counting {name}",
|
|
72
|
-
num=num_files_to_count,
|
|
73
|
-
name=species_name,
|
|
74
|
-
)
|
|
75
|
-
count += 1
|
|
76
|
-
|
|
77
|
-
# Set parameters for jellyfish commands.
|
|
78
|
-
k = str(k)
|
|
79
|
-
hash_size = "100M"
|
|
80
|
-
num_threads = "4"
|
|
81
|
-
output_name = str(Path(getcwd()) / "output")
|
|
82
|
-
|
|
83
|
-
# Command for jellyfish count.
|
|
84
|
-
count_command = (
|
|
85
|
-
"jellyfish count -m "
|
|
86
|
-
+ k
|
|
87
|
-
+ " -o "
|
|
88
|
-
+ output_name
|
|
89
|
-
+ " -C -s "
|
|
90
|
-
+ hash_size
|
|
91
|
-
+ " -t "
|
|
92
|
-
+ num_threads
|
|
93
|
-
+ " "
|
|
94
|
-
+ file_path
|
|
95
|
-
)
|
|
96
|
-
# Command for jellyfish stats.
|
|
97
|
-
stats_command = "jellyfish stats " + output_name
|
|
98
|
-
jellyfish_count(count_command)
|
|
99
|
-
k_mer_count = jellyfish_stats(stats_command)
|
|
100
|
-
|
|
101
|
-
# Append tuple with species name and distinct k-mer count.
|
|
102
|
-
k_mer_of_species.append((species_name, k_mer_count))
|
|
103
|
-
|
|
104
|
-
return k_mer_of_species
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
def sort_k_mer_counts(k_mer_counts):
|
|
108
|
-
"""Sorts the list of k-mers to determine the highest count.
|
|
109
|
-
|
|
110
|
-
:param k_mer_counts: List of all species with their k-mer counts.
|
|
111
|
-
:type k_mer_counts: list[tuple[str, int]]
|
|
112
|
-
:return: Sorted list beginning with the highest k-mer count.
|
|
113
|
-
"""
|
|
114
|
-
# Define the data type for numpy.
|
|
115
|
-
data_type = [("species", "S50"), ("k_mer_count", int)]
|
|
116
|
-
# Create numpy array with defined data type.
|
|
117
|
-
k_mer_count_sorted = np.array(k_mer_counts, dtype=data_type)
|
|
118
|
-
# Sort array based on k-mer count and than reverse so the first tuple has the highest count.
|
|
119
|
-
k_mer_count_sorted = np.sort(k_mer_count_sorted, order="k_mer_count")[::-1]
|
|
120
|
-
|
|
121
|
-
return k_mer_count_sorted
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
def get_highest_k_mer_count(dir_name, k=21):
|
|
125
|
-
"""Gets highest k-mer count for all species and k-mer count of genus.
|
|
126
|
-
|
|
127
|
-
:param dir_name: Name of the parent directory.
|
|
128
|
-
:type dir_name: str
|
|
129
|
-
:param k: K-mer length.
|
|
130
|
-
:type k: int
|
|
131
|
-
:return: List of the highest k-mer count of all species and the k-mer count of all sequences united.
|
|
132
|
-
"""
|
|
133
|
-
# Get highest k-mer count of all species.
|
|
134
|
-
seq_dict = get_seq_paths(dir_name)
|
|
135
|
-
k_mer_counts = count_k_meres(seq_dict, k=k)
|
|
136
|
-
k_mer_sorted = sort_k_mer_counts(k_mer_counts)
|
|
137
|
-
# Uncomment if the k-mer counts should be saved.
|
|
138
|
-
# save_count(k_mer_sorted, dir_name)
|
|
139
|
-
|
|
140
|
-
# Count distinct k-mers of genus.
|
|
141
|
-
genus = dir_name.split("_")[0]
|
|
142
|
-
file_name = genus + ".fasta"
|
|
143
|
-
file_path = str(Path(getcwd()) / "genus_metadata" / dir_name / file_name)
|
|
144
|
-
seq_dict = {genus: file_path}
|
|
145
|
-
k_mer_count = count_k_meres(seq_dict, k=k)
|
|
146
|
-
|
|
147
|
-
# Return highest k-mer count of all species and k-mer count of complete genus.
|
|
148
|
-
return [k_mer_sorted[0][1], k_mer_count[0][1]]
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
def main():
|
|
152
|
-
dir_name = "Listeria_14_12_2022_21-5-13"
|
|
153
|
-
seq_dict = get_seq_paths(dir_name)
|
|
154
|
-
|
|
155
|
-
start = perf_counter()
|
|
156
|
-
|
|
157
|
-
end = perf_counter()
|
|
158
|
-
print(f"time: {(end-start)/60}")
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
if __name__ == "__main__":
|
|
162
|
-
main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|