XspecT 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of XspecT might be problematic. Click here for more details.

Files changed (58) hide show
  1. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
  2. XspecT-0.2.0.dist-info/RECORD +30 -0
  3. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
  4. xspect/definitions.py +42 -0
  5. xspect/download_filters.py +11 -26
  6. xspect/fastapi.py +101 -0
  7. xspect/file_io.py +34 -103
  8. xspect/main.py +70 -66
  9. xspect/model_management.py +88 -0
  10. xspect/models/__init__.py +0 -0
  11. xspect/models/probabilistic_filter_model.py +277 -0
  12. xspect/models/probabilistic_filter_svm_model.py +169 -0
  13. xspect/models/probabilistic_single_filter_model.py +109 -0
  14. xspect/models/result.py +148 -0
  15. xspect/pipeline.py +201 -0
  16. xspect/run.py +38 -0
  17. xspect/train.py +304 -0
  18. xspect/train_filter/create_svm.py +6 -183
  19. xspect/train_filter/extract_and_concatenate.py +117 -121
  20. xspect/train_filter/html_scrap.py +16 -28
  21. xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
  22. xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
  23. xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
  24. xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
  25. XspecT-0.1.3.dist-info/RECORD +0 -49
  26. xspect/BF_v2.py +0 -637
  27. xspect/Bootstrap.py +0 -29
  28. xspect/Classifier.py +0 -142
  29. xspect/OXA_Table.py +0 -53
  30. xspect/WebApp.py +0 -724
  31. xspect/XspecT_mini.py +0 -1363
  32. xspect/XspecT_trainer.py +0 -611
  33. xspect/map_kmers.py +0 -155
  34. xspect/search_filter.py +0 -504
  35. xspect/static/How-To.png +0 -0
  36. xspect/static/Logo.png +0 -0
  37. xspect/static/Logo2.png +0 -0
  38. xspect/static/Workflow_AspecT.png +0 -0
  39. xspect/static/Workflow_ClAssT.png +0 -0
  40. xspect/static/js.js +0 -615
  41. xspect/static/main.css +0 -280
  42. xspect/templates/400.html +0 -64
  43. xspect/templates/401.html +0 -62
  44. xspect/templates/404.html +0 -62
  45. xspect/templates/500.html +0 -62
  46. xspect/templates/about.html +0 -544
  47. xspect/templates/home.html +0 -51
  48. xspect/templates/layoutabout.html +0 -87
  49. xspect/templates/layouthome.html +0 -63
  50. xspect/templates/layoutspecies.html +0 -468
  51. xspect/templates/species.html +0 -33
  52. xspect/train_filter/README_XspecT_Erweiterung.md +0 -119
  53. xspect/train_filter/get_paths.py +0 -35
  54. xspect/train_filter/interface_XspecT.py +0 -204
  55. xspect/train_filter/k_mer_count.py +0 -162
  56. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
  57. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
  58. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0
@@ -1,119 +0,0 @@
1
- # XspecT-Erweiterung
2
-
3
- Expands XspecT so that new filters for a genus can be trained automatically. Its main
4
- script is XspecT_trainer.py. The rest of the scripts are inside the python module
5
- train_filter.
6
-
7
- ## Training new filter
8
-
9
- XspecT_trainer.py uses command line arguments. The examples for using XspecT_trainer.py
10
- are using Salmonella since this genus only has two defined species in the NCBI
11
- databases.
12
-
13
- ### Jellyfish
14
-
15
- The program jellyfish is used to count distinct k-mers in the assemblies. For XspecT_
16
- trainer.py to work jellyfish needs to be installed. It can be installed using bioconda:
17
-
18
- `
19
- conda install -c bioconda jellyfish
20
- `
21
-
22
- ### Training examples
23
-
24
- New filters with assemblies from NCBI RefSeq can be trained with the following line. The
25
- python libraries from [requirements.txt](..%2Frequirements.txt) need to be installed.
26
-
27
- `
28
- python XspecT_trainer.py Salmonella 1
29
- `
30
-
31
- Training filters with custom data can be done using the following line.
32
-
33
- `
34
- python XspecT_trainer.py Salmonella 2 -bf /path/to/concate_assemblies -svm
35
- /path/to/assemblies
36
- `
37
-
38
- All command line arguments are explained using the following line.
39
-
40
- `
41
- python XspecT_trainer.py -h
42
- `
43
-
44
- # Explanation of the scripts
45
-
46
- ## backup_filter.py
47
-
48
- Creates a backup of all files needed for the species assignment by XspecT for a specific
49
- genus. The backup is made when new filters are created for a genus which
50
- already has trained filters.
51
-
52
- ## create_svm.py
53
-
54
- Downloads the needed assemblies and trains a support-vector-machine for the genus.
55
-
56
- ## extract_and_concatenate.py
57
-
58
- Unzips the downloaded assemblies. Concatenates assemblies per species that will be used
59
- to train the bloomfilters.
60
-
61
- ## get_paths.py
62
-
63
- Functions that get specific paths.
64
-
65
- ## html_scrap.py
66
-
67
- Updates a list of all NCBI RefSeq assembly accessions that have a taxonomy check result
68
- of OK. The taxonomy check from NCBI RefSeq uses the ANI (average-nucleotide-
69
- identity) to compute a result.
70
-
71
- ## interface_XspecT.py
72
-
73
- Mostly functions that train new bloomfilters automatically. The functions were
74
- originally written for XspecT in a non-automatic way and were updated.
75
-
76
- ## k_mer_count.py
77
-
78
- Uses jellyfish to count distinct k-mers in every concatenated assembly. The highest
79
- count will be used to compute the size of the bloomfilters.
80
-
81
- ## ncbi_api
82
-
83
- A module which makes requests to the NCBI Datasets API.
84
-
85
- ### download_assemblies.py
86
-
87
- The specific function that downloads assemblies from NCBI RefSeq using NCBI
88
- datasets.
89
-
90
- ### ncbi_assembly_metadata.py
91
-
92
- Takes a dictionary with species and their taxon ID and asks NCBI for assemblies of
93
- the species. Saves the collected accessions of the found and selected assemblies.
94
-
95
- ### ncbi_children_tree.py
96
-
97
- Takes the name or ID of a genus and gives a list with all its species.
98
-
99
- ### ncbi_taxon_metadata.py
100
-
101
- Takes a list of taxa and collects metadata such as their scientific name and rank.
102
-
103
-
104
-
105
-
106
-
107
-
108
-
109
-
110
-
111
-
112
-
113
-
114
-
115
-
116
-
117
-
118
-
119
-
@@ -1,35 +0,0 @@
1
- from pathlib import Path
2
- import os
3
-
4
-
5
def get_concatenate_file_path(dir_name):
    """Build the path of the ``concatenate`` directory of a genus run.

    :param dir_name: Name of the current genus_metadata directory.
    :type dir_name: str
    :return: ``<cwd>/genus_metadata/<dir_name>/concatenate`` as a
        ``pathlib.Path`` (not a ``str``).
    """
    working_dir = Path(os.getcwd())
    return working_dir.joinpath("genus_metadata", dir_name, "concatenate")
13
-
14
-
15
def get_current_dir_file_path(dir_name):
    """Build the path of one run directory below ``genus_metadata``.

    :param dir_name: Name of the current genus_metadata directory.
    :type dir_name: str
    :return: ``<cwd>/genus_metadata/<dir_name>`` as a ``pathlib.Path``
        (not a ``str``).
    """
    working_dir = Path(os.getcwd())
    return working_dir.joinpath("genus_metadata", dir_name)
23
-
24
-
25
def get_metagenome_filter_path():
    """Return ``<cwd>/filter/Metagenomes`` as a ``pathlib.Path``."""
    working_dir = Path(os.getcwd())
    return working_dir.joinpath("filter", "Metagenomes")
28
-
29
-
30
def main():
    """Placeholder entry point; running the module as a script does nothing."""
    return None


if __name__ == "__main__":
    main()
@@ -1,204 +0,0 @@
1
- import os
2
- import pickle
3
- from pathlib import Path
4
- from shutil import rmtree
5
-
6
- from loguru import logger
7
- from numpy import log, square
8
-
9
- import xspect.BF_v2 as BF_v2
10
-
11
-
12
def compute_array_size(n, p=0.01):
    """Compute the bit-array size for the bloomfilters.

    Evaluates ``-(n * ln(p)) / ln(2)**2`` with numpy and returns the result
    unrounded; the caller decides how to round it.

    :param n: Highest k-mer count of a species.
    :type n: int
    :param p: Rate of mistakes.
    :type p: float
    :return: Bit-array size for the bloomfilters as a numpy float.
    """
    numerator = n * log(p)
    denominator = square(log(2))
    return -(numerator / denominator)
22
-
23
-
24
def make_paths(dir_name, genus):
    """Create paths to the concatenated sequences and the bloomfilter directory.

    The bloomfilter directory ``<cwd>/filter/<genus>`` is created empty: if it
    already exists, it is deleted with everything in it and created again.
    Missing parent directories (for example ``filter`` on a fresh checkout) are
    created as well, instead of failing with ``FileNotFoundError``.

    :param dir_name: Name of the parent directory.
    :type dir_name: str
    :param genus: Name of the genus.
    :type genus: str
    :return: The path to the sequence files and the bloomfilter directory,
        both as ``str``.
    """
    working_dir = Path(os.getcwd())

    # Path to concatenated sequences; it is only computed, never created here.
    files_path = working_dir / "genus_metadata" / dir_name / "concatenate"

    # Path for results.
    result_path = working_dir / "filter" / genus
    try:
        result_path.mkdir(parents=True)
    except FileExistsError:
        # Old bloomfilters must not survive a retraining run, so wipe them.
        rmtree(result_path)
        result_path.mkdir()

    return str(files_path), str(result_path)
47
-
48
-
49
def init_bf(array_size, clonetypes=1, hashes=7, k=21):
    """Create a ``BF_v2.AbaumanniiBloomfilter`` and configure it.

    The object is constructed with ``array_size`` and then configured through
    its ``set_arraysize``, ``set_clonetypes``, ``set_hashes`` and ``set_k``
    setters, in that order.

    :param array_size: The size for the filter array.
    :type array_size: int
    :param clonetypes: Number of clonetypes.
    :type clonetypes: int
    :param hashes: Number of hash functions used.
    :type hashes: int
    :param k: Length of k-mers.
    :type k: int
    :return: The configured bloomfilter object.
    """
    bloom_filter = BF_v2.AbaumanniiBloomfilter(array_size)
    bloom_filter.set_arraysize(array_size)
    bloom_filter.set_clonetypes(clonetypes)
    bloom_filter.set_hashes(hashes)
    bloom_filter.set_k(k)
    return bloom_filter
68
-
69
-
70
def new_train_core(files_path, result_path, array_size, k=21):
    """Train one bloomfilter per concatenated genome file and save it.

    Every directory entry whose name contains ``fna`` or ``fasta`` is treated
    as a sequence file. The part of the file name before the first dot is
    used as the species name, and the filter is saved as
    ``<result_path>/<species>.txt``.

    :param files_path: Path to where the concatenated sequences are stored.
    :type files_path: str
    :param result_path: Path where the generated bloomfilters will be saved.
    :type result_path: str
    :param array_size: Array size for the bloomfilters.
    :type array_size: int
    :param k: Length of substring.
    :type k: int
    """
    sequence_files = [
        entry
        for entry in os.listdir(files_path)
        if "fna" in entry or "fasta" in entry
    ]

    # Train a fresh bloomfilter for each species.
    for sequence_file in sequence_files:
        bloom_filter = init_bf(array_size=array_size, clonetypes=1, hashes=7, k=k)
        species_name = sequence_file.split(".")[0]
        logger.info("Training {name}", name=species_name)
        bloom_filter.train_sequence(Path(files_path) / sequence_file, 0)
        bloom_filter.save_clonetypes(Path(result_path) / (species_name + ".txt"))
        bloom_filter.cleanup()
101
-
102
-
103
def new_write_file_dyn(bf_path, genus, meta_mode=False):
    """Pickle the sorted list of bloomfilter names found in ``bf_path``.

    In meta mode only entries whose name contains ``genus`` are kept and the
    list is written to ``Filter<genus>Complete.txt``. Otherwise entries whose
    name contains ``txt`` are kept and the list goes to ``Filter<genus>.txt``.
    In both cases the last four characters of each kept name are cut off. The
    output file is written to ``<cwd>/filter/species_names``.

    :param bf_path: Path to the bloomfilters.
    :type bf_path: str
    :param genus: Name of the genus.
    :type genus: str
    :param meta_mode: Declare to which bloomfilters the path leads.
    :type meta_mode: bool
    """
    entries = os.listdir(bf_path)
    if meta_mode:
        # The path leads to bloomfilters for the metagenome mode.
        names = [entry[:-4] for entry in entries if genus in entry]
        file_name = "Filter" + genus + "Complete.txt"
    else:
        # The path leads to bloomfilters for the species.
        names = [entry[:-4] for entry in entries if "txt" in entry]
        file_name = "Filter" + genus + ".txt"
    names.sort()

    target = Path(os.getcwd()) / "filter" / "species_names" / file_name
    with open(target, "wb") as handle:
        pickle.dump(names, handle)
136
-
137
-
138
def save_array_sizes(genus, array_sizes):
    """Save the bloomfilter array sizes of a genus as a text file.

    The sizes are joined with single spaces (``'size1 size2'``) and written to
    ``<cwd>/filter/array_sizes/<genus>.txt``. By convention the first size
    belongs to the species level filters and the second to the meta-mode
    filter.

    :param genus: The current genus.
    :type genus: str
    :param array_sizes: List of all computed array sizes for this genus.
    :type array_sizes: list[str]
    """
    # Build the text before touching the file system, so a bad list leaves no file.
    text = " ".join(array_sizes)
    target = Path(os.getcwd()) / "filter" / "array_sizes" / (genus + ".txt")
    target.write_text(text, encoding="utf-8")
154
-
155
-
156
def save_name_dict(genus, name_dict: dict):
    """Intended to save species names and taxon IDs; currently a no-op.

    The function is meant to store the names and taxon IDs of all species for
    which filters were trained, as a csv file, so XspecT can switch between a
    species name and its ID. No saving logic is implemented: the call writes
    nothing and returns ``None``.

    :param genus: The genus for which filters were trained.
    :param name_dict: The dictionary with all species names and taxon IDs.
    """
    return None
163
-
164
-
165
def save_time_stats(time_stats, dir_name):
    """Write the collected time measurements to ``time.txt``.

    The file is ``<cwd>/genus_metadata/<dir_name>/time.txt``; an existing file
    is overwritten.

    :param time_stats: The collected time measurements as a formatted string.
    :type time_stats: str
    :param dir_name: Name of the parent directory.
    :type dir_name: str
    """
    run_dir = Path(os.getcwd()).joinpath("genus_metadata", dir_name)
    with open(str(run_dir / "time.txt"), "w+", encoding="utf-8") as handle:
        handle.write(time_stats)
176
-
177
-
178
def load_translation_dict(genus: str) -> dict[str, str]:
    """Load the pickled translation dict of a genus.

    The dict is read from ``<cwd>/filter/translation_dicts/<genus>.pickle``.
    Its keys are taxon IDs and its values the scientific names.

    :param genus: The name of the genus.
    :return: The translation dict for the genus.
    """
    pickle_path = Path(os.getcwd()).joinpath(
        "filter", "translation_dicts", f"{genus}.pickle"
    )
    # NOTE(review): unpickling can execute arbitrary code, so this file must
    # only ever come from a trusted training run.
    with open(pickle_path, "rb") as handle:
        return pickle.load(handle)
190
-
191
-
192
def main():
    """Print the array size for a fixed example k-mer count.

    The raw size is increased by one million and rounded to the nearest
    million before printing.
    """
    example_k_mer_count = 28858023
    raw_size = compute_array_size(example_k_mer_count)
    print(int(round(raw_size + 1000000, -6)))
    # genera = get_genera_array_sizes()
    # print(f"Species: ")
    # print(pre_process_all(genera, meta_mode=False))
    # print(f"\nMeta: ")
    # print(pre_process_all(genera, meta_mode=True))


if __name__ == "__main__":
    main()
@@ -1,162 +0,0 @@
1
- import subprocess as sp
2
- from linecache import getline
3
- from os import listdir, remove, getcwd
4
- from pathlib import Path
5
- from time import perf_counter, localtime, asctime
6
-
7
- from loguru import logger
8
- import numpy as np
9
-
10
- import xspect.XspecT_trainer
11
-
12
-
13
def get_seq_paths(dir_name: str):
    """Map species names to the paths of their concatenated fasta files.

    Looks into ``<cwd>/genus_metadata/<dir_name>/concatenate`` and keeps every
    entry whose last dot-separated part is ``fasta``. The part before the
    first dot is used as the species name.

    :param dir_name: Name of the directory.
    :return: Dictionary with species names as keys and file paths (``str``)
        as values.
    """
    concat_dir = Path(getcwd()) / "genus_metadata" / dir_name / "concatenate"
    sequence_dict = {}
    # Walk the listing back to front to keep the historical insertion order.
    for entry in reversed(listdir(concat_dir)):
        file_name = str(entry)
        name_parts = file_name.split(".")
        if name_parts[-1] == "fasta":
            sequence_dict[name_parts[0]] = str(concat_dir / file_name)

    return sequence_dict
35
-
36
-
37
def jellyfish_count(command: str):
    """Run a jellyfish count command as a subprocess.

    The command string is split on single spaces and executed without a
    shell, so no argument may contain a space. The exit status is not checked.

    :param command: The jellyfish command with all chosen parameters.
    """
    argv = command.split(" ")
    sp.run(argv)
43
-
44
-
45
def jellyfish_stats(command: str) -> int:
    """Run a jellyfish stats command and read the count from its output.

    The command string is split on single spaces and executed without a
    shell. The count is taken from the second line of stdout: all spaces are
    removed and the text after the first colon is parsed as an integer.

    :param command: The jellyfish command with all chosen parameters.
    :return: The count of all distinct k-mers.
    """
    completed = sp.run(command.split(" "), stdout=sp.PIPE, text=True)
    second_line = completed.stdout.split("\n")[1]
    fields = second_line.replace(" ", "").split(":")
    return int(fields[1])
53
-
54
-
55
def count_k_meres(sequence_dict, k=21):
    """Count the distinct k-mers of every sequence file using jellyfish.

    For each file a ``jellyfish count`` run writes to ``<cwd>/output``, then
    ``jellyfish stats`` reads the count back from that file. The output file
    is reused for every species.

    :param sequence_dict: Dictionary with all sequence paths.
    :type sequence_dict: dict[str, str]
    :param k: K-mer length.
    :type k: int
    :return: List of ``(species name, distinct k-mer count)`` tuples.
    """
    # Parameters shared by all jellyfish runs.
    hash_size = "100M"
    num_threads = "4"
    output_name = str(Path(getcwd()) / "output")
    stats_command = "jellyfish stats " + output_name

    total = len(sequence_dict)
    k_mer_of_species = []
    for done, (species_name, file_path) in enumerate(sequence_dict.items()):
        logger.info(
            "{num} files left to count. Counting {name}",
            num=total - done,
            name=species_name,
        )

        count_command = (
            f"jellyfish count -m {k} -o {output_name} -C -s {hash_size}"
            f" -t {num_threads} {file_path}"
        )
        jellyfish_count(count_command)
        k_mer_of_species.append((species_name, jellyfish_stats(stats_command)))

    return k_mer_of_species
105
-
106
-
107
def sort_k_mer_counts(k_mer_counts):
    """Sort species by k-mer count, highest count first.

    The result is a numpy structured array with the fields ``species``
    (byte string of at most 50 bytes) and ``k_mer_count`` (int).

    :param k_mer_counts: List of all species with their k-mer counts.
    :type k_mer_counts: list[tuple[str, int]]
    :return: Sorted array beginning with the highest k-mer count.
    """
    record_type = np.dtype([("species", "S50"), ("k_mer_count", int)])
    records = np.array(k_mer_counts, dtype=record_type)
    # Sort ascending in place, then hand out the reversed view.
    records.sort(order="k_mer_count")
    return records[::-1]
122
-
123
-
124
def get_highest_k_mer_count(dir_name, k=21):
    """Get the highest species k-mer count and the k-mer count of the genus.

    The genus name is the part of ``dir_name`` before the first underscore;
    the genus sequence is expected at
    ``<cwd>/genus_metadata/<dir_name>/<genus>.fasta``.

    :param dir_name: Name of the parent directory.
    :type dir_name: str
    :param k: K-mer length.
    :type k: int
    :return: List of the highest k-mer count of all species and the k-mer
        count of all sequences united.
    """
    # Highest k-mer count over all species.
    species_counts = count_k_meres(get_seq_paths(dir_name), k=k)
    ranked_counts = sort_k_mer_counts(species_counts)
    # Uncomment if the k-mer counts should be saved.
    # save_count(k_mer_sorted, dir_name)

    # Distinct k-mers of the whole genus.
    genus = dir_name.split("_")[0]
    genus_fasta = str(
        Path(getcwd()) / "genus_metadata" / dir_name / (genus + ".fasta")
    )
    genus_counts = count_k_meres({genus: genus_fasta}, k=k)

    return [ranked_counts[0][1], genus_counts[0][1]]
149
-
150
-
151
def main():
    """List the sequences of a fixed example run and print a timing line.

    Nothing happens between the two timer reads, so the printed duration only
    covers the timer calls themselves.
    """
    dir_name = "Listeria_14_12_2022_21-5-13"
    get_seq_paths(dir_name)

    start = perf_counter()
    end = perf_counter()
    elapsed_minutes = (end - start) / 60
    print(f"time: {elapsed_minutes}")


if __name__ == "__main__":
    main()