XspecT 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of XspecT might be problematic. Click here for more details.

Files changed (58) hide show
  1. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/METADATA +23 -29
  2. XspecT-0.2.0.dist-info/RECORD +30 -0
  3. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/WHEEL +1 -1
  4. xspect/definitions.py +42 -0
  5. xspect/download_filters.py +11 -26
  6. xspect/fastapi.py +101 -0
  7. xspect/file_io.py +34 -103
  8. xspect/main.py +70 -66
  9. xspect/model_management.py +88 -0
  10. xspect/models/__init__.py +0 -0
  11. xspect/models/probabilistic_filter_model.py +277 -0
  12. xspect/models/probabilistic_filter_svm_model.py +169 -0
  13. xspect/models/probabilistic_single_filter_model.py +109 -0
  14. xspect/models/result.py +148 -0
  15. xspect/pipeline.py +201 -0
  16. xspect/run.py +38 -0
  17. xspect/train.py +304 -0
  18. xspect/train_filter/create_svm.py +6 -183
  19. xspect/train_filter/extract_and_concatenate.py +117 -121
  20. xspect/train_filter/html_scrap.py +16 -28
  21. xspect/train_filter/ncbi_api/download_assemblies.py +7 -8
  22. xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +9 -17
  23. xspect/train_filter/ncbi_api/ncbi_children_tree.py +3 -2
  24. xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +7 -5
  25. XspecT-0.1.3.dist-info/RECORD +0 -49
  26. xspect/BF_v2.py +0 -637
  27. xspect/Bootstrap.py +0 -29
  28. xspect/Classifier.py +0 -142
  29. xspect/OXA_Table.py +0 -53
  30. xspect/WebApp.py +0 -724
  31. xspect/XspecT_mini.py +0 -1363
  32. xspect/XspecT_trainer.py +0 -611
  33. xspect/map_kmers.py +0 -155
  34. xspect/search_filter.py +0 -504
  35. xspect/static/How-To.png +0 -0
  36. xspect/static/Logo.png +0 -0
  37. xspect/static/Logo2.png +0 -0
  38. xspect/static/Workflow_AspecT.png +0 -0
  39. xspect/static/Workflow_ClAssT.png +0 -0
  40. xspect/static/js.js +0 -615
  41. xspect/static/main.css +0 -280
  42. xspect/templates/400.html +0 -64
  43. xspect/templates/401.html +0 -62
  44. xspect/templates/404.html +0 -62
  45. xspect/templates/500.html +0 -62
  46. xspect/templates/about.html +0 -544
  47. xspect/templates/home.html +0 -51
  48. xspect/templates/layoutabout.html +0 -87
  49. xspect/templates/layouthome.html +0 -63
  50. xspect/templates/layoutspecies.html +0 -468
  51. xspect/templates/species.html +0 -33
  52. xspect/train_filter/README_XspecT_Erweiterung.md +0 -119
  53. xspect/train_filter/get_paths.py +0 -35
  54. xspect/train_filter/interface_XspecT.py +0 -204
  55. xspect/train_filter/k_mer_count.py +0 -162
  56. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/LICENSE +0 -0
  57. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/entry_points.txt +0 -0
  58. {XspecT-0.1.3.dist-info → XspecT-0.2.0.dist-info}/top_level.txt +0 -0
xspect/train.py ADDED
@@ -0,0 +1,304 @@
1
+ """
2
+ This module contains the main functions for training the models.
3
+ """
4
+
5
+ import os
6
+ import shutil
7
+ from pathlib import Path
8
+ import sys
9
+ from time import localtime, perf_counter, asctime, sleep
10
+ from loguru import logger
11
+ from xspect.definitions import get_xspect_model_path, get_xspect_tmp_path
12
+ from xspect.file_io import concatenate_meta
13
+ from xspect.models.probabilistic_filter_svm_model import ProbabilisticFilterSVMModel
14
+ from xspect.models.probabilistic_single_filter_model import (
15
+ ProbabilisticSingleFilterModel,
16
+ )
17
+ from xspect.train_filter.ncbi_api import (
18
+ ncbi_assembly_metadata,
19
+ ncbi_taxon_metadata,
20
+ ncbi_children_tree,
21
+ download_assemblies,
22
+ )
23
+ from xspect.train_filter import (
24
+ create_svm,
25
+ html_scrap,
26
+ extract_and_concatenate,
27
+ )
28
+
29
+
30
def check_user_input(user_input: str):
    """Validate that the user input names a bacterial genus known to NCBI.

    Queries NCBI taxon metadata for the given name or taxon ID. If the input
    resolves to a genus-rank taxon its scientific name is returned;
    non-bacterial genera require interactive confirmation on stdin. Any other
    rank terminates the program via sys.exit().

    :param user_input: Genus name or taxon ID as entered by the user.
    :return: The scientific genus name.
    """
    taxon_metadata = ncbi_taxon_metadata.NCBITaxonMetadata([user_input])
    all_metadata = taxon_metadata.get_metadata()
    for metadata in all_metadata.values():
        sci_name = metadata["sci_name"]
        tax_id = metadata["tax_id"]
        rank = metadata["rank"]
        lineage = metadata["lineage"]
        bacteria_id = 2  # NCBI taxon ID of Bacteria.
        # Compare the taxon ID as a string: user_input is always a string, so
        # the original int == str comparison could never match an ID input.
        if sci_name != user_input and str(tax_id) != user_input:
            print(
                f"{get_current_time()}| The given genus: {user_input} was found as"
                f" genus: {sci_name} ID: {tax_id}"
            )
            print(f"{get_current_time()}| Using {sci_name} as genus name.")
        if rank == "GENUS":
            if bacteria_id not in lineage:
                print(f"{get_current_time()}| The given genus is not a bacteria.")
                print(f"{get_current_time()}| Do you want to continue: [y/n]")
                choice = input("-> ").lower()
                if choice == "y":
                    return str(sci_name)
                print(f"{get_current_time()}| Exiting...")
                sys.exit()
            return str(sci_name)
        print(f"{get_current_time()}| {user_input} is rank {rank} and not genus.")
        sys.exit()
61
+
62
+
63
def copy_custom_data(bf_path: str, svm_path: str, dir_name: str):
    """Copy user-provided bloomfilter and SVM training files into the
    genus-metadata directory layout expected by the training pipeline.

    :param bf_path: Directory containing the bloomfilter (concatenated
        assembly) files to copy.
    :param svm_path: Directory containing the SVM training files to copy.
    :param dir_name: Name of the target genus-metadata directory.
    """
    # NOTE(review): this works under <cwd>/genus_metadata while the rest of
    # the module uses get_xspect_tmp_path() — confirm which is intended.
    path = Path(os.getcwd()) / "genus_metadata" / dir_name
    new_bf_path = path / "concatenate"
    new_svm_path = path / "training_data"

    # Create the target directories; parents=True also creates the
    # "genus_metadata" parent on a fresh working directory (the original
    # mkdir calls failed with FileNotFoundError in that case).
    new_bf_path.mkdir(parents=True, exist_ok=True)
    new_svm_path.mkdir(parents=True, exist_ok=True)

    # Copy bloomfilter files (copy2 preserves file metadata).
    for file in os.listdir(bf_path):
        shutil.copy2(Path(bf_path) / file, new_bf_path / file)

    # Copy svm files.
    for file in os.listdir(svm_path):
        shutil.copy2(Path(svm_path) / file, new_svm_path / file)
93
+
94
+
95
def set_logger(dir_name: str):
    """Configure the loguru sinks for the current training run.

    Console output (stderr) is limited to INFO level, while a DEBUG-level
    log file is written into the run's temporary directory.

    :param dir_name: Name of the folder where the log should be saved.
    """
    # The directory name starts with the genus: "<genus>_DD_MM_YYYY_...".
    genus = dir_name.split("_")[0]

    log_format = "{time:HH:mm:ss} | {level} | {message}"

    # Drop any previously configured sinks before adding ours.
    logger.remove()
    logger.add(sys.stderr, format=log_format, level="INFO")
    logger.add(
        get_xspect_tmp_path() / dir_name / (genus + ".log"),
        format=log_format,
        level="DEBUG",
    )
107
+
108
+
109
def create_translation_dict(dir_name: str) -> dict[str, str]:
    """Create a dictionary translating taxon-ID file names to display names.

    File names in the "concatenate" directory have the form
    "<tax_id>_<name>.<ext>"; the result maps "<tax_id>.fasta" to <name>.

    :param dir_name: Directory name for current genus.
    :return: The created translation dictionary.
    """
    path = get_xspect_tmp_path() / dir_name / "concatenate"
    translation_dict = {}
    for file in os.listdir(path):
        file_split = file.split(".")[0].split("_")
        tax_id = file_split[0]
        final_file_name = tax_id + ".fasta"
        # Join all remaining parts so names that themselves contain
        # underscores are not truncated to their first token.
        translation_dict[final_file_name] = "_".join(file_split[1:])

    return translation_dict
126
+
127
+
128
def change_bf_assembly_file_names(dir_name: str):
    """Rename every concatenated assembly file to "<tax_id>.fasta".

    Files are expected to be named "<tax_id>_<name>.<ext>"; everything after
    the leading taxon ID is dropped.

    :param dir_name: Directory name for current genus.
    """
    path = get_xspect_tmp_path() / dir_name / "concatenate"
    for entry in os.listdir(path):
        tax_id = entry.split(".")[0].split("_")[0]
        os.rename(path / entry, path / f"{tax_id}.fasta")
140
+
141
+
142
def get_current_time():
    """Return the current local time formatted as hh:mm:ss."""
    # asctime() has the fixed 24-character layout "Www Mmm dd hh:mm:ss yyyy",
    # so the time-of-day field always occupies characters 11-18.
    return asctime(localtime())[11:19]
145
+
146
+
147
def train_ncbi(genus: str, svm_step: int = 1):
    """Train genus and species models with NCBI assemblies from the given genus."""

    if not isinstance(genus, str):
        raise TypeError("genus must be a string")

    # Resolve / confirm the genus name via NCBI.
    genus = check_user_input(user_input=genus)

    # Run directory name format: '<genus>'_DD_MM_YYYY_hh-mm-ss
    now = localtime()
    dir_name = f"{genus}_{now[2]}_{now[1]}_{now[0]}_{now[3]}-{now[4]}-{now[5]}"

    set_logger(dir_name)

    # Wall-clock timing for the whole run.
    start_time = perf_counter()

    # Collect every defined species of the genus.
    logger.info("Getting all species of the genus")
    children_ids = ncbi_children_tree.NCBIChildrenTree(genus).children_ids()
    species_dict = ncbi_taxon_metadata.NCBITaxonMetadata(children_ids).get_metadata()

    # GCF accessions whose taxonomy check result is OK.
    logger.info("Checking ANI data for updates")
    ani_gcf = html_scrap.TaxonomyCheck().ani_gcf()

    # Up to 8 assembly accessions per species, filtered by contig N50.
    logger.info("Getting assembly metadata")
    all_metadata = ncbi_assembly_metadata.NCBIAssemblyMetadata(
        all_metadata=species_dict, ani_gcf=ani_gcf, count=8, contig_n50=10000
    )
    all_metadata = all_metadata.get_all_metadata()

    # At least one species must have accessions.
    if not all_metadata:
        raise ValueError("No species with accessions found")

    # Download one zip per species containing its selected assemblies.
    logger.info("Downloading assemblies for bloomfilter training")
    for metadata in all_metadata.values():
        # Skip species without any accessions.
        if len(metadata["accessions"]) >= 1:
            sleep(5)  # be polite to the NCBI API between downloads
            species_name = metadata["sci_name"]
            tax_id = metadata["tax_id"]
            logger.info("Downloading {id}_{name}", id=tax_id, name=species_name)
            zip_name = f"{tax_id}_{species_name}.zip"

            # The first 4 assemblies train the filters.
            selected = metadata["accessions"][:4]

            download_assemblies.download_assemblies(
                accessions=selected,
                dir_name=dir_name,
                target_folder="zip_files",
                zip_file_name=zip_name,
            )
    logger.info("Concatenating and extracting")

    # Concatenate all assemblies of each species, then build the genus-wide
    # meta file.
    extract_and_concatenate.bf(dir_name=dir_name, delete=True)
    concatenate_meta(get_xspect_tmp_path() / dir_name, genus)

    # Assemblies for SVM creation: every taxon that has any accessions.
    logger.info("Downloading assemblies for support-vector-machine training")
    accessions = {
        metadata["tax_id"]: metadata["accessions"]
        for metadata in all_metadata.values()
        if len(metadata["accessions"]) >= 1
    }

    create_svm.get_svm_assemblies(all_accessions=accessions, dir_name=dir_name)

    logger.info("Extracting SVM assemblies")

    extract_and_concatenate.svm(
        species_accessions=accessions, dir_name=dir_name, delete=True
    )

    # Map taxon-ID file names back to scientific names, then strip the
    # names from the concatenated files.
    translation_dict = create_translation_dict(dir_name)
    change_bf_assembly_file_names(dir_name)

    species_files_path = get_xspect_tmp_path() / dir_name / "concatenate"
    species_result_path = get_xspect_model_path() / genus

    # Genus-level (metagenome) bloomfilter model.
    logger.info("Training metagenome model")
    mg_files_path = get_xspect_tmp_path() / dir_name

    genus_model = ProbabilisticSingleFilterModel(
        k=21,
        model_display_name=genus,
        author="Test",
        author_email="test@example.com",
        model_type="Genus",
        base_path=Path(species_result_path).parent,
    )
    genus_model.fit(mg_files_path / f"{genus}.fasta", genus)
    genus_model.save()

    logger.info("Training species model")

    species_model = ProbabilisticFilterSVMModel(
        k=21,
        model_display_name=genus,
        author="Test",
        author_email="test@example.com",
        model_type="Species",
        base_path=Path(species_result_path).parent,
        kernel="rbf",
        c=1.0,
    )
    svm_dir = get_xspect_tmp_path() / dir_name / "training_data"
    species_model.fit(
        Path(species_files_path),
        svm_dir,
        display_names=translation_dict,
        svm_step=svm_step,
    )
    species_model.save()

    # Remove the temporary run directory.
    shutil.rmtree(get_xspect_tmp_path() / dir_name)

    end_time = perf_counter()

    logger.info(
        "Program runtime: {time} m", time=(round((end_time - start_time) / 60, 2))
    )
    logger.info("XspecT-trainer is finished.")
283
+
284
+
285
def train_from_directory(display_name: str, dir_path: Path, meta: bool = False):
    """Train the gene family and gene filter from a local directory.

    :param display_name: Name of the model.
    :param dir_path: Input directory containing the training data.
    :param meta: Whether to additionally train a metagenome model.
    :raises TypeError: If display_name is not a string.
    :raises ValueError: If dir_path is not a Path to an existing directory.
    """

    if not isinstance(display_name, str):
        raise TypeError("display_name must be a string")

    # Validate the type first so .is_dir() is never called on a non-Path
    # object. The original condition was inverted: a nonexistent Path was
    # silently accepted and a non-Path raised AttributeError instead of
    # ValueError. Path.is_dir() is False for nonexistent paths, so a
    # separate exists() check is unnecessary.
    if not isinstance(dir_path, Path) or not dir_path.is_dir():
        raise ValueError("dir must be Path object to a valid directory")

    # check if the directory contains the necessary files
    # copy to temp path
    # check if svm training data exists
    # train model, with svm data if it exists
    # add display names
    # train metagenome model
    # clean up temp path
@@ -1,19 +1,7 @@
1
- import csv
2
- import os
3
- import pickle
4
- from pathlib import Path
5
- from time import sleep
1
+ """This module contains functions to select and download assemblies for SVM creation."""
6
2
 
7
- from Bio import SeqIO
3
+ from time import sleep
8
4
  from loguru import logger
9
-
10
- import xspect.BF_v2 as BF_v2
11
- from xspect.file_io import (
12
- delete_non_fasta,
13
- get_accessions,
14
- get_file_paths,
15
- get_species_names,
16
- )
17
5
  from xspect.train_filter.ncbi_api import download_assemblies
18
6
 
19
7
 
@@ -24,17 +12,11 @@ def select_assemblies(accessions):
24
12
  :type accessions: dict
25
13
  :return: Dict with species name as key and selected accessions as value.
26
14
  """
27
- all_accessions = {}
28
15
 
29
- for sci_name, current_accessions in accessions.items():
30
- selected_accessions = []
31
- # Select 4 assemblies beginning from the last one.
32
- for i in range(len(current_accessions) - 1, -1, -1):
33
- selected_accessions.append(current_accessions[i])
34
- if len(selected_accessions) == 4:
35
- break
36
-
37
- all_accessions[sci_name] = selected_accessions
16
+ all_accessions = {
17
+ sci_name: curr_accessions[-4:]
18
+ for sci_name, curr_accessions in accessions.items()
19
+ }
38
20
 
39
21
  return all_accessions
40
22
 
@@ -61,162 +43,3 @@ def get_svm_assemblies(all_accessions, dir_name):
61
43
  target_folder="training_data_zipped",
62
44
  zip_file_name=file_name,
63
45
  )
64
- logger.info("Downloads finished")
65
-
66
-
67
- def init_bf(genus, array_size, hashes=7, k=21):
68
- """Initializes bloomfilter.
69
-
70
- :param genus: Name of the genus.
71
- :type genus: str
72
- :param array_size: Size of the bloomfilter.
73
- :type array_size: int
74
- :param hashes: The number of hash functions the bf uses.
75
- :type hashes: int
76
- :param k: Length of k-mers.
77
- :type k: int
78
- :return: The bloomfilter object.
79
- """
80
- path = Path(os.getcwd()) / "filter"
81
-
82
- # Initialize bloomfilter for genus.
83
- BF = BF_v2.AbaumanniiBloomfilter(array_size)
84
- BF.set_arraysize(array_size)
85
- BF.set_hashes(hashes)
86
- BF.set_k(k)
87
-
88
- # Get all species names.
89
- names_path = path / "species_names" / ("Filter" + genus + ".txt")
90
- with open(names_path, "rb") as f:
91
- clonetypes = pickle.load(f)
92
-
93
- # Get bloomfilter paths.
94
- bf_path = path / genus
95
- paths = sorted(os.listdir(bf_path))
96
- for i in range(len(paths)):
97
- paths[i] = str(bf_path / str(paths[i]))
98
- # Setup bloomfilters.
99
- BF.read_clonetypes(paths, clonetypes)
100
-
101
- return BF
102
-
103
-
104
- def perform_lookup(bloomfilter, files, file_paths, accessions, names, spacing):
105
- """Performs a lookup on a bloomfilter object and gives the scores as a list.
106
-
107
- :param bloomfilter: The bloomfilter object on which the lookup is performed.
108
- :param files: List of file names.
109
- :type files: list[str]
110
- :param file_paths: List with the file paths.
111
- :type file_paths: list[str]
112
- :param accessions: List of all accessions.
113
- :type accessions: list[str]
114
- :param names: List with all species names.
115
- :type names: list[str]
116
- :return: List with all scores of the lookup.
117
- """
118
- scores = list()
119
- BF = bloomfilter
120
-
121
- # Lookup.
122
- for i in range(len(files)):
123
- BF.number_of_kmeres = 0
124
- BF.hits_per_filter = [0] * BF.clonetypes
125
-
126
- for sequence in SeqIO.parse(file_paths[i], "fasta"):
127
- # Dominik: changed sample size to var
128
- for j in range(0, len(sequence.seq) - BF.k, spacing):
129
- BF.number_of_kmeres += 1
130
- BF.lookup_canonical(str(sequence.seq[j : j + BF.k]))
131
-
132
- score = BF.get_score()
133
- score = [str(x) for x in score]
134
- score = ",".join(score)
135
- scores.append(accessions[i] + "," + score + "," + names[i])
136
-
137
- return scores
138
-
139
-
140
- # https://stackoverflow.com/questions/21431052/sort-list-of-strings-by-a-part-of-the-string
141
- def sort_list(scores, names):
142
- """Sorts the scores list by species name.
143
-
144
- :param scores: The scores gathered by a lookup of a bloomfilter.
145
- :type scores: list
146
- :param names: List with all species names.
147
- :type names: list[str]
148
- :return: The sorted scores list.
149
- """
150
- scores.sort(key=lambda x: x.split(",")[-1][:2])
151
- names = [x for x in names if x != "none"]
152
- names = list(dict.fromkeys(names))
153
- scores.insert(0, sorted(names))
154
- scores[0] = ["File"] + scores[0] + ["Label"]
155
-
156
- for i in range(1, len(scores)):
157
- line = scores[i].split(",")
158
- scores[i] = line
159
-
160
- return scores
161
-
162
-
163
- def save_csv(genus, scores):
164
- """Saves the scores as csv file.
165
-
166
- :param genus: Name of the genus.
167
- :type genus: str
168
- :param scores: The scores gathered by a lookup of a bloomfilter.
169
- :type scores: list
170
- """
171
- training_data_path = Path(os.getcwd()) / "Training_data"
172
- if not os.path.exists(training_data_path):
173
- os.mkdir(training_data_path)
174
-
175
- path = training_data_path / (genus + "_Training_data_spec.csv")
176
- with open(path, "w", newline="") as file:
177
- writer = csv.writer(file)
178
- writer.writerows(scores)
179
-
180
-
181
- # Dominik: added spacing
182
- def new_helper(spacing, genus, dir_name, array_size, k=21):
183
- """Create support vector machine for bloomfilters of a genus.
184
-
185
- :param spacing:
186
- :param genus: Name of the genus.
187
- :type genus: str
188
- :param dir_name: Name of the parent directory.
189
- :type dir_name: str
190
- :param array_size: Size for the byte array which is the bloomfilter.
191
- :type array_size: int
192
- :param k: Length of the k-mers.
193
- :type k: int
194
- """
195
- # Get all files.
196
- base_path = Path(os.getcwd()) / "genus_metadata" / dir_name / "training_data"
197
- files = os.listdir(base_path)
198
-
199
- # Delete all non fasta files.
200
- files = delete_non_fasta(files)
201
-
202
- # Get accessions from file names.
203
- accessions = get_accessions(files)
204
-
205
- # Get all complete file paths.
206
- file_paths = get_file_paths(base_path, files)
207
-
208
- # Get all species names from the header in the fasta files.
209
- names = get_species_names(file_paths)
210
-
211
- # Initialize bloomfilter.
212
- bf = init_bf(genus, array_size)
213
-
214
- # Perform lookup on bloomfilter.
215
- # Dominik: added spacing
216
- scores = perform_lookup(bf, files, file_paths, accessions, names, spacing)
217
-
218
- # Sort score list by species names.
219
- scores = sort_list(scores, names)
220
-
221
- # Save results in csv file.
222
- save_csv(genus, scores)