miga-base 1.2.18.2 → 1.3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. checksums.yaml +4 -4
  2. data/lib/miga/cli/action/doctor/base.rb +2 -1
  3. data/lib/miga/cli/action/init.rb +1 -1
  4. data/lib/miga/dataset/result/add.rb +3 -2
  5. data/lib/miga/result/stats.rb +4 -3
  6. data/lib/miga/version.rb +2 -2
  7. data/scripts/essential_genes.bash +4 -8
  8. data/utils/FastAAI/LICENSE +8 -0
  9. data/utils/FastAAI/README.md +151 -40
  10. data/utils/FastAAI/__init__.py +1 -0
  11. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz +0 -0
  12. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz +0 -0
  13. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz +0 -0
  14. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz +0 -0
  15. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz +0 -0
  16. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz +0 -0
  17. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz +0 -0
  18. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz +0 -0
  19. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz +0 -0
  20. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz +0 -0
  21. data/utils/FastAAI/fastaai/__init__.py +1 -0
  22. data/utils/FastAAI/fastaai/fastaai +4805 -0
  23. data/utils/FastAAI/fastaai/fastaai.py +4805 -0
  24. data/utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py +297 -0
  25. data/utils/FastAAI/fastaai/fastaai_miga_preproc.py +931 -0
  26. data/utils/FastAAI/metadata/Accession_names_and_IDs.txt +122 -0
  27. data/utils/distance/commands.rb +55 -27
  28. data/utils/distance/database.rb +3 -0
  29. data/utils/distance/temporal.rb +10 -1
  30. metadata +23 -6
  31. data/utils/FastAAI/FastAAI +0 -3659
  32. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Archaea_SCG.hmm +0 -0
  33. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Bacteria_SCG.hmm +0 -0
  34. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -0
@@ -0,0 +1,4805 @@
1
+ #!/usr/bin/env python3
2
+
3
+ ################################################################################
4
+ """---0.0 Import Modules---"""
5
+ import subprocess
6
+ import argparse
7
+ import datetime
8
+ import shutil
9
+ import textwrap
10
+ import multiprocessing
11
+ import pickle
12
+ import gzip
13
+ import tempfile
14
+ #Shouldn't play any role.
15
+ #from random import randint
16
+
17
+ #We could probably remove Path, too.
18
+ #This as well
19
+ import time
20
+ from collections import defaultdict
21
+ import sys
22
+ import os
23
+ from math import floor
24
+ import sqlite3
25
+ #numpy dependency
26
+ import numpy as np
27
+ import io
28
+ import random
29
+
30
+ import pyrodigal as pd
31
+ import pyhmmer
32
+
33
+ from collections import namedtuple
34
+
35
+ from math import ceil
36
+
37
+ import re
38
+
39
+
40
class progress_tracker:
    """Console progress bar that re-renders every `step_size` percent of `total` updates.

    Printing failures are deliberately non-fatal: a progress bar should never
    crash the pipeline it reports on.
    """

    def __init__(self, total, step_size = 2, message = None, one_line = True):
        self.current_count = 0
        self.max_count = total
        #Book keeping.
        self.start_time = None
        self.end_time = None
        #Show progress every [step] percent.
        self.step = step_size
        #Width of the bar in '#' characters when full.
        self.justify_size = ceil(100 / self.step)
        self.last_percent = 0
        self.message = message

        #When True, rewrite the same console line (ANSI cursor-up) instead of appending.
        self.pretty_print = one_line

        self.start()

    def curtime(self):
        """Return the current wall-clock time formatted as 'dd/mm/YYYY HH:MM:SS'."""
        time_format = "%d/%m/%Y %H:%M:%S"
        timer = datetime.datetime.now()
        time = timer.strftime(time_format)
        return time

    def start(self):
        """Print the optional message and the initial (empty) bar."""
        print("")
        if self.message is not None:
            print(self.message)

        try:
            percentage = (self.current_count / self.max_count) * 100
            sys.stdout.write("Completion".rjust(3) + ' |' + ('#' * int(percentage / self.step)).ljust(self.justify_size) + '| ' + ('%.2f' % percentage).rjust(7) + '% ( ' + str(self.current_count) + " of " + str(self.max_count) + ' ) at ' + self.curtime() + "\n")
            sys.stdout.flush()
        except Exception:
            #It's not really a big deal if the progress bar cannot be printed
            #(e.g. max_count == 0 or a closed stdout).
            pass

    def update(self):
        """Advance the counter by one and redraw the bar when a step boundary is crossed."""
        self.current_count += 1
        percentage = (self.current_count / self.max_count) * 100
        try:
            if percentage // self.step > self.last_percent:
                if self.pretty_print:
                    sys.stdout.write('\033[A')
                sys.stdout.write("Completion".rjust(3) + ' |' + ('#' * int(percentage / self.step)).ljust(self.justify_size) + '| ' + ('%.2f' % percentage).rjust(7) + '% ( ' + str(self.current_count) + " of " + str(self.max_count) + ' ) at ' + self.curtime() + "\n")
                sys.stdout.flush()
                self.last_percent = percentage // self.step
            #Bar is always full at the end.
            #Fixed: this previously tested the undefined name `count`, so the
            #NameError was swallowed by the bare except and the final full bar
            #was never printed.
            if self.current_count == self.max_count:
                if self.pretty_print:
                    sys.stdout.write('\033[A')
                sys.stdout.write("Completion".rjust(3) + ' |' + ('#' * self.justify_size).ljust(self.justify_size) + '| ' + ('%.2f' % percentage).rjust(7) + '% ( ' + str(self.current_count) + " of " + str(self.max_count) + ' ) at ' + self.curtime() + "\n")
                sys.stdout.flush()
                #Add space at end.
                print("")
        except Exception:
            #It's not really a big deal if the progress bar cannot be printed.
            pass
98
+
99
+
100
#Deserialization helpers: SQLite stores kmer/score arrays as raw bytestrings;
#these rebuild the original numpy arrays (zero-copy views via frombuffer).
def convert_array(bytestring):
    """Reinterpret a bytestring as a flat array of 32-bit integers."""
    return np.frombuffer(bytestring, dtype=np.int32)

def convert_float_array_16(bytestring):
    """Reinterpret a bytestring as a flat array of 16-bit floats."""
    return np.frombuffer(bytestring, dtype=np.float16)

def convert_float_array_32(bytestring):
    """Reinterpret a bytestring as a flat array of 32-bit floats."""
    return np.frombuffer(bytestring, dtype=np.float32)

def convert_float_array_64(bytestring):
    """Reinterpret a bytestring as a flat array of 64-bit floats."""
    return np.frombuffer(bytestring, dtype=np.float64)
112
+
113
def read_fasta(file):
    """Parse a FASTA file (plain or gzipped, via agnostic_reader).

    Returns a pair of dicts keyed by sequence ID (first whitespace-delimited
    token of the header, without '>'): the concatenated sequences and the
    remainder of each defline.
    """
    sequences = {}
    deflines = {}
    seq_id = ""
    chunks = []

    handle = agnostic_reader(file)
    for line in handle:
        if line.startswith(">"):
            #Flush the previous record, if any sequence was collected for it.
            joined = "".join(chunks)
            if joined:
                sequences[seq_id] = joined
                deflines[seq_id] = desc
            chunks = []
            header = line.strip()
            seq_id = header.split()[0][1:]
            #Description is everything after '>' + ID.
            desc = header[len(seq_id) + 1:].strip()
        else:
            chunks.append(line.strip())

    handle.close()

    #Flush the final record.
    joined = "".join(chunks)
    if joined:
        sequences[seq_id] = joined
        deflines[seq_id] = desc

    return sequences, deflines
142
+
143
class fasta_file:
    """In-memory FASTA record container.

    After def_import_file(), `self.contents` maps each sequence ID to a
    namedtuple with fields (seqid, description, sequence).
    """

    def __init__(self, file, type = "genome"):
        self.file_path = os.path.abspath(file)
        self.name = os.path.basename(file)
        self.no_ext = os.path.splitext(self.name)[0]
        #Record kind, e.g. "genome" or "protein"; informational only here.
        self.type = type

        self.tuple_structure = namedtuple("fasta", ["seqid", "description", "sequence"])
        self.contents = {}

    def convert(self, contents, descriptions):
        """Store each sequence as a namedtuple keyed by its sequence ID.

        Bug fix: the original assigned the tuple to `self.contents` itself,
        clobbering the dict and retaining only the final record.
        """
        for protein in contents:
            self.contents[protein] = self.tuple_structure(seqid = protein,
                description = descriptions[protein], sequence = contents[protein])

    def def_import_file(self):
        """Read self.file_path and populate self.contents."""
        contents, descriptions = read_fasta(self.file_path)
        self.convert(contents, descriptions)
161
+
162
class pyhmmer_manager:
    """Drive a pyhmmer search of proteins against bacterial/archaeal single-copy
    marker HMMs, vote on the genome's domain, and keep the best hit per marker.

    Fixes vs. original: `assign_domain` now sets the declared attribute
    `archaeal_fraction` (was the misspelled `aechaeal_fraction`, leaving
    `archaeal_fraction` permanently None), and `run_for_fastaai` no longer
    references the undefined name `output` in its error messages.
    """

    def __init__(self, do_compress):
        self.hmm_model = []
        self.hmm_model_optimized = None

        #Digitized pyhmmer sequences queued for the next search.
        self.proteins_to_search = []
        self.protein_descriptions = None

        #Parallel lists filled by execute_search().
        self.hmm_result_proteins = []
        self.hmm_result_accessions = []
        self.hmm_result_scores = []

        #hmmsearch-style tab-separated output lines.
        self.printable_lines = []

        self.bacterial_SCPs = None
        self.archaeal_SCPs = None
        self.assign_hmm_sets()
        self.domain_counts = {"Bacteria" : 0, "Archaea": 0}
        #Starts as the marker-set sizes used as vote denominators; replaced by
        #the winning domain name (a string) after assign_domain().
        self.voted_domain = {"Bacteria" : len(self.bacterial_SCPs), "Archaea" : len(self.archaeal_SCPs)}

        self.bacterial_fraction = None
        self.archaeal_fraction = None

        #protein name -> SQL-safe accession of its best marker hit.
        self.best_hits = None

        #Gzip the HMM report when True.
        self.do_compress = do_compress

    def optimize_models(self):
        """Try to pre-compute platform-optimized profiles; quietly fall back on failure."""
        try:
            self.hmm_model_optimized = []

            for hmm in self.hmm_model:
                prof = pyhmmer.plan7.Profile(M = hmm.insert_emissions.shape[0], alphabet = pyhmmer.easel.Alphabet.amino())
                prof.configure(hmm = hmm, background = pyhmmer.plan7.Background(alphabet = pyhmmer.easel.Alphabet.amino()), L = hmm.insert_emissions.shape[0]-1)
                optim = prof.optimized()
                self.hmm_model_optimized.append(optim)

            #Clean up.
            self.hmm_model = None
        except Exception:
            #Quiet fail condition - fall back on default model.
            self.hmm_model_optimized = None

    #Load HMM and try to optimize.
    def load_hmm_from_file(self, hmm_path):
        """Read all profiles from an HMM file and attempt optimization."""
        hmm_set = pyhmmer.plan7.HMMFile(hmm_path)
        for hmm in hmm_set:
            self.hmm_model.append(hmm)

        #This doesn't seem to be improving performance currently.
        self.optimize_models()

    #Set archaeal and bacterial HMM sets.
    def assign_hmm_sets(self):
        """Define the Pfam accession -> marker name maps for both domains."""
        self.bacterial_SCPs = {'PF00709_21': 'Adenylsucc_synt', 'PF00406_22': 'ADK', 'PF01808_18': 'AICARFT_IMPCHas', 'PF00231_19': 'ATP-synt',
        'PF00119_20': 'ATP-synt_A', 'PF01264_21': 'Chorismate_synt', 'PF00889_19': 'EF_TS', 'PF01176_19': 'eIF-1a',
        'PF02601_15': 'Exonuc_VII_L', 'PF01025_19': 'GrpE', 'PF01725_16': 'Ham1p_like', 'PF01715_17': 'IPPT',
        'PF00213_18': 'OSCP', 'PF01195_19': 'Pept_tRNA_hydro', 'PF00162_19': 'PGK', 'PF02033_18': 'RBFA', 'PF02565_15': 'RecO_C',
        'PF00825_18': 'Ribonuclease_P', 'PF00687_21': 'Ribosomal_L1', 'PF00572_18': 'Ribosomal_L13',
        'PF00238_19': 'Ribosomal_L14', 'PF00252_18': 'Ribosomal_L16', 'PF01196_19': 'Ribosomal_L17',
        'PF00861_22': 'Ribosomal_L18p', 'PF01245_20': 'Ribosomal_L19', 'PF00453_18': 'Ribosomal_L20',
        'PF00829_21': 'Ribosomal_L21p', 'PF00237_19': 'Ribosomal_L22', 'PF00276_20': 'Ribosomal_L23',
        'PF17136_4': 'ribosomal_L24', 'PF00189_20': 'Ribosomal_S3_C', 'PF00281_19': 'Ribosomal_L5', 'PF00181_23': 'Ribosomal_L2',
        'PF01016_19': 'Ribosomal_L27', 'PF00828_19': 'Ribosomal_L27A', 'PF00830_19': 'Ribosomal_L28',
        'PF00831_23': 'Ribosomal_L29', 'PF00297_22': 'Ribosomal_L3', 'PF01783_23': 'Ribosomal_L32p',
        'PF01632_19': 'Ribosomal_L35p', 'PF00573_22': 'Ribosomal_L4', 'PF00347_23': 'Ribosomal_L6',
        'PF03948_14': 'Ribosomal_L9_C', 'PF00338_22': 'Ribosomal_S10', 'PF00411_19': 'Ribosomal_S11',
        'PF00416_22': 'Ribosomal_S13', 'PF00312_22': 'Ribosomal_S15', 'PF00886_19': 'Ribosomal_S16',
        'PF00366_20': 'Ribosomal_S17', 'PF00203_21': 'Ribosomal_S19', 'PF00318_20': 'Ribosomal_S2',
        'PF01649_18': 'Ribosomal_S20p', 'PF01250_17': 'Ribosomal_S6', 'PF00177_21': 'Ribosomal_S7',
        'PF00410_19': 'Ribosomal_S8', 'PF00380_19': 'Ribosomal_S9', 'PF00164_25': 'Ribosom_S12_S23',
        'PF01193_24': 'RNA_pol_L', 'PF01192_22': 'RNA_pol_Rpb6', 'PF01765_19': 'RRF', 'PF02410_15': 'RsfS',
        'PF03652_15': 'RuvX', 'PF00584_20': 'SecE', 'PF03840_14': 'SecG', 'PF00344_20': 'SecY', 'PF01668_18': 'SmpB',
        'PF00750_19': 'tRNA-synt_1d', 'PF01746_21': 'tRNA_m1G_MT', 'PF02367_17': 'TsaE', 'PF02130_17': 'UPF0054',
        'PF02699_15': 'YajC'}

        self.archaeal_SCPs = {'PF00709_21': 'Adenylsucc_synt', 'PF05221_17': 'AdoHcyase', 'PF01951_16': 'Archease', 'PF01813_17': 'ATP-synt_D',
        'PF01990_17': 'ATP-synt_F', 'PF01864_17': 'CarS-like', 'PF01982_16': 'CTP-dep_RFKase', 'PF01866_17': 'Diphthamide_syn',
        'PF04104_14': 'DNA_primase_lrg', 'PF01984_20': 'dsDNA_bind', 'PF04010_13': 'DUF357', 'PF04019_12': 'DUF359',
        'PF04919_12': 'DUF655', 'PF01912_18': 'eIF-6', 'PF05833_11': 'FbpA', 'PF01725_16': 'Ham1p_like',
        'PF00368_18': 'HMG-CoA_red', 'PF00334_19': 'NDK', 'PF02006_16': 'PPS_PS', 'PF02996_17': 'Prefoldin',
        'PF01981_16': 'PTH2', 'PF01948_18': 'PyrI', 'PF00687_21': 'Ribosomal_L1', 'PF00572_18': 'Ribosomal_L13',
        'PF00238_19': 'Ribosomal_L14', 'PF00827_17': 'Ribosomal_L15e', 'PF00252_18': 'Ribosomal_L16',
        'PF01157_18': 'Ribosomal_L21e', 'PF00237_19': 'Ribosomal_L22', 'PF00276_20': 'Ribosomal_L23',
        'PF16906_5': 'Ribosomal_L26', 'PF00831_23': 'Ribosomal_L29', 'PF00297_22': 'Ribosomal_L3',
        'PF01198_19': 'Ribosomal_L31e', 'PF01655_18': 'Ribosomal_L32e', 'PF01780_19': 'Ribosomal_L37ae',
        'PF00832_20': 'Ribosomal_L39', 'PF00573_22': 'Ribosomal_L4', 'PF00935_19': 'Ribosomal_L44', 'PF17144_4': 'Ribosomal_L5e',
        'PF00347_23': 'Ribosomal_L6', 'PF00411_19': 'Ribosomal_S11', 'PF00416_22': 'Ribosomal_S13',
        'PF00312_22': 'Ribosomal_S15', 'PF00366_20': 'Ribosomal_S17', 'PF00833_18': 'Ribosomal_S17e',
        'PF00203_21': 'Ribosomal_S19', 'PF01090_19': 'Ribosomal_S19e', 'PF00318_20': 'Ribosomal_S2',
        'PF01282_19': 'Ribosomal_S24e', 'PF01667_17': 'Ribosomal_S27e', 'PF01200_18': 'Ribosomal_S28e',
        'PF01015_18': 'Ribosomal_S3Ae', 'PF00177_21': 'Ribosomal_S7', 'PF00410_19': 'Ribosomal_S8',
        'PF01201_22': 'Ribosomal_S8e', 'PF00380_19': 'Ribosomal_S9', 'PF00164_25': 'Ribosom_S12_S23',
        'PF06026_14': 'Rib_5-P_isom_A', 'PF01351_18': 'RNase_HII', 'PF13656_6': 'RNA_pol_L_2',
        'PF01194_17': 'RNA_pol_N', 'PF03874_16': 'RNA_pol_Rpb4', 'PF01192_22': 'RNA_pol_Rpb6',
        'PF01139_17': 'RtcB', 'PF00344_20': 'SecY', 'PF06093_13': 'Spt4', 'PF00121_18': 'TIM', 'PF01994_16': 'Trm56',
        'PF00749_21': 'tRNA-synt_1c', 'PF00750_19': 'tRNA-synt_1d', 'PF13393_6': 'tRNA-synt_His',
        'PF01142_18': 'TruD', 'PF01992_16': 'vATP-synt_AC39', 'PF01991_18': 'vATP-synt_E', 'PF01496_19': 'V_ATPase_I'}

    #Convert passed sequences.
    def convert_protein_seqs_in_mem(self, contents):
        """Digitize an {id: AA string} dict into pyhmmer sequences for searching."""
        #Clean up.
        self.proteins_to_search = []

        for protein in contents:
            #Skip a protein if it's longer than 100k AA (HMMER's limit).
            if len(contents[protein]) >= 100000:
                continue
            as_bytes = protein.encode()
            #Pyhmmer digitization of sequences for searching.
            easel_seq = pyhmmer.easel.TextSequence(name = as_bytes, sequence = contents[protein])
            easel_seq = easel_seq.digitize(pyhmmer.easel.Alphabet.amino())
            self.proteins_to_search.append(easel_seq)

        easel_seq = None

    def load_protein_seqs_from_file(self, prots_file):
        """Load proteins from FASTA (possibly gzipped) and digitize them.

        Pyhmmer has a method for loading a fasta file, but we need to support
        gzipped inputs, so we do it manually.
        """
        contents, deflines = read_fasta(prots_file)
        self.protein_descriptions = deflines
        self.convert_protein_seqs_in_mem(contents)

    def execute_search(self):
        """Run hmmsearch and collect per-hit report lines, names, accessions, scores."""
        if self.hmm_model_optimized is None:
            top_hits = list(pyhmmer.hmmsearch(self.hmm_model, self.proteins_to_search, cpus=1, bit_cutoffs="trusted"))
        else:
            top_hits = list(pyhmmer.hmmsearch(self.hmm_model_optimized, self.proteins_to_search, cpus=1, bit_cutoffs="trusted"))

        self.printable_lines = []

        self.hmm_result_proteins = []
        self.hmm_result_accessions = []
        self.hmm_result_scores = []

        for model in top_hits:
            for hit in model:
                target_name = hit.name.decode()
                target_acc = hit.accession
                if target_acc is None:
                    target_acc = "-"
                else:
                    target_acc = target_acc.decode()

                query_name = hit.best_domain.alignment.hmm_name.decode()
                query_acc = hit.best_domain.alignment.hmm_accession.decode()

                full_seq_evalue = "%.2g" % hit.evalue
                full_seq_score = round(hit.score, 1)
                full_seq_bias = round(hit.bias, 1)

                best_dom_evalue = "%.2g" % hit.best_domain.alignment.domain.i_evalue
                best_dom_score = round(hit.best_domain.alignment.domain.score, 1)
                best_dom_bias = round(hit.best_domain.alignment.domain.bias, 1)

                #Not exposed through this pyhmmer API path; zeros keep the
                #hmmsearch tblout column layout intact.
                exp = 0
                reg = 0
                clu = 0
                ov = 0
                env = 0
                dom = len(hit.domains)
                rep = 0
                inc = 0

                #protein_descriptions may be None or missing this name.
                try:
                    description = self.protein_descriptions[target_name]
                except (KeyError, TypeError):
                    description = ""

                writeout = [target_name, target_acc, query_name, query_acc, full_seq_evalue, \
                full_seq_score, full_seq_bias, best_dom_evalue, best_dom_score, best_dom_bias, \
                exp, reg, clu, ov, env, dom, rep, inc, description]

                #Format and join.
                writeout = [str(i) for i in writeout]
                writeout = '\t'.join(writeout)

                self.printable_lines.append(writeout)

                self.hmm_result_proteins.append(target_name)
                self.hmm_result_accessions.append(query_acc)
                self.hmm_result_scores.append(best_dom_score)

    def filter_to_best_hits(self):
        """Keep the single highest-scoring protein per marker accession.

        Produces self.best_hits: {protein name: accession with '.' -> '_'}.
        """
        hmm_file = np.transpose(np.array([self.hmm_result_proteins, self.hmm_result_accessions, self.hmm_result_scores]))

        #Sort the hmm file based on the score column in descending order.
        hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]

        #Identify the first row where each gene name appears, after sorting by score;
        #in effect, return the highest scoring assignment per gene name.
        #Sort the indices to match the score-sorted table instead of alphabetical order of gene names.
        hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]

        #Filter again for unique ACCESSION names - one gene per accession.
        #Don't sort the indices, we don't care about the scores anymore.
        hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]

        #SQLite column names cannot contain '.'.
        sql_friendly_names = [i.replace(".", "_") for i in hmm_file[:,1]]

        self.best_hits = dict(zip(hmm_file[:,0], sql_friendly_names))

        hmm_file = None

    #Count per-domain occurrences.
    def assign_domain(self):
        """Vote Bacteria vs. Archaea from marker counts, then prune best_hits
        to markers belonging to the winning domain."""
        for prot in self.best_hits.values():
            if prot in self.bacterial_SCPs:
                self.domain_counts["Bacteria"] += 1
            if prot in self.archaeal_SCPs:
                self.domain_counts["Archaea"] += 1

        #Fraction of each domain's marker set observed in this genome.
        self.bacterial_fraction = self.domain_counts["Bacteria"] / self.voted_domain["Bacteria"]
        #Fixed: was written to the misspelled attribute `aechaeal_fraction`,
        #leaving the declared `archaeal_fraction` permanently None.
        self.archaeal_fraction = self.domain_counts["Archaea"] / self.voted_domain["Archaea"]

        #Ties go to Bacteria, as in the original comparison.
        if self.bacterial_fraction >= self.archaeal_fraction:
            self.voted_domain = "Bacteria"
        else:
            self.voted_domain = "Archaea"

        pop_keys = list(self.best_hits.keys())
        for key in pop_keys:
            if self.voted_domain == "Bacteria":
                if self.best_hits[key] not in self.bacterial_SCPs:
                    self.best_hits.pop(key)
            if self.voted_domain == "Archaea":
                if self.best_hits[key] not in self.archaeal_SCPs:
                    self.best_hits.pop(key)

    def to_hmm_file(self, output):
        """Write collected report lines to `output` (or output+'.gz' when compressing)."""
        content = '\n'.join(self.printable_lines) + '\n'

        if self.do_compress:
            #Remove any stale uncompressed copy.
            if os.path.exists(output):
                os.remove(output)

            content = content.encode()

            fh = gzip.open(output+".gz", "wb")
            fh.write(content)
            fh.close()
            content = None

        else:
            #Remove any stale compressed copy.
            if os.path.exists(output+".gz"):
                os.remove(output+".gz")

            fh = open(output, "w")
            fh.write(content)
            fh.close()

            content = None

    #If we're doing this step at all, we've either loaded the seqs into mem by reading the prot file
    #or have them in mem thanks to pyrodigal.
    def run_for_fastaai(self, prots, hmm_output):
        """FastAAI entry point: search `prots`, keep best hits, write the report.

        Fixed: both error messages previously referenced the undefined name
        `output`, which would raise a NameError instead of reporting.
        """
        try:
            self.convert_protein_seqs_in_mem(prots)
            self.execute_search()
            self.filter_to_best_hits()
            try:
                self.to_hmm_file(hmm_output)
            except Exception:
                print(hmm_output, "cannot be created. HMM search failed. This file will be skipped.")
        except Exception:
            print(hmm_output, "failed to run through HMMER!")
            self.best_hits = None
437
+
438
+
439
def hmm_preproc_initializer(hmm_file, do_compress = False):
    """Multiprocessing pool initializer.

    Gives each worker process its own module-global `hmm_manager`,
    preloaded with the HMM profiles from `hmm_file`.
    """
    global hmm_manager
    hmm_manager = pyhmmer_manager(do_compress)
    hmm_manager.load_hmm_from_file(hmm_file)
443
+
444
+ class pyrodigal_manager:
445
+ def __init__(self, file = None, aa_out = None, nt_out = None, is_meta = False, full_headers = True, trans_table = 11,
446
+ num_bp_fmt = True, verbose = True, do_compress = "0", compare_against = None):
447
+ #Input NT sequences
448
+ self.file = file
449
+
450
+ #List of seqs read from input file.
451
+ self.sequences = None
452
+ #Concatenation of up to first 32 million bp in self.sequences - prodigal caps at this point.
453
+ self.training_seq = None
454
+
455
+ #Predicted genes go here
456
+ self.predicted_genes = None
457
+ #Record the translation table used.
458
+ self.trans_table = trans_table
459
+
460
+ #This is the pyrodigal manager - this does the gene predicting.
461
+ self.manager = pd.OrfFinder(meta=is_meta)
462
+ self.is_meta = is_meta
463
+
464
+ #Full prodigal header information includes more than just a protein number.
465
+ #If full_headers is true, protein deflines will match prodigal; else, just protein ID.
466
+ self.full_headers = full_headers
467
+
468
+ #Prodigal prints info to console. I enhanced the info and made printing default, but also allow them to be totally turned off.
469
+ self.verbose = verbose
470
+
471
+ #Prodigal formats outputs with 70 bases per line max
472
+ self.num_bp_fmt = num_bp_fmt
473
+
474
+ #File names for outputs
475
+ self.aa_out = aa_out
476
+ self.nt_out = nt_out
477
+
478
+ #List of proteins in excess of 100K base pairs (HMMER's limit) and their lengths. This is also fastAAI specific.
479
+ self.excluded_seqs = {}
480
+
481
+ #Gzip outputs if asked.
482
+ self.compress = do_compress
483
+
484
+ self.labeled_proteins = None
485
+
486
+ #Normally, we don't need to keep an input sequence after it's had proteins predicted for it - however
487
+ #For FastAAI and MiGA's purposes, comparisons of two translation tables is necessary.
488
+ #Rather than re-importing sequences and reconstructing the training sequences,
489
+ #keep them for faster repredict with less I/O
490
+ self.compare_to = compare_against
491
+ if self.compare_to is not None:
492
+ self.keep_seqs = True
493
+ self.keep_after_train = True
494
+ else:
495
+ self.keep_seqs = False
496
+ self.keep_after_train = False
497
+
498
+ #Imports a fasta as binary.
499
+ def import_sequences(self):
500
+ if self.sequences is None:
501
+ self.sequences = {}
502
+
503
+ #check for zipped and import as needed.
504
+ with open(self.file, 'rb') as test_gz:
505
+ #Gzip magic number
506
+ is_gz = (test_gz.read(2) == b'\x1f\x8b')
507
+
508
+ if is_gz:
509
+ fh = gzip.open(self.file)
510
+ else:
511
+ fh = open(self.file, "rb")
512
+
513
+ imp = fh.readlines()
514
+
515
+ fh.close()
516
+
517
+ cur_seq = None
518
+ for s in imp:
519
+ s = s.decode().strip()
520
+ #> is 62 in ascii. This is asking if the first character is '>'
521
+ if s.startswith(">"):
522
+ #Skip first cycle, then do for each after
523
+ if cur_seq is not None:
524
+ self.sequences[cur_seq] = ''.join(self.sequences[cur_seq])
525
+ self.sequences[cur_seq] = self.sequences[cur_seq].encode()
526
+ #print(cur_seq, len(self.sequences[cur_seq]))
527
+ cur_seq = s[1:]
528
+ cur_seq = cur_seq.split()[0]
529
+ cur_seq = cur_seq.encode('utf-8')
530
+ self.sequences[cur_seq] = []
531
+ else:
532
+ #Remove the newline character.
533
+ #bases = s[:-1]
534
+ self.sequences[cur_seq].append(s)
535
+
536
+ #Final set
537
+ self.sequences[cur_seq] = ''.join(self.sequences[cur_seq])
538
+ self.sequences[cur_seq] = self.sequences[cur_seq].encode()
539
+
540
+ #Now we have the data, go to training.
541
+ if not self.is_meta:
542
+ self.train_manager()
543
+
544
+ #Collect up to the first 32 million bases for use in training seq.
545
+ def train_manager(self):
546
+ running_sum = 0
547
+ seqs_added = 0
548
+ if self.training_seq is None:
549
+ self.training_seq = []
550
+ for seq in self.sequences:
551
+ running_sum += len(self.sequences[seq])
552
+ if seqs_added > 0:
553
+ #Prodigal interleaving logic - add this breaker between sequences, starting at sequence 2
554
+ self.training_seq.append(b'TTAATTAATTAA')
555
+ running_sum += 12
556
+
557
+ seqs_added += 1
558
+
559
+ #Handle excessive size
560
+ if running_sum >= 32000000:
561
+ print("Warning: Sequence is long (max 32000000 for training).")
562
+ print("Training on the first 32000000 bases.")
563
+
564
+ to_remove = running_sum - 32000000
565
+
566
+ #Remove excess characters
567
+ cut_seq = self.sequences[seq][:-to_remove]
568
+ #Add the partial seq
569
+ self.training_seq.append(cut_seq)
570
+
571
+ #Stop the loop and move to training
572
+ break
573
+
574
+ #add in a full sequence
575
+ self.training_seq.append(self.sequences[seq])
576
+
577
+ if seqs_added > 1:
578
+ self.training_seq.append(b'TTAATTAATTAA')
579
+
580
+ self.training_seq = b''.join(self.training_seq)
581
+
582
+ if len(self.training_seq) < 20000:
583
+ if self.verbose:
584
+ print("Can't train on 20 thousand or fewer characters. Switching to meta mode.")
585
+ self.manager = pd.OrfFinder(meta=True)
586
+ self.is_meta = True
587
+ else:
588
+ if self.verbose:
589
+ print("")
590
+ #G is 71, C is 67; we're counting G + C and dividing by the total.
591
+ gc = round(((self.training_seq.count(67) + self.training_seq.count(71))/ len(self.training_seq)) * 100, 2)
592
+ print(len(self.training_seq), "bp seq created,", gc, "pct GC")
593
+
594
+ #Train
595
+ self.manager.train(self.training_seq, translation_table = self.trans_table)
596
+
597
+ if not self.keep_after_train:
598
+ #Clean up
599
+ self.training_seq = None
600
+
601
+ def predict_genes(self):
602
+ if self.is_meta:
603
+ if self.verbose:
604
+ print("Finding genes in metagenomic mode")
605
+ else:
606
+ if self.verbose:
607
+ print("Finding genes with translation table", self.trans_table)
608
+ print("")
609
+
610
+ self.predicted_genes = {}
611
+ for seq in self.sequences:
612
+
613
+ if self.verbose:
614
+ print("Finding genes in sequence", seq.decode(), "("+str(len(self.sequences[seq]))+ " bp)... ", end = '')
615
+
616
+ self.predicted_genes[seq] = self.manager.find_genes(self.sequences[seq])
617
+
618
+ #If we're comparing multiple tables, then we want to keep these for re-prediction.
619
+ if not self.keep_seqs:
620
+ #Clean up
621
+ self.sequences[seq] = None
622
+
623
+ if self.verbose:
624
+ print("done!")
625
+
626
+ #Predict genes with an alternative table, compare results, and keep the winner.
627
+ def compare_alternative_table(self, table):
628
+ if table == self.trans_table:
629
+ print("You're trying to compare table", table, "with itself.")
630
+ else:
631
+ if self.verbose:
632
+ print("Comparing translation table", self.trans_table, "against table", table)
633
+ old_table = self.trans_table
634
+ old_genes = self.predicted_genes
635
+ old_size = 0
636
+ for seq in self.predicted_genes:
637
+ for gene in self.predicted_genes[seq]:
638
+ old_size += (gene.end - gene.begin)
639
+
640
+ self.trans_table = table
641
+ self.train_manager()
642
+ self.predict_genes()
643
+
644
+ new_size = 0
645
+ for seq in self.predicted_genes:
646
+ for gene in self.predicted_genes[seq]:
647
+ new_size += (gene.end - gene.begin)
648
+
649
+ if (old_size / new_size) > 1.1:
650
+ if self.verbose:
651
+ print("Translation table", self.trans_table, "performed better than table", old_table, "and will be used instead.")
652
+ else:
653
+ if self.verbose:
654
+ print("Translation table", self.trans_table, "did not perform significantly better than table", old_table, "and will not be used.")
655
+ self.trans_table = old_table
656
+ self.predicted_genes = old_genes
657
+
658
+ #cleanup
659
+ old_table = None
660
+ old_genes = None
661
+ old_size = None
662
+ new_size = None
663
+
664
+ def predict_and_compare(self):
665
+ self.predict_genes()
666
+
667
+ #Run alt comparisons in gene predict.
668
+ if self.compare_to is not None:
669
+ while len(self.compare_to) > 0:
670
+ try:
671
+ next_table = int(self.compare_to.pop(0))
672
+
673
+ if len(self.compare_to) == 0:
674
+ #Ready to clean up.
675
+ self.keep_after_train = True
676
+ self.keep_seqs = True
677
+
678
+ self.compare_alternative_table(next_table)
679
+ except:
680
+ print("Alternative table comparison failed! Skipping.")
681
+
682
+ #Break lines into size base pairs per line. Prodigal's default for bp is 70, aa is 60.
683
+ def num_bp_line_format(self, string, size = 70):
684
+ #ceiling funciton without the math module
685
+ ceiling = int(round((len(string)/size)+0.5, 0))
686
+ formatted = '\n'.join([string[(i*size):(i+1)*size] for i in range(0, ceiling)])
687
+ return formatted
688
+
689
+ #Writeouts
690
+ def write_nt(self):
691
+ if self.nt_out is not None:
692
+ if self.verbose:
693
+ print("Writing nucleotide sequences... ")
694
+ if self.compress == '1' or self.compress == '2':
695
+ out_writer = gzip.open(self.nt_out+".gz", "wb")
696
+
697
+ content = b''
698
+
699
+ for seq in self.predicted_genes:
700
+ seqname = b">"+ seq + b"_"
701
+ #Gene counter
702
+ count = 1
703
+ for gene in self.predicted_genes[seq]:
704
+ #Full header lines
705
+ if self.full_headers:
706
+ content += b' # '.join([seqname + str(count).encode(), str(gene.begin).encode(), str(gene.end).encode(), str(gene.strand).encode(), gene._gene_data.encode()])
707
+ else:
708
+ #Reduced headers if we don't care.
709
+ content += seqname + str(count).encode()
710
+
711
+ content += b'\n'
712
+
713
+ if self.num_bp_fmt:
714
+ #60 bp cap per line
715
+ content += self.num_bp_line_format(gene.sequence(), size = 70).encode()
716
+ else:
717
+ #One-line sequence.
718
+ content += gene.sequence().encode()
719
+
720
+ content += b'\n'
721
+ count += 1
722
+
723
+ out_writer.write(content)
724
+ out_writer.close()
725
+
726
+ if self.compress == '0' or self.compress == '2':
727
+ out_writer = open(self.nt_out, "w")
728
+
729
+ for seq in self.predicted_genes:
730
+ #Only do this decode once.
731
+ seqname = ">"+ seq.decode() +"_"
732
+ #Gene counter
733
+ count = 1
734
+
735
+ for gene in self.predicted_genes[seq]:
736
+ #Full header lines
737
+ if self.full_headers:
738
+ #Standard prodigal header
739
+ print(seqname + str(count), gene.begin, gene.end, gene.strand, gene._gene_data, sep = " # ", file = out_writer)
740
+ else:
741
+ #Reduced headers if we don't care.
742
+ print(seqname + str(count), file = out_writer)
743
+
744
+ if self.num_bp_fmt:
745
+ #60 bp cap per line
746
+ print(self.num_bp_line_format(gene.sequence(), size = 70), file = out_writer)
747
+ else:
748
+ #One-line sequence.
749
+ print(gene.sequence(), file = out_writer)
750
+
751
+ count += 1
752
+
753
+ out_writer.close()
754
+
755
def write_aa(self):
    """Write predicted amino-acid sequences to self.aa_out in FASTA format.

    Also fills self.labeled_proteins with {protein_name: translation} so
    downstream kmer extraction can reuse the sequences without re-reading
    the file. Output is written plain, gzipped, or both depending on
    self.compress ('0' = plain, '1' = gzip, '2' = both). No-op when
    self.aa_out is None.
    """
    if self.aa_out is None:
        return

    if self.verbose:
        print("Writing amino acid sequences...")

    self.labeled_proteins = {}
    # Accumulate pieces in a list and join once at the end; repeated
    # string += over many genes is quadratic.
    parts = []
    for seq in self.predicted_genes:
        count = 1
        # Sequence IDs are bytes; decode once per contig.
        seqname = ">" + seq.decode() + "_"
        for gene in self.predicted_genes[seq]:
            prot_name = seqname + str(count)
            translation = gene.translate()
            # Key without the leading '>' so it matches FASTA parse output.
            self.labeled_proteins[prot_name[1:]] = translation
            # Standard prodigal-style defline.
            defline = " # ".join([prot_name, str(gene.begin), str(gene.end), str(gene.strand), str(gene._gene_data)])
            parts.append(defline)
            parts.append("\n")
            count += 1
            # Wrap the translation at 60 characters per line.
            parts.append(self.num_bp_line_format(translation, size = 60))
            parts.append("\n")

    content = "".join(parts)

    if self.compress in ('0', '2'):
        with open(self.aa_out, "w") as out_writer:
            out_writer.write(content)

    if self.compress in ('1', '2'):
        with gzip.open(self.aa_out + ".gz", "wb") as out_writer:
            out_writer.write(content.encode())
787
def run_for_fastaai(self):
    """Run the minimal pyrodigal pipeline FastAAI needs, quietly.

    Forces verbose off, then runs the standard sequence: import input
    sequences, train the gene-prediction manager, predict genes (comparing
    translation tables), and write the amino-acid FASTA output.
    """
    self.verbose = False
    self.import_sequences()
    self.train_manager()
    self.predict_and_compare()
    self.write_aa()
793
+
794
#Iterator for agnostic reader
class agnostic_reader_iterator:
    """Line iterator over an agnostic_reader's open handle.

    Decodes each line to str when the underlying handle is a binary gzip
    stream; raises StopIteration at end of file.
    """

    def __init__(self, reader):
        self.handle_ = reader.handle
        self.is_gz_ = reader.is_gz

    def __next__(self):
        raw = self.handle_.readline()
        # gzip handles yield bytes; plain text handles already yield str.
        line = raw.decode() if self.is_gz_ else raw

        # An empty string signals EOF for readline().
        if not line:
            raise StopIteration
        return line
812
#File reader that doesn't care if you give it a gzipped file or not.
class agnostic_reader:
    """Open a file for line iteration whether or not it is gzip-compressed.

    Detection is by the two-byte gzip magic number, not the extension, so
    misnamed files are handled correctly.
    """

    def __init__(self, file):
        self.path = file

        # Sniff the first two bytes for the gzip magic number.
        with open(file, 'rb') as probe:
            self.is_gz = probe.read(2) == b'\x1f\x8b'

        self.handle = gzip.open(self.path) if self.is_gz else open(self.path)

    def __iter__(self):
        return agnostic_reader_iterator(self)

    def close(self):
        self.handle.close()
833
+
834
+ '''
835
+ Class for handling all of the raw genome/protein/protein+HMM file inputs when building a database.
836
+
837
+ Takes a file or files and processes them from genome -> protein, protein -> hmm, prot+HMM -> kmerized protein best hits as numpy int arrays according to the kmer_index
838
+
839
+ '''
840
+
841
class input_file:
    """One raw genome / protein / protein+HMM input being preprocessed.

    Tracks the file through each derived stage (genome -> predicted
    proteins -> HMM best hits -> kmerized best hits as numpy int32 arrays)
    along with timing data and an error log for the preprocessing report.
    """
    def __init__(self, input_path, output = "", verbosity = False, do_compress = False,
                 make_crystal = False):
        #starting path for the file; irrelevant for protein and hmm, but otherwise useful for keeping track.
        self.path = input_path
        #Output directory starts with this
        self.output = os.path.normpath(output + "/")
        #For printing file updates, this is the input name
        self.name = os.path.basename(input_path)
        #original name is the key used for the genomes index later on.
        self.original_name = os.path.basename(input_path)
        #This is the name that can be used for building files with new extensions.
        if input_path.endswith(".gz"):
            #Remove .gz first to make names consistent.
            self.basename = os.path.splitext(os.path.basename(input_path[:-3]))[0]
        else:
            self.basename = os.path.splitext(os.path.basename(input_path))[0]

        #Sanitize for SQL: the basename is later used as (part of) SQLite table names.
        #These are chars safe for sql
        sql_safe = set('_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
        current_chars = set(self.basename)
        #Identify SQL-unsafe characters as those outside the permissible set and replace all with underscores.
        for char in current_chars - sql_safe:
            self.basename = self.basename.replace(char, "_")

        #'genome' or 'protein' or 'protein and hmm' (see the set_* methods below).
        self.status = None
        #These will keep track of paths for each stage of file for us.
        self.genome = None
        self.protein = None
        self.hmm = None

        # True once the in-process HMM manager has been run for this input.
        self.ran_hmmer = False

        #If pyrodigal is run, then the protein sequences are already loaded into memory.
        #We reuse them in kmer extraction instead of another I/O
        self.prepared_proteins = None

        self.intermediate = None

        # When True, prot_and_hmm_to_besthits also writes a "crystal" text dump.
        self.crystalize = make_crystal
        # {protein_name: accession} after best-hit filtering.
        self.best_hits = None
        # {accession: np.int32 array of unique kmer keys}.
        self.best_hits_kmers = None

        self.protein_count = 0
        self.protein_kmer_count = {}

        self.trans_table = None
        self.start_time = None
        self.end_time = None
        self.err_log = ""
        #doesn't get updated otherwise; preprocess() downgrades this to the
        #actual starting stage ("genome" or "protein") when one applies.
        self.initial_state = "protein+HMM"

        self.verbose = verbosity

        #Check if the file failed to produce ANY SCP HMM hits.
        self.is_empty = False

        self.do_compress = do_compress

        self.crystal = None

        self.init_time = None
        #default to 0 time.
        self.prot_pred_time = None
        self.hmm_search_time = None
        self.besthits_time = None

    def curtime(self):
        """Return the current wall-clock time as a 'dd/mm/YYYY HH:MM:SS' string."""
        time_format = "%d/%m/%Y %H:%M:%S"
        timer = datetime.datetime.now()
        time = timer.strftime(time_format)
        return time

    def partial_timings(self):
        """Convert the recorded stage timestamps into per-stage durations (seconds).

        Overwrites prot_pred_time / hmm_search_time / besthits_time in place:
        after this call they hold float second counts, not datetimes.
        """
        protein_pred = self.prot_pred_time-self.init_time
        hmm_search = self.hmm_search_time-self.prot_pred_time
        besthits = self.besthits_time-self.hmm_search_time

        protein_pred = protein_pred.total_seconds()
        hmm_search = hmm_search.total_seconds()
        besthits = besthits.total_seconds()

        self.prot_pred_time = protein_pred
        self.hmm_search_time = hmm_search
        self.besthits_time = besthits

    #Functions for externally setting status and file paths of particular types
    def set_genome(self, path):
        self.status = 'genome'
        self.genome = path

    def set_protein(self, path):
        self.status = 'protein'
        self.protein = path

    def set_hmm(self, path):
        if self.protein is None:
            print("Warning! I don't have a protein yet, so this HMM will be useless to me until I do!")
        self.status = 'protein and hmm'
        self.hmm = path

    def set_crystal(self, path):
        self.status = 'crystal'
        self.crystal = path

    #Runs prodigal, compares translation tables and stores faa files
    def genome_to_protein(self):
        """Predict proteins from the genome with pyrodigal_manager.

        Stores the predicted proteins in memory (self.prepared_proteins),
        records the chosen translation table, logs any excluded oversized
        proteins, and advances status to 'protein'.
        """
        if self.genome is None:
            print(self.name, "wasn't a declared as a genome! I can't make this into a protein!")
        else:
            protein_output = os.path.normpath(self.output + "/predicted_proteins/" + self.basename + '.faa')

            #pyrodigal_manager takes compression as a string flag: "1" = gzip, "0" = plain.
            if self.do_compress:
                compress_level = "1"
            else:
                compress_level = "0"

            mn = pyrodigal_manager(file = self.genome, aa_out = protein_output, compare_against = [4], do_compress = compress_level)
            mn.run_for_fastaai()

            self.trans_table = str(mn.trans_table)

            for prot in mn.excluded_seqs:
                self.err_log += "Protein " + prot + " was observed to have >100K amino acids ( " + str(mn.excluded_seqs[prot]) + " AA found ). It will not be included in predicted proteins for this genome;"

            #Reuse in-memory proteins later instead of re-reading the faa file.
            self.prepared_proteins = mn.labeled_proteins

            del mn

            #If there are zipped files leftover and we didn't want them, clean them up.
            if self.do_compress:
                self.set_protein(str(protein_output)+".gz")
                #Clean up unzipped version on reruns
                if os.path.exists(str(protein_output)):
                    os.remove(str(protein_output))
            else:
                self.set_protein(str(protein_output))
                #Clean up a zipped version on reruns
                if os.path.exists(str(protein_output)+".gz"):
                    os.remove(str(protein_output)+".gz")

        self.prot_pred_time = datetime.datetime.now()

    #run hmmsearch on a protein
    def protein_to_hmm(self):
        """Search this input's proteins against the SCP HMM models.

        Uses the module-global hmm_manager (set up by a pool initializer
        elsewhere in this file). Advances status to 'protein and hmm'.
        """
        if self.protein is None:
            print(self.basename, "wasn't a declared as a protein! I can't make this into an HMM!")
        else:

            folder = os.path.normpath(self.output + "/hmms")

            hmm_output = os.path.normpath(folder +"/"+ self.basename + '.hmm')

            if self.prepared_proteins is None:
                self.prepared_proteins, deflines = read_fasta(self.protein)

            #hmm_manager is a global provided by the worker initializer.
            hmm_manager.run_for_fastaai(self.prepared_proteins, hmm_output)

            self.ran_hmmer = True

            if self.do_compress:
                self.set_hmm(str(hmm_output)+".gz")
                if os.path.exists(str(hmm_output)):
                    os.remove(str(hmm_output))
            else:
                self.set_hmm(str(hmm_output))
                if os.path.exists(str(hmm_output)+".gz"):
                    os.remove(str(hmm_output)+".gz")

        self.hmm_search_time = datetime.datetime.now()

    #Translate tetramers to unique int32 indices.
    def unique_kmer_simple_key(self, seq):
        """Return the sorted unique int32 keys for all amino-acid tetramers in seq.

        Each tetramer's four characters (as 2-digit ASCII codes) are
        concatenated numerically into one int32, so equal tetramers map to
        equal keys without building strings.
        """
        #num tetramers = len(seq) - 4 + 1, just make it -3.
        n_kmers = len(seq) - 3

        #Converts the characters in a sequence into their ascii int value
        as_ints = np.array([ord(i) for i in seq], dtype = np.int32)

        #create seq like 0,1,2,3; 1,2,3,4; 2,3,4,5... for each tetramer that needs a value
        kmers = np.arange(4*n_kmers)
        kmers = kmers % 4 + kmers // 4

        #Select the characters (as ints) corresponding to each tetramer all at once and reshape into rows of 4,
        #each row corresp. to a successive tetramer
        kmers = as_ints[kmers].reshape((n_kmers, 4))

        #Given four 2-digit numbers, these multipliers work as offsets so that all digits are preserved in order when summed
        mult = np.array([1000000, 10000, 100, 1], dtype = np.int32)

        #the fixed values effectively offset the successive chars of the tetramer by 2 positions each time;
        #practically, this is concatenation of numbers
        #Matrix mult does this for all values at once.
        return np.unique(np.dot(kmers, mult))

    def load_hmm_and_filter_from_file(self):
        """Parse a hmmsearch tblout-style file and keep the best hit per gene.

        Populates self.best_hits as {protein_name: sql_safe_accession},
        keeping only the highest-scoring accession per gene and one gene
        per accession.
        """
        prots = []
        accs = []
        scores = []
        f = agnostic_reader(self.hmm)
        for line in f:
            if line.startswith("#"):
                continue
            else:
                segs = line.strip().split()

                #Malformed/short rows are skipped.
                if len(segs) < 9:
                    continue

                prots.append(segs[0])
                accs.append(segs[3])
                scores.append(segs[8])

        f.close()

        #NOTE(review): there is no early return here, so with zero hits the
        #numpy pipeline below still runs on an empty table — confirm intended.
        if len(prots) < 1:
            self.best_hits = {}

        hmm_file = np.transpose(np.array([prots, accs, scores]))

        #Sort the hmm file based on the score column in descending order.
        hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]

        #Identify the first row where each gene name appears, after sorting by score;
        #in effect, return the highest scoring assignment per gene name
        #Sort the indices of the result to match the score-sorted table instead of alphabetical order of gene names
        hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]

        #Filter the file again for the unique ACCESSION names, since we're only allowed one gene per accession, I guess?
        #Don't sort the indices, we don't care about the scores anymore.
        hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]

        #Dots are not valid in SQLite identifiers; swap for underscores.
        sql_friendly_names = [i.replace(".", "_") for i in hmm_file[:,1]]
        self.best_hits = dict(zip(hmm_file[:,0], sql_friendly_names))

    #This should consider the domain by majority vote...
    def prot_and_hmm_to_besthits(self):
        """Kmerize the best-hit proteins and record per-accession kmer sets.

        Fills self.best_hits_kmers and self.protein_kmer_count, optionally
        writes a "crystal" dump of the hit proteins, frees the in-memory
        protein sequences, and flags self.is_empty when no SCP hit at all
        was found.
        """
        if self.ran_hmmer:
            #Manager has a filter built in.
            self.best_hits = hmm_manager.best_hits
        else:
            #Load the best hits file via old numpy method.
            self.load_hmm_and_filter_from_file()

        hit_count = 0

        #from pyrodigal predictions or HMM intermediate production, the sequences are already in mem and don't need read in.
        if self.prepared_proteins is None:
            #But otherwise, we need to read them in.
            self.prepared_proteins, deflines = read_fasta(self.protein)

        self.protein_kmer_count = {}
        self.best_hits_kmers = {}

        if self.crystalize:
            crystal_record = []

        #Kmerize proteins and record metadata
        for protein in self.prepared_proteins:
            if protein in self.best_hits:
                accession = self.best_hits[protein]

                if self.crystalize:
                    crystal_record.append(str(protein)+"\t"+str(accession)+"\t"+str(self.prepared_proteins[protein])+"\n")

                kmer_set = self.unique_kmer_simple_key(self.prepared_proteins[protein])
                self.protein_kmer_count[accession] = kmer_set.shape[0]
                self.protein_count += 1
                self.best_hits_kmers[accession] = kmer_set
                hit_count += 1

            #Free the space either way
            self.prepared_proteins[protein] = None

        if self.crystalize:
            #only make a crystal if it actually has content.
            if len(crystal_record) > 0:
                crystal_path = os.path.normpath(self.output + "/crystals/" + self.basename + '_faai_crystal.txt')
                crystal_record = "".join(crystal_record)

                if self.do_compress:
                    crystal_record = crystal_record.encode()
                    crystal_writer = gzip.open(crystal_path+".gz", "wb")
                    crystal_writer.write(crystal_record)
                    crystal_writer.close()
                else:
                    crystal_writer = open(crystal_path, "w")
                    crystal_writer.write(crystal_record)
                    crystal_writer.close()

        #Final free.
        self.prepared_proteins = None

        #No HMM hits.
        if hit_count == 0:
            self.is_empty = True

        self.besthits_time = datetime.datetime.now()
        self.status = "best hits found"

    def preprocess(self):
        """Advance this input from its current stage to kmerized best hits.

        A genome falls through genome -> protein -> hmm -> best hits;
        a protein starts at the second step, and protein+HMM at the last.
        Records start/end timestamps and converts stage timings to seconds.
        """
        self.init_time = datetime.datetime.now()
        #default to 0 time.
        self.prot_pred_time = self.init_time
        self.hmm_search_time = self.init_time
        self.besthits_time = self.init_time

        #There's no advancement stage for protein and HMM
        if self.status == 'genome':
            start_time = self.curtime()
            if self.start_time is None:
                self.start_time = start_time

            if self.initial_state == "protein+HMM":
                self.initial_state = "genome"

            self.genome_to_protein()

        #Note: genome_to_protein sets status to 'protein', so a genome
        #falls through into this branch on the same call.
        if self.status == 'protein':
            start_time = self.curtime()
            if self.start_time is None:
                self.start_time = start_time

            if self.initial_state == "protein+HMM":
                self.initial_state = "protein"

            self.protein_to_hmm()

        if self.status == 'protein and hmm':
            start_time = self.curtime()

            if self.start_time is None:
                self.start_time = start_time

            self.prot_and_hmm_to_besthits()

        #Add an end time if either genome -> protein -> HMM or protein -> HMM happened.
        if self.start_time is not None:
            end_time = self.curtime()
            self.end_time = end_time
        else:
            #Start was protein+HMM. There was no runtime, and initial state is p+hmm
            self.start_time = "N/A"
            self.end_time = "N/A"

        #Protein not generated on this run.
        if self.trans_table is None:
            self.trans_table = "unknown"

        self.partial_timings()
1199
+
1200
+ '''
1201
+ Utility functions
1202
+ '''
1203
def prepare_directories(output, status, build_or_query, make_crystals = False):
    """Create the output directory tree FastAAI needs for a run.

    Parameters:
        output         -- root output directory; created if missing.
        status         -- input stage: 'genome' needs predicted_proteins/ and
                          hmms/; 'protein' needs hmms/.
        build_or_query -- "build" adds database/; "query" adds results/.
        make_crystals  -- when True, also create crystals/.

    Returns True when every required directory exists or was created,
    False when the root output directory could not be made.
    """
    def _ensure_subdir(name):
        #Create output/<name> if it doesn't exist; os.mkdir raises OSError on failure.
        target = os.path.normpath(output + "/" + name)
        if not os.path.exists(target):
            os.mkdir(target)

    preparation_successful = True

    if not os.path.exists(output):
        try:
            os.mkdir(output)
        except OSError:
            #Narrowed from a bare except: only filesystem errors are expected here.
            print("")
            print("FastAAI tried to make output directory: '"+ output + "' but failed.")
            print("")
            print("Troubleshooting:")
            print("")
            print(" (1) Do you have permission to create directories in the location you specified?")
            print(" (2) Did you make sure that all directories other than", os.path.basename(output), "already exist?")
            print("")
            preparation_successful = False

    if preparation_successful:
        try:
            if status == 'genome':
                _ensure_subdir("predicted_proteins")
                _ensure_subdir("hmms")

            if status == 'protein':
                _ensure_subdir("hmms")

            if make_crystals:
                _ensure_subdir("crystals")

            if build_or_query == "build":
                _ensure_subdir("database")

            if build_or_query == "query":
                _ensure_subdir("results")

        except OSError:
            #Subdirectory creation failed even though the root exists.
            print("FastAAI was able to create or find", output, "but couldn't make directories there.")
            print("")
            print("This shouldn't happen. Do you have permission to write to that directory?")

    return preparation_successful
1252
+
1253
def find_hmm():
    """Locate the bundled Complete_SCG_DB.hmm model file.

    First tries the fastAAI_HMM_models package location (pip/conda install),
    verifying the file actually opens; on any failure, falls back to the
    directory containing this script (old/MiGA-friendly layout). The
    fallback path is returned without checking that it exists.
    """
    hmm_path = None
    try:
        #Try to locate the data bundled as it would be with a pip/conda install.
        script_path = os.path.dirname(sys.modules['fastAAI_HMM_models'].__file__)
        if len(script_path) == 0:
            script_path = "."
        hmm_complete_model = os.path.abspath(os.path.normpath(script_path + '/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm'))
        hmm_path = str(hmm_complete_model)
        #Check that the file exists or fail to the except.
        fh = open(hmm_path)
        fh.close()
    except Exception:
        #Narrowed from a bare except so KeyboardInterrupt/SystemExit still propagate.
        #Look in the same dir as the script; old method/MiGA friendly
        script_path = os.path.dirname(__file__)
        if len(script_path) == 0:
            script_path = "."
        hmm_complete_model = os.path.abspath(os.path.normpath(script_path +"/"+ "00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm"))
        hmm_path = str(hmm_complete_model)

    return hmm_path
1274
+
1275
+ #Build DB from genomes
1276
+
1277
def unique_kmers(seq, ksize):
    """Return sorted, deduplicated kmer-index values for every ksize-mer in seq.

    Looks each substring up in the module-global kmer_index mapping.
    """
    window_count = len(seq) - ksize + 1
    found = [kmer_index[seq[pos:pos + ksize]] for pos in range(window_count)]
    #We care about the type because we're working with bytes later.
    return np.unique(found).astype(np.int32)
1284
+
1285
def split_seq(seq, num_grps):
    """Partition seq into num_grps contiguous, near-equal slices (in order).

    Boundaries are computed with float rounding, so group sizes differ by
    at most one element.
    """
    splitsize = 1.0/num_grps*len(seq)
    bounds = [int(round(g*splitsize)) for g in range(num_grps + 1)]
    return [seq[bounds[g]:bounds[g+1]] for g in range(num_grps)]
1291
+
1292
#gives the (start, end) index pairs needed to split a list of max_val genomes into num_grps groups
def split_indicies(max_val, num_grps):
    """Return num_grps (start, end) pairs that partition range(max_val).

    Uses the same float-round boundaries as split_seq, so the pairs tile
    [0, max_val) exactly with near-equal group sizes.
    """
    splitsize = 1.0/num_grps*max_val
    cuts = [round(g*splitsize) for g in range(num_grps + 1)]
    return [(cuts[g], cuts[g+1]) for g in range(num_grps)]
1299
+
1300
def split_seq_indices(seq, num_grps):
    """Like split_seq, but return the (start, end) slice bounds instead of slices."""
    splitsize = 1.0/num_grps*len(seq)
    marks = [int(round(g*splitsize)) for g in range(num_grps + 1)]
    return [(marks[g], marks[g+1],) for g in range(num_grps)]
1306
+
1307
+
1308
def list_to_index_dict(list):
    """Map each element of *list* to its ordinal position (first item -> 0).

    Duplicate elements keep their last position, matching the original
    counter-based loop.
    """
    #NOTE(review): parameter name shadows the builtin `list`; kept for interface stability.
    return {item: position for position, item in enumerate(list)}
1315
+
1316
+
1317
def rev_list_to_index_dict(list):
    """Map ordinal position -> element; the inverse orientation of list_to_index_dict."""
    #NOTE(review): parameter name shadows the builtin `list`; kept for interface stability.
    return dict(enumerate(list))
1324
+
1325
def generate_accessions_index(forward = True):
    """Fixed lookup table for the 122 SCP Pfam accessions FastAAI uses.

    forward=True  -> {accession_name: integer index}
    forward=False -> {integer index: accession_name}

    The ordering of acc_list is load-bearing: indices are stored in the
    database, so the list must never be reordered.
    """
    acc_list = ['PF01780_19', 'PF03948_14', 'PF17144_4', 'PF00830_19', 'PF00347_23', 'PF16906_5', 'PF13393_6',
    'PF02565_15', 'PF01991_18', 'PF01984_20', 'PF00861_22', 'PF13656_6', 'PF00368_18', 'PF01142_18', 'PF00312_22', 'PF02367_17',
    'PF01951_16', 'PF00749_21', 'PF01655_18', 'PF00318_20', 'PF01813_17', 'PF01649_18', 'PF01025_19', 'PF00380_19', 'PF01282_19',
    'PF01864_17', 'PF01783_23', 'PF01808_18', 'PF01982_16', 'PF01715_17', 'PF00213_18', 'PF00119_20', 'PF00573_22', 'PF01981_16',
    'PF00281_19', 'PF00584_20', 'PF00825_18', 'PF00406_22', 'PF00177_21', 'PF01192_22', 'PF05833_11', 'PF02699_15', 'PF01016_19',
    'PF01765_19', 'PF00453_18', 'PF01193_24', 'PF05221_17', 'PF00231_19', 'PF00416_22', 'PF02033_18', 'PF01668_18', 'PF00886_19',
    'PF00252_18', 'PF00572_18', 'PF00366_20', 'PF04104_14', 'PF04919_12', 'PF01912_18', 'PF00276_20', 'PF00203_21', 'PF00889_19',
    'PF02996_17', 'PF00121_18', 'PF01990_17', 'PF00344_20', 'PF00297_22', 'PF01196_19', 'PF01194_17', 'PF01725_16', 'PF00750_19',
    'PF00338_22', 'PF00238_19', 'PF01200_18', 'PF00162_19', 'PF00181_23', 'PF01866_17', 'PF00709_21', 'PF02006_16', 'PF00164_25',
    'PF00237_19', 'PF01139_17', 'PF01351_18', 'PF04010_13', 'PF06093_13', 'PF00828_19', 'PF02410_15', 'PF01176_19', 'PF02130_17',
    'PF01948_18', 'PF01195_19', 'PF01746_21', 'PF01667_17', 'PF03874_16', 'PF01090_19', 'PF01198_19', 'PF01250_17', 'PF17136_4',
    'PF06026_14', 'PF03652_15', 'PF04019_12', 'PF01201_22', 'PF00832_20', 'PF01264_21', 'PF03840_14', 'PF00831_23', 'PF00189_20',
    'PF02601_15', 'PF01496_19', 'PF00411_19', 'PF00334_19', 'PF00687_21', 'PF01157_18', 'PF01245_20', 'PF01994_16', 'PF01632_19',
    'PF00827_17', 'PF01015_18', 'PF00829_21', 'PF00410_19', 'PF00833_18', 'PF00935_19', 'PF01992_16']
    if forward:
        return {acc: idx for idx, acc in enumerate(acc_list)}
    return {idx: acc for idx, acc in enumerate(acc_list)}
1346
+
1347
+ #Build or add to a FastAAI DB
1348
def build_db_opts():
    """Define and parse the command-line options for the database build module.

    Returns (parser, args); unknown arguments are tolerated via
    parse_known_args so this can run under a multi-command dispatcher.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
        description='''
    This FastAAI module allows you to create a FastAAI database from one or many genomes, proteins, or proteins and HMMs, or add these files to an existing one.

    Supply genomes OR proteins OR proteins AND HMMs as inputs.

    If you supply genomes, FastAAI will predict proteins from them, and HMMs will be created from those proteins
    If you supply only proteins, FastAAI will create HMM files from them, searching against FastAAI's internal database
    If you supply proteins AND HMMs, FastAAI will directly use them to build the database.\n
    You cannot supply both genomes and proteins
    ''')

    #Input sources: exactly one of genomes / proteins (+ optional hmms) is expected.
    parser.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'A directory containing genomes in FASTA format.')
    parser.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'A directory containing protein amino acids in FASTA format.')
    parser.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'A directory containing the results of an HMM search on a set of proteins.')
    parser.add_argument('-d', '--database', dest = 'db_name', default = "FastAAI_database.sqlite.db", help = 'The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory. FastAAI_database.sqlite.db by default.')

    parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

    parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
    parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
    parser.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')

    #parse_known_args: leftover argv entries are ignored rather than fatal.
    args, unknown = parser.parse_known_args()

    return parser, args
1375
+
1376
def run_build(input_file):
    """Worker for database construction: preprocess one input and flag failures.

    An input that yields zero SCP hits gets best_hits_kmers set to None and
    a note appended to its error log so the parent process can skip it.
    Returns the same input_file object.
    """
    input_file.preprocess()

    found_nothing = len(input_file.best_hits_kmers) < 1
    if found_nothing:
        input_file.err_log += " This file did not successfully complete. No SCPs could be found."
        input_file.best_hits_kmers = None

    return input_file
1383
+
1384
def acc_transformer_init(db, tempdir_path):
    """Multiprocessing pool initializer for acc_transformer workers.

    Registers the sqlite 'array' converter and publishes the source
    database path, temp directory, and accession index as module globals
    read by acc_transformer.
    """
    sqlite3.register_converter("array", convert_array)
    #Path of the temp database each worker reads from.
    global indb
    indb = db
    #Directory where each worker writes its per-accession database.
    global temp_dir
    temp_dir = tempdir_path
    #{accession_name: integer id} lookup shared by all workers.
    global ok
    ok = generate_accessions_index()
1392
+
1393
def acc_transformer(acc_name):
    """Invert one accession's genome->kmers table into a kmer->genomes table.

    Reads {acc}_genomes from the global temp database (indb), groups genome
    ids by kmer, and writes an indexed {acc} table into a fresh per-accession
    database under the global temp_dir. Returns [db_path, acc_name].
    """
    source = sqlite3.connect(indb)
    scurs = source.cursor()

    data = scurs.execute("SELECT * FROM {acc}_genomes".format(acc=acc_name)).fetchall()

    scurs.close()
    source.close()

    #Group genome ids under each kmer they contain.
    reformat = {}
    for genome, blob in data:
        for k in np.frombuffer(blob, dtype=np.int32):
            reformat.setdefault(k, []).append(genome)

    data = None

    #Serialize each genome list back to int32 bytes for storage.
    payload = []
    for k in reformat:
        packed = np.array(reformat[k], dtype = np.int32).tobytes()
        reformat[k] = None
        payload.append((int(k), packed,))

    my_acc_db = os.path.normpath(temp_dir + "/"+acc_name+".db")

    #Start clean on reruns.
    if os.path.exists(my_acc_db):
        os.remove(my_acc_db)

    my_db = sqlite3.connect(my_acc_db)
    curs = my_db.cursor()
    curs.execute("CREATE TABLE {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=acc_name))
    my_db.commit()

    curs.executemany("INSERT INTO {acc} VALUES (?, ?)".format(acc = acc_name), payload)
    my_db.commit()

    payload = None

    curs.execute("CREATE INDEX {acc}_index ON {acc} (kmer)".format(acc=acc_name))
    my_db.commit()

    curs.close()
    my_db.close()

    return [my_acc_db, acc_name]
1443
+
1444
def build_db(genomes, proteins, hmms, db_name, output, threads, verbose, do_compress):
    """Build (or extend) a FastAAI SQLite database from genome/protein/HMM inputs.

    Preprocesses every input in a worker pool, stages per-genome kmer data in
    a temporary database, then merges it into the final database, inverting
    each accession's table to kmer -> genomes in a second pool pass.
    Returns True on success, False when an existing file given as the
    database does not look like a FastAAI database.
    """
    success = True

    imported_files = fastaai_file_importer(genomes = genomes, proteins = proteins, hmms = hmms, output = output, compress = do_compress)
    imported_files.determine_inputs()

    if imported_files.error:
        print("Exiting FastAAI due to input file error.")
        quit()

    #NOTE(review): passes "query" even though this is the build path; the
    #database/ directory is created manually just below — confirm intended.
    good_to_go = prepare_directories(output, imported_files.status, "query")

    db_path = os.path.normpath(output + "/database")
    if not os.path.exists(db_path):
        os.mkdir(db_path)

    if not good_to_go:
        print("Exiting FastAAI")
        sys.exit()

    print("")

    #NOTE(review): hmm_path is never used; find_hmm() is called again as
    #hmm_file below.
    hmm_path = find_hmm()

    #Check if the db contains path info. Incl. windows version.
    if "/" not in db_name and "\\" not in db_name:
        final_database = os.path.normpath(output + "/database/" + db_name)
    else:
        #If the person insists that the db has a path, let them.
        final_database = db_name

    #We'll skip trying this if the file already exists.
    existing_genome_IDs = None
    try:
        if os.path.exists(final_database):
            parent = sqlite3.connect(final_database)
            curs = parent.cursor()

            #Collect genome -> id so new genomes get fresh, non-colliding ids.
            existing_genome_IDs = {}
            sql_command = "SELECT genome, gen_id FROM genome_index"
            for result in curs.execute(sql_command).fetchall():
                genome = result[0]
                id = int(result[1])
                existing_genome_IDs[genome] = id

            curs.close()
            parent.close()
    except:
        print("You specified an existing file to be a database, but it does not appear to be a FastAAI database.")
        print("FastAAI will not be able to continue. Please give FastAAI a different database name and continue.")
        print("Exiting.")
        success = False

    if success:
        hmm_file = find_hmm()
        if existing_genome_IDs is not None:
            #Continue numbering after the highest existing genome id.
            genome_idx = max(list(existing_genome_IDs.values()))+1
        else:
            existing_genome_IDs = {}
            genome_idx = 0

        #Scratch space for the staging database and per-accession worker dbs.
        td = tempfile.mkdtemp()

        temp_db = os.path.normpath(td+"/FastAAI_temp_db.db")

        if os.path.exists(temp_db):
            os.remove(temp_db)

        sqlite3.register_converter("array", convert_array)
        worker = sqlite3.connect(temp_db)
        wcurs = worker.cursor()
        wcurs.execute("CREATE TABLE genome_index (genome text, gen_id integer, protein_count integer)")
        wcurs.execute("CREATE TABLE genome_acc_kmer_counts (genome integer, accession integer, count integer)")
        #One staging table per possible SCP accession.
        ok = generate_accessions_index()
        for t in ok:
            wcurs.execute("CREATE TABLE " + t + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")

        worker.commit()

        new_gens = []
        new_gak = []
        #Used as a set of accession names actually observed in the inputs.
        accs_seen = {}
        if verbose:
            tracker = progress_tracker(total = len(imported_files.in_files), message = "Processing inputs")
        else:
            print("Processing inputs")

        #Only build_db makes a log.
        if not os.path.exists(os.path.normpath(output + "/" + "logs")):
            os.mkdir(os.path.normpath(output + "/" + "logs"))

        logger = open(os.path.normpath(output+"/logs/"+"FastAAI_preprocessing_log.txt"), "a")
        #Tab-separated header for the per-input preprocessing report.
        print("file", "start_date", "end_date", "starting_format",
            "prot_prediction_time", "trans_table", "hmm_search_time", "besthits_time",
            "errors", sep = "\t", file = logger)

        pool = multiprocessing.Pool(threads, initializer = hmm_preproc_initializer,
            initargs = (hmm_file, do_compress,))

        #imap preserves input order, so log lines match input order.
        for result in pool.imap(run_build, imported_files.in_files):
            #log data, regardless of kind
            print(result.basename, result.start_time, result.end_time, result.initial_state,
                result.prot_pred_time, result.trans_table, result.hmm_search_time, result.besthits_time,
                result.err_log, sep = "\t", file = logger)

            #run_build sets best_hits_kmers to None when no SCPs were found.
            if result.best_hits_kmers is not None:
                genome_name = result.original_name

                if genome_name in existing_genome_IDs:
                    print(genome_name, "Already present in final database and will be skipped.")
                    print("")
                else:
                    protein_count = result.protein_count
                    for acc_name in result.best_hits_kmers:
                        if acc_name not in accs_seen:
                            accs_seen[acc_name] = 0
                        acc_id = ok[acc_name]
                        kmer_ct = result.protein_kmer_count[acc_name]
                        kmers = result.best_hits_kmers[acc_name]
                        #Stored as raw int32 bytes; unpacked with np.frombuffer later.
                        kmers = kmers.tobytes()
                        wcurs.execute("INSERT INTO {acc}_genomes VALUES (?, ?)".format(acc=acc_name), (genome_idx, kmers,))
                        new_gak.append((genome_idx, acc_id, kmer_ct,))

                    new_gens.append((genome_name, genome_idx, protein_count,))
                    genome_idx += 1

            worker.commit()

            if verbose:
                tracker.update()

        pool.close()

        logger.close()

        wcurs.executemany("INSERT INTO genome_index VALUES (?,?,?)", new_gens)
        wcurs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?,?,?)", new_gak)
        worker.commit()

        wcurs.close()
        worker.close()

        accs_seen = list(accs_seen.keys())

        #Merge the staging database into the final database.
        parent = sqlite3.connect(final_database)
        curs = parent.cursor()

        curs.execute("attach '" + temp_db + "' as worker")
        #initialize if needed.
        curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
        curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")

        curs.execute("INSERT INTO genome_index SELECT * FROM worker.genome_index")
        curs.execute("INSERT INTO genome_acc_kmer_counts SELECT * FROM worker.genome_acc_kmer_counts")
        curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")
        parent.commit()

        if verbose:
            tracker = progress_tracker(total = len(accs_seen), message = "Collecting results")
        else:
            print("Collecting results")

        #Second pool pass: invert each accession's table to kmer -> genomes.
        pool = multiprocessing.Pool(threads, initializer = acc_transformer_init,
            initargs = (temp_db, td,))

        for result in pool.imap_unordered(acc_transformer, accs_seen):
            database, accession = result[0], result[1]
            curs.execute("CREATE TABLE IF NOT EXISTS {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=accession))
            curs.execute("CREATE TABLE IF NOT EXISTS {acc}_genomes (genome INTEGER PRIMARY KEY, kmers array)".format(acc=accession))
            curs.execute("CREATE INDEX IF NOT EXISTS {acc}_index ON {acc}(kmer)".format(acc=accession))

            #Get the genomes from worker db.
            curs.execute("INSERT INTO {acc}_genomes SELECT * FROM worker.{acc}_genomes".format(acc=accession))

            parent.commit()

            accdb = sqlite3.connect(database)
            acc_curs = accdb.cursor()

            #genomes selected twice: second copy feeds the upsert's concat parameter.
            to_update = acc_curs.execute("SELECT kmer, genomes, genomes FROM {acc}".format(acc=accession)).fetchall()

            acc_curs.close()
            accdb.close()

            #Upsert: append the new genome bytes to any existing row for the kmer.
            update_concat_sql = "INSERT INTO {acc} VALUES (?,?) ON CONFLICT(kmer) DO UPDATE SET genomes=genomes || (?)".format(acc=accession)
            curs.executemany(update_concat_sql, to_update)

            parent.commit()

            #Per-accession worker db is no longer needed.
            os.remove(database)

            if verbose:
                tracker.update()

        pool.close()

        curs.execute("detach worker")

        parent.commit()

        curs.close()
        parent.close()

        #Best-effort scratch cleanup.
        os.remove(temp_db)
        try:
            if len(os.listdir(td)) == 0:
                shutil.rmtree(td)
        except:
            pass

    if success:
        print("Database build complete!")

    return success
1663
+
1664
def file_v_db_initializer(tgak, tgt_names, tgt_cts, hmm_file, do_compress, tgt_ct, sd, out, style, in_mem, build_q, tdb):
	"""Pool initializer for file-vs-database queries.

	Publishes the shared target-database metadata and a per-process HMM
	manager as module-level globals so file_v_db_worker can reach them
	without re-pickling the data for every task.

	Returns the full tuple of globals it set (useful for testing/inspection).
	"""
	global _tdb, _tgt_gak, _tname, _tct, hmm_manager, num_tgts
	global _do_sd, out_style, out_base, db_is_in_mem, make_query_db

	#Target database handle/path and its per-accession genome kmer counts.
	_tdb = tdb
	_tgt_gak = tgak
	#Target genome names and per-genome protein counts.
	_tname = tgt_names
	_tct = tgt_cts

	#One HMM manager per worker process; loads the profile file once up front.
	hmm_manager = pyhmmer_manager(do_compress)
	hmm_manager.load_hmm_from_file(hmm_file)

	num_tgts = tgt_ct
	_do_sd = sd
	out_style = style
	out_base = out
	db_is_in_mem = in_mem
	make_query_db = build_q

	return _tdb, _tgt_gak, _tname, _tct, hmm_manager, num_tgts, _do_sd, out_base, out_style, db_is_in_mem, make_query_db
1701
+
1702
def file_v_db_worker(query_args):
	"""Compute AAI estimates for one query input file against the target database.

	Runs inside a pool process prepared by file_v_db_initializer, which
	publishes the globals used here (_tdb, _tgt_gak, _tname, _tct, num_tgts,
	_do_sd, out_style, out_base, db_is_in_mem, make_query_db).

	query_args is a 1-tuple holding a query file object; it is preprocessed
	here, in the worker. Returns a list [query_name, matrix_row_or_None,
	query_db_rows_or_None]: slot 1 is filled with per-target Jaccard averages
	when out_style == "matrix"; slot 2 collects (name, acc_id, kmer_bytes)
	rows when make_query_db is set. In "tsv" style the per-query result file
	is written directly from this worker.
	"""
	#query info for this particular query
	in_file = query_args[0]

	in_file.preprocess()

	qname = in_file.basename

	do_sd = _do_sd

	#std dev. calcs are not meaningful with matrix style output.
	if out_style == "matrix":
		do_sd = False

	if do_sd:
		#Per-accession rows are stacked later to compute SDs.
		results = []
		shared_acc_counts = []
	else:
		#Running sums over targets; one slot per target genome.
		results = np.zeros(shape = num_tgts, dtype = np.float_)
		shared_acc_counts = np.zeros(shape = num_tgts, dtype = np.int32)

	if db_is_in_mem:
		#The connection is already given as MDB if the db is in mem
		tconn = _tdb
	else:
		#db is on disk and the connection has to be established.
		tconn = sqlite3.connect(_tdb)

	tcurs = tconn.cursor()

	#This is a difference from the DB-first method.
	acc_idx = generate_accessions_index(forward = True)

	#NOTE(review): genome_lists is never used in this function.
	genome_lists = {}

	#Each fetched row collapses to its first (only) column: a genome-id blob.
	tcurs.row_factory = lambda cursor, row: row[0]


	if make_query_db:
		ret = [qname, None, []]
	else:
		ret = [qname, None, None]

	#We need to purge accsessions not in tgt.
	for acc in in_file.best_hits_kmers:
		one = in_file.best_hits_kmers[acc]
		acc_id = acc_idx[acc]

		if make_query_db:
			ret[2].append((qname, acc_id, one.tobytes(),))

		#Check working.
		if acc_id in _tgt_gak:

			kmer_ct = one.shape[0]

			if do_sd:
				#Record which targets share this accession (0/1 per target).
				hits = np.zeros(shape = num_tgts, dtype = np.int32)
				hits[np.nonzero(_tgt_gak[acc_id])] = 1
				shared_acc_counts.append(hits)
			else:
				shared_acc_counts[np.nonzero(_tgt_gak[acc_id])] += 1

			#SQL has a max binding size of 999, for some reason.
			if kmer_ct > 998:
				#Each kmer needs to be a tuple.
				these_kmers = [(int(kmer),) for kmer in one]

				#Temp-table join avoids the SQLite bound-parameter limit.
				temp_name = "_" + qname +"_" + acc
				temp_name = temp_name.replace(".", "_")

				tcurs.execute("CREATE TEMP TABLE " + temp_name + " (kmer INTEGER)")
				tconn.commit()
				insert_table = "INSERT INTO " + temp_name + " VALUES (?)"
				tcurs.executemany(insert_table, these_kmers)
				tconn.commit()
				join_and_select_sql = "SELECT genomes FROM " + temp_name + " INNER JOIN " + acc + " ON "+ temp_name+".kmer = " + acc+".kmer;"

				set = tcurs.execute(join_and_select_sql).fetchall()
			else:
				#kmers must be a list, not a tuple.
				these_kmers = [int(kmer) for kmer in one]
				select = "SELECT genomes FROM " + acc + " WHERE kmer IN ({kmers})".format(kmers=','.join(['?']*len(these_kmers)))

				#NOTE(review): local name 'set' shadows the builtin within this loop.
				set = tcurs.execute(select, these_kmers).fetchall()

			#join results into one bytestring.
			set = b''.join(set)

			#Each int32 in the blob is a target genome id; bincount = per-target
			#count of shared kmers for this accession.
			these_intersections = np.bincount(np.frombuffer(set, dtype = np.int32), minlength = num_tgts)
			set = None
			#Add tgt kmer counts to query kmer counts, find union size based on intersection size, cald jacc
			jacc = np.divide(these_intersections, np.subtract(np.add(_tgt_gak[acc_id], kmer_ct), these_intersections))

			if do_sd:
				results.append(jacc)
			else:
				results += jacc

	tcurs.row_factory = None
	tcurs.close()

	if do_sd:
		results = np.vstack(results)
		has_accs = np.vstack(shared_acc_counts)

		shared_acc_counts = np.sum(has_accs, axis = 0)

		#final jacc_means
		jaccard_averages = np.divide(np.sum(results, axis = 0), shared_acc_counts)

		aai_ests = numpy_kaai_to_aai(jaccard_averages)

		#find diffs from means; this includes indicies corresponding to unshared SCPs that should not be included.
		results = results - jaccard_averages

		#fix those corresponding indicies to not contribute to the final SD.
		results[np.nonzero(has_accs == 0)] = 0

		#Square them
		results = np.square(results)
		#Sum squares and divide by shared acc. count, the sqrt to get SD.
		jaccard_SDs = np.sqrt(np.divide(np.sum(results, axis = 0), shared_acc_counts))
		jaccard_SDs = np.round(jaccard_SDs, 4).astype(str)

	else:
		#other condition.
		jaccard_SDs = None
		jaccard_averages = np.divide(results, shared_acc_counts)
		#we don't want to pass char arrays to main, so skip this here and do it in main instead.
		if out_style != "matrix":
			aai_ests = numpy_kaai_to_aai(jaccard_averages)

	del results

	#Since the outputs go to separate files, it makes more sense to do them within the worker processes instead of in main.
	if out_style == "tsv":
		#Targets sharing no accession with the query get "N/A" everywhere.
		no_hit = np.where(shared_acc_counts == 0)

		possible_hits = np.minimum(len(in_file.best_hits_kmers), _tct).astype(str)
		jaccard_averages = np.round(jaccard_averages, 4).astype(str)
		shared_acc_counts = shared_acc_counts.astype(str)

		jaccard_averages[no_hit] = "N/A"
		aai_ests[no_hit] = "N/A"
		shared_acc_counts[no_hit] = "N/A"
		possible_hits[no_hit] = "N/A"

		output_name = os.path.normpath(out_base + "/"+qname+"_results.txt")

		out = open(output_name, "w")
		out.write("query\ttarget\tavg_jacc_sim\tjacc_SD\tnum_shared_SCPs\tposs_shared_SCPs\tAAI_estimate\n")
		if do_sd:
			jaccard_SDs[no_hit] = "N/A"
			for i in range(0, len(aai_ests)):
				out.write(qname+"\t"+_tname[i]+"\t"+jaccard_averages[i]+"\t"+jaccard_SDs[i]+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
		else:
			for i in range(0, len(aai_ests)):
				out.write(qname+"\t"+_tname[i]+"\t"+jaccard_averages[i]+"\t"+"N/A"+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
		out.close()


	#We're just gonna pass this back to the main to print.
	if out_style == "matrix":
		ret[1] = jaccard_averages

	return ret
1869
+
1870
#Handles both query and target types for a db vs db query
class file_vs_db_query:
	"""Query a set of input file objects against an existing FastAAI database.

	Coordinates target-database loading (optionally into memory), fans the
	query files out to a multiprocessing pool running file_v_db_worker, and
	writes either per-query TSV files or one AAI matrix. Optionally builds a
	new FastAAI database from the processed queries.
	"""
	def __init__(self, in_memory = False, input_file_objects = None,
	target = None, threads = 1, do_sd = False, output_base = "FastAAI", output_style = "tsv",
	build_db_from_queries = True, qdb_name = "Query_FastAAI_database.db", hmm_path = None,
	do_comp = True, verbose = True):
		#files to work with
		self.queries = input_file_objects
		self.do_db_build = build_db_from_queries
		self.dbname = qdb_name

		#Path to the target FastAAI database; valids holds its usable accession ids.
		self.t = target
		self.valids = None

		#Originally this was made to be a memory database only block of code, but just if/else one change makes it work on disk and it doesn't need a redev, then.
		self.as_mem_db = in_memory

		self.t_conn = None
		self.t_curs = None

		self.threads = threads
		self.do_sd = do_sd

		self.output_base = output_base
		self.output = os.path.normpath(output_base + "/results")
		self.style = output_style

		#Fall back to the packaged HMM profile when none is supplied.
		if hmm_path is not None:
			self.hmm_path = hmm_path
		else:
			self.hmm_path = find_hmm()

		self.do_comp = do_comp

		self.verbose = verbose

	'''
	Workflow is:
		load target db as mem (optional)
		assess valid targets
		create query db output (optional)
		pass query args to workers
			preproc query args
			write results
		fill query_db_out (optional)
	'''


	def open(self):
		"""Connect to the target DB (copying it into memory if requested) and
		record which accession tables it actually contains in self.valids."""
		if self.as_mem_db:
			self.t_conn = sqlite3.connect(':memory:')
		else:
			self.t_conn = sqlite3.connect(self.t)

		self.t_curs = self.t_conn.cursor()

		if self.as_mem_db:
			#Copy the on-disk target into the :memory: connection table by table.
			self.t_curs.execute("attach '" + self.t + "' as targets")

			self.t_curs.execute("CREATE TABLE genome_index AS SELECT * FROM targets.genome_index")
			self.t_curs.execute("CREATE TABLE genome_acc_kmer_counts AS SELECT * FROM targets.genome_acc_kmer_counts")
			self.t_curs.execute("CREATE INDEX t_gi ON genome_index (gen_id)")
			self.t_curs.execute("CREATE INDEX t_gak ON genome_acc_kmer_counts (accession)")

		if self.as_mem_db:
			table_sql = "SELECT name FROM targets.sqlite_master"
		else:
			table_sql = "SELECT name FROM sqlite_master"


		#Only tables whose names are recognized SCP accessions are usable.
		ok = generate_accessions_index()
		ok_names = set(list(ok.keys()))
		successful_tables = []

		for name in self.t_curs.execute(table_sql).fetchall():
			name = name[0]
			if name in ok_names:
				successful_tables.append(ok[name])
				if self.as_mem_db:
					self.t_curs.execute("CREATE TABLE " + name + " AS SELECT * FROM targets."+name)
					self.t_curs.execute("CREATE INDEX "+name+"_index ON " + name+" (kmer)" )

		if self.as_mem_db:
			self.t_conn.commit()
			self.t_curs.execute("detach targets")

		self.valids = tuple(successful_tables)

	def close(self):
		"""Release the cursor only; the connection may still be needed (in-memory DB)."""
		self.t_curs.close()
		self.t_curs = None

	def clean_up(self):
		"""Close and drop the target-database connection."""
		self.t_conn.close()
		self.t_conn = None

	def sqlite_table_schema(self, conn, name):
		"""Return a string representing the table's CREATE"""
		cursor = conn.execute("SELECT sql FROM sqlite_master WHERE name=?;", [name])
		sql = cursor.fetchone()[0]
		cursor.close()
		return sql

	def execute(self):
		"""Run the full query: gather target metadata, dispatch workers, write
		outputs, and optionally assemble a query database. Assumes open() ran."""
		print("FastAAI is running.")
		tgt_id_res = self.t_curs.execute("SELECT * FROM genome_index ORDER BY gen_id").fetchall()

		tgt_ids = []
		tgt_naming = []
		tgt_counts = []
		for r in tgt_id_res:
			genome, id, prot_ct = r[0], r[1], r[2]
			tgt_ids.append(genome)
			tgt_naming.append(genome)
			tgt_counts.append(prot_ct)

		num_tgts = len(tgt_ids)
		tgt_counts = np.array(tgt_counts, dtype = np.int32)

		#Per-accession arrays of kmer counts, one slot per target genome.
		tgts_gak = {}
		gak_sql = "SELECT * FROM genome_acc_kmer_counts WHERE accession in ({accs})".format(accs=','.join(['?']*len(self.valids)))

		for result in self.t_curs.execute(gak_sql, self.valids).fetchall():
			genome, acc, ct = result[0], result[1], result[2]
			if acc not in tgts_gak:
				tgts_gak[acc] = np.zeros(num_tgts, dtype = np.int32)
			tgts_gak[acc][genome] += ct

		#If the DB is a memory DB, we need to maintain the connection, but neither needs to maintain the curor in main.
		self.close()

		query_groups = []

		#Workers expect each query wrapped in a 1-tuple.
		for query_input in self.queries:
			query_groups.append((query_input,))

		#And if it's a physical database, we do want to close it.
		if not self.as_mem_db:
			self.t_conn.close()

		num_queries = len(query_groups)

		if self.do_db_build:
			sqlite3.register_converter("array", convert_array)
			qdb_path = os.path.normpath(self.output_base + "/database/"+self.dbname)
			if not os.path.exists(os.path.normpath(self.output_base + "/database")):
				try:
					os.mkdir(os.path.normpath(self.output_base + "/database"))
				except:
					#NOTE(review): after this failure, do_db_build is cleared but the
					#else-branch below may still try sqlite3.connect(qdb_path) — confirm.
					print("Couldn't make database at", qdb_path)
					self.do_db_build = False

			if os.path.exists(qdb_path):
				print("Database for queries already exists. I can't make one at:", qdb_path)
				self.do_db_build = False
			else:
				query_db_conn = sqlite3.connect(qdb_path)
				q_curs = query_db_conn.cursor()
				#Staging table; reshaped into per-accession tables at the end.
				q_curs.execute("CREATE TABLE storage (genome INTEGER, accession INTEGER, kmers array)")
				q_curs.execute("CREATE INDEX store_idx ON storage (genome, accession)")
				query_genome_index = []
				qgi_ct = 0
				qg_gak = []

		if self.verbose:
			tracker = progress_tracker(total = num_queries, message = "Calculating AAI...", one_line = True)

		if self.style == "matrix":
			output_name = os.path.normpath(self.output + "/FastAAI_matrix.txt")
			output = open(output_name, "w")
			#needs target names.
			print("query_genome", *tgt_ids, sep = "\t", file = output)

		#Need to pass these

		#both initializers will share this.
		shared_args = [tgts_gak, tgt_naming, tgt_counts, self.hmm_path, self.do_comp, num_tgts, self.do_sd, self.output,
		self.style, self.as_mem_db, self.do_db_build]

		if self.as_mem_db:
			shared_args.append(self.t_conn)
			shared_args = tuple(shared_args)
			pool = multiprocessing.Pool(self.threads, initializer = file_v_db_initializer,
			initargs = shared_args)
		else:
			#db is on disk,
			shared_args.append(self.t)
			shared_args = tuple(shared_args)
			pool = multiprocessing.Pool(self.threads, initializer = file_v_db_initializer,
			initargs = shared_args)

		#imap (not unordered): matrix rows must come back in query order.
		for result in pool.imap(file_v_db_worker, query_groups):
			if self.verbose:
				tracker.update()
			qname = result[0]
			if self.style == "matrix":
				printout = numpy_kaai_to_aai(result[1])
				print(qname, *printout, sep = "\t", file = output)

			if self.do_db_build:
				#NOTE(review): len(result[2]) counts accession rows, stored in the
				#protein_count column — presumably equivalent here; confirm.
				query_genome_index.append((qname, qgi_ct, len(result[2]),))
				for row in result[2]:
					#kmers are int32, so bytes/4 = kmer count.
					num_kmers = int(len(row[2])/4)
					qg_gak.append((qgi_ct, row[1], num_kmers,))
				qgi_ct += 1
				q_curs.executemany("INSERT INTO storage VALUES (?, ?, ?)", result[2])
				query_db_conn.commit()

		pool.close()

		if self.style == "matrix":
			output.close()

		if self.do_db_build:
			q_curs.execute("CREATE TABLE genome_index (genome text, gen_id integer, protein_count integer)")
			q_curs.execute("CREATE TABLE genome_acc_kmer_counts (genome integer, accession integer, count integer)")
			q_curs.executemany("INSERT INTO genome_index VALUES (?,?,?)", query_genome_index)
			q_curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?,?,?)", qg_gak)
			query_db_conn.commit()

			acc_id_to_name = generate_accessions_index(forward = False)
			#Map query genome name -> integer id for the final tables.
			qgi_dict = {}
			for tup in query_genome_index:
				qgi_dict[tup[0]] = tup[1]

			accs_in_db = q_curs.execute("SELECT DISTINCT(accession) FROM genome_acc_kmer_counts").fetchall()
			if self.verbose:
				tracker = progress_tracker(total = len(accs_in_db), message = "Crafting database from query outputs.", one_line = True)

			#Reshape 'storage' into the per-accession kmer->genomes and
			#genome->kmers tables FastAAI databases use.
			for acc in accs_in_db:
				acc = acc[0]
				acc_name = acc_id_to_name[acc]
				q_curs.execute("CREATE TABLE " + acc_name + " (kmer INTEGER PRIMARY KEY, genomes array)")
				q_curs.execute("CREATE TABLE " + acc_name + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")
				data = q_curs.execute("SELECT genome, kmers FROM storage WHERE accession = ?", (acc,)).fetchall()

				ins = []
				#group by kmer
				kmers_by_gen = {}
				for row in data:
					gen = row[0]
					gen = qgi_dict[gen]
					kmers = np.frombuffer(row[1], dtype = np.int32)
					ins.append((gen, kmers,))
					for k in kmers:
						#typecast
						k = int(k)
						if k not in kmers_by_gen:
							kmers_by_gen[k] = []
						kmers_by_gen[k].append(gen)

				data = None

				q_curs.executemany("INSERT INTO "+ acc_name + "_genomes VALUES (?,?)", ins)

				ins = []
				for k in kmers_by_gen:
					dat = kmers_by_gen[k]
					dat = np.sort(np.array(dat, dtype = np.int32))
					ins.append((k, dat.tobytes()))

				q_curs.executemany("INSERT INTO "+ acc_name + " VALUES (?,?)", ins)

				ins = None

				query_db_conn.commit()

				q_curs.execute("CREATE INDEX IF NOT EXISTS " + acc_name + "_index ON " + acc_name + " (kmer)")

				if self.verbose:
					tracker.update()


			q_curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")
			#Staging data is no longer needed; drop it and reclaim the space.
			q_curs.execute("DROP INDEX store_idx")
			q_curs.execute("DROP TABLE storage")
			query_db_conn.commit()
			q_curs.execute("VACUUM")
			query_db_conn.commit()
			q_curs.close()
			query_db_conn.close()

	#Actually run the thing.
	def run(self):
		self.open()
		self.execute()
		#Clean up the db connections; free the mem.
		self.clean_up()
2158
+
2159
def numpy_kaai_to_aai(kaai_array):
	"""Convert average Jaccard (kAAI) values to printable AAI estimate strings.

	Applies the fitted transform
	aai_hat = (-0.3087057 + 1.810741 * exp(-(-0.2607023 * ln(kaai))**(1/3.435))) * 100
	then rounds to 2 decimals. Estimates below 30 become "<30%", above 90
	become ">90%", and non-positive inputs are forced to "<30%" (the curve
	would otherwise map them high). The input array is left unmodified.
	"""
	#Operate on a copy so the caller's jaccard averages survive.
	est = kaai_array.copy()

	positive = np.where(est > 0)
	nonpositive = np.where(est <= 0)

	#Take the log only where it is defined, silencing numpy warnings on zeros.
	est[positive] = np.log(est[positive])

	est = np.multiply(np.subtract(np.multiply(np.exp(np.negative(np.power(np.multiply(est, -0.2607023), (1/3.435)))), 1.810741), 0.3087057), 100)

	#Locate out-of-range estimates before rounding.
	below_range = np.where(est < 30)
	above_range = np.where(est > 90)

	est = np.round(est, 2)

	#Convert to the final printable strings.
	est = est.astype(str)
	est[below_range] = "<30%"
	est[above_range] = ">90%"
	#The math of the above ends up with zero values being big, so we fix those.
	est[nonpositive] = "<30%"

	return est
2198
+
2199
#Also includes a multiply by 100 and type conversion compared to original - this is some silliness for saving memory.
def numpy_kaai_to_aai_just_nums(kaai_array, as_float = False):
	"""Numeric-only variant of numpy_kaai_to_aai.

	Applies the same fitted kAAI->AAI transform, but returns numbers instead
	of strings: floats rounded to 2 decimals when as_float is True, otherwise
	AAI * 100 stored as int16 (memory-saving form). Out-of-range estimates
	are clamped to the sentinels 15 (below 30) and 95 (above 90); sentinel
	values can be recognized downstream. Non-positive inputs also map to the
	low sentinel 15, matching numpy_kaai_to_aai's "<30%" handling.

	The input array is left unmodified.
	"""
	#aai_hat = (-0.3087057 + 1.810741 * (np.exp(-(-0.2607023 * np.log(kaai))**(1/3.435))))*100

	#Protect the original jaccard averages memory item
	aai_hat_array = kaai_array.copy()

	non_zero = np.where(aai_hat_array > 0)
	is_zero = np.where(aai_hat_array <= 0)

	#Avoid zeroes in log - still actually works, but it produces warnings I don't want to see.
	aai_hat_array[non_zero] = np.log(aai_hat_array[non_zero])

	aai_hat_array = np.multiply(np.subtract(np.multiply(np.exp(np.negative(np.power(np.multiply(aai_hat_array, -0.2607023), (1/3.435)))), 1.810741), 0.3087057), 100)

	aai_hat_array = np.round(aai_hat_array, 2)

	#<30 and >90 values
	smol = np.where(aai_hat_array < 30)
	big = np.where(aai_hat_array > 90)

	#We can find these later.
	aai_hat_array[smol] = 15
	aai_hat_array[big] = 95
	#BUGFIX: a zero/negative jaccard passes through the curve as a large value
	#and previously landed in the HIGH (95) bucket; force it into the low
	#sentinel instead, mirroring numpy_kaai_to_aai's "<30%" fix for zeros.
	aai_hat_array[is_zero] = 15

	if as_float:
		aai_hat_array = np.round(aai_hat_array, 2)
	else:
		#Store AAI * 100 as int16 to save memory; sentinels become 1500/9500.
		aai_hat_array = np.multiply(aai_hat_array, 100)
		aai_hat_array = np.round(aai_hat_array, 2)
		aai_hat_array = aai_hat_array.astype(np.int16)

	return aai_hat_array
2243
+
2244
+
2245
def curtime():
	"""Return the current local time formatted as 'DD/MM/YYYY HH:MM:SS'."""
	return datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
2250
+
2251
#Perform a minimal-memory query of a target database from input files. Lighter weight function for low memory
def sql_query_opts():
	"""Build and parse the command-line options for the SQL-based query module.

	Returns:
		(parser, args): the ArgumentParser (kept so callers can print help on
		error) and the parsed known arguments. Unrecognized arguments are
		tolerated via parse_known_args and discarded.
	"""
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
			description='''
	This FastAAI module takes one or many genomes, proteins, or proteins and HMMs as a QUERY and searches them against an existing FastAAI database TARGET using SQL
	If you only have a few genomes - or not enough RAM to hold the entire target database in memory - this is the probably the best option for you.

	To provide files, supply either a directory containing only one type of file (e.g. only genomes in FASTA format), a file containing paths to files of a type, 1 per line,
	or a comma-separated list of files of a single type (no spaces)

	If you provide FastAAI with genomes or only proteins (not proteins and HMMs), this FastAAI module will produce the required protein and HMM files as needed
	and place them in the output directory, just like it does while building a database.

	Once these inputs are ready to be queried against the database (each has both a protein and HMM file), they will be processed independently, 1 per thread at a time.

	Note: Protein and HMM files generated during this query can be supplied to build a FastAAI database from proteins and HMMs using the build_db module, without redoing preprocessing.
	''')

	#Input selection: exactly one style of input (genomes, or proteins, or proteins+HMMs).
	parser.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'Genomes in FASTA format.')
	parser.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'Protein amino acids in FASTA format.')
	parser.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'HMM search files produced by FastAAI on a set of proteins.')

	parser.add_argument('--target', dest = 'target', default = None, help = 'A path to the FastAAI database you wish to use as the target')

	#Output control.
	parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory where FastAAI will place the result of this query and any protein or HMM files it has to generate. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')
	parser.add_argument('--output_style', dest = "style", default = 'tsv', help = "Either 'tsv' or 'matrix'. Matrix produces a simplified output of only AAI estimates.")
	parser.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indicies. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')

	parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

	parser.add_argument('--in_memory', dest = "in_mem", action = 'store_true', help = 'Load the target database into memory before querying. Consumes more RAM, but is faster and reduces file I/O substantially.')

	#Optional query-database construction.
	parser.add_argument('--create_query_db', dest = "make_db", action = 'store_true', help = 'Create a query database from the genomes.')
	parser.add_argument('--query_db_name', dest = "qdb_name", default = "Query_FastAAI_db.db", help = 'Name the query database. This file must not already exist.')

	parser.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')

	args, unknown = parser.parse_known_args()

	return parser, args
2292
+
2293
def sql_query_thread_starter(kmer_cts, protein_cts):
	"""Worker initializer: publish target kmer and protein counts as globals."""
	global target_kmer_cts, target_protein_counts
	target_kmer_cts = kmer_cts
	target_protein_counts = protein_cts
2298
+
2299
#took a function from fastaai 2.0
class fastaai_file_importer:
	"""Resolve user-supplied input arguments into lists of input_file objects.

	Each of genomes/proteins/hmms/crystals may be a directory, a file listing
	paths (one per line), or a comma-separated list of paths; the form is
	detected automatically. determine_inputs() validates the combination and
	fills self.in_files; self.error flags invalid combinations.
	"""
	def __init__(self, genomes = None, proteins = None, hmms = None, crystals = None,
	output = "FastAAI", compress = False, crystalize = False):
		#genomes, prots, hmms can be supplied as either directory, a file with paths 1/line, or comma-sep paths. Type is determined automatically.
		self.genomes = genomes
		self.proteins = proteins
		self.hmms = hmms
		self.crystals = crystals

		#Resolved absolute paths per input type, filled by determine_inputs().
		self.genome_list = None
		self.protein_list = None
		self.hmm_list = None
		self.crystal_list = None

		self.crystalize = crystalize

		#file base names.
		self.identifiers = None

		#Set to True on any input-validation failure.
		self.error = False

		#Final list of input_file objects, built by prep_input_files().
		self.in_files = None

		#One of "genome", "protein", "protein+HMM" depending on inputs.
		self.status = "genome"
		self.output = output

		self.do_comp = compress

	def retrieve_files(self, arg):
		"""Resolve one input argument into parallel (files, names) lists.

		Tries three interpretations in order: a directory of files, a text
		file listing one path per line, and a comma-separated list of paths.
		Names are basenames with the extension (and any trailing .gz) removed.
		"""
		done = False
		files = []
		names = []
		#Case where a directory is supplied.
		if os.path.isdir(arg):
			for file in sorted(os.listdir(arg)):
				#Retrieve file name
				if file.endswith(".gz"):
					name = os.path.splitext(os.path.basename(file[:-3]))[0]
				else:
					name = os.path.splitext(os.path.basename(file))[0]

				names.append(name)
				files.append(os.path.abspath(os.path.normpath(arg + '/' +file)))

			done = True


		#Case where a file containing paths is supplied.
		if os.path.isfile(arg):
			handle = agnostic_reader(arg)
			for line in handle:
				file = line.strip()
				if os.path.exists(file):
					if file.endswith(".gz"):
						name = os.path.splitext(os.path.basename(file[:-3]))[0]
					else:
						name = os.path.splitext(os.path.basename(file))[0]

					names.append(name)
					files.append(os.path.abspath(os.path.normpath(file)))

			handle.close()
			done = True

			if len(names) == 0 and len(files) == 0:
				#Try interpreting the file as a singular path.
				done = False

		#Last check.
		if not done:
			for file in arg.split(","):
				if os.path.exists(file):
					if file.endswith(".gz"):
						name = os.path.splitext(os.path.basename(file[:-3]))[0]
					else:
						name = os.path.splitext(os.path.basename(file))[0]

					names.append(name)
					files.append(os.path.abspath(os.path.normpath(file)))

		return files, names

	#Check if g/p/h
	def determine_inputs(self):
		"""Validate the supplied input combination and resolve all file lists.

		Allowed combinations: genomes alone; proteins alone; proteins + HMMs
		(which must be equal-length, name-matched pairs); crystals. On success
		prep_input_files() is called to build self.in_files.
		"""
		if self.genomes is not None:
			self.genome_list, self.identifiers = self.retrieve_files(self.genomes)
			if self.proteins is not None or self.hmms is not None:
				print("You can supply genomes or proteins or proteins and HMMS, but not genomes and anything else.")
				self.error = True

		#Proteins, but no HMMs
		if self.proteins is not None and self.hmms is None:
			self.protein_list, self.identifiers = self.retrieve_files(self.proteins)

		if self.proteins is not None and self.hmms is not None:
			self.protein_list, prot_names = self.retrieve_files(self.proteins)
			self.hmm_list, hmm_names = self.retrieve_files(self.hmms)

			if len(self.protein_list) != len(self.hmm_list):
				print("Different number of proteins and HMMs supplied. You must supply the same number of each, and they must be matched pairs.")
				self.error = True
			else:
				#Pairing is positional; every protein/HMM base name must agree.
				all_same = True
				for p, h in zip(prot_names, hmm_names):
					if p != h:
						all_same = False

				if all_same:
					self.identifiers = prot_names
					prot_names = None
					hmm_names = None
				else:
					self.error = True

		if self.crystals is not None:
			self.crystal_list, self.identifiers = self.retrieve_files(self.crystals)
			#The crystal naming scheme includes an identifier at the end. This removes it.
			self.identifiers = [id[:-13] for id in self.identifiers]


		if not self.error:
			self.prep_input_files()

	def prep_input_files(self):
		"""Wrap the resolved paths in input_file objects and set self.status."""
		self.in_files = []
		if self.genome_list is not None:
			self.status = "genome"
			for g in self.genome_list:
				f = input_file(g, output = self.output, do_compress = self.do_comp, make_crystal = self.crystalize)
				f.set_genome(g)
				self.in_files.append(f)

		if self.protein_list is not None:
			self.status = "protein"
			for p in self.protein_list:
				f = input_file(p, output = self.output, do_compress = self.do_comp, make_crystal = self.crystalize)
				f.set_protein(p)
				self.in_files.append(f)

		if self.hmm_list is not None:
			self.status = "protein+HMM"
			#HMMs attach to the input_file objects created from the proteins above.
			for h, f in zip(self.hmm_list, self.in_files):
				f.set_hmm(h)
2443
+
2444
def sql_query(genomes, proteins, hmms, db_name, output, threads, verbose, do_stdev, style, in_mem, make_db, qdb_name, do_comp):
	"""Query input genome/protein files against an existing FastAAI database.

	Validates that db_name exists, imports and checks the input files,
	prepares the output directory tree, then delegates the actual work to
	file_vs_db_query. Exits the process on any validation failure.
	Returns None.
	"""
	if not os.path.exists(db_name):
		print("")
		print("FastAAI can't find your database:", db_name)
		print("Are you sure that the path you've given to the database is correct and that the database exists?")
		print("FastAAI exiting.")
		print("")
		sys.exit()

	#Collect and sanity-check the supplied inputs.
	imported_files = fastaai_file_importer(genomes = genomes, proteins = proteins, hmms = hmms, output = output)
	imported_files.determine_inputs()

	if imported_files.error:
		print("Exiting FastAAI due to input file error.")
		quit()

	good_to_go = prepare_directories(output, imported_files.status, "query")
	if not good_to_go:
		print("Exiting FastAAI")
		sys.exit()

	print("")

	hmm_path = find_hmm()

	query_runner = file_vs_db_query(in_memory = in_mem,
		input_file_objects = imported_files.in_files,
		target = db_name,
		threads = threads,
		output_base = output,
		do_sd = do_stdev,
		output_style = style,
		do_comp = do_comp,
		build_db_from_queries = make_db,
		qdb_name = qdb_name,
		verbose = verbose,
		hmm_path = hmm_path)
	query_runner.run()

	print("FastAAI query complete! Results at:", os.path.normpath(output + "/results"))
	return None
2490
+
2491
+ #Manages the query process.
2492
def db_query_opts():
	"""Define and parse command-line options for the db-vs-db query module.

	Returns (parser, parsed_args); unknown CLI tokens are tolerated via
	parse_known_args.
	"""
	opts = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''
	This FastAAI module takes two FastAAI databases and searches all of the genomes in the QUERY against all of the genomes in the TARGET

	If you have many genomes (more than 1000), it will be faster to create the query database using FastAAI build_db,
	then search it against an existing target using this module than it is to do the same thing with an SQL query.

	If you give the same database as query and target, a special all vs. all search of the genomes in the database will be done.
	''')

	#Input databases.
	opts.add_argument('-q', '--query', dest = 'query', default = None, help = 'Path to the query database. The genomes FROM the query will be searched against the genomes in the target database')
	opts.add_argument('-t', '--target', dest = 'target', default = None, help = 'Path to the target database.')

	#Output shape.
	opts.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory where FastAAI will place the result of this query. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')
	opts.add_argument('--output_style', dest = "style", default = 'tsv', help = "Either 'tsv' or 'matrix'. Matrix produces a simplified output of only AAI estimates.")
	opts.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indicies. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')

	#Runtime behavior.
	opts.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
	opts.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
	opts.add_argument('--in_memory', dest = "in_mem", action = 'store_true', help = 'Load both databases into memory before querying. Consumes more RAM, but is faster and reduces file I/O substantially. Consider reducing number of threads')
	opts.add_argument('--store_results', dest = "storage", action = 'store_true', help = 'Keep partial results in memory. Only works with --in_memory. Fewer writes, but more RAM. Default off.')

	known, _unknown = opts.parse_known_args()

	return opts, known
2517
+
2518
+
2519
#db-db query; in-mem
def parse_db_init(query, target, outpath):
	"""Pool initializer for the in-memory db-vs-db query workers.

	Stores the query/target database paths and the scratch directory in
	worker-process globals so parse_accession() can reach them without
	having to receive them per task.

	Returns (qdb, tdb, output_path) for convenience.
	"""
	global qdb
	qdb = query
	global tdb
	tdb = target
	global output_path
	output_path = outpath

	#Removed dead 'global query_gak' / 'global target_gak' declarations:
	#those names were never assigned or read in this function.

	return qdb, tdb, output_path
2532
+
2533
def parse_accession(acc):
	"""Worker: extract the shared-kmer structure for one accession table.

	Attaches the query and target databases (paths come from the globals
	qdb/tdb set by parse_db_init) and inner-joins the accession's kmer
	tables, yielding one row per kmer present in both databases. Each row
	carries two binary blobs of genome ids (decoded as int32 arrays).

	Builds:
	  tl - object array; tl[i] = target-genome ids sharing joined kmer i
	  ql - dict: query genome -> {accession id -> indices into tl}
	and pickles [ql, tl] to <output_path>/<acc>.pickle.

	Returns [acc, pickle_path] so the parent can reload and merge the data.
	"""
	tmp = sqlite3.connect(":memory:")
	curs = tmp.cursor()

	#NOTE(review): db paths are spliced directly into the ATTACH statements;
	#a path containing a quote character would break these. Paths are
	#internally generated, so this is presumably safe — confirm.
	curs.execute("attach '" + qdb + "' as queries")
	curs.execute("attach '" + tdb + "' as targets")

	sql = '''
	SELECT queries.{acc}.genomes, targets.{acc}.genomes
	FROM queries.{acc} INNER JOIN targets.{acc}
	ON queries.{acc}.kmer=targets.{acc}.kmer
	'''.format(acc = acc)

	res = curs.execute(sql).fetchall()

	curs.execute("detach queries")
	curs.execute("detach targets")

	curs.close()
	tmp.close()

	#tl collects target-genome arrays per joined kmer; ql maps each query
	#genome to the tl indices it participates in, keyed by accession id.
	tl = []
	ql = {}

	acc_id = generate_accessions_index()
	acc_id = acc_id[acc]

	indexer = 0
	for r in res:
		#Genome id blobs are packed int32 arrays.
		queries = np.frombuffer(r[0], dtype = np.int32)
		tgt = np.frombuffer(r[1], dtype = np.int32)
		tl.append(tgt)

		for q in queries:
			if q not in ql:
				ql[q] = {}
			if acc_id not in ql[q]:
				ql[q][acc_id] = []

			ql[q][acc_id].append(indexer)

		indexer += 1

	#Ragged rows (different lengths per kmer), hence dtype=object.
	tl = np.array(tl, dtype = object)

	#Convert index lists to arrays for fast fancy-indexing later.
	for q in ql:
		if acc_id in ql[q]:
			ql[q][acc_id] = np.array(ql[q][acc_id], dtype=np.int32)

	out_file = os.path.normpath(output_path+"/"+acc+".pickle")

	#Results go to disk rather than through the pool's result pipe; the
	#parent loads and deletes the pickle.
	with open(out_file, "wb") as out:
		pickle.dump([ql, tl], out)

	return([acc, out_file])
2588
+
2589
#Exclusive to the in-memory approach for the db-vs-db query.
def one_init(ql, tl, num_tgt, qgak_queue, tgak, tpres, sd, sty, output_dir, store_results, progress_queue, qnames, tnames, temp_dir):
	"""Pool initializer for the in-memory db-vs-db AAI workers.

	Installs the shared lookup structures into worker-process globals and
	claims exactly one pre-split group of queries (plus its genome/acc/kmer
	data) from qgak_queue. In matrix mode, results are either accumulated
	in RAM (store_results) or streamed to a per-worker partial file.
	"""
	global _ql, _tl, _nt, _qgak, _tgak, _tpres, _tct, _qnames, _tnames
	global out_base, group_id, query_grouping, do_sd, style, store, prog_queue

	_ql = ql
	_tl = tl
	_nt = num_tgt

	#Each worker consumes one (group id, gak dict, query index range) tuple.
	qgak_data = qgak_queue.get()

	out_base = output_dir
	group_id = os.path.normpath(temp_dir + "/partial_results_group_" + str(qgak_data[0]) + ".txt")
	_qgak = qgak_data[1]
	query_grouping = qgak_data[2]
	qgak_data = None

	_tgak = tgak
	_tpres = tpres
	#Number of targets carrying each accession.
	_tct = np.sum(_tpres, axis = 0)

	do_sd = sd
	style = sty
	#Suppress div by zero warning - it's handled.
	np.seterr(divide='ignore')

	store = store_results
	if store:
		#Accumulate matrix rows in memory; flushed later by two_work().
		global holder
		holder = []
	else:
		#Stream matrix rows straight to this worker's partial file.
		global outwriter
		outwriter = open(group_id, "w")

	prog_queue = progress_queue
	_qnames = qnames
	_tnames = tnames
2647
+
2648
def one_work(placeholder):
	"""Worker: compute AAI for this worker's query group against all targets.

	Operates entirely on globals installed by one_init(). For each query
	genome: builds a (query accessions x num targets) matrix of kmer
	intersection counts, converts it to Jaccard indices via the
	union = |A| + |B| - |A∩B| identity, then averages per target. In
	"tsv" style a per-query result file is written (optionally with
	Jaccard standard deviations); in "matrix" style one row of AAI
	estimates is stored in RAM or streamed to the partial file. Progress
	is reported through prog_queue, ending with a "done" sentinel.
	"""
	for q in query_grouping:
		results = []
		#We also need to count the accs in the query genome, but which are not part of the inner join.
		for acc in _qgak[q][0]:
			if acc in _ql[q]:
				#the bincount is intersections.
				these_intersections = np.bincount(np.concatenate(_tl[acc][_ql[q][acc]]), minlength = _nt)
			else:
				#there are no intersections even though this accession is shared with at least one target
				#number of intersects is all zeros
				these_intersections = np.zeros(_nt, dtype = np.int32)

			#Append the counts or zeros, either way.
			results.append(these_intersections)

		results = np.vstack(results)

		target_kmer_counts = _tgak[_qgak[q][0], :]

		#unions = size(A) + size(B) - size(intersections(A, B))
		#unions = target_kmer_counts + query_kmers_by_acc - intersections
		unions = np.subtract(np.add(target_kmer_counts, _qgak[q][1][:, None]), results)

		#These are now jaccards, not #intersections
		results = np.divide(results, unions)

		#Per target: how many accessions it shares with this query.
		shared_acc_counts = np.sum(_tpres[_qgak[q][0], :], axis = 0)

		#Targets sharing nothing with the query get N/A (tsv) or 0 (matrix).
		no_hit = np.where(shared_acc_counts == 0)

		jaccard_averages = np.divide(np.sum(results, axis = 0), shared_acc_counts)

		#Skip SD if output is matrix
		if style == "tsv":
			aai_ests = numpy_kaai_to_aai(jaccard_averages)

			if do_sd:
				#find diffs from means; this includes indicies corresponding to unshared SCPs that should not be included.
				results = results - jaccard_averages

				#fix those corresponding indicies to not contribute to the final SD.
				results[np.logical_not(_tpres[_qgak[q][0], :])] = 0
				#results[np.nonzero(has_accs == 0)] = 0

				#Square them; 0^2 = 0, so we don't have to think about the fixed indices any more.
				results = np.square(results)
				#Sum squares and divide by shared acc. count, the sqrt to get SD.
				jaccard_SDs = np.sqrt(np.divide(np.sum(results, axis = 0), shared_acc_counts))
				jaccard_SDs = np.round(jaccard_SDs, 4).astype(str)

			no_hit = np.where(shared_acc_counts == 0)

			#addtl.shape[0] is the query acc count
			possible_hits = np.minimum(_qgak[q][0].shape[0], _tct).astype(str)

			jaccard_averages = np.round(jaccard_averages, 4).astype(str)
			shared_acc_counts = shared_acc_counts.astype(str)

			jaccard_averages[no_hit] = "N/A"
			aai_ests[no_hit] = "N/A"
			shared_acc_counts[no_hit] = "N/A"
			possible_hits[no_hit] = "N/A"

			qname = _qnames[q]

			output_name = os.path.normpath(out_base + "/results/"+qname+"_results.txt")

			out = open(output_name, "w")
			out.write("query\ttarget\tavg_jacc_sim\tjacc_SD\tnum_shared_SCPs\tposs_shared_SCPs\tAAI_estimate\n")
			if do_sd:
				jaccard_SDs[no_hit] = "N/A"
				for i in range(0, len(aai_ests)):
					out.write(qname+"\t"+_tnames[i]+"\t"+jaccard_averages[i]+"\t"+jaccard_SDs[i]+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
			else:
				for i in range(0, len(aai_ests)):
					out.write(qname+"\t"+_tnames[i]+"\t"+jaccard_averages[i]+"\t"+"N/A"+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
			out.close()


		else:
			if store:
				#Matrix rows held in RAM; two_work() writes them later.
				aai_ests = numpy_kaai_to_aai_just_nums(jaccard_averages, as_float = False)
				aai_ests[no_hit] = 0
				#add zeros at misses/NAs
				holder.append(aai_ests)
			else:
				aai_ests = numpy_kaai_to_aai_just_nums(jaccard_averages, as_float = True)
				aai_ests[no_hit] = 0
				print(*aai_ests, sep = "\t", file = outwriter)

		prog_queue.put(q)

	#Sentinel: this worker has finished its whole group.
	prog_queue.put("done")

	return None
2744
+
2745
def two_work(i):
	"""Finalize one worker's partial matrix output.

	If rows were held in memory, stack and dump them to the worker's
	partial file; otherwise just close the stream. Returns the partial
	file path so the parent can merge results in order.
	"""
	if store:
		#Flush the in-RAM rows in one shot.
		np.savetxt(group_id, np.vstack(holder), delimiter = "\t", fmt='%4d')
	else:
		outwriter.close()

	return group_id
2753
+
2754
def on_disk_init(query_database_path, target_database_path, num_tgt, query_queue, target_gak, tpres, sd, sty, output_dir, progress_queue, qnames, tnames, valids, temp_dir):
	"""Pool initializer for the on-disk db-vs-db AAI workers.

	Attaches the query and target databases to a per-worker in-memory
	SQLite connection, installs shared data into worker globals, and
	claims exactly one pre-split query group from query_queue.
	"""
	global database, _nt, _qgak, _tgak, _tpres, _tct, _qnames, _tnames
	global out_base, group_id, query_grouping, do_sd, style, prog_queue
	global acc_indexer, _valids

	database = sqlite3.connect(":memory:")
	curs = database.cursor()
	curs.execute("attach '" + query_database_path + "' as queries")
	curs.execute("attach '" + target_database_path + "' as targets")
	curs.close()

	_nt = num_tgt

	#Each worker consumes one (group id, gak dict, query index range) tuple.
	qgak_data = query_queue.get()

	out_base = output_dir
	group_id = os.path.normpath(temp_dir + "/partial_results_group_" + str(qgak_data[0]) + ".txt")
	_qgak = qgak_data[1]
	query_grouping = qgak_data[2]

	_tgak = target_gak
	_tpres = tpres
	#Number of targets carrying each accession.
	_tct = np.sum(_tpres, axis = 0)

	do_sd = sd
	style = sty
	#Suppress div by zero warning - it's handled.
	np.seterr(divide='ignore')

	if style == "matrix":
		#Matrix rows stream straight to this worker's partial file.
		global outwriter
		outwriter = open(group_id, "w")

	prog_queue = progress_queue
	_qnames = qnames
	_tnames = tnames

	#Reverse map: accession id -> accession table name.
	acc_indexer = generate_accessions_index(forward = False)
	_valids = valids
2814
+
2815
def on_disk_work_one(placeholder):
	"""Worker: compute AAI for this worker's query group using on-disk SQL.

	Operates on globals installed by on_disk_init(). For each query genome
	and each of its accessions, fetches the query's kmer list from the
	attached query database, then collects matching target genome blobs
	either via a temp-table join (long kmer lists) or a WHERE ... IN query
	(short lists). Intersection counts are converted to Jaccard indices
	and averaged per target, exactly as in one_work(); output is a
	per-query TSV ("tsv" style) or one streamed matrix row ("matrix").
	"""
	curs = database.cursor()
	for q in query_grouping:
		results = []
		qname = _qnames[q]
		for acc in _qgak[q][0]:
			acc_name = acc_indexer[acc]

			if acc_name in _valids:

				one = curs.execute("SELECT kmers FROM queries."+acc_name+"_genomes WHERE genome=?", (str(q),)).fetchone()[0]
				one = np.frombuffer(one, dtype = np.int32)

				#Threshold of 998 presumably keeps the IN-list under
				#SQLite's default 999 bound-parameter limit — TODO confirm.
				if one.shape[0] > 998:
					#Each kmer needs to be a tuple.
					these_kmers = [(int(kmer),) for kmer in one]

					temp_name = "_" + qname +"_" + acc_name
					temp_name = temp_name.replace(".", "_")

					curs.execute("CREATE TEMP TABLE " + temp_name + " (kmer INTEGER)")
					insert_table = "INSERT INTO " + temp_name + " VALUES (?)"
					curs.executemany(insert_table, these_kmers)

					join_and_select_sql = "SELECT genomes FROM " + temp_name + " INNER JOIN targets." + acc_name + " ON "+ temp_name+".kmer = targets." + acc_name + ".kmer;"

					matches = curs.execute(join_and_select_sql).fetchall()
				else:
					#kmers must be a list, not a tuple.
					these_kmers = [int(kmer) for kmer in one]
					select = "SELECT genomes FROM targets." + acc_name + " WHERE kmer IN ({kmers})".format(kmers=','.join(['?']*len(these_kmers)))
					matches = curs.execute(select, these_kmers).fetchall()

				#NOTE(review): 'set' shadows the builtin set() here; it works,
				#but the local deserves a rename in a future cleanup.
				set = []
				for row in matches:
					set.append(row[0])
				#Concatenate all target-genome blobs into one int32 buffer.
				set = b''.join(set)

				matches = None
				these_intersections = np.bincount(np.frombuffer(set, dtype = np.int32), minlength = _nt)
				set = None
				results.append(these_intersections)

			else:
				#Accession absent from the target database: zero intersections.
				results.append(np.zeros(_nt, dtype=np.int32))

		results = np.vstack(results)

		target_kmer_counts = _tgak[_qgak[q][0], :]

		#unions = size(A) + size(B) - size(intersections(A, B))
		#unions = target_kmer_counts + query_kmers_by_acc - intersections
		unions = np.subtract(np.add(target_kmer_counts, _qgak[q][1][:, None]), results)

		#These are now jaccards, not #intersections
		results = np.divide(results, unions)

		shared_acc_counts = np.sum(_tpres[_qgak[q][0], :], axis = 0)

		no_hit = np.where(shared_acc_counts == 0)

		jaccard_averages = np.divide(np.sum(results, axis = 0), shared_acc_counts)

		#Skip SD if output is matrix
		if style == "tsv":
			aai_ests = numpy_kaai_to_aai(jaccard_averages)

			if do_sd:
				#find diffs from means; this includes indicies corresponding to unshared SCPs that should not be included.
				results = results - jaccard_averages

				#fix those corresponding indicies to not contribute to the final SD.
				results[np.logical_not(_tpres[_qgak[q][0], :])] = 0
				#results[np.nonzero(has_accs == 0)] = 0

				#Square them; 0^2 = 0, so we don't have to think about the fixed indices any more.
				results = np.square(results)
				#Sum squares and divide by shared acc. count, the sqrt to get SD.
				jaccard_SDs = np.sqrt(np.divide(np.sum(results, axis = 0), shared_acc_counts))
				jaccard_SDs = np.round(jaccard_SDs, 4).astype(str)

			no_hit = np.where(shared_acc_counts == 0)

			#_qgak[q][0] is the query acc count
			possible_hits = np.minimum(_qgak[q][0].shape[0], _tct).astype(str)

			jaccard_averages = np.round(jaccard_averages, 4).astype(str)
			shared_acc_counts = shared_acc_counts.astype(str)

			jaccard_averages[no_hit] = "N/A"
			aai_ests[no_hit] = "N/A"
			shared_acc_counts[no_hit] = "N/A"
			possible_hits[no_hit] = "N/A"

			output_name = os.path.normpath(out_base + "/results/"+qname+"_results.txt")

			out = open(output_name, "w")
			out.write("query\ttarget\tavg_jacc_sim\tjacc_SD\tnum_shared_SCPs\tposs_shared_SCPs\tAAI_estimate\n")
			if do_sd:
				jaccard_SDs[no_hit] = "N/A"
				for i in range(0, len(aai_ests)):
					out.write(qname+"\t"+_tnames[i]+"\t"+jaccard_averages[i]+"\t"+jaccard_SDs[i]+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
			else:
				for i in range(0, len(aai_ests)):
					out.write(qname+"\t"+_tnames[i]+"\t"+jaccard_averages[i]+"\t"+"N/A"+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
			out.close()

		else:
			aai_ests = numpy_kaai_to_aai_just_nums(jaccard_averages, as_float = True)
			aai_ests[no_hit] = 0
			print(*aai_ests, sep = "\t", file = outwriter)

		prog_queue.put(q)

	curs.close()
	#Sentinel: this worker has finished its whole group.
	prog_queue.put("done")
2931
+
2932
def on_disk_work_two(i):
	"""Close this worker's streamed matrix output and report its path."""
	partial_path = group_id
	outwriter.close()
	return partial_path
2935
+
2936
def sorted_nicely(l):
	"""Return *l* sorted in natural (human) order, e.g. 'x2' before 'x10'."""
	def _as_number(chunk):
		#Numeric runs compare as integers; everything else as raw text.
		return int(chunk) if chunk.isdigit() else chunk

	def _natural_key(item):
		return [_as_number(chunk) for chunk in re.split('([0-9]+)', item)]

	return sorted(l, key = _natural_key)
2940
+
2941
class db_db_remake:
	"""Orchestrates a FastAAI database-vs-database query.

	Loads genome/accession metadata shared by the query and target SQLite
	databases, then computes AAI either with all kmer data loaded into
	memory (load_in_mem) or by issuing per-genome SQL queries against the
	databases on disk (db_on_disk). Per-query TSV files or one combined
	matrix are written under <output_base>/results.
	"""
	def __init__(self, in_memory = False, store_mat_res = False,
	query = None, target = None, threads = 1, do_sd = False,
	output_base = "FastAAI", output_style = "tsv", verbose = True):

		#databases to eat
		self.q = query
		self.t = target

		#metadata
		#Forward and reverse accession-name <-> id maps.
		self.ok = generate_accessions_index(forward = True)
		self.rev = generate_accessions_index(forward = False)
		#Accession tables present in BOTH databases; filled by open().
		self.valids = None

		#Originally this was made to be a memory database only block of code, but just if/else one change makes it work on disk and it doesn't need a redev, then.
		self.as_mem_db = in_memory
		self.store_mat = store_mat_res

		#in-mem stuff
		self.conn = None
		self.curs = None

		self.threads = threads
		self.do_sd = do_sd

		self.output_base = output_base
		self.output = os.path.normpath(output_base + "/results")
		self.style = output_style

		self.query_names = None
		self.target_names = None

		self.num_queries = None
		self.num_targets = None

		#query_gak becomes a Queue of per-worker groups; target_gak a
		#(num accessions x num targets) kmer-count matrix. See open().
		self.query_gak = None
		self.target_gak = None
		self.target_presence = None

		self.query_dict = None
		self.target_dict = None

		self.verbose = verbose

	#getting the db metadata happens the same way in every case
	def open(self):
		"""Attach both databases and load all shared metadata.

		Populates self.valids, the genome name lists/counts, the
		query-side gak data (pre-split into a Queue of per-worker
		groups), and the target-side gak matrix plus presence mask.
		"""
		if self.verbose:
			print("Perusing database metadata")

		self.conn = sqlite3.connect(":memory:")
		self.curs = self.conn.cursor()

		#NOTE(review): db paths are spliced directly into the ATTACH
		#statements; a quote in a path would break them.
		self.curs.execute("attach '" + self.q + "' as queries")
		self.curs.execute("attach '" + self.t + "' as targets")

		#Find the shared accessions for these databases
		shared_accs_sql = '''
		SELECT queries.sqlite_master.name
		FROM queries.sqlite_master INNER JOIN targets.sqlite_master
		ON queries.sqlite_master.name = targets.sqlite_master.name
		'''
		self.valids = {}
		for table in self.curs.execute(shared_accs_sql).fetchall():
			table = table[0]
			#Filter to recognized accession tables only.
			if table in self.ok:
				self.valids[table] = self.ok[table]

		self.query_names = []
		for r in self.curs.execute("SELECT genome FROM queries.genome_index ORDER BY gen_id").fetchall():
			self.query_names.append(r[0])

		self.target_names = []
		for r in self.curs.execute("SELECT genome FROM targets.genome_index ORDER BY gen_id").fetchall():
			self.target_names.append(r[0])

		self.num_queries = len(self.query_names)
		self.num_targets = len(self.target_names)

		gak_sql = '''
		SELECT * FROM {db}.genome_acc_kmer_counts
		WHERE accession in ({accs})
		ORDER BY genome
		'''

		acc_ids = list(self.valids.values())
		acc_ids.sort()
		acc_ids = tuple(acc_ids)

		#query genome-acc-kmers (gak) is ordered by genome first, then accession
		self.query_gak = {}
		#NOTE(review): the filtered query below is disabled; ALL query
		#accessions are loaded, not just those shared with the target.
		#for result in self.curs.execute(gak_sql.format(db = "queries", accs=','.join(['?']*len(self.valids))), acc_ids).fetchall():
		for result in self.curs.execute("SELECT * FROM queries.genome_acc_kmer_counts ORDER BY genome").fetchall():
			genome, accession, kmer_ct = result[0], result[1], result[2]
			if genome not in self.query_gak:
				self.query_gak[genome] = [[],[]]
			self.query_gak[genome][0].append(accession)
			self.query_gak[genome][1].append(kmer_ct)

		#refigure into numpy arrays for quicker array access later.
		for genome in self.query_gak:
			self.query_gak[genome] = (np.array(self.query_gak[genome][0], dtype = np.int32), np.array(self.query_gak[genome][1], dtype = np.int32))

		#Split these into ordered groups - this makes joining results at the end easier.
		qgak_queue = multiprocessing.Queue()
		groupings = split_seq_indices(np.arange(self.num_queries), self.threads)
		group_id = 0
		for group in groupings:
			next_set = {}
			for i in range(group[0], group[1]):
				next_set[i] = self.query_gak[i]
				self.query_gak[i] = None
			#this ensures that the selection of qgak and the query index range match
			qgak_queue.put((group_id, next_set, np.arange(group[0], group[1]),))
			group_id += 1

		self.query_gak = qgak_queue
		qgak_queue = None

		#tgt gak is organized by accession first, then genome
		#NOTE(review): 122 looks like the fixed number of SCP accessions —
		#keep in sync with generate_accessions_index; confirm.
		self.target_gak = np.zeros(shape = (122, self.num_targets), dtype = np.int32)
		for result in self.curs.execute(gak_sql.format(db = "targets", accs=','.join(['?']*len(self.valids))), acc_ids).fetchall():
			genome, accession, kmer_ct = result[0], result[1], result[2]
			self.target_gak[accession, genome] += kmer_ct

		self.target_presence = self.target_gak > 0
		self.target_presence = self.target_presence.astype(bool)

	#This needs to have a TSV write method
	def load_in_mem(self):
		"""Run the query with all kmer data loaded into memory.

		Phase 1: a worker pool (parse_db_init/parse_accession) extracts
		each shared accession's joined kmer data to pickles, which are
		merged into the ql/tl lookups here. Phase 2: a second pool
		(one_init/one_work) computes AAI per query group; matrix-style
		partial files are merged by write_mat_from_files().
		"""
		#tempdir_path = os.path.normpath(self.output_base+"/temp")
		tempdir_path = tempfile.mkdtemp()
		#if not os.path.exists(tempdir_path):
		#	os.mkdir(tempdir_path)

		ql = {}
		tl = {}
		for t in self.valids.values():
			tl[t] = None
		for i in range(0, self.num_queries):
			ql[i] = {}

		if self.verbose:
			tracker = progress_tracker(total = len(self.valids), message = "Loading data in memory.")
		else:
			print("\nLoading data in memory.")


		pool = multiprocessing.Pool(self.threads, initializer = parse_db_init,
			initargs = (self.q, #query
						self.t, #target
						tempdir_path,)) #outpath

		for result in pool.imap_unordered(parse_accession, self.valids.keys()):
			this_accession = result[0]

			this_acc_id = self.ok[this_accession]

			#Workers hand back pickles on disk rather than piping data.
			with open(result[1], "rb") as inp:
				this_acc_data = pickle.load(inp)
			os.remove(result[1])

			tl[this_acc_id] = this_acc_data[1]

			for q in this_acc_data[0]:
				#We know that this acc must be in every ql for this loaded data.
				ql[q][this_acc_id] = this_acc_data[0][q][this_acc_id]
			if self.verbose:
				tracker.update()

		pool.close()

		if self.verbose:
			tracker = progress_tracker(total = self.num_queries, message = "Calculating AAI")
		else:
			print("\nCalculating AAI.")

		query_groups = []
		for grouping in split_seq_indices(np.arange(self.num_queries), self.threads):
			query_groups.append(np.arange(grouping[0], grouping[1]))

		result_queue = multiprocessing.Queue()
		remaining_procs = self.threads
		still_going = True

		pool = multiprocessing.Pool(self.threads, initializer = one_init,
			initargs = (ql, #ql
						tl, #tl
						self.num_targets, #num_tgt
						self.query_gak, #qgak_queue
						self.target_gak, #tgak
						self.target_presence, #tpres
						self.do_sd, #sd
						self.style, #sty
						self.output_base, #output_dir
						self.store_mat, #store_results
						result_queue, #progress_queue
						self.query_names, #qnames
						self.target_names, #tnames
						tempdir_path,)) #temp_dir

		some_results = pool.imap(one_work, query_groups)

		#Drain the progress queue until every worker sends its "done" sentinel.
		while still_going:
			item = result_queue.get()
			if item == "done":
				remaining_procs -= 1
				if remaining_procs == 0:
					still_going = False
			else:
				if self.verbose:
					tracker.update()
				else:
					pass

		if self.style == "matrix":
			result_files = []

			for result in pool.map(two_work, range(0, self.threads)):
				result_files.append(result)

			pool.close()

			self.write_mat_from_files(result_files, tempdir_path)
		else:
			pool.close()

	#This needs to be implemented from existing code.
	def db_on_disk(self):
		"""Run the query directly against the on-disk databases.

		Each worker (on_disk_init/on_disk_work_one) attaches both
		databases and queries per genome; matrix-style partial files are
		closed by on_disk_work_two() and merged here.
		"""
		tempdir_path = tempfile.mkdtemp()
		if self.style == "matrix":
			#On disk, matrix rows are always streamed, never held in RAM.
			self.store_mat = False

		result_queue = multiprocessing.Queue()
		remaining_procs = self.threads
		still_going = True

		if self.verbose:
			tracker = progress_tracker(total = self.num_queries, message = "Calculating AAI")
		else:
			print("\nCalculating AAI")

		query_groups = []
		for grouping in split_seq_indices(np.arange(self.num_queries), self.threads):
			query_groups.append(np.arange(grouping[0], grouping[1]))

		#query_database_path, target_database_path, num_tgt, query_queue, target_gak, tpres, sd,
		#sty, output_dir, progress_queue, qnames, tnames, valids, temp_dir
		pool = multiprocessing.Pool(self.threads, initializer = on_disk_init,
			initargs = (self.q, #query_database_path
						self.t, #target_database_path
						self.num_targets, #num_tgt
						self.query_gak, #query_queue
						self.target_gak, #target_gak
						self.target_presence, #tpres
						self.do_sd, #sd
						self.style, #sty
						self.output_base, #output_dir
						result_queue, #progress_queue
						self.query_names, #qnames
						self.target_names, #tnames
						self.valids, #valids
						tempdir_path,)) #temp_dir

		some_results = pool.imap(on_disk_work_one, query_groups)

		#Drain the progress queue until every worker sends its "done" sentinel.
		while still_going:
			item = result_queue.get()
			if item == "done":
				remaining_procs -= 1
				if remaining_procs == 0:
					still_going = False
			else:
				if self.verbose:
					tracker.update()
				else:
					pass

		if self.style == "matrix":
			result_files = []
			for result in pool.map(on_disk_work_two, range(0, self.threads)):
				result_files.append(result)

		pool.close()

		if self.style == "matrix":
			self.write_mat_from_files(result_files, tempdir_path)

	def write_mat_from_files(self, result_files, tempdir_path):
		"""Merge per-worker partial matrix files into FastAAI_matrix.txt.

		Partial files are concatenated in natural-sort order (which
		matches the ordered query grouping); each row is prefixed with
		its query genome name. Partial files and the temp dir (if empty)
		are removed afterwards.
		"""
		#tempdir_path = os.path.normpath(self.output_base+"/temp")

		result_files = sorted_nicely(result_files)

		#print("Combining:")
		#for f in result_files:
		#	print(f)

		if self.verbose:
			tracker = progress_tracker(total = self.threads, step_size = 2, message = "Finalizing results.")
		else:
			print("\nFinalizing results.")

		output_file = os.path.normpath(self.output+"/FastAAI_matrix.txt")
		final_outwriter = open(output_file, "w")
		print("query_genome\t"+'\t'.join(self.target_names), file = final_outwriter)

		row = 0

		for f in result_files:
			fh = open(f, "r")
			cur = fh.readlines()
			fh.close()

			for i in range(0, len(cur)):
				if self.store_mat:
					#Add the decimals - we don't need to do this is we've been writing line-wise.
					#values will ALWAYS be 4 digits in this method, so groups of 2 dec. works.
					cur[i] = re.sub("(\d{2})(\d{2})", "\\1.\\2", cur[i])
				#Add in the query name to the row
				cur[i] = self.query_names[row]+"\t"+cur[i]
				row += 1

			final_outwriter.write(''.join(cur))
			cur = None

			#Best-effort cleanup of the partial file.
			try:
				os.remove(f)
			except:
				pass

			if self.verbose:
				tracker.update()

		final_outwriter.close()

		#Best-effort removal of the scratch directory if nothing remains.
		try:
			if len(os.listdir(tempdir_path)) == 0:
				shutil.rmtree(tempdir_path)
		except:
			pass

	def close(self):
		"""Close the metadata cursor."""
		self.curs.close()
		self.curs = None

	def clean_up(self):
		"""Close the metadata connection."""
		self.conn.close()
		self.conn = None

	def run(self):
		"""Execute the full query: open metadata, compute, clean up."""
		self.open()

		#work
		if self.as_mem_db:
			self.load_in_mem()
		else:
			self.db_on_disk()

		self.close()
		self.clean_up()
3301
+
3302
+
3303
#Control the query process for any DB-first query.
def db_query(query, target, verbose, output, threads, do_stdev, style, in_mem, store_results):
	"""Validate both FastAAI databases and run a db-vs-db query.

	Exits the process (sys.exit) on any missing or malformed database
	rather than raising; on success, delegates the work to db_db_remake.
	"""
	print("")

	#Sanity checks.
	#Fixed typo in user-facing message: "databasae" -> "database".
	if target is None:
		print("You need to supply a database for --target")
		sys.exit()

	if query is None:
		print("You need to supply a database for --query")
		sys.exit()

	if not os.path.exists(target):
		print("Target database not found. Exiting FastAAI")
		sys.exit()

	if not os.path.exists(query):
		print("Query database not found. Exiting FastAAI")
		sys.exit()

	query_ok = assess_db(query)
	target_ok = assess_db(target)

	if query_ok != "exists":
		print("Query database improperly formatted. Exiting FastAAI")
		sys.exit()

	#Fixed copy-paste bug: this branch used to report "Query database".
	if target_ok != "exists":
		print("Target database improperly formatted. Exiting FastAAI")
		sys.exit()

	#(Removed an unreachable duplicate None check here: both None cases
	#already exit at the top of this function.)

	#Check if the database is querying against itself.
	if query == target:
		print("Performing an all vs. all query on", query)
	else:
		print("Querying", query, "against", target)

	#Ready the output directories as needed.
	#The databases are already created; the only state they can be in is P+H.
	good_to_go = prepare_directories(output, "protein and HMM", "query")
	if not good_to_go:
		print("Exiting FastAAI")
		sys.exit()

	mdb = db_db_remake(in_memory = in_mem, store_mat_res = store_results, query = query, target = target, threads = threads, do_sd = do_stdev, output_base = output, output_style = style, verbose = verbose)
	mdb.run()

	print("")
3364
+
3365
+
3366
#Check to see if the file exists and is a valid fastAAI db
def assess_db(path):
	"""Classify a path as a FastAAI database.

	Returns one of:
		"exists"           - file is a FastAAI DB: it has genome_index,
		                     genome_acc_kmer_counts, and at least one more table.
		"wrong format"     - file exists but is not a valid FastAAI DB.
		"created"          - file did not exist but can be created there.
		                     NOTE: sqlite3.connect creates an empty file as a
		                     side effect; this matches the original behavior.
		"unable to create" - file did not exist and cannot be created.
	"""
	status = None
	if os.path.exists(path):
		conn = sqlite3.connect(path)
		curs = conn.cursor()
		try:
			sql = "SELECT name FROM sqlite_master WHERE type='table'"

			#row_factory unpacks the 1-tuples into bare table names.
			curs.row_factory = lambda cursor, row: row[0]
			tables = curs.execute(sql).fetchall()
			curs.row_factory = None

			if len(tables) > 2 and "genome_index" in tables and "genome_acc_kmer_counts" in tables:
				status = "exists"
			else:
				status = "wrong format"

		except Exception:
			#Not readable as SQLite (or tables missing) -> not a FastAAI DB.
			status = "wrong format"
		finally:
			#Bug fix: the original leaked conn/curs when the read raised;
			#they were only closed on the success path.
			curs.close()
			conn.close()

	else:
		try:
			conn = sqlite3.connect(path)
			conn.close()
			status = "created"
		except Exception:
			status = "unable to create"

	return status
3399
+
3400
#Add one FastAAI DB to another FastAAI DB
def merge_db_opts():
	"""Build the argument parser for the merge_db action.

	Returns (parser, args) where args comes from parse_known_args().
	"""
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''
	This FastAAI module allows you to add the contents of one or more FastAAI databases to another.
	You must have at least two already-created FastAAI databases using the build_db module before this module can be used.

	Supply a comma-separated list of at least one donor database and a single recipient database.
	If the recipient already exists, then genomes in all the donors will be added to the recipient.
	If the recipient does not already exist, a new database will be created, and the contents of all the donors will be added to it.

	Example:
	FastAAI.py merge_db --donors databases/db1.db,databases/db2.db --recipient databases/db3.db --threads 3
	This command will create a new database called "db3.db", merge the data in db1.db and db2.db, and then add the merged data into db3.db

	Only the recipient database will be modified; the donors will be left exactly as they were before running this module.
	''')
	#Bug fix above: the example previously showed "-recipient", which is not a
	#valid option (the parser defines -r/--recipient).

	parser.add_argument('-d', '--donors', dest = 'donors', default = None, help = 'Comma-separated string of paths to one or more donor databases. The genomes FROM the donors will be added TO the recipient and the donors will be unaltered')
	parser.add_argument('--donor_file', dest = 'donor_file', default = None, help = 'File containing paths to one or more donor databases, one per line. Use EITHER this or --donors')

	parser.add_argument('-r', '--recipient', dest = 'recipient', default = None, help = 'Path to the recipient database. Any genomes FROM the donor database not already in the recipient will be added to this database.')

	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

	parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')

	args, unknown = parser.parse_known_args()

	return parser, args
3430
+
3431
def merge_db_init(indexer, table_record, donor_dbs, tempdir):
	"""Pool initializer for the merge workers.

	Publishes the shared merge state as module globals so that
	acc_transformer_merge can reach it from each worker process:
	mgi (old->new genome id maps), accs_per_db (tables per donor),
	tdb_list (donor paths), and work_space (scratch directory).
	"""
	global mgi, accs_per_db, tdb_list, work_space
	mgi = indexer
	accs_per_db = table_record
	tdb_list = donor_dbs
	work_space = tempdir
3440
+
3441
def acc_transformer_merge(acc_name_genomes):
	"""Worker: gather one SCP accession's rows from every donor DB into a
	standalone per-accession SQLite database in the shared temp directory.

	Relies on globals published by merge_db_init: work_space (temp dir),
	tdb_list (donor DB paths), accs_per_db (tables present in each donor),
	and mgi (per-donor genome-id -> merged genome-id map, where -1 marks a
	genome already present in the recipient that must be skipped).

	Parameters:
		acc_name_genomes: the "<accession>_genomes" table name to merge.

	Returns [path_to_per_accession_db, accession_name] so the parent process
	can ATTACH the file and copy its contents into the recipient.
	"""
	acc_name = acc_name_genomes.split("_genomes")[0]
	my_acc_db = os.path.normpath(work_space + "/"+acc_name+".db")
	#Start from a clean slate if a previous run left this file behind.
	if os.path.exists(my_acc_db):
		os.remove(my_acc_db)

	my_db = sqlite3.connect(my_acc_db)
	curs = my_db.cursor()
	#kmer -> genome-id blob table, and its genome -> kmer blob counterpart.
	curs.execute("CREATE TABLE {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=acc_name))
	curs.execute("CREATE TABLE {acc} (genome INTEGER PRIMARY KEY, kmers array)".format(acc=acc_name_genomes))
	my_db.commit()

	#kmer -> list of new genome indices, accumulated across all donors.
	reformat = {}
	for d in tdb_list:
		simple_rows = []
		#do nothing if the acc is not in the donor.
		if acc_name_genomes in accs_per_db[d]:
			donor_conn = sqlite3.connect(d)
			dcurs = donor_conn.cursor()
			data = dcurs.execute("SELECT * FROM {acc}".format(acc=acc_name_genomes)).fetchall()
			dcurs.close()
			donor_conn.close()

			for row in data:
				genome, kmers = row[0], row[1]
				new_index = mgi[d][genome]
				#-1 is the value indicating an already-seen genome that should not be added.
				if new_index > -1:
					simple_rows.append((new_index, kmers,))
					#kmers is a raw int32 blob; decode it to invert the
					#genome->kmers rows into the kmer->genomes map.
					kmers = np.frombuffer(kmers, dtype=np.int32)
					for k in kmers:
						if k not in reformat:
							reformat[k] = []
						reformat[k].append(new_index)

			if len(simple_rows) > 0:
				curs.executemany("INSERT INTO {acc} VALUES (?,?)".format(acc=acc_name_genomes), simple_rows)
				my_db.commit()

			#Release per-donor buffers before moving on.
			simple_rows = None
			data = None

	#Serialize the accumulated genome lists back to int32 blobs.
	to_add = []
	for k in reformat:
		as_bytes = np.array(reformat[k], dtype = np.int32)
		as_bytes = as_bytes.tobytes()
		reformat[k] = None
		to_add.append((int(k), as_bytes,))

	curs.executemany("INSERT INTO {acc} VALUES (?, ?)".format(acc = acc_name), to_add)

	my_db.commit()

	to_add = None

	#Index the kmer column so the parent's attach-and-copy joins are fast.
	curs.execute("CREATE INDEX {acc}_index ON {acc} (kmer)".format(acc=acc_name))
	my_db.commit()

	curs.close()
	my_db.close()

	return [my_acc_db, acc_name]
3503
+
3504
def merge_db(recipient, donors, donor_file, verbose, threads):
	"""Merge one or more donor FastAAI databases into a recipient database.

	Parameters:
		recipient: path to the recipient DB (created if missing).
		donors: comma-separated donor paths, or None.
		donor_file: file with one donor path per line, or None (overrides donors).
		verbose: show a progress tracker when True.
		threads: size of the worker pool used to reformat accession tables.

	Only the recipient is modified; donors are read-only. Returns None.
	"""
	#Prettier on the CLI
	if (donors is None and donor_file is None) or recipient is None:
		print("Either donor or target not given. FastAAI is exiting.")
		return None

	print("")

	if donors is not None:
		donors = donors.split(",")

	#A donor file, when given, replaces the --donors list entirely.
	if donor_file is not None:
		try:
			donors = []
			fh = agnostic_reader(donor_file)
			for line in fh:
				line = line.strip()
				donors.append(line)
			fh.close()
		except:
			sys.exit("Could not parse your donor file.")

	#Keep only donors that exist, are valid FastAAI DBs, are not the
	#recipient itself, and are not duplicated in the list.
	valid_donors = []
	for d in donors:
		if os.path.exists(d):
			if d == recipient:
				print("Donor database", d, "is the same as the recipient. This database will be skipped.")
			else:
				check = assess_db(d)
				if check == "exists":
					if d not in valid_donors:
						valid_donors.append(d)
					else:
						print("It appears that database", d, "was already added to the list of donors. Did you type it twice in the list of donors? Skipping it.")
				else:
					if check == "created":
						#NOTE(review): assess_db returning "created" here means the
						#path did not exist moments ago despite os.path.exists above.
						print("Donor database", d, "not found! Skipping.")
					else:
						print("Something was wrong with supplied database:", d+". A status check found:", check)
		else:
			print("Donor database", d, "not found! Are you sure the path is correct and this donor exists? This database will be skipped.")

	if len(valid_donors) == 0:
		print("None of the supplied donor databases were able to be accessed. FastAAI cannot continue if none of these databases are valid. Exiting.")
		sys.exit()

	recip_check = assess_db(recipient)

	if recip_check == "created" or recip_check == "exists":
		print("Donor databases:")
		for donor in valid_donors:
			print("\t", donor)
		print("Will be added to recipient database:", recipient)
	else:
		print("I couldn't find or create the recipient database at", recipient+".", "Does the folder you're trying to place this database in exist, and do you have permission to write files to it? FastAAI exiting.")
		sys.exit()

	if recipient is None or len(valid_donors) == 0:
		print("I require both a valid donor and a recipient database. FastAAI exiting.")
		sys.exit()

	#gen_counter hands out merged genome ids; all_gens tracks genome names
	#already assigned; multi_gen_ids maps per-DB old ids to merged ids.
	gen_counter = 0
	multi_gen_ids = {}
	all_gens = {}

	#Load recipient data, if any.
	if recip_check == "exists":
		conn = sqlite3.connect(recipient)
		curs = conn.cursor()
		data = curs.execute("SELECT genome, gen_id FROM genome_index").fetchall()
		tabs = curs.execute("SELECT name FROM sqlite_master").fetchall()
		curs.close()
		conn.close()

		multi_gen_ids[recipient] = {}
		for row in data:
			genome, index = row[0], row[1]
			all_gens[genome] = 0
			multi_gen_ids[recipient][genome] = index

		#New ids continue after the recipient's highest existing id.
		gen_counter = max(list(multi_gen_ids[recipient].values())) + 1

	genome_index_to_add = []
	gak_to_add = []
	tables = {}
	#Donors should always exist, never be created.
	for d in valid_donors:
		#load
		conn = sqlite3.connect(d)
		curs = conn.cursor()
		data = curs.execute("SELECT * FROM genome_index").fetchall()
		tabs = curs.execute("SELECT name FROM sqlite_master").fetchall()
		gak = curs.execute("SELECT * FROM genome_acc_kmer_counts").fetchall()
		curs.close()
		conn.close()
		multi_gen_ids[d] = {}
		for row in data:
			genome, index, prot_ct = row[0], row[1], row[2]
			if genome not in all_gens:
				all_gens[genome] = 0
				#We need to be able to convert number to number.
				multi_gen_ids[d][index] = gen_counter
				genome_index_to_add.append((genome, gen_counter, prot_ct,))
				gen_counter += 1
			else:
				#This is a remove condition for later.
				multi_gen_ids[d][index] = -1
		data = None

		#Carry over kmer counts only for genomes that are actually new.
		for row in gak:
			genome_id, acc_id, kmer_ct = row[0], row[1], row[2]
			new_index = multi_gen_ids[d][genome_id]
			if new_index > -1:
				gak_to_add.append((new_index, acc_id, kmer_ct,))

		#Record which accession tables this donor contains.
		tables[d] = []
		for tab in tabs:
			tab = tab[0]
			if tab.endswith("_genomes"):
				tables[d].append(tab)
		tables[d] = set(tables[d])

	all_tabs = set()
	for t in tables:
		all_tabs = all_tabs.union(tables[t])

	all_tabs = list(all_tabs)


	temp_dir = tempfile.mkdtemp()
	try:
		if verbose:
			tracker = progress_tracker(len(all_tabs), message = "Formatting data to add to database")
		else:
			print("Formatting data to add to database")

		conn = sqlite3.connect(recipient)
		curs = conn.cursor()

		#indexer, table_record, donor_dbs, tempdir
		pool = multiprocessing.Pool(threads, initializer=merge_db_init, initargs = (multi_gen_ids, tables, valid_donors, temp_dir,))

		#Each worker produces a per-accession scratch DB; attach it and
		#upsert its contents into the recipient.
		for result in pool.imap_unordered(acc_transformer_merge, all_tabs):
			db, accession = result[0], result[1]
			curs.execute("CREATE TABLE IF NOT EXISTS {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=accession))
			curs.execute("CREATE TABLE IF NOT EXISTS {acc}_genomes (genome INTEGER PRIMARY KEY, kmers array)".format(acc=accession))
			curs.execute("CREATE INDEX IF NOT EXISTS {acc}_index ON {acc}(kmer)".format(acc=accession))
			conn.commit()

			curs.execute("attach '" + db + "' as acc")
			conn.commit()

			#Get the genomes from worker db.
			curs.execute("INSERT INTO {acc}_genomes SELECT * FROM acc.{acc}_genomes".format(acc=accession))
			#genomes selected twice: once as the INSERT value, once as the
			#concat operand for the UPSERT below.
			to_update = curs.execute("SELECT kmer, genomes, genomes FROM acc.{acc}".format(acc=accession)).fetchall()
			update_concat_sql = "INSERT INTO {acc} VALUES (?,?) ON CONFLICT(kmer) DO UPDATE SET genomes=genomes || (?)".format(acc=accession)
			curs.executemany(update_concat_sql, to_update)
			conn.commit()

			curs.execute("detach acc")
			conn.commit()

			os.remove(db)

			if verbose:
				tracker.update()

		pool.close()
		pool.join()

		curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
		curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")

		curs.executemany("INSERT INTO genome_index VALUES (?,?,?)", genome_index_to_add)
		curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?,?,?)", gak_to_add)

		curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")

		conn.commit()

	except:
		#NOTE(review): this bare except swallows the error silently; execution
		#falls through and "Databases merged!" still prints after a failure.
		#Also, curs/conn may be unbound here if the failure happened before
		#they were assigned. Consider logging and re-raising.
		curs.close()
		conn.close()
		#Error
		shutil.rmtree(temp_dir)
		if recip_check == "created":
			print("Removing created database after failure.")
			os.remove(recipient)
	try:
		curs.close()
		conn.close()
		#Success
		shutil.rmtree(temp_dir)
	except:
		#Second close/rmtree attempt fails harmlessly after the error path.
		pass

	print("\nDatabases merged!")

	return None
3703
+
3704
#Query 1 genome vs. 1 target using Carlos' method - just needs query, target, threads
def single_query_opts():
	"""Build the argument parser for the single_query action.

	Returns (parser, args) where args comes from parse_known_args().
	"""
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''
	This FastAAI module takes a single query genome, protein, or protein and HMM pair and a single target genome, protein, or protein and HMM pair as inputs and calculates AAI between the two.

	If you supply a genome as either query or target, a protein and HMM file will be made for the genome.
	If you supply a protein as either query or target, an HMM file will be made for it.
	If you supply both an HMM and protein, the search will start right away. You cannot provide only an HMM.

	No database will be built, and you cannot query multiple genomes with this module.

	If you wish to query multiple genomes against themselves in all vs. all AAI search, use aai_index instead.
	If you wish to query multiple genomes against multiple targets, use multi_query instead.
	''')

	add = parser.add_argument
	add('-qg', '--query_genome', dest = 'query_genome', default = None, help = 'Query genome')
	add('-tg', '--target_genome', dest = 'target_genome', default = None, help = 'Target genome')

	add('-qp', '--query_protein', dest = 'query_protein', default = None, help = 'Query protein')
	add('-tp', '--target_protein', dest = 'target_protein', default = None, help = 'Target protein')

	add('-qh', '--query_hmm', dest = 'query_hmm', default = None, help = 'Query HMM')
	add('-th', '--target_hmm', dest = 'target_hmm', default = None, help = 'Target HMM')

	add('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory where FastAAI will place the result of this query. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

	add('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
	add('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
	add('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')

	args, unknown = parser.parse_known_args()

	return parser, args
3737
+
3738
def kaai_to_aai(kaai):
	"""Transform a mean kmer-jaccard similarity (kAAI) into an estimated AAI (%).

	The constants come from the FastAAI empirical fit; the arithmetic is kept
	in the original operation order so results are bit-identical.
	"""
	log_term = -0.2607023 * np.log(kaai)
	exp_term = np.exp(-(log_term) ** (1 / 3.435))
	aai_hat = (-0.3087057 + 1.810741 * exp_term) * 100

	return aai_hat
3743
+
3744
#This one's unique. It doesn't do anything with the DB, which means it doesn't access any other functionality outside of the input_file class. It just advances a pair of inputs in parallel and does intersections.
def single_query(qf, tf, output, verbose, threads, do_compress):
	"""Compute AAI between exactly one query and one target input.

	Both inputs are advanced (genome -> protein -> protein+HMM) in a 2-worker
	pool, then the per-accession kmer sets are intersected to produce a mean
	jaccard similarity and an AAI estimate, written to
	<output>/results/<query>_vs_<target>.aai.txt and echoed to stdout.

	Args:
		qf, tf: preprocessed input-file importer objects for query/target.
		output: base output directory.
		verbose: unused here; kept for interface consistency.
		threads: worker count (capped at 2 - one per input).
		do_compress: gzip intermediate proteins/HMMs.

	Returns None.
	"""
	if qf.identifiers[0] == tf.identifiers[0]:
		print("You've selected the same query and target genome. The AAI is 100%.")
		print("FastAAI exiting.")
		return None

	statuses = ["genome", "protein", "protein and hmm"]
	query_stat = statuses.index(qf.status)
	target_stat = statuses.index(tf.status)
	#(Removed unused locals: minimum_status and max_poss_prots; the latter
	#was an exact duplicate of poss_prots computed below.)

	start_printouts = ["[Genome] Protein Protein+HMM", " Genome [Protein] Protein+HMM", "Genome Protein [Protein+HMM]"]

	print("")
	print("Query start: ", start_printouts[query_stat])
	print("Target start:", start_printouts[target_stat])
	print("")


	qname = qf.identifiers[0]
	tname = tf.identifiers[0]

	name = os.path.normpath(output + "/results/" + qname + "_vs_" + tname + ".aai.txt")
	print("Output will be located at", name)

	advance_me = [qf.in_files[0], tf.in_files[0]]
	#All we need to do this.
	hmm_file = find_hmm()
	pool = multiprocessing.Pool(min(threads, 2), initializer = hmm_preproc_initializer, initargs = (hmm_file, do_compress,))

	results = pool.map(run_build, advance_me)

	pool.close()
	pool.join()

	query = results[0]
	target = results[1]

	print(query.partial_timings())
	print(target.partial_timings())

	#Only accessions present in BOTH inputs contribute to the average.
	accs_to_view = set(query.best_hits_kmers.keys()).intersection(set(target.best_hits_kmers.keys()))

	results = []
	for acc in accs_to_view:
		intersect = np.intersect1d(query.best_hits_kmers[acc], target.best_hits_kmers[acc])
		intersect = intersect.shape[0]
		union = query.best_hits_kmers[acc].shape[0] + target.best_hits_kmers[acc].shape[0] - intersect
		jacc = intersect/union
		results.append(jacc)

	results = np.array(results, dtype = np.float_)

	jacc_mean = np.mean(results)
	jacc_std = np.std(results)
	actual_prots = len(results)
	poss_prots = max(len(query.best_hits_kmers), len(target.best_hits_kmers))
	aai_est = round(kaai_to_aai(jacc_mean), 2)

	#The model is only trusted inside (30%, 90%); clamp to labels outside it.
	if aai_est > 90:
		aai_est = ">90%"
	else:
		if aai_est < 30:
			aai_est = "<30%"

	#Renamed from "output" to avoid shadowing the output-directory parameter.
	out_handle = open(name, "w")

	print("query\ttarget\tavg_jacc_sim\tjacc_SD\tnum_shared_SCPs\tposs_shared_SCPs\tAAI_estimate", file = out_handle)
	print(qname, tname, round(jacc_mean, 4), round(jacc_std, 4), actual_prots, poss_prots, aai_est, sep = "\t", file = out_handle)

	out_handle.close()

	print("query\ttarget\tavg_jacc_sim\tjacc_SD\tnum_shared_SCPs\tposs_shared_SCPs\tAAI_estimate")
	print(qname, tname, round(jacc_mean, 4), round(jacc_std, 4), actual_prots, poss_prots, aai_est, sep = "\t")


	print("FastAAI single query done! Estimated AAI:", aai_est)
3826
+
3827
def miga_merge_opts():
	"""Build the argument parser for the miga_merge action.

	Returns (parser, args) where args comes from parse_known_args().
	"""
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''
	Hello, Miguel.

	Give one genome in nt, aa, or aa+hmm format and a database to create or add to.
	It'll add the genome as efficiently as possible.

	The normal merge command creates parallel processes and gathers data in
	one-SCP databases to add to the main DB. Great for many genomes. A lot of extra
	work for just one.

	This version skips the creation of subordinate DBs and just directly adds the genome.
	Faster, fewer writes, no parallel overhead.
	''')

	add = parser.add_argument
	add('--genome', dest = 'gen', default = None, help = 'Path to one genome, FASTA format')
	add('--protein', dest = 'prot', default = None, help = 'Path to one protein, AA FASTA format')
	add('--hmm', dest = 'hmm', default = None, help = 'Path to one HMM file as predicted by FastAAI')

	add('--output', dest = 'output', default = "FastAAI", help = 'Place the partial output files into a directory with this base. Default "FastAAI"')
	add('--target', dest = 'database', default = None, help = 'Path to the target database. The genome supplied will be added to this. The DB will be created if needed.')

	add('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
	add('--compress', dest = 'compress', action='store_true', help = 'Compress generated file output')

	args, unknown = parser.parse_known_args()

	return parser, args
3856
+
3857
def miga_merge(infile, target_db, verbose, do_compress):
	"""Add a single preprocessed genome directly to a FastAAI database,
	skipping the per-accession scratch DBs used by the bulk merge path.

	Parameters:
		infile: input-file object; preprocess() fills best_hits_kmers
		        (accession -> int32 kmer array).
		target_db: path to the DB to create or extend.
		verbose: print a minor progress message.
		do_compress: passed to pyhmmer_manager for intermediate outputs.

	Returns None in every case.
	"""
	status = assess_db(target_db)
	if status == "wrong format":
		print("The database", target_db, "exists, but appears to not be a FastAAI database.")
		print("FastAAI will not alter this file. Quitting.")
		return None

	if status == "unable to create":
		print("The database", target_db, "could not be created.")
		print("Are you sure that the path you gave is valid? Quitting.")
		return None

	if verbose:
		print("Processing genome")

	next_id = 0
	exist_gens = {}
	conn = sqlite3.connect(target_db)
	curs = conn.cursor()
	if status == 'exists':
		#NOTE(review): next_id ends up as the row count, which equals
		#max(gen_id)+1 only if ids are contiguous from 0 -- confirm against
		#how build_db assigns ids. ("id" also shadows the builtin.)
		for row in curs.execute("SELECT * FROM genome_index ORDER BY gen_id").fetchall():
			genome, id, prot_ct = row[0], row[1], row[2]
			exist_gens[genome] = id
			next_id += 1

	if infile.basename in exist_gens:
		print("It looks like the file you're trying to add already exists in the database.")
		print("Adding it is too likely to corrupt the database. Quitting.")
		#NOTE(review): conn/curs are left open on this early return.
		return None

	hmm_file = find_hmm()
	#hmm_manager is consumed globally by the preprocessing machinery.
	global hmm_manager

	hmm_manager = pyhmmer_manager(do_compress)
	hmm_manager.load_hmm_from_file(hmm_file)

	infile.preprocess()

	if len(infile.best_hits_kmers) > 0:

		ok = generate_accessions_index()
		gak_to_add = []

		#This genome's id as a 4-byte int32 blob, ready to append to the
		#per-kmer genome lists.
		gen_id = np.zeros(1, dtype = np.int32)
		gen_id[0] = next_id
		gen_id = gen_id.tobytes()

		for accession in infile.best_hits_kmers:
			acc_id = ok[accession]
			gak_to_add.append((next_id, acc_id, infile.best_hits_kmers[accession].shape[0],))

			curs.execute("CREATE TABLE IF NOT EXISTS {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=accession))
			curs.execute("CREATE TABLE IF NOT EXISTS {acc}_genomes (genome INTEGER PRIMARY KEY, kmers array)".format(acc=accession))
			curs.execute("CREATE INDEX IF NOT EXISTS {acc}_index ON {acc}(kmer)".format(acc=accession))

			gen_first = (next_id, infile.best_hits_kmers[accession].tobytes(),)
			curs.execute("INSERT INTO {acc}_genomes VALUES (?,?)".format(acc=accession), gen_first)

			kmers_first = []
			for k in infile.best_hits_kmers[accession]:
				#we know there's only one genome in these cases.
				kmers_first.append((int(k), gen_id, gen_id, ))

			#Upsert: insert the kmer row, or append this genome's id blob
			#to the existing genomes blob on conflict.
			update_concat_sql = "INSERT INTO {acc} VALUES (?,?) ON CONFLICT(kmer) DO UPDATE SET genomes=genomes || (?)".format(acc=accession)

			curs.executemany(update_concat_sql, kmers_first)

		#Safety checks.
		curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
		curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")

		gen_idx_to_add = (infile.basename, next_id, len(infile.best_hits_kmers))
		curs.execute("INSERT INTO genome_index VALUES (?, ?, ?)", gen_idx_to_add)
		#gak was made over the loops.
		curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?,?,?)", gak_to_add)
		curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")

		conn.commit()

	else:
		print("No proteins to add for this genome:",infile.basename,"Database will be unaltered. Exiting.")

	curs.close()
	conn.close()
3941
+
3942
+
3943
def miga_dirs(output, subdir):
	"""Create the output directory and one subdirectory beneath it.

	Args:
		output: base output directory (its parent must already exist).
		subdir: name of the child directory to create inside output.

	Returns True when both directories exist or were created, False otherwise.
	"""
	preparation_successful = True

	if not os.path.exists(output):
		try:
			os.mkdir(output)
		except Exception:
			print("")
			print("FastAAI tried to make output directory: '"+ output + "' but failed.")
			print("")
			print("Troubleshooting:")
			print("")
			print(" (1) Do you have permission to create directories in the location you specified?")
			print(" (2) Did you make sure that all directories other than", os.path.basename(output), "already exist?")
			print("")
			preparation_successful = False

	if preparation_successful:
		try:
			if not os.path.exists(os.path.normpath(output + "/" + subdir)):
				os.mkdir(os.path.normpath(output + "/" + subdir))
		except Exception:
			print("FastAAI was able to create or find", output, "but couldn't make directories there.")
			print("")
			print("This shouldn't happen. Do you have permission to write to that directory?")
			#Bug fix: the original printed this error but still returned True,
			#so callers proceeded as if the subdirectory existed.
			preparation_successful = False


	return preparation_successful
3971
+
3972
def miga_preproc_opts():
	"""Build the argument parser for the miga_preproc action.

	Returns (parser, args) where args comes from parse_known_args().
	"""
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''Build module intended for use by MiGA.

	Performs protein prediction, HMM searching, and best hit identification, but does NOT
	build a database. Produces instead "crystals," which are tab-sep files containing protein,
	HMM accession, and original protein sequences for the best hits. These crystals can be passed
	to "miga_db_from_crystals" action later on to rapidly create a DB from many genomes.
	''')

	add = parser.add_argument
	add('-g', '--genomes', dest = 'genomes', default = None, help = 'A directory containing genomes in FASTA format.')
	add('-p', '--proteins', dest = 'proteins', default = None, help = 'A directory containing protein amino acids in FASTA format.')
	add('-m', '--hmms', dest = 'hmms', default = None, help = 'A directory containing the results of an HMM search on a set of proteins.')

	add('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

	add('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
	add('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
	add('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')

	args, unknown = parser.parse_known_args()

	return parser, args
3995
+
3996
def run_miga_preproc(input_file):
	"""Worker: crystal-preprocess one input file.

	Enables crystal output, runs the file's own preprocess(), and marks the
	file as failed (best_hits_kmers = None plus an error-log note) when no
	SCP best hits were found. Returns the mutated input_file.
	"""
	input_file.crystalize = True
	input_file.preprocess()

	#No best hits means nothing usable came out of this file.
	if len(input_file.best_hits_kmers) == 0:
		input_file.best_hits_kmers = None
		input_file.err_log += " This file did not successfully complete. No SCPs could be found."

	return input_file
4004
+
4005
#Produce FastAAI preprocessed files containing HMM accession and associated protein sequence
def miga_preproc(genomes, proteins, hmms, output, threads, verbose, do_compress):
	"""Preprocess genomes/proteins/HMMs into FastAAI "crystals" for MiGA.

	Args mirror the miga_preproc_opts CLI options. Creates the needed output
	subdirectories, runs run_miga_preproc over every input in a worker pool,
	and appends per-file timing/failure records to the logs directory.

	Returns True when all required directories could be created.
	"""
	success = True

	imported_files = fastaai_file_importer(genomes = genomes, proteins = proteins, hmms = hmms, output = output, compress = do_compress)
	imported_files.determine_inputs()

	if imported_files.error:
		print("Exiting FastAAI due to input file error.")
		quit()

	#file make checks - each flag records one directory creation.
	p, h, c, l = True, True, True, True

	if imported_files.status == "genome":
		p = miga_dirs(output, "predicted_proteins")
		h = miga_dirs(output, "hmms")
		c = miga_dirs(output, "crystals")

	if imported_files.status == "protein":
		h = miga_dirs(output, "hmms")
		c = miga_dirs(output, "crystals")

	if imported_files.status == "protein+HMM":
		c = miga_dirs(output, "crystals")

	#We always want this one.
	l = miga_dirs(output, "logs")

	print("")

	#Check if all created directories were successful.
	success = p and h and c and l

	if success:
		hmm_file = find_hmm()

		if verbose:
			tracker = progress_tracker(total = len(imported_files.in_files), message = "Processing inputs")
		else:
			print("Processing inputs")

		#Only build_db makes a log.
		#NOTE: opened in append mode, so the header line repeats on reruns.
		logger = open(os.path.normpath(output+"/logs/"+"FastAAI_preprocessing_log.txt"), "a")
		print("file", "start_date", "end_date", "starting_format",
			"prot_prediction_time", "trans_table", "hmm_search_time", "besthits_time",
			"errors", sep = "\t", file = logger)

		fail_log = open(os.path.normpath(output+"/logs/"+"FastAAI_genome_failures.txt"), "a")

		pool = multiprocessing.Pool(threads, initializer = hmm_preproc_initializer, initargs = (hmm_file, do_compress,))

		for result in pool.imap(run_miga_preproc, imported_files.in_files):
			#log data, regardless of kind
			print(result.basename, result.start_time, result.end_time, result.initial_state,
				result.prot_pred_time, result.trans_table, result.hmm_search_time, result.besthits_time,
				result.err_log, sep = "\t", file = logger)

			#Bug fix: run_miga_preproc sets best_hits_kmers to None on failure,
			#so the original bare len() call raised TypeError on exactly the
			#files that should be recorded in the failure log.
			if result.best_hits_kmers is None or len(result.best_hits_kmers) < 1:
				print(result.basename, file = fail_log)

			if verbose:
				tracker.update()

		pool.close()
		pool.join()
		logger.close()
		fail_log.close()

		print("FastAAI preprocessing complete!")

	return success
4077
+
4078
def miga_db_from_crystals_opts():
	"""Build the argument parser for the miga_db_from_crystals action.

	Returns (parser, args) where args comes from parse_known_args().
	"""
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''Takes a set of crystals produced with miga_preproc and makes a database from them.

	Supply --crystals with a directory, file of paths, or list of paths just like --genomes in a build command.''')

	add = parser.add_argument
	add('-c', '--crystals', dest = 'crystals', default = None, help = 'A directory containing genomes in FASTA format.')
	add('-d', '--database', dest = 'db_name', default = "FastAAI_database.sqlite.db", help = 'The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory. FastAAI_database.sqlite.db by default.')

	add('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

	add('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
	add('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
	args, unknown = parser.parse_known_args()

	return parser, args
4094
+
4095
#This is basically a copied function, but I'm going to ignore that for now.
def unique_kmer_miga(seq):
    """Return the sorted, unique tetramer (4-mer) codes of a protein sequence.

    Each tetramer is encoded as one int32 by packing the ASCII codes of its
    four characters (two decimal digits each) into successive digit pairs.

    Args:
        seq: protein sequence (string of ASCII characters).

    Returns:
        np.ndarray of int32, sorted unique tetramer codes. Empty for
        sequences shorter than 4 characters (the original implementation
        raised on the reshape for those inputs).
    """
    #num tetramers = len(seq) - 4 + 1, just make it -3.
    n_kmers = len(seq) - 3

    # Robustness fix: too short to contain a tetramer — the arange/reshape
    # below would fail with a negative dimension.
    if n_kmers < 1:
        return np.array([], dtype = np.int32)

    #Converts the characters in a sequence into their ascii int value
    as_ints = np.array([ord(i) for i in seq], dtype = np.int32)

    #create seq like 0,1,2,3; 1,2,3,4; 2,3,4,5... for each tetramer that needs a value
    kmers = np.arange(4*n_kmers)
    kmers = kmers % 4 + kmers // 4

    #Select the characters (as ints) corresponding to each tetramer all at once and reshape into rows of 4,
    #each row corresp. to a successive tetramer
    kmers = as_ints[kmers].reshape((n_kmers, 4))

    #Given four 2-digit numbers, these multipliers work as offsets so that all digits are preserved in order when summed
    mult = np.array([1000000, 10000, 100, 1], dtype = np.int32)

    #the fixed values effectively offset the successive chars of the tetramer by 2 positions each time;
    #practically, this is concatenation of numbers
    #Matrix mult does this for all values at once.
    return np.unique(np.dot(kmers, mult))
4118
+
4119
def para_crystal_init(tdb_queue):
    """Pool-worker initializer: claim one temp DB path from the queue.

    Sets the per-process globals consumed by para_crystals_to_dbs and
    group_by_kmer: td_name (this worker's temp DB path), tdb (an open
    connection to that freshly created DB), and ok (accession -> id index).
    """
    global tdb, td_name, ok
    claimed_path = tdb_queue.get()
    td_name = claimed_path
    tdb = initialize_blank_db(claimed_path)
    ok = generate_accessions_index()
4127
+
4128
def initialize_blank_db(path):
    """Create a fresh temp FastAAI database at path and return its connection.

    Builds the genome_index and genome_acc_kmer_counts tables, plus one
    (<acc>_genomes, <acc>) table pair for every known SCP accession.
    """
    sqlite3.register_converter("array", convert_array)
    connection = sqlite3.connect(path)
    cursor = connection.cursor()
    cursor.execute("CREATE TABLE genome_index (genome text, gen_id integer, protein_count integer)")
    cursor.execute("CREATE TABLE genome_acc_kmer_counts (genome integer, accession integer, count integer)")
    accession_index = generate_accessions_index()
    for accession in accession_index:
        cursor.execute("CREATE TABLE " + accession + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")
        cursor.execute("CREATE TABLE " + accession + " (kmer INTEGER PRIMARY KEY, genomes array)")
    connection.commit()
    cursor.close()
    return connection
4142
+
4143
def para_crystals_to_dbs(args):
    """Load one crystal file into this worker's temp database.

    args: (crystal_path, genome_name, genome_id). Each crystal line is
    tab-separated: protein name, accession name, protein sequence. Inserts
    per-protein tetramer blobs, one genome_index row, and the per-accession
    kmer counts, then commits.
    """
    crystal_path, genome_name, genome_id = args
    gak_records = []
    protein_count = 0
    cursor = tdb.cursor()
    reader = agnostic_reader(crystal_path)
    for line in reader:
        fields = line.strip().split("\t")
        # fields[0] is the protein name; not needed here.
        accession_name = fields[1]
        protein_sequence = fields[2]
        accession_id = ok[accession_name]
        tetramers = unique_kmer_miga(protein_sequence)
        gak_records.append((genome_id, accession_id, tetramers.shape[0]))
        cursor.execute("INSERT INTO " + accession_name + "_genomes VALUES (?,?)", (genome_id, tetramers.tobytes(),))
        protein_count += 1

    reader.close()

    cursor.execute("INSERT INTO genome_index VALUES (?, ?, ?)", (genome_name, genome_id, protein_count,))
    cursor.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?, ?, ?)", gak_records)

    tdb.commit()
    cursor.close()

    return None
4171
+
4172
def group_by_kmer(placeholder):
    """Invert this worker's per-genome kmer tables into per-kmer genome lists.

    For every accession whose <acc>_genomes table received data, fills the
    <acc> table with (kmer, int32 genome-id array) rows; accessions with no
    data have both of their tables dropped. Closes the worker's temp DB and
    returns [temp DB path, list of surviving accession names].
    """
    cursor = tdb.cursor()
    kept_accessions = []
    for accession in ok:
        genome_rows = cursor.execute("SELECT * FROM {acc}_genomes".format(acc=accession)).fetchall()
        if len(genome_rows) > 0:
            kept_accessions.append(accession)
            kmer_to_genomes = {}
            for genome, kmer_blob in genome_rows:
                for tetra in np.frombuffer(kmer_blob, dtype = np.int32):
                    kmer_to_genomes.setdefault(tetra, []).append(genome)

            insert_records = []
            for tetra, genome_list in kmer_to_genomes.items():
                insert_records.append((int(tetra), np.array(genome_list, dtype = np.int32).tobytes(),))

            cursor.executemany("INSERT INTO {acc} VALUES (?, ?)".format(acc=accession), insert_records)
            insert_records = None
        else:
            #Empty table/no genomes contained the relevant SCP
            cursor.execute("DROP TABLE {acc}".format(acc = accession))
            cursor.execute("DROP TABLE {acc}_genomes".format(acc = accession))

    tdb.commit()
    cursor.close()
    tdb.close()

    return [td_name, kept_accessions]
4208
+
4209
#Merge one or many crystals into a DB.
def miga_db_from_crystals(crystals, output, db_name, threads, verbose):
    """Create or extend a FastAAI database from miga_preproc 'crystal' files.

    crystals: directory, file of paths, or list of crystal inputs
              (resolved by fastaai_file_importer).
    output:   output root; unless db_name is an existing file, the DB is
              written to <output>/database/<db_name>.
    db_name:  database file name or path.
    threads:  number of worker processes; each one loads crystals into its
              own temp DB, which are merged into the final DB at the end.
    verbose:  show progress trackers when True.
    """
    success = True

    imported_files = fastaai_file_importer(genomes = None, proteins = None,
    hmms = None, crystals = crystals, output = output, compress = False)
    imported_files.determine_inputs()

    if imported_files.error:
        print("Exiting FastAAI due to input file error.")
        quit()

    #We'll skip trying this if the file already exists.
    #Resolve where the final database lives: an existing file is used in
    #place; otherwise it goes under <output>/database/.
    existing_genome_IDs = None
    final_db_path = None
    try:
        if os.path.exists(db_name):
            if os.path.isfile(db_name):
                final_db_path = db_name
            else:
                success = miga_dirs(output, "database")
                final_db_path = os.path.normpath(output+ "/database/" + db_name)

        else:
            success = miga_dirs(output, "database")
            final_db_path = os.path.normpath(output+ "/database/" + db_name)
    except:
        print("You specified an existing file to be a database, but it does not appear to be a FastAAI database.")
        print("FastAAI will not be able to continue. Please give FastAAI a different database name and continue.")
        print("Exiting.")
        success = False

    #NOTE(review): if the try above raised before final_db_path was assigned,
    #os.path.exists(None) below raises TypeError — confirm and consider
    #guarding on success/final_db_path here.
    if os.path.exists(final_db_path):
        if os.path.isfile(final_db_path):
            #Target DB already exists: load its genome -> id map so newly
            #added genomes get fresh, non-colliding integer ids.
            parent = sqlite3.connect(final_db_path)
            curs = parent.cursor()
            existing_genome_IDs = {}
            sql_command = "SELECT genome, gen_id FROM genome_index"
            for result in curs.execute(sql_command).fetchall():
                genome = result[0]
                id = int(result[1])
                existing_genome_IDs[genome] = id

            curs.close()
            parent.close()

    if success:
        if existing_genome_IDs is not None:
            genome_idx = max(list(existing_genome_IDs.values()))+1
        else:
            existing_genome_IDs = {}
            genome_idx = 0

        cryst_args = []
        for crystal_path, crystal_name in zip(imported_files.crystal_list, imported_files.identifiers):
            #the genome is implicitly dropped if it's already in the target
            if crystal_name not in existing_genome_IDs:
                existing_genome_IDs[crystal_name] = genome_idx
                cryst_args.append((crystal_path, crystal_name, genome_idx,))
                genome_idx += 1

        final_conn = sqlite3.connect(final_db_path)
        final_curs = final_conn.cursor()

        final_curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
        final_curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")

        final_curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")

        final_conn.commit()

        #One temp DB path per worker; workers claim a path from this queue
        #in their initializer (para_crystal_init).
        temp_dir = tempfile.mkdtemp()

        temp_db_queue = multiprocessing.Queue()
        for i in range(0, threads):
            tdb_name = os.path.normpath(temp_dir + "/temp_db_" + str(i) + ".db")
            temp_db_queue.put(tdb_name)

        #Dummy per-worker argument list for the group_by_kmer pass.
        placeholder = [i for i in range(0, threads)]

        pool = multiprocessing.Pool(threads, initializer = para_crystal_init, initargs = (temp_db_queue,))

        if verbose:
            tracker = progress_tracker(total = len(cryst_args), message = "Importing data")
        else:
            print("Importing data")

        for result in pool.imap_unordered(para_crystals_to_dbs, cryst_args):
            if verbose:
                tracker.update()

        if verbose:
            tracker = progress_tracker(total = threads, message = "Formating data")
        else:
            print("Formating data")

        #group_by_kmer closes each worker's temp DB and reports its path
        #plus the accession tables that actually received data; merge each
        #temp DB into the final database here.
        for result in pool.imap_unordered(group_by_kmer, placeholder):
            dbname, surviving_tables = result[0], result[1]

            new_conn = sqlite3.connect(dbname)
            new_curs = new_conn.cursor()

            ngak = new_curs.execute("SELECT * FROM genome_acc_kmer_counts").fetchall()
            ngi = new_curs.execute("SELECT * FROM genome_index").fetchall()

            final_curs.executemany("INSERT INTO genome_index VALUES (?, ?, ?)", ngi)
            final_curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?, ?, ?)", ngak)

            final_conn.commit()

            #Release the fetched rows promptly to keep memory down.
            ngak = None
            ngi = None

            for acc in surviving_tables:
                final_curs.execute("CREATE TABLE IF NOT EXISTS {acc}_genomes (genome INTEGER PRIMARY KEY, kmers array)".format(acc=acc))
                final_curs.execute("CREATE TABLE IF NOT EXISTS {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=acc))
                final_curs.execute("CREATE INDEX IF NOT EXISTS {acc}_index ON {acc}(kmer)".format(acc=acc))

                curag = new_curs.execute("SELECT * FROM {acc}_genomes".format(acc=acc)).fetchall()
                final_curs.executemany("INSERT INTO {acc}_genomes VALUES (?, ?)".format(acc=acc), curag)
                curag = None

                #genomes is selected twice so the third '?' feeds the UPSERT's
                #blob concatenation when the kmer row already exists.
                curaac = new_curs.execute("SELECT kmer, genomes, genomes FROM {acc}".format(acc=acc)).fetchall()
                update_concat_sql = "INSERT INTO {acc} VALUES (?,?) ON CONFLICT(kmer) DO UPDATE SET genomes=genomes || (?)".format(acc=acc)
                final_curs.executemany(update_concat_sql, curaac)
                #NOTE(review): 'curacc' below looks like a typo for 'curaac',
                #so the intended release never happens; harmless otherwise.
                curacc = None

                final_conn.commit()

            new_curs.close()
            new_conn.close()

            if verbose:
                tracker.update()

        pool.close()

        final_curs.close()
        final_conn.close()

        #Temp DBs are fully merged; remove the scratch directory.
        shutil.rmtree(temp_dir)
4353
+ '''
4354
+ Main
4355
+ '''
4356
+
4357
#Preprocess genomes, build DB, query all vs all to self.
def aai_index_opts():
    """Build the argument parser for the aai_index module.

    Returns (parser, args); unknown arguments are ignored via
    parse_known_args.
    """
    argp = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='''FastAAI module for preprocessing a set of genomes, proteins, or proteins+HMMs
into a database, and then querying the database against itself.

Equivalent to running build_db and db_query in sequence. Check these modules for additional
details on inputs.''')

    argp.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

    argp.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'A directory containing genomes in FASTA format.')
    argp.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'A directory containing protein amino acids in FASTA format.')
    argp.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'A directory containing the results of an HMM search on a set of proteins.')

    argp.add_argument('-d', '--database', dest = 'db_name', default = "FastAAI_database.sqlite.db", help = 'The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory. FastAAI_database.sqlite.db by default.')

    argp.add_argument('--output_style', dest = "style", default = 'tsv', help = "Either 'tsv' or 'matrix'. Matrix produces a simplified output of only AAI estimates.")
    argp.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indicies. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')
    argp.add_argument('--in_memory', dest = "in_mem", action = 'store_true', help = 'Load both databases into memory before querying. Consumes more RAM, but is faster and reduces file I/O substantially. Consider reducing number of threads')
    argp.add_argument('--store_results', dest = "storage", action = 'store_true', help = 'Keep partial results in memory. Only works with --in_memory. Fewer writes, but more RAM. Default off.')

    argp.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')
    argp.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
    argp.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

    parsed, _unknown = argp.parse_known_args()

    return argp, parsed
4386
+
4387
#Preprocess two sets of genomes A and B into two distinct databases Adb and Bdb, then query Adb against Bdb
def multi_query_opts():
    """Build the argument parser for the multi_query module.

    Returns (parser, args); unknown arguments are ignored via
    parse_known_args.
    """
    argp = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='''FastAAI module for preprocessing two sets of input files into two separate DBs,
then querying the DBs against eachother. Not for use with already-made FastAAI databases.

See "build_db" action for details on file inputs.
See "db_query" action for details on querying options.''')

    argp.add_argument('--query_output', dest = 'qoutput', default = "FastAAI_query", help = 'Output directory for query files. Default "FastAAI_query." FastAAI will work if this directory is the same as --target_output, but this is NOT a good idea.')
    argp.add_argument('--target_output', dest = 'toutput', default = "FastAAI_target", help = 'Output directory for target files. Default "FastAAI_target." AAI results will be placed in this directory')

    argp.add_argument('--query_genomes', dest = 'qgenomes', default = None, help = 'Query genomes')
    argp.add_argument('--target_genomes', dest = 'tgenomes', default = None, help = 'Target genomes')

    argp.add_argument('--query_proteins', dest = 'qproteins', default = None, help = 'Query proteins')
    argp.add_argument('--target_proteins', dest = 'tproteins', default = None, help = 'Target proteins')

    argp.add_argument('--query_hmms', dest = 'qhmms', default = None, help = 'Query HMMs')
    argp.add_argument('--target_hmms', dest = 'thmms', default = None, help = 'Target HMMs')

    argp.add_argument('--query_database', dest = 'qdb_name', default = "FastAAI_query_database.sqlite.db", help = 'Query database name. Default "FastAAI_query_database.sqlite.db"')
    argp.add_argument('--target_database', dest = 'tdb_name', default = "FastAAI_target_database.sqlite.db", help ='Target database name. Default "FastAAI_target_database.sqlite.db"')

    argp.add_argument('--output_style', dest = "style", default = 'tsv', help = "Either 'tsv' or 'matrix'. Matrix produces a simplified output of only AAI estimates.")
    argp.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indicies. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')
    argp.add_argument('--in_memory', dest = "in_mem", action = 'store_true', help = 'Load both databases into memory before querying. Consumes more RAM, but is faster and reduces file I/O substantially. Consider reducing number of threads')
    argp.add_argument('--store_results', dest = "storage", action = 'store_true', help = 'Keep partial results in memory. Only works with --in_memory. Fewer writes, but more RAM. Default off.')

    argp.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')
    argp.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
    argp.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

    parsed, _unknown = argp.parse_known_args()

    return argp, parsed
4423
+
4424
+
4425
def _print_module_menu():
    #Helper: print the module-selection menu (shown for no module or an unknown module).
    print("")
    print(" I couldn't find the module you specified. Please select one of the following modules:")
    print("")
    print("-------------------------------------- Database Construction Options --------------------------------------")
    print("")
    print(" build_db |" + " Create or add to a FastAAI database from genomes, proteins, or proteins and HMMs")
    print(" merge_db |" + " Add the contents of one FastAAI DB to another")
    print("")
    print("---------------------------------------------- Query Options ----------------------------------------------")
    print("")
    print(" simple_query |" + " Query a genome or protein (one or many) against an existing FastAAI database")
    print(" db_query |" + " Query the genomes in one FastAAI database against the genomes in another FastAAI database")
    print("")
    print("------------------------------------------- Other Options -------------------------------------------")
    print("")
    print(" single_query |" + " Query ONE query genome against ONE target genome")
    print(" multi_query |" + " Create a query DB and a target DB, then calculate query vs. target AAI")
    print(" aai_index |" + " Create a database from multiple genomes and do an all vs. all AAI index of the genomes")
    print("")
    print("-----------------------------------------------------------------------------------------------------------")
    print(" To select a module, enter 'FastAAI [module]' into the command line!")
    print("")

def main():
    """Command-line entry point: parse the module name and dispatch to it.

    sys.argv[1] selects the module; each branch parses its own options and
    calls the matching implementation function. Prints the module menu and
    exits when no module (or an unknown one) is given.
    """
    #The currently supported modules.
    modules = ["build_db", "merge_db", "simple_query", "db_query", "single_query", "aai_index", "multi_query", "miga_merge", "miga_preproc", "miga_db_from_crystals"]

    #Print modules if someone just types FastAAI
    if len(sys.argv) < 2:
        _print_module_menu()
        sys.exit()

    #This is the module selection
    selection = sys.argv[1]

    if selection == "version":
        sys.exit("FastAAI version=0.1.17")

    if selection not in modules:
        _print_module_menu()
        sys.exit()

    #################### Database build or add ########################

    if selection == "build_db":
        parser, opts = build_db_opts()

        #module name only
        if len(sys.argv) < 3:
            print(parser.print_help())
            sys.exit()

        #Directory based
        genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms

        output = os.path.normpath(opts.output)

        threads = opts.threads
        verbose = opts.verbose

        #Database handle
        db_name = opts.db_name

        do_comp = opts.do_comp

        build_db(genomes, proteins, hmms, db_name, output, threads, verbose, do_comp)

    #################### Add two DBs ########################

    if selection == "merge_db":
        parser, opts = merge_db_opts()
        if len(sys.argv) < 3:
            print(parser.print_help())
            sys.exit()

        recipient = opts.recipient
        donors = opts.donors
        donor_file = opts.donor_file
        verbose = opts.verbose
        threads = opts.threads

        #--donors and --donor_file are mutually exclusive input styles.
        if donors is not None and donor_file is not None:
            sys.exit("You cannot specify both --donors and --donor_file.")

        merge_db(recipient, donors, donor_file, verbose, threads)

    #################### Query files vs DB ########################

    if selection == "simple_query":
        parser, opts = sql_query_opts()

        if len(sys.argv) < 3:
            print(parser.print_help())
            sys.exit()

        genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms

        db_name = opts.target

        output = opts.output
        threads = opts.threads
        verbose = opts.verbose

        do_stdev = opts.do_stdev

        style, in_mem, make_db, qdb_name, do_comp = opts.style, opts.in_mem, opts.make_db, opts.qdb_name, opts.do_comp

        sql_query(genomes, proteins, hmms, db_name, output, threads, verbose, do_stdev, style, in_mem, make_db, qdb_name, do_comp)

    #################### Query DB vs DB ###########################
    if selection == "db_query":
        parser, opts = db_query_opts()
        #module name only

        if len(sys.argv) < 3:
            print(parser.print_help())
            sys.exit()

        query = opts.query
        target = opts.target
        verbose = opts.verbose

        do_stdev = opts.do_stdev
        output = opts.output
        threads = opts.threads

        style, in_mem, store = opts.style, opts.in_mem, opts.storage

        db_query(query, target, verbose, output, threads, do_stdev, style, in_mem, store)

    #################### One-pass functions #######################
    if selection == "single_query":
        parser, opts = single_query_opts()
        #module name only

        if len(sys.argv) < 3:
            print(parser.print_help())
            sys.exit()

        output = os.path.normpath(opts.output)
        try:
            threads = int(opts.threads)
        except:
            print("Couldn't interpret your threads. Defaulting to 1.")
            threads = 1
        verbose = opts.verbose
        do_compress = opts.do_comp

        query_genome = opts.query_genome
        query_protein = opts.query_protein
        query_hmm = opts.query_hmm

        query_file = fastaai_file_importer(genomes = query_genome, proteins = query_protein, hmms = query_hmm, output = output, compress = do_compress)
        query_file.determine_inputs()

        target_genome = opts.target_genome
        target_protein = opts.target_protein
        target_hmm = opts.target_hmm

        target_file = fastaai_file_importer(genomes = target_genome, proteins = target_protein, hmms = target_hmm, output = output, compress = do_compress)
        target_file.determine_inputs()

        #Exactly one query and one target are required for this module.
        is_ok = True
        if len(query_file.in_files) != 1:
            print("Query genome unacceptable. Check your inputs")
            is_ok = False

        if len(target_file.in_files) != 1:
            print("target genome unacceptable. Check your inputs")
            is_ok = False
        if is_ok:
            good_to_go = prepare_directories(output, query_file.status, "query")
            if good_to_go:
                good_to_go = prepare_directories(output, target_file.status, "query")
            if good_to_go:
                single_query(query_file, target_file, output, verbose, threads, do_compress)

    if selection == "aai_index":
        parser, opts = aai_index_opts()

        if len(sys.argv) < 3:
            print(parser.print_help())
            sys.exit()

        genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms

        output = os.path.normpath(opts.output)

        threads = opts.threads
        verbose = opts.verbose

        #Database handle
        db_name = opts.db_name

        do_comp = opts.do_comp

        do_stdev = opts.do_stdev

        style, in_mem, store = opts.style, opts.in_mem, opts.storage

        #This is the same logic from the build_db section and it's what we need for getting the DB name.
        #Check if the db contains path info. Incl. windows version.
        if "/" not in db_name and "\\" not in db_name:
            final_database = os.path.normpath(output + "/database/" + db_name)
        else:
            #If the person insists that the db has a path, let them.
            final_database = db_name

        build_db(genomes, proteins, hmms, db_name, output, threads, verbose, do_comp)

        query, target = final_database, final_database

        db_query(query, target, verbose, output, threads, do_stdev, style, in_mem, store)

    if selection == "multi_query":
        parser, opts = multi_query_opts()

        if len(sys.argv) < 3:
            print(parser.print_help())
            sys.exit()

        #Shared options
        threads = opts.threads
        verbose = opts.verbose

        #query options
        do_comp = opts.do_comp
        do_stdev = opts.do_stdev
        style, in_mem, store = opts.style, opts.in_mem, opts.storage

        #query inputs
        qgenomes, qproteins, qhmms = opts.qgenomes, opts.qproteins, opts.qhmms
        qoutput = os.path.normpath(opts.qoutput)
        qdb_name = opts.qdb_name
        #Check if the db contains path info. Incl. windows version.
        if "/" not in qdb_name and "\\" not in qdb_name:
            final_qdb = os.path.normpath(qoutput + "/database/" + qdb_name)
        else:
            #If the person insists that the db has a path, let them.
            #Bugfix: was 'db_name', which is never assigned in this branch (NameError).
            final_qdb = qdb_name

        #target inputs
        tgenomes, tproteins, thmms = opts.tgenomes, opts.tproteins, opts.thmms
        toutput = os.path.normpath(opts.toutput)
        tdb_name = opts.tdb_name
        #Check if the db contains path info. Incl. windows version.
        if "/" not in tdb_name and "\\" not in tdb_name:
            final_tdb = os.path.normpath(toutput + "/database/" + tdb_name)
        else:
            #If the person insists that the db has a path other than output/database, let them.
            #Bugfix: was 'db_name', which is never assigned in this branch (NameError).
            final_tdb = tdb_name

        #run query build
        build_db(qgenomes, qproteins, qhmms, qdb_name, qoutput, threads, verbose, do_comp)
        #run target build
        build_db(tgenomes, tproteins, thmms, tdb_name, toutput, threads, verbose, do_comp)
        #run query db against target db
        db_query(final_qdb, final_tdb, verbose, toutput, threads, do_stdev, style, in_mem, store)

    ############## MiGA module #################
    if selection == "miga_merge":
        parser, opts = miga_merge_opts()

        #module name only
        if len(sys.argv) < 3:
            print(parser.print_help())
            sys.exit()

        g, p, h = opts.gen, opts.prot, opts.hmm

        target = opts.database

        verbose = opts.verbose

        output_path = opts.output

        if target is None:
            target = os.path.normpath(output_path + "/database/FastAAI_database.sqlite.db")

        do_compress = opts.compress

        imported_files = fastaai_file_importer(genomes = g, proteins = p, hmms = h,
        output = output_path, compress = do_compress)

        imported_files.determine_inputs()

        if len(imported_files.in_files) == 0:
            print("Something was wrong with your input file.")
        else:
            input_genome = imported_files.in_files[0]

            good_to_go = prepare_directories(output_path, imported_files.status, "build")

            miga_merge(input_genome, target, verbose, do_compress)

            #This is where a new db would normally be created,
            #which is not what happens when the supplied target is some other sort of path.
            output_default = os.path.normpath(output_path + "/database")
            if len(os.listdir(output_default)) == 0:
                os.rmdir(output_default)

    if selection == "miga_preproc":
        parser, opts = miga_preproc_opts()

        #module name only
        if len(sys.argv) < 3:
            print(parser.print_help())
            sys.exit()

        #Directory based
        genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms

        output = os.path.normpath(opts.output)

        threads = opts.threads
        verbose = opts.verbose

        do_comp = opts.do_comp

        miga_preproc(genomes, proteins, hmms, output, threads, verbose, do_comp)

    if selection == "miga_db_from_crystals":
        parser, opts = miga_db_from_crystals_opts()

        #module name only
        if len(sys.argv) < 3:
            print(parser.print_help())
            sys.exit()

        crystals = opts.crystals

        if crystals is None:
            print("I need to be given crystals to proceed!")
            quit()

        db_name = opts.db_name
        try:
            threads = int(opts.threads)
        except:
            threads = 1
            print("Can't recognize threads param:", str(opts.threads), "defaulting to 1.")

        verbose = opts.verbose
        output_path = opts.output

        miga_db_from_crystals(crystals, output_path, db_name, threads, verbose)

    return None
4801
+
4802
+ if __name__ == "__main__":
4803
+ main()
4804
+
4805
+