miga-base 1.2.18.2 → 1.3.0.0

Files changed (31)
  1. checksums.yaml +4 -4
  2. data/lib/miga/cli/action/doctor/base.rb +2 -1
  3. data/lib/miga/cli/action/init.rb +1 -1
  4. data/lib/miga/dataset/result/add.rb +3 -2
  5. data/lib/miga/version.rb +2 -2
  6. data/scripts/essential_genes.bash +4 -8
  7. data/utils/FastAAI/LICENSE +8 -0
  8. data/utils/FastAAI/README.md +151 -40
  9. data/utils/FastAAI/__init__.py +1 -0
  10. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz +0 -0
  11. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz +0 -0
  12. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz +0 -0
  13. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz +0 -0
  14. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz +0 -0
  15. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz +0 -0
  16. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz +0 -0
  17. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz +0 -0
  18. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz +0 -0
  19. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz +0 -0
  20. data/utils/FastAAI/fastaai/__init__.py +1 -0
  21. data/utils/FastAAI/fastaai/fastaai +4805 -0
  22. data/utils/FastAAI/fastaai/fastaai.py +4805 -0
  23. data/utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py +297 -0
  24. data/utils/FastAAI/fastaai/fastaai_miga_preproc.py +931 -0
  25. data/utils/FastAAI/metadata/Accession_names_and_IDs.txt +122 -0
  26. data/utils/distance/commands.rb +51 -23
  27. metadata +23 -6
  28. data/utils/FastAAI/FastAAI +0 -3659
  29. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Archaea_SCG.hmm +0 -0
  30. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Bacteria_SCG.hmm +0 -0
  31. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -0
@@ -0,0 +1,4805 @@
+ #!/usr/bin/env python3
+ 
+ ################################################################################
+ """---0.0 Import Modules---"""
+ import subprocess
+ import argparse
+ import datetime
+ import shutil
+ import textwrap
+ import multiprocessing
+ import pickle
+ import gzip
+ import tempfile
+ #Shouldn't play any role.
+ #from random import randint
+ 
+ #We could probably remove Path, too.
+ #This as well
+ import time
+ from collections import defaultdict
+ import sys
+ import os
+ from math import floor
+ import sqlite3
+ #numpy dependency
+ import numpy as np
+ import io
+ import random
+ 
+ import pyrodigal as pd
+ import pyhmmer
+ 
+ from collections import namedtuple
+ 
+ from math import ceil
+ 
+ import re
+ 
+ 
+ class progress_tracker:
+     def __init__(self, total, step_size = 2, message = None, one_line = True):
+         self.current_count = 0
+         self.max_count = total
+         #Book keeping.
+         self.start_time = None
+         self.end_time = None
+         #Show progress every [step] percent
+         self.step = step_size
+         self.justify_size = ceil(100/self.step)
+         self.last_percent = 0
+         self.message = message
+ 
+         self.pretty_print = one_line
+ 
+         self.start()
+ 
+     def curtime(self):
+         time_format = "%d/%m/%Y %H:%M:%S"
+         timer = datetime.datetime.now()
+         time = timer.strftime(time_format)
+         return time
+ 
+     def start(self):
+         print("")
+         if self.message is not None:
+             print(self.message)
+ 
+         try:
+             percentage = (self.current_count/self.max_count)*100
+             sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/self.step)).ljust(self.justify_size)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(self.current_count) + " of " + str(self.max_count) + ' ) at ' + self.curtime() + "\n")
+             sys.stdout.flush()
+ 
+         except:
+             #It's not really a big deal if the progress bar cannot be printed.
+             pass
+ 
+     def update(self):
+         self.current_count += 1
+         percentage = (self.current_count/self.max_count)*100
+         try:
+             if percentage // self.step > self.last_percent:
+                 if self.pretty_print:
+                     sys.stdout.write('\033[A')
+                 sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/self.step)).ljust(self.justify_size)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(self.current_count) + " of " + str(self.max_count) + ' ) at ' + self.curtime() + "\n")
+                 sys.stdout.flush()
+                 self.last_percent = percentage // self.step
+             #Bar is always full at the end.
+             if self.current_count == self.max_count:
+                 if self.pretty_print:
+                     sys.stdout.write('\033[A')
+                 sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*self.justify_size).ljust(self.justify_size)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(self.current_count) + " of " + str(self.max_count) + ' ) at ' + self.curtime() + "\n")
+                 sys.stdout.flush()
+                 #Add space at end.
+                 print("")
+         except:
+             #It's not really a big deal if the progress bar cannot be printed.
+             pass
+ 
+ 
+ #Takes a bytestring from the SQL database and converts it to a numpy array.
+ def convert_array(bytestring):
+     return np.frombuffer(bytestring, dtype = np.int32)
+ 
+ def convert_float_array_16(bytestring):
+     return np.frombuffer(bytestring, dtype = np.float16)
+ 
+ def convert_float_array_32(bytestring):
+     return np.frombuffer(bytestring, dtype = np.float32)
+ 
+ def convert_float_array_64(bytestring):
+     return np.frombuffer(bytestring, dtype = np.float64)
+ 
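These converters only fire when a connection is opened with detect_types and a column is declared with the matching type name, which is how the kmer blobs round-trip later in this file. A minimal sketch of that round trip (the table and values here are illustrative, not from this diff):

    import sqlite3
    import numpy as np

    sqlite3.register_converter("array", convert_array)
    #PARSE_DECLTYPES routes any column declared as "array" through convert_array on read.
    conn = sqlite3.connect(":memory:", detect_types = sqlite3.PARSE_DECLTYPES)
    curs = conn.cursor()
    curs.execute("CREATE TABLE demo (kmer INTEGER PRIMARY KEY, genomes array)")
    curs.execute("INSERT INTO demo VALUES (?, ?)", (1, np.arange(4, dtype = np.int32).tobytes()))
    row = curs.execute("SELECT genomes FROM demo").fetchone()[0]
    #row == np.array([0, 1, 2, 3], dtype = np.int32)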
+ def read_fasta(file):
+     cur_seq = ""
+     cur_prot = ""
+ 
+     contents = {}
+     deflines = {}
+ 
+     fasta = agnostic_reader(file)
+     for line in fasta:
+         if line.startswith(">"):
+             if len(cur_seq) > 0:
+                 contents[cur_prot] = cur_seq
+                 deflines[cur_prot] = defline
+ 
+             cur_seq = ""
+             cur_prot = line.strip().split()[0][1:]
+             defline = line.strip()[len(cur_prot)+1 :].strip()
+ 
+         else:
+             cur_seq += line.strip()
+ 
+     fasta.close()
+ 
+     #Final iter
+     if len(cur_seq) > 0:
+         contents[cur_prot] = cur_seq
+         deflines[cur_prot] = defline
+ 
+     return contents, deflines
+ 
+ class fasta_file:
+     def __init__(self, file, type = "genome"):
+         self.file_path = os.path.abspath(file)
+         self.name = os.path.basename(file)
+         self.no_ext = os.path.splitext(self.name)[0]
+         self.type = type
+ 
+         self.tuple_structure = namedtuple("fasta", ["seqid", "description", "sequence"])
+         self.contents = {}
+ 
+     def convert(self, contents, descriptions):
+         for protein in contents:
+             #Store one named tuple per sequence, keyed by its ID.
+             self.contents[protein] = self.tuple_structure(seqid = protein, description = descriptions[protein], sequence = contents[protein])
+ 
+ 
+     def def_import_file(self):
+         contents, descriptions = read_fasta(self.file_path)
+         self.convert(contents, descriptions)
+ 
+ class pyhmmer_manager:
+     def __init__(self, do_compress):
+         self.hmm_model = []
+         self.hmm_model_optimized = None
+ 
+         self.proteins_to_search = []
+         self.protein_descriptions = None
+ 
+         self.hmm_result_proteins = []
+         self.hmm_result_accessions = []
+         self.hmm_result_scores = []
+ 
+         self.printable_lines = []
+ 
+         self.bacterial_SCPs = None
+         self.archaeal_SCPs = None
+         self.assign_hmm_sets()
+         self.domain_counts = {"Bacteria" : 0, "Archaea": 0}
+         self.voted_domain = {"Bacteria" : len(self.bacterial_SCPs), "Archaea" : len(self.archaeal_SCPs)}
+ 
+         self.bacterial_fraction = None
+         self.archaeal_fraction = None
+ 
+         self.best_hits = None
+ 
+         self.do_compress = do_compress
+ 
+     def optimize_models(self):
+         try:
+             self.hmm_model_optimized = []
+ 
+             for hmm in self.hmm_model:
+                 prof = pyhmmer.plan7.Profile(M = hmm.insert_emissions.shape[0], alphabet = pyhmmer.easel.Alphabet.amino())
+                 prof.configure(hmm = hmm, background = pyhmmer.plan7.Background(alphabet = pyhmmer.easel.Alphabet.amino()), L = hmm.insert_emissions.shape[0]-1)
+                 optim = prof.optimized()
+                 self.hmm_model_optimized.append(optim)
+ 
+             #Clean up.
+             self.hmm_model = None
+         except:
+             #Quiet fail condition - fall back on default model.
+             self.hmm_model_optimized = None
+ 
+     #Load HMM and try to optimize.
+     def load_hmm_from_file(self, hmm_path):
+         hmm_set = pyhmmer.plan7.HMMFile(hmm_path)
+         for hmm in hmm_set:
+             self.hmm_model.append(hmm)
+ 
+         #This doesn't seem to be improving performance currently.
+         self.optimize_models()
+ 
+     #Set archaeal and bacterial HMM sets.
+     def assign_hmm_sets(self):
+         self.bacterial_SCPs = {'PF00709_21': 'Adenylsucc_synt', 'PF00406_22': 'ADK', 'PF01808_18': 'AICARFT_IMPCHas', 'PF00231_19': 'ATP-synt',
+             'PF00119_20': 'ATP-synt_A', 'PF01264_21': 'Chorismate_synt', 'PF00889_19': 'EF_TS', 'PF01176_19': 'eIF-1a',
+             'PF02601_15': 'Exonuc_VII_L', 'PF01025_19': 'GrpE', 'PF01725_16': 'Ham1p_like', 'PF01715_17': 'IPPT',
+             'PF00213_18': 'OSCP', 'PF01195_19': 'Pept_tRNA_hydro', 'PF00162_19': 'PGK', 'PF02033_18': 'RBFA', 'PF02565_15': 'RecO_C',
+             'PF00825_18': 'Ribonuclease_P', 'PF00687_21': 'Ribosomal_L1', 'PF00572_18': 'Ribosomal_L13',
+             'PF00238_19': 'Ribosomal_L14', 'PF00252_18': 'Ribosomal_L16', 'PF01196_19': 'Ribosomal_L17',
+             'PF00861_22': 'Ribosomal_L18p', 'PF01245_20': 'Ribosomal_L19', 'PF00453_18': 'Ribosomal_L20',
+             'PF00829_21': 'Ribosomal_L21p', 'PF00237_19': 'Ribosomal_L22', 'PF00276_20': 'Ribosomal_L23',
+             'PF17136_4': 'ribosomal_L24', 'PF00189_20': 'Ribosomal_S3_C', 'PF00281_19': 'Ribosomal_L5', 'PF00181_23': 'Ribosomal_L2',
+             'PF01016_19': 'Ribosomal_L27', 'PF00828_19': 'Ribosomal_L27A', 'PF00830_19': 'Ribosomal_L28',
+             'PF00831_23': 'Ribosomal_L29', 'PF00297_22': 'Ribosomal_L3', 'PF01783_23': 'Ribosomal_L32p',
+             'PF01632_19': 'Ribosomal_L35p', 'PF00573_22': 'Ribosomal_L4', 'PF00347_23': 'Ribosomal_L6',
+             'PF03948_14': 'Ribosomal_L9_C', 'PF00338_22': 'Ribosomal_S10', 'PF00411_19': 'Ribosomal_S11',
+             'PF00416_22': 'Ribosomal_S13', 'PF00312_22': 'Ribosomal_S15', 'PF00886_19': 'Ribosomal_S16',
+             'PF00366_20': 'Ribosomal_S17', 'PF00203_21': 'Ribosomal_S19', 'PF00318_20': 'Ribosomal_S2',
+             'PF01649_18': 'Ribosomal_S20p', 'PF01250_17': 'Ribosomal_S6', 'PF00177_21': 'Ribosomal_S7',
+             'PF00410_19': 'Ribosomal_S8', 'PF00380_19': 'Ribosomal_S9', 'PF00164_25': 'Ribosom_S12_S23',
+             'PF01193_24': 'RNA_pol_L', 'PF01192_22': 'RNA_pol_Rpb6', 'PF01765_19': 'RRF', 'PF02410_15': 'RsfS',
+             'PF03652_15': 'RuvX', 'PF00584_20': 'SecE', 'PF03840_14': 'SecG', 'PF00344_20': 'SecY', 'PF01668_18': 'SmpB',
+             'PF00750_19': 'tRNA-synt_1d', 'PF01746_21': 'tRNA_m1G_MT', 'PF02367_17': 'TsaE', 'PF02130_17': 'UPF0054',
+             'PF02699_15': 'YajC'}
+ 
+         self.archaeal_SCPs = {'PF00709_21': 'Adenylsucc_synt', 'PF05221_17': 'AdoHcyase', 'PF01951_16': 'Archease', 'PF01813_17': 'ATP-synt_D',
+             'PF01990_17': 'ATP-synt_F', 'PF01864_17': 'CarS-like', 'PF01982_16': 'CTP-dep_RFKase', 'PF01866_17': 'Diphthamide_syn',
+             'PF04104_14': 'DNA_primase_lrg', 'PF01984_20': 'dsDNA_bind', 'PF04010_13': 'DUF357', 'PF04019_12': 'DUF359',
+             'PF04919_12': 'DUF655', 'PF01912_18': 'eIF-6', 'PF05833_11': 'FbpA', 'PF01725_16': 'Ham1p_like',
+             'PF00368_18': 'HMG-CoA_red', 'PF00334_19': 'NDK', 'PF02006_16': 'PPS_PS', 'PF02996_17': 'Prefoldin',
+             'PF01981_16': 'PTH2', 'PF01948_18': 'PyrI', 'PF00687_21': 'Ribosomal_L1', 'PF00572_18': 'Ribosomal_L13',
+             'PF00238_19': 'Ribosomal_L14', 'PF00827_17': 'Ribosomal_L15e', 'PF00252_18': 'Ribosomal_L16',
+             'PF01157_18': 'Ribosomal_L21e', 'PF00237_19': 'Ribosomal_L22', 'PF00276_20': 'Ribosomal_L23',
+             'PF16906_5': 'Ribosomal_L26', 'PF00831_23': 'Ribosomal_L29', 'PF00297_22': 'Ribosomal_L3',
+             'PF01198_19': 'Ribosomal_L31e', 'PF01655_18': 'Ribosomal_L32e', 'PF01780_19': 'Ribosomal_L37ae',
+             'PF00832_20': 'Ribosomal_L39', 'PF00573_22': 'Ribosomal_L4', 'PF00935_19': 'Ribosomal_L44', 'PF17144_4': 'Ribosomal_L5e',
+             'PF00347_23': 'Ribosomal_L6', 'PF00411_19': 'Ribosomal_S11', 'PF00416_22': 'Ribosomal_S13',
+             'PF00312_22': 'Ribosomal_S15', 'PF00366_20': 'Ribosomal_S17', 'PF00833_18': 'Ribosomal_S17e',
+             'PF00203_21': 'Ribosomal_S19', 'PF01090_19': 'Ribosomal_S19e', 'PF00318_20': 'Ribosomal_S2',
+             'PF01282_19': 'Ribosomal_S24e', 'PF01667_17': 'Ribosomal_S27e', 'PF01200_18': 'Ribosomal_S28e',
+             'PF01015_18': 'Ribosomal_S3Ae', 'PF00177_21': 'Ribosomal_S7', 'PF00410_19': 'Ribosomal_S8',
+             'PF01201_22': 'Ribosomal_S8e', 'PF00380_19': 'Ribosomal_S9', 'PF00164_25': 'Ribosom_S12_S23',
+             'PF06026_14': 'Rib_5-P_isom_A', 'PF01351_18': 'RNase_HII', 'PF13656_6': 'RNA_pol_L_2',
+             'PF01194_17': 'RNA_pol_N', 'PF03874_16': 'RNA_pol_Rpb4', 'PF01192_22': 'RNA_pol_Rpb6',
+             'PF01139_17': 'RtcB', 'PF00344_20': 'SecY', 'PF06093_13': 'Spt4', 'PF00121_18': 'TIM', 'PF01994_16': 'Trm56',
+             'PF00749_21': 'tRNA-synt_1c', 'PF00750_19': 'tRNA-synt_1d', 'PF13393_6': 'tRNA-synt_His',
+             'PF01142_18': 'TruD', 'PF01992_16': 'vATP-synt_AC39', 'PF01991_18': 'vATP-synt_E', 'PF01496_19': 'V_ATPase_I'}
+ 
+     #Convert passed sequences.
+     def convert_protein_seqs_in_mem(self, contents):
+         #Clean up.
+         self.proteins_to_search = []
+ 
+         for protein in contents:
+             #Skip a protein if it's longer than 100k AA.
+             if len(contents[protein]) >= 100000:
+                 continue
+             as_bytes = protein.encode()
+             #Pyhmmer digitization of sequences for searching.
+             easel_seq = pyhmmer.easel.TextSequence(name = as_bytes, sequence = contents[protein])
+             easel_seq = easel_seq.digitize(pyhmmer.easel.Alphabet.amino())
+             self.proteins_to_search.append(easel_seq)
+ 
+         easel_seq = None
+ 
+     def load_protein_seqs_from_file(self, prots_file):
+         #Pyhmmer has a method for loading a fasta file, but we need to support gzipped inputs, so we do it manually.
+         contents, deflines = read_fasta(prots_file)
+         self.protein_descriptions = deflines
+         self.convert_protein_seqs_in_mem(contents)
+ 
+     def execute_search(self):
+         if self.hmm_model_optimized is None:
+             top_hits = list(pyhmmer.hmmsearch(self.hmm_model, self.proteins_to_search, cpus=1, bit_cutoffs="trusted"))
+         else:
+             top_hits = list(pyhmmer.hmmsearch(self.hmm_model_optimized, self.proteins_to_search, cpus=1, bit_cutoffs="trusted"))
+ 
+         self.printable_lines = []
+ 
+         self.hmm_result_proteins = []
+         self.hmm_result_accessions = []
+         self.hmm_result_scores = []
+ 
+         for model in top_hits:
+             for hit in model:
+                 target_name = hit.name.decode()
+                 target_acc = hit.accession
+                 if target_acc is None:
+                     target_acc = "-"
+                 else:
+                     target_acc = target_acc.decode()
+ 
+                 query_name = hit.best_domain.alignment.hmm_name.decode()
+                 query_acc = hit.best_domain.alignment.hmm_accession.decode()
+ 
+                 full_seq_evalue = "%.2g" % hit.evalue
+                 full_seq_score = round(hit.score, 1)
+                 full_seq_bias = round(hit.bias, 1)
+ 
+                 best_dom_evalue = "%.2g" % hit.best_domain.alignment.domain.i_evalue
+                 best_dom_score = round(hit.best_domain.alignment.domain.score, 1)
+                 best_dom_bias = round(hit.best_domain.alignment.domain.bias, 1)
+ 
+                 #I don't know how to get most of these values.
+                 exp = 0
+                 reg = 0
+                 clu = 0
+                 ov = 0
+                 env = 0
+                 dom = len(hit.domains)
+                 rep = 0
+                 inc = 0
+ 
+                 try:
+                     description = self.protein_descriptions[target_name]
+                 except:
+                     description = ""
+ 
+                 writeout = [target_name, target_acc, query_name, query_acc, full_seq_evalue, \
+                     full_seq_score, full_seq_bias, best_dom_evalue, best_dom_score, best_dom_bias, \
+                     exp, reg, clu, ov, env, dom, rep, inc, description]
+ 
+                 #Format and join.
+                 writeout = [str(i) for i in writeout]
+                 writeout = '\t'.join(writeout)
+ 
+                 self.printable_lines.append(writeout)
+ 
+                 self.hmm_result_proteins.append(target_name)
+                 self.hmm_result_accessions.append(query_acc)
+                 self.hmm_result_scores.append(best_dom_score)
+ 
+     def filter_to_best_hits(self):
+         hmm_file = np.transpose(np.array([self.hmm_result_proteins, self.hmm_result_accessions, self.hmm_result_scores]))
+ 
+         #hmm_file = np.loadtxt(hmm_file_name, comments = '#', usecols = (0, 3, 8), dtype=(str))
+         #Sort the hmm file based on the score column in descending order.
+         hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]
+ 
+         #Identify the first row where each gene name appears, after sorting by score;
+         #in effect, return the highest scoring assignment per gene name.
+         #Sort the indices of the result to match the score-sorted table instead of alphabetical order of gene names.
+         hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]
+ 
+         #Filter the file again for the unique ACCESSION names, since we're only allowed one gene per accession, I guess?
+         #Don't sort the indices, we don't care about the scores anymore.
+         hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]
+ 
+         sql_friendly_names = [i.replace(".", "_") for i in hmm_file[:,1]]
+ 
+         self.best_hits = dict(zip(hmm_file[:,0], sql_friendly_names))
+ 
+         hmm_file = None
+ 
+     #Count per-domain occurrences.
+     def assign_domain(self):
+         for prot in self.best_hits.values():
+             if prot in self.bacterial_SCPs:
+                 self.domain_counts["Bacteria"] += 1
+             if prot in self.archaeal_SCPs:
+                 self.domain_counts["Archaea"] += 1
+ 
+         self.bacterial_fraction = self.domain_counts["Bacteria"] / self.voted_domain["Bacteria"]
+         self.archaeal_fraction = self.domain_counts["Archaea"] / self.voted_domain["Archaea"]
+ 
+         if self.bacterial_fraction >= self.archaeal_fraction:
+             self.voted_domain = "Bacteria"
+         else:
+             self.voted_domain = "Archaea"
+ 
+         pop_keys = list(self.best_hits.keys())
+         for key in pop_keys:
+             if self.voted_domain == "Bacteria":
+                 if self.best_hits[key] not in self.bacterial_SCPs:
+                     self.best_hits.pop(key)
+             if self.voted_domain == "Archaea":
+                 if self.best_hits[key] not in self.archaeal_SCPs:
+                     self.best_hits.pop(key)
+ 
+     def to_hmm_file(self, output):
+         #PyHMMER data is a bit hard to parse. For each result:
+ 
+         content = '\n'.join(self.printable_lines) + '\n'
+ 
+         if self.do_compress:
+             #Clean
+             if os.path.exists(output):
+                 os.remove(output)
+ 
+             content = content.encode()
+ 
+             fh = gzip.open(output+".gz", "wb")
+             fh.write(content)
+             fh.close()
+             content = None
+ 
+         else:
+             #Clean
+             if os.path.exists(output+".gz"):
+                 os.remove(output+".gz")
+ 
+             fh = open(output, "w")
+ 
+             fh.write(content)
+ 
+             fh.close()
+ 
+             content = None
+ 
+     #If we're doing this step at all, we've either loaded the seqs into mem by reading the prot file
+     #or have them in mem thanks to pyrodigal.
+     def run_for_fastaai(self, prots, hmm_output):
+         try:
+             self.convert_protein_seqs_in_mem(prots)
+             self.execute_search()
+             self.filter_to_best_hits()
+             try:
+                 self.to_hmm_file(hmm_output)
+             except:
+                 print(hmm_output, "cannot be created. HMM search failed. This file will be skipped.")
+ 
+         except:
+             print(hmm_output, "failed to run through HMMER!")
+             self.best_hits = None
+ 
+ 
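The filter_to_best_hits method leans on a numpy idiom that is easy to misread: np.unique(column, return_index = True) on a score-sorted table returns the first (i.e. highest-scoring) row index per unique value. A worked example with invented values:

    import numpy as np

    #Rows: protein, accession, score - already sorted by score, descending.
    hits = np.array([["p1", "PF1", "90"],
                     ["p2", "PF1", "80"],
                     ["p1", "PF2", "70"]])
    #First occurrence of each protein in score order = its best-scoring assignment.
    best_per_protein = hits[np.sort(np.unique(hits[:, 0], return_index = True)[1])]
    #Keeps ("p1", "PF1", "90") and ("p2", "PF1", "80"); p1's weaker PF2 hit is dropped.
    #Then one winner per accession, again by first (best) occurrence.
    one_per_accession = best_per_protein[np.unique(best_per_protein[:, 1], return_index = True)[1]]
    #Only ("p1", "PF1", "90") survives; p2 loses PF1 to the higher-scoring p1.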
+ def hmm_preproc_initializer(hmm_file, do_compress = False):
+     global hmm_manager
+     hmm_manager = pyhmmer_manager(do_compress)
+     hmm_manager.load_hmm_from_file(hmm_file)
+ 
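The global here is the standard multiprocessing initializer pattern: each worker process builds its pyhmmer_manager and loads the HMM models once, and every task dispatched to that worker reuses them. A sketch of the intended wiring (pool size and the input list are illustrative, not from this diff):

    import multiprocessing

    pool = multiprocessing.Pool(4, initializer = hmm_preproc_initializer,
                                initargs = ("Complete_SCG_DB.hmm", False))
    #Each mapped task can now use the per-worker global hmm_manager.
    results = pool.map(run_build, prepared_input_files)
    pool.close()
    pool.join()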
+ class pyrodigal_manager:
+     def __init__(self, file = None, aa_out = None, nt_out = None, is_meta = False, full_headers = True, trans_table = 11,
+         num_bp_fmt = True, verbose = True, do_compress = "0", compare_against = None):
+         #Input NT sequences
+         self.file = file
+ 
+         #List of seqs read from input file.
+         self.sequences = None
+         #Concatenation of up to first 32 million bp in self.sequences - prodigal caps at this point.
+         self.training_seq = None
+ 
+         #Predicted genes go here
+         self.predicted_genes = None
+         #Record the translation table used.
+         self.trans_table = trans_table
+ 
+         #This is the pyrodigal manager - this does the gene predicting.
+         self.manager = pd.OrfFinder(meta=is_meta)
+         self.is_meta = is_meta
+ 
+         #Full prodigal header information includes more than just a protein number.
+         #If full_headers is true, protein deflines will match prodigal; else, just protein ID.
+         self.full_headers = full_headers
+ 
+         #Prodigal prints info to console. I enhanced the info and made printing default, but also allow them to be totally turned off.
+         self.verbose = verbose
+ 
+         #Prodigal formats outputs with 70 bases per line max
+         self.num_bp_fmt = num_bp_fmt
+ 
+         #File names for outputs
+         self.aa_out = aa_out
+         self.nt_out = nt_out
+ 
+         #List of proteins in excess of 100K base pairs (HMMER's limit) and their lengths. This is also fastAAI specific.
+         self.excluded_seqs = {}
+ 
+         #Gzip outputs if asked.
+         self.compress = do_compress
+ 
+         self.labeled_proteins = None
+ 
+         #Normally, we don't need to keep an input sequence after it's had proteins predicted for it - however,
+         #for FastAAI and MiGA's purposes, a comparison of two translation tables is necessary.
+         #Rather than re-importing sequences and reconstructing the training sequences,
+         #keep them for a faster re-predict with less I/O.
+         self.compare_to = compare_against
+         if self.compare_to is not None:
+             self.keep_seqs = True
+             self.keep_after_train = True
+         else:
+             self.keep_seqs = False
+             self.keep_after_train = False
+ 
+     #Imports a fasta as binary.
+     def import_sequences(self):
+         if self.sequences is None:
+             self.sequences = {}
+ 
+         #check for zipped and import as needed.
+         with open(self.file, 'rb') as test_gz:
+             #Gzip magic number
+             is_gz = (test_gz.read(2) == b'\x1f\x8b')
+ 
+         if is_gz:
+             fh = gzip.open(self.file)
+         else:
+             fh = open(self.file, "rb")
+ 
+         imp = fh.readlines()
+ 
+         fh.close()
+ 
+         cur_seq = None
+         for s in imp:
+             s = s.decode().strip()
+             #> is 62 in ascii. This is asking if the first character is '>'
+             if s.startswith(">"):
+                 #Skip first cycle, then do for each after
+                 if cur_seq is not None:
+                     self.sequences[cur_seq] = ''.join(self.sequences[cur_seq])
+                     self.sequences[cur_seq] = self.sequences[cur_seq].encode()
+                     #print(cur_seq, len(self.sequences[cur_seq]))
+                 cur_seq = s[1:]
+                 cur_seq = cur_seq.split()[0]
+                 cur_seq = cur_seq.encode('utf-8')
+                 self.sequences[cur_seq] = []
+             else:
+                 #Remove the newline character.
+                 #bases = s[:-1]
+                 self.sequences[cur_seq].append(s)
+ 
+         #Final set
+         self.sequences[cur_seq] = ''.join(self.sequences[cur_seq])
+         self.sequences[cur_seq] = self.sequences[cur_seq].encode()
+ 
+         #Now we have the data, go to training.
+         if not self.is_meta:
+             self.train_manager()
+ 
+     #Collect up to the first 32 million bases for use in training seq.
+     def train_manager(self):
+         running_sum = 0
+         seqs_added = 0
+         if self.training_seq is None:
+             self.training_seq = []
+             for seq in self.sequences:
+                 running_sum += len(self.sequences[seq])
+                 if seqs_added > 0:
+                     #Prodigal interleaving logic - add this breaker between sequences, starting at sequence 2
+                     self.training_seq.append(b'TTAATTAATTAA')
+                     running_sum += 12
+ 
+                 seqs_added += 1
+ 
+                 #Handle excessive size
+                 if running_sum >= 32000000:
+                     print("Warning: Sequence is long (max 32000000 for training).")
+                     print("Training on the first 32000000 bases.")
+ 
+                     to_remove = running_sum - 32000000
+ 
+                     #Remove excess characters
+                     cut_seq = self.sequences[seq][:-to_remove]
+                     #Add the partial seq
+                     self.training_seq.append(cut_seq)
+ 
+                     #Stop the loop and move to training
+                     break
+ 
+                 #add in a full sequence
+                 self.training_seq.append(self.sequences[seq])
+ 
+             if seqs_added > 1:
+                 self.training_seq.append(b'TTAATTAATTAA')
+ 
+             self.training_seq = b''.join(self.training_seq)
+ 
+         if len(self.training_seq) < 20000:
+             if self.verbose:
+                 print("Can't train on 20 thousand or fewer characters. Switching to meta mode.")
+             self.manager = pd.OrfFinder(meta=True)
+             self.is_meta = True
+         else:
+             if self.verbose:
+                 print("")
+                 #G is 71, C is 67; we're counting G + C and dividing by the total.
+                 gc = round(((self.training_seq.count(67) + self.training_seq.count(71))/ len(self.training_seq)) * 100, 2)
+                 print(len(self.training_seq), "bp seq created,", gc, "pct GC")
+ 
+             #Train
+             self.manager.train(self.training_seq, translation_table = self.trans_table)
+ 
+         if not self.keep_after_train:
+             #Clean up
+             self.training_seq = None
+ 
+     def predict_genes(self):
+         if self.is_meta:
+             if self.verbose:
+                 print("Finding genes in metagenomic mode")
+         else:
+             if self.verbose:
+                 print("Finding genes with translation table", self.trans_table)
+                 print("")
+ 
+         self.predicted_genes = {}
+         for seq in self.sequences:
+ 
+             if self.verbose:
+                 print("Finding genes in sequence", seq.decode(), "("+str(len(self.sequences[seq]))+ " bp)... ", end = '')
+ 
+             self.predicted_genes[seq] = self.manager.find_genes(self.sequences[seq])
+ 
+             #If we're comparing multiple tables, then we want to keep these for re-prediction.
+             if not self.keep_seqs:
+                 #Clean up
+                 self.sequences[seq] = None
+ 
+             if self.verbose:
+                 print("done!")
+ 
+     #Predict genes with an alternative table, compare results, and keep the winner.
+     def compare_alternative_table(self, table):
+         if table == self.trans_table:
+             print("You're trying to compare table", table, "with itself.")
+         else:
+             if self.verbose:
+                 print("Comparing translation table", self.trans_table, "against table", table)
+             old_table = self.trans_table
+             old_genes = self.predicted_genes
+             old_size = 0
+             for seq in self.predicted_genes:
+                 for gene in self.predicted_genes[seq]:
+                     old_size += (gene.end - gene.begin)
+ 
+             self.trans_table = table
+             self.train_manager()
+             self.predict_genes()
+ 
+             new_size = 0
+             for seq in self.predicted_genes:
+                 for gene in self.predicted_genes[seq]:
+                     new_size += (gene.end - gene.begin)
+ 
+             #Keep the new table only if it yields substantially (>10%) more coding bases.
+             if (new_size / old_size) > 1.1:
+                 if self.verbose:
+                     print("Translation table", self.trans_table, "performed better than table", old_table, "and will be used instead.")
+             else:
+                 if self.verbose:
+                     print("Translation table", self.trans_table, "did not perform significantly better than table", old_table, "and will not be used.")
+                 self.trans_table = old_table
+                 self.predicted_genes = old_genes
+ 
+             #cleanup
+             old_table = None
+             old_genes = None
+             old_size = None
+             new_size = None
+ 
+     def predict_and_compare(self):
+         self.predict_genes()
+ 
+         #Run alt comparisons in gene predict.
+         if self.compare_to is not None:
+             while len(self.compare_to) > 0:
+                 try:
+                     next_table = int(self.compare_to.pop(0))
+ 
+                     if len(self.compare_to) == 0:
+                         #Ready to clean up.
+                         self.keep_after_train = True
+                         self.keep_seqs = True
+ 
+                     self.compare_alternative_table(next_table)
+                 except:
+                     print("Alternative table comparison failed! Skipping.")
+ 
+     #Break lines into size base pairs per line. Prodigal's default for bp is 70, aa is 60.
+     def num_bp_line_format(self, string, size = 70):
+         #ceiling function without the math module
+         ceiling = int(round((len(string)/size)+0.5, 0))
+         formatted = '\n'.join([string[(i*size):(i+1)*size] for i in range(0, ceiling)])
+         return formatted
+ 
+     #Writeouts
+     def write_nt(self):
+         if self.nt_out is not None:
+             if self.verbose:
+                 print("Writing nucleotide sequences... ")
+             if self.compress == '1' or self.compress == '2':
+                 out_writer = gzip.open(self.nt_out+".gz", "wb")
+ 
+                 content = b''
+ 
+                 for seq in self.predicted_genes:
+                     seqname = b">"+ seq + b"_"
+                     #Gene counter
+                     count = 1
+                     for gene in self.predicted_genes[seq]:
+                         #Full header lines
+                         if self.full_headers:
+                             content += b' # '.join([seqname + str(count).encode(), str(gene.begin).encode(), str(gene.end).encode(), str(gene.strand).encode(), gene._gene_data.encode()])
+                         else:
+                             #Reduced headers if we don't care.
+                             content += seqname + str(count).encode()
+ 
+                         content += b'\n'
+ 
+                         if self.num_bp_fmt:
+                             #70 bp cap per line
+                             content += self.num_bp_line_format(gene.sequence(), size = 70).encode()
+                         else:
+                             #One-line sequence.
+                             content += gene.sequence().encode()
+ 
+                         content += b'\n'
+                         count += 1
+ 
+                 out_writer.write(content)
+                 out_writer.close()
+ 
+             if self.compress == '0' or self.compress == '2':
+                 out_writer = open(self.nt_out, "w")
+ 
+                 for seq in self.predicted_genes:
+                     #Only do this decode once.
+                     seqname = ">"+ seq.decode() +"_"
+                     #Gene counter
+                     count = 1
+ 
+                     for gene in self.predicted_genes[seq]:
+                         #Full header lines
+                         if self.full_headers:
+                             #Standard prodigal header
+                             print(seqname + str(count), gene.begin, gene.end, gene.strand, gene._gene_data, sep = " # ", file = out_writer)
+                         else:
+                             #Reduced headers if we don't care.
+                             print(seqname + str(count), file = out_writer)
+ 
+                         if self.num_bp_fmt:
+                             #70 bp cap per line
+                             print(self.num_bp_line_format(gene.sequence(), size = 70), file = out_writer)
+                         else:
+                             #One-line sequence.
+                             print(gene.sequence(), file = out_writer)
+ 
+                         count += 1
+ 
+                 out_writer.close()
+ 
+     def write_aa(self):
+         if self.aa_out is not None:
+             if self.verbose:
+                 print("Writing amino acid sequences...")
+ 
+             self.labeled_proteins = {}
+             content = ''
+             for seq in self.predicted_genes:
+                 count = 1
+                 seqname = ">"+ seq.decode() + "_"
+                 for gene in self.predicted_genes[seq]:
+                     prot_name = seqname + str(count)
+                     translation = gene.translate()
+                     self.labeled_proteins[prot_name[1:]] = translation
+                     defline = " # ".join([prot_name, str(gene.begin), str(gene.end), str(gene.strand), str(gene._gene_data)])
+                     content += defline
+                     content += "\n"
+                     count += 1
+                     content += self.num_bp_line_format(translation, size = 60)
+                     content += "\n"
+ 
+             if self.compress == '0' or self.compress == '2':
+                 out_writer = open(self.aa_out, "w")
+                 out_writer.write(content)
+                 out_writer.close()
+ 
+             if self.compress == '1' or self.compress == '2':
+                 content = content.encode()
+                 out_writer = gzip.open(self.aa_out+".gz", "wb")
+                 out_writer.write(content)
+                 out_writer.close()
+ 
+     def run_for_fastaai(self):
+         self.verbose = False
+         self.import_sequences()
+         self.train_manager()
+         self.predict_and_compare()
+         self.write_aa()
+ 
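Stripped of the bookkeeping, the pyrodigal calls this manager wraps are compact; a minimal single-genome sketch (the sequence is a toy stand-in, and note that newer pyrodigal releases rename OrfFinder to GeneFinder):

    import pyrodigal

    finder = pyrodigal.OrfFinder(meta = False)
    training_seq = b"ACGT" * 10000        #stand-in for real contigs, long enough to train on
    finder.train(training_seq, translation_table = 11)
    for i, gene in enumerate(finder.find_genes(training_seq), start = 1):
        print(">gene_%i # %i # %i # %i" % (i, gene.begin, gene.end, gene.strand))
        print(gene.translate())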
+ #Iterator for agnostic reader
+ class agnostic_reader_iterator:
+     def __init__(self, reader):
+         self.handle_ = reader.handle
+         self.is_gz_ = reader.is_gz
+ 
+     def __next__(self):
+         if self.is_gz_:
+             line = self.handle_.readline().decode()
+         else:
+             line = self.handle_.readline()
+ 
+         #Ezpz EOF check
+         if line:
+             return line
+         else:
+             raise StopIteration
+ 
+ #File reader that doesn't care if you give it a gzipped file or not.
+ class agnostic_reader:
+     def __init__(self, file):
+         self.path = file
+ 
+         with open(file, 'rb') as test_gz:
+             #Gzip magic number
+             is_gz = (test_gz.read(2) == b'\x1f\x8b')
+ 
+         self.is_gz = is_gz
+ 
+         if is_gz:
+             self.handle = gzip.open(self.path)
+         else:
+             self.handle = open(self.path)
+ 
+     def __iter__(self):
+         return agnostic_reader_iterator(self)
+ 
+     def close(self):
+         self.handle.close()
+ 
+ '''
+ Class for handling all of the raw genome/protein/protein+HMM file inputs when building a database.
+ 
+ Takes a file or files and processes them from genome -> protein, protein -> hmm, prot+HMM -> kmerized protein best hits as numpy int arrays according to the kmer_index
+ '''
+ 
+ class input_file:
+     def __init__(self, input_path, output = "", verbosity = False, do_compress = False,
+         make_crystal = False):
+         #starting path for the file; irrelevant for protein and hmm, but otherwise useful for keeping track.
+         self.path = input_path
+         #Output directory starts with this
+         self.output = os.path.normpath(output + "/")
+         #For printing file updates, this is the input name
+         self.name = os.path.basename(input_path)
+         #original name is the key used for the genomes index later on.
+         self.original_name = os.path.basename(input_path)
+         #This is the name that can be used for building files with new extensions.
+         if input_path.endswith(".gz"):
+             #Remove .gz first to make names consistent.
+             self.basename = os.path.splitext(os.path.basename(input_path[:-3]))[0]
+         else:
+             self.basename = os.path.splitext(os.path.basename(input_path))[0]
+ 
+         #Sanitize for SQL
+         #These are chars safe for sql
+         sql_safe = set('_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
+         current_chars = set(self.basename)
+         #self.sql_name = self.basename
+         #Identify SQL-unsafe characters as those outside the permissible set and replace all with underscores.
+         for char in current_chars - sql_safe:
+             self.basename = self.basename.replace(char, "_")
+ 
+         #'genome' or 'protein' or 'protein and HMM'
+         self.status = None
+         #These will keep track of paths for each stage of file for us.
+         self.genome = None
+         self.protein = None
+         self.hmm = None
+ 
+         self.ran_hmmer = False
+ 
+         #If pyrodigal is run, then the protein sequences are already loaded into memory.
+         #We reuse them in kmer extraction instead of another I/O
+         self.prepared_proteins = None
+ 
+         self.intermediate = None
+ 
+         self.crystalize = make_crystal
+         self.best_hits = None
+         self.best_hits_kmers = None
+ 
+         self.protein_count = 0
+         self.protein_kmer_count = {}
+ 
+         self.trans_table = None
+         self.start_time = None
+         self.end_time = None
+         self.err_log = ""
+         #doesn't get updated otherwise.
+         self.initial_state = "protein+HMM"
+ 
+         self.verbose = verbosity
+ 
+         #Check if the file failed to produce ANY SCP HMM hits.
+         self.is_empty = False
+ 
+         self.do_compress = do_compress
+ 
+         self.crystal = None
+ 
+         self.init_time = None
+         #default to 0 time.
+         self.prot_pred_time = None
+         self.hmm_search_time = None
+         self.besthits_time = None
+ 
+     def curtime(self):
+         time_format = "%d/%m/%Y %H:%M:%S"
+         timer = datetime.datetime.now()
+         time = timer.strftime(time_format)
+         return time
+ 
+     def partial_timings(self):
+         protein_pred = self.prot_pred_time-self.init_time
+         hmm_search = self.hmm_search_time-self.prot_pred_time
+         besthits = self.besthits_time-self.hmm_search_time
+ 
+         protein_pred = protein_pred.total_seconds()
+         hmm_search = hmm_search.total_seconds()
+         besthits = besthits.total_seconds()
+ 
+         self.prot_pred_time = protein_pred
+         self.hmm_search_time = hmm_search
+         self.besthits_time = besthits
+ 
+     #Functions for externally setting status and file paths of particular types
+     def set_genome(self, path):
+         self.status = 'genome'
+         self.genome = path
+ 
+     def set_protein(self, path):
+         self.status = 'protein'
+         self.protein = path
+ 
+     def set_hmm(self, path):
+         if self.protein is None:
+             print("Warning! I don't have a protein yet, so this HMM will be useless to me until I do!")
+         self.status = 'protein and hmm'
+         self.hmm = path
+ 
+     def set_crystal(self, path):
+         self.status = 'crystal'
+         self.crystal = path
+ 
+     #Runs prodigal, compares translation tables and stores faa files
+     def genome_to_protein(self):
+         if self.genome is None:
+             print(self.name, "wasn't declared as a genome! I can't make this into a protein!")
+         else:
+             protein_output = os.path.normpath(self.output + "/predicted_proteins/" + self.basename + '.faa')
+ 
+             if self.do_compress:
+                 compress_level = "1"
+             else:
+                 compress_level = "0"
+ 
+             mn = pyrodigal_manager(file = self.genome, aa_out = protein_output, compare_against = [4], do_compress = compress_level)
+             mn.run_for_fastaai()
+ 
+             self.trans_table = str(mn.trans_table)
+ 
+             for prot in mn.excluded_seqs:
+                 self.err_log += "Protein " + prot + " was observed to have >100K amino acids ( " + str(mn.excluded_seqs[prot]) + " AA found ). It will not be included in predicted proteins for this genome;"
+ 
+             self.prepared_proteins = mn.labeled_proteins
+ 
+             del mn
+ 
+             #If there are zipped files leftover and we didn't want them, clean them up.
+             if self.do_compress:
+                 self.set_protein(str(protein_output)+".gz")
+                 #Clean up unzipped version on reruns
+                 if os.path.exists(str(protein_output)):
+                     os.remove(str(protein_output))
+             else:
+                 self.set_protein(str(protein_output))
+                 #Clean up a zipped version on reruns
+                 if os.path.exists(str(protein_output)+".gz"):
+                     os.remove(str(protein_output)+".gz")
+ 
+             self.prot_pred_time = datetime.datetime.now()
+ 
+     #run hmmsearch on a protein
+     def protein_to_hmm(self):
+         if self.protein is None:
+             print(self.basename, "wasn't declared as a protein! I can't make this into an HMM!")
+         else:
+ 
+             folder = os.path.normpath(self.output + "/hmms")
+ 
+             hmm_output = os.path.normpath(folder +"/"+ self.basename + '.hmm')
+ 
+             if self.prepared_proteins is None:
+                 self.prepared_proteins, deflines = read_fasta(self.protein)
+ 
+             hmm_manager.run_for_fastaai(self.prepared_proteins, hmm_output)
+ 
+             self.ran_hmmer = True
+ 
+             if self.do_compress:
+                 self.set_hmm(str(hmm_output)+".gz")
+                 if os.path.exists(str(hmm_output)):
+                     os.remove(str(hmm_output))
+             else:
+                 self.set_hmm(str(hmm_output))
+                 if os.path.exists(str(hmm_output)+".gz"):
+                     os.remove(str(hmm_output)+".gz")
+ 
+             self.hmm_search_time = datetime.datetime.now()
+ 
+     #Translate tetramers to unique int32 indices.
+     def unique_kmer_simple_key(self, seq):
+         #num tetramers = len(seq) - 4 + 1, just make it -3.
+         n_kmers = len(seq) - 3
+ 
+         #Converts the characters in a sequence into their ascii int value
+         as_ints = np.array([ord(i) for i in seq], dtype = np.int32)
+ 
+         #create seq like 0,1,2,3; 1,2,3,4; 2,3,4,5... for each tetramer that needs a value
+         kmers = np.arange(4*n_kmers)
+         kmers = kmers % 4 + kmers // 4
+ 
+         #Select the characters (as ints) corresponding to each tetramer all at once and reshape into rows of 4,
+         #each row corresp. to a successive tetramer
+         kmers = as_ints[kmers].reshape((n_kmers, 4))
+ 
+         #Given four 2-digit numbers, these multipliers work as offsets so that all digits are preserved in order when summed
+         mult = np.array([1000000, 10000, 100, 1], dtype = np.int32)
+ 
+         #the fixed values effectively offset the successive chars of the tetramer by 2 positions each time;
+         #practically, this is concatenation of numbers
+         #Matrix mult does this for all values at once.
+         return np.unique(np.dot(kmers, mult))
+ 
+     def load_hmm_and_filter_from_file(self):
+         prots = []
+         accs = []
+         scores = []
+         f = agnostic_reader(self.hmm)
+         for line in f:
+             if line.startswith("#"):
+                 continue
+             else:
+                 segs = line.strip().split()
+ 
+                 if len(segs) < 9:
+                     continue
+ 
+                 prots.append(segs[0])
+                 accs.append(segs[3])
+                 scores.append(segs[8])
+ 
+         f.close()
+ 
+         if len(prots) < 1:
+             self.best_hits = {}
+ 
+         hmm_file = np.transpose(np.array([prots, accs, scores]))
+ 
+         #hmm_file = np.loadtxt(hmm_file_name, comments = '#', usecols = (0, 3, 8), dtype=(str))
+         #Sort the hmm file based on the score column in descending order.
+         hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]
+ 
+         #Identify the first row where each gene name appears, after sorting by score;
+         #in effect, return the highest scoring assignment per gene name.
+         #Sort the indices of the result to match the score-sorted table instead of alphabetical order of gene names.
+         hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]
+ 
+         #Filter the file again for the unique ACCESSION names, since we're only allowed one gene per accession, I guess?
+         #Don't sort the indices, we don't care about the scores anymore.
+         hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]
+ 
+         sql_friendly_names = [i.replace(".", "_") for i in hmm_file[:,1]]
+         self.best_hits = dict(zip(hmm_file[:,0], sql_friendly_names))
+ 
+     #This should consider the domain by majority vote...
+     def prot_and_hmm_to_besthits(self):
+         if self.ran_hmmer:
+             #Manager has a filter built in.
+             self.best_hits = hmm_manager.best_hits
+         else:
+             #Load the best hits file via old numpy method.
+             self.load_hmm_and_filter_from_file()
+ 
+         hit_count = 0
+ 
+         #from pyrodigal predictions or HMM intermediate production, the sequences are already in mem and don't need read in.
+         if self.prepared_proteins is None:
+             #But otherwise, we need to read them in.
+             self.prepared_proteins, deflines = read_fasta(self.protein)
+ 
+         self.protein_kmer_count = {}
+         self.best_hits_kmers = {}
+ 
+         if self.crystalize:
+             crystal_record = []
+ 
+         #Kmerize proteins and record metadata
+         for protein in self.prepared_proteins:
+             if protein in self.best_hits:
+                 accession = self.best_hits[protein]
+ 
+                 if self.crystalize:
+                     crystal_record.append(str(protein)+"\t"+str(accession)+"\t"+str(self.prepared_proteins[protein])+"\n")
+ 
+                 kmer_set = self.unique_kmer_simple_key(self.prepared_proteins[protein])
+                 self.protein_kmer_count[accession] = kmer_set.shape[0]
+                 self.protein_count += 1
+                 self.best_hits_kmers[accession] = kmer_set
+                 hit_count += 1
+ 
+             #Free the space either way
+             self.prepared_proteins[protein] = None
+ 
+         if self.crystalize:
+             #only make a crystal if it actually has content.
+             if len(crystal_record) > 0:
+                 crystal_path = os.path.normpath(self.output + "/crystals/" + self.basename + '_faai_crystal.txt')
+                 crystal_record = "".join(crystal_record)
+ 
+                 if self.do_compress:
+                     crystal_record = crystal_record.encode()
+                     crystal_writer = gzip.open(crystal_path+".gz", "wb")
+                     crystal_writer.write(crystal_record)
+                     crystal_writer.close()
+                 else:
+                     crystal_writer = open(crystal_path, "w")
+                     crystal_writer.write(crystal_record)
+                     crystal_writer.close()
+ 
+         #Final free.
+         self.prepared_proteins = None
+ 
+         #No HMM hits.
+         if hit_count == 0:
+             self.is_empty = True
+ 
+         self.besthits_time = datetime.datetime.now()
+         self.status = "best hits found"
+ 
+     def preprocess(self):
+         self.init_time = datetime.datetime.now()
+         #default to 0 time.
+         self.prot_pred_time = self.init_time
+         self.hmm_search_time = self.init_time
+         self.besthits_time = self.init_time
+ 
+         #There's no advancement stage for protein and HMM
+         if self.status == 'genome':
+             start_time = self.curtime()
+             #report = True
+             if self.start_time is None:
+                 self.start_time = start_time
+ 
+             if self.initial_state == "protein+HMM":
+                 self.initial_state = "genome"
+ 
+             self.genome_to_protein()
+ 
+         if self.status == 'protein':
+             start_time = self.curtime()
+             #report = True
+             if self.start_time is None:
+                 self.start_time = start_time
+ 
+             if self.initial_state == "protein+HMM":
+                 self.initial_state = "protein"
+ 
+             self.protein_to_hmm()
+ 
+         if self.status == 'protein and hmm':
+             start_time = self.curtime()
+ 
+             if self.start_time is None:
+                 self.start_time = start_time
+ 
+             self.prot_and_hmm_to_besthits()
+ 
+         #Add an end time if either genome -> protein -> HMM or protein -> HMM happened.
+         if self.start_time is not None:
+             end_time = self.curtime()
+             self.end_time = end_time
+         else:
+             #Start was protein+HMM. There was no runtime, and initial state is p+hmm
+             #self.initial_state = "protein+HMM"
+             self.start_time = "N/A"
+             self.end_time = "N/A"
+ 
+         #Protein not generated on this run.
+         if self.trans_table is None:
+             self.trans_table = "unknown"
+ 
+         self.partial_timings()
+ 
+ '''
+ Utility functions
+ '''
+ def prepare_directories(output, status, build_or_query, make_crystals = False):
+     preparation_successful = True
+ 
+     if not os.path.exists(output):
+         try:
+             os.mkdir(output)
+         except:
+             print("")
+             print("FastAAI tried to make output directory: '"+ output + "' but failed.")
+             print("")
+             print("Troubleshooting:")
+             print("")
+             print(" (1) Do you have permission to create directories in the location you specified?")
+             print(" (2) Did you make sure that all directories other than", os.path.basename(output), "already exist?")
+             print("")
+             preparation_successful = False
+ 
+     if preparation_successful:
+         try:
+             if status == 'genome':
+                 if not os.path.exists(os.path.normpath(output + "/" + "predicted_proteins")):
+                     os.mkdir(os.path.normpath(output + "/" + "predicted_proteins"))
+                 if not os.path.exists(os.path.normpath(output + "/" + "hmms")):
+                     os.mkdir(os.path.normpath(output + "/" + "hmms"))
+ 
+             if status == 'protein':
+                 if not os.path.exists(os.path.normpath(output + "/" + "hmms")):
+                     os.mkdir(os.path.normpath(output + "/" + "hmms"))
+ 
+             if make_crystals:
+                 if not os.path.exists(os.path.normpath(output + "/" + "crystals")):
+                     os.mkdir(os.path.normpath(output + "/" + "crystals"))
+ 
+             if build_or_query == "build":
+                 if not os.path.exists(os.path.normpath(output + "/" + "database")):
+                     os.mkdir(os.path.normpath(output + "/" + "database"))
+ 
+             if build_or_query == "query":
+                 if not os.path.exists(os.path.normpath(output + "/" + "results")):
+                     os.mkdir(os.path.normpath(output + "/" + "results"))
+ 
+ 
+         except:
+             print("FastAAI was able to create or find", output, "but couldn't make directories there.")
+             print("")
+             print("This shouldn't happen. Do you have permission to write to that directory?")
+ 
+ 
+     return preparation_successful
+ 
+ def find_hmm():
+     hmm_path = None
+     try:
+         #Try to locate the data bundled as it would be with a pip/conda install.
+         script_path = os.path.dirname(sys.modules['fastAAI_HMM_models'].__file__)
+         if len(script_path) == 0:
+             script_path = "."
+         hmm_complete_model = os.path.abspath(os.path.normpath(script_path + '/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm'))
+         hmm_path = str(hmm_complete_model)
+         #Check that the file exists or fail to the except.
+         fh = open(hmm_path)
+         fh.close()
+     except:
+         #Look in the same dir as the script; old method/MiGA friendly
+         script_path = os.path.dirname(__file__)
+         if len(script_path) == 0:
+             script_path = "."
+         hmm_complete_model = os.path.abspath(os.path.normpath(script_path +"/"+ "00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm"))
+         hmm_path = str(hmm_complete_model)
+ 
+     return hmm_path
+ 
+ #Build DB from genomes
+ 
+ def unique_kmers(seq, ksize):
+     n_kmers = len(seq) - ksize + 1
+     kmers = []
+     for i in range(n_kmers):
+         #kmer_index is a global lookup table built outside this function.
+         kmers.append(kmer_index[seq[i:i + ksize]])
+     #We care about the type because we're working with bytes later.
+     return np.unique(kmers).astype(np.int32)
+ 
+ def split_seq(seq, num_grps):
+     newseq = []
+     splitsize = 1.0/num_grps*len(seq)
+     for i in range(num_grps):
+         newseq.append(seq[int(round(i*splitsize)):int(round((i+1)*splitsize))])
+     return newseq
+ 
+ #Gives the max and min index needed to split a list of (max_val) genomes into num_grps groups.
+ def split_indicies(max_val, num_grps):
+     newseq = []
+     splitsize = 1.0/num_grps*max_val
+     for i in range(num_grps):
+         newseq.append(((round(i*splitsize)), round((i+1)*splitsize)))
+     return newseq
+ 
+ def split_seq_indices(seq, num_grps):
+     newseq = []
+     splitsize = 1.0/num_grps*len(seq)
+     for i in range(num_grps):
+         newseq.append((int(round(i*splitsize)), int(round((i+1)*splitsize)),))
+     return newseq
+ 
+ 
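All three splitters hand out contiguous, near-even slices with boundaries at round(i * len/num_grps); for instance:

    print(split_seq_indices(list(range(10)), 3))   #[(0, 3), (3, 7), (7, 10)]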
+ def list_to_index_dict(list):
+     result = {}
+     counter = 0
+     for item in list:
+         result[item] = counter
+         counter += 1
+     return result
+ 
+ 
+ def rev_list_to_index_dict(list):
+     result = {}
+     counter = 0
+     for item in list:
+         result[counter] = item
+         counter += 1
+     return result
+ 
+ def generate_accessions_index(forward = True):
+     acc_list = ['PF01780_19', 'PF03948_14', 'PF17144_4', 'PF00830_19', 'PF00347_23', 'PF16906_5', 'PF13393_6',
+         'PF02565_15', 'PF01991_18', 'PF01984_20', 'PF00861_22', 'PF13656_6', 'PF00368_18', 'PF01142_18', 'PF00312_22', 'PF02367_17',
+         'PF01951_16', 'PF00749_21', 'PF01655_18', 'PF00318_20', 'PF01813_17', 'PF01649_18', 'PF01025_19', 'PF00380_19', 'PF01282_19',
+         'PF01864_17', 'PF01783_23', 'PF01808_18', 'PF01982_16', 'PF01715_17', 'PF00213_18', 'PF00119_20', 'PF00573_22', 'PF01981_16',
+         'PF00281_19', 'PF00584_20', 'PF00825_18', 'PF00406_22', 'PF00177_21', 'PF01192_22', 'PF05833_11', 'PF02699_15', 'PF01016_19',
+         'PF01765_19', 'PF00453_18', 'PF01193_24', 'PF05221_17', 'PF00231_19', 'PF00416_22', 'PF02033_18', 'PF01668_18', 'PF00886_19',
+         'PF00252_18', 'PF00572_18', 'PF00366_20', 'PF04104_14', 'PF04919_12', 'PF01912_18', 'PF00276_20', 'PF00203_21', 'PF00889_19',
+         'PF02996_17', 'PF00121_18', 'PF01990_17', 'PF00344_20', 'PF00297_22', 'PF01196_19', 'PF01194_17', 'PF01725_16', 'PF00750_19',
+         'PF00338_22', 'PF00238_19', 'PF01200_18', 'PF00162_19', 'PF00181_23', 'PF01866_17', 'PF00709_21', 'PF02006_16', 'PF00164_25',
+         'PF00237_19', 'PF01139_17', 'PF01351_18', 'PF04010_13', 'PF06093_13', 'PF00828_19', 'PF02410_15', 'PF01176_19', 'PF02130_17',
+         'PF01948_18', 'PF01195_19', 'PF01746_21', 'PF01667_17', 'PF03874_16', 'PF01090_19', 'PF01198_19', 'PF01250_17', 'PF17136_4',
+         'PF06026_14', 'PF03652_15', 'PF04019_12', 'PF01201_22', 'PF00832_20', 'PF01264_21', 'PF03840_14', 'PF00831_23', 'PF00189_20',
+         'PF02601_15', 'PF01496_19', 'PF00411_19', 'PF00334_19', 'PF00687_21', 'PF01157_18', 'PF01245_20', 'PF01994_16', 'PF01632_19',
+         'PF00827_17', 'PF01015_18', 'PF00829_21', 'PF00410_19', 'PF00833_18', 'PF00935_19', 'PF01992_16']
+     if forward:
+         list_of_poss_accs = list_to_index_dict(acc_list)
+     else:
+         list_of_poss_accs = rev_list_to_index_dict(acc_list)
+ 
+     return list_of_poss_accs
+ 
+ #Build or add to a FastAAI DB
+ def build_db_opts():
+     parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
+         description='''
+     This FastAAI module allows you to create a FastAAI database from one or many genomes, proteins, or proteins and HMMs, or add these files to an existing one.
+ 
+     Supply genomes OR proteins OR proteins AND HMMs as inputs.
+ 
+     If you supply genomes, FastAAI will predict proteins from them, and HMMs will be created from those proteins
+     If you supply only proteins, FastAAI will create HMM files from them, searching against FastAAI's internal database
+     If you supply proteins AND HMMs, FastAAI will directly use them to build the database.\n
+     You cannot supply both genomes and proteins
+     ''')
+ 
+     parser.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'A directory containing genomes in FASTA format.')
+     parser.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'A directory containing protein amino acids in FASTA format.')
+     parser.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'A directory containing the results of an HMM search on a set of proteins.')
+     parser.add_argument('-d', '--database', dest = 'db_name', default = "FastAAI_database.sqlite.db", help = 'The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory. FastAAI_database.sqlite.db by default.')
+ 
+     parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')
+ 
+     parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
+     parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
+     parser.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')
+ 
+     args, unknown = parser.parse_known_args()
+ 
+     return parser, args
+ 
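Assuming the bundled fastaai executable wires this parser to a build_db subcommand (the dispatch code is outside this hunk), the flags defined above would be exercised like:

    fastaai build_db -g genomes/ -o FastAAI_out -d my_db.sqlite.db --threads 4 --compress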
+ def run_build(input_file):
+     input_file.preprocess()
+     if len(input_file.best_hits_kmers) < 1:
+         input_file.best_hits_kmers = None
+         input_file.err_log += " This file did not successfully complete. No SCPs could be found."
+ 
+     return input_file
+ 
+ def acc_transformer_init(db, tempdir_path):
+     sqlite3.register_converter("array", convert_array)
+     global indb
+     indb = db
+     global temp_dir
+     temp_dir = tempdir_path
+     global ok
+     ok = generate_accessions_index()
+ 
+ #Invert one accession's genome -> kmers table into a kmer -> genomes index in its own temporary db.
+ def acc_transformer(acc_name):
+     source = sqlite3.connect(indb)
+     scurs = source.cursor()
+ 
+     data = scurs.execute("SELECT * FROM {acc}_genomes".format(acc=acc_name)).fetchall()
+ 
+     scurs.close()
+     source.close()
+ 
+     reformat = {}
+ 
+     for row in data:
+         genome, kmers = row[0], np.frombuffer(row[1], dtype=np.int32)
+         for k in kmers:
+             if k not in reformat:
+                 reformat[k] = []
+             reformat[k].append(genome)
+ 
+     data = None
+ 
+     to_add = []
+     for k in reformat:
+         as_bytes = np.array(reformat[k], dtype = np.int32)
+         as_bytes = as_bytes.tobytes()
+         reformat[k] = None
+         to_add.append((int(k), as_bytes,))
+ 
+     my_acc_db = os.path.normpath(temp_dir + "/"+acc_name+".db")
+ 
+     if os.path.exists(my_acc_db):
+         os.remove(my_acc_db)
+ 
+     my_db = sqlite3.connect(my_acc_db)
+     curs = my_db.cursor()
+     curs.execute("CREATE TABLE {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=acc_name))
+     my_db.commit()
+ 
+     curs.executemany("INSERT INTO {acc} VALUES (?, ?)".format(acc = acc_name), to_add)
+ 
+     my_db.commit()
+ 
+     to_add = None
+ 
+     curs.execute("CREATE INDEX {acc}_index ON {acc} (kmer)".format(acc=acc_name))
+     my_db.commit()
+ 
+     curs.close()
+     my_db.close()
+ 
+     return [my_acc_db, acc_name]
+ 
+ def build_db(genomes, proteins, hmms, db_name, output, threads, verbose, do_compress):
1445
+ success = True
1446
+
1447
+ imported_files = fastaai_file_importer(genomes = genomes, proteins = proteins, hmms = hmms, output = output, compress = do_compress)
1448
+ imported_files.determine_inputs()
1449
+
1450
+ if imported_files.error:
1451
+ print("Exiting FastAAI due to input file error.")
1452
+ quit()
1453
+
1454
+ good_to_go = prepare_directories(output, imported_files.status, "query")
1455
+
1456
+ db_path = os.path.normpath(output + "/database")
1457
+ if not os.path.exists(db_path):
1458
+ os.mkdir(db_path)
1459
+
1460
+ if not good_to_go:
1461
+ print("Exiting FastAAI")
1462
+ sys.exit()
1463
+
1464
+ print("")
1465
+
1466
+ hmm_path = find_hmm()
1467
+
1468
+ #Check if the db name contains path info, including the Windows-style separator.
1469
+ if "/" not in db_name and "\\" not in db_name:
1470
+ final_database = os.path.normpath(output + "/database/" + db_name)
1471
+ else:
1472
+ #If the person insists that the db has a path, let them.
1473
+ final_database = db_name
1474
+
1475
+ #If the final database already exists, load its genome index so new genomes can be appended.
1476
+ existing_genome_IDs = None
1477
+ try:
1478
+ if os.path.exists(final_database):
1479
+ parent = sqlite3.connect(final_database)
1480
+ curs = parent.cursor()
1481
+
1482
+ existing_genome_IDs = {}
1483
+ sql_command = "SELECT genome, gen_id FROM genome_index"
1484
+ for result in curs.execute(sql_command).fetchall():
1485
+ genome = result[0]
1486
+ id = int(result[1])
1487
+ existing_genome_IDs[genome] = id
1488
+
1489
+ curs.close()
1490
+ parent.close()
1491
+ except:
1492
+ print("You specified an existing file to be a database, but it does not appear to be a FastAAI database.")
1493
+ print("FastAAI will not be able to continue. Please give FastAAI a different database name and continue.")
1494
+ print("Exiting.")
1495
+ success = False
1496
+
1497
+ if success:
1498
+ hmm_file = find_hmm()
1499
+ if existing_genome_IDs is not None:
1500
+ genome_idx = max(list(existing_genome_IDs.values()))+1
1501
+ else:
1502
+ existing_genome_IDs = {}
1503
+ genome_idx = 0
1504
+
1505
+
1506
+ td = tempfile.mkdtemp()
1507
+ #if not os.path.exists(td):
1508
+ # os.mkdir(td)
1509
+
1510
+ temp_db = os.path.normpath(td+"/FastAAI_temp_db.db")
1511
+
1512
+ if os.path.exists(temp_db):
1513
+ os.remove(temp_db)
1514
+
1515
+ sqlite3.register_converter("array", convert_array)
1516
+ worker = sqlite3.connect(temp_db)
1517
+ wcurs = worker.cursor()
1518
+ wcurs.execute("CREATE TABLE genome_index (genome text, gen_id integer, protein_count integer)")
1519
+ wcurs.execute("CREATE TABLE genome_acc_kmer_counts (genome integer, accession integer, count integer)")
1520
+ ok = generate_accessions_index()
1521
+ for t in ok:
1522
+ wcurs.execute("CREATE TABLE " + t + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")
1523
+
1524
+ worker.commit()
1525
+
1526
+ new_gens = []
1527
+ new_gak = []
1528
+ accs_seen = {}
1529
+ if verbose:
1530
+ tracker = progress_tracker(total = len(imported_files.in_files), message = "Processing inputs")
1531
+ else:
1532
+ print("Processing inputs")
1533
+
1534
+ #Only build_db makes a log.
1535
+ if not os.path.exists(os.path.normpath(output + "/" + "logs")):
1536
+ os.mkdir(os.path.normpath(output + "/" + "logs"))
1537
+
1538
+ logger = open(os.path.normpath(output+"/logs/"+"FastAAI_preprocessing_log.txt"), "a")
1539
+ print("file", "start_date", "end_date", "starting_format",
1540
+ "prot_prediction_time", "trans_table", "hmm_search_time", "besthits_time",
1541
+ "errors", sep = "\t", file = logger)
1542
+
1543
+ pool = multiprocessing.Pool(threads, initializer = hmm_preproc_initializer,
1544
+ initargs = (hmm_file, do_compress,))
1545
+
1546
+ for result in pool.imap(run_build, imported_files.in_files):
1547
+ #log data, regardless of kind
1548
+ print(result.basename, result.start_time, result.end_time, result.initial_state,
1549
+ result.prot_pred_time, result.trans_table, result.hmm_search_time, result.besthits_time,
1550
+ result.err_log, sep = "\t", file = logger)
1551
+
1552
+ if result.best_hits_kmers is not None:
1553
+ genome_name = result.original_name
1554
+
1555
+ if genome_name in existing_genome_IDs:
1556
+ print(genome_name, "is already present in the final database and will be skipped.")
1557
+ print("")
1558
+ else:
1559
+ protein_count = result.protein_count
1560
+ for acc_name in result.best_hits_kmers:
1561
+ if acc_name not in accs_seen:
1562
+ accs_seen[acc_name] = 0
1563
+ acc_id = ok[acc_name]
1564
+ kmer_ct = result.protein_kmer_count[acc_name]
1565
+ kmers = result.best_hits_kmers[acc_name]
1566
+ kmers = kmers.tobytes()
1567
+ wcurs.execute("INSERT INTO {acc}_genomes VALUES (?, ?)".format(acc=acc_name), (genome_idx, kmers,))
1568
+ new_gak.append((genome_idx, acc_id, kmer_ct,))
1569
+
1570
+ new_gens.append((genome_name, genome_idx, protein_count,))
1571
+ genome_idx += 1
1572
+
1573
+ worker.commit()
1574
+
1575
+ if verbose:
1576
+ tracker.update()
1577
+
1578
+ pool.close()
1579
+
1580
+ logger.close()
1581
+
1582
+ wcurs.executemany("INSERT INTO genome_index VALUES (?,?,?)", new_gens)
1583
+ wcurs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?,?,?)", new_gak)
1584
+ worker.commit()
1585
+
1586
+ wcurs.close()
1587
+ worker.close()
1588
+
1589
+ accs_seen = list(accs_seen.keys())
1590
+
1591
+ parent = sqlite3.connect(final_database)
1592
+ curs = parent.cursor()
1593
+
1594
+ curs.execute("attach '" + temp_db + "' as worker")
1595
+ #initialize if needed.
1596
+ curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
1597
+ curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")
1598
+
1599
+ curs.execute("INSERT INTO genome_index SELECT * FROM worker.genome_index")
1600
+ curs.execute("INSERT INTO genome_acc_kmer_counts SELECT * FROM worker.genome_acc_kmer_counts")
1601
+ curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")
1602
+ parent.commit()
1603
+
1604
+ if verbose:
1605
+ tracker = progress_tracker(total = len(accs_seen), message = "Collecting results")
1606
+ else:
1607
+ print("Collecting results")
1608
+
1609
+ pool = multiprocessing.Pool(threads, initializer = acc_transformer_init,
1610
+ initargs = (temp_db, td,))
1611
+
1612
+ for result in pool.imap_unordered(acc_transformer, accs_seen):
1613
+ database, accession = result[0], result[1]
1614
+ curs.execute("CREATE TABLE IF NOT EXISTS {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=accession))
1615
+ curs.execute("CREATE TABLE IF NOT EXISTS {acc}_genomes (genome INTEGER PRIMARY KEY, kmers array)".format(acc=accession))
1616
+ curs.execute("CREATE INDEX IF NOT EXISTS {acc}_index ON {acc}(kmer)".format(acc=accession))
1617
+
1618
+ #Get the genomes from worker db.
1619
+ curs.execute("INSERT INTO {acc}_genomes SELECT * FROM worker.{acc}_genomes".format(acc=accession))
1620
+
1621
+ parent.commit()
1622
+
1623
+ accdb = sqlite3.connect(database)
1624
+ acc_curs = accdb.cursor()
1625
+
1626
+ to_update = acc_curs.execute("SELECT kmer, genomes, genomes FROM {acc}".format(acc=accession)).fetchall()
1627
+
1628
+ acc_curs.close()
1629
+ accdb.close()
1630
+
1631
+ update_concat_sql = "INSERT INTO {acc} VALUES (?,?) ON CONFLICT(kmer) DO UPDATE SET genomes=genomes || (?)".format(acc=accession)
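+ #Genome lists are stored as raw int32 bytes, so SQLite's || operator appends the new
+ #genome IDs to an existing kmer's blob, e.g. a blob for [1, 2] || a blob for [7] -> [1, 2, 7].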
1632
+ #ON CONFLICT(kmer) DO UPDATE SET genomes=genomes || acc.{acc}.genomes;".format(acc=accession)
1633
+ #print(update_concat_sql)
1634
+ curs.executemany(update_concat_sql, to_update)
1635
+
1636
+ parent.commit()
1637
+
1638
+ os.remove(database)
1639
+
1640
+ if verbose:
1641
+ tracker.update()
1642
+
1643
+ pool.close()
1644
+
1645
+ curs.execute("detach worker")
1646
+
1647
+ parent.commit()
1648
+
1649
+ curs.close()
1650
+ parent.close()
1651
+
1652
+ os.remove(temp_db)
1653
+ try:
1654
+ if len(os.listdir(td)) == 0:
1655
+ shutil.rmtree(td)
1656
+ except:
1657
+ pass
1658
+
1659
+ if success:
1660
+ print("Database build complete!")
1661
+
1662
+ return success
1663
+
1664
+ def file_v_db_initializer(tgak, tgt_names, tgt_cts, hmm_file, do_compress, tgt_ct, sd, out, style, in_mem, build_q, tdb):
1665
+ #num_tgts, self.do_sd, self.output, self.style, self.as_mem_db, self.do_db_build
1666
+ global _tdb
1667
+ _tdb = tdb
1668
+
1669
+ global _tgt_gak
1670
+ _tgt_gak = tgak
1671
+
1672
+ global _tname
1673
+ _tname = tgt_names
1674
+
1675
+ global _tct
1676
+ _tct = tgt_cts
1677
+
1678
+ global hmm_manager
1679
+ hmm_manager = pyhmmer_manager(do_compress)
1680
+ hmm_manager.load_hmm_from_file(hmm_file)
1681
+
1682
+ global num_tgts
1683
+ num_tgts = tgt_ct
1684
+
1685
+ global _do_sd
1686
+ _do_sd = sd
1687
+
1688
+ global out_style
1689
+ out_style = style
1690
+
1691
+ global out_base
1692
+ out_base = out
1693
+
1694
+ global db_is_in_mem
1695
+ db_is_in_mem = in_mem
1696
+
1697
+ global make_query_db
1698
+ make_query_db = build_q
1699
+
1700
+ return _tdb, _tgt_gak, _tname, _tct, hmm_manager, num_tgts, _do_sd, out_base, out_style, db_is_in_mem, make_query_db
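+
+ #Globals assigned in this initializer persist in each worker process for the lifetime
+ #of the pool, so the shared data does not have to be re-pickled for every task.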
1701
+
1702
+ def file_v_db_worker(query_args):
1703
+ #query info for this particular query
1704
+ in_file = query_args[0]
1705
+
1706
+ in_file.preprocess()
1707
+
1708
+ qname = in_file.basename
1709
+
1710
+ do_sd = _do_sd
1711
+
1712
+ #std dev. calcs are not meaningful with matrix style output.
1713
+ if out_style == "matrix":
1714
+ do_sd = False
1715
+
1716
+ if do_sd:
1717
+ results = []
1718
+ shared_acc_counts = []
1719
+ else:
1720
+ results = np.zeros(shape = num_tgts, dtype = np.float_)
1721
+ shared_acc_counts = np.zeros(shape = num_tgts, dtype = np.int32)
1722
+
1723
+ if db_is_in_mem:
1724
+ #The connection is already given as MDB if the db is in mem
1725
+ tconn = _tdb
1726
+ else:
1727
+ #db is on disk and the connection has to be established.
1728
+ tconn = sqlite3.connect(_tdb)
1729
+
1730
+ tcurs = tconn.cursor()
1731
+
1732
+ #This is a difference from the DB-first method.
1733
+ acc_idx = generate_accessions_index(forward = True)
1734
+
1735
+ genome_lists = {}
1736
+
1737
+ tcurs.row_factory = lambda cursor, row: row[0]
1738
+
1739
+
1740
+ if make_query_db:
1741
+ ret = [qname, None, []]
1742
+ else:
1743
+ ret = [qname, None, None]
1744
+
1745
+ #We need to purge accessions not in the target.
1746
+ for acc in in_file.best_hits_kmers:
1747
+ one = in_file.best_hits_kmers[acc]
1748
+ acc_id = acc_idx[acc]
1749
+
1750
+ if make_query_db:
1751
+ ret[2].append((qname, acc_id, one.tobytes(),))
1752
+
1753
+ #Only query accessions that are also present in the target database.
1754
+ if acc_id in _tgt_gak:
1755
+
1756
+ kmer_ct = one.shape[0]
1757
+
1758
+ if do_sd:
1759
+ hits = np.zeros(shape = num_tgts, dtype = np.int32)
1760
+ hits[np.nonzero(_tgt_gak[acc_id])] = 1
1761
+ shared_acc_counts.append(hits)
1762
+ else:
1763
+ shared_acc_counts[np.nonzero(_tgt_gak[acc_id])] += 1
1764
+
1765
+ #SQLite limits statements to 999 bound parameters by default, so larger kmer sets go through a temp table join instead.
1766
+ if kmer_ct > 998:
1767
+ #Each kmer needs to be a tuple.
1768
+ these_kmers = [(int(kmer),) for kmer in one]
1769
+
1770
+ temp_name = "_" + qname +"_" + acc
1771
+ temp_name = temp_name.replace(".", "_")
1772
+
1773
+ tcurs.execute("CREATE TEMP TABLE " + temp_name + " (kmer INTEGER)")
1774
+ tconn.commit()
1775
+ insert_table = "INSERT INTO " + temp_name + " VALUES (?)"
1776
+ tcurs.executemany(insert_table, these_kmers)
1777
+ tconn.commit()
1778
+ join_and_select_sql = "SELECT genomes FROM " + temp_name + " INNER JOIN " + acc + " ON "+ temp_name+".kmer = " + acc+".kmer;"
1779
+
1780
+ set = tcurs.execute(join_and_select_sql).fetchall()
1781
+ else:
1782
+ #kmers must be a list, not a tuple.
1783
+ these_kmers = [int(kmer) for kmer in one]
1784
+ select = "SELECT genomes FROM " + acc + " WHERE kmer IN ({kmers})".format(kmers=','.join(['?']*len(these_kmers)))
1785
+
1786
+ set = tcurs.execute(select, these_kmers).fetchall()
1787
+
1788
+ #join results into one bytestring.
1789
+ set = b''.join(set)
1790
+
1791
+ these_intersections = np.bincount(np.frombuffer(set, dtype = np.int32), minlength = num_tgts)
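+ #e.g. genome IDs [0, 2, 2] with num_tgts = 4 bincount to per-target intersection counts [1, 0, 2, 0].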
1792
+ set = None
1793
+ #Add tgt kmer counts to query kmer counts, find union size based on intersection size, calc jacc.
1794
+ jacc = np.divide(these_intersections, np.subtract(np.add(_tgt_gak[acc_id], kmer_ct), these_intersections))
1795
+
1796
+ if do_sd:
1797
+ results.append(jacc)
1798
+ else:
1799
+ results += jacc
1800
+
1801
+ tcurs.row_factory = None
1802
+ tcurs.close()
1803
+
1804
+ if do_sd:
1805
+ results = np.vstack(results)
1806
+ has_accs = np.vstack(shared_acc_counts)
1807
+
1808
+ shared_acc_counts = np.sum(has_accs, axis = 0)
1809
+
1810
+ #final jacc_means
1811
+ jaccard_averages = np.divide(np.sum(results, axis = 0), shared_acc_counts)
1812
+
1813
+ aai_ests = numpy_kaai_to_aai(jaccard_averages)
1814
+
1815
+ #find diffs from means; this includes indices corresponding to unshared SCPs that should not be included.
1816
+ results = results - jaccard_averages
1817
+
1818
+ #fix those corresponding indices to not contribute to the final SD.
1819
+ results[np.nonzero(has_accs == 0)] = 0
1820
+
1821
+ #Square them
1822
+ results = np.square(results)
1823
+ #Sum squares and divide by shared acc. count, the sqrt to get SD.
1824
+ jaccard_SDs = np.sqrt(np.divide(np.sum(results, axis = 0), shared_acc_counts))
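+ #i.e. the population standard deviation of the Jaccard values, taken over shared SCPs only.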
1825
+ jaccard_SDs = np.round(jaccard_SDs, 4).astype(str)
1826
+
1827
+ else:
1828
+ #other condition.
1829
+ jaccard_SDs = None
1830
+ jaccard_averages = np.divide(results, shared_acc_counts)
1831
+ #We don't want to pass char arrays back to main, so for matrix output the AAI conversion is skipped here and done in main instead.
1832
+ if out_style != "matrix":
1833
+ aai_ests = numpy_kaai_to_aai(jaccard_averages)
1834
+
1835
+ del results
1836
+
1837
+ #Since the outputs go to separate files, it makes more sense to do them within the worker processes instead of in main.
1838
+ if out_style == "tsv":
1839
+ no_hit = np.where(shared_acc_counts == 0)
1840
+
1841
+ possible_hits = np.minimum(len(in_file.best_hits_kmers), _tct).astype(str)
1842
+ jaccard_averages = np.round(jaccard_averages, 4).astype(str)
1843
+ shared_acc_counts = shared_acc_counts.astype(str)
1844
+
1845
+ jaccard_averages[no_hit] = "N/A"
1846
+ aai_ests[no_hit] = "N/A"
1847
+ shared_acc_counts[no_hit] = "N/A"
1848
+ possible_hits[no_hit] = "N/A"
1849
+
1850
+ output_name = os.path.normpath(out_base + "/"+qname+"_results.txt")
1851
+
1852
+ out = open(output_name, "w")
1853
+ out.write("query\ttarget\tavg_jacc_sim\tjacc_SD\tnum_shared_SCPs\tposs_shared_SCPs\tAAI_estimate\n")
1854
+ if do_sd:
1855
+ jaccard_SDs[no_hit] = "N/A"
1856
+ for i in range(0, len(aai_ests)):
1857
+ out.write(qname+"\t"+_tname[i]+"\t"+jaccard_averages[i]+"\t"+jaccard_SDs[i]+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
1858
+ else:
1859
+ for i in range(0, len(aai_ests)):
1860
+ out.write(qname+"\t"+_tname[i]+"\t"+jaccard_averages[i]+"\t"+"N/A"+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
1861
+ out.close()
1862
+
1863
+
1864
+ #We're just gonna pass this back to the main to print.
1865
+ if out_style == "matrix":
1866
+ ret[1] = jaccard_averages
1867
+
1868
+ return ret
1869
+
1870
+ #Handles both query and target types for a db vs db query
1871
+ class file_vs_db_query:
1872
+ def __init__(self, in_memory = False, input_file_objects = None,
1873
+ target = None, threads = 1, do_sd = False, output_base = "FastAAI", output_style = "tsv",
1874
+ build_db_from_queries = True, qdb_name = "Query_FastAAI_database.db", hmm_path = None,
1875
+ do_comp = True, verbose = True):
1876
+ #files to work with
1877
+ self.queries = input_file_objects
1878
+ self.do_db_build = build_db_from_queries
1879
+ self.dbname = qdb_name
1880
+
1881
+ self.t = target
1882
+ self.valids = None
1883
+
1884
+ #Originally this was written for an in-memory database only, but a single if/else change makes it work on disk as well, so it didn't need a rework.
1885
+ self.as_mem_db = in_memory
1886
+
1887
+ self.t_conn = None
1888
+ self.t_curs = None
1889
+
1890
+ self.threads = threads
1891
+ self.do_sd = do_sd
1892
+
1893
+ self.output_base = output_base
1894
+ self.output = os.path.normpath(output_base + "/results")
1895
+ self.style = output_style
1896
+
1897
+ if hmm_path is not None:
1898
+ self.hmm_path = hmm_path
1899
+ else:
1900
+ self.hmm_path = find_hmm()
1901
+
1902
+ self.do_comp = do_comp
1903
+
1904
+ self.verbose = verbose
1905
+
1906
+ '''
1907
+ Workflow is:
1908
+ load target db as mem (optional)
1909
+ assess valid targets
1910
+ create query db output (optional)
1911
+ pass query args to workers
1912
+ preproc query args
1913
+ write results
1914
+ fill query_db_out (optional)
1915
+ '''
1916
+
1917
+
1918
+ def open(self):
1919
+ if self.as_mem_db:
1920
+ self.t_conn = sqlite3.connect(':memory:')
1921
+ else:
1922
+ self.t_conn = sqlite3.connect(self.t)
1923
+
1924
+ self.t_curs = self.t_conn.cursor()
1925
+
1926
+ if self.as_mem_db:
1927
+ self.t_curs.execute("attach '" + self.t + "' as targets")
1928
+
1929
+ self.t_curs.execute("CREATE TABLE genome_index AS SELECT * FROM targets.genome_index")
1930
+ self.t_curs.execute("CREATE TABLE genome_acc_kmer_counts AS SELECT * FROM targets.genome_acc_kmer_counts")
1931
+ self.t_curs.execute("CREATE INDEX t_gi ON genome_index (gen_id)")
1932
+ self.t_curs.execute("CREATE INDEX t_gak ON genome_acc_kmer_counts (accession)")
1933
+
1934
+ if self.as_mem_db:
1935
+ table_sql = "SELECT name FROM targets.sqlite_master"
1936
+ else:
1937
+ table_sql = "SELECT name FROM sqlite_master"
1938
+
1939
+
1940
+ ok = generate_accessions_index()
1941
+ ok_names = set(list(ok.keys()))
1942
+ successful_tables = []
1943
+
1944
+ for name in self.t_curs.execute(table_sql).fetchall():
1945
+ name = name[0]
1946
+ if name in ok_names:
1947
+ successful_tables.append(ok[name])
1948
+ if self.as_mem_db:
1949
+ self.t_curs.execute("CREATE TABLE " + name + " AS SELECT * FROM targets."+name)
1950
+ self.t_curs.execute("CREATE INDEX "+name+"_index ON " + name+" (kmer)" )
1951
+
1952
+ if self.as_mem_db:
1953
+ self.t_conn.commit()
1954
+ self.t_curs.execute("detach targets")
1955
+
1956
+ self.valids = tuple(successful_tables)
1957
+
1958
+ def close(self):
1959
+ self.t_curs.close()
1960
+ self.t_curs = None
1961
+
1962
+ def clean_up(self):
1963
+ self.t_conn.close()
1964
+ self.t_conn = None
1965
+
1966
+ def sqlite_table_schema(self, conn, name):
1967
+ """Return a string representing the table's CREATE"""
1968
+ cursor = conn.execute("SELECT sql FROM sqlite_master WHERE name=?;", [name])
1969
+ sql = cursor.fetchone()[0]
1970
+ cursor.close()
1971
+ return sql
1972
+
1973
+ def execute(self):
1974
+ print("FastAAI is running.")
1975
+ tgt_id_res = self.t_curs.execute("SELECT * FROM genome_index ORDER BY gen_id").fetchall()
1976
+
1977
+ tgt_ids = []
1978
+ tgt_naming = []
1979
+ tgt_counts = []
1980
+ for r in tgt_id_res:
1981
+ genome, id, prot_ct = r[0], r[1], r[2]
1982
+ tgt_ids.append(genome)
1983
+ tgt_naming.append(genome)
1984
+ tgt_counts.append(prot_ct)
1985
+
1986
+ num_tgts = len(tgt_ids)
1987
+ tgt_counts = np.array(tgt_counts, dtype = np.int32)
1988
+
1989
+ tgts_gak = {}
1990
+ gak_sql = "SELECT * FROM genome_acc_kmer_counts WHERE accession in ({accs})".format(accs=','.join(['?']*len(self.valids)))
1991
+
1992
+ for result in self.t_curs.execute(gak_sql, self.valids).fetchall():
1993
+ genome, acc, ct = result[0], result[1], result[2]
1994
+ if acc not in tgts_gak:
1995
+ tgts_gak[acc] = np.zeros(num_tgts, dtype = np.int32)
1996
+ tgts_gak[acc][genome] += ct
1997
+
1998
+ #If the DB is a memory DB, we need to maintain the connection, but neither case needs to maintain the cursor in main.
1999
+ self.close()
2000
+
2001
+ query_groups = []
2002
+
2003
+ for query_input in self.queries:
2004
+ query_groups.append((query_input,))
2005
+
2006
+ #And if it's a physical database, we do want to close it.
2007
+ if not self.as_mem_db:
2008
+ self.t_conn.close()
2009
+
2010
+ num_queries = len(query_groups)
2011
+
2012
+ if self.do_db_build:
2013
+ sqlite3.register_converter("array", convert_array)
2014
+ qdb_path = os.path.normpath(self.output_base + "/database/"+self.dbname)
2015
+ if not os.path.exists(os.path.normpath(self.output_base + "/database")):
2016
+ try:
2017
+ os.mkdir(os.path.normpath(self.output_base + "/database"))
2018
+ except:
2019
+ print("Couldn't make database at", qdb_path)
2020
+ self.do_db_build = False
2021
+
2022
+ if os.path.exists(qdb_path):
2023
+ print("Database for queries already exists. I can't make one at:", qdb_path)
2024
+ self.do_db_build = False
2025
+ else:
2026
+ query_db_conn = sqlite3.connect(qdb_path)
2027
+ q_curs = query_db_conn.cursor()
2028
+ q_curs.execute("CREATE TABLE storage (genome INTEGER, accession INTEGER, kmers array)")
2029
+ q_curs.execute("CREATE INDEX store_idx ON storage (genome, accession)")
2030
+ query_genome_index = []
2031
+ qgi_ct = 0
2032
+ qg_gak = []
2033
+
2034
+ if self.verbose:
2035
+ tracker = progress_tracker(total = num_queries, message = "Calculating AAI...", one_line = True)
2036
+
2037
+ if self.style == "matrix":
2038
+ output_name = os.path.normpath(self.output + "/FastAAI_matrix.txt")
2039
+ output = open(output_name, "w")
2040
+ #needs target names.
2041
+ print("query_genome", *tgt_ids, sep = "\t", file = output)
2042
+
2043
+ #Set up the arguments that must be passed to the worker initializers.
2044
+
2045
+ #both initializers will share this.
2046
+ shared_args = [tgts_gak, tgt_naming, tgt_counts, self.hmm_path, self.do_comp, num_tgts, self.do_sd, self.output,
2047
+ self.style, self.as_mem_db, self.do_db_build]
2048
+
2049
+ if self.as_mem_db:
2050
+ shared_args.append(self.t_conn)
2051
+ shared_args = tuple(shared_args)
2052
+ pool = multiprocessing.Pool(self.threads, initializer = file_v_db_initializer,
2053
+ initargs = shared_args)
2054
+ else:
2055
+ #db is on disk, so pass its path and let each worker open its own connection.
2056
+ shared_args.append(self.t)
2057
+ shared_args = tuple(shared_args)
2058
+ pool = multiprocessing.Pool(self.threads, initializer = file_v_db_initializer,
2059
+ initargs = shared_args)
2060
+
2061
+ for result in pool.imap(file_v_db_worker, query_groups):
2062
+ if self.verbose:
2063
+ tracker.update()
2064
+ qname = result[0]
2065
+ if self.style == "matrix":
2066
+ printout = numpy_kaai_to_aai(result[1])
2067
+ print(qname, *printout, sep = "\t", file = output)
2068
+
2069
+ if self.do_db_build:
2070
+ query_genome_index.append((qname, qgi_ct, len(result[2]),))
2071
+ for row in result[2]:
2072
+ num_kmers = int(len(row[2])/4)
2073
+ qg_gak.append((qgi_ct, row[1], num_kmers,))
2074
+ qgi_ct += 1
2075
+ q_curs.executemany("INSERT INTO storage VALUES (?, ?, ?)", result[2])
2076
+ query_db_conn.commit()
2077
+
2078
+ pool.close()
2079
+
2080
+ if self.style == "matrix":
2081
+ output.close()
2082
+
2083
+ if self.do_db_build:
2084
+ q_curs.execute("CREATE TABLE genome_index (genome text, gen_id integer, protein_count integer)")
2085
+ q_curs.execute("CREATE TABLE genome_acc_kmer_counts (genome integer, accession integer, count integer)")
2086
+ q_curs.executemany("INSERT INTO genome_index VALUES (?,?,?)", query_genome_index)
2087
+ q_curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?,?,?)", qg_gak)
2088
+ query_db_conn.commit()
2089
+
2090
+ acc_id_to_name = generate_accessions_index(forward = False)
2091
+ qgi_dict = {}
2092
+ for tup in query_genome_index:
2093
+ qgi_dict[tup[0]] = tup[1]
2094
+
2095
+ accs_in_db = q_curs.execute("SELECT DISTINCT(accession) FROM genome_acc_kmer_counts").fetchall()
2096
+ if self.verbose:
2097
+ tracker = progress_tracker(total = len(accs_in_db), message = "Crafting database from query outputs.", one_line = True)
2098
+
2099
+ for acc in accs_in_db:
2100
+ acc = acc[0]
2101
+ acc_name = acc_id_to_name[acc]
2102
+ q_curs.execute("CREATE TABLE " + acc_name + " (kmer INTEGER PRIMARY KEY, genomes array)")
2103
+ q_curs.execute("CREATE TABLE " + acc_name + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")
2104
+ data = q_curs.execute("SELECT genome, kmers FROM storage WHERE accession = ?", (acc,)).fetchall()
2105
+
2106
+ ins = []
2107
+ #group by kmer
2108
+ kmers_by_gen = {}
2109
+ for row in data:
2110
+ gen = row[0]
2111
+ gen = qgi_dict[gen]
2112
+ kmers = np.frombuffer(row[1], dtype = np.int32)
2113
+ ins.append((gen, kmers,))
2114
+ for k in kmers:
2115
+ #typecast
2116
+ k = int(k)
2117
+ if k not in kmers_by_gen:
2118
+ kmers_by_gen[k] = []
2119
+ kmers_by_gen[k].append(gen)
2120
+
2121
+ data = None
2122
+
2123
+ q_curs.executemany("INSERT INTO "+ acc_name + "_genomes VALUES (?,?)", ins)
2124
+
2125
+ ins = []
2126
+ for k in kmers_by_gen:
2127
+ dat = kmers_by_gen[k]
2128
+ dat = np.sort(np.array(dat, dtype = np.int32))
2129
+ ins.append((k, dat.tobytes()))
2130
+
2131
+ q_curs.executemany("INSERT INTO "+ acc_name + " VALUES (?,?)", ins)
2132
+
2133
+ ins = None
2134
+
2135
+ query_db_conn.commit()
2136
+
2137
+ q_curs.execute("CREATE INDEX IF NOT EXISTS " + acc_name + "_index ON " + acc_name + " (kmer)")
2138
+
2139
+ if self.verbose:
2140
+ tracker.update()
2141
+
2142
+
2143
+ q_curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")
2144
+ q_curs.execute("DROP INDEX store_idx")
2145
+ q_curs.execute("DROP TABLE storage")
2146
+ query_db_conn.commit()
2147
+ q_curs.execute("VACUUM")
2148
+ query_db_conn.commit()
2149
+ q_curs.close()
2150
+ query_db_conn.close()
2151
+
2152
+ #Actually run the thing.
2153
+ def run(self):
2154
+ self.open()
2155
+ self.execute()
2156
+ #Clean up the db connections; free the mem.
2157
+ self.clean_up()
2158
+
2159
+ def numpy_kaai_to_aai(kaai_array):
2160
+ #aai_hat = (-0.3087057 + 1.810741 * (np.exp(-(-0.2607023 * np.log(kaai))**(1/3.435))))*100
2161
+
2162
+ #Protect the original jaccard averages memory item
2163
+ aai_hat_array = kaai_array.copy()
2164
+
2165
+ non_zero = np.where(aai_hat_array > 0)
2166
+ is_zero = np.where(aai_hat_array <= 0)
2167
+
2168
+ #I broke this down into its original components
2169
+ #Avoid zeroes in log - still actually works, but it produces warnings I don't want to see.
2170
+ aai_hat_array[non_zero] = np.log(aai_hat_array[non_zero])
2171
+
2172
+ aai_hat_array = np.multiply(np.subtract(np.multiply(np.exp(np.negative(np.power(np.multiply(aai_hat_array, -0.2607023), (1/3.435)))), 1.810741), 0.3087057), 100)
2173
+ '''
2174
+ Same as the above, broken down into easier-to-follow steps.
2175
+ aai_hat_array = np.multiply(aai_hat_array, -0.2607023)
2176
+ aai_hat_array = np.power(aai_hat_array, (1/3.435))
2177
+ aai_hat_array = np.negative(aai_hat_array)
2178
+ aai_hat_array = np.exp(aai_hat_array)
2179
+ aai_hat_array = np.multiply(aai_hat_array, 1.810741)
2180
+ aai_hat_array = np.subtract(aai_hat_array, 0.3087057)
2181
+ aai_hat_array = np.multiply(aai_hat_array, 100)
2182
+ '''
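+ #Worked example: a mean Jaccard of 0.5 gives
+ #(-0.3087057 + 1.810741 * exp(-(0.2607023 * 0.693147)**(1/3.435))) * 100, or roughly 67.7% AAI.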
2183
+
2184
+ #<30 and >90 values
2185
+ smol = np.where(aai_hat_array < 30)
2186
+ big = np.where(aai_hat_array > 90)
2187
+
2188
+ aai_hat_array = np.round(aai_hat_array, 2)
2189
+
2190
+ #Convert to final printables
2191
+ aai_hat_array = aai_hat_array.astype(str)
2192
+ aai_hat_array[smol] = "<30%"
2193
+ aai_hat_array[big] = ">90%"
2194
+ #The math of the above ends up with zero values being big, so we fix those.
2195
+ aai_hat_array[is_zero] = "<30%"
2196
+
2197
+ return aai_hat_array
2198
+
2199
+ #Also includes a multiply by 100 and type conversion compared to original - this is some silliness for saving memory.
2200
+ def numpy_kaai_to_aai_just_nums(kaai_array, as_float = False):
2201
+ #aai_hat = (-0.3087057 + 1.810741 * (np.exp(-(-0.2607023 * np.log(kaai))**(1/3.435))))*100
2202
+
2203
+ #Protect the original jaccard averages memory item
2204
+ aai_hat_array = kaai_array.copy()
2205
+
2206
+ non_zero = np.where(aai_hat_array > 0)
2207
+ is_zero = np.where(aai_hat_array <= 0)
2208
+
2209
+ #I broke this down into its original components
2210
+ #Avoid zeroes in log - still actually works, but it produces warnings I don't want to see.
2211
+ aai_hat_array[non_zero] = np.log(aai_hat_array[non_zero])
2212
+
2213
+ aai_hat_array = np.multiply(np.subtract(np.multiply(np.exp(np.negative(np.power(np.multiply(aai_hat_array, -0.2607023), (1/3.435)))), 1.810741), 0.3087057), 100)
2214
+ '''
2215
+ Same as the above, broken down into easier-to-follow steps.
2216
+ aai_hat_array = np.multiply(aai_hat_array, -0.2607023)
2217
+ aai_hat_array = np.power(aai_hat_array, (1/3.435))
2218
+ aai_hat_array = np.negative(aai_hat_array)
2219
+ aai_hat_array = np.exp(aai_hat_array)
2220
+ aai_hat_array = np.multiply(aai_hat_array, 1.810741)
2221
+ aai_hat_array = np.subtract(aai_hat_array, 0.3087057)
2222
+ aai_hat_array = np.multiply(aai_hat_array, 100)
2223
+ '''
2224
+
2225
+ aai_hat_array = np.round(aai_hat_array, 2)
2226
+
2227
+ #<30 and >90 values
2228
+ smol = np.where(aai_hat_array < 30)
2229
+ big = np.where(aai_hat_array > 90)
2230
+
2231
+ #Sentinel values; the <30 and >90 cases can be identified again later.
2232
+ aai_hat_array[smol] = 15
2233
+ aai_hat_array[big] = 95
2234
+
2235
+ if as_float:
2236
+ aai_hat_array = np.round(aai_hat_array, 2)
2237
+ else:
2238
+ aai_hat_array = np.multiply(aai_hat_array, 100)
2239
+ aai_hat_array = np.round(aai_hat_array, 2)
2240
+ aai_hat_array = aai_hat_array.astype(np.int16)
2241
+
2242
+ return aai_hat_array
2243
+
2244
+
2245
+ def curtime():
2246
+ time_format = "%d/%m/%Y %H:%M:%S"
2247
+ timer = datetime.datetime.now()
2248
+ time = timer.strftime(time_format)
2249
+ return time
2250
+
2251
+ #Perform a minimal-memory query of a target database from input files. A lighter-weight option for low-memory systems.
2252
+ def sql_query_opts():
2253
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
2254
+ description='''
2255
+ This FastAAI module takes one or many genomes, proteins, or proteins and HMMs as a QUERY and searches them against an existing FastAAI database TARGET using SQL
2256
+ If you only have a few genomes - or not enough RAM to hold the entire target database in memory - this is probably the best option for you.
2257
+
2258
+ To provide files, supply either a directory containing only one type of file (e.g. only genomes in FASTA format), a file containing paths to files of a type, 1 per line,
2259
+ or a comma-separated list of files of a single type (no spaces)
2260
+
2261
+ If you provide FastAAI with genomes or only proteins (not proteins and HMMs), this FastAAI module will produce the required protein and HMM files as needed
2262
+ and place them in the output directory, just like it does while building a database.
2263
+
2264
+ Once these inputs are ready to be queried against the database (each has both a protein and HMM file), they will be processed independently, 1 per thread at a time.
2265
+
2266
+ Note: Protein and HMM files generated during this query can be supplied to build a FastAAI database from proteins and HMMs using the build_db module, without redoing preprocessing.
2267
+ ''')
2268
+
2269
+ parser.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'Genomes in FASTA format.')
2270
+ parser.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'Protein amino acids in FASTA format.')
2271
+ parser.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'HMM search files produced by FastAAI on a set of proteins.')
2272
+
2273
+ parser.add_argument('--target', dest = 'target', default = None, help = 'A path to the FastAAI database you wish to use as the target')
2274
+
2275
+ parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory where FastAAI will place the result of this query and any protein or HMM files it has to generate. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')
2276
+ parser.add_argument('--output_style', dest = "style", default = 'tsv', help = "Either 'tsv' or 'matrix'. Matrix produces a simplified output of only AAI estimates.")
2277
+ parser.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indices. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')
2278
+
2279
+ parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
2280
+ parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
2281
+
2282
+ parser.add_argument('--in_memory', dest = "in_mem", action = 'store_true', help = 'Load the target database into memory before querying. Consumes more RAM, but is faster and reduces file I/O substantially.')
2283
+
2284
+ parser.add_argument('--create_query_db', dest = "make_db", action = 'store_true', help = 'Create a query database from the genomes.')
2285
+ parser.add_argument('--query_db_name', dest = "qdb_name", default = "Query_FastAAI_db.db", help = 'Name the query database. This file must not already exist.')
2286
+
2287
+ parser.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')
2288
+
2289
+ args, unknown = parser.parse_known_args()
2290
+
2291
+ return parser, args
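+
+ #Illustrative usage sketch (hypothetical subcommand syntax and paths):
+ # fastaai sql_query --genomes new_genomes/ --target FastAAI_out/database/FastAAI_database.sqlite.db --output query_out --threads 4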
2292
+
2293
+ def sql_query_thread_starter(kmer_cts, protein_cts):
2294
+ global target_kmer_cts
2295
+ global target_protein_counts
2296
+ target_kmer_cts = kmer_cts
2297
+ target_protein_counts = protein_cts
2298
+
2299
+ #Adapted from a function in FastAAI 2.0.
2300
+ class fastaai_file_importer:
2301
+ def __init__(self, genomes = None, proteins = None, hmms = None, crystals = None,
2302
+ output = "FastAAI", compress = False, crystalize = False):
2303
+ #genomes, prots, hmms can be supplied as either directory, a file with paths 1/line, or comma-sep paths. Type is determined automatically.
2304
+ self.genomes = genomes
2305
+ self.proteins = proteins
2306
+ self.hmms = hmms
2307
+ self.crystals = crystals
2308
+
2309
+ self.genome_list = None
2310
+ self.protein_list = None
2311
+ self.hmm_list = None
2312
+ self.crystal_list = None
2313
+
2314
+ self.crystalize = crystalize
2315
+
2316
+ #file base names.
2317
+ self.identifiers = None
2318
+
2319
+ self.error = False
2320
+
2321
+ self.in_files = None
2322
+
2323
+ self.status = "genome"
2324
+ self.output = output
2325
+
2326
+ self.do_comp = compress
2327
+
2328
+ def retrieve_files(self, arg):
2329
+ done = False
2330
+ files = []
2331
+ names = []
2332
+ #Case where a directory is supplied.
2333
+ if os.path.isdir(arg):
2334
+ for file in sorted(os.listdir(arg)):
2335
+ #Retrieve file name
2336
+ if file.endswith(".gz"):
2337
+ name = os.path.splitext(os.path.basename(file[:-3]))[0]
2338
+ else:
2339
+ name = os.path.splitext(os.path.basename(file))[0]
2340
+
2341
+ names.append(name)
2342
+ files.append(os.path.abspath(os.path.normpath(arg + '/' +file)))
2343
+
2344
+ done = True
2345
+
2346
+
2347
+ #Case where a file containing paths is supplied.
2348
+ if os.path.isfile(arg):
2349
+ handle = agnostic_reader(arg)
2350
+ for line in handle:
2351
+ file = line.strip()
2352
+ if os.path.exists(file):
2353
+ if file.endswith(".gz"):
2354
+ name = os.path.splitext(os.path.basename(file[:-3]))[0]
2355
+ else:
2356
+ name = os.path.splitext(os.path.basename(file))[0]
2357
+
2358
+ names.append(name)
2359
+ files.append(os.path.abspath(os.path.normpath(file)))
2360
+
2361
+ handle.close()
2362
+ done = True
2363
+
2364
+ if len(names) == 0 and len(files) == 0:
2365
+ #Try interpreting the file as a singular path.
2366
+ done = False
2367
+
2368
+ #Last check.
2369
+ if not done:
2370
+ for file in arg.split(","):
2371
+ if os.path.exists(file):
2372
+ if file.endswith(".gz"):
2373
+ name = os.path.splitext(os.path.basename(file[:-3]))[0]
2374
+ else:
2375
+ name = os.path.splitext(os.path.basename(file))[0]
2376
+
2377
+ names.append(name)
2378
+ files.append(os.path.abspath(os.path.normpath(file)))
2379
+
2380
+ return files, names
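+ #Illustrative: retrieve_files("a.fna,b.fna.gz") would return absolute paths plus the
+ #basenames ["a", "b"]; a directory or a file of paths, one per line, resolves the same way.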
2381
+
2382
+ #Determine whether the inputs are genomes, proteins, or proteins+HMMs.
2383
+ def determine_inputs(self):
2384
+ if self.genomes is not None:
2385
+ self.genome_list, self.identifiers = self.retrieve_files(self.genomes)
2386
+ if self.proteins is not None or self.hmms is not None:
2387
+ print("You can supply genomes or proteins or proteins and HMMS, but not genomes and anything else.")
2388
+ self.error = True
2389
+
2390
+ #Proteins, but no HMMs
2391
+ if self.proteins is not None and self.hmms is None:
2392
+ self.protein_list, self.identifiers = self.retrieve_files(self.proteins)
2393
+
2394
+ if self.proteins is not None and self.hmms is not None:
2395
+ self.protein_list, prot_names = self.retrieve_files(self.proteins)
2396
+ self.hmm_list, hmm_names = self.retrieve_files(self.hmms)
2397
+
2398
+ if len(self.protein_list) != len(self.hmm_list):
2399
+ print("Different number of proteins and HMMs supplied. You must supply the same number of each, and they must be matched pairs.")
2400
+ self.error = True
2401
+ else:
2402
+ all_same = True
2403
+ for p, h in zip(prot_names, hmm_names):
2404
+ if p != h:
2405
+ all_same = False
2406
+
2407
+ if all_same:
2408
+ self.identifiers = prot_names
2409
+ prot_names = None
2410
+ hmm_names = None
2411
+ else:
2412
+ self.error = True
2413
+
2414
+ if self.crystals is not None:
2415
+ self.crystal_list, self.identifiers = self.retrieve_files(self.crystals)
2416
+ #The crystal naming scheme includes an identifier at the end. This removes it.
2417
+ self.identifiers = [id[:-13] for id in self.identifiers]
2418
+
2419
+
2420
+ if not self.error:
2421
+ self.prep_input_files()
2422
+
2423
+ def prep_input_files(self):
2424
+ self.in_files = []
2425
+ if self.genome_list is not None:
2426
+ self.status = "genome"
2427
+ for g in self.genome_list:
2428
+ f = input_file(g, output = self.output, do_compress = self.do_comp, make_crystal = self.crystalize)
2429
+ f.set_genome(g)
2430
+ self.in_files.append(f)
2431
+
2432
+ if self.protein_list is not None:
2433
+ self.status = "protein"
2434
+ for p in self.protein_list:
2435
+ f = input_file(p, output = self.output, do_compress = self.do_comp, make_crystal = self.crystalize)
2436
+ f.set_protein(p)
2437
+ self.in_files.append(f)
2438
+
2439
+ if self.hmm_list is not None:
2440
+ self.status = "protein+HMM"
2441
+ for h, f in zip(self.hmm_list, self.in_files):
2442
+ f.set_hmm(h)
2443
+
2444
+ def sql_query(genomes, proteins, hmms, db_name, output, threads, verbose, do_stdev, style, in_mem, make_db, qdb_name, do_comp):
2445
+
2446
+ if not os.path.exists(db_name):
2447
+ print("")
2448
+ print("FastAAI can't find your database:", db_name)
2449
+ print("Are you sure that the path you've given to the database is correct and that the database exists?")
2450
+ print("FastAAI exiting.")
2451
+ print("")
2452
+ sys.exit()
2453
+
2454
+ #importer opts
2455
+ #genomes = None, proteins = None, hmms = None, crystals = None
2456
+ imported_files = fastaai_file_importer(genomes = genomes, proteins = proteins, hmms = hmms, output = output)
2457
+ imported_files.determine_inputs()
2458
+
2459
+ if imported_files.error:
2460
+ print("Exiting FastAAI due to input file error.")
2461
+ quit()
2462
+
2463
+ good_to_go = prepare_directories(output, imported_files.status, "query")
2464
+
2465
+ if not good_to_go:
2466
+ print("Exiting FastAAI")
2467
+ sys.exit()
2468
+
2469
+ print("")
2470
+
2471
+ '''
2472
+ self, in_memory = False, input_file_objects = None,
2473
+ target = None, threads = 1, do_sd = False, output_base = "FastAAI", output_style = "tsv",
2474
+ build_db_from_queries = True, qdb_name = "Query_FastAAI_database.db", hmm_path = "00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm",
2475
+ do_comp = True, verbose = True
2476
+ '''
2477
+ hmm_path = find_hmm()
2478
+
2479
+ mdb = file_vs_db_query(in_memory = in_mem, input_file_objects = imported_files.in_files, target=db_name,
2480
+ threads = threads, output_base = output, do_sd = do_stdev, output_style = style, do_comp = do_comp,
2481
+ build_db_from_queries = make_db, qdb_name = qdb_name, verbose = verbose, hmm_path = hmm_path)
2482
+
2483
+ mdb.run()
2484
+
2485
+ #The query database, if requested, was built inside mdb.run().
2486
+
2487
+
2488
+ print("FastAAI query complete! Results at:", os.path.normpath(output + "/results"))
2489
+ return None
2490
+
2491
+ #Manages the query process.
2492
+ def db_query_opts():
2493
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
2494
+ description='''
2495
+ This FastAAI module takes two FastAAI databases and searches all of the genomes in the QUERY against all of the genomes in the TARGET
2496
+
2497
+ If you have many genomes (more than 1000), it will be faster to create the query database using FastAAI build_db,
2498
+ then search it against an existing target using this module than it is to do the same thing with an SQL query.
2499
+
2500
+ If you give the same database as query and target, a special all vs. all search of the genomes in the database will be done.
2501
+ ''')
2502
+ parser.add_argument('-q', '--query', dest = 'query', default = None, help = 'Path to the query database. The genomes FROM the query will be searched against the genomes in the target database')
2503
+ parser.add_argument('-t', '--target', dest = 'target', default = None, help = 'Path to the target database.')
2504
+
2505
+ parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory where FastAAI will place the result of this query. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')
2506
+ parser.add_argument('--output_style', dest = "style", default = 'tsv', help = "Either 'tsv' or 'matrix'. Matrix produces a simplified output of only AAI estimates.")
2507
+ parser.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indices. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')
2508
+
2509
+ parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
2510
+ parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
2511
+ parser.add_argument('--in_memory', dest = "in_mem", action = 'store_true', help = 'Load both databases into memory before querying. Consumes more RAM, but is faster and reduces file I/O substantially. Consider reducing the number of threads.')
2512
+ parser.add_argument('--store_results', dest = "storage", action = 'store_true', help = 'Keep partial results in memory. Only works with --in_memory. Fewer writes, but more RAM. Default off.')
2513
+
2514
+ args, unknown = parser.parse_known_args()
2515
+
2516
+ return parser, args
2517
+
2518
+
2519
+ #db-db query; in-mem
2520
+ def parse_db_init(query, target, outpath):
2521
+ global qdb
2522
+ qdb = query
2523
+ global tdb
2524
+ tdb = target
2525
+ global output_path
2526
+ output_path = outpath
2527
+
2528
+ global query_gak
2529
+ global target_gak
2530
+
2531
+ return qdb, tdb, output_path
2532
+
2533
+ def parse_accession(acc):
2534
+ tmp = sqlite3.connect(":memory:")
2535
+ curs = tmp.cursor()
2536
+
2537
+ curs.execute("attach '" + qdb + "' as queries")
2538
+ curs.execute("attach '" + tdb + "' as targets")
2539
+
2540
+ sql = '''
2541
+ SELECT queries.{acc}.genomes, targets.{acc}.genomes
2542
+ FROM queries.{acc} INNER JOIN targets.{acc}
2543
+ ON queries.{acc}.kmer=targets.{acc}.kmer
2544
+ '''.format(acc = acc)
2545
+
2546
+ res = curs.execute(sql).fetchall()
2547
+
2548
+ curs.execute("detach queries")
2549
+ curs.execute("detach targets")
2550
+
2551
+ curs.close()
2552
+ tmp.close()
2553
+
2554
+ tl = []
2555
+ ql = {}
2556
+
2557
+ acc_id = generate_accessions_index()
2558
+ acc_id = acc_id[acc]
2559
+
2560
+ indexer = 0
2561
+ for r in res:
2562
+ queries = np.frombuffer(r[0], dtype = np.int32)
2563
+ tgt = np.frombuffer(r[1], dtype = np.int32)
2564
+ tl.append(tgt)
2565
+
2566
+ for q in queries:
2567
+ if q not in ql:
2568
+ ql[q] = {}
2569
+ if acc_id not in ql[q]:
2570
+ ql[q][acc_id] = []
2571
+
2572
+ ql[q][acc_id].append(indexer)
2573
+
2574
+ indexer += 1
2575
+
2576
+ tl = np.array(tl, dtype = object)
2577
+
2578
+ for q in ql:
2579
+ if acc_id in ql[q]:
2580
+ ql[q][acc_id] = np.array(ql[q][acc_id], dtype=np.int32)
2581
+
2582
+ out_file = os.path.normpath(output_path+"/"+acc+".pickle")
2583
+
2584
+ with open(out_file, "wb") as out:
2585
+ pickle.dump([ql, tl], out)
2586
+
2587
+ return([acc, out_file])
2588
+
2589
+ #All of this is exclusive to the in-memory approach for the db-db query.
2590
+ def one_init(ql, tl, num_tgt, qgak_queue, tgak, tpres, sd, sty, output_dir, store_results, progress_queue, qnames, tnames, temp_dir):
2591
+ global _ql
2592
+ _ql = ql
2593
+ global _tl
2594
+ _tl = tl
2595
+ global _nt
2596
+ _nt = num_tgt
2597
+
2598
+ qgak_data = qgak_queue.get()
2599
+
2600
+ global out_base
2601
+ out_base = output_dir
2602
+
2603
+ global group_id
2604
+ group_id = os.path.normpath(temp_dir + "/partial_results_group_" + str(qgak_data[0])+ ".txt")
2605
+
2606
+ global _qgak
2607
+ _qgak = qgak_data[1]
2608
+
2609
+ global query_grouping
2610
+ query_grouping = qgak_data[2]
2611
+
2612
+ qgak_data = None
2613
+
2614
+ global _tgak
2615
+ _tgak = tgak
2616
+
2617
+ global _tpres
2618
+ _tpres = tpres
2619
+
2620
+ global _tct
2621
+ _tct = np.sum(_tpres, axis = 0)
2622
+
2623
+ global do_sd
2624
+ do_sd = sd
2625
+ global style
2626
+ style = sty
2627
+ #Suppress div by zero warning - it's handled.
2628
+ np.seterr(divide='ignore')
2629
+
2630
+ global store
2631
+ store = store_results
2632
+ if store:
2633
+ global holder
2634
+ holder = []
2635
+ else:
2636
+ global outwriter
2637
+ outwriter = open(group_id, "w")
2638
+
2639
+ global prog_queue
2640
+ prog_queue = progress_queue
2641
+
2642
+ global _qnames
2643
+ _qnames = qnames
2644
+
2645
+ global _tnames
2646
+ _tnames = tnames
2647
+
2648
+ def one_work(placeholder):
2649
+ for q in query_grouping:
2650
+ results = []
2651
+ #We also need to count the accs in the query genome that are not part of the inner join.
2652
+ for acc in _qgak[q][0]:
2653
+ if acc in _ql[q]:
2654
+ #the bincount is intersections.
2655
+ these_intersections = np.bincount(np.concatenate(_tl[acc][_ql[q][acc]]), minlength = _nt)
2656
+ else:
2657
+ #there are no intersections even though this accession is shared with at least one target
2658
+ #number of intersects is all zeros
2659
+ these_intersections = np.zeros(_nt, dtype = np.int32)
2660
+
2661
+ #Append the counts or zeros, either way.
2662
+ results.append(these_intersections)
2663
+
2664
+ results = np.vstack(results)
2665
+
2666
+ target_kmer_counts = _tgak[_qgak[q][0], :]
2667
+
2668
+ #unions = size(A) + size(B) - size(intersections(A, B))
2669
+ #unions = target_kmer_counts + query_kmers_by_acc - intersections
2670
+ unions = np.subtract(np.add(target_kmer_counts, _qgak[q][1][:, None]), results)
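+ #e.g. 5 query kmers and 7 target kmers with 3 shared: union = 5 + 7 - 3 = 9, Jaccard = 3/9.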
2671
+
2672
+ #These are now jaccards, not #intersections
2673
+ results = np.divide(results, unions)
2674
+
2675
+ shared_acc_counts = np.sum(_tpres[_qgak[q][0], :], axis = 0)
2676
+
2677
+ no_hit = np.where(shared_acc_counts == 0)
2678
+
2679
+ jaccard_averages = np.divide(np.sum(results, axis = 0), shared_acc_counts)
2680
+
2681
+ #Skip SD if output is matrix
2682
+ if style == "tsv":
2683
+ aai_ests = numpy_kaai_to_aai(jaccard_averages)
2684
+
2685
+ if do_sd:
2686
+ #find diffs from means; this includes indices corresponding to unshared SCPs that should not be included.
2687
+ results = results - jaccard_averages
2688
+
2689
+ #fix those corresponding indices to not contribute to the final SD.
2690
+ results[np.logical_not(_tpres[_qgak[q][0], :])] = 0
2691
+ #results[np.nonzero(has_accs == 0)] = 0
2692
+
2693
+ #Square them; 0^2 = 0, so we don't have to think about the fixed indices any more.
2694
+ results = np.square(results)
2695
+ #Sum squares and divide by shared acc. count, the sqrt to get SD.
2696
+ jaccard_SDs = np.sqrt(np.divide(np.sum(results, axis = 0), shared_acc_counts))
2697
+ jaccard_SDs = np.round(jaccard_SDs, 4).astype(str)
2698
+
2699
+ no_hit = np.where(shared_acc_counts == 0)
2700
+
2701
+ #_qgak[q][0].shape[0] is the query acc count
2702
+ possible_hits = np.minimum(_qgak[q][0].shape[0], _tct).astype(str)
2703
+
2704
+ jaccard_averages = np.round(jaccard_averages, 4).astype(str)
2705
+ shared_acc_counts = shared_acc_counts.astype(str)
2706
+
2707
+ jaccard_averages[no_hit] = "N/A"
2708
+ aai_ests[no_hit] = "N/A"
2709
+ shared_acc_counts[no_hit] = "N/A"
2710
+ possible_hits[no_hit] = "N/A"
2711
+
2712
+ qname = _qnames[q]
2713
+
2714
+ output_name = os.path.normpath(out_base + "/results/"+qname+"_results.txt")
2715
+
2716
+ out = open(output_name, "w")
2717
+ out.write("query\ttarget\tavg_jacc_sim\tjacc_SD\tnum_shared_SCPs\tposs_shared_SCPs\tAAI_estimate\n")
2718
+ if do_sd:
2719
+ jaccard_SDs[no_hit] = "N/A"
2720
+ for i in range(0, len(aai_ests)):
2721
+ out.write(qname+"\t"+_tnames[i]+"\t"+jaccard_averages[i]+"\t"+jaccard_SDs[i]+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
2722
+ else:
2723
+ for i in range(0, len(aai_ests)):
2724
+ out.write(qname+"\t"+_tnames[i]+"\t"+jaccard_averages[i]+"\t"+"N/A"+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
2725
+ out.close()
2726
+
2727
+
2728
+ else:
2729
+ if store:
2730
+ aai_ests = numpy_kaai_to_aai_just_nums(jaccard_averages, as_float = False)
2731
+ aai_ests[no_hit] = 0
2732
+ #add zeros at misses/NAs
2733
+ holder.append(aai_ests)
2734
+ else:
2735
+ aai_ests = numpy_kaai_to_aai_just_nums(jaccard_averages, as_float = True)
2736
+ aai_ests[no_hit] = 0
2737
+ print(*aai_ests, sep = "\t", file = outwriter)
2738
+
2739
+ prog_queue.put(q)
2740
+
2741
+ prog_queue.put("done")
2742
+
2743
+ return None
2744
+
2745
+ def two_work(i):
2746
+ if store:
2747
+ hold_together = np.vstack(holder)
2748
+ np.savetxt(group_id, hold_together, delimiter = "\t", fmt='%4d')
2749
+ else:
2750
+ outwriter.close()
2751
+
2752
+ return group_id
2753
+
2754
+ def on_disk_init(query_database_path, target_database_path, num_tgt, query_queue, target_gak, tpres, sd, sty, output_dir, progress_queue, qnames, tnames, valids, temp_dir):
2755
+ global database
2756
+ database = sqlite3.connect(":memory:")
2757
+
2758
+ curs = database.cursor()
2759
+ curs.execute("attach '" + query_database_path + "' as queries")
2760
+ curs.execute("attach '" + target_database_path + "' as targets")
2761
+ curs.close()
2762
+
2763
+ global _nt
2764
+ _nt = num_tgt
2765
+
2766
+ qgak_data = query_queue.get()
2767
+
2768
+ global out_base
2769
+ out_base = output_dir
2770
+
2771
+ global group_id
2772
+ group_id = os.path.normpath(temp_dir + "/partial_results_group_" + str(qgak_data[0])+ ".txt")
2773
+
2774
+ global _qgak
2775
+ _qgak = qgak_data[1]
2776
+
2777
+ global query_grouping
2778
+ query_grouping = qgak_data[2]
2779
+
2780
+ global _tgak
2781
+ _tgak = target_gak
2782
+
2783
+ global _tpres
2784
+ _tpres = tpres
2785
+
2786
+ global _tct
2787
+ _tct = np.sum(_tpres, axis = 0)
2788
+
2789
+ global do_sd
2790
+ do_sd = sd
2791
+ global style
2792
+ style = sty
2793
+ #Suppress div by zero warning - it's handled.
2794
+ np.seterr(divide='ignore')
2795
+
2796
+ if style == "matrix":
2797
+ global outwriter
2798
+ outwriter = open(group_id, "w")
2799
+
2800
+ global prog_queue
2801
+ prog_queue = progress_queue
2802
+
2803
+ global _qnames
2804
+ _qnames = qnames
2805
+
2806
+ global _tnames
2807
+ _tnames = tnames
2808
+
2809
+ global acc_indexer
2810
+ acc_indexer = generate_accessions_index(forward = False)
2811
+
2812
+ global _valids
2813
+ _valids = valids
2814
+
2815
+ def on_disk_work_one(placeholder):
2816
+ curs = database.cursor()
2817
+ for q in query_grouping:
2818
+ results = []
2819
+ qname = _qnames[q]
2820
+ for acc in _qgak[q][0]:
2821
+ acc_name = acc_indexer[acc]
2822
+
2823
+ if acc_name in _valids:
2824
+
2825
+ one = curs.execute("SELECT kmers FROM queries."+acc_name+"_genomes WHERE genome=?", (str(q),)).fetchone()[0]
2826
+ one = np.frombuffer(one, dtype = np.int32)
2827
+
2828
+ if one.shape[0] > 998:
2829
+ #Each kmer needs to be a tuple.
2830
+ these_kmers = [(int(kmer),) for kmer in one]
2831
+
2832
+ temp_name = "_" + qname +"_" + acc_name
2833
+ temp_name = temp_name.replace(".", "_")
2834
+
2835
+ curs.execute("CREATE TEMP TABLE " + temp_name + " (kmer INTEGER)")
2836
+ insert_table = "INSERT INTO " + temp_name + " VALUES (?)"
2837
+ curs.executemany(insert_table, these_kmers)
2838
+
2839
+ join_and_select_sql = "SELECT genomes FROM " + temp_name + " INNER JOIN targets." + acc_name + " ON "+ temp_name+".kmer = targets." + acc_name + ".kmer;"
2840
+
2841
+ matches = curs.execute(join_and_select_sql).fetchall()
2842
+ else:
2843
+ #kmers must be a list, not a tuple.
2844
+ these_kmers = [int(kmer) for kmer in one]
2845
+ select = "SELECT genomes FROM targets." + acc_name + " WHERE kmer IN ({kmers})".format(kmers=','.join(['?']*len(these_kmers)))
2846
+ matches = curs.execute(select, these_kmers).fetchall()
2847
+
2848
+ set = []
2849
+ for row in matches:
2850
+ set.append(row[0])
2851
+ set = b''.join(set)
2852
+
2853
+ matches = None
2854
+ these_intersections = np.bincount(np.frombuffer(set, dtype = np.int32), minlength = _nt)
2855
+ set = None
2856
+ results.append(these_intersections)
2857
+
2858
+ else:
2859
+ results.append(np.zeros(_nt, dtype=np.int32))
2860
+
2861
+ results = np.vstack(results)
2862
+
2863
+ target_kmer_counts = _tgak[_qgak[q][0], :]
2864
+
2865
+ #unions = size(A) + size(B) - size(intersections(A, B))
2866
+ #unions = target_kmer_counts + query_kmers_by_acc - intersections
2867
+ unions = np.subtract(np.add(target_kmer_counts, _qgak[q][1][:, None]), results)
2868
+
2869
+ #These are now jaccards, not #intersections
2870
+ results = np.divide(results, unions)
2871
+
2872
+ shared_acc_counts = np.sum(_tpres[_qgak[q][0], :], axis = 0)
2873
+
2874
+ no_hit = np.where(shared_acc_counts == 0)
2875
+
2876
+ jaccard_averages = np.divide(np.sum(results, axis = 0), shared_acc_counts)
2877
+
2878
+ #Skip SD if output is matrix
2879
+ if style == "tsv":
2880
+ aai_ests = numpy_kaai_to_aai(jaccard_averages)
2881
+
2882
+ if do_sd:
2883
+ #find diffs from means; this includes indices corresponding to unshared SCPs that should not be included.
2884
+ results = results - jaccard_averages
2885
+
2886
+ #fix those corresponding indices to not contribute to the final SD.
2887
+ results[np.logical_not(_tpres[_qgak[q][0], :])] = 0
2888
+ #results[np.nonzero(has_accs == 0)] = 0
2889
+
2890
+ #Square them; 0^2 = 0, so we don't have to think about the fixed indices any more.
2891
+ results = np.square(results)
2892
+ #Sum squares and divide by shared acc. count, the sqrt to get SD.
2893
+ jaccard_SDs = np.sqrt(np.divide(np.sum(results, axis = 0), shared_acc_counts))
2894
+ jaccard_SDs = np.round(jaccard_SDs, 4).astype(str)
2895
+
2896
+ no_hit = np.where(shared_acc_counts == 0)
2897
+
2898
+ #_qgak[q][0] is the query acc count
2899
+ possible_hits = np.minimum(_qgak[q][0].shape[0], _tct).astype(str)
2900
+
2901
+ jaccard_averages = np.round(jaccard_averages, 4).astype(str)
2902
+ shared_acc_counts = shared_acc_counts.astype(str)
2903
+
2904
+ jaccard_averages[no_hit] = "N/A"
2905
+ aai_ests[no_hit] = "N/A"
2906
+ shared_acc_counts[no_hit] = "N/A"
2907
+ possible_hits[no_hit] = "N/A"
2908
+
2909
+ output_name = os.path.normpath(out_base + "/results/"+qname+"_results.txt")
2910
+
2911
+ out = open(output_name, "w")
2912
+ out.write("query\ttarget\tavg_jacc_sim\tjacc_SD\tnum_shared_SCPs\tposs_shared_SCPs\tAAI_estimate\n")
2913
+ if do_sd:
2914
+ jaccard_SDs[no_hit] = "N/A"
2915
+ for i in range(0, len(aai_ests)):
2916
+ out.write(qname+"\t"+_tnames[i]+"\t"+jaccard_averages[i]+"\t"+jaccard_SDs[i]+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
2917
+ else:
2918
+ for i in range(0, len(aai_ests)):
2919
+ out.write(qname+"\t"+_tnames[i]+"\t"+jaccard_averages[i]+"\t"+"N/A"+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
2920
+ out.close()
2921
+
2922
+ else:
2923
+ aai_ests = numpy_kaai_to_aai_just_nums(jaccard_averages, as_float = True)
2924
+ aai_ests[no_hit] = 0
2925
+ print(*aai_ests, sep = "\t", file = outwriter)
2926
+
2927
+ prog_queue.put(q)
2928
+
2929
+ curs.close()
2930
+ prog_queue.put("done")
2931
+
2932
+ def on_disk_work_two(i):
2933
+ outwriter.close()
2934
+ return group_id
2935
+
2936
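+ #Natural sort: e.g. "file_10" sorts after "file_2".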
+ def sorted_nicely(l):
2937
+ convert = lambda text: int(text) if text.isdigit() else text
2938
+ alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
2939
+ return sorted(l, key = alphanum_key)
2940
+
2941
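+ #Runs a query-database vs. target-database AAI comparison, either fully in memory or from disk.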
+ class db_db_remake:
2942
+ def __init__(self, in_memory = False, store_mat_res = False,
2943
+ query = None, target = None, threads = 1, do_sd = False,
2944
+ output_base = "FastAAI", output_style = "tsv", verbose = True):
2945
+
2946
+ #databases to eat
2947
+ self.q = query
2948
+ self.t = target
2949
+
2950
+ #metadata
2951
+ self.ok = generate_accessions_index(forward = True)
2952
+ self.rev = generate_accessions_index(forward = False)
2953
+ self.valids = None
2954
+
2955
+ #Originally written as an in-memory-only block of code, but a single if/else change makes it work on disk too, so it needs no redevelopment.
2956
+ self.as_mem_db = in_memory
2957
+ self.store_mat = store_mat_res
2958
+
2959
+ #in-mem stuff
2960
+ self.conn = None
2961
+ self.curs = None
2962
+
2963
+ self.threads = threads
2964
+ self.do_sd = do_sd
2965
+
2966
+ self.output_base = output_base
2967
+ self.output = os.path.normpath(output_base + "/results")
2968
+ self.style = output_style
2969
+
2970
+ self.query_names = None
2971
+ self.target_names = None
2972
+
2973
+ self.num_queries = None
2974
+ self.num_targets = None
2975
+
2976
+ self.query_gak = None
2977
+ self.target_gak = None
2978
+ self.target_presence = None
2979
+
2980
+ self.query_dict = None
2981
+ self.target_dict = None
2982
+
2983
+ self.verbose = verbose
2984
+
2985
+ #getting the db metadata happens the same way in every case
2986
+ def open(self):
2987
+ if self.verbose:
2988
+ print("Perusing database metadata")
2989
+
2990
+ self.conn = sqlite3.connect(":memory:")
2991
+ self.curs = self.conn.cursor()
2992
+
2993
+ self.curs.execute("attach '" + self.q + "' as queries")
2994
+ self.curs.execute("attach '" + self.t + "' as targets")
2995
+
2996
+ #Find the shared accessions for these databases
2997
+ shared_accs_sql = '''
2998
+ SELECT queries.sqlite_master.name
2999
+ FROM queries.sqlite_master INNER JOIN targets.sqlite_master
3000
+ ON queries.sqlite_master.name = targets.sqlite_master.name
3001
+ '''
3002
+ self.valids = {}
3003
+ for table in self.curs.execute(shared_accs_sql).fetchall():
3004
+ table = table[0]
3005
+ #Keep only accessions FastAAI recognizes.
3006
+ if table in self.ok:
3007
+ self.valids[table] = self.ok[table]
3008
+
3009
+ self.query_names = []
3010
+ for r in self.curs.execute("SELECT genome FROM queries.genome_index ORDER BY gen_id").fetchall():
3011
+ self.query_names.append(r[0])
3012
+
3013
+ self.target_names = []
3014
+ for r in self.curs.execute("SELECT genome FROM targets.genome_index ORDER BY gen_id").fetchall():
3015
+ self.target_names.append(r[0])
3016
+
3017
+ self.num_queries = len(self.query_names)
3018
+ self.num_targets = len(self.target_names)
3019
+
3020
+ gak_sql = '''
3021
+ SELECT * FROM {db}.genome_acc_kmer_counts
3022
+ WHERE accession in ({accs})
3023
+ ORDER BY genome
3024
+ '''
3025
+
3026
+ acc_ids = list(self.valids.values())
3027
+ acc_ids.sort()
3028
+ acc_ids = tuple(acc_ids)
3029
+
3030
+ #query genome-acc-kmers (gak) is ordered by genome first, then accession
3031
+ self.query_gak = {}
3032
+ #for result in self.curs.execute(gak_sql.format(db = "queries", accs=','.join(['?']*len(self.valids))), acc_ids).fetchall():
3033
+ for result in self.curs.execute("SELECT * FROM queries.genome_acc_kmer_counts ORDER BY genome").fetchall():
3034
+ genome, accession, kmer_ct = result[0], result[1], result[2]
3035
+ if genome not in self.query_gak:
3036
+ self.query_gak[genome] = [[],[]]
3037
+ self.query_gak[genome][0].append(accession)
3038
+ self.query_gak[genome][1].append(kmer_ct)
3039
+
3040
+ #Convert into numpy arrays for quicker array access later.
3041
+ for genome in self.query_gak:
3042
+ self.query_gak[genome] = (np.array(self.query_gak[genome][0], dtype = np.int32), np.array(self.query_gak[genome][1], dtype = np.int32))
3043
+
3044
+ #Split these into ordered groups - this makes joining results at the end easier.
3045
+ qgak_queue = multiprocessing.Queue()
3046
+ groupings = split_seq_indices(np.arange(self.num_queries), self.threads)
3047
+ group_id = 0
3048
+ for group in groupings:
3049
+ next_set = {}
3050
+ for i in range(group[0], group[1]):
3051
+ next_set[i] = self.query_gak[i]
3052
+ self.query_gak[i] = None
3053
+ #this ensures that the selection of qgak and the query index range match
3054
+ qgak_queue.put((group_id, next_set, np.arange(group[0], group[1]),))
3055
+ group_id += 1
3056
+
3057
+ self.query_gak = qgak_queue
3058
+ qgak_queue = None
3059
+
3060
+ #tgt gak is organized by accession first, then genome
3061
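+ #122 rows: one per SCP accession tracked by FastAAI.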
+ self.target_gak = np.zeros(shape = (122, self.num_targets), dtype = np.int32)
3062
+ for result in self.curs.execute(gak_sql.format(db = "targets", accs=','.join(['?']*len(self.valids))), acc_ids).fetchall():
3063
+ genome, accession, kmer_ct = result[0], result[1], result[2]
3064
+ self.target_gak[accession, genome] += kmer_ct
3065
+
3066
+ self.target_presence = self.target_gak > 0
3068
+
3069
+ #This needs to have a TSV write method
3070
+ def load_in_mem(self):
3071
+ #tempdir_path = os.path.normpath(self.output_base+"/temp")
3072
+ tempdir_path = tempfile.mkdtemp()
3073
+ #if not os.path.exists(tempdir_path):
3074
+ # os.mkdir(tempdir_path)
3075
+
3076
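+ #ql holds per-query dicts keyed by accession ID; tl holds per-accession target-side data.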
+ ql = {}
3077
+ tl = {}
3078
+ for t in self.valids.values():
3079
+ tl[t] = None
3080
+ for i in range(0, self.num_queries):
3081
+ ql[i] = {}
3082
+
3083
+ if self.verbose:
3084
+ tracker = progress_tracker(total = len(self.valids), message = "Loading data in memory.")
3085
+ else:
3086
+ print("\nLoading data in memory.")
3087
+
3088
+
3089
+ pool = multiprocessing.Pool(self.threads, initializer = parse_db_init,
3090
+ initargs = (self.q, #query
3091
+ self.t, #target
3092
+ tempdir_path,)) #outpath
3093
+
3094
+ for result in pool.imap_unordered(parse_accession, self.valids.keys()):
3095
+ this_accession = result[0]
3096
+
3097
+ this_acc_id = self.ok[this_accession]
3098
+
3099
+ with open(result[1], "rb") as inp:
3100
+ this_acc_data = pickle.load(inp)
3101
+ os.remove(result[1])
3102
+
3103
+ tl[this_acc_id] = this_acc_data[1]
3104
+
3105
+ for q in this_acc_data[0]:
3106
+ #We know that this acc must be in every ql for this loaded data.
3107
+ ql[q][this_acc_id] = this_acc_data[0][q][this_acc_id]
3108
+ if self.verbose:
3109
+ tracker.update()
3110
+
3111
+ pool.close()
3112
+
3113
+ if self.verbose:
3114
+ tracker = progress_tracker(total = self.num_queries, message = "Calculating AAI")
3115
+ else:
3116
+ print("\nCalculating AAI.")
3117
+
3118
+ query_groups = []
3119
+ for grouping in split_seq_indices(np.arange(self.num_queries), self.threads):
3120
+ query_groups.append(np.arange(grouping[0], grouping[1]))
3121
+
3122
+ result_queue = multiprocessing.Queue()
3123
+ remaining_procs = self.threads
3124
+ still_going = True
3125
+
3126
+ pool = multiprocessing.Pool(self.threads, initializer = one_init,
3127
+ initargs = (ql, #ql
3128
+ tl, #tl
3129
+ self.num_targets, #num_tgt
3130
+ self.query_gak, #qgak_queue
3131
+ self.target_gak, #tgak
3132
+ self.target_presence, #tpres
3133
+ self.do_sd, #sd
3134
+ self.style, #sty
3135
+ self.output_base, #output_dir
3136
+ self.store_mat, #store_results
3137
+ result_queue, #progress_queue
3138
+ self.query_names, #qnames
3139
+ self.target_names, #tnames
3140
+ tempdir_path,)) #temp_dir
3141
+
3142
+ some_results = pool.imap(one_work, query_groups)
3143
+
3144
+ while still_going:
3145
+ item = result_queue.get()
3146
+ if item == "done":
3147
+ remaining_procs -= 1
3148
+ if remaining_procs == 0:
3149
+ still_going = False
3150
+ else:
3151
+ if self.verbose:
3152
+ tracker.update()
3153
+ else:
3154
+ pass
3155
+
3156
+ if self.style == "matrix":
3157
+ result_files = []
3158
+
3159
+ for result in pool.map(two_work, range(0, self.threads)):
3160
+ result_files.append(result)
3161
+
3162
+ pool.close()
3163
+
3164
+ self.write_mat_from_files(result_files, tempdir_path)
3165
+ else:
3166
+ pool.close()
3167
+
3168
+ #This needs to be implemented from existing code.
3169
+ def db_on_disk(self):
3170
+ tempdir_path = tempfile.mkdtemp()
3171
+ if self.style == "matrix":
3172
+ self.store_mat = False
3173
+
3174
+ result_queue = multiprocessing.Queue()
3175
+ remaining_procs = self.threads
3176
+ still_going = True
3177
+
3178
+ if self.verbose:
3179
+ tracker = progress_tracker(total = self.num_queries, message = "Calculating AAI")
3180
+ else:
3181
+ print("\nCalculating AAI")
3182
+
3183
+ query_groups = []
3184
+ for grouping in split_seq_indices(np.arange(self.num_queries), self.threads):
3185
+ query_groups.append(np.arange(grouping[0], grouping[1]))
3186
+
3187
+ #query_database_path, target_database_path, num_tgt, query_queue, target_gak, tpres, sd,
3188
+ #sty, output_dir, progress_queue, qnames, tnames, valids, temp_dir
3189
+ pool = multiprocessing.Pool(self.threads, initializer = on_disk_init,
3190
+ initargs = (self.q, #query_database_path
3191
+ self.t, #target_database_path
3192
+ self.num_targets, #num_tgt
3193
+ self.query_gak, #query_queue
3194
+ self.target_gak, #target_gak
3195
+ self.target_presence, #tpres
3196
+ self.do_sd, #sd
3197
+ self.style, #sty
3198
+ self.output_base, #output_dir
3199
+ result_queue, #progress_queue
3200
+ self.query_names, #qnames
3201
+ self.target_names, #tnames
3202
+ self.valids, #valids
3203
+ tempdir_path,)) #temp_dir
3204
+
3205
+ some_results = pool.imap(on_disk_work_one, query_groups)
3206
+
3207
+ while still_going:
3208
+ item = result_queue.get()
3209
+ if item == "done":
3210
+ remaining_procs -= 1
3211
+ if remaining_procs == 0:
3212
+ still_going = False
3213
+ else:
3214
+ if self.verbose:
3215
+ tracker.update()
3216
+ else:
3217
+ pass
3218
+
3219
+ if self.style == "matrix":
3220
+ result_files = []
3221
+ for result in pool.map(on_disk_work_two, range(0, self.threads)):
3222
+ result_files.append(result)
3223
+
3224
+ pool.close()
3225
+
3226
+ if self.style == "matrix":
3227
+ self.write_mat_from_files(result_files, tempdir_path)
3228
+
3229
+ def write_mat_from_files(self, result_files, tempdir_path):
3230
+ #tempdir_path = os.path.normpath(self.output_base+"/temp")
3231
+
3232
+ result_files = sorted_nicely(result_files)
3233
+
3234
+ #print("Combining:")
3235
+ #for f in result_files:
3236
+ # print(f)
3237
+
3238
+ if self.verbose:
3239
+ tracker = progress_tracker(total = self.threads, step_size = 2, message = "Finalizing results.")
3240
+ else:
3241
+ print("\nFinalizing results.")
3242
+
3243
+ output_file = os.path.normpath(self.output+"/FastAAI_matrix.txt")
3244
+ final_outwriter = open(output_file, "w")
3245
+ print("query_genome\t"+'\t'.join(self.target_names), file = final_outwriter)
3246
+
3247
+ row = 0
3248
+
3249
+ for f in result_files:
3250
+ fh = open(f, "r")
3251
+ cur = fh.readlines()
3252
+ fh.close()
3253
+
3254
+ for i in range(0, len(cur)):
3255
+ if self.store_mat:
3256
+ #Add the decimals - we don't need to do this if we've been writing line-wise.
3257
+ #values will ALWAYS be 4 digits in this method, so splitting into groups of 2 works.
3258
+ cur[i] = re.sub(r"(\d{2})(\d{2})", r"\1.\2", cur[i])
3259
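+ #e.g. "9957" becomes "99.57"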
+ #Add in the query name to the row
3260
+ cur[i] = self.query_names[row]+"\t"+cur[i]
3261
+ row += 1
3262
+
3263
+ final_outwriter.write(''.join(cur))
3264
+ cur = None
3265
+
3266
+ try:
3267
+ os.remove(f)
3268
+ except:
3269
+ pass
3270
+
3271
+ if self.verbose:
3272
+ tracker.update()
3273
+
3274
+ final_outwriter.close()
3275
+
3276
+ try:
3277
+ if len(os.listdir(tempdir_path)) == 0:
3278
+ shutil.rmtree(tempdir_path)
3279
+ except:
3280
+ pass
3281
+
3282
+ def close(self):
3283
+ self.curs.close()
3284
+ self.curs = None
3285
+
3286
+ def clean_up(self):
3287
+ self.conn.close()
3288
+ self.conn = None
3289
+
3290
+ def run(self):
3291
+ self.open()
3292
+
3293
+ #work
3294
+ if self.as_mem_db:
3295
+ self.load_in_mem()
3296
+ else:
3297
+ self.db_on_disk()
3298
+
3299
+ self.close()
3300
+ self.clean_up()
3301
+
3302
+
3303
+ #Control the query process for any DB-first query.
3304
+ def db_query(query, target, verbose, output, threads, do_stdev, style, in_mem, store_results):
3305
+ print("")
3306
+
3307
+ #Sanity checks.
3308
+ if target is None:
3309
+ print("You need to supply a databasae for --target")
3310
+ sys.exit()
3311
+
3312
+ #Sanity checks.
3313
+ if query is None:
3314
+ print("You need to supply a databasae for --query")
3315
+ sys.exit()
3316
+
3317
+
3318
+
3319
+ #Sanity checks.
3320
+ if not os.path.exists(target):
3321
+ print("Target database not found. Exiting FastAAI")
3322
+ sys.exit()
3323
+
3324
+ if not os.path.exists(query):
3325
+ print("Query database not found. Exiting FastAAI")
3326
+ sys.exit()
3327
+
3328
+ #status = "exists"
3329
+ query_ok = assess_db(query)
3330
+ target_ok = assess_db(target)
3331
+
3332
+ if query_ok != "exists":
3333
+ print("Query database improperly formatted. Exiting FastAAI")
3334
+ sys.exit()
3335
+
3336
+ if target_ok != "exists":
3337
+ print("Query database improperly formatted. Exiting FastAAI")
3338
+ sys.exit()
3339
+
3340
+ #Check if the database is querying against itself.
3344
+
3345
+ if query == target:
3346
+ print("Performing an all vs. all query on", query)
3347
+ #all_vs_all = True
3348
+ else:
3349
+ print("Querying", query, "against", target)
3350
+ #all_vs_all = False
3351
+
3352
+ #Ready the output directories as needed.
3353
+ #The databases are already created, so the only state they can be in is protein+HMM.
3354
+ good_to_go = prepare_directories(output, "protein and HMM", "query")
3355
+ if not good_to_go:
3356
+ print("Exiting FastAAI")
3357
+ sys.exit()
3358
+
3359
+ #todo
3360
+ mdb = db_db_remake(in_memory = in_mem, store_mat_res = store_results, query = query, target = target, threads = threads, do_sd = do_stdev, output_base = output, output_style = style, verbose = verbose)
3361
+ mdb.run()
3362
+
3363
+ print("")
3364
+
3365
+
3366
+ #Check to see if the file exists and is a valid fastAAI db
3367
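+ #Returns one of: "exists", "wrong format", "created", or "unable to create".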
+ def assess_db(path):
3368
+ status = None
3369
+ if os.path.exists(path):
3370
+ conn = sqlite3.connect(path)
3371
+ curs = conn.cursor()
3372
+ try:
3373
+ sql = "SELECT name FROM sqlite_master WHERE type='table'"
3374
+
3375
+ curs.row_factory = lambda cursor, row: row[0]
3376
+ tables = curs.execute(sql).fetchall()
3377
+ curs.row_factory = None
3378
+
3379
+ curs.close()
3380
+ conn.close()
3381
+
3382
+ if len(tables) > 2 and "genome_index" in tables and "genome_acc_kmer_counts" in tables:
3383
+ status = "exists"
3384
+ else:
3385
+ status = "wrong format"
3386
+
3387
+ except:
3388
+ status = "wrong format"
3389
+
3390
+ else:
3391
+ try:
3392
+ conn = sqlite3.connect(path)
3393
+ conn.close()
3394
+ status = "created"
3395
+ except:
3396
+ status = "unable to create"
3397
+
3398
+ return status
3399
+
3400
+ #Add one FastAAI DB to another FastAAI DB
3401
+ def merge_db_opts():
3402
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
3403
+ description='''
3404
+ This FastAAI module allows you to add the contents of one or more FastAAI databases to another.
3405
+ You must have at least two already-created FastAAI databases using the build_db module before this module can be used.
3406
+
3407
+ Supply a comma-separated list of at least one donor database and a single recipient database.
3408
+ If the recipient already exists, then genomes in all the donors will be added to the recipient.
3409
+ If the recipient does not already exist, a new database will be created, and the contents of all the donors will be added to it.
3410
+
3411
+ Example:
3412
+ FastAAI.py merge_db --donors databases/db1.db,databases/db2.db --recipient databases/db3.db --threads 3
3413
+ This command will create a new database called "db3.db", merge the data in db1.db and db2.db, and then add the merged data into db3.db
3414
+
3415
+ Only the recipient database will be modified; the donors will be left exactly as they were before running this module.
3416
+ ''')
3417
+
3418
+ parser.add_argument('-d', '--donors', dest = 'donors', default = None, help = 'Comma-separated string of paths to one or more donor databases. The genomes FROM the donors will be added TO the recipient and the donors will be unaltered')
3419
+ parser.add_argument('--donor_file', dest = 'donor_file', default = None, help = 'File containing paths to one or more donor databases, one per line. Use EITHER this or --donors')
3420
+
3421
+ parser.add_argument('-r', '--recipient', dest = 'recipient', default = None, help = 'Path to the recipient database. Any genomes FROM the donor database not already in the recipient will be added to this database.')
3422
+
3423
+ parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
3424
+
3425
+ parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
3426
+
3427
+ args, unknown = parser.parse_known_args()
3428
+
3429
+ return parser, args
3430
+
3431
+ def merge_db_init(indexer, table_record, donor_dbs, tempdir):
3432
+ global mgi
3433
+ mgi = indexer
3434
+ global accs_per_db
3435
+ accs_per_db = table_record
3436
+ global tdb_list
3437
+ tdb_list = donor_dbs
3438
+ global work_space
3439
+ work_space = tempdir
3440
+
3441
+ def acc_transformer_merge(acc_name_genomes):
3442
+ acc_name = acc_name_genomes.split("_genomes")[0]
3443
+ my_acc_db = os.path.normpath(work_space + "/"+acc_name+".db")
3444
+ if os.path.exists(my_acc_db):
3445
+ os.remove(my_acc_db)
3446
+
3447
+ my_db = sqlite3.connect(my_acc_db)
3448
+ curs = my_db.cursor()
3449
+ curs.execute("CREATE TABLE {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=acc_name))
3450
+ curs.execute("CREATE TABLE {acc} (genome INTEGER PRIMARY KEY, kmers array)".format(acc=acc_name_genomes))
3451
+ my_db.commit()
3452
+
3453
+ reformat = {}
3454
+ for d in tdb_list:
3455
+ simple_rows = []
3456
+ #do nothing if the acc is not in the donor.
3457
+ if acc_name_genomes in accs_per_db[d]:
3458
+ donor_conn = sqlite3.connect(d)
3459
+ dcurs = donor_conn.cursor()
3460
+ data = dcurs.execute("SELECT * FROM {acc}".format(acc=acc_name_genomes)).fetchall()
3461
+ dcurs.close()
3462
+ donor_conn.close()
3463
+
3464
+ for row in data:
3465
+ genome, kmers = row[0], row[1]
3466
+ new_index = mgi[d][genome]
3467
+ #-1 is the value indicating an already-seen genome that should not be added.
3468
+ if new_index > -1:
3469
+ simple_rows.append((new_index, kmers,))
3470
+ kmers = np.frombuffer(kmers, dtype=np.int32)
3471
+ for k in kmers:
3472
+ if k not in reformat:
3473
+ reformat[k] = []
3474
+ reformat[k].append(new_index)
3475
+
3476
+ if len(simple_rows) > 0:
3477
+ curs.executemany("INSERT INTO {acc} VALUES (?,?)".format(acc=acc_name_genomes), simple_rows)
3478
+ my_db.commit()
3479
+
3480
+ simple_rows = None
3481
+ data = None
3482
+
3483
+ to_add = []
3484
+ for k in reformat:
3485
+ as_bytes = np.array(reformat[k], dtype = np.int32)
3486
+ as_bytes = as_bytes.tobytes()
3487
+ reformat[k] = None
3488
+ to_add.append((int(k), as_bytes,))
3489
+
3490
+ curs.executemany("INSERT INTO {acc} VALUES (?, ?)".format(acc = acc_name), to_add)
3491
+
3492
+ my_db.commit()
3493
+
3494
+ to_add = None
3495
+
3496
+ curs.execute("CREATE INDEX {acc}_index ON {acc} (kmer)".format(acc=acc_name))
3497
+ my_db.commit()
3498
+
3499
+ curs.close()
3500
+ my_db.close()
3501
+
3502
+ return [my_acc_db, acc_name]
3503
+
3504
+ def merge_db(recipient, donors, donor_file, verbose, threads):
3505
+ #Prettier on the CLI
3506
+ if (donors is None and donor_file is None) or recipient is None:
3507
+ print("Either donor or target not given. FastAAI is exiting.")
3508
+ return None
3509
+
3510
+ print("")
3511
+
3512
+ if donors is not None:
3513
+ donors = donors.split(",")
3514
+
3515
+ if donor_file is not None:
3516
+ try:
3517
+ donors = []
3518
+ fh = agnostic_reader(donor_file)
3519
+ for line in fh:
3520
+ line = line.strip()
3521
+ donors.append(line)
3522
+ fh.close()
3523
+ except:
3524
+ sys.exit("Could not parse your donor file.")
3525
+
3526
+ valid_donors = []
3527
+ for d in donors:
3528
+ if os.path.exists(d):
3529
+ if d == recipient:
3530
+ print("Donor database", d, "is the same as the recipient. This database will be skipped.")
3531
+ else:
3532
+ check = assess_db(d)
3533
+ if check == "exists":
3534
+ if d not in valid_donors:
3535
+ valid_donors.append(d)
3536
+ else:
3537
+ print("It appears that database", d, "was already added to the list of donors. Did you type it twice in the list of donors? Skipping it.")
3538
+ else:
3539
+ if check == "created":
3540
+ print("Donor database", d, "not found! Skipping.")
3541
+ else:
3542
+ print("Something was wrong with supplied database:", d+". A status check found:", check)
3543
+ else:
3544
+ print("Donor database", d, "not found! Are you sure the path is correct and this donor exists? This database will be skipped.")
3545
+
3546
+ if len(valid_donors) == 0:
3547
+ print("None of the supplied donor databases were able to be accessed. FastAAI cannot continue if none of these databases are valid. Exiting.")
3548
+ sys.exit()
3549
+
3550
+ recip_check = assess_db(recipient)
3551
+
3552
+ if recip_check == "created" or recip_check == "exists":
3553
+ print("Donor databases:")
3554
+ for donor in valid_donors:
3555
+ print("\t", donor)
3556
+ print("Will be added to recipient database:", recipient)
3557
+ else:
3558
+ print("I couldn't find or create the recipient database at", recipient+".", "Does the folder you're trying to place this database in exist, and do you have permission to write files to it? FastAAI exiting.")
3559
+ sys.exit()
3560
+
3564
+
3565
+ gen_counter = 0
3566
+ multi_gen_ids = {}
3567
+ all_gens = {}
3568
+
3569
+ #Load recipient data, if any.
3570
+ if recip_check == "exists":
3571
+ conn = sqlite3.connect(recipient)
3572
+ curs = conn.cursor()
3573
+ data = curs.execute("SELECT genome, gen_id FROM genome_index").fetchall()
3574
+ tabs = curs.execute("SELECT name FROM sqlite_master").fetchall()
3575
+ curs.close()
3576
+ conn.close()
3577
+
3578
+ multi_gen_ids[recipient] = {}
3579
+ for row in data:
3580
+ genome, index = row[0], row[1]
3581
+ all_gens[genome] = 0
3582
+ multi_gen_ids[recipient][genome] = index
3583
+
3584
+ gen_counter = max(list(multi_gen_ids[recipient].values())) + 1
3585
+
3586
+ genome_index_to_add = []
3587
+ gak_to_add = []
3588
+ tables = {}
3589
+ #Donors should always exist, never be created.
3590
+ for d in valid_donors:
3591
+ #load
3592
+ conn = sqlite3.connect(d)
3593
+ curs = conn.cursor()
3594
+ data = curs.execute("SELECT * FROM genome_index").fetchall()
3595
+ tabs = curs.execute("SELECT name FROM sqlite_master").fetchall()
3596
+ gak = curs.execute("SELECT * FROM genome_acc_kmer_counts").fetchall()
3597
+ curs.close()
3598
+ conn.close()
3599
+ multi_gen_ids[d] = {}
3600
+ for row in data:
3601
+ genome, index, prot_ct = row[0], row[1], row[2]
3602
+ if genome not in all_gens:
3603
+ all_gens[genome] = 0
3604
+ #Map the donor's genome ID to its new ID in the recipient.
3605
+ multi_gen_ids[d][index] = gen_counter
3606
+ genome_index_to_add.append((genome, gen_counter, prot_ct,))
3607
+ gen_counter += 1
3608
+ else:
3609
+ #This is a remove condition for later.
3610
+ multi_gen_ids[d][index] = -1
3611
+ data = None
3612
+
3613
+ for row in gak:
3614
+ genome_id, acc_id, kmer_ct = row[0], row[1], row[2]
3615
+ new_index = multi_gen_ids[d][genome_id]
3616
+ if new_index > -1:
3617
+ gak_to_add.append((new_index, acc_id, kmer_ct,))
3618
+
3619
+ tables[d] = []
3620
+ for tab in tabs:
3621
+ tab = tab[0]
3622
+ if tab.endswith("_genomes"):
3623
+ tables[d].append(tab)
3624
+ tables[d] = set(tables[d])
3625
+
3626
+ all_tabs = set()
3627
+ for t in tables:
3628
+ all_tabs = all_tabs.union(tables[t])
3629
+
3630
+ all_tabs = list(all_tabs)
3631
+
3632
+
3633
+ temp_dir = tempfile.mkdtemp()
3634
+ try:
3635
+ if verbose:
3636
+ tracker = progress_tracker(len(all_tabs), message = "Formatting data to add to database")
3637
+ else:
3638
+ print("Formatting data to add to database")
3639
+
3640
+ conn = sqlite3.connect(recipient)
3641
+ curs = conn.cursor()
3642
+
3643
+ #indexer, table_record, donor_dbs, tempdir
3644
+ pool = multiprocessing.Pool(threads, initializer=merge_db_init, initargs = (multi_gen_ids, tables, valid_donors, temp_dir,))
3645
+
3646
+ for result in pool.imap_unordered(acc_transformer_merge, all_tabs):
3647
+ db, accession = result[0], result[1]
3648
+ curs.execute("CREATE TABLE IF NOT EXISTS {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=accession))
3649
+ curs.execute("CREATE TABLE IF NOT EXISTS {acc}_genomes (genome INTEGER PRIMARY KEY, kmers array)".format(acc=accession))
3650
+ curs.execute("CREATE INDEX IF NOT EXISTS {acc}_index ON {acc}(kmer)".format(acc=accession))
3651
+ conn.commit()
3652
+
3653
+ curs.execute("attach '" + db + "' as acc")
3654
+ conn.commit()
3655
+
3656
+ #Get the genomes from worker db.
3657
+ curs.execute("INSERT INTO {acc}_genomes SELECT * FROM acc.{acc}_genomes".format(acc=accession))
3658
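+ #genomes is selected twice: once for the INSERT value and once for the ON CONFLICT concatenation.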
+ to_update = curs.execute("SELECT kmer, genomes, genomes FROM acc.{acc}".format(acc=accession)).fetchall()
3659
+ update_concat_sql = "INSERT INTO {acc} VALUES (?,?) ON CONFLICT(kmer) DO UPDATE SET genomes=genomes || (?)".format(acc=accession)
3660
+ curs.executemany(update_concat_sql, to_update)
3661
+ conn.commit()
3662
+
3663
+ curs.execute("detach acc")
3664
+ conn.commit()
3665
+
3666
+ os.remove(db)
3667
+
3668
+ if verbose:
3669
+ tracker.update()
3670
+
3671
+ pool.close()
3672
+ pool.join()
3673
+
3674
+ curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
3675
+ curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")
3676
+
3677
+ curs.executemany("INSERT INTO genome_index VALUES (?,?,?)", genome_index_to_add)
3678
+ curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?,?,?)", gak_to_add)
3679
+
3680
+ curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")
3681
+
3682
+ conn.commit()
3683
+
3684
+ except:
3685
+ curs.close()
3686
+ conn.close()
3687
+ #Error - clean up, then exit so the success message below is not printed.
3688
+ shutil.rmtree(temp_dir)
3689
+ if recip_check == "created":
3690
+ print("Removing created database after failure.")
3691
+ os.remove(recipient)
+ sys.exit("Database merge failed. FastAAI exiting.")
3692
+ try:
3693
+ curs.close()
3694
+ conn.close()
3695
+ #Success
3696
+ shutil.rmtree(temp_dir)
3697
+ except:
3698
+ pass
3699
+
3700
+ print("\nDatabases merged!")
3701
+
3702
+ return None
3703
+
3704
+ #Query 1 genome vs. 1 target using Carlos' method - just needs query, target, threads
3705
+ def single_query_opts():
3706
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
3707
+ description='''
3708
+ This FastAAI module takes a single query genome, protein, or protein and HMM pair and a single target genome, protein, or protein and HMM pair as inputs and calculates AAI between the two.
3709
+
3710
+ If you supply a genome as either query or target, a protein and HMM file will be made for the genome.
3711
+ If you supply a protein as either query or target, an HMM file will be made for it.
3712
+ If you supply both an HMM and protein, the search will start right away. You cannot provide only an HMM.
3713
+
3714
+ No database will be built, and you cannot query multiple genomes with this module.
3715
+
3716
+ If you wish to query multiple genomes against themselves in all vs. all AAI search, use aai_index instead.
3717
+ If you wish to query multiple genomes against multiple targets, use multi_query instead.
3718
+ ''')
3719
+ parser.add_argument('-qg', '--query_genome', dest = 'query_genome', default = None, help = 'Query genome')
3720
+ parser.add_argument('-tg', '--target_genome', dest = 'target_genome', default = None, help = 'Target genome')
3721
+
3722
+ parser.add_argument('-qp', '--query_protein', dest = 'query_protein', default = None, help = 'Query protein')
3723
+ parser.add_argument('-tp', '--target_protein', dest = 'target_protein', default = None, help = 'Target protein')
3724
+
3725
+ parser.add_argument('-qh', '--query_hmm', dest = 'query_hmm', default = None, help = 'Query HMM')
3726
+ parser.add_argument('-th', '--target_hmm', dest = 'target_hmm', default = None, help = 'Target HMM')
3727
+
3728
+ parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory where FastAAI will place the result of this query. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')
3729
+
3730
+ parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
3731
+ parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
3732
+ parser.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')
3733
+
3734
+ args, unknown = parser.parse_known_args()
3735
+
3736
+ return parser, args
3737
+
3738
+ def kaai_to_aai(kaai):
3739
+ # Transform the kAAI into estimated AAI values
3740
+ aai_hat = (-0.3087057 + 1.810741 * (np.exp(-(-0.2607023 * np.log(kaai))**(1/3.435))))*100
3741
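+ #Only defined for kaai in (0, 1]; np.log(0) would yield -inf.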
+
3742
+ return aai_hat
3743
+
3744
+ #This one's unique. It doesn't do anything with the DB, which means it doesn't access any other functionality outside of the input_file class. It just advances a pair of inputs in parallel and does intersections.
3745
+ def single_query(qf, tf, output, verbose, threads, do_compress):
3746
+
3747
+ if qf.identifiers[0] == tf.identifiers[0]:
3748
+ print("You've selected the same query and target genome. The AAI is 100%.")
3749
+ print("FastAAI exiting.")
3750
+ return None
3751
+
3752
+ statuses = ["genome", "protein", "protein and hmm"]
3753
+ query_stat = statuses.index(qf.status)
3754
+ target_stat = statuses.index(tf.status)
3755
+ minimum_status = statuses[min(query_stat, target_stat)]
3756
+
3757
+ start_printouts = ["[Genome] Protein Protein+HMM", " Genome [Protein] Protein+HMM", "Genome Protein [Protein+HMM]"]
3758
+
3759
+ print("")
3760
+ print("Query start: ", start_printouts[query_stat])
3761
+ print("Target start:", start_printouts[target_stat])
3762
+ print("")
3763
+
3764
+
3765
+ qname = qf.identifiers[0]
3766
+ tname = tf.identifiers[0]
3767
+
3768
+ name = os.path.normpath(output + "/results/" + qname + "_vs_" + tname + ".aai.txt")
3769
+ print("Output will be located at", name)
3770
+
3771
+ advance_me = [qf.in_files[0], tf.in_files[0]]
3772
+ #All we need to do this.
3773
+ hmm_file = find_hmm()
3774
+ pool = multiprocessing.Pool(min(threads, 2), initializer = hmm_preproc_initializer, initargs = (hmm_file, do_compress,))
3775
+
3776
+ results = pool.map(run_build, advance_me)
3777
+
3778
+ pool.close()
3779
+ pool.join()
3780
+
3781
+ query = results[0]
3782
+ target = results[1]
3783
+
3784
+ print(query.partial_timings())
3785
+ print(target.partial_timings())
3786
+
3789
+
3790
+ accs_to_view = set(query.best_hits_kmers.keys()).intersection(set(target.best_hits_kmers.keys()))
3791
+
3792
+ results = []
3793
+ for acc in accs_to_view:
3794
+ intersect = np.intersect1d(query.best_hits_kmers[acc], target.best_hits_kmers[acc])
3795
+ intersect = intersect.shape[0]
3796
+ union = query.best_hits_kmers[acc].shape[0] + target.best_hits_kmers[acc].shape[0] - intersect
3797
+ jacc = intersect/union
3798
+ results.append(jacc)
3799
+
3800
+ results = np.array(results, dtype = np.float_)
3801
+
3802
+ jacc_mean = np.mean(results)
3803
+ jacc_std = np.std(results)
3804
+ actual_prots = len(results)
3805
+ poss_prots = max(len(query.best_hits_kmers), len(target.best_hits_kmers))
3806
+ aai_est = round(kaai_to_aai(jacc_mean), 2)
3807
+
3808
+ if aai_est > 90:
3809
+ aai_est = ">90%"
3810
+ else:
3811
+ if aai_est < 30:
3812
+ aai_est = "<30%"
3813
+
3814
+ output = open(name, "w")
3815
+
3816
+ print("query\ttarget\tavg_jacc_sim\tjacc_SD\tnum_shared_SCPs\tposs_shared_SCPs\tAAI_estimate", file = output)
3817
+ print(qname, tname, round(jacc_mean, 4), round(jacc_std, 4), actual_prots, poss_prots, aai_est, sep = "\t", file = output)
3818
+
3819
+ output.close()
3820
+
3821
+ print("query\ttarget\tavg_jacc_sim\tjacc_SD\tnum_shared_SCPs\tposs_shared_SCPs\tAAI_estimate")
3822
+ print(qname, tname, round(jacc_mean, 4), round(jacc_std, 4), actual_prots, poss_prots, aai_est, sep = "\t")
3823
+
3824
+
3825
+ print("FastAAI single query done! Estimated AAI:", aai_est)
3826
+
3827
+ def miga_merge_opts():
3828
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
3829
+ description='''
3830
+ Hello, Miguel.
3831
+
3832
+ Give one genome in nt, aa, or aa+hmm format and a database to create or add to.
3833
+ It'll add the genome as efficiently as possible.
3834
+
3835
+ The normal merge command creates parallel processes and gathers data in
3836
+ one-SCP databases to add to the main DB. Great for many genomes. A lot of extra
3837
+ work for just one.
3838
+
3839
+ This version skips the creation of subordinate DBs and just directly adds the genome.
3840
+ Faster, fewer writes, no parallel overhead.
3841
+ ''')
3842
+
3843
+ parser.add_argument('--genome', dest = 'gen', default = None, help = 'Path to one genome, FASTA format')
3844
+ parser.add_argument('--protein', dest = 'prot', default = None, help = 'Path to one protein, AA FASTA format')
3845
+ parser.add_argument('--hmm', dest = 'hmm', default = None, help = 'Path to one HMM file as predicted by FastAAI')
3846
+
3847
+ parser.add_argument('--output', dest = 'output', default = "FastAAI", help = 'Place the partial output files into a directory with this base. Default "FastAAI"')
3848
+ parser.add_argument('--target', dest = 'database', default = None, help = 'Path to the target database. The genome supplied will be added to this. The DB will be created if needed.')
3849
+
3850
+ parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
3851
+ parser.add_argument('--compress', dest = 'compress', action='store_true', help = 'Compress generated file output')
3852
+
3853
+ args, unknown = parser.parse_known_args()
3854
+
3855
+ return parser, args
3856
+
3857
+ def miga_merge(infile, target_db, verbose, do_compress):
3858
+ status = assess_db(target_db)
3859
+ if status == "wrong format":
3860
+ print("The database", target_db, "exists, but appears to not be a FastAAI database.")
3861
+ print("FastAAI will not alter this file. Quitting.")
3862
+ return None
3863
+
3864
+ if status == "unable to create":
3865
+ print("The database", target_db, "could not be created.")
3866
+ print("Are you sure that the path you gave is valid? Quitting.")
3867
+ return None
3868
+
3869
+ if verbose:
3870
+ print("Processing genome")
3871
+
3872
+ next_id = 0
3873
+ exist_gens = {}
3874
+ conn = sqlite3.connect(target_db)
3875
+ curs = conn.cursor()
3876
+ if status == 'exists':
3877
+ for row in curs.execute("SELECT * FROM genome_index ORDER BY gen_id").fetchall():
3878
+ genome, id, prot_ct = row[0], row[1], row[2]
3879
+ exist_gens[genome] = id
3880
+ next_id += 1
3881
+
3882
+ if infile.basename in exist_gens:
3883
+ print("It looks like the file you're trying to add already exists in the database.")
3884
+ print("Adding it is too likely to corrupt the database. Quitting.")
3885
+ return None
3886
+
3887
+ hmm_file = find_hmm()
3888
+ global hmm_manager
3889
+
3890
+ hmm_manager = pyhmmer_manager(do_compress)
3891
+ hmm_manager.load_hmm_from_file(hmm_file)
3892
+
3893
+ infile.preprocess()
3894
+
3895
+ if len(infile.best_hits_kmers) > 0:
3896
+
3897
+ ok = generate_accessions_index()
3898
+ gak_to_add = []
3899
+
3900
+ gen_id = np.zeros(1, dtype = np.int32)
3901
+ gen_id[0] = next_id
3902
+ gen_id = gen_id.tobytes()
3903
+
3904
+ for accession in infile.best_hits_kmers:
3905
+ acc_id = ok[accession]
3906
+ gak_to_add.append((next_id, acc_id, infile.best_hits_kmers[accession].shape[0],))
3907
+
3908
+ curs.execute("CREATE TABLE IF NOT EXISTS {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=accession))
3909
+ curs.execute("CREATE TABLE IF NOT EXISTS {acc}_genomes (genome INTEGER PRIMARY KEY, kmers array)".format(acc=accession))
3910
+ curs.execute("CREATE INDEX IF NOT EXISTS {acc}_index ON {acc}(kmer)".format(acc=accession))
3911
+
3912
+ gen_first = (next_id, infile.best_hits_kmers[accession].tobytes(),)
3913
+ curs.execute("INSERT INTO {acc}_genomes VALUES (?,?)".format(acc=accession), gen_first)
3914
+
3915
+ kmers_first = []
3916
+ for k in infile.best_hits_kmers[accession]:
3917
+ #we know there's only one genome in these cases.
3918
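+ #gen_id appears twice: once for the INSERT value, once for the ON CONFLICT concatenation.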
+ kmers_first.append((int(k), gen_id, gen_id, ))
3919
+
3920
+ update_concat_sql = "INSERT INTO {acc} VALUES (?,?) ON CONFLICT(kmer) DO UPDATE SET genomes=genomes || (?)".format(acc=accession)
3921
+
3922
+ curs.executemany(update_concat_sql, kmers_first)
3923
+
3924
+ #Safety checks.
3925
+ curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
3926
+ curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")
3927
+
3928
+ gen_idx_to_add = (infile.basename, next_id, len(infile.best_hits_kmers))
3929
+ curs.execute("INSERT INTO genome_index VALUES (?, ?, ?)", gen_idx_to_add)
3930
+ #gak was made over the loops.
3931
+ curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?,?,?)", gak_to_add)
3932
+ curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")
3933
+
3934
+ conn.commit()
3935
+
3936
+ else:
3937
+ print("No proteins to add for this genome:",infile.basename,"Database will be unaltered. Exiting.")
3938
+
3939
+ curs.close()
3940
+ conn.close()
3941
+
3942
+
3943
+ def miga_dirs(output, subdir):
3944
+ preparation_successful = True
3945
+
3946
+ if not os.path.exists(output):
3947
+ try:
3948
+ os.mkdir(output)
3949
+ except:
3950
+ print("")
3951
+ print("FastAAI tried to make output directory: '"+ output + "' but failed.")
3952
+ print("")
3953
+ print("Troubleshooting:")
3954
+ print("")
3955
+ print(" (1) Do you have permission to create directories in the location you specified?")
3956
+ print(" (2) Did you make sure that all directories other than", os.path.basename(output), "already exist?")
3957
+ print("")
3958
+ preparation_successful = False
3959
+
3960
+ if preparation_successful:
3961
+ try:
3962
+ if not os.path.exists(os.path.normpath(output + "/" + subdir)):
3963
+ os.mkdir(os.path.normpath(output + "/" + subdir))
3964
+ except:
3965
+ print("FastAAI was able to create or find", output, "but couldn't make directories there.")
3966
+ print("")
3967
+ print("This shouldn't happen. Do you have permission to write to that directory?")
3968
+
3969
+
3970
+ return preparation_successful
3971
+
3972
+ def miga_preproc_opts():
3973
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
3974
+ description='''Build module intended for use by MiGA.
3975
+
3976
+ Performs protein prediction, HMM searching, and best-hit identification, but does NOT
3977
+ build a database. Instead, it produces "crystals": tab-separated files containing the protein name,
3978
+ HMM accession, and original protein sequence for each best hit. These crystals can later be passed
3979
+ to the "miga_db_from_crystals" action to rapidly create a DB from many genomes.
3980
+ ''')
3981
+
3982
+ parser.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'A directory containing genomes in FASTA format.')
3983
+ parser.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'A directory containing protein amino acids in FASTA format.')
3984
+ parser.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'A directory containing the results of an HMM search on a set of proteins.')
3985
+
3986
+ parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')
3987
+
3988
+ parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
3989
+ parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
3990
+ parser.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')
3991
+
3992
+ args, unknown = parser.parse_known_args()
3993
+
3994
+ return parser, args
3995
+
3996
+ def run_miga_preproc(input_file):
3997
+ input_file.crystalize = True
3998
+ input_file.preprocess()
3999
+ if len(input_file.best_hits_kmers) < 1:
4000
+ input_file.best_hits_kmers = None
4001
+ input_file.err_log += " This file did not successfully complete. No SCPs could be found."
4002
+
4003
+ return input_file
4004
+
4005
+ #Produce FastAAI preprocessed files containing HMM accession and associated protein sequence
4006
+ def miga_preproc(genomes, proteins, hmms, output, threads, verbose, do_compress):
4007
+ success = True
4008
+
4009
+ imported_files = fastaai_file_importer(genomes = genomes, proteins = proteins, hmms = hmms, output = output, compress = do_compress)
4010
+ imported_files.determine_inputs()
4011
+
4012
+ if imported_files.error:
4013
+ print("Exiting FastAAI due to input file error.")
4014
+ quit()
4015
+
4016
+ #file make checks
4017
+ p, h, c, l = True, True, True, True
4018
+
4019
+ if imported_files.status == "genome":
4020
+ p = miga_dirs(output, "predicted_proteins")
4021
+ h = miga_dirs(output, "hmms")
4022
+ c = miga_dirs(output, "crystals")
4023
+
4024
+ if imported_files.status == "protein":
4025
+ h = miga_dirs(output, "hmms")
4026
+ c = miga_dirs(output, "crystals")
4027
+
4028
+ if imported_files.status == "protein+HMM":
4029
+ c = miga_dirs(output, "crystals")
4030
+
4031
+ #We always want this one.
4032
+ l = miga_dirs(output, "logs")
4033
+
4034
+ print("")
4035
+
4036
+ #Check if all created directories were successful.
4037
+ success = p and h and c and l
4038
+
4039
+ if success:
4040
+ hmm_file = find_hmm()
4041
+
4042
+ if verbose:
4043
+ tracker = progress_tracker(total = len(imported_files.in_files), message = "Processing inputs")
4044
+ else:
4045
+ print("Processing inputs")
4046
+
4047
+ #As with build_db, a processing log is written here.
4048
+
4049
+ logger = open(os.path.normpath(output+"/logs/"+"FastAAI_preprocessing_log.txt"), "a")
4050
+ print("file", "start_date", "end_date", "starting_format",
4051
+ "prot_prediction_time", "trans_table", "hmm_search_time", "besthits_time",
4052
+ "errors", sep = "\t", file = logger)
4053
+
4054
+ fail_log = open(os.path.normpath(output+"/logs/"+"FastAAI_genome_failures.txt"), "a")
4055
+
4056
+ pool = multiprocessing.Pool(threads, initializer = hmm_preproc_initializer, initargs = (hmm_file, do_compress,))
4057
+
4058
+ for result in pool.imap(run_miga_preproc, imported_files.in_files):
4059
+ #log data, regardless of kind
4060
+ print(result.basename, result.start_time, result.end_time, result.initial_state,
4061
+ result.prot_pred_time, result.trans_table, result.hmm_search_time, result.besthits_time,
4062
+ result.err_log, sep = "\t", file = logger)
4063
+
4064
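+ #run_miga_preproc sets best_hits_kmers to None when no SCPs are found.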
+ if len(result.best_hits_kmers) < 1:
4065
+ print(result.basename, file = fail_log)
4066
+
4067
+ if verbose:
4068
+ tracker.update()
4069
+
4070
+ pool.close()
4071
+ logger.close()
4072
+ fail_log.close()
4073
+
4074
+ print("FastAAI preprocessing complete!")
4075
+
4076
+ return success
4077
+
4078
+ def miga_db_from_crystals_opts():
4079
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
4080
+ description='''Takes a set of crystals produced with miga_preproc and makes a database from them.
4081
+
4082
+ Supply --crystals with a directory, file of paths, or list of paths just like --genomes in a build command.''')
4083
+
4084
+ parser.add_argument('-c', '--crystals', dest = 'crystals', default = None, help = 'A directory of crystals produced by miga_preproc, a file of crystal paths, or a list of paths.')
4085
+ parser.add_argument('-d', '--database', dest = 'db_name', default = "FastAAI_database.sqlite.db", help = 'The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory. FastAAI_database.sqlite.db by default.')
4086
+
4087
+ parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')
4088
+
4089
+ parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
4090
+ parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
4091
+ args, unknown = parser.parse_known_args()
4092
+
4093
+ return parser, args
4094
+
4095
+ #This is basically a copied function, but I'm going to ignore that for now.
4096
+ def unique_kmer_miga(seq):
4097
+ #num tetramers = len(seq) - 4 + 1, just make it -3.
4098
+ n_kmers = len(seq) - 3
4099
+
4100
+ #Converts the characters in a sequence into their ascii int value
4101
+ as_ints = np.array([ord(i) for i in seq], dtype = np.int32)
4102
+
4103
+ #create seq like 0,1,2,3; 1,2,3,4; 2,3,4,5... for each tetramer that needs a value
4104
+ kmers = np.arange(4*n_kmers)
4105
+ kmers = kmers % 4 + kmers // 4
4106
+
4107
+ #Select the characters (as ints) corresponding to each tetramer all at once and reshape into rows of 4,
4108
+ #each row corresp. to a successive tetramer
4109
+ kmers = as_ints[kmers].reshape((n_kmers, 4))
4110
+
4111
+ #Given four 2-digit numbers, these multipliers work as offsets so that all digits are preserved in order when summed
4112
+ mult = np.array([1000000, 10000, 100, 1], dtype = np.int32)
4113
+
4114
+ #the fixed values effectively offset the successive chars of the tetramer by 2 positions each time;
4115
+ #practically, this is concatenation of numbers
4116
+ #Matrix mult does this for all values at once.
4117
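+ #e.g. "ACDE" -> [65, 67, 68, 69] -> 65676869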
+ return np.unique(np.dot(kmers, mult))
4118
+
4119
+ def para_crystal_init(tdb_queue):
4120
+ global tdb
4121
+ global td_name
4122
+ tdb = tdb_queue.get()
4123
+ td_name = tdb
4124
+ tdb = initialize_blank_db(tdb)
4125
+ global ok
4126
+ ok = generate_accessions_index()
4127
+
4128
+ def initialize_blank_db(path):
4129
+ sqlite3.register_converter("array", convert_array)
4130
+ worker = sqlite3.connect(path)
4131
+ wcurs = worker.cursor()
4132
+ wcurs.execute("CREATE TABLE genome_index (genome text, gen_id integer, protein_count integer)")
4133
+ wcurs.execute("CREATE TABLE genome_acc_kmer_counts (genome integer, accession integer, count integer)")
4134
+ ok = generate_accessions_index()
4135
+ for t in ok:
4136
+ wcurs.execute("CREATE TABLE " + t + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")
4137
+ wcurs.execute("CREATE TABLE " + t + " (kmer INTEGER PRIMARY KEY, genomes array)")
4138
+
4139
+ worker.commit()
4140
+ wcurs.close()
4141
+ return worker
4142
+
4143
+ def para_crystals_to_dbs(args):
4144
+ path, name, num = args[0], args[1], args[2]
4145
+ my_gak = []
4146
+ my_qgi = []
4147
+ num_prots = 0
4148
+ curs = tdb.cursor()
4149
+ fh = agnostic_reader(path)
4150
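+ #Each crystal line is: protein_name <tab> SCP_accession <tab> protein_sequence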
+ for line in fh:
4151
+ segs = line.strip().split("\t")
4152
+ #prot_name = segs[0]
4153
+ acc_name = segs[1]
4154
+ prot_seq = segs[2]
4155
+ acc_id = ok[acc_name]
4156
+ tetramers = unique_kmer_miga(prot_seq)
4157
+ my_gak.append((num, acc_id, tetramers.shape[0]))
4158
+ tetramers = tetramers.tobytes()
4159
+ curs.execute("INSERT INTO " + acc_name + "_genomes VALUES (?,?)", (num, tetramers,))
4160
+ num_prots += 1
4161
+
4162
+ fh.close()
4163
+
4164
+ curs.execute("INSERT INTO genome_index VALUES (?, ?, ?)", (name, num, num_prots,))
4165
+ curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?, ?, ?)", my_gak)
4166
+
4167
+ tdb.commit()
4168
+ curs.close()
4169
+
4170
+ return None
4171
+
4172
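+ #Inverts each accession's genome -> kmers table into kmer -> genomes rows; SCPs with no genomes are dropped.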
+ def group_by_kmer(placeholder):
4173
+ curs = tdb.cursor()
4174
+ surviving_tables = []
4175
+ for acc in ok:
4176
+ collected_data = curs.execute("SELECT * FROM {acc}_genomes".format(acc=acc)).fetchall()
4177
+ rearrange = {}
4178
+ if len(collected_data) > 0:
4179
+ surviving_tables.append(acc)
4180
+ for row in collected_data:
4181
+ genome, tetramers = row[0], np.frombuffer(row[1], dtype = np.int32)
4182
+ for t in tetramers:
4183
+ if t not in rearrange:
4184
+ rearrange[t] = [genome]
4185
+ else:
4186
+ rearrange[t].append(genome)
4187
+
4188
+ to_add = []
4189
+ for tetra in rearrange:
4190
+ as_bytes = np.array(rearrange[tetra], dtype = np.int32).tobytes()
4191
+ rearrange[tetra] = None
4192
+ to_add.append((int(tetra), as_bytes,))
4193
+
4194
+ curs.executemany("INSERT INTO {acc} VALUES (?, ?)".format(acc=acc), to_add)
4195
+ to_add = None
4196
+ else:
4197
+ #Empty table/no genomes contained the relevant SCP
4198
+ curs.execute("DROP TABLE {acc}".format(acc = acc))
4199
+ curs.execute("DROP TABLE {acc}_genomes".format(acc = acc))
4200
+
4201
+ tdb.commit()
4202
+
4203
+ curs.close()
4204
+
4205
+ tdb.close()
4206
+
4207
+ return [td_name, surviving_tables]
4208
+
4209
+ #Merge one or many crystals into a DB.
4210
+ def miga_db_from_crystals(crystals, output, db_name, threads, verbose):
4211
+ success = True
4212
+
4213
+ imported_files = fastaai_file_importer(genomes = None, proteins = None,
4214
+ hmms = None, crystals = crystals, output = output, compress = False)
4215
+ imported_files.determine_inputs()
4216
+
4217
+ if imported_files.error:
4218
+ print("Exiting FastAAI due to input file error.")
4219
+ quit()
4220
+
4221
+ #We'll skip trying this if the file already exists.
4222
+ existing_genome_IDs = None
4223
+ final_db_path = None
4224
+ try:
4225
+ if os.path.exists(db_name):
4226
+ if os.path.isfile(db_name):
4227
+ final_db_path = db_name
4228
+ else:
4229
+ success = miga_dirs(output, "database")
4230
+ final_db_path = os.path.normpath(output+ "/database/" + db_name)
4231
+
4232
+ else:
4233
+ success = miga_dirs(output, "database")
4234
+ final_db_path = os.path.normpath(output+ "/database/" + db_name)
4235
+ except:
4236
+ print("You specified an existing file to be a database, but it does not appear to be a FastAAI database.")
4237
+ print("FastAAI will not be able to continue. Please give FastAAI a different database name and continue.")
4238
+ print("Exiting.")
4239
+ success = False
4240
+
4241
+ if os.path.exists(final_db_path):
4242
+ if os.path.isfile(final_db_path):
4243
+ parent = sqlite3.connect(final_db_path)
4244
+ curs = parent.cursor()
4245
+ existing_genome_IDs = {}
4246
+ sql_command = "SELECT genome, gen_id FROM genome_index"
4247
+ for result in curs.execute(sql_command).fetchall():
4248
+ genome = result[0]
4249
+ id = int(result[1])
4250
+ existing_genome_IDs[genome] = id
4251
+
4252
+ curs.close()
4253
+ parent.close()
4254
+
4255
+ if success:
4256
+ if existing_genome_IDs is not None:
4257
+ genome_idx = max(list(existing_genome_IDs.values()))+1
4258
+ else:
4259
+ existing_genome_IDs = {}
4260
+ genome_idx = 0
4261
+
4262
+ cryst_args = []
4263
+ for crystal_path, crystal_name in zip(imported_files.crystal_list, imported_files.identifiers):
4264
+ #the genome is implicitly dropped if it's already in the target
4265
+ if crystal_name not in existing_genome_IDs:
4266
+ existing_genome_IDs[crystal_name] = genome_idx
4267
+ cryst_args.append((crystal_path, crystal_name, genome_idx,))
4268
+ genome_idx += 1
4269
+
4270
+ final_conn = sqlite3.connect(final_db_path)
4271
+ final_curs = final_conn.cursor()
4272
+
4273
+ final_curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
4274
+ final_curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")
4275
+
4276
+ final_curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")
4277
+
4278
+ final_conn.commit()
4279
+
4280
+ temp_dir = tempfile.mkdtemp()
4281
+
4282
+ temp_db_queue = multiprocessing.Queue()
4283
+ for i in range(0, threads):
4284
+ tdb_name = os.path.normpath(temp_dir + "/temp_db_" + str(i) + ".db")
4285
+ temp_db_queue.put(tdb_name)
4286
+
4287
+ placeholder = [i for i in range(0, threads)]
4288
+
4289
+ pool = multiprocessing.Pool(threads, initializer = para_crystal_init, initargs = (temp_db_queue,))
4290
+
4291
+ if verbose:
4292
+ tracker = progress_tracker(total = len(cryst_args), message = "Importing data")
4293
+ else:
4294
+ print("Importing data")
4295
+
4296
+ for result in pool.imap_unordered(para_crystals_to_dbs, cryst_args):
4297
+ if verbose:
4298
+ tracker.update()
4299
+
4300
+ if verbose:
4301
+ tracker = progress_tracker(total = threads, message = "Formating data")
4302
+ else:
4303
+ print("Formating data")
4304
+
4305
+ for result in pool.imap_unordered(group_by_kmer, placeholder):
4306
+ dbname, surviving_tables = result[0], result[1]
4307
+
4308
+ new_conn = sqlite3.connect(dbname)
4309
+ new_curs = new_conn.cursor()
4310
+
4311
+ ngak = new_curs.execute("SELECT * FROM genome_acc_kmer_counts").fetchall()
4312
+ ngi = new_curs.execute("SELECT * FROM genome_index").fetchall()
4313
+
4314
+ final_curs.executemany("INSERT INTO genome_index VALUES (?, ?, ?)", ngi)
4315
+ final_curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?, ?, ?)", ngak)
4316
+
4317
+ final_conn.commit()
4318
+
4319
+ ngak = None
4320
+ ngi = None
4321
+
4322
+ for acc in surviving_tables:
4323
+ final_curs.execute("CREATE TABLE IF NOT EXISTS {acc}_genomes (genome INTEGER PRIMARY KEY, kmers array)".format(acc=acc))
4324
+ final_curs.execute("CREATE TABLE IF NOT EXISTS {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=acc))
4325
+ final_curs.execute("CREATE INDEX IF NOT EXISTS {acc}_index ON {acc}(kmer)".format(acc=acc))
4326
+
4327
+ curag = new_curs.execute("SELECT * FROM {acc}_genomes".format(acc=acc)).fetchall()
4328
+ final_curs.executemany("INSERT INTO {acc}_genomes VALUES (?, ?)".format(acc=acc), curag)
4329
+ curag = None
4330
+
4331
+ curaac = new_curs.execute("SELECT kmer, genomes, genomes FROM {acc}".format(acc=acc)).fetchall()
4332
+ update_concat_sql = "INSERT INTO {acc} VALUES (?,?) ON CONFLICT(kmer) DO UPDATE SET genomes=genomes || (?)".format(acc=acc)
4333
+ final_curs.executemany(update_concat_sql, curaac)
4334
+ curaac = None
4335
+
4336
+ final_conn.commit()
4337
+
4338
+
4339
+
4340
+ new_curs.close()
4341
+ new_conn.close()
4342
+
4343
+ if verbose:
4344
+ tracker.update()
4345
+
4346
+ pool.close()
4347
+
4348
+
4349
+ final_curs.close()
4350
+ final_conn.close()
4351
+
4352
+ shutil.rmtree(temp_dir)
4353
+ '''
4354
+ Main
4355
+ '''
4356
+
4357
+ #Preprocess genomes, build DB, query all vs all to self.
4358
+ def aai_index_opts():
4359
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
4360
+ description='''FastAAI module for preprocessing a set of genomes, proteins, or proteins+HMMs
4361
+ into a database, and then querying the database against itself.
4362
+
4363
+ Equivalent to running build_db and db_query in sequence. Check these modules for additional
4364
+ details on inputs.''')
4365
+
4366
+ parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')
4367
+
4368
+ parser.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'A directory containing genomes in FASTA format.')
4369
+ parser.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'A directory containing protein amino acids in FASTA format.')
4370
+ parser.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'A directory containing the results of an HMM search on a set of proteins.')
4371
+
4372
+ parser.add_argument('-d', '--database', dest = 'db_name', default = "FastAAI_database.sqlite.db", help = 'The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory. FastAAI_database.sqlite.db by default.')
4373
+
4374
+ parser.add_argument('--output_style', dest = "style", default = 'tsv', help = "Either 'tsv' or 'matrix'. Matrix produces a simplified output of only AAI estimates.")
4375
+ parser.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indices. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')
4376
+ parser.add_argument('--in_memory', dest = "in_mem", action = 'store_true', help = 'Load both databases into memory before querying. Consumes more RAM, but is faster and reduces file I/O substantially. Consider reducing the number of threads.')
4377
+ parser.add_argument('--store_results', dest = "storage", action = 'store_true', help = 'Keep partial results in memory. Only works with --in_memory. Fewer writes, but more RAM. Default off.')
4378
+
4379
+ parser.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')
4380
+ parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
4381
+ parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
4382
+
4383
+ args, unknown = parser.parse_known_args()
4384
+
4385
+ return parser, args
4386
+
4387
+ #Preprocess two sets of genomes A and B into two distinct databases Adb and Bdb, then query Adb against Bdb
4388
+ def multi_query_opts():
4389
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
4390
+ description='''FastAAI module for preprocessing two sets of input files into two separate DBs,
4391
+ then querying the DBs against each other. Not for use with already-made FastAAI databases.
4392
+
4393
+ See "build_db" action for details on file inputs.
4394
+ See "db_query" action for details on querying options.''')
4395
+
4396
+ parser.add_argument('--query_output', dest = 'qoutput', default = "FastAAI_query", help = 'Output directory for query files. Default "FastAAI_query". FastAAI will work if this directory is the same as --target_output, but this is NOT a good idea.')
4397
+ parser.add_argument('--target_output', dest = 'toutput', default = "FastAAI_target", help = 'Output directory for target files. Default "FastAAI_target". AAI results will be placed in this directory.')
4398
+
4399
+ parser.add_argument('--query_genomes', dest = 'qgenomes', default = None, help = 'Query genomes')
4400
+ parser.add_argument('--target_genomes', dest = 'tgenomes', default = None, help = 'Target genomes')
4401
+
4402
+ parser.add_argument('--query_proteins', dest = 'qproteins', default = None, help = 'Query proteins')
4403
+ parser.add_argument('--target_proteins', dest = 'tproteins', default = None, help = 'Target proteins')
4404
+
4405
+ parser.add_argument('--query_hmms', dest = 'qhmms', default = None, help = 'Query HMMs')
4406
+ parser.add_argument('--target_hmms', dest = 'thmms', default = None, help = 'Target HMMs')
4407
+
4408
+ parser.add_argument('--query_database', dest = 'qdb_name', default = "FastAAI_query_database.sqlite.db", help = 'Query database name. Default "FastAAI_query_database.sqlite.db"')
4409
+ parser.add_argument('--target_database', dest = 'tdb_name', default = "FastAAI_target_database.sqlite.db", help ='Target database name. Default "FastAAI_target_database.sqlite.db"')
4410
+
4411
+ parser.add_argument('--output_style', dest = "style", default = 'tsv', help = "Either 'tsv' or 'matrix'. Matrix produces a simplified output of only AAI estimates.")
4412
+ parser.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indices. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')
4413
+ parser.add_argument('--in_memory', dest = "in_mem", action = 'store_true', help = 'Load both databases into memory before querying. Consumes more RAM, but is faster and reduces file I/O substantially. Consider reducing the number of threads.')
4414
+ parser.add_argument('--store_results', dest = "storage", action = 'store_true', help = 'Keep partial results in memory. Only works with --in_memory. Fewer writes, but more RAM. Default off.')
4415
+
4416
+ parser.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')
4417
+ parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
4418
+ parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
4419
+
4420
+ args, unknown = parser.parse_known_args()
4421
+
4422
+ return parser, args
4423
+
4424
+
4425
+ def main():
4426
+ #The currently supported modules.
4427
+ modules = ["build_db", "merge_db", "simple_query", "db_query", "single_query", "aai_index", "multi_query", "miga_merge", "miga_preproc", "miga_db_from_crystals"]
4428
+
4429
+ #Print modules if someone just types FastAAI
4430
+ if len(sys.argv) < 2:
4431
+ print("")
4432
+ print(" I couldn't find the module you specified. Please select one of the following modules:")
4433
+ print("")
4434
+ print("-------------------------------------- Database Construction Options --------------------------------------")
4435
+ print("")
4436
+ print(" build_db |" + " Create or add to a FastAAI database from genomes, proteins, or proteins and HMMs")
4437
+ print(" merge_db |" + " Add the contents of one FastAAI DB to another")
4438
+ print("")
4439
+ print("---------------------------------------------- Query Options ----------------------------------------------")
4440
+ print("")
4441
+ print(" simple_query |" + " Query a genome or protein (one or many) against an existing FastAAI database")
4442
+ print(" db_query |" + " Query the genomes in one FastAAI database against the genomes in another FastAAI database")
4443
+ print("")
4444
+ print("------------------------------------------- Other Options -------------------------------------------")
4445
+ print("")
4446
+ print(" single_query |" + " Query ONE query genome against ONE target genome")
4447
+ print(" multi_query |" + " Create a query DB and a target DB, then calculate query vs. target AAI")
4448
+ print(" aai_index |" + " Create a database from multiple genomes and do an all vs. all AAI index of the genomes")
4449
+ print("")
4450
+ print("-----------------------------------------------------------------------------------------------------------")
4451
+ print(" To select a module, enter 'FastAAI [module]' into the command line!")
4452
+ print("")
4453
+ sys.exit()
4454
+
4455
+ #This is the module selection
4456
+ selection = sys.argv[1]
4457
+
4458
+ if selection == "version":
4459
+ sys.exit("FastAAI version=0.1.17")
4460
+
4461
+ if selection not in modules:
4462
+ print("")
4463
+ print(" I couldn't find the module you specified. Please select one of the following modules:")
4464
+ print("")
4465
+ print("-------------------------------------- Database Construction Options --------------------------------------")
4466
+ print("")
4467
+ print(" build_db |" + " Create or add to a FastAAI database from genomes, proteins, or proteins and HMMs")
4468
+ print(" merge_db |" + " Add the contents of one FastAAI DB to another")
4469
+ print("")
4470
+ print("---------------------------------------------- Query Options ----------------------------------------------")
4471
+ print("")
4472
+ print(" simple_query |" + " Query a genome or protein (one or many) against an existing FastAAI database")
4473
+ print(" db_query |" + " Query the genomes in one FastAAI database against the genomes in another FastAAI database")
4474
+ print("")
4475
+ print("------------------------------------------- Other Options -------------------------------------------")
4476
+ print("")
4477
+ print(" single_query |" + " Query ONE query genome against ONE target genome")
4478
+ print(" multi_query |" + " Create a query DB and a target DB, then calculate query vs. target AAI")
4479
+ print(" aai_index |" + " Create a database from multiple genomes and do an all vs. all AAI index of the genomes")
4480
+ print("")
4481
+ print("-----------------------------------------------------------------------------------------------------------")
4482
+ print(" To select a module, enter 'FastAAI [module]' into the command line!")
4483
+ print("")
4484
+ sys.exit()
4485
+
4486
+ #################### Database build or add ########################
4487
+
4488
+ if selection == "build_db":
4489
+ parser, opts = build_db_opts()
4490
+
4491
+ #module name only
4492
+ if len(sys.argv) < 3:
4493
+ parser.print_help()
4494
+ sys.exit()
4495
+
4496
+ #Directory based
4497
+ genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms
4498
+
4499
+ output = os.path.normpath(opts.output)
4500
+
4501
+ threads = opts.threads
4502
+ verbose = opts.verbose
4503
+
4504
+ #Database handle
4505
+ db_name = opts.db_name
4506
+
4507
+ do_comp = opts.do_comp
4508
+
4509
+ build_db(genomes, proteins, hmms, db_name, output, threads, verbose, do_comp)
4510
+
4511
+
4512
+ #################### Add two DBs ########################
4513
+
4514
+ if selection == "merge_db":
4515
+ parser, opts = merge_db_opts()
4516
+ if len(sys.argv) < 3:
4517
+ parser.print_help()
4518
+ sys.exit()
4519
+
4520
+ recipient = opts.recipient
4521
+ donors = opts.donors
4522
+ donor_file = opts.donor_file
4523
+ verbose = opts.verbose
4524
+ threads = opts.threads
4525
+
4526
+ if donors is not None and donor_file is not None:
4527
+ sys.exit("You cannot specify both --donors and --donor_file.")
4528
+
4529
+ merge_db(recipient, donors, donor_file, verbose, threads)
4530
+
4531
+ #################### Query files vs DB ########################
4532
+
4533
+ if selection == "simple_query":
4534
+ parser, opts = sql_query_opts()
4535
+
4536
+ if len(sys.argv) < 3:
4537
+ parser.print_help()
4538
+ sys.exit()
4539
+
4540
+ genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms
4541
+
4542
+ db_name = opts.target
4543
+
4544
+ output = opts.output
4545
+ threads = opts.threads
4546
+ verbose = opts.verbose
4547
+
4548
+ do_stdev = opts.do_stdev
4549
+
4550
+ style, in_mem, make_db, qdb_name, do_comp = opts.style, opts.in_mem, opts.make_db, opts.qdb_name, opts.do_comp
4551
+
4552
+ sql_query(genomes, proteins, hmms, db_name, output, threads, verbose, do_stdev, style, in_mem, make_db, qdb_name, do_comp)
4553
+
4554
+
4555
+ #################### Query DB vs DB ###########################
4556
+ if selection == "db_query":
4557
+ parser, opts = db_query_opts()
4558
+ #module name only
4559
+
4560
+ if len(sys.argv) < 3:
4561
+ parser.print_help()
4562
+ sys.exit()
4563
+
4564
+ query = opts.query
4565
+ target = opts.target
4566
+ verbose = opts.verbose
4567
+
4568
+ do_stdev = opts.do_stdev
4569
+ output = opts.output
4570
+ threads = opts.threads
4571
+
4572
+ style, in_mem, store = opts.style, opts.in_mem, opts.storage
4573
+
4574
+
4575
+ db_query(query, target, verbose, output, threads, do_stdev, style, in_mem, store)
4576
+
4577
+ #################### One-pass functions #######################
4578
+ if selection == "single_query":
4579
+ parser, opts = single_query_opts()
4580
+ #module name only
4581
+
4582
+ if len(sys.argv) < 3:
4583
+ parser.print_help()
4584
+ sys.exit()
4585
+
4586
+ output = os.path.normpath(opts.output)
4587
+ try:
4588
+ threads = int(opts.threads)
4589
+ except:
4590
+ print("Couldn't interpret your threads. Defaulting to 1.")
4591
+ threads = 1
4592
+ verbose = opts.verbose
4593
+ do_compress = opts.do_comp
4594
+
4595
+ query_genome = opts.query_genome
4596
+ query_protein = opts.query_protein
4597
+ query_hmm = opts.query_hmm
4598
+
4599
+ query_file = fastaai_file_importer(genomes = query_genome, proteins = query_protein, hmms = query_hmm, output = output, compress = do_compress)
4600
+ query_file.determine_inputs()
4601
+
4602
+ target_genome = opts.target_genome
4603
+ target_protein = opts.target_protein
4604
+ target_hmm = opts.target_hmm
4605
+
4606
+ target_file = fastaai_file_importer(genomes = target_genome, proteins = target_protein, hmms = target_hmm, output = output, compress = do_compress)
4607
+ target_file.determine_inputs()
4608
+
4609
+ is_ok = True
4610
+ if len(query_file.in_files) != 1:
4611
+ print("Query genome unacceptable. Check your inputs")
4612
+ is_ok = False
4613
+
4614
+ if len(target_file.in_files) != 1:
4615
+ print("target genome unacceptable. Check your inputs")
4616
+ is_ok = False
4617
+ if is_ok:
4618
+ good_to_go = prepare_directories(output, query_file.status, "query")
4619
+ if good_to_go:
4620
+ good_to_go = prepare_directories(output, target_file.status, "query")
4621
+ if good_to_go:
4622
+ single_query(query_file, target_file, output, verbose, threads, do_compress)
4623
+
4624
+
4625
+ if selection == "aai_index":
4626
+ parser, opts = aai_index_opts()
4627
+
4628
+ if len(sys.argv) < 3:
4629
+ parser.print_help()
4630
+ sys.exit()
4631
+
4632
+ genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms
4633
+
4634
+ output = os.path.normpath(opts.output)
4635
+
4636
+ threads = opts.threads
4637
+ verbose = opts.verbose
4638
+
4639
+ #Database handle
4640
+ db_name = opts.db_name
4641
+
4642
+ do_comp = opts.do_comp
4643
+
4644
+ do_stdev = opts.do_stdev
4645
+
4646
+ style, in_mem, store = opts.style, opts.in_mem, opts.storage
4647
+
4648
+ #This is the same logic from the build_db section and it's what we need for getting the DB name.
4649
+ #Check if the db name contains path info, including the Windows separator.
4650
+ if "/" not in db_name and "\\" not in db_name:
4651
+ final_database = os.path.normpath(output + "/database/" + db_name)
4652
+ else:
4653
+ #If the person insists that the db has a path, let them.
4654
+ final_database = db_name
4655
+
4656
+ build_db(genomes, proteins, hmms, db_name, output, threads, verbose, do_comp)
4657
+
4658
+ query, target = final_database, final_database
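+ #All vs. all: the database just built is used as both query and target.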
4659
+
4660
+ db_query(query, target, verbose, output, threads, do_stdev, style, in_mem, store)
4661
+
4662
+
4663
+ if selection == "multi_query":
4664
+ parser, opts = multi_query_opts()
4665
+
4666
+ if len(sys.argv) < 3:
4667
+ parser.print_help()
4668
+ sys.exit()
4669
+
4670
+ #Shared options
4671
+ threads = opts.threads
4672
+ verbose = opts.verbose
4673
+
4674
+ #query options
4675
+ do_comp = opts.do_comp
4676
+ do_stdev = opts.do_stdev
4677
+ style, in_mem, store = opts.style, opts.in_mem, opts.storage
4678
+
4679
+ #query inputs
4680
+ qgenomes, qproteins, qhmms = opts.qgenomes, opts.qproteins, opts.qhmms
4681
+ qoutput = os.path.normpath(opts.qoutput)
4682
+ qdb_name = opts.qdb_name
4683
+ #This is the same logic from the build_db section and it's what we need for getting the DB name.
4684
+ #Check if the db name contains path info, including the Windows separator.
4685
+ if "/" not in qdb_name and "\\" not in qdb_name:
4686
+ final_qdb = os.path.normpath(qoutput + "/database/" + qdb_name)
4687
+ else:
4688
+ #If the person insists that the db has a path, let them.
4689
+ final_qdb = qdb_name
4690
+
4691
+ #target inputs
4692
+ tgenomes, tproteins, thmms = opts.tgenomes, opts.tproteins, opts.thmms
4693
+ toutput = os.path.normpath(opts.toutput)
4694
+ tdb_name = opts.tdb_name
4695
+ #This is the same logic from the build_db section and it's what we need for getting the DB name.
4696
+ #Check if the db name contains path info, including the Windows separator.
4697
+ if "/" not in tdb_name and "\\" not in tdb_name:
4698
+ final_tdb = os.path.normpath(toutput + "/database/" + tdb_name)
4699
+ else:
4700
+ #If the person insists that the db has a path other than output/database, let them.
4701
+ final_tdb = tdb_name
4702
+
4703
+ #run query build
4704
+ build_db(qgenomes, qproteins, qhmms, qdb_name, qoutput, threads, verbose, do_comp)
4705
+ #run target build
4706
+ build_db(tgenomes, tproteins, thmms, tdb_name, toutput, threads, verbose, do_comp)
4707
+ #run query db against target db
4708
+ db_query(final_qdb, final_tdb, verbose, toutput, threads, do_stdev, style, in_mem, store)
4709
+
4710
+
4711
+ ############## MiGA module #################
4712
+ if selection == "miga_merge":
4713
+ parser, opts = miga_merge_opts()
4714
+
4715
+ #module name only
4716
+ if len(sys.argv) < 3:
4717
+ parser.print_help()
4718
+ sys.exit()
4719
+
4720
+ g,p,h = opts.gen, opts.prot, opts.hmm
4721
+
4722
+ target = opts.database
4723
+
4724
+ verbose = opts.verbose
4725
+
4726
+ output_path = opts.output
4727
+
4728
+ if target is None:
4729
+ target = os.path.normpath(output_path + "/database/FastAAI_database.sqlite.db")
4730
+
4731
+ do_compress = opts.compress
4732
+
4733
+ imported_files = fastaai_file_importer(genomes = g, proteins = p, hmms = h,
4734
+ output = output_path, compress = do_compress)
4735
+
4736
+ imported_files.determine_inputs()
4737
+
4738
+ if len(imported_files.in_files) == 0:
4739
+ print("Something was wrong with your input file.")
4740
+ else:
4741
+ input_genome = imported_files.in_files[0]
4742
+
4743
+ good_to_go = prepare_directories(output_path, imported_files.status, "build")
4744
+
4745
+ miga_merge(input_genome, target, verbose, do_compress)
4746
+
4747
+ #A new DB would normally be created here under output/database,
4748
+ #but when the supplied target points elsewhere that directory stays empty; remove it if so.
4749
+ output_default = os.path.normpath(output_path + "/database")
4750
+ if len(os.listdir(output_default)) == 0:
4751
+ os.rmdir(output_default)
4752
+
4753
+ if selection == "miga_preproc":
4754
+ parser, opts = miga_preproc_opts()
4755
+
4756
+ #module name only
4757
+ if len(sys.argv) < 3:
4758
+ parser.print_help()
4759
+ sys.exit()
4760
+
4761
+ #Directory based
4762
+ genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms
4763
+
4764
+ output = os.path.normpath(opts.output)
4765
+
4766
+ threads = opts.threads
4767
+ verbose = opts.verbose
4768
+
4769
+ do_comp = opts.do_comp
4770
+
4771
+ miga_preproc(genomes, proteins, hmms, output, threads, verbose, do_comp)
4772
+
4773
+ if selection == "miga_db_from_crystals":
4774
+ parser, opts = miga_db_from_crystals_opts()
4775
+
4776
+ #module name only
4777
+ if len(sys.argv) < 3:
4778
+ parser.print_help()
4779
+ sys.exit()
4780
+
4781
+ crystals = opts.crystals
4782
+
4783
+ if crystals is None:
4784
+ print("I need to be given crystals to proceed!")
4785
+ sys.exit()
4786
+
4787
+ db_name = opts.db_name
4788
+ try:
4789
+ threads = int(opts.threads)
4790
+ except:
4791
+ threads = 1
4792
+ print("Can't recognize threads param:", str(opts.threads), "defaulting to 1.")
4793
+
4794
+ verbose = opts.verbose
4795
+ output_path = opts.output
4796
+
4797
+ miga_db_from_crystals(crystals, output_path, db_name, threads, verbose)
4798
+
4799
+
4800
+ return None
4801
+
4802
+ if __name__ == "__main__":
4803
+ main()
4804
+
4805
+