miga-base 1.0.5.5 → 1.1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/doctor.rb +12 -4
- data/lib/miga/cli/action/env.rb +1 -1
- data/lib/miga/cli/action/init.rb +1 -1
- data/lib/miga/cli/action/ncbi_get/downloads.rb +230 -0
- data/lib/miga/cli/action/ncbi_get.rb +9 -217
- data/lib/miga/cli/action/wf.rb +7 -3
- data/lib/miga/common.rb +12 -11
- data/lib/miga/dataset/result.rb +2 -1
- data/lib/miga/version.rb +3 -3
- data/scripts/essential_genes.bash +7 -11
- data/test/common_test.rb +7 -7
- data/utils/FastAAI/FastAAI +3630 -0
- data/utils/FastAAI/{FastAAI → FastAAI-legacy}/FastAAI +1 -1
- data/utils/FastAAI/{kAAI_v1.0_virus.py → FastAAI-legacy/kAAI_v1.0_virus.py} +0 -0
- data/utils/distance/commands.rb +24 -13
- metadata +6 -4
@@ -0,0 +1,3630 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
|
3
|
+
################################################################################
|
4
|
+
"""---0.0 Import Modules---"""
|
5
|
+
import subprocess
|
6
|
+
import argparse
|
7
|
+
import datetime
|
8
|
+
import shutil
|
9
|
+
import textwrap
|
10
|
+
import multiprocessing
|
11
|
+
import pickle
|
12
|
+
import gzip
|
13
|
+
import tempfile
|
14
|
+
#Shouldn't play any role.
|
15
|
+
#from random import randint
|
16
|
+
|
17
|
+
#We could probably remove Path, too.
|
18
|
+
from pathlib import Path
|
19
|
+
#This as well
|
20
|
+
from functools import partial
|
21
|
+
import time
|
22
|
+
from collections import defaultdict
|
23
|
+
import sys
|
24
|
+
import os
|
25
|
+
from math import floor
|
26
|
+
import sqlite3
|
27
|
+
#numpy dependency
|
28
|
+
import numpy as np
|
29
|
+
import io
|
30
|
+
import random
|
31
|
+
|
32
|
+
|
33
|
+
#Takes a bytestring from the SQL database and converts it to a numpy array.
|
34
|
+
def convert_array(bytestring):
    """Decode a raw bytestring read from the SQL database into a numpy int32 array."""
    decoded = np.frombuffer(bytestring, dtype=np.int32)
    return decoded
|
36
|
+
|
37
|
+
def convert_float_array_16(bytestring):
    """Decode a raw bytestring into a numpy float16 array."""
    decoded = np.frombuffer(bytestring, dtype=np.float16)
    return decoded
|
39
|
+
|
40
|
+
def convert_float_array_32(bytestring):
    """Decode a raw bytestring into a numpy float32 array."""
    decoded = np.frombuffer(bytestring, dtype=np.float32)
    return decoded
|
42
|
+
|
43
|
+
def convert_float_array_64(bytestring):
    """Decode a raw bytestring into a numpy float64 array."""
    decoded = np.frombuffer(bytestring, dtype=np.float64)
    return decoded
|
45
|
+
|
46
|
+
|
47
|
+
#Iterator for agnostic reader
|
48
|
+
class agnostic_reader_iterator:
    """Line iterator over the file handle held by a reader object.

    The reader must expose ``handle`` (an object with ``readline()``) and
    ``is_gz`` (truthy when ``handle`` yields bytes that need decoding).
    """

    def __init__(self, reader):
        self.handle_ = reader.handle
        self.is_gz_ = reader.is_gz

    def __iter__(self):
        #An iterator must return itself from __iter__ so it can be used
        #directly in for-loops, list(), etc.
        return self

    def __next__(self):
        line = self.handle_.readline()
        if self.is_gz_:
            #The gzip handle is read in binary mode; hand text back to the caller.
            line = line.decode()

        #readline() returns an empty string/bytes only at EOF.
        if not line:
            raise StopIteration
        return line
|
64
|
+
|
65
|
+
#File reader that doesn't care if you give it a gzipped file or not.
|
66
|
+
class agnostic_reader:
    """File reader that accepts either a plain-text or a gzipped file.

    The first two bytes of the file are compared with the gzip magic number.
    Gzipped files are opened with ``gzip.open`` in its default (binary) mode;
    everything else is opened with the builtin ``open`` in text mode.

    Attributes:
        path: the path that was passed in.
        is_gz: True when the file starts with the gzip magic number.
        handle: the open file object.

    The reader can be used as a context manager so the handle is closed even
    if the body raises; calling ``close()`` explicitly still works.
    """

    def __init__(self, file):
        self.path = file

        with open(file, 'rb') as test_gz:
            #Gzip magic number
            is_gz = (test_gz.read(2) == b'\x1f\x8b')

        self.is_gz = is_gz

        if is_gz:
            self.handle = gzip.open(self.path)
        else:
            self.handle = open(self.path)

    def __iter__(self):
        return agnostic_reader_iterator(self)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        #Never swallow an exception raised inside the with-block.
        return False

    def close(self):
        """Close the underlying file handle."""
        self.handle.close()
|
86
|
+
|
87
|
+
#FastAAI database class. This is the final database
|
88
|
+
class fastaai_database:
    """Wrapper around the final FastAAI SQLite database.

    Holds the path, the SQLite connection/cursor pair for the database itself,
    an optional second connection to a "child" database, and in-memory copies
    of the metadata tables (``genome_index`` and ``genome_acc_kmer_counts``).
    """

    #Largest number of '?' placeholders placed in a single statement. SQLite
    #builds older than 3.32 reject statements with more than 999 bound
    #parameters, so long IN (...) lists are issued in slices of this size.
    _MAX_SQL_PARAMS = 900

    def __init__(self, path):
        self.path = path
        self.exists = os.path.exists(path)

        self.child = None
        self.connection = None
        self.cursor = None

        self.child_connection = None
        self.child_cursor = None

        self.accessions = None

        #gak stands for 'genome_accession_kmer_counts'
        self.gak = None
        self.genome_index = None
        #Go from index to name
        self.reverse_genome_index = None
        self.protein_counts_by_genome = None

        self.verbosity = False

    @staticmethod
    def _chunks(values, size):
        """Yield consecutive slices of ``values`` holding at most ``size`` items."""
        for start in range(0, len(values), size):
            yield values[start:start + size]

    def activate_connection(self, with_converter = True):
        """Open the SQL connection and cursor for ``self.path``.

        With ``with_converter`` set, columns declared as ``array`` are passed
        through ``convert_array`` when read; otherwise a plain connection is
        opened and no converter is registered.
        """
        if with_converter:
            sqlite3.register_converter("array", convert_array)
            self.connection = sqlite3.connect(self.path, detect_types=sqlite3.PARSE_DECLTYPES)
        else:
            self.connection = sqlite3.connect(self.path)

        self.cursor = self.connection.cursor()
        self.exists = True

    def close_connection(self):
        """Close the SQL connection; does nothing if it was never opened."""
        if self.cursor is not None:
            self.cursor.close()
        if self.connection is not None:
            self.connection.close()
        #True cleanup - even a closed SQL connection obj cannot be passed to multiple processors, but a nonetype can.
        self.cursor = None
        self.connection = None

    def initialize_parent_database(self):
        """Create the two metadata tables if they are not present yet."""
        #self.exists is also True for a file that merely exists on disk, so
        #the cursor is what tells us whether a connection is actually open.
        if self.cursor is None:
            print("I need to be activated first!")
            return

        self.cursor.execute('''CREATE TABLE IF NOT EXISTS genome_index
            (genome text, gen_id INTEGER PRIMARY KEY, protein_count INTEGER)''')
        self.connection.commit()

        self.cursor.execute('''CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts
            (genome INTEGER, accession INTEGER, count INTEGER)''')
        self.connection.commit()

    def activate_child_connection(self, child):
        """Open a second connection to the database file ``child`` if it exists."""
        #Don't try to connect unless it exists. This should never fail.
        if os.path.exists(child):
            self.child = child
            self.child_connection = sqlite3.connect(self.child, detect_types=sqlite3.PARSE_DECLTYPES)
            self.child_cursor = self.child_connection.cursor()
        else:
            print("Child database:", child, "not found!")

    def close_child_connection(self):
        """Close the child connection if one is open."""
        if self.child_cursor is not None:
            self.child_cursor.close()
            self.child_connection.close()
            self.child_cursor = None
            self.child_connection = None
            self.child = None

    def add_child_to_parent(self, acc, child_db, remove = True, selected_kmers = None, genomes_too = False, just_genomes = False, update_gak = False):
        """Copy the tables for accession ``acc`` from ``child_db`` into this database.

        Args:
            acc: SQL-friendly accession name; used verbatim as a table name
                (table names cannot be bound as parameters).
            child_db: path of the SQLite file to attach and copy from.
            remove: delete ``child_db`` from disk after merging.
            selected_kmers: if given, only rows of the kmer table whose
                ``kmer`` is in this sequence are copied.
            genomes_too: also copy the ``<acc>_genomes`` table.
            just_genomes: copy only the ``<acc>_genomes`` table.
            update_gak: when the genomes table is copied, also refresh
                ``genome_acc_kmer_counts`` from it.
        """
        with_genomes = genomes_too or just_genomes
        insert_kmers = "INSERT OR REPLACE INTO " + acc + " SELECT * FROM toMerge." + acc

        #Build every statement before attaching, so a failed accession lookup
        #cannot leave the child database attached.
        gak_sql = None
        if with_genomes:
            add_genomes = "INSERT OR REPLACE INTO " + acc + "_genomes" + " SELECT * FROM toMerge." + acc+"_genomes"
            if update_gak:
                #The accession index is only needed to translate the name to its number.
                accession_index = generate_accessions_index()
                sql_acc_num = accession_index[acc.replace("_", ".")]
                #Return num bytes, which is always 4*as many as there are entries, as the dtype is int32. See unique_kmers.
                gak_sql = 'INSERT OR REPLACE INTO genome_acc_kmer_counts SELECT genome, ' + str(sql_acc_num) + ', length(kmers)/4 FROM toMerge.' + acc + '_genomes'

        if not just_genomes:
            self.cursor.execute("CREATE TABLE IF NOT EXISTS " + acc + " (kmer INTEGER PRIMARY KEY, genomes array)")
            self.connection.commit()

        if with_genomes:
            self.cursor.execute("CREATE TABLE IF NOT EXISTS " + acc + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")
            self.connection.commit()

        #Bind the path instead of splicing it into the SQL text so that file
        #names containing quotes do not break the statement.
        self.cursor.execute("ATTACH DATABASE ? AS toMerge", (child_db,))
        self.connection.commit()

        if not just_genomes:
            if selected_kmers is not None:
                wanted = list(selected_kmers)
                for chunk in self._chunks(wanted, self._MAX_SQL_PARAMS):
                    placeholders = ','.join(['?']*len(chunk))
                    self.cursor.execute(insert_kmers + " WHERE kmer in (" + placeholders + ")", chunk)
            else:
                self.cursor.execute(insert_kmers)

            self.connection.commit()

        if with_genomes:
            self.cursor.execute(add_genomes)
            self.connection.commit()
            if gak_sql is not None:
                self.cursor.execute(gak_sql)
                self.connection.commit()

        self.cursor.execute("DETACH DATABASE toMerge")
        self.connection.commit()

        if remove:
            os.remove(child_db)

    def add_genomes_first(self, accession, kmer_dict):
        """Insert or replace one row per genome in the ``<accession>_genomes`` table.

        ``kmer_dict`` maps a genome key to an object with ``tobytes()``; the
        bytes are what gets stored. Returns the SQL-friendly accession name
        (dots replaced by underscores).
        """
        kmer_lists = [(genome, kmers.tobytes()) for genome, kmers in kmer_dict.items()]

        sql_friendly_accession = accession.replace(".", "_")

        self.cursor.execute("CREATE TABLE IF NOT EXISTS " + sql_friendly_accession + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")
        self.connection.commit()

        self.cursor.executemany("INSERT OR REPLACE INTO " + sql_friendly_accession + "_genomes VALUES (?, ?) ", kmer_lists)
        self.connection.commit()

        return sql_friendly_accession

    def load_genome_index(self):
        """Read ``genome_index`` into the three in-memory lookup dicts."""
        self.genome_index = {}
        self.reverse_genome_index = {}
        self.protein_counts_by_genome = {}

        sql_command = "SELECT genome, gen_id, protein_count FROM genome_index"

        for genome, gen_id, protein_count in self.cursor.execute(sql_command).fetchall():
            self.genome_index[genome] = gen_id
            self.reverse_genome_index[gen_id] = genome
            self.protein_counts_by_genome[gen_id] = protein_count

    def load_accessions(self, permitted_genomes = None, permitted_accessions = None):
        """Load ``genome_acc_kmer_counts`` into ``self.gak`` and ``self.accessions``.

        ``self.gak`` becomes ``{genome: {accession: count}}`` and
        ``self.accessions`` a tuple of every accession seen. Rows can be
        restricted to ``permitted_genomes`` and/or ``permitted_accessions``;
        if both are given the two selections are loaded one after the other.
        """
        gak = defaultdict(dict)

        def collect(sql_command, params = ()):
            for genome, accession, kmer_ct in self.cursor.execute(sql_command, params).fetchall():
                gak[genome][accession] = kmer_ct

        #It's possible to do both of these. Don't.
        if permitted_genomes is not None:
            #data type is very important to SQL
            sql_friendly = [int(genome) for genome in permitted_genomes]
            for chunk in self._chunks(sql_friendly, self._MAX_SQL_PARAMS):
                placeholders = ','.join(['?']*len(chunk))
                collect("SELECT * FROM genome_acc_kmer_counts WHERE genome IN (" + placeholders + ")", chunk)

        if permitted_accessions is not None:
            for chunk in self._chunks(list(permitted_accessions), self._MAX_SQL_PARAMS):
                placeholders = ','.join(['?']*len(chunk))
                collect("SELECT * FROM genome_acc_kmer_counts WHERE accession IN (" + placeholders + ")", chunk)

        #Normal case
        if permitted_accessions is None and permitted_genomes is None:
            collect("SELECT * FROM genome_acc_kmer_counts")

        #un-defaultdict
        self.gak = dict(gak)

        accessions = set()
        for per_genome in self.gak.values():
            accessions.update(per_genome.keys())

        self.accessions = tuple(accessions)

    def just_accessions(self):
        """Set ``self.accessions`` from the names of the per-accession kmer tables.

        Every table that is neither a ``*_genomes`` table nor one of the two
        metadata tables is treated as an accession table; its name is mapped
        back to an index through ``generate_accessions_index()``.
        """
        converter = generate_accessions_index()
        acc_sql = "SELECT name FROM sqlite_master WHERE type='table'"
        metadata_tables = ('genome_acc_kmer_counts', 'genome_index')

        #Filtering, rather than popping by index, also copes with a database
        #where a metadata table has not been created yet.
        tables = [
            row[0] for row in self.cursor.execute(acc_sql).fetchall()
            if not row[0].endswith('_genomes') and row[0] not in metadata_tables
        ]

        #Back to indicies.
        self.accessions = tuple(converter[table.replace('_', '.')] for table in tables)

    def unload_genomes_and_accessions(self):
        """Drop the in-memory copies of the metadata tables."""
        self.gak = None
        self.genome_index = None
        #Go from index to name
        self.reverse_genome_index = None
        self.protein_counts_by_genome = None
|
343
|
+
|
344
|
+
#Child database class. This is only used during database builds and merges. Designed to take one single accession at a time and produce a correctly formatted table of kmers and accessions.
|
345
|
+
class child_database:
    """Temporary per-accession SQLite database used during builds and merges.

    Holds a connection to its own file (``path``) and, optionally, a second
    connection to the parent database so that rows already present there can
    be combined with the rows about to be written.
    """

    #Largest number of '?' placeholders placed in a single statement. SQLite
    #builds older than 3.32 reject statements with more than 999 bound
    #parameters, so long IN (...) lists are issued in slices of this size.
    _MAX_SQL_PARAMS = 900

    def __init__(self, path, parent):
        self.path = path
        self.exists = False

        self.parent = parent
        self.parent_exists = os.path.exists(parent)

        self.connection = None
        self.cursor = None

        self.parent_connection = None
        self.parent_cursor = None

        self.verbosity = False

    def activate_child_connection(self):
        """Open the SQL connection to this child database."""
        #Columns declared as 'array' are decoded with convert_array on read.
        sqlite3.register_converter("array", convert_array)

        self.connection = sqlite3.connect(self.path, detect_types=sqlite3.PARSE_DECLTYPES)
        self.cursor = self.connection.cursor()
        self.exists = True

    def close_child_connection(self):
        """Close the SQL connection; does nothing if it was never opened."""
        if self.cursor is not None:
            self.cursor.close()
        if self.connection is not None:
            self.connection.close()
        #True cleanup - even a closed SQL connection obj cannot be passed to multiple processors, but a nonetype can.
        self.cursor = None
        self.connection = None

    def initialize_child_database(self):
        """Create the two metadata tables if they are not present yet."""
        if not self.exists or self.cursor is None:
            print("I need to be activated first!")
            return

        self.cursor.execute('''CREATE TABLE IF NOT EXISTS genome_index
            (genome text, gen_id integer, protein_count integer)''')
        self.connection.commit()

        self.cursor.execute('''CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts
            (genome integer, accession integer, count integer)''')
        self.connection.commit()

    def activate_parent_connection(self):
        """Open a connection to the parent database if its file exists."""
        if os.path.exists(self.parent):
            self.parent_exists = True
            sqlite3.register_converter("array", convert_array)
            self.parent_connection = sqlite3.connect(self.parent, detect_types=sqlite3.PARSE_DECLTYPES)
            self.parent_cursor = self.parent_connection.cursor()

    def close_parent_connection(self):
        """Close the parent connection if one is open."""
        if self.parent_cursor is not None:
            self.parent_cursor.close()
            self.parent_connection.close()
            self.parent_cursor = None
            self.parent_connection = None

    def add_genomes_first(self, accession, kmer_lists):
        """(Re)create the ``<accession>_genomes`` table and fill it.

        ``kmer_lists`` is a sequence of ``(genome, kmers_bytes)`` rows. Any
        existing table of that name is dropped first. Returns the
        SQL-friendly accession name (dots replaced by underscores).
        """
        sql_friendly_accession = accession.replace(".", "_")
        table = sql_friendly_accession + "_genomes"

        self.cursor.execute(" DROP TABLE IF EXISTS " + table)
        self.cursor.execute("CREATE TABLE " + table + " (genome INTEGER PRIMARY KEY, kmers array)")
        self.connection.commit()

        self.cursor.executemany(" INSERT INTO " + table + " VALUES (?, ?) ", kmer_lists)
        self.connection.commit()

        return sql_friendly_accession

    def add_accession(self, accession, insert_kmers):
        """(Re)create the kmer table for ``accession`` and fill it.

        ``insert_kmers`` maps kmer -> numpy array. If the parent database
        already has a table for this accession, the parent's array for each
        kmer that is about to be written is unioned into ``insert_kmers``
        (the dict is updated in place). Returns the SQL-friendly accession
        name (dots replaced by underscores).
        """
        sql_friendly_accession = accession.replace(".", "_")

        if self.parent_exists:
            #Check to see if this acc. is already in parent DB
            table_exists = (self.parent_cursor.execute(" SELECT count(name) FROM sqlite_master WHERE type='table' AND name=(?)", (sql_friendly_accession,)).fetchone()[0] == 1)

            if table_exists:
                #Convert the kmers in the current insert list to the correct type for sql to match them
                selection = [int(key) for key in insert_kmers]

                #Only the parent rows whose kmer is about to be rewritten matter.
                #The lookup is sliced so the IN (...) list never exceeds
                #SQLite's limit on bound parameters.
                for start in range(0, len(selection), self._MAX_SQL_PARAMS):
                    chunk = selection[start:start + self._MAX_SQL_PARAMS]
                    search_command = "SELECT * FROM "+ sql_friendly_accession + " WHERE kmer IN ({kmers})".format(kmers=','.join(['?']*len(chunk)))

                    for item in self.parent_cursor.execute(search_command, chunk).fetchall():
                        k = item[0]
                        #The new row replaces the parent's row, so combine both genome lists.
                        if k in insert_kmers:
                            insert_kmers[k] = np.union1d(insert_kmers[k], item[1])

        #Translate the ndarray into its constituent byte data
        formatted_kmers = [(int(kmer), genomes.tobytes()) for kmer, genomes in insert_kmers.items()]

        #Remove the child if it exists - it shouldn't ever exist because these child DBs should be deleted upon being added to the parent, but might if a run was stopped halfway.
        self.cursor.execute(" DROP TABLE IF EXISTS " + sql_friendly_accession)

        self.cursor.execute("CREATE TABLE " + sql_friendly_accession + " (kmer INTEGER PRIMARY KEY, genomes array)")
        self.connection.commit()

        self.cursor.executemany(" INSERT INTO " + sql_friendly_accession + " VALUES (?, ?) ", formatted_kmers)
        self.connection.commit()

        return sql_friendly_accession
|
489
|
+
|
490
|
+
|
491
|
+
#Holds partial results for calculating AAI.
|
492
|
+
class calculation_database:
    """SQLite database that holds partial results for calculating AAI.

    ``precision`` ("low", "med" or "high") selects which float converter is
    registered for columns declared as ``array`` when the connection is opened.
    """

    def __init__(self, path, precision):
        self.path = path
        self.exists = False

        self.connection = None
        self.cursor = None

        self.genomes = None

        self.verbosity = False

        self.precision = precision

    def activate_connection(self):
        """Open the SQL connection, registering the converter for ``self.precision``.

        Any other precision value registers no converter.
        """
        if self.precision == "low":
            sqlite3.register_converter("array", convert_float_array_16)
        elif self.precision == "med":
            sqlite3.register_converter("array", convert_float_array_32)
        elif self.precision == "high":
            sqlite3.register_converter("array", convert_float_array_64)

        self.connection = sqlite3.connect(self.path, detect_types=sqlite3.PARSE_DECLTYPES)
        self.cursor = self.connection.cursor()
        self.exists = True

    def close_connection(self):
        """Close the SQL connection; does nothing if it was never opened."""
        if self.cursor is not None:
            self.cursor.close()
        if self.connection is not None:
            self.connection.close()
        #True cleanup - even a closed SQL connection obj cannot be passed to multiple processors, but a nonetype can.
        self.cursor = None
        self.connection = None

    def initialize_database(self):
        """Drop and recreate the ``jaccards`` table, discarding any previous rows."""
        #self.exists stays True after close_connection(), so also check the cursor.
        if not self.exists or self.cursor is None:
            print("I need to be activated first!")
            return

        self.cursor.execute("DROP TABLE IF EXISTS jaccards")
        self.connection.commit()
        self.cursor.execute("CREATE TABLE jaccards (genome INTEGER PRIMARY KEY, jaccards array)")
        self.connection.commit()
|
542
|
+
|
543
|
+
'''
|
544
|
+
Class for handling all of the raw genome/protein/protein+HMM file inputs when building a database.
|
545
|
+
|
546
|
+
Takes a file or files and processes them from genome -> protein, protein -> hmm, prot+HMM -> kmerized protein best hits as numpy int arrays according to the kmer_index
|
547
|
+
|
548
|
+
'''
|
549
|
+
class input_file:
|
550
|
+
def __init__(self, input_path, output, verbosity):
    """Record the input path and derive the names used for output files.

    Args:
        input_path: path of the input file (may end in ``.gz``).
        output: output location; only its final path component is kept.
        verbosity: stored as ``self.verbose``.
    """
    file_name = os.path.basename(input_path)

    #starting path for the file; irrelevant for protein and hmm, but otherwise useful for keeping track.
    self.path = input_path
    #Output directory starts with this.
    #NOTE(review): basename() discards any leading directories of `output` - confirm this is intended.
    self.output = os.path.normpath(os.path.basename(output) + "/")
    #For printing file updates, this is the input name
    self.name = file_name
    #original name is the key used for the genomes index later on.
    self.original_name = file_name

    #Name used for building files with new extensions: drop a trailing .gz
    #first so names are consistent, then drop the remaining extension.
    unzipped_path = input_path[:-3] if input_path.endswith(".gz") else input_path
    self.basename = os.path.splitext(os.path.basename(unzipped_path))[0]

    #'genome' or 'protein' or 'protein and HMM'
    self.status = None
    #Paths for each stage of the file.
    self.genome = None
    self.protein = None
    self.hmm = None

    self.best_hits = None
    self.best_hits_kmers = None

    self.protein_count = 0
    self.protein_kmer_count = {}

    self.trans_table = None
    self.start_time = None
    self.end_time = None
    self.err_log = ""
    #doesn't get updated otw.
    self.initial_state = "protein+HMM"

    self.verbose = verbosity
|
586
|
+
|
587
|
+
#Functions for externally setting status and file paths of particular types
|
588
|
+
def set_genome(self, path):
    """Mark this input as a genome and remember where the genome file is."""
    self.status, self.genome = 'genome', path
|
591
|
+
|
592
|
+
def set_protein(self, path):
    """Mark this input as a protein and remember where the protein file is."""
    self.status, self.protein = 'protein', path
|
595
|
+
|
596
|
+
def set_hmm(self, path):
    """Mark this input as protein + HMM and remember where the HMM file is.

    A warning is printed when no protein has been set yet; the HMM path and
    status are recorded either way.
    """
    missing_protein = self.protein is None
    if missing_protein:
        print("Warning! I don't have a protein yet, so this HMM will be useless to me until I do!")
    self.status, self.hmm = 'protein and hmm', path
|
601
|
+
|
602
|
+
#Runs prodigal, compares translation tables and stores faa files
|
603
|
+
def genome_to_protein(self):
    """Run prodigal on the genome, pick a translation table and store the .faa file.

    Prodigal is run twice, once with its default settings and once with
    ``-g 4``. The total amino-acid length of both predictions is compared and
    translation table 4 is chosen when its output is at least 1.1 times as
    long as the default output; otherwise the default (recorded as table 11)
    is chosen. The winning file is copied to ``<basename>.faa`` with ``*``
    characters removed from the sequence lines, the intermediates are
    deleted, and the result is registered through ``self.set_protein``.

    Prints a message and returns without doing anything if no genome is set.
    """
    if self.genome is None:
        print(self.name, "wasn't a declared as a genome! I can't make this into a protein!")
        return

    folder = Path(self.output + "/predicted_proteins")
    protein_output = folder / (self.basename + '.faa')
    output_11 = folder / (self.basename + '.faa.11')
    output_4 = folder / (self.basename + '.faa.4')
    temp_output = folder / (self.basename + '.temp')

    intermediate = folder / (self.basename + '_genome_intermediate.fasta')

    genome_parser = agnostic_reader(self.genome)
    was_gz = genome_parser.is_gz
    try:
        if was_gz:
            #File was a gzip; decompress it to an intermediate file for prodigal's sake; deleted below.
            with open(intermediate, "w") as midpoint:
                for line in genome_parser:
                    midpoint.write(line)
        else:
            #File is already unzipped, just point to it
            intermediate = self.genome
    finally:
        genome_parser.close()

    subprocess.call(["prodigal", "-i", str(intermediate), "-a", str(output_11), "-q", "-o", str(temp_output)])
    subprocess.call(["prodigal", "-i", str(intermediate), "-a", str(output_4), "-g", "4", "-q", "-o", str(temp_output)])

    #We can get rid of the temp file immediately, we won't be using it
    temp_output.unlink()
    if was_gz:
        #If the file was copied, delete. Otw. this would delete the input and we don't want that.
        intermediate.unlink()

    def total_residues(fasta_path):
        #Sum of the stripped lengths of every non-header line.
        with open(fasta_path, 'r') as fasta:
            return sum(len(line.strip()) for line in fasta if not line.startswith(">"))

    # Compare translation tables
    length_4 = total_residues(output_4)
    length_11 = total_residues(output_11)

    if length_11 == 0:
        #The ratio is undefined when the default run predicted nothing; table 4
        #only wins if it predicted anything at all.
        use_table_4 = length_4 > 0
    else:
        use_table_4 = (length_4 / length_11) >= 1.1

    #Select the winning translation table and remove the other.
    if use_table_4:
        winner, loser = output_4, output_11
        self.trans_table = "4"
    else:
        winner, loser = output_11, output_4
        self.trans_table = "11"
    loser.unlink()

    #Clean the winning output: header lines are copied as-is, '*' is removed from sequence lines.
    with open(winner, 'r') as chosen_protein, open(protein_output, "w") as destination:
        for line in chosen_protein:
            if not line.startswith(">"):
                line = line.replace('*', '')
            destination.write(line)

    # Remove the winning intermediate file, since we have the cleaned output
    winner.unlink()

    self.set_protein(str(protein_output))
|
710
|
+
|
711
|
+
#run hmmsearch on a protein
|
712
|
+
def protein_to_hmm(self):
    """Run hmmsearch on the protein file and store the tabular output.

    The protein file (plain or gzipped) is first copied to an intermediate
    FASTA file, leaving out any record with 100,000 or more residues; each
    skipped record is noted in ``self.err_log``. hmmsearch is then run
    against the ``Complete_SCG_DB.hmm`` model located next to this script,
    the temporary files are deleted, and the ``--tblout`` file is registered
    through ``self.set_hmm``.

    Prints a message and returns without doing anything if no protein is set.
    """
    if self.protein is None:
        print(self.name, "wasn't a declared as a protein! I can't make this into an HMM!")
        return

    folder = Path(self.output + "/hmms")

    hmm_output = folder / (self.basename + '.hmm')
    temp_output = folder / (self.basename + '.temp')

    intermediate = folder / (self.basename + '_protein_intermediate.faa')

    #Records at or above this many residues are skipped.
    max_residues = 100000

    def emit(destination, header, seq_lines, residues):
        #Write one buffered record, or log it as skipped when it is too long.
        if not seq_lines:
            return
        if residues < max_residues:
            destination.write(header)
            destination.writelines(seq_lines)
        else:
            self.err_log += "Protein " + header.strip().split()[0][1:] + " was observed to have >100K amino acids ( " + str(residues) + " AA found ). It was skipped. "

    current_protein = ""
    seq_lines = []
    #Residues only: line terminators are excluded so that the length test and
    #the number reported in the log agree.
    residues = 0

    protein_parser = agnostic_reader(self.protein)
    try:
        with open(intermediate, "w") as midpoint:
            for line in protein_parser:
                if line.startswith(">"):
                    emit(midpoint, current_protein, seq_lines, residues)
                    current_protein = line
                    seq_lines = []
                    residues = 0
                else:
                    seq_lines.append(line)
                    residues += len(line.rstrip("\r\n"))

            #Finally, last prot
            emit(midpoint, current_protein, seq_lines, residues)
    finally:
        protein_parser.close()

    #Should locate the DBs regardless of path.
    script_dir = Path(__file__).parent
    hmm_complete_model = script_dir / "00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm"

    subprocess.call(["hmmsearch", "--tblout", str(hmm_output), "-o", str(temp_output), "--cut_tc", "--cpu", "1",
                    str(hmm_complete_model), str(intermediate)])

    temp_output.unlink()
    intermediate.unlink()

    self.set_hmm(str(hmm_output))
|
780
|
+
|
781
|
+
def prot_and_hmm_to_besthits(self):
    """Pick the best HMM hit per protein and per accession, then tetramerize them.

    Reads the hmmsearch table at ``self.hmm`` and the protein FASTA at
    ``self.protein``. Fills ``self.best_hits`` (protein name -> accession) and
    ``self.best_hits_kmers`` (accession -> array of unique 4-mer indices),
    updates ``self.protein_kmer_count`` and ``self.protein_count``, and sets
    ``self.status`` to "finished preprocessing".
    """
    prots = []
    accs = []
    scores = []

    hmm_handle = agnostic_reader(self.hmm)
    try:
        for line in hmm_handle:
            if line.startswith("#"):
                continue
            segs = line.strip().split()
            #Blank or truncated rows cannot supply columns 0, 3 and 8; skip them.
            if len(segs) < 9:
                continue
            #NOTE(review): columns 0/3/8 are presumably target name, query
            #accession and best-domain score of a --tblout table; confirm.
            prots.append(segs[0])
            accs.append(segs[3])
            scores.append(segs[8])
    finally:
        hmm_handle.close()

    hmm_file = np.transpose(np.array([prots, accs, scores]))

    #Sort the hits based on the score column in descending order.
    hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]

    #Keep the first (highest scoring) row per protein name; re-sort the kept
    #row indices so the table stays in score order, not alphabetical order.
    hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]

    #Keep one row per accession as well. Scores no longer matter here, so the
    #indices are left in the order np.unique returns them.
    hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]

    self.best_hits = dict(zip(hmm_file[:,0], hmm_file[:,1]))
    self.best_hits_kmers = {}

    def record(accession, residues):
        #Store the unique tetramers of one best-hit protein under its accession.
        kmer_set = unique_kmers(residues, 4)
        self.protein_kmer_count[accession] = kmer_set.shape[0]
        self.protein_count += 1
        self.best_hits_kmers[accession] = kmer_set

    current_acc = ""
    seq_parts = []
    is_besthit = False

    prot = agnostic_reader(self.protein)
    try:
        for line in prot:
            if line.startswith(">"):
                current_seq = "".join(seq_parts)
                if len(current_seq) > 0:
                    record(current_acc, current_seq)

                #Only the accession of the best hit is recorded; protein names are not kept.
                protein_name = line[1:].strip().split(" ")[0]
                if protein_name in self.best_hits:
                    current_acc = self.best_hits[protein_name]
                    is_besthit = True
                else:
                    current_acc = protein_name
                    is_besthit = False
                seq_parts = []
            elif is_besthit:
                seq_parts.append(line.strip())
    finally:
        prot.close()

    #Flush the final record. Sequence is only accumulated for best hits, so a
    #non-empty sequence means the last protein was one. The previous check
    #looked the accession up among protein names and so never matched.
    current_seq = "".join(seq_parts)
    if len(current_seq) > 0:
        record(current_acc, current_seq)

    self.status = "finished preprocessing"
|
852
|
+
|
853
|
+
def preprocess(self):
    """Advance this input through every remaining preprocessing stage.

    Stages are checked in order (genome, protein, protein and hmm); each one
    that matches ``self.status`` is run. Start/end times and the translation
    table label are filled in afterwards.
    """
    #(status that triggers the stage, label for initial_state, method to run)
    #There's no initial_state relabel for the protein and HMM stage.
    stages = (
        ('genome', "genome", "genome_to_protein"),
        ('protein', "protein", "protein_to_hmm"),
        ('protein and hmm', None, "prot_and_hmm_to_besthits"),
    )

    for trigger, relabel, method_name in stages:
        if self.status != trigger:
            continue

        stamp = curtime()
        if self.start_time is None:
            self.start_time = stamp

        if relabel is not None and self.initial_state == "protein+HMM":
            self.initial_state = relabel

        getattr(self, method_name)()

    if self.start_time is None:
        #No stage ran, so there is no runtime to report.
        self.start_time = "N/A"
        self.end_time = "N/A"
    else:
        self.end_time = curtime()

    #Protein not generated on this run.
    if self.trans_table is None:
        self.trans_table = "unknown"
|
899
|
+
|
900
|
+
'''
|
901
|
+
Viral functions
|
902
|
+
'''
|
903
|
+
#No translation table comparison for viruses. Slightly reduced logic.
|
904
|
+
def viral_genome_to_protein(self):
    """Predict proteins from ``self.genome`` with prodigal in meta mode.

    Writes ``<output>/predicted_proteins/<basename>.faa`` with '*' characters
    removed from sequence lines, then points ``self.protein`` at that file and
    sets ``self.status`` to 'protein'. If no genome was declared, only a
    message is printed and nothing is changed.
    """
    if self.genome is None:
        print(self.name, "wasn't a declared as a genome! I can't make this into a protein!")
        return

    folder = Path(self.output + "/predicted_proteins")
    intermediate_protein_output = folder / (self.basename + '.intermediate.faa')
    final_protein_output = folder / (self.basename + '.faa')
    temp_output = folder / (self.basename + '.temp')

    subprocess.call(["prodigal", "-i", str(self.genome), "-a", str(intermediate_protein_output), "-p", "meta", "-q", "-o", str(temp_output)])

    # Remove intermediate files
    temp_output.unlink()

    with open(intermediate_protein_output, 'r') as chosen_protein, open(final_protein_output, "w") as destination:
        for line in chosen_protein:
            #Headers are copied as-is; only sequence lines lose their '*' characters.
            if not line.startswith(">"):
                line = line.replace('*', '')
            destination.write(line)

    intermediate_protein_output.unlink()

    #Previously referenced the undefined name protein_output (NameError).
    self.protein = str(final_protein_output)
    self.status = 'protein'
|
935
|
+
|
936
|
+
|
937
|
+
'''
|
938
|
+
Preprocessing functions
|
939
|
+
|
940
|
+
Read directories, advance files to hmms as needed.
|
941
|
+
'''
|
942
|
+
#Toy function for passing to a pool
|
943
|
+
def do_advance(input_file_object):
    """Run preprocess() on one input and hand the same object back (pool worker)."""
    advanced = input_file_object
    advanced.preprocess()
    return advanced
|
946
|
+
|
947
|
+
def initialize_preproc(index):
    """Publish the kmer index as the module-level global ``kmer_index``."""
    globals()["kmer_index"] = index
|
950
|
+
|
951
|
+
#Function which takes an input list
|
952
|
+
def _existing_paths_from_list(list_file):
    """Return the paths listed one per line in list_file that exist, warning about the rest."""
    found = []
    fh = agnostic_reader(list_file)
    try:
        for line in fh:
            clean = line.strip()
            if not os.path.exists(clean):
                print("I can't find file", clean, "Are you sure this file exists and can be found from your current directory using the path you supplied in the input file?")
            else:
                found.append(clean)
    finally:
        fh.close()
    return found


def _existing_paths_from_directory(directory):
    """Return the normalized paths of a directory's entries in lexicographic order."""
    found = []
    #Sort is used to ensure lexicographic ordering.
    for entry in sorted(os.listdir(directory)):
        path = os.path.normpath(directory + "/" + entry)
        if not os.path.exists(path):
            print("I can't find", path, "Are you sure this file exists in the directory you supplied?")
        else:
            found.append(path)
    return found


def _pair_hmms_with_inputs(hmm_paths, inputs, mismatch_message):
    """Attach one HMM to each input positionally; return False (after printing) on a count mismatch."""
    if len(hmm_paths) != len(inputs):
        print(mismatch_message)
        return False
    for h, i in zip(hmm_paths, inputs):
        i.set_hmm(h)
    return True


def _write_preproc_progress(percentage, count, total_counts, rewind):
    """Best-effort progress bar; rewind moves the cursor up one line to overwrite the last bar."""
    try:
        if rewind:
            sys.stdout.write('\033[A')
            sys.stdout.flush()
        sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Genome ' + str(count) + " of " + str(total_counts) + ') at ' + curtime()+"\n")
        sys.stdout.flush()
    except Exception:
        #It's not really a big deal if the progress bar cannot be printed.
        pass


def advance_inputs(genomes = None, proteins = None, hmms = None, genomes_file = None, proteins_file = None, hmms_file = None, output = "FastAAI", threads = 1, verbose = False, db_name = ""):
    """Collect input files, preprocess them in a worker pool and log the run.

    Inputs may be supplied as directories (genomes, proteins, hmms) or as
    list files naming one path per line (genomes_file, proteins_file,
    hmms_file). HMMs are paired positionally with the inputs collected so far.

    Returns the list of preprocessed input objects, or None when the number
    of HMMs does not match the number of inputs. A tab-separated record per
    input is appended to <output>/logs/<db_name stem>_preprocessing_log.txt.
    """
    inputs = []
    hmm_broke = False

    if genomes_file is not None:
        for path in _existing_paths_from_list(genomes_file):
            current_file = input_file(path, output, verbose)
            current_file.set_genome(path)
            inputs.append(current_file)

    if proteins_file is not None:
        for path in _existing_paths_from_list(proteins_file):
            current_file = input_file(path, output, verbose)
            current_file.set_protein(path)
            inputs.append(current_file)

    if hmms_file is not None:
        hmm_pairs = _existing_paths_from_list(hmms_file)
        if not _pair_hmms_with_inputs(hmm_pairs, inputs, "Protein and HMM file counts differ! There must be one HMM per protein, generated from its paired protein! These pairs must be in the same order in your input file!"):
            hmm_broke = True

    if genomes is not None:
        for path in _existing_paths_from_directory(genomes):
            current_file = input_file(path, output, verbose)
            current_file.set_genome(path)
            inputs.append(current_file)

    if proteins is not None:
        for path in _existing_paths_from_directory(proteins):
            current_file = input_file(path, output, verbose)
            current_file.set_protein(path)
            inputs.append(current_file)

    if hmms is not None:
        hmm_pairs = _existing_paths_from_directory(hmms)
        if not _pair_hmms_with_inputs(hmm_pairs, inputs, "Protein and HMM file counts differ! There must be one HMM per protein, generated from its paired protein! These must be in the same alphabetical order in their respective directories!"):
            hmm_broke = True

    if hmm_broke:
        print("FastAAI can't proceed without matching HMM and protein pairs.")
        return None

    total_counts = len(inputs)
    count = 0
    last_pct = 0

    if verbose:
        print("")
        _write_preproc_progress(0, count, total_counts, rewind = False)

    results = []

    kmer_index_ = create_kmer_index()
    pool = multiprocessing.Pool(threads, initializer=initialize_preproc, initargs = (kmer_index_,))

    for res in pool.imap(do_advance, inputs):
        results.append(res)
        if verbose:
            count += 1
            percentage = (count/total_counts)*100
            bar_width = int(percentage/2)
            #Redraw only when the bar grows or the last input finishes. The old
            #check compared undefined names and relied on a bare except.
            if bar_width > last_pct or count == total_counts:
                _write_preproc_progress(percentage, count, total_counts, rewind = True)
                last_pct = bar_width

    pool.close()
    pool.join()

    inputs = results

    log_time = curtime()
    log_path = os.path.normpath(output + "/logs/" + os.path.splitext(os.path.basename(db_name))[0] + "_preprocessing_log.txt")

    #A new log gets a header row; an existing log is appended to.
    is_new_log = not os.path.exists(log_path)
    with open(log_path, "w" if is_new_log else "a") as preproc_log:
        if is_new_log:
            print("log_date", "genome_name", "started_as_a", "start_time", "end_time", "protein_translation_table", "errors", sep = "\t", file = preproc_log)
        for i in inputs:
            print(log_time, i.basename, i.initial_state, i.start_time, i.end_time, i.trans_table, i.err_log, sep = "\t", file = preproc_log)

    return inputs
|
1120
|
+
|
1121
|
+
'''
|
1122
|
+
Utility functions
|
1123
|
+
'''
|
1124
|
+
def prepare_directories(output, status, build_or_query):
    """Create the output directory and the subdirectories a run needs.

    output: path of the top-level output directory; its parent must exist.
    status: 'genome' adds predicted_proteins/ and hmms/; 'protein' adds hmms/.
    build_or_query: "build" adds database/; "query" adds results/.
    logs/ is always created.

    Returns True when every directory exists afterwards, False (after
    printing troubleshooting text) when any of them could not be created.
    """
    if not os.path.exists(output):
        try:
            os.mkdir(output)
        except OSError:
            print("")
            print("FastAAI tried to make output directory: '"+ output + "' but failed.")
            print("")
            print("Troubleshooting:")
            print("")
            print(" (1) Do you have permission to create directories in the location you specified?")
            print(" (2) Did you make sure that all directories other than", os.path.basename(output), "already exist?")
            print("")
            return False

    subdirectories = []
    if status == 'genome':
        subdirectories.extend(["predicted_proteins", "hmms"])
    if status == 'protein':
        subdirectories.append("hmms")
    subdirectories.append("logs")
    if build_or_query == "build":
        subdirectories.append("database")
    if build_or_query == "query":
        subdirectories.append("results")

    try:
        for subdirectory in subdirectories:
            path = os.path.normpath(output + "/" + subdirectory)
            if not os.path.exists(path):
                os.mkdir(path)
    except OSError:
        print("FastAAI was able to create or find", output, "but couldn't make directories there.")
        print("")
        print("This shouldn't happen. Do you have permission to write to that directory?")
        #Previously this failure was reported on screen but still returned True.
        return False

    return True
|
1172
|
+
|
1173
|
+
def check_out_input_files(genomes, proteins, hmms, gf, pf, hf):
    """Validate the combination of supplied inputs and report the starting stage.

    genomes/proteins/hmms are directories and gf/pf/hf are list files; None
    means "not supplied". Returns 'genome', 'protein and HMM' or 'protein',
    or None (after printing the reason) when the combination is not allowed.
    """
    genome_dir, genome_list = genomes is not None, gf is not None
    protein_dir, protein_list = proteins is not None, pf is not None
    hmm_dir, hmm_list = hmms is not None, hf is not None

    #Each file type may be supplied by directory or by file, never both.
    if genome_dir and genome_list:
        print("Supply genomes either by directory or by file, not both.")
        return None
    if protein_dir and protein_list:
        print("Supply proteins either by directory or by file, not both.")
        return None
    if hmm_dir and hmm_list:
        print("Supply HMMs either by directory or by file, not both.")
        return None

    any_genomes = genome_dir or genome_list
    any_proteins = protein_dir or protein_list
    any_hmms = hmm_dir or hmm_list

    #Genomes and proteins are mutually exclusive in any combination.
    if any_genomes and any_proteins:
        print("Supply either genomes or proteins, not both. You can supply proteins and HMMs, but not genomes and proteins.")
        return None

    #HMMs are only usable together with their source proteins.
    if any_hmms and not any_proteins:
        print("If you supply HMMs, you also have to supply the proteins from which they were generated.")
        return None

    if any_genomes:
        print("Starting from genomes")
        return 'genome'

    if any_hmms:
        print("Starting from proteins and HMMs")
        return 'protein and HMM'

    print("Starting from proteins")
    return 'protein'
|
1214
|
+
|
1215
|
+
|
1216
|
+
#Build DB from genomes
|
1217
|
+
|
1218
|
+
def unique_kmers(seq, ksize):
    """Return the sorted unique kmer indices of seq as an int32 numpy array.

    Each window of length ksize is looked up in the global ``kmer_index``;
    a sequence shorter than ksize yields an empty array.
    """
    window_count = len(seq) - ksize + 1
    codes = [kmer_index[seq[start:start + ksize]] for start in range(window_count)]
    #We care about the type because we're working with bytes later.
    return np.unique(codes).astype(np.int32)
|
1225
|
+
|
1226
|
+
#Quickly creates a dict of all poss. tetramers in a fixed, alphabetical order.
|
1227
|
+
#This can be used to index kmers so that the indices are identical (and thus interchangable) on separate runs of this program.
|
1228
|
+
def create_kmer_index():
    """Map every possible tetramer over the 22-symbol alphabet to a fixed int32 index.

    The ordering comes straight from np.meshgrid, so it is the same on every
    run and indices are interchangeable between runs.
    """
    valid_chars = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', '*']

    #One row per tetramer, from AAAA to ****. With meshgrid's default 'xy'
    #indexing the first two axes are swapped, so the second character varies
    #slowest, then the first, third and fourth.
    grid = np.stack(np.meshgrid(valid_chars, valid_chars, valid_chars, valid_chars), -1).reshape(-1, 4)

    tetramers = [''.join(row) for row in grid]
    #int32 comfortably covers the 22**4 possible tetramers.
    positions = np.arange(grid.shape[0], dtype = np.int32)

    return dict(zip(tetramers, positions))
|
1237
|
+
|
1238
|
+
def split_seq(seq, num_grps):
    """Cut seq into num_grps consecutive slices of near-equal length."""
    chunk = 1.0/num_grps*len(seq)
    #Boundaries are rounded so the slices tile seq without gaps or overlap.
    return [seq[int(round(grp*chunk)):int(round((grp+1)*chunk))] for grp in range(num_grps)]
|
1244
|
+
|
1245
|
+
#Gives the (start, end) index pairs needed to split a list of (max_val) genomes into (num_grps) roughly equal groups.
|
1246
|
+
def split_indicies(max_val, num_grps):
    """Return num_grps (start, end) index pairs that tile the range 0..max_val."""
    chunk = 1.0/num_grps*max_val
    return [(round(grp*chunk), round((grp+1)*chunk)) for grp in range(num_grps)]
|
1252
|
+
|
1253
|
+
def list_to_index_dict(list):
    """Map each item to its position; for repeated items the last position wins."""
    return {item: position for position, item in enumerate(list)}
|
1260
|
+
|
1261
|
+
def generate_accessions_index():
    """Return the fixed accession -> position mapping for the known accessions."""
    #The order is significant: positions are the accession ids stored with the data.
    known_accessions = (
        'PF01780.19', 'PF03948.14', 'PF17144.4', 'PF00830.19', 'PF00347.23', 'PF16906.5', 'PF13393.6',
        'PF02565.15', 'PF01991.18', 'PF01984.20', 'PF00861.22', 'PF13656.6', 'PF00368.18', 'PF01142.18', 'PF00312.22', 'PF02367.17',
        'PF01951.16', 'PF00749.21', 'PF01655.18', 'PF00318.20', 'PF01813.17', 'PF01649.18', 'PF01025.19', 'PF00380.19', 'PF01282.19',
        'PF01864.17', 'PF01783.23', 'PF01808.18', 'PF01982.16', 'PF01715.17', 'PF00213.18', 'PF00119.20', 'PF00573.22', 'PF01981.16',
        'PF00281.19', 'PF00584.20', 'PF00825.18', 'PF00406.22', 'PF00177.21', 'PF01192.22', 'PF05833.11', 'PF02699.15', 'PF01016.19',
        'PF01765.19', 'PF00453.18', 'PF01193.24', 'PF05221.17', 'PF00231.19', 'PF00416.22', 'PF02033.18', 'PF01668.18', 'PF00886.19',
        'PF00252.18', 'PF00572.18', 'PF00366.20', 'PF04104.14', 'PF04919.12', 'PF01912.18', 'PF00276.20', 'PF00203.21', 'PF00889.19',
        'PF02996.17', 'PF00121.18', 'PF01990.17', 'PF00344.20', 'PF00297.22', 'PF01196.19', 'PF01194.17', 'PF01725.16', 'PF00750.19',
        'PF00338.22', 'PF00238.19', 'PF01200.18', 'PF00162.19', 'PF00181.23', 'PF01866.17', 'PF00709.21', 'PF02006.16', 'PF00164.25',
        'PF00237.19', 'PF01139.17', 'PF01351.18', 'PF04010.13', 'PF06093.13', 'PF00828.19', 'PF02410.15', 'PF01176.19', 'PF02130.17',
        'PF01948.18', 'PF01195.19', 'PF01746.21', 'PF01667.17', 'PF03874.16', 'PF01090.19', 'PF01198.19', 'PF01250.17', 'PF17136.4',
        'PF06026.14', 'PF03652.15', 'PF04019.12', 'PF01201.22', 'PF00832.20', 'PF01264.21', 'PF03840.14', 'PF00831.23', 'PF00189.20',
        'PF02601.15', 'PF01496.19', 'PF00411.19', 'PF00334.19', 'PF00687.21', 'PF01157.18', 'PF01245.20', 'PF01994.16', 'PF01632.19',
        'PF00827.17', 'PF01015.18', 'PF00829.21', 'PF00410.19', 'PF00833.18', 'PF00935.19', 'PF01992.16',
    )

    return {accession: position for position, accession in enumerate(known_accessions)}
|
1278
|
+
|
1279
|
+
#Master function for building or adding to a DB with genomes.
|
1280
|
+
def _write_db_progress(count, total_counts, done_style, rewind):
    """Best-effort progress bar for add_inputs.

    done_style picks the '... done at <time> )' line ending over the
    '... ) at <time>' one; rewind moves the cursor up one line first so the
    previous bar is overwritten.
    """
    try:
        percentage = (count/total_counts)*100
        if done_style:
            tail = ' done at '+ curtime() + " )\n"
        else:
            tail = ' ) at ' + curtime() + "\n"
        if rewind:
            sys.stdout.write('\033[A')
            sys.stdout.flush()
        sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(count) + " of " + str(total_counts) + tail)
        sys.stdout.flush()
    except Exception:
        #It's not really a big deal if the progress bar cannot be printed.
        pass


#Master function for building or adding to a DB with genomes.
def add_inputs(output_path, parent_path, existing_index, threads, verbose, prep_args):
    """Preprocess the supplied inputs and add them to the database at parent_path.

    output_path: run output directory; a temp/ subdirectory is used for child DBs.
    parent_path: path of the database being built or extended.
    existing_index: genome name -> index mapping of an existing DB, or None.
        When given, it is extended in place with the newly added genomes.
    threads: number of worker processes.
    verbose: print progress bars when True.
    prep_args: sequence of (genomes, proteins, hmms, gf, pf, hf, db_name).

    Returns True on success, False when the inputs could not be prepared or
    the temporary directory could not be created.
    """
    genomes, proteins, hmms, gf, pf, hf, db_name = prep_args[0], prep_args[1], prep_args[2], prep_args[3], prep_args[4], prep_args[5], prep_args[6]

    print("")
    print("FastAAI is formatting your files to be saved to your database.")

    inputs = advance_inputs(genomes = genomes, proteins = proteins, hmms = hmms, genomes_file = gf, proteins_file = pf, hmms_file = hf, output = output_path, threads = threads, verbose = verbose, db_name = db_name)

    if inputs is None:
        return False

    genome_index = {}
    next_index = 0

    #Build upon the genome indexing of an existing DB
    if existing_index is not None:
        genome_index = existing_index
        #zero indexing makes this the next number to add.
        next_index = len(existing_index)

    final_db = fastaai_database(parent_path)
    final_db.activate_connection()
    final_db.initialize_parent_database()

    #This goes to the genome_index table
    protein_counts_to_add = []
    genome_acc_kmer_counts_to_add = []

    acc_index = generate_accessions_index()

    #accession -> {genome index -> kmer array}
    readied_kmers_by_acc = {}

    for file in inputs:
        genome = file.basename

        #Avoid adding duplicate genomes
        if genome not in genome_index:
            protein_counts_to_add.append((genome, next_index, file.protein_count))
            for prot in file.protein_kmer_count:
                genome_acc_kmer_counts_to_add.append((next_index, acc_index[prot], file.protein_kmer_count[prot]))
            genome_index[genome] = next_index
            next_index += 1

        this_index = genome_index[genome]
        for acc in file.best_hits_kmers:
            readied_kmers_by_acc.setdefault(acc, {})[this_index] = file.best_hits_kmers[acc]
        #Clean up space
        file.best_hits_kmers = None

    inputs = None

    genomes_per_acc = {}
    for acc in readied_kmers_by_acc:
        genomes_per_acc[acc] = list(readied_kmers_by_acc[acc].keys())
        final_db.add_genomes_first(acc, readied_kmers_by_acc[acc])
        #Release each accession's kmers as soon as they have been handed over.
        readied_kmers_by_acc[acc] = None

    readied_kmers_by_acc = None

    add_genomes = "INSERT OR REPLACE INTO genome_index VALUES (?, ?, ?)"
    add_proteins = "INSERT OR REPLACE INTO genome_acc_kmer_counts VALUES (?, ?, ?)"

    final_db.cursor.executemany(add_genomes, protein_counts_to_add)
    final_db.cursor.executemany(add_proteins, genome_acc_kmer_counts_to_add)
    final_db.connection.commit()

    final_db.cursor.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")
    final_db.connection.commit()

    protein_counts_to_add = None
    genome_acc_kmer_counts_to_add = None

    temp_dir = os.path.normpath(output_path+"/temp")

    #One child job per accession: accession, child DB name, output dir, parent DB,
    #the genome indices carrying that accession, and the genome index.
    child_args = []
    for i, accession in enumerate(genomes_per_acc):
        name = "accession_" + accession + "_partition_" + str(i)
        child_args.append([accession, name, temp_dir, parent_path, genomes_per_acc[accession], genome_index])

    print("")
    print("Formatting data to add to database at", curtime())

    if not os.path.exists(temp_dir):
        try:
            os.mkdir(temp_dir)
        except OSError:
            print("Output directory failed to create! Cannot continue.")
            #Do not leave the parent database connection open on the way out.
            final_db.close_connection()
            return False

    total_counts = len(child_args)
    count = 0

    if verbose:
        print("")
        _write_db_progress(count, total_counts, done_style = False, rewind = False)

    quiverfull = []

    pool = multiprocessing.Pool(threads)

    for result in pool.imap_unordered(produce_children, child_args):
        quiverfull.append([result[0], result[1]])

        if verbose:
            count += 1
            _write_db_progress(count, total_counts, done_style = True, rewind = True)

    pool.close()
    pool.join()

    print("")
    print("Adding data to final database.")

    if verbose:
        print("")
        count = 0
        #This initial bar used to be built but never written to stdout.
        _write_db_progress(count, total_counts, done_style = True, rewind = False)

    for acc, child in quiverfull:
        final_db.add_child_to_parent(acc, child)

        if verbose:
            count += 1
            _write_db_progress(count, total_counts, done_style = True, rewind = True)

    print("")

    final_db.close_connection()

    os.rmdir(temp_dir)

    return True
|
1472
|
+
|
1473
|
+
#genome_index is passed in through args (it is not read from a global).
|
1474
|
+
def produce_children(args):
    """Build the child database for one accession (pool worker).

    args is [accession, partition name, output directory, parent DB path,
    genome indices having this accession, genome index]. Returns
    [value returned by add_accession, child DB path].
    """
    acc, partition, output_base = args[0], args[1], args[2]
    parent_db, genomes_in_this_acc, genome_index = args[3], args[4], args[5]

    #Fetch this accession's rows for the requested genomes from the parent DB.
    table_prefix = acc.replace('.', '_')
    placeholders = ','.join(['?']*len(genomes_in_this_acc))
    read_parent_sql = "SELECT * FROM " + table_prefix + "_genomes WHERE genome IN (" + placeholders + ")"

    parental_database = fastaai_database(parent_db)
    parental_database.activate_connection()
    rows = parental_database.cursor.execute(read_parent_sql, genomes_in_this_acc).fetchall()
    #Keys are genomes as indices; values are presumably numpy arrays of kmers (TODO confirm).
    genomes_for_this_acc = dict(rows)
    parental_database.close_connection()

    child_db = os.path.normpath(output_base + "/" + partition + ".db")

    this_child = child_database(child_db, parent_db)
    this_child.activate_child_connection()
    this_child.activate_parent_connection()

    #Invert genome -> kmers into kmer -> genomes.
    genomes_by_kmer = {}
    for genome in genomes_for_this_acc:
        for kmer in genomes_for_this_acc[genome]:
            genomes_by_kmer.setdefault(kmer, []).append(genome)
        #cleanup space
        genomes_for_this_acc[genome] = None

    del genomes_for_this_acc

    readied_kmers = {kmer: np.array(owners, dtype = np.int32) for kmer, owners in genomes_by_kmer.items()}
    del genomes_by_kmer

    sql_friendly_accession = this_child.add_accession(acc, readied_kmers)

    this_child.close_parent_connection()
    this_child.close_child_connection()

    del readied_kmers

    return [sql_friendly_accession, child_db]
|
1527
|
+
|
1528
|
+
#Build or add to a FastAAI DB
|
1529
|
+
def build_db_opts():
    """Build the argument parser for the database-building module and parse the command line.

    Returns a ``(parser, args)`` tuple. Parsing uses ``parse_known_args``, so
    command-line options this parser does not define are ignored, not rejected.
    """
    #Shown verbatim in --help because the parser uses RawTextHelpFormatter.
    module_description = '''
This FastAAI module allows you to create a FastAAI database from one or many genomes, proteins, or proteins and HMMs, or add these files to an existing one.

Supply genomes OR proteins OR proteins AND HMMs as inputs.

If you supply genomes, FastAAI will predict proteins from them, and HMMs will be created from those proteins
If you supply only proteins, FastAAI will create HMM files from them, searching against FastAAI's internal database
If you supply proteins AND HMMs, FastAAI will directly use them to build the database.\n
You cannot supply both genomes and proteins
'''

    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
            description=module_description)

    #(flags, keyword settings) in the order they should appear in --help.
    option_table = [
        (('-g', '--genomes'), dict(dest = 'genomes', default = None, help = 'A directory containing genomes in FASTA format.')),
        (('-p', '--proteins'), dict(dest = 'proteins', default = None, help = 'A directory containing protein amino acids in FASTA format.')),
        (('-m', '--hmms'), dict(dest = 'hmms', default = None, help = 'A directory containing the results of an HMM search on a set of proteins.')),
        (('-d', '--database'), dict(dest = 'db_name', default = "FastAAI_database.sqlite.db", help = 'The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory. FastAAI_database.sqlite.db by default.')),
        (('-o', '--output'), dict(dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')),
        (('--genome_file',), dict(dest = 'gf', default = None, help = 'Alternative way to supply genomes. A file containing paths to your genome files, 1 per line.')),
        (('--protein_file',), dict(dest = 'pf', default = None, help = 'Alternative way to supply proteins. A file containing paths to your protein files, 1 per line.')),
        (('--hmm_file',), dict(dest = 'hf', default = None, help = 'Alternative way to supply HMMs. A file containing paths to your HMM files, 1 per line.')),
        (('--threads',), dict(dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')),
        (('--verbose',), dict(dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')),
    ]
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    parsed = parser.parse_known_args()

    return parser, parsed[0]
|
1559
|
+
|
1560
|
+
def build_db(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose):
    """Create a FastAAI database, or add inputs to an existing one.

    The inputs are validated with ``check_out_input_files`` and the output
    directories prepared with ``prepare_directories``. If ``db_name`` contains
    no path separator the database is placed under ``<output>/database/``;
    otherwise ``db_name`` is used as given. When the database file already
    exists its ``genome_index`` table is read so the existing genome IDs can be
    handed to ``add_inputs``.

    Returns False when input checking, directory preparation, or reading an
    existing database fails; otherwise returns whatever ``add_inputs`` returns.
    """
    start = check_out_input_files(genomes, proteins, hmms, gf, pf, hf)

    #If something failed, we stop.
    if start is None:
        return False

    if not prepare_directories(output, start, "build"):
        return False

    #Check if the db contains path info. Incl. windows version.
    if "/" not in db_name and "\\" not in db_name:
        final_database = os.path.normpath(output + "/database/" + db_name)
    else:
        #If the person insists that the db has a path, let them.
        final_database = db_name

    #Stays None when there is no existing database to read from.
    existing_genome_IDs = None
    try:
        if os.path.exists(final_database):
            parent = fastaai_database(final_database)
            parent.activate_connection()
            try:
                index_rows = parent.cursor.execute("SELECT genome, gen_id FROM genome_index").fetchall()
                existing_genome_IDs = {genome: int(gen_id) for genome, gen_id in index_rows}
            finally:
                #Release the connection even when the file is not a usable FastAAI database.
                parent.close_connection()
    except Exception:
        #Exception rather than a bare except, so Ctrl-C and sys.exit() are not
        #misreported as "not a FastAAI database".
        print("You specified an existing file to be a database, but it does not appear to be a FastAAI database.")
        print("FastAAI will not be able to continue. Please give FastAAI a different database name and continue.")
        print("Exiting.")
        return False

    prep_args = [genomes, proteins, hmms, gf, pf, hf, db_name]

    #inputs, output_path, parent_path, existing_index, threads
    success = add_inputs(output, final_database, existing_genome_IDs, threads, verbose, prep_args)

    if success:
        print("Database build complete!")

    return success
|
1611
|
+
|
1612
|
+
|
1613
|
+
#DB query functionality - unlimited version
|
1614
|
+
def do_query_vs_target_aai_only(query_name, target_name, threads, output, precision, verbose):
    """Run an all-query vs. all-target AAI search and write per-query result files.

    Steps, as implemented below:
      1. Find the accessions present in both databases and estimate a "load"
         for each with ``load_getter``.
      2. Greedily assign accessions (largest load first) to ``threads`` batches.
      3. Start ``threads`` ``accession_worker`` processes that send back
         ``[query genome, vector]`` messages, which are summed into a
         (query genomes x target genomes) matrix.
      4. Store the matrix rows in a temporary calculation database under
         ``<output>/temp`` and let ``finish_jaccards`` write the result files.

    ``precision`` must be "low", "med" or "high" (float16 / float32 / float64
    for the accumulated matrix); anything else raises ValueError.

    Returns None.
    """
    precision_types = {"low": np.float16, "med": np.float32, "high": np.float64}
    if precision not in precision_types:
        #Fail up front instead of hitting an undefined name after all the setup work.
        raise ValueError("precision must be one of 'low', 'med' or 'high', got " + repr(precision))
    jacc_precision = precision_types[precision]

    temp_dir = os.path.normpath(output+"/temp")
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)

    #Save the file paths.
    query = fastaai_database(query_name)
    target = fastaai_database(target_name)

    query.activate_connection()
    query.just_accessions()
    query_len = query.cursor.execute("SELECT Count(*) FROM genome_index").fetchall()[0][0]

    target.activate_connection()
    target.just_accessions()
    target_len = target.cursor.execute("SELECT Count(*) FROM genome_index").fetchall()[0][0]

    print("FastAAI will search", query_len, "query genomes against", target_len, "target genomes.")

    print("")
    print("FastAAI is preparing your AAI search... ", end = '', flush = True)

    accessions_in_common = list(set(query.accessions).intersection(target.accessions))

    query.accessions = None
    target.accessions = None

    #Connections are closed before the database objects are handed to other processes.
    query.close_connection()
    target.close_connection()

    load_args = [(query, target, acc) for acc in accessions_in_common]

    loads = []
    ordered_accs = []

    pool = multiprocessing.Pool(threads)

    for load, acc in pool.imap(load_getter, load_args):
        #Load will be None if the accession is in both query and target, but they still don't share even a single Kmer. Unlikely, but it happened once, so it WILL happen again.
        if load is not None:
            loads.append(load)
            ordered_accs.append(acc)

    pool.close()
    pool.join()

    loads = np.array(loads)
    ordered_accs = np.array(ordered_accs)

    #Heaviest accessions first.
    order = loads.argsort()[::-1]
    loads = loads[order]
    ordered_accs = ordered_accs[order]

    load_balancer = {}
    accs_per_load = {}
    for i in range(0, threads):
        load_balancer[i] = 0
        accs_per_load[i] = []

    for i in range(0, loads.shape[0]):
        #First batch with the smallest accumulated load gets the next accession.
        index = min(load_balancer, key = load_balancer.get)
        load_balancer[index] += loads[i]
        accs_per_load[index].append(int(ordered_accs[i]))

    del loads
    del ordered_accs
    del load_balancer

    print("done!")
    if verbose:
        print("FastAAI has balanced the workload of calculating AAI from your data.")
        for index in accs_per_load:
            print("Thread", index, "will handle", len(accs_per_load[index]), "accessions.")
    print("FastAAI is beginning the calculation of AAI between your query and target genomes.")

    input_queue = multiprocessing.Queue()
    output_queue = multiprocessing.Queue()

    for thread in accs_per_load:
        input_queue.put(accs_per_load[thread])

    #One stop marker per worker.
    for i in range(0, threads):
        input_queue.put('STOP')

    for i in range(0, threads):
        multiprocessing.Process(target=accession_worker, args=(input_queue, output_queue, query, target, query_len, target_len, jacc_precision)).start()

    print("")

    results = np.zeros(shape = (query_len, target_len), dtype = jacc_precision)

    #Counter to keep the threads running until the whole process is done.
    workers_running = threads
    while workers_running > 0:
        message = output_queue.get()
        if isinstance(message, str):
            #A worker sends a plain string once it has finished; data messages are lists.
            workers_running -= 1
            continue
        #message is [query genome index, per-target vector]. A failure here is a
        #real error and is allowed to propagate instead of being counted as a
        #finished worker.
        results[message[0]] += message[1]

    print("AAI calculations complete. Formatting results for writing.")

    rdb_name = os.path.normpath(output+"/temp/aai_calc_db.db")
    rdb = calculation_database(rdb_name, precision)
    rdb.activate_connection()
    rdb.initialize_database()

    #One (genome index, raw row bytes) tuple per query genome.
    insertable = [(i, results[i].tobytes()) for i in range(0, query_len)]

    rdb.cursor.executemany("INSERT INTO jaccards VALUES (?, ?)", (insertable))
    rdb.connection.commit()

    rdb.close_connection()

    del insertable
    del results

    #Now we split the query genomes into chunk and have threads process each chunk in parallel with its respective shared prot counts.
    query_chunks = split_indicies(query_len, threads)
    query_args = [([rdb_name], query_chunks[i], output, query, target, precision) for i in range(0, threads)]

    print("Results formatted. Writing results starting at", curtime())

    pool = multiprocessing.Pool(threads)
    pool.map(finish_jaccards, query_args)
    pool.close()
    pool.join()

    os.remove(rdb_name)

    print("FastAAI complete! Results at:", os.path.normpath(output+"/results/"))

    return None
|
1772
|
+
|
1773
|
+
#Assess the number of comparisons that will have to be made to complete an accession so that balanced loads can be passed to threads
|
1774
|
+
def load_getter(args):
    """Estimate the comparison workload for one accession shared by query and target.

    ``args`` is ``(query, target, accession)``: two database objects and the
    accession's numeric index (as used by ``generate_accessions_index``).

    Returns ``[load, accession]``. ``load`` is an int computed from the summed
    byte lengths of the ``genomes`` column for the kmers both databases share,
    or None when they share no kmer (or a sum comes back NULL).
    """
    query, target, accession = args[0], args[1], args[2]

    query.activate_connection()
    target.activate_connection()

    #numeric accession index -> table name (dots are not usable in table names)
    table_by_index = {}
    name_to_index = generate_accessions_index()
    for accession_name in name_to_index:
        table_by_index[name_to_index[accession_name]] = accession_name.replace(".", "_")

    table = table_by_index[accession]
    kmer_sql = "SELECT kmer FROM "+ table

    #Return bare values instead of 1-tuples while collecting kmers.
    def first_column(cursor, row):
        return row[0]

    query.cursor.row_factory = first_column
    target.cursor.row_factory = first_column

    query_kmers = set(query.cursor.execute(kmer_sql).fetchall())
    shared_kmers = list(query_kmers.intersection(target.cursor.execute(kmer_sql).fetchall()))

    query.cursor.row_factory = None
    target.cursor.row_factory = None

    load = None
    if len(shared_kmers) > 0:
        placeholders = ','.join(['?'] * len(shared_kmers))
        bytes_sql = "SELECT sum(length(genomes)) FROM " + table + " WHERE kmer IN (" + placeholders + ")"

        tgt_res = target.cursor.execute(bytes_sql, shared_kmers).fetchone()[0]
        query_res = query.cursor.execute(bytes_sql, shared_kmers).fetchone()[0]

        #Both sums *should* be present whenever kmers are shared.
        if tgt_res is not None and query_res is not None:
            load = int(tgt_res/(4096) * query_res/(4096))

    query.close_connection()
    target.close_connection()

    return [load, accession]
|
1813
|
+
|
1814
|
+
def accession_worker(in_queue, out_queue, query, target, qlen, tlen, prec):
    """Worker process: accumulate per-accession Jaccard values for batches of accessions.

    Batches (lists of accession indices) are taken from ``in_queue`` until the
    string 'STOP' is received. For every query genome loaded for a batch, one
    ``[genome, vector]`` message is put on ``out_queue``, where ``vector`` is a
    float64 array of length ``tlen`` holding the sum of that genome's Jaccard
    values over the batch's accessions. The string "Based" is put on
    ``out_queue`` once, when the worker is finished.

    ``qlen`` and ``prec`` are accepted but not used in this function.
    """
    #numeric accession index -> table name
    name_to_index = generate_accessions_index()
    table_by_index = {}
    for accession_name in name_to_index:
        table_by_index[name_to_index[accession_name]] = accession_name.replace(".", "_")

    query.activate_connection()
    target.activate_connection()
    query.load_genome_index()
    target.load_genome_index()

    def first_column(cursor, row):
        return row[0]

    while True:
        batch = in_queue.get()
        if batch == 'STOP':
            break

        target.load_accessions(permitted_accessions = batch)
        query.load_accessions(permitted_accessions = batch)

        query_rows_by_acc = {}
        target_rows_by_acc = {}

        for acc in batch:
            table = table_by_index[acc].replace('.', '_')

            query_rows_by_acc[acc] = dict(query.cursor.execute("SELECT * FROM "+table+"_genomes").fetchall())

            #Collect the query's kmers for this accession as bare values.
            query.cursor.row_factory = first_column
            kmers_in_query = list(query.cursor.execute("SELECT kmer FROM "+table).fetchall())
            query.cursor.row_factory = None

            #Only fetch target rows for kmers the query actually has.
            placeholders = ','.join(['?'] * len(kmers_in_query))
            target_sql = "SELECT * FROM " + table + " WHERE kmer in (" + placeholders + ")"
            target_rows_by_acc[acc] = dict(target.cursor.execute(target_sql, kmers_in_query).fetchall())

        target_counts_by_acc = {acc: np.zeros(tlen, dtype = np.int16) for acc in batch}

        for genome in target.gak:
            for acc in target.gak[genome]:
                target_counts_by_acc[acc][genome] = target.gak[genome][acc]

        #No longer needed.
        target.gak = None

        #We want each thread to report every single genome
        for genome in query.gak:
            jaccard_sum = np.zeros(tlen, dtype = np.float64)
            for acc in query.gak[genome]:
                hits = np.zeros(tlen, dtype = np.int16)
                genome_kmers = query_rows_by_acc[acc][genome]
                kmer_total = genome_kmers.shape
                targets_for_kmer = target_rows_by_acc[acc]
                for kmer in genome_kmers:
                    if kmer in targets_for_kmer:
                        hits[targets_for_kmer[kmer]] += 1

                #intersection / (query count + target count - intersection)
                union = np.subtract(np.add(kmer_total, target_counts_by_acc[acc]), hits)
                jaccard_sum += np.divide(hits, union)

            out_queue.put([genome, jaccard_sum])

    target.close_connection()
    query.close_connection()

    #Tell the parent this worker is done.
    out_queue.put("Based")

    return None
|
1881
|
+
|
1882
|
+
def finish_jaccards(args):
    """Turn stored Jaccard sums into per-query result files for one chunk of query genomes.

    ``args`` is ``(partial_dbs, my_query_genomes, output, query, target, prec)``:
      * partial_dbs      - calculation database names holding a ``jaccards`` table
      * my_query_genomes - ``(start, stop)``; genomes ``start`` .. ``stop - 1`` are handled
      * output           - base directory; files go to ``<output>/results/``
      * query, target    - database objects
      * prec             - passed through to ``calculation_database``

    For each query genome the stored vectors are summed over all partial
    databases, divided by the number of accessions shared with each target, and
    written as tab-separated lines: query, target, value, "N/A", shared count,
    possible hits, AAI estimate. A target with the same name as the query gets
    fixed 100.0 / 0.0 / 100.0 values. Returns None.
    """
    partial_dbs, my_query_genomes, output, query, target, prec = args[0], args[1], args[2], args[3], args[4], args[5]

    query.activate_connection()
    target.activate_connection()
    query.load_genome_index()
    target.load_genome_index()

    selected_query_genomes = range(my_query_genomes[0], my_query_genomes[1])
    offset = my_query_genomes[0]

    target_len = len(target.genome_index)
    query_len = my_query_genomes[1] - my_query_genomes[0]

    #get shared protein counts
    query.load_accessions(permitted_genomes = selected_query_genomes)

    #NOTE(review): fixed width of the accession axis; presumably the number of
    #accessions FastAAI knows about - confirm against generate_accessions_index.
    max_acc = 122

    #Presence/absence of each accession, per query genome (rows) ...
    query_set = np.zeros(shape = (query_len, max_acc), dtype = np.int16)
    for g in query.gak:
        query_set[(g-offset), list(query.gak[g])] += 1

    #... and per target genome (columns).
    target_set = np.zeros(shape = (max_acc, len(target.genome_index)), dtype = np.int16)

    target.load_accessions()

    target_protein_counts = np.zeros(target_len, dtype = np.int16)
    for t in target.gak:
        target_set[list(target.gak[t]), t] += 1
        target_protein_counts[t] = len(target.gak[t])

    #Entry [q, t] is the number of accessions query q and target t both have.
    shared_prot_counts_by_genome = np.dot(query_set, target_set)

    del query_set
    del target_set

    target.gak = None

    query.close_connection()
    target.close_connection()

    activated_DBs = []
    try:
        for db in partial_dbs:
            calc_db = calculation_database(db, prec)
            calc_db.activate_connection()
            activated_DBs.append(calc_db)

        for genome in selected_query_genomes:
            sql = "SELECT jaccards FROM jaccards WHERE genome="+str(genome)
            total_jaccs = np.zeros(target_len, dtype = np.float64)
            shared_acc_counts = shared_prot_counts_by_genome[genome - offset]
            for db in activated_DBs:
                total_jaccs += db.cursor.execute(sql).fetchone()[0]

            total_jaccs = np.divide(total_jaccs, shared_acc_counts)

            aai_est = numpy_kaai_to_aai(total_jaccs)

            no_hit = np.where(shared_acc_counts == 0)
            #Actual hits is already stored in shared_acc_counts
            possible_hits = np.minimum(len(query.gak[genome]), target_protein_counts).astype(str)

            total_jaccs = np.round(total_jaccs, 4).astype(str)
            shared_acc_counts = shared_acc_counts.astype(str)

            #Targets sharing no accession with this query report N/A everywhere.
            total_jaccs[no_hit] = "N/A"
            aai_est[no_hit] = "N/A"
            shared_acc_counts[no_hit] = "N/A"
            possible_hits[no_hit] = "N/A"

            name = query.reverse_genome_index[genome]
            output_file = output +"/results/"+name+"_results.txt"

            #Context manager so the file is closed even if a write fails.
            with open(output_file, "w") as fh:
                for tgt in range(0, target_len):
                    target_name = target.reverse_genome_index[tgt]
                    if target_name == name:
                        fh.write(name+"\t"+target_name+"\t"+"100.0"+"\t"+"0.0"+"\t"+shared_acc_counts[tgt]+"\t"+possible_hits[tgt]+"\t"+"100.0"+"\n")
                    else:
                        fh.write(name+"\t"+target_name+"\t"+total_jaccs[tgt]+"\t"+"N/A"+"\t"+shared_acc_counts[tgt]+"\t"+possible_hits[tgt]+"\t"+aai_est[tgt]+"\n")
    finally:
        #Close whatever calculation databases were opened, success or not.
        for db in activated_DBs:
            db.close_connection()

    return None
|
1982
|
+
|
1983
|
+
|
1984
|
+
#Here's the DB SQL querying functionality/limited version.
|
1985
|
+
def do_query_vs_target_sql(query, target, threads, output, verbose, do_stdev):
    """Search every genome of a query database against a target database via SQL lookups.

    ``query`` and ``target`` are handed to ``fastaai_database``. All query
    data is loaded into memory; each query genome is then processed by a pool
    worker (``do_sql_query`` when ``do_stdev`` is true, otherwise
    ``do_sql_query_no_SD``), which receives the target database object and the
    ``<output>/results`` directory. Per-accession target kmer counts and
    per-genome target protein counts are passed to the pool initializer
    ``sql_query_thread_starter``.

    With ``verbose`` a progress bar is redrawn as query genomes finish.
    Returns None.
    """
    #Save the file paths.
    query_name, target_name = query, target

    query = fastaai_database(query_name)
    query.activate_connection()
    query.load_genome_index()
    query.just_accessions()

    #NOTE(review): the returned index is not used below; the call is kept
    #because the helper is defined outside this view.
    converter = generate_accessions_index()

    acc_sql = "SELECT name FROM sqlite_master WHERE type='table'"
    table_names = [item[0] for item in query.cursor.execute(acc_sql).fetchall()]

    #(table name, accession name) for every "<accession>_genomes" table.
    cleaned_tables = []
    for table in table_names:
        if table.endswith("_genomes"):
            cleaned_tables.append((table, table.split("_genomes")[0].replace("_", ".")))

    del table_names

    #Go through tables and load data: genome -> accession -> second column of the row.
    query_acc_kmers = defaultdict(dict)

    sys.stdout.write("\n")
    sys.stdout.write("Loading query data at " + curtime() + " ...\n")
    sys.stdout.flush()

    for table, accession in cleaned_tables:
        for result in query.cursor.execute("SELECT * FROM " + table).fetchall():
            query_acc_kmers[result[0]][accession] = result[1]

    query.close_connection()

    sys.stdout.write("\n")
    sys.stdout.write("Loading target data at " + curtime() + " ...\n")
    sys.stdout.flush()

    target = fastaai_database(target_name)
    target.activate_connection()
    target.load_genome_index()
    target.load_accessions()
    target.close_connection()

    results_dir = os.path.normpath(output+"/results")
    query_args = []
    for genome in query_acc_kmers:
        query_args.append((target, query.reverse_genome_index[genome], query_acc_kmers[genome], results_dir))

    detected_query_accs = query.accessions
    query_length = len(query.genome_index)

    #Cleanup
    del query
    del query_acc_kmers

    target_len = len(target.gak)

    #Per shared accession: kmer count of every target genome (0 where absent).
    target_kmer_cts = {}
    for accession in np.intersect1d(detected_query_accs, target.accessions):
        target_kmer_cts[accession] = np.zeros(target_len, dtype = np.int16)
        for g in target.gak:
            if accession in target.gak[g]:
                target_kmer_cts[accession][g] = target.gak[g][accession]

    #Number of accessions each target genome has.
    target_protein_counts = np.zeros(target_len, dtype = np.int16)
    for g in target.gak:
        target_protein_counts[g] = len(target.gak[g])

    target.gak = None

    sys.stdout.write("\n")
    sys.stdout.write("FastAAI will search "+ str(query_length) + " query genomes against " + str(target_len) + " target genomes.\n")
    sys.stdout.write("\n")

    count = 0
    total = len(query_args)

    #Always end this line, so the final message does not get glued onto it when
    #verbose is off.
    sys.stdout.write("Beginning AAI calculation at " + curtime() + "\n")
    sys.stdout.flush()

    def progress_line(done, percentage):
        return "Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Query genome ' + str(done) + " of " + str(total) + ' done at '+curtime()+')\n'

    last_pct = 0
    if verbose:
        #progress bar - possible dangerous use of the return to line start sequence.
        try:
            sys.stdout.write(progress_line(count, 0))
            sys.stdout.flush()
        except Exception:
            #It's not really a big deal if the progress bar cannot be printed.
            pass

    pool = multiprocessing.Pool(threads, initializer = sql_query_thread_starter, initargs = (target_kmer_cts, target_protein_counts,))

    #Same driver loop for both modes; only the worker function differs.
    worker = do_sql_query if do_stdev else do_sql_query_no_SD

    #Process as we go.
    for _ in pool.imap(worker, query_args):
        if verbose:
            try:
                count += 1
                percentage = (count/total)*100
                #Redraw only when the bar gains a '#' or the run is finished.
                if int(percentage/2) > last_pct or count == total:
                    sys.stdout.write('\033[A')
                    sys.stdout.write(progress_line(count, percentage))
                    sys.stdout.flush()
                    last_pct = int(percentage/2)
            except Exception:
                #It's not really a big deal if the progress bar cannot be printed.
                pass

    pool.close()
    pool.join()

    print("AAI calculation complete! Results at:", os.path.normpath(output+"/results"))

    return None
|
2131
|
+
|
2132
|
+
#This can also take the genomes-first formatted prots in the DB and search them memory-efficiently, if not time efficiently.
|
2133
|
+
#NOTE(review): reads the module-level names target_kmer_cts and target_protein_counts,
#presumably installed in each pool worker by sql_query_thread_starter - confirm.
def do_sql_query(args):
    """Compare one query genome against every target genome and write a result file with SDs.

    ``args`` is ``(database, name, acc_kmers, temp_out)``: the target database
    object, the query genome's name, a mapping of accession name -> array of
    that genome's kmers, and the directory to write into.

    For every query accession also present in the target database, a Jaccard
    value per target genome is computed from the kmers found via SQL. The
    output file ``<temp_out>/<name>_results.txt`` has one tab-separated line
    per target: query, target, mean Jaccard, Jaccard SD, shared accession
    count, possible hits, AAI estimate ("N/A" where nothing is shared).

    Returns the path of the file written.
    """
    #NOTE(review): this return value is not used in this function; the call is
    #kept because the helper is defined outside this view.
    kmer_index = create_kmer_index()
    accession_index = generate_accessions_index()
    #database, file.basename, file.best_hits_kmers, os.path.normpath(output+"/temp")
    database, name, acc_kmers, temp_out = args[0],args[1],args[2],args[3]

    database.activate_connection()

    target_len = len(database.genome_index)

    #One row per query accession, one column per target genome.
    results = np.zeros(shape = (len(acc_kmers), target_len), dtype = np.float64)

    shared_acc_counts = np.zeros(target_len, dtype = np.int16)

    for row, accession in enumerate(acc_kmers):
        acc_index = accession_index[accession]
        sql_friendly_accession = accession.replace(".", "_")
        if acc_index in database.accessions:
            #The accession was found for this target genome, for each tgt genome.
            shared_acc_counts[np.nonzero(target_kmer_cts[acc_index])] += 1
            these_kmers = [int(kmer) for kmer in acc_kmers[accession]]
            these_intersections = np.zeros(target_len, dtype = np.int16)
            sql_query = "SELECT genomes FROM " + sql_friendly_accession + " WHERE kmer in ({kmers})".format(kmers=','.join(['?']*len(these_kmers)))
            for result in database.cursor.execute(sql_query, these_kmers):
                these_intersections[result] += 1

            #intersection / (query kmers + target kmers - intersection)
            results[row] = np.divide(these_intersections, np.subtract(np.add(acc_kmers[accession].shape[0], target_kmer_cts[acc_index]), these_intersections))

    database.close_connection()

    #These are the jacc averages
    jaccard_averages = np.divide(np.sum(results, axis = 0), shared_acc_counts)

    #NOTE(review): rows for accessions a target does not have stay 0 and so
    #contribute (0 - mean)^2 to the sum below, while the divisor counts only the
    #shared accessions. This looks like it inflates the SD - confirm intent.
    #Get the differences from the mean per hit
    results = results - jaccard_averages
    #Square them
    results = np.square(results)
    #Sum squares and divide by shared acc. count, the sqrt to get SD.
    jaccard_SDs = np.sqrt(np.divide(np.sum(results, axis = 0), shared_acc_counts))

    aai_est = numpy_kaai_to_aai(jaccard_averages)

    no_hit = np.where(shared_acc_counts == 0)
    #Actual hits is already stored in shared_acc_counts
    possible_hits = np.minimum(len(acc_kmers), target_protein_counts).astype(str)

    jaccard_averages = np.round(jaccard_averages, 4).astype(str)
    jaccard_SDs = np.round(jaccard_SDs, 4).astype(str)

    shared_acc_counts = shared_acc_counts.astype(str)

    #Targets sharing no accession with the query report N/A everywhere.
    jaccard_averages[no_hit] = "N/A"
    aai_est[no_hit] = "N/A"
    jaccard_SDs[no_hit] = "N/A"
    shared_acc_counts[no_hit] = "N/A"
    possible_hits[no_hit] = "N/A"

    output_file = temp_out +"/"+name+"_results.txt"

    #Context manager so the file is closed even if a write fails.
    with open(output_file, "w") as fh:
        for target in range(0, target_len):
            target_name = database.reverse_genome_index[target]
            if target_name == name:
                fh.write(name+"\t"+target_name+"\t"+"100.0"+"\t"+"0.0"+"\t"+shared_acc_counts[target]+"\t"+possible_hits[target]+"\t"+"100.0"+"\n")
            else:
                fh.write(name+"\t"+target_name+"\t"+jaccard_averages[target]+"\t"+jaccard_SDs[target]+"\t"+shared_acc_counts[target]+"\t"+possible_hits[target]+"\t"+aai_est[target]+"\n")

    return output_file
|
2208
|
+
|
2209
|
+
#This can also take the genomes-first formatted prots in the DB and search them memory-efficiently, if not time efficiently.
|
2210
|
+
def do_sql_query_no_SD(args):
    """Query one genome against the target database, skipping Jaccard std. deviations.

    Worker for a multiprocessing pool. Reads the module globals
    ``target_kmer_cts`` and ``target_protein_counts`` that
    ``sql_query_thread_starter`` installs in each worker process.

    Parameters
    ----------
    args : sequence
        ``(database, name, acc_kmers, temp_out)``: the target database object,
        the query's name, a mapping of accession -> array of the query's
        k-mers for that accession, and the directory to write the result to.

    Returns
    -------
    str
        Path of the tab-separated results file, one row per target genome:
        query, target, mean Jaccard, "N/A" (no SD in this mode), shared
        accession count, possible hits, estimated AAI.
    """
    # NOTE(review): the k-mer index is built but never read in this function.
    # Kept because create_kmer_index() is defined elsewhere and its side
    # effects cannot be checked from here - confirm and drop.
    kmer_index = create_kmer_index()
    accession_index = generate_accessions_index()
    #database, file.basename, file.best_hits_kmers, os.path.normpath(output+"/temp")
    database, name, acc_kmers, temp_out = args[0], args[1], args[2], args[3]

    # Stay well below SQLite's host-parameter limit, which is 999 on older
    # builds; a long protein can carry more distinct k-mers than that.
    max_sql_params = 900

    database.activate_connection()
    try:
        target_len = len(database.genome_index)

        results = np.zeros(shape=target_len, dtype=np.float64)
        shared_acc_counts = np.zeros(target_len, dtype=np.int16)

        for accession in acc_kmers:
            acc_index = accession_index[accession]
            sql_friendly_accession = accession.replace(".", "_")
            if acc_index not in database.accessions:
                continue

            #The accession was found for this target genome, for each tgt genome.
            shared_acc_counts[np.nonzero(target_kmer_cts[acc_index])] += 1
            these_kmers = [int(kmer) for kmer in acc_kmers[accession]]
            these_intersections = np.zeros(target_len, dtype=np.int16)

            # Table names cannot be bound as parameters, so only the k-mer
            # values are; they are sent in batches to respect the limit above.
            for batch_start in range(0, len(these_kmers), max_sql_params):
                batch = these_kmers[batch_start:batch_start + max_sql_params]
                placeholders = ','.join(['?'] * len(batch))
                sql_query = "SELECT genomes FROM " + sql_friendly_accession + " WHERE kmer in (" + placeholders + ")"
                for result in database.cursor.execute(sql_query, batch):
                    these_intersections[result] += 1

            # Jaccard = intersection / (query k-mers + target k-mers - intersection)
            results += np.divide(these_intersections, np.subtract(np.add(acc_kmers[accession].shape[0], target_kmer_cts[acc_index]), these_intersections))
    finally:
        # Release the connection even when a query fails.
        database.close_connection()

    #These are the jacc averages
    # Genomes sharing no accession divide 0 by 0; those entries are replaced
    # with "N/A" below, so the warning is only noise.
    with np.errstate(divide="ignore", invalid="ignore"):
        jaccard_averages = np.divide(results, shared_acc_counts)
    del results

    aai_est = numpy_kaai_to_aai(jaccard_averages)

    no_hit = np.where(shared_acc_counts == 0)

    possible_hits = np.minimum(len(acc_kmers), target_protein_counts).astype(str)
    jaccard_averages = np.round(jaccard_averages, 4).astype(str)
    shared_acc_counts = shared_acc_counts.astype(str)

    jaccard_averages[no_hit] = "N/A"
    aai_est[no_hit] = "N/A"
    shared_acc_counts[no_hit] = "N/A"
    possible_hits[no_hit] = "N/A"

    output_file = temp_out + "/" + name + "_results.txt"

    with open(output_file, "w") as fh:
        for target in range(0, target_len):
            target_name = database.reverse_genome_index[target]
            if target_name == name:
                # A genome compared with itself is reported as a perfect match.
                fh.write(name+"\t"+target_name+"\t"+"100.0"+"\t"+"0.0"+"\t"+shared_acc_counts[target]+"\t"+possible_hits[target]+"\t"+"100.0"+"\n")
            else:
                fh.write(name+"\t"+target_name+"\t"+jaccard_averages[target]+"\t"+"N/A"+"\t"+shared_acc_counts[target]+"\t"+possible_hits[target]+"\t"+aai_est[target]+"\n")

    return output_file
|
2274
|
+
|
2275
|
+
def numpy_kaai_to_aai(kaai_array):
    """Turn an array of mean Jaccard (kAAI) values into printable AAI estimates.

    Applies, element-wise,
        aai_hat = (-0.3087057 + 1.810741 * exp(-(-0.2607023 * ln(kaai))**(1/3.435))) * 100
    and returns an array of strings: the estimate rounded to 2 decimals,
    "<30%" for estimates under 30 and for inputs <= 0, and ">90%" for
    estimates over 90. The input array is not modified.
    """
    # Work on a copy so the caller's Jaccard averages are left untouched.
    estimates = kaai_array.copy()

    positive = estimates > 0
    non_positive = estimates <= 0

    # Only take the log of strictly positive entries; log(0) would still
    # work but emits warnings.
    estimates[positive] = np.log(estimates[positive])

    # The formula above, one step at a time.
    estimates = estimates * -0.2607023
    estimates = estimates ** (1/3.435)
    estimates = -estimates
    estimates = np.exp(estimates)
    estimates = estimates * 1.810741
    estimates = estimates - 0.3087057
    estimates = estimates * 100

    # Out-of-range flags are taken before rounding.
    below_range = estimates < 30
    above_range = estimates > 90

    # Convert to the final printable strings.
    printable = np.round(estimates, 2).astype(str)
    printable[below_range] = "<30%"
    printable[above_range] = ">90%"
    # Zero inputs come out of the formula as large values, so fix them last.
    printable[non_positive] = "<30%"

    return printable
|
2314
|
+
|
2315
|
+
def curtime():
    """Return the current local time as a "DD/MM/YYYY HH:MM:SS" string."""
    return datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
|
2320
|
+
|
2321
|
+
#Manages the query process.
|
2322
|
+
def db_query_opts():
    """Build the argument parser for the database-vs-database query module.

    Returns the parser together with the namespace parsed from the command
    line; unrecognised arguments are ignored.
    """
    module_description = '''
This FastAAI module takes two FastAAI databases and searches all of the genomes in the QUERY against all of the genomes in the TARGET

If you have many genomes (more than 1000), it will be faster to create the query database using FastAAI build_db,
then search it against an existing target using this module than it is to do the same thing with an SQL query.

If you give the same database as query and target, a special all vs. all search of the genomes in the database will be done.
'''
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description=module_description,
    )

    # (flags, keyword settings) in the order they should appear in --help.
    option_table = [
        (('-q', '--query'), dict(dest='query', default=None, help='Path to the query database. The genomes FROM the query will be searched against the genomes in the target database')),
        (('-t', '--target'), dict(dest='target', default=None, help='Path to the target database.')),
        (('-o', '--output'), dict(dest='output', default="FastAAI", help='The directory where FastAAI will place the result of this query. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')),
        (('--threads',), dict(dest='threads', type=int, default=1, help='The number of processors to use. Default 1.')),
        (('--verbose',), dict(dest='verbose', action='store_true', help='Print minor updates to console. Major updates are printed regardless.')),
        (('--do_stdev',), dict(dest="do_stdev", action='store_true', help='Off by default. Calculate std. deviations on Jaccard indicies. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')),
        (('--unlimited_resources',), dict(dest="large_mem", action='store_true', help='Off by default. Use a faster algorithm that consumes more RAM. FastAAI cannot calculate std. deviations with this algorithm, so they will automatically be skipped.')),
        (('--mem',), dict(dest="precision", default="med", help='One of low/med/high. Medium by default. Save RAM in return for slightly rounded AAI estimates. Only affects FastAAI if you are also using the "--unlimited_resources" flag.')),
    ]
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    # Anything the parser does not recognise is discarded.
    parsed, _ignored = parser.parse_known_args()

    return parser, parsed
|
2347
|
+
|
2348
|
+
#Control the query process for any DB-first query.
|
2349
|
+
def db_query(query, target, verbose, output, threads, do_stdev, precision, memory_efficient):
    """Validate the inputs of a database-vs-database query and dispatch it.

    Parameters
    ----------
    query, target : str
        Paths to the query and target FastAAI databases. Both must exist and
        pass ``assess_db``; passing the same path requests an all vs. all run.
    verbose : bool
        Forwarded to the query routine.
    output : str
        Output directory, prepared through ``prepare_directories``.
    threads : int
        Forwarded to the query routine.
    do_stdev : bool
        When true the SQL routine is always used.
    precision : str
        One of "high", "med", "low"; any other value falls back to "med".
    memory_efficient : bool
        When true and ``do_stdev`` is false, ``do_query_vs_target_aai_only``
        is used; otherwise ``do_query_vs_target_sql`` is used.
        NOTE(review): the CLI flag that appears to feed this is
        ``--unlimited_resources`` (dest ``large_mem``), which reads like the
        opposite of this parameter's name - confirm against the caller.

    Exits the process through ``sys.exit()`` when any check fails.
    """
    print("")

    #Sanity checks.
    # Both paths are required. This must run before os.path.exists(), which
    # raises TypeError when handed None.
    if target is None or query is None:
        print("I require both a query and a target database. FastAAI exiting.")
        sys.exit()

    if not os.path.exists(target):
        print("Target database not found. Exiting FastAAI")
        sys.exit()

    if not os.path.exists(query):
        print("Query database not found. Exiting FastAAI")
        sys.exit()

    #status = "exists"
    query_ok = assess_db(query)
    target_ok = assess_db(target)

    if query_ok != "exists":
        print("Query database improperly formatted. Exiting FastAAI")
        sys.exit()

    if target_ok != "exists":
        print("Target database improperly formatted. Exiting FastAAI")
        sys.exit()

    #Check if the database is querying against itself.
    if query == target:
        print("Performing an all vs. all query on", query)
    else:
        print("Querying", query, "against", target)

    #Ready the output directories as needed.
    #The databases are already created, the only state they can be in in P+H
    good_to_go = prepare_directories(output, "protein and HMM", "query")
    if not good_to_go:
        print("Exiting FastAAI")
        sys.exit()

    if precision not in ["high", "med", "low"]:
        print("Selected memory usage setting not found. Defaulting to med. Select one with --mem high/med/low.")
        precision = 'med'

    #Default
    if (not memory_efficient) or do_stdev:
        do_query_vs_target_sql(query, target, threads, output, verbose, do_stdev)
    #Not default.
    else:
        do_query_vs_target_aai_only(query, target, threads, output, precision, verbose)

    print("")
|
2404
|
+
|
2405
|
+
|
2406
|
+
#Perform a minimal-memory query of a target database from input files. Lighter weight function for low memory
|
2407
|
+
def sql_query_opts():
    """Build the argument parser for the file-vs-database SQL query module.

    Returns the parser together with the namespace parsed from the command
    line; unrecognised arguments are ignored.
    """
    module_description = '''
This FastAAI module takes one or many genomes, proteins, or proteins and HMMs as a QUERY and searches them against an existing FastAAI database TARGET using SQL
If you only have a few genomes - or not enough RAM to hold the entire target database in memory - this is the probably the best option for you.

If you provide FastAAI with genomes or only proteins (not proteins and HMMs), this FastAAI module will produce the required protein and HMM files as needed
and place them in the output directory, just like it does while building a database.

Once these inputs are ready to be queried against the database (each has both a protein and HMM file), they will be processed independently, 1 per thread at a time.

Note: Protein and HMM files generated during this query can be supplied to build a FastAAI database from proteins and HMMs using the build_db module, without redoing preprocessing.
'''
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description=module_description,
    )

    # (flags, keyword settings) in the order they should appear in --help.
    option_table = [
        (('-g', '--genomes'), dict(dest='genomes', default=None, help='A directory containing genomes in FASTA format.')),
        (('-p', '--proteins'), dict(dest='proteins', default=None, help='A directory containing protein amino acids in FASTA format.')),
        (('-m', '--hmms'), dict(dest='hmms', default=None, help='A directory containing the results of an HMM search on a set of proteins.')),
        (('--target',), dict(dest='target', default=None, help='A path to the FastAAI database you wish to use as the target')),
        (('-o', '--output'), dict(dest='output', default="FastAAI", help='The directory where FastAAI will place the result of this query and any protein or HMM files it has to generate. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')),
        (('--genome_file',), dict(dest='gf', default=None, help='Alternative way to supply genomes. A file containing paths to your genome files, 1 per line.')),
        (('--protein_file',), dict(dest='pf', default=None, help='Alternative way to supply proteins. A file containing paths to your protein files, 1 per line.')),
        (('--hmm_file',), dict(dest='hf', default=None, help='Alternative way to supply HMMs. A file containing paths to your HMM files, 1 per line.')),
        (('--threads',), dict(dest='threads', type=int, default=1, help='The number of processors to use. Default 1.')),
        (('--verbose',), dict(dest='verbose', action='store_true', help='Print minor updates to console. Major updates are printed regardless.')),
        (('--do_stdev',), dict(dest="do_stdev", action='store_true', help='Off by default. Calculate std. deviations on Jaccard indicies. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')),
    ]
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    # Anything the parser does not recognise is discarded.
    parsed, _ignored = parser.parse_known_args()

    return parser, parsed
|
2441
|
+
|
2442
|
+
def sql_query_thread_starter(kmer_cts, protein_cts):
    """Pool initializer: publish the target counts as module-level globals.

    Stores ``kmer_cts`` as ``target_kmer_cts`` and ``protein_cts`` as
    ``target_protein_counts`` so worker functions can read them.
    """
    global target_kmer_cts, target_protein_counts
    target_kmer_cts, target_protein_counts = kmer_cts, protein_cts
|
2447
|
+
|
2448
|
+
|
2449
|
+
def sql_query(genomes, proteins, hmms, gf, pf, hf, db_name, output, threads, verbose, do_stdev):
    """Query genomes, proteins, or proteins + HMMs against an existing FastAAI database.

    Parameters
    ----------
    genomes, proteins, hmms : str or None
        Input directories, forwarded to ``check_out_input_files`` and
        ``advance_inputs``.
    gf, pf, hf : str or None
        Files listing input paths, forwarded the same way.
    db_name : str
        Path to the target database; the process exits if it is missing.
    output : str
        Output directory; per-query result files go to ``output + "/results"``.
    threads : int
        Number of worker processes.
    verbose : bool
        Print minor updates and a progress bar.
    do_stdev : bool
        Use ``do_sql_query`` when true, ``do_sql_query_no_SD`` otherwise.

    Returns
    -------
    None

    Exits through ``sys.exit()`` when the database, the inputs, or the output
    directories cannot be prepared.
    """
    def draw_progress(done, total, percentage, redraw):
        # Best-effort console progress bar: a failure to print is ignored.
        try:
            if redraw:
                # Move the cursor up one line to overwrite the previous bar.
                sys.stdout.write('\033[A')
            sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Query genome ' + str(done) + " of " + str(total) + ' done at '+curtime()+' )\n')
            sys.stdout.flush()
        except Exception:
            #It's not really a big deal if the progress bar cannot be printed.
            pass

    # os.path.exists(None) raises TypeError, so treat a missing path the same
    # way as a path that does not exist.
    if db_name is None or not os.path.exists(db_name):
        print("")
        print("FastAAI can't find your database:", db_name)
        print("Are you sure that the path you've given to the database is correct and that the database exists?")
        print("FastAAI exiting.")
        print("")
        sys.exit()

    start = check_out_input_files(genomes, proteins, hmms, gf, pf, hf)

    #If something failed, we stop.
    if start is None:
        sys.exit()

    good_to_go = prepare_directories(output, start, "query")

    if not good_to_go:
        print("Exiting FastAAI")
        sys.exit()

    print("")
    print("Preparing inputs for querying...")

    prepared_files = advance_inputs(genomes = genomes, proteins = proteins, hmms = hmms, genomes_file = gf, proteins_file = pf, hmms_file = hf, output = output, threads = threads, verbose = verbose, db_name = db_name)

    if prepared_files is None:
        return None

    #We don't want to get more than we have to.
    query_accessions_detected = set()
    for prepared in prepared_files:
        query_accessions_detected = query_accessions_detected.union(prepared.best_hits.values())
    query_accessions_detected = list(query_accessions_detected)

    if verbose:
        print("")
        print("Gathering database information...")

    database = fastaai_database(db_name)
    database.activate_connection()
    database.load_genome_index()
    database.load_accessions()
    database.close_connection()

    accession_index = generate_accessions_index()

    #Translate to indicies.
    query_accessions_detected = [accession_index[a] for a in query_accessions_detected]

    # Per-accession k-mer counts for every target genome, restricted to the
    # accessions that both the database and the queries contain.
    target_kmer_cts = {}
    for accession in np.intersect1d(database.accessions, query_accessions_detected):
        target_kmer_cts[accession] = np.zeros(len(database.genome_index), dtype = np.int16)
        for g in database.gak:
            if accession in database.gak[g]:
                target_kmer_cts[accession][g] = database.gak[g][accession]

    # Number of accessions recorded for each target genome.
    target_protein_counts = np.zeros(len(database.gak), dtype = np.int16)
    for g in database.gak:
        target_protein_counts[g] = len(database.gak[g])

    # Drop the table before the database object is sent to the workers.
    database.gak = None

    formatted_dataset = [(database, prepared.basename, prepared.best_hits_kmers, os.path.normpath(output+"/results")) for prepared in prepared_files]

    if verbose:
        print("")
        print("-"*100)
        print("")

    count = 0
    total = len(formatted_dataset)
    last_pct = 0

    print("Beginning AAI calculation")

    #globals to pass... target_kmer_cts target_protein_counts
    #Just remake these in the procs. kmer_index accession_index

    if verbose:
        print("")
        draw_progress(count, total, 0, redraw = False)

    # Both workers take the same arguments and write their own result file.
    worker = do_sql_query if do_stdev else do_sql_query_no_SD

    pool = multiprocessing.Pool(threads, initializer = sql_query_thread_starter, initargs = (target_kmer_cts, target_protein_counts,))
    try:
        #Process as we go.
        for _result_file in pool.imap(worker, formatted_dataset):
            if not verbose:
                continue
            count += 1
            percentage = (count/total)*100
            # Redraw only when the bar grows by a character, or at the end.
            if int(percentage/2) > last_pct or count == total:
                draw_progress(count, total, percentage, redraw = True)
                last_pct = int(percentage/2)
        pool.close()
    except BaseException:
        # Do not leave worker processes behind when a query fails.
        pool.terminate()
        raise
    finally:
        pool.join()

    if verbose:
        print("")
        print("-"*100)
        print("")

    if os.path.exists(output+"/temp"):
        os.rmdir(output+"/temp")

    print("FastAAI query complete! Results at:", os.path.normpath(output + "/results"))
    return None
|
2630
|
+
|
2631
|
+
|
2632
|
+
#Check to see if the file exists and is a valid fastAAI db
|
2633
|
+
def assess_db(path):
    """Report whether ``path`` holds a usable FastAAI database.

    If nothing exists at ``path``, a new parent database is initialised
    there as a side effect.

    Returns
    -------
    str
        "exists" - the file has more than two tables, including
        ``genome_index`` and ``genome_acc_kmer_counts``;
        "wrong format" - the file exists but fails that check or cannot be
        read;
        "created" - no file existed and a new database was initialised;
        "unable to create" - no file existed and initialising one failed.
    """
    if not os.path.exists(path):
        try:
            db = fastaai_database(path)
            db.activate_connection()
            db.initialize_parent_database()
            db.close_connection()
        except Exception:
            # Exception rather than a bare except, so Ctrl-C and sys.exit()
            # are not swallowed.
            return "unable to create"
        return "created"

    db = fastaai_database(path)
    try:
        db.activate_connection()
        sql = "SELECT name FROM sqlite_master WHERE type='table'"

        # Return bare table names instead of 1-tuples for this one query.
        db.cursor.row_factory = lambda cursor, row: row[0]
        tables = db.cursor.execute(sql).fetchall()
        db.cursor.row_factory = None

        db.close_connection()
    except Exception:
        return "wrong format"

    if len(tables) > 2 and "genome_index" in tables and "genome_acc_kmer_counts" in tables:
        return "exists"
    return "wrong format"
|
2666
|
+
|
2667
|
+
#Add one FastAAI DB to another FastAAI DB
|
2668
|
+
def merge_db_opts():
    """Build the argument parser for the database merge module.

    Returns the parser together with the namespace parsed from the command
    line; unrecognised arguments are ignored.
    """
    # The example must use "--recipient": argparse reads "-recipient" as the
    # short option "-r" with the attached value "ecipient".
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
            description='''
This FastAAI module allows you to add the contents of one or more FastAAI databases to another.
You must have at least two already-created FastAAI databases using the build_db module before this module can be used.

Supply a comma-separated list of at least one donor database and a single recipient database.
If the recipient already exists, then genomes in all the donors will be added to the recipient.
If the recipient does not already exist, a new database will be created, and the contents of all the donors will be added to it.

Example:
FastAAI.py merge_db --donors databases/db1.db,databases/db2.db --recipient databases/db3.db --threads 3
This command will create a new database called "db3.db", merge the data in db1.db and db2.db, and then add the merged data into db3.db

Only the recipient database will be modified; the donors will be left exactly as they were before running this module.
''')

    parser.add_argument('-d', '--donors', dest = 'donors', default = None, help = 'Comma-separated string of paths to one or more donor databases. The genomes FROM the donors will be added TO the recipient and the donors will be unaltered')

    parser.add_argument('-r', '--recipient', dest = 'recipient', default = None, help = 'Path to the recipient database. Any genomes FROM the donor database not already in the recipient will be added to this database.')

    parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

    parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')

    # Anything the parser does not recognise is discarded.
    args, unknown = parser.parse_known_args()

    return parser, args
|
2696
|
+
|
2697
|
+
def merge_db_thread_starter(rev_index, per_db_accs):
    """Pool initializer: publish the merge lookup tables as module-level globals.

    Stores ``rev_index`` as ``reverse_genome_indicies`` and ``per_db_accs``
    as ``accs_per_db`` so worker functions can read them.
    """
    global reverse_genome_indicies, accs_per_db
    reverse_genome_indicies, accs_per_db = rev_index, per_db_accs
|
2702
|
+
|
2703
|
+
|
2704
|
+
def merge_db(recipient, donors, verbose, threads):
    """Add the genomes of one or more donor databases to a recipient database.

    Parameters
    ----------
    recipient : str
        Path to the recipient database. ``assess_db`` creates it when it does
        not exist yet.
    donors : str
        Comma-separated paths to donor databases. Donors that are missing,
        malformed, repeated, or equal to the recipient are skipped with a
        message.
    verbose : bool
        Print progress bars for both phases.
    threads : int
        Number of worker processes used to format the data.

    Returns
    -------
    None

    Exits through ``sys.exit()`` when no donor is usable or the recipient can
    be neither found nor created. Uses a ``FastAAI_temp`` directory in the
    current working directory and removes it at the end.
    """
    def draw_progress(done, total, redraw):
        # Best-effort console progress bar: a failure to print is ignored,
        # including the division by zero when there is nothing to do.
        try:
            percentage = (done/total)*100
            if redraw:
                # Move the cursor up one line to overwrite the previous bar.
                sys.stdout.write('\033[A')
                sys.stdout.flush()
            sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(done) + " of " + str(total) + ' done at ' + curtime() + " )\n")
            sys.stdout.flush()
        except Exception:
            #It's not really a big deal if the progress bar cannot be printed.
            pass

    if donors is None or recipient is None:
        print("Either donor or target not given. FastAAI is exiting.")
        return None

    #Prettier on the CLI
    print("")

    donors = donors.split(",")
    valid_donors = []
    for d in donors:
        if not os.path.exists(d):
            print("Donor database", d, "not found! Are you sure the path is correct and this donor exists? This database will be skipped.")
            continue

        if d == recipient:
            print("Donor database", d, "is the same as the recipient. This database will be skipped.")
            continue

        check = assess_db(d)
        if check == "exists":
            if d not in valid_donors:
                valid_donors.append(d)
            else:
                print("It appears that database", d, "was already added to the list of donors. Did you type it twice in the list of donors? Skipping it.")
        elif check == "created":
            print("Donor database", d, "not found! Skipping.")
        else:
            print("Something was wrong with supplied database:", d+". A status check found:", check)

    if len(valid_donors) == 0:
        print("None of the supplied donor databases were able to be accessed. FastAAI cannot continue if none of these databases are valid. Exiting.")
        sys.exit()

    recip_check = assess_db(recipient)

    if recip_check == "created" or recip_check == "exists":
        for donor in valid_donors:
            print("Donor database:", donor, "will be added to recipient database:", recipient)

        recipient = fastaai_database(recipient)
    else:
        print("I couldn't find or create the recipient database at", recipient+".", "Does the folder you're trying to place this database in exist, and do you have permission to write files to it? FastAAI exiting.")
        sys.exit()

    donor_dbs = [fastaai_database(d) for d in valid_donors]

    all_accessions = set()
    joint_genome_index = {}
    joint_genome_counts = {}
    max_index = 0

    #The idea here is to create a set of arrays whose values span the range of each donor's genomes and translate those into an overall list, in order.
    reverse_genome_indices = {}
    accs_per_db = {}

    #Load recipient data, if any.
    if recip_check == "exists":
        recipient.activate_connection()
        recipient.just_accessions()
        recipient.load_genome_index()
        recipient.close_connection()

        all_accessions = all_accessions.union(recipient.accessions)
        accs_per_db[recipient.path] = recipient.accessions
        recipient.accessions = None
        max_index = len(recipient.genome_index)

        joint_genome_index = dict(zip(recipient.genome_index.keys(), recipient.genome_index.values()))
        joint_genome_counts = dict(zip(recipient.protein_counts_by_genome.keys(), recipient.protein_counts_by_genome.values()))

        #So... the keys are the genome indicies of the recip. These... shouldn't need any updates. Only the donors need to match.
        reverse_genome_indices[recipient.path] = np.array(sorted(recipient.genome_index.values()), dtype = np.int32)
        recipient.genome_index = None

    #Donors should always exist, never be created.
    for d in donor_dbs:
        d.activate_connection()
        d.just_accessions()
        d.load_genome_index()
        d.close_connection()
        accs_per_db[d.path] = d.accessions
        all_accessions = all_accessions.union(d.accessions)
        d.accessions = None

        donor_to_joint = []
        #Database construction indicates this should always be 0-COUNT
        # NOTE(review): genomes are visited in name order, not in order of
        # their donor index; if the workers look this array up by donor
        # index, that only works when the two orders agree - confirm.
        for g in sorted(d.genome_index.keys()):
            if g not in joint_genome_index:
                donor_to_joint.append(max_index)
                joint_genome_index[g] = max_index
                #Map the counts on.
                joint_genome_counts[max_index] = d.protein_counts_by_genome[d.genome_index[g]]
                max_index += 1
            else:
                donor_to_joint.append(joint_genome_index[g])
        #Make it an array, now
        reverse_genome_indices[d.path] = np.array(donor_to_joint, dtype = np.int32)
        d.genome_index = None

    all_accessions = list(all_accessions)

    acc_args = [(acc, donor_dbs, recipient) for acc in all_accessions]
    total_counts = len(acc_args)

    if not os.path.exists("FastAAI_temp"):
        os.mkdir("FastAAI_temp")

    print("")
    print("Formatting data to add to database. Started at", curtime())

    count = 0
    if verbose:
        print("")
        draw_progress(count, total_counts, redraw = False)

    pool = multiprocessing.Pool(threads, initializer=merge_db_thread_starter, initargs = (reverse_genome_indices, accs_per_db,))
    quiverfull = []
    try:
        for result in pool.imap_unordered(pull_and_merge_accession, acc_args):
            acc = result[0]
            child = result[1]

            quiverfull.append([acc, child])

            if verbose:
                count += 1
                draw_progress(count, total_counts, redraw = True)
        pool.close()
    except BaseException:
        # Do not leave worker processes behind when formatting fails.
        pool.terminate()
        raise
    finally:
        pool.join()

    print("")
    print("Adding data to final database. Started at", curtime())

    count = 0
    if verbose:
        print("")
        draw_progress(count, total_counts, redraw = False)

    recipient.activate_connection()
    try:
        genome_list_update_sql = "INSERT OR REPLACE INTO genome_index VALUES (?, ?, ?)"
        genome_reindex = [(g, joint_genome_index[g], joint_genome_counts[joint_genome_index[g]]) for g in joint_genome_index]

        recipient.cursor.executemany(genome_list_update_sql, genome_reindex)
        recipient.connection.commit()

        del genome_reindex

        for result in quiverfull:
            acc = result[0]
            child = result[1]

            recipient.add_child_to_parent(acc, child, genomes_too = True, update_gak = True)

            if verbose:
                count += 1
                draw_progress(count, total_counts, redraw = True)
    finally:
        # The connection used to be left open after the merge.
        recipient.close_connection()

    os.rmdir("FastAAI_temp")

    # We're only ever increasing the DB size, so there is no need to VACUUM it.
    print("\nDatabases merged!")

    return None
|
2939
|
+
|
2940
|
+
def pull_and_merge_accession(args):
    """Gather one accession's tables from every source database into a temporary database.

    args is read positionally: args[0] is the accession ID (a value of the mapping returned
    by generate_accessions_index()), args[1] is the collection of donor database handles and
    args[2] is the recipient database handle.

    Two tables are written to "FastAAI_temp/<accession>.db": a kmer -> genomes table, in which
    genome lists for the same kmer are unioned across sources, and a genome -> kmers table, in
    which the first record seen for a genome is kept.

    Returns a two-item list: [SQL-friendly accession name, path of the temporary database].
    """
    #Invert the accession index so an ID gives a table name; "." becomes "_" in table names.
    accession_index = generate_accessions_index()
    accession_inverter = {
        accession_index[raw_name]: raw_name.replace(".", "_")
        for raw_name in accession_index
    }

    #accs_per_db and reverse_genome_indicies are read as module-level globals here.
    #NOTE(review): presumably installed per worker by the pool initializer - confirm.
    acc, donor_dbs, recipient = args[0], args[1], args[2]

    acc_name = accession_inverter[acc]
    acc_name_gens = acc_name + "_genomes"

    temp_db = fastaai_database("FastAAI_temp/"+acc_name+".db")
    temp_db.activate_connection()

    #Create both destination tables, committing after each one.
    for table_definition in (
        "CREATE TABLE IF NOT EXISTS " + acc_name + " (kmer INTEGER PRIMARY KEY, genomes array)",
        "CREATE TABLE IF NOT EXISTS " + acc_name + "_genomes (genome INTEGER PRIMARY KEY, kmers array)",
    ):
        temp_db.cursor.execute(table_definition)
        temp_db.connection.commit()

    def databases_with_accession():
        #Donors first, in the order given, then the recipient.
        for donor in donor_dbs:
            if acc in accs_per_db[donor.path]:
                yield donor
        #Recipient is not guaranteed to be in the accs per db - if it was created anew, it wouldn't be.
        if recipient.path in accs_per_db:
            if acc in accs_per_db[recipient.path]:
                yield recipient

    #Pass 1: kmer -> genomes, with genome indices translated through reverse_genome_indicies.
    kmer_rows_sql = "SELECT * FROM " + acc_name
    genomes_by_kmer = {}
    for source in databases_with_accession():
        source.activate_connection()

        for row in source.cursor.execute(kmer_rows_sql).fetchall():
            kmer = row[0]
            merged_ids = reverse_genome_indicies[source.path][row[1]]
            if kmer in genomes_by_kmer:
                #Same kmer already seen in another database: keep the union of genome IDs.
                merged_ids = np.union1d(genomes_by_kmer[kmer], merged_ids)
            genomes_by_kmer[kmer] = merged_ids

        source.close_connection()

    #Byte-string the arrays in place before they are handed to SQLite.
    for kmer, genome_ids in genomes_by_kmer.items():
        genomes_by_kmer[kmer] = genome_ids.tobytes()

    temp_db.cursor.executemany("INSERT INTO " + acc_name + " VALUES (?,?)", genomes_by_kmer.items())
    temp_db.connection.commit()

    del genomes_by_kmer

    #Pass 2: genome -> kmers. Each genome gets added only once, no dupes.
    genome_rows_sql = "SELECT * FROM " + acc_name_gens
    kmers_by_genome = {}
    for source in databases_with_accession():
        source.activate_connection()

        for row in source.cursor.execute(genome_rows_sql).fetchall():
            merged_genome = int(reverse_genome_indicies[source.path][row[0]])
            #setdefault keeps the first kmer list recorded for this genome.
            kmers_by_genome.setdefault(merged_genome, row[1])

        source.close_connection()

    #Byte-string these as well.
    for merged_genome, kmers in kmers_by_genome.items():
        kmers_by_genome[merged_genome] = kmers.tobytes()

    temp_db.cursor.executemany("INSERT INTO " + acc_name_gens + " VALUES (?,?)", kmers_by_genome.items())
    temp_db.connection.commit()

    temp_db.close_connection()

    return [acc_name, temp_db.path]
|
3054
|
+
|
3055
|
+
#Query 1 genome vs. 1 target using Carlos' method - just needs query, target, threads
|
3056
|
+
def single_query_opts():
    """Build the argument parser for the single_query module and parse the command line.

    Unrecognized arguments are tolerated (parse_known_args) and discarded.

    Returns a (parser, args) pair: the ArgumentParser and the parsed namespace.
    """
    module_description = '''
This FastAAI module takes a single query genome, protein, or protein and HMM pair and a single target genome, protein, or protein and HMM pair as inputs and calculates AAI between the two.

If you supply a genome as either query or target, a protein and HMM file will be made for the genome.
If you supply a protein as either query or target, an HMM file will be made for it.
If you supply both an HMM and protein, the search will start right away. You cannot provide only an HMM.

No database will be built, and you cannot query multiple genomes with this module.

If you wish to query multiple genomes against themselves in all vs. all AAI search, use aai_index instead.
If you wish to query multiple genomes against multiple targets, use multi_query instead.
'''
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description=module_description,
    )

    #Optional single-file inputs; each one defaults to None and is stored under its long name.
    single_file_inputs = (
        ('-qg', '--query_genome', 'Query genome'),
        ('-tg', '--target_genome', 'Target genome'),
        ('-qp', '--query_protein', 'Query protein'),
        ('-tp', '--target_protein', 'Target protein'),
        ('-qh', '--query_hmm', 'Query HMM'),
        ('-th', '--target_hmm', 'Target HMM'),
    )
    for short_flag, long_flag, help_text in single_file_inputs:
        parser.add_argument(short_flag, long_flag, dest=long_flag[2:], default=None, help=help_text)

    parser.add_argument(
        '-o', '--output',
        dest='output',
        default="FastAAI",
        help='The directory where FastAAI will place the result of this query. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.',
    )

    parser.add_argument(
        '--threads',
        dest='threads',
        type=int,
        default=1,
        help='The number of processors to use. Default 1.',
    )
    parser.add_argument(
        '--verbose',
        dest='verbose',
        action='store_true',
        help='Print minor updates to console. Major updates are printed regardless.',
    )

    #Leftover arguments (such as the module name itself) are not needed.
    args, _ = parser.parse_known_args()

    return parser, args
|
3089
|
+
|
3090
|
+
def do_single_query(input_file):
    """Run preprocess() on one input object and return that same object.

    Used as the worker function for a multiprocessing pool in single_query, so the
    preprocessed object is handed back to the caller through the return value.
    """
    prepared = input_file
    prepared.preprocess()
    return prepared
|
3093
|
+
|
3094
|
+
def intersect_kmer_lists(pair):
    """Return the Jaccard index of the two kmer arrays in pair.

    pair[0] and pair[1] are numpy arrays. The union size is computed as
    len(a) + len(b) - |intersection|, so it is only exact when neither array
    contains repeated values.

    Returns intersection/union as a float. When both arrays are empty the union is
    zero and the index is undefined; 0.0 is returned instead of raising
    ZeroDivisionError.
    """
    first, second = pair[0], pair[1]

    shared = np.intersect1d(first, second).shape[0]
    combined = first.shape[0] + second.shape[0] - shared

    #Two empty kmer lists have nothing in common; avoid dividing by zero.
    if combined == 0:
        return 0.0

    return shared / combined
|
3098
|
+
|
3099
|
+
def kaai_to_aai(kaai):
    """Convert a kAAI value into an estimated AAI value using a fixed fitted curve.

    kaai is passed straight to numpy functions, so a scalar or a numpy array works.
    The result is scaled by 100.
    """
    #Inner term of the curve: (-0.2607023 * ln(kaai)) raised to the power 1/3.435.
    decay = (-0.2607023 * np.log(kaai)) ** (1/3.435)

    aai_hat = (-0.3087057 + 1.810741 * np.exp(-decay)) * 100

    return aai_hat
|
3104
|
+
|
3105
|
+
#This one's unique. It doesn't do anything with the DB, which means it doesn't access any other functionality outside of the input_file class. It just advances a pair of inputs in parallel and does intersections.
|
3106
|
+
def _load_single_query_input(role, input_args, output, verbose):
    """Validate one side's (genome, protein, HMM) arguments and wrap them in an input_file.

    role is "query" or "target" and is only used in the message printed when nothing
    was supplied. input_args is indexed positionally as (genome, protein, hmm).

    Exits the program (sys.exit) when nothing was supplied, when a genome and a protein
    were both supplied, or when an HMM was supplied without a genome or protein.
    Otherwise returns the input_file object.
    """
    genome, protein, hmm = input_args[0], input_args[1], input_args[2]

    if genome is None and protein is None and hmm is None:
        print("Please supply a " + role + " genome, protein, or protein and HMM pair.")
        sys.exit()

    loaded = None

    if genome is not None:
        loaded = input_file(genome, output, verbose)
        loaded.set_genome(genome)

    if protein is not None:
        if loaded is not None:
            print("If you supply a genome for either query or target, you must supply ONLY the genome, not a genome and either a protein or HMM.")
            sys.exit()
        loaded = input_file(protein, output, verbose)
        loaded.set_protein(protein)

    if hmm is not None:
        if loaded is None:
            print("If you supply an HMM for either query or target, you must also supply the protein from which the HMM was generated.")
            sys.exit()
        loaded.set_hmm(hmm)

    return loaded

def single_query(query_args, target_args, shared_args):
    """Estimate AAI between one query and one target without building a database.

    query_args and target_args are each indexed as (genome, protein, hmm); shared_args
    is indexed as (output directory, threads, verbose).

    Both inputs are preprocessed in parallel, the kmer lists of the accessions found in
    both are compared with intersect_kmer_lists, and the mean Jaccard index is converted
    with kaai_to_aai. One line (query name, target name, mean, std. deviation, number of
    shared accessions, AAI estimate) is written to <output>/results/<query>_vs_<target>.aai.txt.

    Returns None. Exits the program on invalid inputs or if the output directories
    cannot be prepared.
    """
    output, threads, verbose = shared_args[0], shared_args[1], shared_args[2]

    query = _load_single_query_input("query", query_args, output, verbose)
    target = _load_single_query_input("target", target_args, output, verbose)

    if query.basename == target.basename:
        print("You've selected the same query and target genome. The AAI is 100%.")
        print("FastAAI exiting.")
        return None

    #Directories are prepared for whichever input starts at the earlier stage.
    statuses = ["genome", "protein", "protein and hmm"]
    query_stat = statuses.index(query.status)
    target_stat = statuses.index(target.status)
    minimum_status = statuses[min(query_stat, target_stat)]

    start_printouts = ["[Genome] Protein Protein+HMM", " Genome [Protein] Protein+HMM", "Genome Protein [Protein+HMM]"]

    print("")
    print("Query start: ", start_printouts[query_stat])
    print("Target start:", start_printouts[target_stat])
    print("")

    good_to_go = prepare_directories(output, minimum_status, "build")

    if not good_to_go:
        print("Exiting FastAAI")
        sys.exit()

    qname = query.basename
    tname = target.basename

    name = qname + "_vs_" + tname + ".aai.txt"
    print("Output will be located at", os.path.normpath(output) + "/results/"+name)

    #Give the data for kmer indexing to the parallel processes
    global kmer_index
    kmer_index = create_kmer_index()

    #Only two inputs exist, so at most two workers are useful; never ask for fewer than one.
    with multiprocessing.Pool(max(1, min(threads, 2))) as pool:
        query, target = pool.map(do_single_query, [query, target])

    #Only accessions found in both inputs can be compared.
    accs_to_view = set(query.best_hits_kmers.keys()).intersection(set(target.best_hits_kmers.keys()))

    seq_pairs = [[query.best_hits_kmers[acc], target.best_hits_kmers[acc]] for acc in accs_to_view]

    if not seq_pairs:
        #A pool cannot be created with zero workers and a mean over nothing is meaningless.
        print("No proteins were shared between the query and target, so AAI cannot be estimated.")
        print("FastAAI exiting.")
        return None

    with multiprocessing.Pool(max(1, min(threads, len(seq_pairs)))) as pool:
        results = np.array(pool.map(intersect_kmer_lists, seq_pairs))

    jacc_mean = np.mean(results)
    jacc_std = np.std(results)
    actual_prots = len(results)
    aai_est = round(kaai_to_aai(jacc_mean), 2)

    #Estimates outside 30-90% are reported as bounds rather than as numbers.
    if aai_est > 90:
        aai_est = "> 90%"
    elif aai_est < 30:
        aai_est = "< 30%"

    #Write to the location announced above; make sure the results directory exists first.
    results_dir = os.path.normpath(output + "/results")
    os.makedirs(results_dir, exist_ok=True)

    with open(os.path.join(results_dir, name), "w") as results_file:
        print(qname, tname, round(jacc_mean, 4), round(jacc_std, 4), actual_prots, aai_est, file = results_file)

    print("FastAAI single query done! Estimated AAI:", aai_est)
|
3237
|
+
|
3238
|
+
def aai_index_opts():
    """Build the argument parser for the aai_index module and parse the command line.

    Unrecognized arguments are tolerated (parse_known_args) and discarded.

    Returns a (parser, args) pair: the ArgumentParser and the parsed namespace.
    """
    module_description = '''
This FastAAI module takes a set of genomes, proteins, or proteins and HMMs, creates a FastAAI database from them, and then executes an all vs. all AAI search of the genomes in the database
'''
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description=module_description,
    )

    #Directory-based inputs: (short flag, long flag, destination, help). All default to None.
    directory_inputs = (
        ('-g', '--genomes', 'genomes', 'A directory containing genomes in FASTA format.'),
        ('-p', '--proteins', 'proteins', 'A directory containing protein amino acids in FASTA format.'),
        ('-m', '--hmms', 'hmms', 'A directory containing the results of an HMM search on a set of proteins.'),
    )
    for short_flag, long_flag, destination, help_text in directory_inputs:
        parser.add_argument(short_flag, long_flag, dest=destination, default=None, help=help_text)

    parser.add_argument(
        '-d', '--database',
        dest='db_name',
        default="FastAAI_database.sqlite.db",
        help='The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory.',
    )

    parser.add_argument(
        '-o', '--output',
        dest='output',
        default="FastAAI",
        help='The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.',
    )

    #List-file inputs: (long flag, destination, help). All default to None.
    list_file_inputs = (
        ('--genome_file', 'gf', 'Alternative way to supply genomes. A file containing paths to your genome files, 1 per line.'),
        ('--protein_file', 'pf', 'Alternative way to supply proteins. A file containing paths to your protein files, 1 per line.'),
        ('--hmm_file', 'hf', 'Alternative way to supply HMMs. A file containing paths to your HMM files, 1 per line.'),
    )
    for long_flag, destination, help_text in list_file_inputs:
        parser.add_argument(long_flag, dest=destination, default=None, help=help_text)

    parser.add_argument(
        '--verbose',
        dest='verbose',
        action='store_true',
        help='Print minor updates to console. Major updates are printed regardless.',
    )
    parser.add_argument(
        '--threads',
        dest='threads',
        type=int,
        default=1,
        help='The number of processors to use. Default 1.',
    )

    parser.add_argument(
        '--do_stdev',
        dest="do_stdev",
        action='store_true',
        help='Off by default. Calculate std. deviations on Jaccard indicies. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.',
    )
    parser.add_argument(
        '--unlimited_resources',
        dest="large_mem",
        action='store_true',
        help='Off by default. Use a faster algorithm that consumes more RAM. FastAAI cannot calculate std. deviations with this algorithm, so they will automatically be skipped.',
    )
    parser.add_argument(
        '--mem',
        dest="precision",
        default="med",
        help='One of low/med/high. Medium by default. Save RAM in return for slightly rounded AAI estimates. Only affects FastAAI if you are also using the "--unlimited_resources" flag.',
    )

    #Leftover arguments (such as the module name itself) are not needed.
    args, _ = parser.parse_known_args()

    return parser, args
|
3267
|
+
|
3268
|
+
#Build a DB and query a dataset vs. self
|
3269
|
+
def aai_index(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose, do_stdev, memory_use, unlimited_resources):
    """Build a database from the supplied inputs, then query that database against itself.

    The first ten parameters are forwarded to build_db. If build_db reports success, the
    database at <output>/database/<db_name> is passed to db_query as both query and target,
    together with do_stdev, memory_use and unlimited_resources.

    Always returns None.
    """
    built = build_db(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose)

    if not built:
        print("Database could not be built. FastAAI exiting.")
        return None

    #The fresh database serves as both sides of the all vs. all search.
    database_path = os.path.normpath(output + "/database/" + db_name)
    db_query(database_path, database_path, verbose, output, threads, do_stdev, memory_use, unlimited_resources)

    return None
|
3279
|
+
|
3280
|
+
#Command-line options for multi_query: build 2 DBs, then query the query DB vs. the target DB
|
3281
|
+
def multi_query_opts():
    """Build the argument parser for the multi_query module and parse the command line.

    Unrecognized arguments are tolerated (parse_known_args) and discarded.

    Returns a (parser, args) pair: the ArgumentParser and the parsed namespace.
    """
    module_description = '''
This FastAAI module takes a set of query genomes/proteins/proteins+HMMs and a set of target genomes/proteins/proteins+HMMs.
Two FastAAI databases will be created, one for the query and one for the target, then the query database will have AAI calculated against the target database
'''
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description=module_description,
    )

    #Directory-based inputs: (short flag, long flag, help). Stored under the long name, default None.
    directory_inputs = (
        ('-qg', '--query_genomes', 'A directory containing query genomes in FASTA format.'),
        ('-qp', '--query_proteins', 'A directory containing query protein amino acids in FASTA format.'),
        ('-qm', '--query_hmms', 'A directory containing the results of an HMM search on the set of query proteins.'),
        ('-tg', '--target_genomes', 'A directory containing target genomes in FASTA format.'),
        ('-tp', '--target_proteins', 'A directory containing target protein amino acids in FASTA format.'),
        ('-tm', '--target_hmms', 'A directory containing the results of an HMM search on the set of target proteins.'),
    )
    for short_flag, long_flag, help_text in directory_inputs:
        parser.add_argument(short_flag, long_flag, dest=long_flag[2:], default=None, help=help_text)

    parser.add_argument(
        '-qd', '--query_database',
        dest='query_db_name',
        default="FastAAI_query_database.sqlite.db",
        help='The name of the query database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory.',
    )
    parser.add_argument(
        '-td', '--target_database',
        dest='target_db_name',
        default="FastAAI_target_database.sqlite.db",
        help='The name of the target database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory.',
    )

    parser.add_argument(
        '-o', '--output',
        dest='output',
        default="FastAAI",
        help='The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.',
    )

    #List-file inputs: (long flag, destination, help). All default to None.
    list_file_inputs = (
        ('--query_genome_file', 'qgf', 'Alternative way to supply genomes. A file containing paths to your query genome files, 1 per line.'),
        ('--query_protein_file', 'qpf', 'Alternative way to supply proteins. A file containing paths to your query protein files, 1 per line.'),
        ('--query_hmm_file', 'qhf', 'Alternative way to supply HMMs. A file containing paths to your query HMM files, 1 per line.'),
        ('--target_genome_file', 'tgf', 'Alternative way to supply genomes. A file containing paths to your target genome files, 1 per line.'),
        ('--target_protein_file', 'tpf', 'Alternative way to supply proteins. A file containing paths to your target protein files, 1 per line.'),
        ('--target_hmm_file', 'thf', 'Alternative way to supply HMMs. A file containing paths to your target HMM files, 1 per line.'),
    )
    for long_flag, destination, help_text in list_file_inputs:
        parser.add_argument(long_flag, dest=destination, default=None, help=help_text)

    parser.add_argument(
        '--threads',
        dest='threads',
        type=int,
        default=1,
        help='The number of processors to use. Default 1.',
    )
    parser.add_argument(
        '--verbose',
        dest='verbose',
        action='store_true',
        help='Print minor updates to console. Major updates are printed regardless.',
    )

    parser.add_argument(
        '--do_stdev',
        dest="do_stdev",
        action='store_true',
        help='Off by default. Calculate std. deviations on Jaccard indicies. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.',
    )
    parser.add_argument(
        '--unlimited_resources',
        dest="large_mem",
        action='store_true',
        help='Off by default. Use a faster algorithm that consumes more RAM. FastAAI cannot calculate std. deviations with this algorithm, so they will automatically be skipped.',
    )
    parser.add_argument(
        '--mem',
        dest="precision",
        default="med",
        help='One of low/med/high. Medium by default. Save RAM in return for slightly rounded AAI estimates. Only affects FastAAI if you are also using the "--unlimited_resources" flag.',
    )

    #Leftover arguments (such as the module name itself) are not needed.
    args, _ = parser.parse_known_args()

    return parser, args
|
3320
|
+
|
3321
|
+
#Build 2 DBs and query query DB vs target DB
|
3322
|
+
def multi_query(query_arg_list, target_arg_list, shared_args):
    """Build a query database and a target database, then query one against the other.

    query_arg_list and target_arg_list are each indexed as
    (genomes, proteins, hmms, gf, pf, hf, db_name); shared_args is indexed as
    (output, threads, verbose, do_stdev, mem, efficient).

    Each database is built with build_db at <output>/database/<db_name>. As in aai_index,
    a falsy result from build_db is treated as a failed build: a message is printed and
    the function stops before db_query is called.

    Always returns None.
    """
    output, threads, verbose, do_stdev, mem, efficient = shared_args[0], shared_args[1], shared_args[2], shared_args[3], shared_args[4], shared_args[5]

    #Query database first, then target; stop at the first build that fails.
    database_paths = []
    for role, arg_list in (("Query", query_arg_list), ("Target", target_arg_list)):
        genomes, proteins, hmms = arg_list[0], arg_list[1], arg_list[2]
        gf, pf, hf = arg_list[3], arg_list[4], arg_list[5]
        db_name = arg_list[6]

        database_paths.append(os.path.normpath(output + "/database/" + db_name))

        if not build_db(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose):
            print(role + " database could not be built. FastAAI exiting.")
            return None

    db_query(database_paths[0], database_paths[1], verbose, output, threads, do_stdev, mem, efficient)

    return None
|
3335
|
+
|
3336
|
+
'''
|
3337
|
+
Main
|
3338
|
+
'''
|
3339
|
+
def main():
|
3340
|
+
#The currently supported modules.
|
3341
|
+
modules = ["build_db", "merge_db", "simple_query", "db_query", "single_query", "aai_index", "multi_query"]
|
3342
|
+
|
3343
|
+
#Print modules if someone just types FastAAI
|
3344
|
+
if len(sys.argv) < 2:
|
3345
|
+
print("")
|
3346
|
+
print(" Welcome to FastAAI")
|
3347
|
+
print("")
|
3348
|
+
print("")
|
3349
|
+
print(" Please select one of the following modules:")
|
3350
|
+
print("")
|
3351
|
+
print("------------------------------------------- Quick Usage Options -------------------------------------------")
|
3352
|
+
print("")
|
3353
|
+
print(" single_query |" + " Quickly query ONE query genome against ONE target genome")
|
3354
|
+
print(" multi_query |" + " Create a query DB and a target DB, then calculate query vs. target AAI")
|
3355
|
+
print(" aai_index |" + " Create a database from multiple genomes and do an all vs. all AAI index of the genomes")
|
3356
|
+
print("")
|
3357
|
+
print("-------------------------------------- Database Construction Options --------------------------------------")
|
3358
|
+
print("")
|
3359
|
+
print(" build_db |" + " Create or add to a FastAAI database from genomes, proteins, or proteins and HMMs")
|
3360
|
+
print(" merge_db |" + " Add the contents of one FastAAI DB to another")
|
3361
|
+
print("")
|
3362
|
+
print("---------------------------------------------- Query Options ----------------------------------------------")
|
3363
|
+
print("")
|
3364
|
+
print(" simple_query |" + " Query a genome or protein (one or many) against an existing FastAAI database")
|
3365
|
+
print(" db_query |" + " Query the genomes in one FastAAI database against the genomes in another FastAAI database")
|
3366
|
+
print("")
|
3367
|
+
print("-----------------------------------------------------------------------------------------------------------")
|
3368
|
+
print("")
|
3369
|
+
print(" To select a module, enter 'FastAAI [module]' into the command line!")
|
3370
|
+
print("")
|
3371
|
+
sys.exit()
|
3372
|
+
|
3373
|
+
#This is the module selection
|
3374
|
+
selection = sys.argv[1]
|
3375
|
+
|
3376
|
+
if selection not in modules:
|
3377
|
+
print("")
|
3378
|
+
print(" I couldn't find the module you specified. Please select one of the following modules:")
|
3379
|
+
print("")
|
3380
|
+
print("------------------------------------------- Quick Usage Options -------------------------------------------")
|
3381
|
+
print("")
|
3382
|
+
print(" single_query |" + " Quickly query ONE query genome against ONE target genome")
|
3383
|
+
print(" multi_query |" + " Create a query DB and a target DB, then calculate query vs. target AAI")
|
3384
|
+
print(" aai_index |" + " Create a database from multiple genomes and do an all vs. all AAI index of the genomes")
|
3385
|
+
print("")
|
3386
|
+
print("-------------------------------------- Database Construction Options --------------------------------------")
|
3387
|
+
print("")
|
3388
|
+
print(" build_db |" + " Create or add to a FastAAI database from genomes, proteins, or proteins and HMMs")
|
3389
|
+
print(" merge_db |" + " Add the contents of one FastAAI DB to another")
|
3390
|
+
print("")
|
3391
|
+
print("---------------------------------------------- Query Options ----------------------------------------------")
|
3392
|
+
print("")
|
3393
|
+
print(" simple_query |" + " Query a genome or protein (one or many) against an existing FastAAI database")
|
3394
|
+
print(" db_query |" + " Query the genomes in one FastAAI database against the genomes in another FastAAI database")
|
3395
|
+
print("")
|
3396
|
+
print("-----------------------------------------------------------------------------------------------------------")
|
3397
|
+
print("")
|
3398
|
+
print(" To select a module, enter 'FastAAI [module]' into the command line!")
|
3399
|
+
print("")
|
3400
|
+
sys.exit()
|
3401
|
+
|
3402
|
+
#################### Database build or add ########################
|
3403
|
+
|
3404
|
+
if selection == "build_db":
|
3405
|
+
parser, opts = build_db_opts()
|
3406
|
+
|
3407
|
+
#module name only
|
3408
|
+
if len(sys.argv) < 3:
|
3409
|
+
print(parser.print_help())
|
3410
|
+
sys.exit()
|
3411
|
+
|
3412
|
+
#Directory based
|
3413
|
+
genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms
|
3414
|
+
|
3415
|
+
#Input list based
|
3416
|
+
gf, pf, hf = opts.gf, opts.pf, opts.hf
|
3417
|
+
|
3418
|
+
output = os.path.normpath(opts.output)
|
3419
|
+
|
3420
|
+
threads = opts.threads
|
3421
|
+
verbose = opts.verbose
|
3422
|
+
|
3423
|
+
#Database handle
|
3424
|
+
db_name = opts.db_name
|
3425
|
+
|
3426
|
+
|
3427
|
+
#genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose
|
3428
|
+
build_db(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose)
|
3429
|
+
|
3430
|
+
#################### Add two DBs ########################
|
3431
|
+
|
3432
|
+
if selection == "merge_db":
|
3433
|
+
parser, opts = merge_db_opts()
|
3434
|
+
if len(sys.argv) < 3:
|
3435
|
+
print(parser.print_help())
|
3436
|
+
sys.exit()
|
3437
|
+
|
3438
|
+
recipient = opts.recipient
|
3439
|
+
donors = opts.donors
|
3440
|
+
verbose = opts.verbose
|
3441
|
+
threads = opts.threads
|
3442
|
+
|
3443
|
+
merge_db(recipient, donors, verbose, threads)
|
3444
|
+
|
3445
|
+
#################### Query files vs DB ########################
|
3446
|
+
|
3447
|
+
if selection == "simple_query":
|
3448
|
+
parser, opts = sql_query_opts()
|
3449
|
+
|
3450
|
+
if len(sys.argv) < 3:
|
3451
|
+
print(parser.print_help())
|
3452
|
+
sys.exit()
|
3453
|
+
|
3454
|
+
#directory based
|
3455
|
+
genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms
|
3456
|
+
|
3457
|
+
#Input list based
|
3458
|
+
gf, pf, hf = opts.gf, opts.pf, opts.hf
|
3459
|
+
|
3460
|
+
db_name = opts.target
|
3461
|
+
|
3462
|
+
output = opts.output
|
3463
|
+
threads = opts.threads
|
3464
|
+
verbose = opts.verbose
|
3465
|
+
|
3466
|
+
do_stdev = opts.do_stdev
|
3467
|
+
|
3468
|
+
sql_query(genomes, proteins, hmms, gf, pf, hf, db_name, output, threads, verbose, do_stdev)
|
3469
|
+
|
3470
|
+
|
3471
|
+
#################### Query DB vs DB ###########################
|
3472
|
+
if selection == "db_query":
    # Dispatch for the "db_query" module: compare a query database against
    # a target database via db_query().
    parser, opts = db_query_opts()

    # Fewer than three argv entries means only the module name was given,
    # so show the usage text and stop.
    if len(sys.argv) < 3:
        # print_help() writes the help text itself and returns None; wrapping
        # it in print() would emit a stray "None" line after the help.
        parser.print_help()
        sys.exit()

    query = opts.query
    target = opts.target
    verbose = opts.verbose

    do_stdev = opts.do_stdev
    #massive = opts.massive

    # The local names differ from the option names: "precision" is passed
    # on as mem and "large_mem" as efficient.
    mem = opts.precision
    efficient = opts.large_mem

    output = opts.output
    threads = opts.threads

    db_query(query, target, verbose, output, threads, do_stdev, mem, efficient)
|
3494
|
+
|
3495
|
+
#################### One-pass functions #######################
|
3496
|
+
if selection == "single_query":
    # Dispatch for the "single_query" module: bundle the query, target and
    # shared options into three lists and pass them to single_query().
    parser, opts = single_query_opts()

    # Fewer than three argv entries means only the module name was given,
    # so show the usage text and stop.
    if len(sys.argv) < 3:
        # print_help() writes the help text itself and returns None; wrapping
        # it in print() would emit a stray "None" line after the help.
        parser.print_help()
        sys.exit()

    output = os.path.normpath(opts.output)
    threads = opts.threads
    verbose = opts.verbose

    # Element order matters: single_query() receives these as positional lists.
    shared_opts = [output, threads, verbose]

    query_genome = opts.query_genome
    query_protein = opts.query_protein
    query_hmm = opts.query_hmm

    query_opts = [query_genome, query_protein, query_hmm]

    target_genome = opts.target_genome
    target_protein = opts.target_protein
    target_hmm = opts.target_hmm

    # NOTE: the list-file variants (opts.target_genome_file,
    # opts.target_protein_file, opts.target_hmm_file) were commented out in
    # the original code and are intentionally not read here.

    target_opts = [target_genome, target_protein, target_hmm]

    single_query(query_opts, target_opts, shared_opts)
|
3540
|
+
|
3541
|
+
if selection == "aai_index":
    # Dispatch for the "aai_index" module: collect the input, database and
    # output options and pass them to aai_index().
    parser, opts = aai_index_opts()

    # Fewer than three argv entries means only the module name was given,
    # so show the usage text and stop.
    if len(sys.argv) < 3:
        # print_help() writes the help text itself and returns None; wrapping
        # it in print() would emit a stray "None" line after the help.
        parser.print_help()
        sys.exit()

    genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms
    # Text file versions of genomes/proteins/hmms.
    gf, pf, hf = opts.gf, opts.pf, opts.hf

    db_name = opts.db_name

    output = opts.output
    threads = opts.threads
    verbose = opts.verbose

    do_stdev = opts.do_stdev
    #massive = opts.massive

    # The local names differ from the option names: "precision" is passed
    # on as mem and "large_mem" as efficient.
    mem = opts.precision
    efficient = opts.large_mem

    aai_index(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose, do_stdev, mem, efficient)
|
3567
|
+
|
3568
|
+
if selection == "multi_query":
    # Dispatch for the "multi_query" module: bundle the query-side,
    # target-side and shared options into three positional lists and pass
    # them to multi_query().
    parser, opts = multi_query_opts()

    # Fewer than three argv entries means only the module name was given,
    # so show the usage text and stop.
    if len(sys.argv) < 3:
        # print_help() writes the help text itself and returns None; wrapping
        # it in print() would emit a stray "None" line after the help.
        parser.print_help()
        sys.exit()

    output = os.path.normpath(opts.output)
    threads = opts.threads
    verbose = opts.verbose

    do_stdev = opts.do_stdev
    #massive = opts.massive

    # The local names differ from the option names: "precision" is passed
    # on as mem and "large_mem" as efficient.
    mem = opts.precision
    efficient = opts.large_mem

    # Element order matters: multi_query() receives these as positional lists.
    shared_arg_list = [output, threads, verbose, do_stdev, mem, efficient]

    # Query side: directory inputs, then their text-file versions, then the
    # query database name.
    genomes, proteins, hmms = opts.query_genomes, opts.query_proteins, opts.query_hmms
    gf, pf, hf = opts.qgf, opts.qpf, opts.qhf
    query_db_name = opts.query_db_name

    query_arg_list = [genomes, proteins, hmms, gf, pf, hf, query_db_name]

    # Target side: the same local names are deliberately rebound, exactly as
    # in the original code, after the query list has captured its values.
    genomes, proteins, hmms = opts.target_genomes, opts.target_proteins, opts.target_hmms
    gf, pf, hf = opts.tgf, opts.tpf, opts.thf
    target_db_name = opts.target_db_name

    target_arg_list = [genomes, proteins, hmms, gf, pf, hf, target_db_name]

    multi_query(query_arg_list, target_arg_list, shared_arg_list)
|
3623
|
+
|
3624
|
+
return None
|
3625
|
+
|
3626
|
+
|
3627
|
+
# Script entry point: main() is called only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
|
3628
|
+
main()
|
3629
|
+
|
3630
|
+
|