miga-base 1.2.18.2 → 1.3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/doctor/base.rb +2 -1
- data/lib/miga/cli/action/init.rb +1 -1
- data/lib/miga/dataset/result/add.rb +3 -2
- data/lib/miga/version.rb +2 -2
- data/scripts/essential_genes.bash +4 -8
- data/utils/FastAAI/LICENSE +8 -0
- data/utils/FastAAI/README.md +151 -40
- data/utils/FastAAI/__init__.py +1 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz +0 -0
- data/utils/FastAAI/fastaai/__init__.py +1 -0
- data/utils/FastAAI/fastaai/fastaai +4805 -0
- data/utils/FastAAI/fastaai/fastaai.py +4805 -0
- data/utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py +297 -0
- data/utils/FastAAI/fastaai/fastaai_miga_preproc.py +931 -0
- data/utils/FastAAI/metadata/Accession_names_and_IDs.txt +122 -0
- data/utils/distance/commands.rb +51 -23
- metadata +23 -6
- data/utils/FastAAI/FastAAI +0 -3659
- /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Archaea_SCG.hmm +0 -0
- /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Bacteria_SCG.hmm +0 -0
- /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -0
data/utils/FastAAI/FastAAI
DELETED
@@ -1,3659 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
|
3
|
-
################################################################################
|
4
|
-
"""---0.0 Import Modules---"""
|
5
|
-
import subprocess
|
6
|
-
import argparse
|
7
|
-
import datetime
|
8
|
-
import shutil
|
9
|
-
import textwrap
|
10
|
-
import multiprocessing
|
11
|
-
import pickle
|
12
|
-
import gzip
|
13
|
-
import tempfile
|
14
|
-
#Shouldn't play any role.
|
15
|
-
#from random import randint
|
16
|
-
|
17
|
-
#We could probably remove Path, too.
|
18
|
-
from pathlib import Path
|
19
|
-
#This as well
|
20
|
-
from functools import partial
|
21
|
-
import time
|
22
|
-
from collections import defaultdict
|
23
|
-
import sys
|
24
|
-
import os
|
25
|
-
from math import floor
|
26
|
-
import sqlite3
|
27
|
-
#numpy dependency
|
28
|
-
import numpy as np
|
29
|
-
import io
|
30
|
-
import random
|
31
|
-
|
32
|
-
|
33
|
-
#Takes a bytestring from the SQL database and converts it to a numpy array.
|
34
|
-
def convert_array(bytestring):
|
35
|
-
return np.frombuffer(bytestring, dtype = np.int32)
|
36
|
-
|
37
|
-
def convert_float_array_16(bytestring):
|
38
|
-
return np.frombuffer(bytestring, dtype = np.float16)
|
39
|
-
|
40
|
-
def convert_float_array_32(bytestring):
|
41
|
-
return np.frombuffer(bytestring, dtype = np.float32)
|
42
|
-
|
43
|
-
def convert_float_array_64(bytestring):
|
44
|
-
return np.frombuffer(bytestring, dtype = np.float64)
|
45
|
-
|
46
|
-
|
47
|
-
#Iterator for agnostic reader
|
48
|
-
class agnostic_reader_iterator:
|
49
|
-
def __init__(self, reader):
|
50
|
-
self.handle_ = reader.handle
|
51
|
-
self.is_gz_ = reader.is_gz
|
52
|
-
|
53
|
-
def __next__(self):
|
54
|
-
if self.is_gz_:
|
55
|
-
line = self.handle_.readline().decode()
|
56
|
-
else:
|
57
|
-
line = self.handle_.readline()
|
58
|
-
|
59
|
-
#Ezpz EOF check
|
60
|
-
if line:
|
61
|
-
return line
|
62
|
-
else:
|
63
|
-
raise StopIteration
|
64
|
-
|
65
|
-
#File reader that doesn't care if you give it a gzipped file or not.
|
66
|
-
class agnostic_reader:
|
67
|
-
def __init__(self, file):
|
68
|
-
self.path = file
|
69
|
-
|
70
|
-
with open(file, 'rb') as test_gz:
|
71
|
-
#Gzip magic number
|
72
|
-
is_gz = (test_gz.read(2) == b'\x1f\x8b')
|
73
|
-
|
74
|
-
self.is_gz = is_gz
|
75
|
-
|
76
|
-
if is_gz:
|
77
|
-
self.handle = gzip.open(self.path)
|
78
|
-
else:
|
79
|
-
self.handle = open(self.path)
|
80
|
-
|
81
|
-
def __iter__(self):
|
82
|
-
return agnostic_reader_iterator(self)
|
83
|
-
|
84
|
-
def close(self):
|
85
|
-
self.handle.close()
|
86
|
-
|
87
|
-
#FastAAI database class. This is the final database
|
88
|
-
class fastaai_database:
|
89
|
-
def __init__(self, path):
|
90
|
-
#open SQL db and load in
|
91
|
-
|
92
|
-
self.path = path
|
93
|
-
self.exists = os.path.exists(path)
|
94
|
-
|
95
|
-
self.child = None
|
96
|
-
self.connection = None
|
97
|
-
self.cursor = None
|
98
|
-
|
99
|
-
self.child_connection = None
|
100
|
-
self.child_cursor = None
|
101
|
-
|
102
|
-
self.accessions = None
|
103
|
-
#self.genomes = None
|
104
|
-
|
105
|
-
#gak stands for 'genome_accession_kmer_counts'
|
106
|
-
self.gak = None
|
107
|
-
self.genome_index = None
|
108
|
-
#Go from index to name
|
109
|
-
self.reverse_genome_index = None
|
110
|
-
self.protein_counts_by_genome = None
|
111
|
-
|
112
|
-
#self.accession_set = None
|
113
|
-
|
114
|
-
self.verbosity = False
|
115
|
-
|
116
|
-
#Open an SQL connection
|
117
|
-
def activate_connection(self, with_converter = True):
|
118
|
-
# Converts np.array to TEXT when inserting
|
119
|
-
##sqlite3.register_adapter(np.ndarray, adapt_array)
|
120
|
-
|
121
|
-
#Converts byte string to numpy ndarray(int32) upon read from DB.
|
122
|
-
if with_converter:
|
123
|
-
sqlite3.register_converter("array", convert_array)
|
124
|
-
self.connection = sqlite3.connect(self.path, detect_types=sqlite3.PARSE_DECLTYPES)
|
125
|
-
|
126
|
-
else:
|
127
|
-
#sqlite3.register_converter("array", convert_array)
|
128
|
-
self.connection = sqlite3.connect(self.path)
|
129
|
-
|
130
|
-
self.cursor = self.connection.cursor()
|
131
|
-
self.exists = True
|
132
|
-
|
133
|
-
#Close an SQL connection
|
134
|
-
def close_connection(self):
|
135
|
-
self.cursor.close()
|
136
|
-
self.connection.close()
|
137
|
-
#True cleanup - even a closed SQL connection obj cannot be passed to multiple processors, but a nonetype can.
|
138
|
-
self.cursor = None
|
139
|
-
self.connection = None
|
140
|
-
|
141
|
-
def initialize_parent_database(self):
|
142
|
-
if not self.exists:
|
143
|
-
print("I need to be activated first!")
|
144
|
-
else:
|
145
|
-
#DB exists. Add metadata tables if needed.
|
146
|
-
self.cursor.execute(''' SELECT count(name) FROM sqlite_master WHERE type='table' AND name='genome_index' ''')
|
147
|
-
if self.cursor.fetchone()[0]!=1 :
|
148
|
-
self.cursor.execute('''CREATE TABLE genome_index
|
149
|
-
(genome text, gen_id INTEGER PRIMARY KEY, protein_count INTEGER)''')
|
150
|
-
self.connection.commit()
|
151
|
-
|
152
|
-
self.cursor.execute(''' SELECT count(name) FROM sqlite_master WHERE type='table' AND name='genome_acc_kmer_counts' ''')
|
153
|
-
if self.cursor.fetchone()[0]!=1 :
|
154
|
-
self.cursor.execute('''CREATE TABLE genome_acc_kmer_counts
|
155
|
-
(genome INTEGER, accession INTEGER, count INTEGER)''')
|
156
|
-
self.connection.commit()
|
157
|
-
|
158
|
-
#Access an existing master database
|
159
|
-
def activate_child_connection(self, child):
|
160
|
-
#Don't try to connect unless it exists. This should never fail.
|
161
|
-
if os.path.exists(child):
|
162
|
-
self.child = child
|
163
|
-
self.child_connection = sqlite3.connect(self.child, detect_types=sqlite3.PARSE_DECLTYPES)
|
164
|
-
self.child_cursor = self.child_connection.cursor()
|
165
|
-
else:
|
166
|
-
print("Child database:", child, "not found!")
|
167
|
-
|
168
|
-
#Close access to master DB
|
169
|
-
def close_child_connection(self):
|
170
|
-
if self.child_cursor is not None:
|
171
|
-
self.child_cursor.close()
|
172
|
-
self.child_connection.close()
|
173
|
-
self.child_cursor = None
|
174
|
-
self.child_connection = None
|
175
|
-
self.child = None
|
176
|
-
|
177
|
-
def add_child_to_parent(self, acc, child_db, remove = True, selected_kmers = None, genomes_too = False, just_genomes = False, update_gak = False):
|
178
|
-
accession_index = generate_accessions_index()
|
179
|
-
|
180
|
-
create_command = "CREATE TABLE IF NOT EXISTS " + acc + " (kmer INTEGER PRIMARY KEY, genomes array)"
|
181
|
-
|
182
|
-
if not just_genomes:
|
183
|
-
self.cursor.execute(create_command)
|
184
|
-
self.connection.commit()
|
185
|
-
|
186
|
-
if genomes_too or just_genomes:
|
187
|
-
create_command = "CREATE TABLE IF NOT EXISTS " + acc + "_genomes (genome INTEGER PRIMARY KEY, kmers array)"
|
188
|
-
self.cursor.execute(create_command)
|
189
|
-
self.connection.commit()
|
190
|
-
|
191
|
-
attach = "attach '"+child_db+"' as toMerge"
|
192
|
-
|
193
|
-
if selected_kmers is not None:
|
194
|
-
add = "INSERT OR REPLACE INTO " + acc + " SELECT * FROM toMerge." + acc + " WHERE kmer in ({kmers})".format(kmers = ','.join(['?']*len(selected_kmers)))
|
195
|
-
else:
|
196
|
-
add = "INSERT OR REPLACE INTO " + acc + " SELECT * FROM toMerge." + acc
|
197
|
-
|
198
|
-
if genomes_too or just_genomes:
|
199
|
-
add_genomes = "INSERT OR REPLACE INTO " + acc + "_genomes" + " SELECT * FROM toMerge." + acc+"_genomes"
|
200
|
-
if update_gak:
|
201
|
-
sql_acc_num = acc.replace("_", ".")
|
202
|
-
sql_acc_num = accession_index[sql_acc_num]
|
203
|
-
#Return num bytes, which is always 4*as many as there are entries, as the dtype is int32. See unique_kmers.
|
204
|
-
gak_sql = 'INSERT OR REPLACE INTO genome_acc_kmer_counts SELECT genome, ' + str(sql_acc_num) + ', length(kmers)/4 FROM toMerge.' + acc + '_genomes'
|
205
|
-
|
206
|
-
detach = "detach toMerge"
|
207
|
-
|
208
|
-
self.cursor.execute(attach)
|
209
|
-
self.connection.commit()
|
210
|
-
|
211
|
-
if not just_genomes:
|
212
|
-
if selected_kmers is not None:
|
213
|
-
self.cursor.execute(add, selected_kmers)
|
214
|
-
else:
|
215
|
-
self.cursor.execute(add)
|
216
|
-
|
217
|
-
self.connection.commit()
|
218
|
-
|
219
|
-
if genomes_too or just_genomes:
|
220
|
-
self.cursor.execute(add_genomes)
|
221
|
-
self.connection.commit()
|
222
|
-
if update_gak:
|
223
|
-
self.cursor.execute(gak_sql)
|
224
|
-
self.connection.commit()
|
225
|
-
|
226
|
-
self.cursor.execute(detach)
|
227
|
-
self.connection.commit()
|
228
|
-
|
229
|
-
if remove:
|
230
|
-
os.remove(child_db)
|
231
|
-
|
232
|
-
def add_genomes_first(self, accession, kmer_dict):
|
233
|
-
kmer_lists = []
|
234
|
-
for genome in kmer_dict:
|
235
|
-
kmer_lists.append((genome, kmer_dict[genome].tobytes()))
|
236
|
-
|
237
|
-
|
238
|
-
sql_friendly_accession = accession.replace(".", "_")
|
239
|
-
|
240
|
-
#self.cursor.execute(" DROP TABLE IF EXISTS " + sql_friendly_accession + "_genomes")
|
241
|
-
|
242
|
-
self.cursor.execute("CREATE TABLE IF NOT EXISTS " + sql_friendly_accession + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")
|
243
|
-
self.connection.commit()
|
244
|
-
|
245
|
-
self.cursor.executemany("INSERT OR REPLACE INTO " + sql_friendly_accession + "_genomes VALUES (?, ?) ", kmer_lists)
|
246
|
-
|
247
|
-
self.connection.commit()
|
248
|
-
|
249
|
-
return sql_friendly_accession
|
250
|
-
|
251
|
-
|
252
|
-
def load_genome_index(self):
|
253
|
-
self.genome_index = {}
|
254
|
-
self.reverse_genome_index = {}
|
255
|
-
self.protein_counts_by_genome = {}
|
256
|
-
|
257
|
-
sql_command = ("SELECT genome, gen_id, protein_count FROM genome_index")
|
258
|
-
|
259
|
-
#Break resist.
|
260
|
-
gen = None
|
261
|
-
id = None
|
262
|
-
protein_count = None
|
263
|
-
|
264
|
-
for result in self.cursor.execute(sql_command).fetchall():
|
265
|
-
gen = result[0]
|
266
|
-
id = result[1]
|
267
|
-
protein_count = result[2]
|
268
|
-
|
269
|
-
self.genome_index[gen] = id
|
270
|
-
self.reverse_genome_index[id] = gen
|
271
|
-
self.protein_counts_by_genome[id] = protein_count
|
272
|
-
|
273
|
-
del gen
|
274
|
-
del id
|
275
|
-
del protein_count
|
276
|
-
|
277
|
-
def load_accessions(self, permitted_genomes = None, permitted_accessions = None):
|
278
|
-
#self.protein_counts_by_genome = None
|
279
|
-
|
280
|
-
self.gak = defaultdict(lambda: defaultdict())
|
281
|
-
self.accessions = set()
|
282
|
-
|
283
|
-
|
284
|
-
#It's possible to do both of these. Don't.
|
285
|
-
if permitted_genomes is not None:
|
286
|
-
sql_command = "SELECT * FROM genome_acc_kmer_counts WHERE genome IN ({genomes})".format(genomes=','.join(['?']*len(permitted_genomes)))
|
287
|
-
#data type is very important to SQL
|
288
|
-
sql_friendly = [int(permitted_genomes[i]) for i in range(0, len(permitted_genomes))]
|
289
|
-
for result in self.cursor.execute(sql_command, sql_friendly).fetchall():
|
290
|
-
genome, accession, kmer_ct = result[0], result[1], result[2]
|
291
|
-
self.gak[genome][accession] = kmer_ct
|
292
|
-
|
293
|
-
if permitted_accessions is not None:
|
294
|
-
sql_command = "SELECT * FROM genome_acc_kmer_counts WHERE accession IN ({accessions})".format(accessions=','.join(['?']*len(permitted_accessions)))
|
295
|
-
#data type is very important to SQL
|
296
|
-
#sql_friendly = [int(permitted_accessions[i]) for i in range(0, len(permitted_genomes))]
|
297
|
-
for result in self.cursor.execute(sql_command, permitted_accessions).fetchall():
|
298
|
-
genome, accession, kmer_ct = result[0], result[1], result[2]
|
299
|
-
self.gak[genome][accession] = kmer_ct
|
300
|
-
|
301
|
-
#Normal case
|
302
|
-
if permitted_accessions is None and permitted_genomes is None:
|
303
|
-
sql_command = "SELECT * FROM genome_acc_kmer_counts"
|
304
|
-
for result in self.cursor.execute(sql_command).fetchall():
|
305
|
-
genome, accession, kmer_ct = result[0], result[1], result[2]
|
306
|
-
self.gak[genome][accession] = kmer_ct
|
307
|
-
|
308
|
-
#un-defaultdict
|
309
|
-
self.gak = dict(self.gak)
|
310
|
-
for genome in self.gak:
|
311
|
-
self.gak[genome] = dict(self.gak[genome])
|
312
|
-
self.accessions = self.accessions.union(self.gak[genome].keys())
|
313
|
-
|
314
|
-
self.accessions = tuple(self.accessions)
|
315
|
-
|
316
|
-
def just_accessions(self):
|
317
|
-
converter = generate_accessions_index()
|
318
|
-
acc_sql = "SELECT name FROM sqlite_master WHERE type='table'"
|
319
|
-
tables = [item[0] for item in self.cursor.execute(acc_sql).fetchall()]
|
320
|
-
|
321
|
-
genome_tables = []
|
322
|
-
for table in tables:
|
323
|
-
if table.endswith('_genomes'):
|
324
|
-
genome_tables.append(table)
|
325
|
-
|
326
|
-
for table in genome_tables:
|
327
|
-
tables.pop(tables.index(table))
|
328
|
-
|
329
|
-
tables.pop(tables.index('genome_acc_kmer_counts'))
|
330
|
-
tables.pop(tables.index('genome_index'))
|
331
|
-
|
332
|
-
#Back to indicies.
|
333
|
-
tables = [converter[table.replace('_', '.')] for table in tables]
|
334
|
-
|
335
|
-
self.accessions = tuple(tables)
|
336
|
-
|
337
|
-
def unload_genomes_and_accessions(self):
|
338
|
-
self.gak = None
|
339
|
-
self.genome_index = None
|
340
|
-
#Go from index to name
|
341
|
-
self.reverse_genome_index = None
|
342
|
-
self.protein_counts_by_genome = None
|
343
|
-
|
344
|
-
#Child database class. This is only used during database builds and merges. Designed to take one single accession at a time and produce a correctly formatted table of kmers and accessions.
|
345
|
-
class child_database:
|
346
|
-
def __init__(self, path, parent):
|
347
|
-
#open SQL db and load in
|
348
|
-
|
349
|
-
self.path = path
|
350
|
-
self.exists = False
|
351
|
-
|
352
|
-
self.parent = parent
|
353
|
-
self.parent_exists = os.path.exists(parent)
|
354
|
-
|
355
|
-
self.connection = None
|
356
|
-
self.cursor = None
|
357
|
-
|
358
|
-
self.parent_connection = None
|
359
|
-
self.parent_cursor = None
|
360
|
-
|
361
|
-
self.verbosity = False
|
362
|
-
|
363
|
-
#Open an SQL connection
|
364
|
-
def activate_child_connection(self):
|
365
|
-
# Converts np.array to TEXT when inserting
|
366
|
-
##sqlite3.register_adapter(np.ndarray, adapt_array)
|
367
|
-
|
368
|
-
# Converts TEXT to np.array when selecting
|
369
|
-
sqlite3.register_converter("array", convert_array)
|
370
|
-
|
371
|
-
self.connection = sqlite3.connect(self.path, detect_types=sqlite3.PARSE_DECLTYPES)
|
372
|
-
self.cursor = self.connection.cursor()
|
373
|
-
self.exists = True
|
374
|
-
|
375
|
-
#Close an SQL connection
|
376
|
-
def close_child_connection(self):
|
377
|
-
self.cursor.close()
|
378
|
-
self.connection.close()
|
379
|
-
#True cleanup - even a closed SQL connection obj cannot be passed to multiple processors, but a nonetype can.
|
380
|
-
self.cursor = None
|
381
|
-
self.connection = None
|
382
|
-
|
383
|
-
def initialize_child_database(self):
|
384
|
-
if not self.exists:
|
385
|
-
print("I need to be activated first!")
|
386
|
-
else:
|
387
|
-
#DB exists. Add metadata tables.
|
388
|
-
self.cursor.execute(''' SELECT count(name) FROM sqlite_master WHERE type='table' AND name='genome_index' ''')
|
389
|
-
if self.cursor.fetchone()[0]!=1 :
|
390
|
-
self.cursor.execute('''CREATE TABLE genome_index
|
391
|
-
(genome text, gen_id integer, protein_count integer)''')
|
392
|
-
self.connection.commit()
|
393
|
-
|
394
|
-
self.cursor.execute(''' SELECT count(name) FROM sqlite_master WHERE type='table' AND name='genome_acc_kmer_counts' ''')
|
395
|
-
if self.cursor.fetchone()[0]!=1 :
|
396
|
-
self.cursor.execute('''CREATE TABLE genome_acc_kmer_counts
|
397
|
-
(genome integer, accession integer, count integer)''')
|
398
|
-
self.connection.commit()
|
399
|
-
|
400
|
-
|
401
|
-
#Access an existing master database
|
402
|
-
def activate_parent_connection(self):
|
403
|
-
if os.path.exists(self.parent):
|
404
|
-
self.parent_exists = True
|
405
|
-
#sqlite3.register_adapter(np.ndarray, adapt_array)
|
406
|
-
# Converts TEXT to np.array when selecting
|
407
|
-
sqlite3.register_converter("array", convert_array)
|
408
|
-
self.parent_connection = sqlite3.connect(self.parent, detect_types=sqlite3.PARSE_DECLTYPES)
|
409
|
-
self.parent_cursor = self.parent_connection.cursor()
|
410
|
-
|
411
|
-
#Close access to master DB
|
412
|
-
def close_parent_connection(self):
|
413
|
-
if self.parent_cursor is not None:
|
414
|
-
self.parent_cursor.close()
|
415
|
-
self.parent_connection.close()
|
416
|
-
self.parent_cursor = None
|
417
|
-
self.parent_connection = None
|
418
|
-
|
419
|
-
def add_genomes_first(self, accession, kmer_lists):
|
420
|
-
|
421
|
-
#kmer_lists = []
|
422
|
-
#Shoot... gotta pass the args
|
423
|
-
|
424
|
-
#for file in prepared_files:
|
425
|
-
# if accession in file.best_hits_kmers:
|
426
|
-
# kmer_lists.append((genome_index[file.basename], file.best_hits_kmers[accession].tobytes()))
|
427
|
-
|
428
|
-
sql_friendly_accession = accession.replace(".", "_")
|
429
|
-
|
430
|
-
self.cursor.execute(" DROP TABLE IF EXISTS " + sql_friendly_accession + "_genomes")
|
431
|
-
|
432
|
-
self.cursor.execute("CREATE TABLE " + sql_friendly_accession + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")
|
433
|
-
self.connection.commit()
|
434
|
-
|
435
|
-
self.cursor.executemany(" INSERT INTO " + sql_friendly_accession + "_genomes VALUES (?, ?) ", kmer_lists)
|
436
|
-
|
437
|
-
self.connection.commit()
|
438
|
-
|
439
|
-
return sql_friendly_accession
|
440
|
-
|
441
|
-
|
442
|
-
def add_accession(self, accession, insert_kmers):
|
443
|
-
sql_friendly_accession = accession.replace(".", "_")
|
444
|
-
|
445
|
-
if self.parent_exists:
|
446
|
-
parent_kmers = {}
|
447
|
-
#Check to see if this acc. is already in parent DB
|
448
|
-
table_exists = (self.parent_cursor.execute(" SELECT count(name) FROM sqlite_master WHERE type='table' AND name=(?)", (sql_friendly_accession,)).fetchone()[0] == 1)
|
449
|
-
#If the accession is in the parent DB
|
450
|
-
if table_exists:
|
451
|
-
#Select the records where the kmers are in the new kmers to be added - we don't have to modify the ones that aren't.
|
452
|
-
search_command = "SELECT * FROM "+ sql_friendly_accession + " WHERE kmer IN ({kmers})".format(kmers=','.join(['?']*len(insert_kmers)))
|
453
|
-
|
454
|
-
#Convert the kmers in the current insert list to the correct type for sql to match them
|
455
|
-
selection = tuple([int(key) for key in insert_kmers.keys()])
|
456
|
-
|
457
|
-
for item in self.parent_cursor.execute(search_command, selection).fetchall():
|
458
|
-
#Get the kmer for this parent
|
459
|
-
k = item[0]
|
460
|
-
#If the record would be modified in the parent, combine the to-add (which will replace the row) with the existing data. Otw. the record is unaffected and we can ignore it.
|
461
|
-
if k in insert_kmers:
|
462
|
-
insert_kmers[k] = np.union1d(insert_kmers[k], item[1])
|
463
|
-
|
464
|
-
|
465
|
-
#Free up the space.
|
466
|
-
del parent_kmers
|
467
|
-
|
468
|
-
formatted_kmers = []
|
469
|
-
|
470
|
-
#Translate the ndarray into its constituent byte data
|
471
|
-
for kmer in insert_kmers:
|
472
|
-
formatted_kmers.append((int(kmer), insert_kmers[kmer].tobytes(), ))
|
473
|
-
|
474
|
-
del insert_kmers
|
475
|
-
|
476
|
-
#Remove the child if it exists - it shouldn't ever exist because these child DBs should be deleted upon being added to the parent, but might if a run was stopped halfway.
|
477
|
-
self.cursor.execute(" DROP TABLE IF EXISTS " + sql_friendly_accession)
|
478
|
-
|
479
|
-
self.cursor.execute("CREATE TABLE " + sql_friendly_accession + " (kmer INTEGER PRIMARY KEY, genomes array)")
|
480
|
-
self.connection.commit()
|
481
|
-
|
482
|
-
self.cursor.executemany(" INSERT INTO " + sql_friendly_accession + " VALUES (?, ?) ", formatted_kmers)
|
483
|
-
|
484
|
-
self.connection.commit()
|
485
|
-
|
486
|
-
del formatted_kmers
|
487
|
-
|
488
|
-
return sql_friendly_accession
|
489
|
-
|
490
|
-
|
491
|
-
#Holds partial results for calculating AAI.
|
492
|
-
class calculation_database:
|
493
|
-
def __init__(self, path, precision):
|
494
|
-
#open SQL db and load in
|
495
|
-
|
496
|
-
self.path = path
|
497
|
-
self.exists = False
|
498
|
-
|
499
|
-
self.connection = None
|
500
|
-
self.cursor = None
|
501
|
-
|
502
|
-
self.genomes = None
|
503
|
-
|
504
|
-
self.verbosity = False
|
505
|
-
|
506
|
-
self.precision = precision
|
507
|
-
|
508
|
-
#Open an SQL connection
|
509
|
-
def activate_connection(self):
|
510
|
-
# Converts np.array to TEXT when inserting
|
511
|
-
##sqlite3.register_adapter(np.ndarray, adapt_array)
|
512
|
-
|
513
|
-
# Converts TEXT to np.array when selecting
|
514
|
-
if self.precision == "low":
|
515
|
-
sqlite3.register_converter("array", convert_float_array_16)
|
516
|
-
if self.precision == "med":
|
517
|
-
sqlite3.register_converter("array", convert_float_array_32)
|
518
|
-
if self.precision == "high":
|
519
|
-
sqlite3.register_converter("array", convert_float_array_64)
|
520
|
-
|
521
|
-
self.connection = sqlite3.connect(self.path, detect_types=sqlite3.PARSE_DECLTYPES)
|
522
|
-
self.cursor = self.connection.cursor()
|
523
|
-
self.exists = True
|
524
|
-
|
525
|
-
#Close an SQL connection
|
526
|
-
def close_connection(self):
|
527
|
-
self.cursor.close()
|
528
|
-
self.connection.close()
|
529
|
-
#True cleanup - even a closed SQL connection obj cannot be passed to multiple processors, but a nonetype can.
|
530
|
-
self.cursor = None
|
531
|
-
self.connection = None
|
532
|
-
|
533
|
-
def initialize_database(self):
|
534
|
-
if not self.exists:
|
535
|
-
print("I need to be activated first!")
|
536
|
-
else:
|
537
|
-
#DB exists. Add metadata tables.
|
538
|
-
self.cursor.execute("DROP TABLE IF EXISTS jaccards")
|
539
|
-
self.connection.commit()
|
540
|
-
self.cursor.execute("CREATE TABLE jaccards (genome INTEGER PRIMARY KEY, jaccards array)")
|
541
|
-
self.connection.commit()
|
542
|
-
|
543
|
-
'''
|
544
|
-
Class for handling all of the raw genome/protein/protein+HMM file inputs when building a database.
|
545
|
-
|
546
|
-
Takes a file or files and processes them from genome -> protein, protein -> hmm, prot+HMM -> kmerized protein best hits as numpy int arrays according to the kmer_index
|
547
|
-
|
548
|
-
'''
|
549
|
-
class input_file:
|
550
|
-
def __init__(self, input_path, output, verbosity):
|
551
|
-
#starting path for the file; irrelevant for protein and hmm, but otherwise useful for keeping track.
|
552
|
-
self.path = input_path
|
553
|
-
#Output directory starts with this
|
554
|
-
self.output = os.path.normpath(os.path.basename(output) + "/")
|
555
|
-
#For printing file updates, this is the input name
|
556
|
-
self.name = os.path.basename(input_path)
|
557
|
-
#original name is the key used for the genomes index later on.
|
558
|
-
self.original_name = os.path.basename(input_path)
|
559
|
-
#This is the name that can be used for building files with new extensions.
|
560
|
-
if input_path.endswith(".gz"):
|
561
|
-
#Remove .gz first to make names consistent.
|
562
|
-
self.basename = os.path.splitext(os.path.basename(input_path[:-3]))[0]
|
563
|
-
else:
|
564
|
-
self.basename = os.path.splitext(os.path.basename(input_path))[0]
|
565
|
-
#'genome' or 'protein' or 'protein and HMM'
|
566
|
-
self.status = None
|
567
|
-
#These will keep track of paths for each stage of file for us.
|
568
|
-
self.genome = None
|
569
|
-
self.protein = None
|
570
|
-
self.hmm = None
|
571
|
-
|
572
|
-
self.best_hits = None
|
573
|
-
self.best_hits_kmers = None
|
574
|
-
|
575
|
-
self.protein_count = 0
|
576
|
-
self.protein_kmer_count = {}
|
577
|
-
|
578
|
-
self.trans_table = None
|
579
|
-
self.start_time = None
|
580
|
-
self.end_time = None
|
581
|
-
self.err_log = ""
|
582
|
-
#doesn't get updated otw.
|
583
|
-
self.initial_state = "protein+HMM"
|
584
|
-
|
585
|
-
self.verbose = verbosity
|
586
|
-
|
587
|
-
#r_scripts_loc = os.path.dirname(sys.modules['metapop'].__file__) + "/metapop_r/"
|
588
|
-
#"00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm"
|
589
|
-
self.hmm_path = None
|
590
|
-
try:
|
591
|
-
#Try to locate the data bundled as it would be with a pip/conda install.
|
592
|
-
script_path = os.path.dirname(sys.modules['fastAAI_HMM_models'].__file__)
|
593
|
-
hmm_complete_model = script_path + '/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm'
|
594
|
-
self.hmm_path = str(hmm_complete_model)
|
595
|
-
#Check that the file exists or fail to the except.
|
596
|
-
fh = open(self.hmm_path)
|
597
|
-
fh.close()
|
598
|
-
except:
|
599
|
-
#Look in the same dir as the script; old method/MiGA friendly
|
600
|
-
script_path = Path(__file__)
|
601
|
-
script_dir = script_path.parent
|
602
|
-
hmm_complete_model = script_dir / "00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm"
|
603
|
-
self.hmm_path = str(hmm_complete_model)
|
604
|
-
|
605
|
-
#Functions for externally setting status and file paths of particular types
|
606
|
-
def set_genome(self, path):
|
607
|
-
self.status = 'genome'
|
608
|
-
self.genome = path
|
609
|
-
|
610
|
-
def set_protein(self, path):
|
611
|
-
self.status = 'protein'
|
612
|
-
self.protein = path
|
613
|
-
|
614
|
-
def set_hmm(self, path):
|
615
|
-
if self.protein is None:
|
616
|
-
print("Warning! I don't have a protein yet, so this HMM will be useless to me until I do!")
|
617
|
-
self.status = 'protein and hmm'
|
618
|
-
self.hmm = path
|
619
|
-
|
620
|
-
#Runs prodigal, compares translation tables and stores faa files
|
621
|
-
def genome_to_protein(self):
|
622
|
-
if self.genome is None:
|
623
|
-
print(self.name, "wasn't a declared as a genome! I can't make this into a protein!")
|
624
|
-
else:
|
625
|
-
folder = Path(self.output + "/predicted_proteins")
|
626
|
-
protein_output = folder / (self.basename + '.faa')
|
627
|
-
output_11 = folder / (self.basename + '.faa.11')
|
628
|
-
output_4 = folder / (self.basename + '.faa.4')
|
629
|
-
temp_output = folder / (self.basename + '.temp')
|
630
|
-
|
631
|
-
intermediate = folder / (self.basename + '_genome_intermediate.fasta')
|
632
|
-
|
633
|
-
#total_bases = 0
|
634
|
-
|
635
|
-
genome_parser = agnostic_reader(self.genome)
|
636
|
-
|
637
|
-
if genome_parser.is_gz:
|
638
|
-
#File was a gzip; decompress it to an intermediate file and then run prodigal; delete after
|
639
|
-
#print("unzipping input...")
|
640
|
-
midpoint = open(intermediate, "w")
|
641
|
-
#Count input bases and write an unzipped file for prodigal's sake.
|
642
|
-
for line in genome_parser:
|
643
|
-
#if not line.startswith(">"):
|
644
|
-
# total_bases += len(line.strip())
|
645
|
-
midpoint.write(line)
|
646
|
-
|
647
|
-
midpoint.close()
|
648
|
-
|
649
|
-
else:
|
650
|
-
#File is already unzipped, just point to it
|
651
|
-
intermediate = self.genome
|
652
|
-
#Count input bases
|
653
|
-
#for line in genome_parser:
|
654
|
-
# if not line.startswith(">"):
|
655
|
-
# total_bases += len(line.strip())
|
656
|
-
|
657
|
-
genome_parser.close()
|
658
|
-
'''
|
659
|
-
A chunk of code originally indended to match GTDBtk's table selection criteria.
|
660
|
-
if total_bases > 100000:
|
661
|
-
#training mode
|
662
|
-
subprocess.call(["prodigal", "-i", str(intermediate), "-a", str(output_11), "-q", "-o", str(temp_output)])
|
663
|
-
subprocess.call(["prodigal", "-i", str(intermediate), "-a", str(output_4), "-g", "4", "-q", "-o", str(temp_output)])
|
664
|
-
else:
|
665
|
-
#Metagenome mode for very short genomes.
|
666
|
-
subprocess.call(["prodigal", "-i", str(intermediate), "-p", "meta", "-a", str(output_11), "-q", "-o", str(temp_output)])
|
667
|
-
subprocess.call(["prodigal", "-i", str(intermediate), "-p", "meta", "-a", str(output_4), "-g", "4", "-q", "-o", str(temp_output)])
|
668
|
-
'''
|
669
|
-
|
670
|
-
subprocess.call(["prodigal", "-i", str(intermediate), "-a", str(output_11), "-q", "-o", str(temp_output)])
|
671
|
-
subprocess.call(["prodigal", "-i", str(intermediate), "-a", str(output_4), "-g", "4", "-q", "-o", str(temp_output)])
|
672
|
-
|
673
|
-
#We can get rid of the temp file immediately, we won't be using it
|
674
|
-
temp_output.unlink()
|
675
|
-
if genome_parser.is_gz:
|
676
|
-
#If the file was copied, delete. Otw. this would delete the input and we don't want that.
|
677
|
-
intermediate.unlink()
|
678
|
-
|
679
|
-
# Compare translation tables
|
680
|
-
length_4 = 0
|
681
|
-
length_11 = 0
|
682
|
-
with open(output_4, 'r') as table_4:
|
683
|
-
for line in table_4:
|
684
|
-
if line.startswith(">"):
|
685
|
-
continue
|
686
|
-
else:
|
687
|
-
length_4 += len(line.strip())
|
688
|
-
|
689
|
-
with open(output_11, 'r') as table_11:
|
690
|
-
for line in table_11:
|
691
|
-
if line.startswith(">"):
|
692
|
-
continue
|
693
|
-
else:
|
694
|
-
length_11 += len(line.strip())
|
695
|
-
|
696
|
-
#Select the winning translation table and remove the other. Open the winner.
|
697
|
-
if (length_4 / length_11) >= 1.1:
|
698
|
-
output_11.unlink()
|
699
|
-
self.trans_table = "4"
|
700
|
-
chosen_protein = open(output_4, 'r')
|
701
|
-
table_11 = False
|
702
|
-
else:
|
703
|
-
output_4.unlink()
|
704
|
-
self.trans_table = "11"
|
705
|
-
chosen_protein = open(output_11, 'r')
|
706
|
-
table_11 = True
|
707
|
-
|
708
|
-
destination = open(protein_output, "w")
|
709
|
-
|
710
|
-
#Clean the winning output.
|
711
|
-
for line in chosen_protein:
|
712
|
-
if line.startswith(">"):
|
713
|
-
destination.write("{}".format(line))
|
714
|
-
else:
|
715
|
-
line = line.replace('*', '')
|
716
|
-
destination.write("{}".format(line))
|
717
|
-
|
718
|
-
destination.close()
|
719
|
-
chosen_protein.close()
|
720
|
-
|
721
|
-
# Remove the winning intermediate file, since we have the cleaned output
|
722
|
-
if table_11:
|
723
|
-
output_11.unlink()
|
724
|
-
else:
|
725
|
-
output_4.unlink()
|
726
|
-
|
727
|
-
self.set_protein(str(protein_output))
|
728
|
-
|
729
|
-
#run hmmsearch on a protein
|
730
|
-
def protein_to_hmm(self):
|
731
|
-
if self.protein is None:
|
732
|
-
print(self.name, "wasn't a declared as a protein! I can't make this into an HMM!")
|
733
|
-
else:
|
734
|
-
|
735
|
-
folder = Path(self.output + "/hmms")
|
736
|
-
|
737
|
-
hmm_output = folder / (self.basename + '.hmm')
|
738
|
-
temp_output = folder / (self.basename + '.temp')
|
739
|
-
|
740
|
-
intermediate = folder / (self.basename + '_protein_intermediate.faa')
|
741
|
-
|
742
|
-
current_protein = ""
|
743
|
-
current_seq = ""
|
744
|
-
|
745
|
-
protein_parser = agnostic_reader(self.protein)
|
746
|
-
|
747
|
-
#File was a gzip; decompress it to an intermediate file and then run prodigal; delete after
|
748
|
-
#Keeps track of \n chars in the protein sequences.
|
749
|
-
line_ct = 0
|
750
|
-
midpoint = open(intermediate, "w")
|
751
|
-
|
752
|
-
for line in protein_parser:
|
753
|
-
if line.startswith(">"):
|
754
|
-
if len(current_seq) > 0:
|
755
|
-
if len(current_seq) < 100000:
|
756
|
-
midpoint.write(current_protein)
|
757
|
-
midpoint.write(current_seq)
|
758
|
-
else:
|
759
|
-
self.err_log += "Protein " + current_protein.strip().split()[0][1:] + " was observed to have >100K amino acids ( " + str(len(current_seq) - line_ct) + " AA found ). It was skipped. "
|
760
|
-
#print("Protein", current_protein.strip()[1:], "was observed to have >100K amino acids (", len(current_seq) - line_ct, "AA found ).", file = sys.stderr)
|
761
|
-
#print("HMMER cannot handle sequences that long, and the protein is almost certainly erroneous, anyway.", file = sys.stderr)
|
762
|
-
#print("The protein will be skipped, and FastAAI will continue without it.", file = sys.stderr)
|
763
|
-
|
764
|
-
current_protein = line
|
765
|
-
current_seq = ""
|
766
|
-
line_ct = 0
|
767
|
-
else:
|
768
|
-
line_ct += 1
|
769
|
-
current_seq += line
|
770
|
-
|
771
|
-
protein_parser.close()
|
772
|
-
|
773
|
-
#Finally, last prot
|
774
|
-
if len(current_seq) > 0:
|
775
|
-
if len(current_seq) < 100000:
|
776
|
-
midpoint.write(current_protein)
|
777
|
-
midpoint.write(current_seq)
|
778
|
-
else:
|
779
|
-
self.err_log += "Protein " + current_protein.strip().split()[0][1:] + " was observed to have >100K amino acids ( " + str(len(current_seq) - line_ct) + " AA found ). It was skipped. "
|
780
|
-
#print("Protein", current_protein.strip()[1:], "was observed to have >100K amino acids (", len(current_seq) - line_ct, "AA found ).", file = sys.stderr)
|
781
|
-
#print("HMMER cannot handle sequences that long, and the protein is almost certainly erroneous, anyway.", file = sys.stderr)
|
782
|
-
#print("The protein will be skipped, and FastAAI will continue without it.", file = sys.stderr)
|
783
|
-
|
784
|
-
midpoint.close()
|
785
|
-
|
786
|
-
#Should locate the DBs regardless of path.
|
787
|
-
script_path = Path(__file__)
|
788
|
-
script_dir = script_path.parent
|
789
|
-
hmm_complete_model = script_dir / "00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm"
|
790
|
-
|
791
|
-
subprocess.call(["hmmsearch", "--tblout", str(hmm_output), "-o", str(temp_output), "--cut_tc", "--cpu", "1",
|
792
|
-
str(hmm_complete_model), str(intermediate)])
|
793
|
-
|
794
|
-
temp_output.unlink()
|
795
|
-
intermediate.unlink()
|
796
|
-
|
797
|
-
self.set_hmm(str(hmm_output))
|
798
|
-
|
799
|
-
def prot_and_hmm_to_besthits(self):
|
800
|
-
prots = []
|
801
|
-
accs = []
|
802
|
-
scores = []
|
803
|
-
f = agnostic_reader(self.hmm)
|
804
|
-
for line in f:
|
805
|
-
if line.startswith("#"):
|
806
|
-
continue
|
807
|
-
else:
|
808
|
-
segs = line.strip().split()
|
809
|
-
prots.append(segs[0])
|
810
|
-
accs.append(segs[3])
|
811
|
-
scores.append(segs[8])
|
812
|
-
|
813
|
-
f.close()
|
814
|
-
|
815
|
-
hmm_file = np.transpose(np.array([prots, accs, scores]))
|
816
|
-
|
817
|
-
#hmm_file = np.loadtxt(hmm_file_name, comments = '#', usecols = (0, 3, 8), dtype=(str))
|
818
|
-
#Sort the hmm file based on the score column in descending order.
|
819
|
-
hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]
|
820
|
-
|
821
|
-
#Identify the first row where each gene name appears, after sorting by score;
|
822
|
-
#in effect, return the highest scoring assignment per gene name
|
823
|
-
#Sort the indices of the result to match the score-sorted table instead of alphabetical order of gene names
|
824
|
-
hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]
|
825
|
-
|
826
|
-
#Filter the file again for the unique ACCESSION names, since we're only allowed one gene per accession, I guess?
|
827
|
-
#Don't sort the indices, we don't care about the scores anymore.
|
828
|
-
hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]
|
829
|
-
|
830
|
-
self.best_hits = dict(zip(hmm_file[:,0], hmm_file[:,1]))
|
831
|
-
|
832
|
-
self.best_hits_kmers = {}
|
833
|
-
current_seq = ""
|
834
|
-
current_prot = ""
|
835
|
-
is_besthit = False
|
836
|
-
|
837
|
-
prot = agnostic_reader(self.protein)
|
838
|
-
|
839
|
-
for line in prot:
|
840
|
-
|
841
|
-
if line.startswith(">"):
|
842
|
-
if len(current_seq) > 0:
|
843
|
-
kmer_set = unique_kmers(current_seq, 4)
|
844
|
-
self.protein_kmer_count[current_prot] = kmer_set.shape[0]
|
845
|
-
self.protein_count += 1
|
846
|
-
self.best_hits_kmers[current_prot] = kmer_set
|
847
|
-
#Select the best hit accession for this protein and just record that. We do not care about the names of the proteins.
|
848
|
-
current_prot = line[1:].strip().split(" ")[0]
|
849
|
-
if current_prot in self.best_hits:
|
850
|
-
current_prot = self.best_hits[current_prot]
|
851
|
-
is_besthit = True
|
852
|
-
else:
|
853
|
-
is_besthit = False
|
854
|
-
current_seq = ""
|
855
|
-
else:
|
856
|
-
if is_besthit:
|
857
|
-
current_seq += line.strip()
|
858
|
-
|
859
|
-
prot.close()
|
860
|
-
|
861
|
-
#Final iter. doesn't happen otw.
|
862
|
-
if current_prot in self.best_hits:
|
863
|
-
kmer_set = unique_kmers(current_seq, 4)
|
864
|
-
#kmer_set = [kmer_index[k] for k in kmer_set]
|
865
|
-
self.protein_kmer_count[current_prot] = kmer_set.shape[0]
|
866
|
-
self.protein_count += 1
|
867
|
-
self.best_hits_kmers[current_prot] = kmer_set
|
868
|
-
|
869
|
-
self.status = "finished preprocessing"
|
870
|
-
|
871
|
-
def preprocess(self):
|
872
|
-
#There's no advancement stage for protein and HMM
|
873
|
-
if self.status == 'genome':
|
874
|
-
start_time = curtime()
|
875
|
-
#report = True
|
876
|
-
if self.start_time is None:
|
877
|
-
self.start_time = start_time
|
878
|
-
|
879
|
-
if self.initial_state == "protein+HMM":
|
880
|
-
self.initial_state = "genome"
|
881
|
-
|
882
|
-
self.genome_to_protein()
|
883
|
-
|
884
|
-
|
885
|
-
if self.status == 'protein':
|
886
|
-
start_time = curtime()
|
887
|
-
#report = True
|
888
|
-
if self.start_time is None:
|
889
|
-
self.start_time = start_time
|
890
|
-
|
891
|
-
if self.initial_state == "protein+HMM":
|
892
|
-
self.initial_state = "protein"
|
893
|
-
|
894
|
-
self.protein_to_hmm()
|
895
|
-
|
896
|
-
if self.status == 'protein and hmm':
|
897
|
-
start_time = curtime()
|
898
|
-
|
899
|
-
if self.start_time is None:
|
900
|
-
self.start_time = start_time
|
901
|
-
|
902
|
-
self.prot_and_hmm_to_besthits()
|
903
|
-
|
904
|
-
#Add an end time if either genome -> protein -> HMM or protein -> HMM happened.
|
905
|
-
if self.start_time is not None:
|
906
|
-
end_time = curtime()
|
907
|
-
self.end_time = end_time
|
908
|
-
else:
|
909
|
-
#Start was protein+HMM. There was no runtime, and intitial state is p+hmm
|
910
|
-
#self.initial_state = "protein+HMM"
|
911
|
-
self.start_time = "N/A"
|
912
|
-
self.end_time = "N/A"
|
913
|
-
|
914
|
-
#Protein not generated on this run.
|
915
|
-
if self.trans_table is None:
|
916
|
-
self.trans_table = "unknown"
|
917
|
-
|
918
|
-
'''
|
919
|
-
Viral functions
|
920
|
-
'''
|
921
|
-
#No translation table comparison for viruses. Slightly reduced logic.
|
922
|
-
def viral_genome_to_protein(self):
|
923
|
-
if self.genome is None:
|
924
|
-
print(self.name, "wasn't a declared as a genome! I can't make this into a protein!")
|
925
|
-
else:
|
926
|
-
folder = Path(self.output + "/predicted_proteins")
|
927
|
-
intermediate_protein_output = folder / (self.basename + '.intermediate.faa')
|
928
|
-
final_protein_output = folder / (self.basename + '.faa')
|
929
|
-
temp_output = folder / (self.basename + '.temp')
|
930
|
-
|
931
|
-
subprocess.call(["prodigal", "-i", str(self.genome), "-a", str(intermediate_protein_output), "-p", "meta", "-q", "-o", str(temp_output)])
|
932
|
-
|
933
|
-
# Remove intermediate files
|
934
|
-
temp_output.unlink()
|
935
|
-
|
936
|
-
chosen_protein = open(intermediate_protein_output, 'r')
|
937
|
-
destination = open(final_protein_output, "w")
|
938
|
-
|
939
|
-
for line in chosen_protein:
|
940
|
-
if line.startswith(">"):
|
941
|
-
destination.write("{}".format(line))
|
942
|
-
else:
|
943
|
-
line = line.replace('*', '')
|
944
|
-
destination.write("{}".format(line))
|
945
|
-
|
946
|
-
destination.close()
|
947
|
-
chosen_protein.close()
|
948
|
-
|
949
|
-
intermediate_protein_output.unlink()
|
950
|
-
|
951
|
-
self.protein = str(protein_output)
|
952
|
-
self.status = 'protein'
|
953
|
-
|
954
|
-
|
955
|
-
'''
|
956
|
-
Preprocessing functions
|
957
|
-
|
958
|
-
Read directories, advance files to hmms as needed.
|
959
|
-
'''
|
960
|
-
#Toy function for passing to a pool
|
961
|
-
def do_advance(input_file_object):
|
962
|
-
input_file_object.preprocess()
|
963
|
-
return input_file_object
|
964
|
-
|
965
|
-
def initialize_preproc(index):
|
966
|
-
global kmer_index
|
967
|
-
kmer_index = index
|
968
|
-
|
969
|
-
#Function which takes an input list
|
970
|
-
def advance_inputs(genomes = None, proteins = None, hmms = None, genomes_file = None, proteins_file = None, hmms_file = None, output = "FastAAI", threads = 1, verbose = False, db_name = ""):
|
971
|
-
inputs = []
|
972
|
-
|
973
|
-
hmm_broke = False
|
974
|
-
|
975
|
-
if genomes_file is not None:
|
976
|
-
fh = agnostic_reader(genomes_file)
|
977
|
-
|
978
|
-
for line in fh:
|
979
|
-
clean = line.strip()
|
980
|
-
if not os.path.exists(clean):
|
981
|
-
print("I can't find file", clean, "Are you sure this file exists and can be found from your current directory using the path you supplied in the input file?")
|
982
|
-
else:
|
983
|
-
current_file = input_file(clean, output, verbose)
|
984
|
-
current_file.set_genome(clean)
|
985
|
-
inputs.append(current_file)
|
986
|
-
del current_file
|
987
|
-
|
988
|
-
fh.close()
|
989
|
-
|
990
|
-
if proteins_file is not None:
|
991
|
-
fh = agnostic_reader(proteins_file)
|
992
|
-
|
993
|
-
for line in fh:
|
994
|
-
#GOTOGOTO
|
995
|
-
print(line)
|
996
|
-
|
997
|
-
clean = line.strip()
|
998
|
-
if not os.path.exists(clean):
|
999
|
-
print("I can't find file", clean, "Are you sure this file exists and can be found from your current directory using the path you supplied in the input file?")
|
1000
|
-
else:
|
1001
|
-
current_file = input_file(clean, output, verbose)
|
1002
|
-
current_file.set_protein(clean)
|
1003
|
-
inputs.append(current_file)
|
1004
|
-
del current_file
|
1005
|
-
|
1006
|
-
fh.close()
|
1007
|
-
|
1008
|
-
if hmms_file is not None:
|
1009
|
-
fh = agnostic_reader(hmms_file)
|
1010
|
-
|
1011
|
-
hmm_pairs = []
|
1012
|
-
|
1013
|
-
for line in fh:
|
1014
|
-
clean = line.strip()
|
1015
|
-
if not os.path.exists(clean):
|
1016
|
-
print("I can't find file", clean, "Are you sure this file exists and can be found from your current directory using the path you supplied in the input file?")
|
1017
|
-
else:
|
1018
|
-
hmm_pairs.append(clean)
|
1019
|
-
|
1020
|
-
fh.close()
|
1021
|
-
|
1022
|
-
if len(hmm_pairs) != len(inputs):
|
1023
|
-
print("Protein and HMM file counts differ! There must be one HMM per protein, generated from its paired protein! These pairs must be in the same order in your input file!")
|
1024
|
-
hmm_broke = True
|
1025
|
-
else:
|
1026
|
-
for h, i in zip(hmm_pairs, inputs):
|
1027
|
-
i.set_hmm(h)
|
1028
|
-
|
1029
|
-
if genomes is not None:
|
1030
|
-
set = os.listdir(genomes)
|
1031
|
-
#Sort is used to ensure lexicographic ordering.
|
1032
|
-
set.sort()
|
1033
|
-
set = [os.path.normpath(genomes + "/" + file) for file in set]
|
1034
|
-
|
1035
|
-
for file in set:
|
1036
|
-
if not os.path.exists(file):
|
1037
|
-
print("I can't find", file, "Are you sure this file exists in the directory you supplied?")
|
1038
|
-
else:
|
1039
|
-
current_file = input_file(file, output, verbose)
|
1040
|
-
current_file.set_genome(file)
|
1041
|
-
inputs.append(current_file)
|
1042
|
-
del current_file
|
1043
|
-
|
1044
|
-
if proteins is not None:
|
1045
|
-
set = os.listdir(proteins)
|
1046
|
-
set.sort()
|
1047
|
-
set = [os.path.normpath(proteins + "/" + file) for file in set]
|
1048
|
-
|
1049
|
-
for file in set:
|
1050
|
-
if not os.path.exists(file):
|
1051
|
-
print("I can't find", file, "Are you sure this file exists in the directory you supplied?")
|
1052
|
-
else:
|
1053
|
-
current_file = input_file(file, output, verbose)
|
1054
|
-
current_file.set_protein(file)
|
1055
|
-
inputs.append(current_file)
|
1056
|
-
del current_file
|
1057
|
-
|
1058
|
-
if hmms is not None:
|
1059
|
-
set = os.listdir(hmms)
|
1060
|
-
set.sort()
|
1061
|
-
set = [os.path.normpath(hmms + "/" + file) for file in set]
|
1062
|
-
|
1063
|
-
hmm_pairs = []
|
1064
|
-
|
1065
|
-
for file in set:
|
1066
|
-
if not os.path.exists(file):
|
1067
|
-
print("I can't find", file, "Are you sure this file exists in the directory you supplied?")
|
1068
|
-
else:
|
1069
|
-
hmm_pairs.append(file)
|
1070
|
-
|
1071
|
-
if len(hmm_pairs) != len(inputs):
|
1072
|
-
print("Protein and HMM file counts differ! There must be one HMM per protein, generated from its paired protein! These must be in the same alphabetical order in their respective directories!")
|
1073
|
-
hmm_broke = True
|
1074
|
-
else:
|
1075
|
-
for h, i in zip(hmm_pairs, inputs):
|
1076
|
-
i.set_hmm(h)
|
1077
|
-
|
1078
|
-
if hmm_broke:
|
1079
|
-
print("FastAAI can't proceed without matching HMM and protein pairs.")
|
1080
|
-
inputs = None
|
1081
|
-
return inputs
|
1082
|
-
|
1083
|
-
total_counts = len(inputs)
|
1084
|
-
count = 0
|
1085
|
-
last_pct = 0
|
1086
|
-
|
1087
|
-
if verbose:
|
1088
|
-
print("")
|
1089
|
-
#progress bar - possible dangerous use of the return to line start sequence.
|
1090
|
-
try:
|
1091
|
-
percentage = 0
|
1092
|
-
sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Genome ' + str(count) + " of " + str(total_counts) + ') at ' + curtime()+"\n")
|
1093
|
-
sys.stdout.flush()
|
1094
|
-
except:
|
1095
|
-
#It's not really a big deal if the progress bar cannot be printed.
|
1096
|
-
pass
|
1097
|
-
|
1098
|
-
results = []
|
1099
|
-
|
1100
|
-
kmer_index_ = create_kmer_index()
|
1101
|
-
pool = multiprocessing.Pool(threads, initializer=initialize_preproc, initargs = (kmer_index_,))
|
1102
|
-
|
1103
|
-
for res in pool.imap(do_advance, inputs):
|
1104
|
-
results.append(res)
|
1105
|
-
if verbose:
|
1106
|
-
#progress bar - possible dangerous use of the return to line start sequence.
|
1107
|
-
try:
|
1108
|
-
count += 1
|
1109
|
-
percentage = (count/total_counts)*100
|
1110
|
-
if int(percentage/2) > last_pct or partition == total_partitions:
|
1111
|
-
sys.stdout.write('\033[A')
|
1112
|
-
sys.stdout.flush()
|
1113
|
-
sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Genome ' + str(count) + " of " + str(total_counts) + ') at ' + curtime()+"\n")
|
1114
|
-
sys.stdout.flush()
|
1115
|
-
|
1116
|
-
last_pct = int(percentage/2)
|
1117
|
-
except:
|
1118
|
-
#It's not really a big deal if the progress bar cannot be printed.
|
1119
|
-
pass
|
1120
|
-
|
1121
|
-
pool.close()
|
1122
|
-
pool.join()
|
1123
|
-
|
1124
|
-
inputs = results
|
1125
|
-
|
1126
|
-
log_time = curtime()
|
1127
|
-
|
1128
|
-
if os.path.exists(os.path.normpath(output + "/logs/" + os.path.splitext(os.path.basename(db_name))[0] + "_preprocessing_log.txt")):
|
1129
|
-
preproc_log = open(os.path.normpath(output + "/logs/" + os.path.splitext(os.path.basename(db_name))[0] + "_preprocessing_log.txt"), "a")
|
1130
|
-
else:
|
1131
|
-
preproc_log = open(os.path.normpath(output + "/logs/" + os.path.splitext(os.path.basename(db_name))[0] + "_preprocessing_log.txt"), "w")
|
1132
|
-
print("log_date", "genome_name", "started_as_a", "start_time", "end_time", "protein_translation_table", "errors", sep = "\t", file = preproc_log)
|
1133
|
-
for i in inputs:
|
1134
|
-
print(log_time, i.basename, i.initial_state, i.start_time, i.end_time, i.trans_table, i.err_log, sep = "\t", file = preproc_log)
|
1135
|
-
preproc_log.close()
|
1136
|
-
|
1137
|
-
return inputs
|
1138
|
-
|
1139
|
-
'''
|
1140
|
-
Utility functions
|
1141
|
-
'''
|
1142
|
-
def prepare_directories(output, status, build_or_query):
|
1143
|
-
preparation_successful = True
|
1144
|
-
|
1145
|
-
if not os.path.exists(output):
|
1146
|
-
try:
|
1147
|
-
os.mkdir(output)
|
1148
|
-
except:
|
1149
|
-
print("")
|
1150
|
-
print("FastAAI tried to make output directory: '"+ output + "' but failed.")
|
1151
|
-
print("")
|
1152
|
-
print("Troubleshooting:")
|
1153
|
-
print("")
|
1154
|
-
print(" (1) Do you have permission to create directories in the location you specified?")
|
1155
|
-
print(" (2) Did you make sure that all directories other than", os.path.basename(output), "already exist?")
|
1156
|
-
print("")
|
1157
|
-
preparation_successful = False
|
1158
|
-
|
1159
|
-
if preparation_successful:
|
1160
|
-
try:
|
1161
|
-
if status == 'genome':
|
1162
|
-
if not os.path.exists(os.path.normpath(output + "/" + "predicted_proteins")):
|
1163
|
-
os.mkdir(os.path.normpath(output + "/" + "predicted_proteins"))
|
1164
|
-
if not os.path.exists(os.path.normpath(output + "/" + "hmms")):
|
1165
|
-
os.mkdir(os.path.normpath(output + "/" + "hmms"))
|
1166
|
-
|
1167
|
-
if status == 'protein':
|
1168
|
-
if not os.path.exists(os.path.normpath(output + "/" + "hmms")):
|
1169
|
-
os.mkdir(os.path.normpath(output + "/" + "hmms"))
|
1170
|
-
|
1171
|
-
if not os.path.exists(os.path.normpath(output + "/" + "logs")):
|
1172
|
-
os.mkdir(os.path.normpath(output + "/" + "logs"))
|
1173
|
-
|
1174
|
-
if build_or_query == "build":
|
1175
|
-
if not os.path.exists(os.path.normpath(output + "/" + "database")):
|
1176
|
-
os.mkdir(os.path.normpath(output + "/" + "database"))
|
1177
|
-
|
1178
|
-
if build_or_query == "query":
|
1179
|
-
if not os.path.exists(os.path.normpath(output + "/" + "results")):
|
1180
|
-
os.mkdir(os.path.normpath(output + "/" + "results"))
|
1181
|
-
|
1182
|
-
|
1183
|
-
except:
|
1184
|
-
print("FastAAI was able to create or find", output, "but couldn't make directories there.")
|
1185
|
-
print("")
|
1186
|
-
print("This shouldn't happen. Do you have permission to write to that directory?")
|
1187
|
-
|
1188
|
-
|
1189
|
-
return preparation_successful
|
1190
|
-
|
1191
|
-
def check_out_input_files(genomes, proteins, hmms, gf, pf, hf):
|
1192
|
-
#Check only one method of supply was used per file type
|
1193
|
-
if (genomes is not None) and (gf is not None):
|
1194
|
-
print("Supply genomes either by directory or by file, not both.")
|
1195
|
-
return None
|
1196
|
-
if (proteins is not None) and (pf is not None):
|
1197
|
-
print("Supply proteins either by directory or by file, not both.")
|
1198
|
-
return None
|
1199
|
-
if (hmms is not None) and (hf is not None):
|
1200
|
-
print("Supply HMMs either by directory or by file, not both.")
|
1201
|
-
return None
|
1202
|
-
|
1203
|
-
#check that not both proteins and genomes supplied in any combo.
|
1204
|
-
if ((genomes is not None) and (pf is not None))\
|
1205
|
-
or ((gf is not None) and (proteins is not None))\
|
1206
|
-
or ((genomes is not None) and (proteins is not None))\
|
1207
|
-
or ((gf is not None) and (pf is not None)):
|
1208
|
-
print("Supply either genomes or proteins, not both. You can supply proteins and HMMs, but not genomes and proteins.")
|
1209
|
-
return None
|
1210
|
-
|
1211
|
-
#Check that if hmms are given, so are proteins
|
1212
|
-
if (hmms is not None) or (hf is not None):
|
1213
|
-
if (proteins is None) and (pf is None):
|
1214
|
-
print("If you supply HMMs, you also have to supply the proteins from which they were generated.")
|
1215
|
-
return None
|
1216
|
-
|
1217
|
-
#Determine status
|
1218
|
-
if (genomes is not None) or (gf is not None):
|
1219
|
-
print("Starting from genomes")
|
1220
|
-
start = 'genome'
|
1221
|
-
|
1222
|
-
else:
|
1223
|
-
if (hmms is not None) or (hf is not None):
|
1224
|
-
print("Starting from proteins and HMMs")
|
1225
|
-
start = 'protein and HMM'
|
1226
|
-
|
1227
|
-
else:
|
1228
|
-
print("Starting from proteins")
|
1229
|
-
start = 'protein'
|
1230
|
-
|
1231
|
-
return start
|
1232
|
-
|
1233
|
-
|
1234
|
-
#Build DB from genomes
|
1235
|
-
|
1236
|
-
def unique_kmers(seq, ksize):
|
1237
|
-
n_kmers = len(seq) - ksize + 1
|
1238
|
-
kmers = []
|
1239
|
-
for i in range(n_kmers):
|
1240
|
-
kmers.append(kmer_index[seq[i:i + ksize]])
|
1241
|
-
#We care about the type because we're working with bytes later.
|
1242
|
-
return np.unique(kmers).astype(np.int32)
|
1243
|
-
|
1244
|
-
#Quickly creates a dict of all poss. tetramers in a fixed, alphabetical order.
|
1245
|
-
#This can be used to index kmers so that the indices are identical (and thus interchangable) on separate runs of this program.
|
1246
|
-
def create_kmer_index():
|
1247
|
-
valid_chars = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', '*']
|
1248
|
-
#This meshgrid method will produce all unique tetramers from AAAA to **** in a consistent order.
|
1249
|
-
#Rightmost char to leftmost, A to * in the same order as valid_chars
|
1250
|
-
kmer_index_ = np.stack(np.meshgrid(valid_chars, valid_chars, valid_chars, valid_chars), -1).reshape(-1, 4)
|
1251
|
-
#Unless someone is passing more than 2.1 billion genomes, int32 will be enough.
|
1252
|
-
kmer_index_ = dict(zip([''.join(kmer_index_[i,]) for i in range(0, kmer_index_.shape[0])], np.arange(kmer_index_.shape[0], dtype = np.int32)))
|
1253
|
-
|
1254
|
-
return kmer_index_
|
1255
|
-
|
1256
|
-
def split_seq(seq, num_grps):
|
1257
|
-
newseq = []
|
1258
|
-
splitsize = 1.0/num_grps*len(seq)
|
1259
|
-
for i in range(num_grps):
|
1260
|
-
newseq.append(seq[int(round(i*splitsize)):int(round((i+1)*splitsize))])
|
1261
|
-
return newseq
|
1262
|
-
|
1263
|
-
#gives the max and min index needed to split a list of (max_val) genomes into
|
1264
|
-
def split_indicies(max_val, num_grps):
|
1265
|
-
newseq = []
|
1266
|
-
splitsize = 1.0/num_grps*max_val
|
1267
|
-
for i in range(num_grps):
|
1268
|
-
newseq.append(((round(i*splitsize)), round((i+1)*splitsize)))
|
1269
|
-
return newseq
|
1270
|
-
|
1271
|
-
def list_to_index_dict(list):
|
1272
|
-
result = {}
|
1273
|
-
counter = 0
|
1274
|
-
for item in list:
|
1275
|
-
result[item] = counter
|
1276
|
-
counter += 1
|
1277
|
-
return result
|
1278
|
-
|
1279
|
-
def generate_accessions_index():
	list_of_poss_accs = list_to_index_dict(['PF01780.19', 'PF03948.14', 'PF17144.4', 'PF00830.19', 'PF00347.23', 'PF16906.5', 'PF13393.6',
	'PF02565.15', 'PF01991.18', 'PF01984.20', 'PF00861.22', 'PF13656.6', 'PF00368.18', 'PF01142.18', 'PF00312.22', 'PF02367.17',
	'PF01951.16', 'PF00749.21', 'PF01655.18', 'PF00318.20', 'PF01813.17', 'PF01649.18', 'PF01025.19', 'PF00380.19', 'PF01282.19',
	'PF01864.17', 'PF01783.23', 'PF01808.18', 'PF01982.16', 'PF01715.17', 'PF00213.18', 'PF00119.20', 'PF00573.22', 'PF01981.16',
	'PF00281.19', 'PF00584.20', 'PF00825.18', 'PF00406.22', 'PF00177.21', 'PF01192.22', 'PF05833.11', 'PF02699.15', 'PF01016.19',
	'PF01765.19', 'PF00453.18', 'PF01193.24', 'PF05221.17', 'PF00231.19', 'PF00416.22', 'PF02033.18', 'PF01668.18', 'PF00886.19',
	'PF00252.18', 'PF00572.18', 'PF00366.20', 'PF04104.14', 'PF04919.12', 'PF01912.18', 'PF00276.20', 'PF00203.21', 'PF00889.19',
	'PF02996.17', 'PF00121.18', 'PF01990.17', 'PF00344.20', 'PF00297.22', 'PF01196.19', 'PF01194.17', 'PF01725.16', 'PF00750.19',
	'PF00338.22', 'PF00238.19', 'PF01200.18', 'PF00162.19', 'PF00181.23', 'PF01866.17', 'PF00709.21', 'PF02006.16', 'PF00164.25',
	'PF00237.19', 'PF01139.17', 'PF01351.18', 'PF04010.13', 'PF06093.13', 'PF00828.19', 'PF02410.15', 'PF01176.19', 'PF02130.17',
	'PF01948.18', 'PF01195.19', 'PF01746.21', 'PF01667.17', 'PF03874.16', 'PF01090.19', 'PF01198.19', 'PF01250.17', 'PF17136.4',
	'PF06026.14', 'PF03652.15', 'PF04019.12', 'PF01201.22', 'PF00832.20', 'PF01264.21', 'PF03840.14', 'PF00831.23', 'PF00189.20',
	'PF02601.15', 'PF01496.19', 'PF00411.19', 'PF00334.19', 'PF00687.21', 'PF01157.18', 'PF01245.20', 'PF01994.16', 'PF01632.19',
	'PF00827.17', 'PF01015.18', 'PF00829.21', 'PF00410.19', 'PF00833.18', 'PF00935.19', 'PF01992.16'])

	return list_of_poss_accs

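#Illustrative sketch (not part of the original FastAAI source): the accession
#index maps each of the 122 universal Pfam accessions above to a stable integer
#ID, in list order, so generate_accessions_index()['PF01780.19'] == 0. The
#stable IDs are what the SQLite tables store in place of accession strings.
def _demo_accession_lookup():
	acc_index = generate_accessions_index()
	return len(acc_index), acc_index['PF01780.19']
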
#Master function for building or adding to a DB with genomes.
def add_inputs(output_path, parent_path, existing_index, threads, verbose, prep_args):

	genomes, proteins, hmms, gf, pf, hf, db_name = prep_args[0], prep_args[1], prep_args[2], prep_args[3], prep_args[4], prep_args[5], prep_args[6]

	print("")
	print("FastAAI is formatting your files to be saved to your database.")

	#Let's push this to the inputs section.
	inputs = advance_inputs(genomes = genomes, proteins = proteins, hmms = hmms, genomes_file = gf, proteins_file = pf, hmms_file = hf, output = output_path, threads = threads, verbose = verbose, db_name = db_name)

	if inputs is None:
		return False

	kmer_index = None

	#global genome_index
	genome_index = {}
	next_index = 0

	#Build upon the genome indexing of an existing DB.
	if existing_index is not None:
		genome_index = existing_index
		#Zero indexing makes this the next number to add.
		next_index = len(existing_index)

	final_db = fastaai_database(parent_path)
	final_db.activate_connection()
	final_db.initialize_parent_database()

	#This goes to the genome_index table.
	protein_counts_to_add = []
	genome_acc_kmer_counts_to_add = []

	acc_index = generate_accessions_index()

	readied_kmers_by_acc = defaultdict(lambda: defaultdict(lambda: None))

	#unique_accessions = set()
	for file in inputs:

		genome = file.basename

		#Collect all of the accessions actually found. Will usually be 122 for reasonably sized datasets.
		#unique_accessions = unique_accessions.union(set(file.best_hits.values()))
		#Avoid adding duplicate genomes.
		if genome not in genome_index:
			protein_counts_to_add.append((genome, next_index, file.protein_count))
			for prot in file.protein_kmer_count:
				genome_acc_kmer_counts_to_add.append((next_index, acc_index[prot], file.protein_kmer_count[prot]))
			genome_index[genome] = next_index
			next_index += 1

			this_index = genome_index[genome]
			for acc in file.best_hits_kmers:
				readied_kmers_by_acc[acc][this_index] = file.best_hits_kmers[acc]
			#Clean up space.
			file.best_hits_kmers = None

	inputs = None

	#Default dicts can't be pickled.
	readied_kmers_by_acc = dict(readied_kmers_by_acc)

	genomes_per_acc = {}
	for acc in readied_kmers_by_acc:
		readied_kmers_by_acc[acc] = dict(readied_kmers_by_acc[acc])
		genomes_per_acc[acc] = list(readied_kmers_by_acc[acc].keys())
		final_db.add_genomes_first(acc, readied_kmers_by_acc[acc])
		readied_kmers_by_acc[acc] = None

	readied_kmers_by_acc = None

	add_genomes = "INSERT OR REPLACE INTO genome_index VALUES (?, ?, ?)"
	add_proteins = "INSERT OR REPLACE INTO genome_acc_kmer_counts VALUES (?, ?, ?)"

	final_db.cursor.executemany(add_genomes, protein_counts_to_add)
	final_db.cursor.executemany(add_proteins, genome_acc_kmer_counts_to_add)
	final_db.connection.commit()

	final_db.cursor.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")
	final_db.connection.commit()

	protein_counts_to_add = None
	genome_acc_kmer_counts_to_add = None

	unique_accessions = list(genomes_per_acc.keys())
	child_args = []
	for i in range(0, len(unique_accessions)):
		accession = unique_accessions[i]
		name = "accession_" + unique_accessions[i] + "_partition_" + str(i)
		child_path = os.path.normpath(output_path+"/temp")
		child_args.append([accession, name, child_path, parent_path, genomes_per_acc[accession], genome_index])

	print("")
	print("Formatting data to add to database at", curtime())

	#Add partition, output, parent DB data.
	if not os.path.exists(os.path.normpath(output_path+"/temp")):
		try:
			os.mkdir(os.path.normpath(output_path+"/temp"))
		except:
			print("Output directory failed to create! Cannot continue.")
			return False

	if verbose:
		print("")
		count = 0
		total_counts = len(child_args)
		try:
			log_time = curtime()
			percentage = (count/total_counts)*100
			sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(count) + " of " + str(total_counts) + ' ) at ' + curtime() + "\n")
			sys.stdout.flush()
		except:
			#It's not really a big deal if the progress bar cannot be printed.
			pass

	last_pct = 0

	quiverfull = []

	pool = multiprocessing.Pool(threads)

	for result in pool.imap_unordered(produce_children, child_args):
		acc = result[0]
		child = result[1]

		quiverfull.append([acc, child])

		if verbose:
			count += 1
			try:
				percentage = (count/total_counts)*100
				log_time = curtime()
				sys.stdout.write('\033[A')
				sys.stdout.flush()
				sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(count) + " of " + str(total_counts) + ' done at '+ curtime() + " )\n")
				sys.stdout.flush()
			except:
				#It's not really a big deal if the progress bar cannot be printed.
				pass

	pool.close()
	pool.join()

	print("")
	print("Adding data to final database.")

	if verbose:
		print("")

		count = 0
		total_counts = len(child_args)
		try:
			percentage = (count/total_counts)*100

			sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(count) + " of " + str(total_counts) + ' done at '+ curtime() + " )\n")
			sys.stdout.flush()
		except:
			#It's not really a big deal if the progress bar cannot be printed.
			pass

	last_pct = 0

	for result in quiverfull:
		acc = result[0]
		child = result[1]
		final_db.add_child_to_parent(acc, child)

		if verbose:
			count += 1
			try:
				percentage = (count/total_counts)*100
				log_time = curtime()
				sys.stdout.write('\033[A')
				sys.stdout.flush()
				sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(count) + " of " + str(total_counts) + ' done at '+ curtime() + " )\n")
				sys.stdout.flush()
			except:
				#It's not really a big deal if the progress bar cannot be printed.
				pass


	print("")
	#print("Cleaning up...")
	#final_db.connection.execute("VACUUM")

	final_db.close_connection()

	os.rmdir(os.path.normpath(output_path+"/temp"))

	return True

#genome_index is global already.
def produce_children(args):
	acc = args[0]
	partition = args[1]
	output_base = args[2]
	parent_db = args[3]
	genomes_in_this_acc = args[4]
	genome_index = args[5]

	parental_database = fastaai_database(parent_db)

	sql_friendly_accession = acc.replace('.', '_')

	read_parent_sql = "SELECT * FROM " + sql_friendly_accession + "_genomes WHERE genome IN ({genomes})".format(genomes=','.join(['?']*len(genomes_in_this_acc)))

	parental_database.activate_connection()

	genomes_for_this_acc = dict(parental_database.cursor.execute(read_parent_sql, genomes_in_this_acc).fetchall())

	parental_database.close_connection()

	child_db = os.path.normpath(output_base + "/" + partition + ".db")

	this_child = child_database(child_db, parent_db)

	this_child.activate_child_connection()
	#this_child.initialize_child_database()
	this_child.activate_parent_connection()

	#Keys are genomes as indices, values are numpy arrays of kmers. This makes tuples.
	#this_child.add_genomes_first(acc, zip(genomes_for_this_acc.keys(), genomes_for_this_acc.values()))

	#Here's where we add the genomes as such to the children, too.
	readied_kmers = defaultdict(lambda: [])
	for genome in genomes_for_this_acc:
		for kmer in genomes_for_this_acc[genome]:
			readied_kmers[kmer].append(genome)
		#Clean up space.
		genomes_for_this_acc[genome] = None

	del genomes_for_this_acc

	readied_kmers = dict(readied_kmers)
	for kmer in readied_kmers:
		readied_kmers[kmer] = np.array(readied_kmers[kmer], dtype = np.int32)

	sql_friendly_accession = this_child.add_accession(acc, readied_kmers)

	this_child.close_parent_connection()
	this_child.close_child_connection()

	del readied_kmers

	return [sql_friendly_accession, child_db]

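#Illustrative sketch (not part of the original FastAAI source): produce_children
#inverts a genome -> kmers mapping into kmer -> genomes posting lists, which is
#what makes the later intersection counting a single indexed SQL lookup per
#kmer. A toy version of that inversion, using hypothetical integer IDs:
def _demo_invert_kmer_map():
	genome_to_kmers = {0: [5, 9], 1: [9, 12]}
	kmer_to_genomes = defaultdict(list)
	for genome, kmers in genome_to_kmers.items():
		for kmer in kmers:
			kmer_to_genomes[kmer].append(genome)
	#e.g. kmer 9 maps to genomes [0, 1].
	return {k: np.array(v, dtype = np.int32) for k, v in kmer_to_genomes.items()}
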
#Build or add to a FastAAI DB.
def build_db_opts():
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''
	This FastAAI module allows you to create a FastAAI database from one or many genomes, proteins, or proteins and HMMs, or add these files to an existing one.

	Supply genomes OR proteins OR proteins AND HMMs as inputs.

	If you supply genomes, FastAAI will predict proteins from them, and HMMs will be created from those proteins
	If you supply only proteins, FastAAI will create HMM files from them, searching against FastAAI's internal database
	If you supply proteins AND HMMs, FastAAI will directly use them to build the database.\n
	You cannot supply both genomes and proteins
	''')

	parser.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'A directory containing genomes in FASTA format.')
	parser.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'A directory containing protein amino acids in FASTA format.')
	parser.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'A directory containing the results of an HMM search on a set of proteins.')
	parser.add_argument('-d', '--database', dest = 'db_name', default = "FastAAI_database.sqlite.db", help = 'The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory. FastAAI_database.sqlite.db by default.')

	parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

	parser.add_argument('--genome_file', dest = 'gf', default = None, help = 'Alternative way to supply genomes. A file containing paths to your genome files, 1 per line.')
	parser.add_argument('--protein_file', dest = 'pf', default = None, help = 'Alternative way to supply proteins. A file containing paths to your protein files, 1 per line.')
	parser.add_argument('--hmm_file', dest = 'hf', default = None, help = 'Alternative way to supply HMMs. A file containing paths to your HMM files, 1 per line.')

	parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

	args, unknown = parser.parse_known_args()

	return parser, args

def build_db(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose):

	start = check_out_input_files(genomes, proteins, hmms, gf, pf, hf)

	#If something failed, we stop.
	if start is None:
		return False

	good_to_go = prepare_directories(output, start, "build")

	if not good_to_go:
		return False

	#Check if the db name contains path info, incl. the Windows separator.
	if "/" not in db_name and "\\" not in db_name:
		final_database = os.path.normpath(output + "/database/" + db_name)
	else:
		#If the person insists that the db has a path, let them.
		final_database = db_name

	#We'll skip trying this if the file doesn't already exist.
	existing_genome_IDs = None
	try:
		if os.path.exists(final_database):
			parent = fastaai_database(final_database)
			parent.activate_connection()

			existing_genome_IDs = {}
			sql_command = "SELECT genome, gen_id FROM genome_index"
			for result in parent.cursor.execute(sql_command).fetchall():
				genome = result[0]
				id = int(result[1])
				existing_genome_IDs[genome] = id

			parent.close_connection()
	except:
		print("You specified an existing file to be a database, but it does not appear to be a FastAAI database.")
		print("FastAAI will not be able to continue. Please give FastAAI a different database name and continue.")
		print("Exiting.")
		return False


	prep_args = [genomes, proteins, hmms, gf, pf, hf, db_name]

	#inputs, output_path, parent_path, existing_index, threads
	success = add_inputs(output, final_database, existing_genome_IDs, threads, verbose, prep_args)

	if success:
		print("Database build complete!")

	return success


#DB query functionality - unlimited version.
def do_query_vs_target_aai_only(query_name, target_name, threads, output, precision, verbose):
	if not os.path.exists(os.path.normpath(output+"/temp")):
		os.mkdir(os.path.normpath(output+"/temp"))

	if precision == "low":
		jacc_precision = np.float16
	if precision == "med":
		jacc_precision = np.float32
	if precision == "high":
		jacc_precision = np.float64

	#Save the file paths.
	query = fastaai_database(query_name)
	target = fastaai_database(target_name)

	query.activate_connection()
	query.just_accessions()
	query_len = query.cursor.execute("SELECT Count(*) FROM genome_index").fetchall()[0][0]
	#query.close_connection()
	target.activate_connection()
	target.just_accessions()
	target_len = target.cursor.execute("SELECT Count(*) FROM genome_index").fetchall()[0][0]
	#target.close_connection()

	print("FastAAI will search", query_len, "query genomes against", target_len, "target genomes.")

	print("")
	print("FastAAI is preparing your AAI search... ", end = '', flush = True)

	accessions_in_common = list(set(query.accessions).intersection(target.accessions))

	query.accessions = None
	target.accessions = None

	query.close_connection()
	target.close_connection()

	load_args = [(query, target, acc) for acc in accessions_in_common]

	loads = []
	ordered_accs = []

	pool = multiprocessing.Pool(threads)

	for result in pool.imap(load_getter, load_args):
		load = result[0]
		acc = result[1]
		#Load will be None if the accession is in both query and target, but they still don't share even a single kmer. Unlikely, but it happened once, so it WILL happen again.
		if load is not None:
			loads.append(load)
			ordered_accs.append(acc)

	pool.close()
	pool.join()

	loads = np.array(loads)
	ordered_accs = np.array(ordered_accs)

	order = loads.argsort()[::-1]

	loads = loads[order]
	ordered_accs = ordered_accs[order]

	load_balancer = {}
	accs_per_load = {}
	for i in range(0, threads):
		load_balancer[i] = 0
		accs_per_load[i] = []

	for i in range(0, loads.shape[0]):
		index = list(load_balancer.values()).index(min(list(load_balancer.values())))
		#print(index, load)
		load_balancer[index] += loads[i]
		accs_per_load[index].append(int(ordered_accs[i]))

	del loads
	del ordered_accs

	print("done!")
	if verbose:
		print("FastAAI has balanced the workload of calculating AAI from your data.")
		for index in accs_per_load:
			print("Thread", index, "will handle", len(accs_per_load[index]), "accessions.")
	print("FastAAI is beginning the calculation of AAI between your query and target genomes.")

	del load_balancer

	input_queue = multiprocessing.Queue()
	output_queue = multiprocessing.Queue()

	for thread in accs_per_load:
		input_queue.put(accs_per_load[thread])

	for i in range(0, threads):
		input_queue.put('STOP')

	for i in range(0, threads):
		multiprocessing.Process(target=accession_worker, args=(input_queue, output_queue, query, target, query_len, target_len, jacc_precision)).start()

	print("")

	results = np.zeros(shape = (query_len, target_len), dtype = jacc_precision)

	#Counter to keep the threads running until the whole process is done.
	donezo = threads
	while donezo > 0:
		row = output_queue.get()
		try:
			results[row[0]] += row[1]
		except:
			donezo -= 1

	print("AAI calculations complete. Formatting results for writing.")

	#global glob_prec
	#glob_prec = jacc_precision

	rdb_name = os.path.normpath(output+"/temp/aai_calc_db.db")
	rdb = calculation_database(rdb_name, precision)
	rdb.activate_connection()
	rdb.initialize_database()

	#Get the data ready for passing to children...

	results = np.split(results, query_len, axis = 0)

	insertable = []
	#Iterate over results and turn them into tuples.
	for i in range(0, query_len):
		insertable.append((i, results[i].tobytes()))
		results[i] = None

	rdb.cursor.executemany("INSERT INTO jaccards VALUES (?, ?)", (insertable))
	rdb.connection.commit()

	rdb.close_connection()

	del insertable
	del results

	#Now we split the query genomes into chunks and have threads process each chunk in parallel with its respective shared prot counts.
	query_chunks = split_indicies(query_len, threads)
	query_args = [([rdb_name], query_chunks[i], output, query, target, precision) for i in range(0, threads)]

	print("Results formatted. Writing results starting at", curtime())

	pool = multiprocessing.Pool(threads)

	pool.map(finish_jaccards, query_args)

	pool.close()
	pool.join()

	os.remove(rdb_name)

	print("FastAAI complete! Results at:", os.path.normpath(output+"/results/"))

	return None

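#Illustrative sketch (not part of the original FastAAI source): the scheduling
#above is a greedy longest-processing-time heuristic. Sorting loads descending
#and always assigning to the least-loaded worker keeps per-thread totals close.
#The _demo_ name and the toy loads are hypothetical:
def _demo_greedy_balance(loads = [9, 7, 5, 3, 2], threads = 2):
	totals = [0] * threads
	assignments = [[] for _ in range(threads)]
	for load in sorted(loads, reverse = True):
		worker = totals.index(min(totals))
		totals[worker] += load
		assignments[worker].append(load)
	#Loads of [9, 7, 5, 3, 2] over 2 threads end up as [9, 3, 2] (total 14) and [7, 5] (total 12).
	return assignments
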
#Assess the number of comparisons that will have to be made to complete an accession so that balanced loads can be passed to threads.
def load_getter(args):
	query, target, accession = args[0], args[1], args[2]
	query.activate_connection()
	target.activate_connection()

	original_index = generate_accessions_index()
	accession_inverter = {}
	for acc in original_index:
		sql_friendly_accession = acc.replace(".", "_")
		accession_inverter[original_index[acc]] = sql_friendly_accession

	sql_friendly_accession = accession_inverter[accession].replace('.', '_')
	sql = "SELECT kmer FROM "+ sql_friendly_accession
	query.cursor.row_factory = lambda cursor, row: row[0]
	#query_kmers = set(query.cursor.execute(sql).fetchall()).intersection()
	target.cursor.row_factory = lambda cursor, row: row[0]
	#target_kmers = target.cursor.execute(sql).fetchall()

	shared_kmers = list(set(query.cursor.execute(sql).fetchall()).intersection(target.cursor.execute(sql).fetchall()))
	query.cursor.row_factory = None
	target.cursor.row_factory = None

	bytes_sql = "SELECT sum(length(genomes)) FROM " + sql_friendly_accession + " WHERE kmer IN ({kmers})".format(kmers=','.join(['?']*len(shared_kmers)))

	if len(shared_kmers) > 0:
		tgt_res = target.cursor.execute(bytes_sql, shared_kmers).fetchone()[0]
		query_res = query.cursor.execute(bytes_sql, shared_kmers).fetchone()[0]
		#This if *should* always happen, if it gets checked.
		if tgt_res is not None and query_res is not None:
			load = int(tgt_res/(4096) * query_res/(4096))
		else:
			load = None
	else:
		load = None

	query.close_connection()
	target.close_connection()

	return [load, accession]

def accession_worker(in_queue, out_queue, query, target, qlen, tlen, prec):
	original_index = generate_accessions_index()
	accession_inverter = {}
	for acc in original_index:
		sql_friendly_accession = acc.replace(".", "_")
		accession_inverter[original_index[acc]] = sql_friendly_accession

	query.activate_connection()
	target.activate_connection()
	query.load_genome_index()
	target.load_genome_index()

	for my_accessions in iter(in_queue.get, 'STOP'):

		#print(my_accessions)

		target.load_accessions(permitted_accessions = my_accessions)
		query.load_accessions(permitted_accessions = my_accessions)

		query_data = {}
		target_data = {}

		for acc in my_accessions:

			sql_friendly_accession = accession_inverter[acc].replace('.', '_')

			query_data[acc] = dict(query.cursor.execute("SELECT * FROM "+sql_friendly_accession+"_genomes").fetchall())

			query.cursor.row_factory = lambda cursor, row: row[0]
			selected_kmers = list(query.cursor.execute("SELECT kmer FROM "+sql_friendly_accession).fetchall())
			query.cursor.row_factory = None

			target_sql = "SELECT * FROM " + sql_friendly_accession + " WHERE kmer in ({kmers})".format(kmers=','.join(['?']*len(selected_kmers)))
			target_data[acc] = dict(target.cursor.execute(target_sql, selected_kmers).fetchall())

		target_kmer_cts_by_acc = {}
		for acc in my_accessions:
			target_kmer_cts_by_acc[acc] = np.zeros(tlen, dtype = np.int16)

		for genome in target.gak:
			for acc in target.gak[genome]:
				target_kmer_cts_by_acc[acc][genome] = target.gak[genome][acc]

		#No longer needed.
		target.gak = None
		#We want each thread to report every single genome.
		for genome in query.gak:
			#count += 1
			#print("Thread", my_thread, "genome", count, "of", total)
			these_jaccards = np.zeros(tlen, dtype = np.float64)
			for acc in query.gak[genome]:
				these_intersections = np.zeros(tlen, dtype = np.int16)
				query_kmers = query_data[acc][genome]
				query_kmer_ct = query_kmers.shape
				for kmer in query_kmers:
					if kmer in target_data[acc]:
						these_intersections[target_data[acc][kmer]] += 1

				these_jaccards += np.divide(these_intersections, np.subtract(np.add(query_kmer_ct, target_kmer_cts_by_acc[acc]), these_intersections))

			out_queue.put([genome, these_jaccards])

	target.close_connection()
	query.close_connection()
	out_queue.put("Based")

	return None

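#Illustrative sketch (not part of the original FastAAI source): the worker
#above computes, per accession, Jaccard(A, B) = |A & B| / (|A| + |B| - |A & B|)
#from kmer sets, using only intersection counts plus precomputed set sizes.
#A toy check with hypothetical kmer IDs:
def _demo_jaccard_from_counts():
	query_kmers = np.array([1, 2, 3, 4], dtype = np.int32)
	target_kmers = np.array([3, 4, 5], dtype = np.int32)
	intersection = np.intersect1d(query_kmers, target_kmers).shape[0]
	#2 shared kmers / (4 + 3 - 2) = 0.4
	return intersection / (query_kmers.shape[0] + target_kmers.shape[0] - intersection)
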
def finish_jaccards(args):
	partial_dbs, my_query_genomes, output, query, target, prec = args[0], args[1], args[2], args[3], args[4], args[5]
	#Load protein counts.
	#For each genome, query each partial and sum matching genomes, then divide by shared counts.

	query.activate_connection()
	target.activate_connection()
	query.load_genome_index()
	target.load_genome_index()

	selected_query_genomes = range(my_query_genomes[0], my_query_genomes[1])

	offset = my_query_genomes[0]

	target_len = len(target.genome_index)
	query_len = my_query_genomes[1] - my_query_genomes[0]

	#Get shared protein counts.
	query.load_accessions(permitted_genomes = selected_query_genomes)

	max_acc = 122

	query_set = np.zeros(shape = (query_len, max_acc), dtype = np.int16)

	for g in query.gak:
		query_set[(g-offset), list(query.gak[g])] += 1

	target_set = np.zeros(shape = (max_acc, len(target.genome_index)), dtype = np.int16)

	target.load_accessions()

	target_protein_counts = np.zeros(target_len, dtype = np.int16)
	for t in target.gak:
		target_set[list(target.gak[t]), t] += 1
		target_protein_counts[t] = len(target.gak[t])

	#This will be used to divide the jaccards and such. If disk, then disk, tho...
	shared_prot_counts_by_genome = np.dot(query_set, target_set)

	del query_set
	del target_set

	target.gak = None

	query.close_connection()
	target.close_connection()

	activated_DBs = []
	idx = 0
	for db in partial_dbs:
		activated_DBs.append(calculation_database(db, prec))
		activated_DBs[idx].activate_connection()
		idx += 1


	for genome in selected_query_genomes:
		sql = "SELECT jaccards FROM jaccards WHERE genome="+str(genome)
		total_jaccs = np.zeros(target_len, dtype = np.float64)
		shared_acc_counts = shared_prot_counts_by_genome[genome - offset]
		for db in activated_DBs:
			result = db.cursor.execute(sql).fetchone()[0]
			total_jaccs += result

		total_jaccs = np.divide(total_jaccs, shared_acc_counts)

		aai_est = numpy_kaai_to_aai(total_jaccs)

		no_hit = np.where(shared_acc_counts == 0)
		#Actual hits is already stored in shared_acc_counts.
		possible_hits = np.minimum(len(query.gak[genome]), target_protein_counts).astype(str)

		total_jaccs = np.round(total_jaccs, 4).astype(str)

		shared_acc_counts = shared_acc_counts.astype(str)

		total_jaccs[no_hit] = "N/A"
		aai_est[no_hit] = "N/A"
		shared_acc_counts[no_hit] = "N/A"
		possible_hits[no_hit] = "N/A"

		name = query.reverse_genome_index[genome]

		output_file = output +"/results/"+name+"_results.txt"
		fh = open(output_file, "w")

		for tgt in range(0, target_len):
			target_name = target.reverse_genome_index[tgt]
			if target_name == name:
				fh.write(name+"\t"+target_name+"\t"+"100.0"+"\t"+"0.0"+"\t"+shared_acc_counts[tgt]+"\t"+possible_hits[tgt]+"\t"+"100.0"+"\n")
			else:
				fh.write(name+"\t"+target_name+"\t"+total_jaccs[tgt]+"\t"+"N/A"+"\t"+shared_acc_counts[tgt]+"\t"+possible_hits[tgt]+"\t"+aai_est[tgt]+"\n")

		fh.close()

	#Write partial to file, here.

	for db in activated_DBs:
		db.close_connection()

	return None


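#Illustrative sketch (not part of the original FastAAI source): the shared
#protein counts above come from one matrix product. With Q as a (queries x 122)
#0/1 presence matrix and T as (122 x targets), np.dot(Q, T)[i, j] counts
#accessions present in both query genome i and target genome j:
def _demo_shared_counts():
	Q = np.array([[1, 0, 1]], dtype = np.int16)	#1 query, 3 hypothetical accessions
	T = np.array([[1, 1], [0, 1], [1, 0]], dtype = np.int16)	#3 accessions, 2 targets
	#The query shares accessions {0, 2} with target 0 (count 2) and {0} with target 1 (count 1),
	#so the result is [[2, 1]].
	return np.dot(Q, T)
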
#Here's the DB SQL querying functionality/limited version.
def do_query_vs_target_sql(query, target, threads, output, verbose, do_stdev):
	#Save the file paths.
	query_name, target_name = query, target

	query = fastaai_database(query_name)
	query.activate_connection()
	query.load_genome_index()
	query.just_accessions()

	converter = generate_accessions_index()
	acc_sql = "SELECT name FROM sqlite_master WHERE type='table'"
	tables = [item[0] for item in query.cursor.execute(acc_sql).fetchall()]
	cleaned_tables = []
	for table in tables:
		if table.endswith("_genomes"):
			acc_name = table.split("_genomes")[0]
			acc_name = acc_name.replace("_", ".")
			index = acc_name
			cleaned_tables.append((table, index))

	del tables

	#Go through tables and load data.
	query_acc_kmers = defaultdict(dict)

	sys.stdout.write("\n")
	sys.stdout.write("Loading query data at " + curtime() + " ...\n")
	sys.stdout.flush()

	for tab_idx in cleaned_tables:
		table = tab_idx[0]
		accession = tab_idx[1]
		for result in query.cursor.execute("SELECT * FROM " + table).fetchall():
			query_acc_kmers[result[0]][accession] = result[1]

	query.close_connection()


	sys.stdout.write("\n")
	sys.stdout.write("Loading target data at " + curtime() + " ...\n")
	sys.stdout.flush()

	target = fastaai_database(target_name)
	target.activate_connection()
	target.load_genome_index()
	target.load_accessions()
	target.close_connection()

	query_args = []
	for genome in query_acc_kmers:
		query_args.append((target, query.reverse_genome_index[genome], query_acc_kmers[genome], os.path.normpath(output+"/results")))

	detected_query_accs = query.accessions
	query_length = len(query.genome_index)

	#Cleanup.
	del query
	del query_acc_kmers

	#global target_kmer_cts
	target_kmer_cts = {}

	target_len = len(target.gak)

	for accession in np.intersect1d(detected_query_accs, target.accessions):
		target_kmer_cts[accession] = np.zeros(target_len, dtype = np.int16)
		for g in target.gak:
			if accession in target.gak[g]:
				target_kmer_cts[accession][g] = target.gak[g][accession]

	#global target_protein_counts
	target_protein_counts = np.zeros(target_len, dtype = np.int16)
	for g in target.gak:
		target_protein_counts[g] = len(target.gak[g])

	target_length = len(target.gak)

	target.gak = None

	#Should just load the stuff then straightforward SQL.
	sys.stdout.write("\n")
	sys.stdout.write("FastAAI will search "+ str(query_length) + " query genomes against " + str(target_length) + " target genomes.\n")
	sys.stdout.write("\n")

	count = 0
	total = len(query_args)

	sys.stdout.write("Beginning AAI calculation at " + curtime())

	if verbose:
		print("")
		#Progress bar - possible dangerous use of the return to line start sequence.
		try:
			percentage = 0
			sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Query genome ' + str(count) + " of " + str(total) + ' done at '+curtime()+')\n')
			sys.stdout.flush()
			last_pct = 0
		except:
			#It's not really a big deal if the progress bar cannot be printed.
			pass

	pool = multiprocessing.Pool(threads, initializer = sql_query_thread_starter, initargs = (target_kmer_cts, target_protein_counts,))

	#Process as we go.
	if do_stdev:
		for file in pool.imap(do_sql_query, query_args):
			if verbose:
				#Progress bar - possible dangerous use of the return to line start sequence.
				try:
					count += 1
					percentage = (count/total)*100
					if int(percentage/2) > last_pct or count == total:
						sys.stdout.write('\033[A')
						sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Query genome ' + str(count) + " of " + str(total) + ' done at '+curtime()+')\n')
						sys.stdout.flush()
						last_pct = int(percentage/2)
				except:
					#It's not really a big deal if the progress bar cannot be printed.
					pass

		pool.close()
		pool.join()
	else:

		for file in pool.imap(do_sql_query_no_SD, query_args):

			if verbose:
				#Progress bar - possible dangerous use of the return to line start sequence.
				try:
					count += 1
					percentage = (count/total)*100
					if int(percentage/2) > last_pct or count == total:
						sys.stdout.write('\033[A')
						sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Query genome ' + str(count) + " of " + str(total) + ' done at '+curtime()+')\n')
						sys.stdout.flush()
						last_pct = int(percentage/2)
				except:
					#It's not really a big deal if the progress bar cannot be printed.
					pass

		pool.close()
		pool.join()

	print("AAI calculation complete! Results at:", os.path.normpath(output+"/results"))

	return None

#This can also take the genomes-first formatted prots in the DB and search them memory-efficiently, if not time-efficiently.
def do_sql_query(args):
	kmer_index = create_kmer_index()
	accession_index = generate_accessions_index()
	#database, file.basename, file.best_hits_kmers, os.path.normpath(output+"/temp")
	database, name, acc_kmers, temp_out = args[0],args[1],args[2],args[3]

	database.activate_connection()

	res_ct = 0
	target_len = len(database.genome_index)

	results = np.zeros(shape = (len(acc_kmers), target_len), dtype = np.float64)
	row = 0

	shared_acc_counts = np.zeros(target_len, dtype = np.int16)

	for accession in acc_kmers:
		acc_index = accession_index[accession]
		sql_friendly_accession = accession.replace(".", "_")
		if acc_index in database.accessions:
			#The accession was found for this target genome, for each tgt genome.
			shared_acc_counts[np.nonzero(target_kmer_cts[acc_index])] += 1
			these_kmers = [int(kmer) for kmer in acc_kmers[accession]]
			these_intersections = np.zeros(target_len, dtype = np.int16)
			sql_query = "SELECT genomes FROM " + sql_friendly_accession + " WHERE kmer in ({kmers})".format(kmers=','.join(['?']*len(these_kmers)))
			for result in database.cursor.execute(sql_query, these_kmers):
				these_intersections[result] += 1

			results[row] = np.divide(these_intersections, np.subtract(np.add(acc_kmers[accession].shape[0], target_kmer_cts[acc_index]), these_intersections))

		row += 1

	database.close_connection()

	#These are the jaccard averages.
	jaccard_averages = np.divide(np.sum(results, axis = 0), shared_acc_counts)

	#Get the differences from the mean per hit.
	results = results - jaccard_averages
	#Square them.
	results = np.square(results)
	#Sum squares and divide by shared acc. count, then sqrt to get the SD.
	jaccard_SDs = np.sqrt(np.divide(np.sum(results, axis = 0), shared_acc_counts))

	aai_est = numpy_kaai_to_aai(jaccard_averages)

	no_hit = np.where(shared_acc_counts == 0)
	#Actual hits is already stored in shared_acc_counts.
	possible_hits = np.minimum(len(acc_kmers), target_protein_counts).astype(str)


	jaccard_averages = np.round(jaccard_averages, 4).astype(str)
	jaccard_SDs = np.round(jaccard_SDs, 4).astype(str)

	shared_acc_counts = shared_acc_counts.astype(str)

	jaccard_averages[no_hit] = "N/A"
	aai_est[no_hit] = "N/A"
	jaccard_SDs[no_hit] = "N/A"
	shared_acc_counts[no_hit] = "N/A"
	possible_hits[no_hit] = "N/A"

	output_file = temp_out +"/"+name+"_results.txt"
	fh = open(output_file, "w")

	for target in range(0, target_len):
		target_name = database.reverse_genome_index[target]
		if target_name == name:
			fh.write(name+"\t"+target_name+"\t"+"100.0"+"\t"+"0.0"+"\t"+shared_acc_counts[target]+"\t"+possible_hits[target]+"\t"+"100.0"+"\n")
		else:
			fh.write(name+"\t"+target_name+"\t"+jaccard_averages[target]+"\t"+jaccard_SDs[target]+"\t"+shared_acc_counts[target]+"\t"+possible_hits[target]+"\t"+aai_est[target]+"\n")

	fh.close()

	return output_file

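#Illustrative sketch (not part of the original FastAAI source): each row of a
#per-accession table is a kmer with a posting list of genome IDs (stored as
#int32 bytes and deserialized by convert_array at the top of this script).
#Incrementing at those indices tallies the intersection size per target genome
#in one pass. A toy version with hypothetical posting lists:
def _demo_posting_list_tally():
	target_len = 4
	posting_lists = [np.array([0, 2], dtype = np.int32), np.array([2, 3], dtype = np.int32)]
	tallies = np.zeros(target_len, dtype = np.int16)
	for genomes_with_kmer in posting_lists:
		tallies[genomes_with_kmer] += 1
	#tallies is [1, 0, 2, 1]: genome 2 shares both kmers with the query.
	return tallies
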
#This can also take the genomes-first formatted prots in the DB and search them memory-efficiently, if not time-efficiently.
def do_sql_query_no_SD(args):
	kmer_index = create_kmer_index()
	accession_index = generate_accessions_index()
	#database, file.basename, file.best_hits_kmers, os.path.normpath(output+"/temp")
	database, name, acc_kmers, temp_out = args[0],args[1],args[2],args[3]

	database.activate_connection()

	res_ct = 0
	target_len = len(database.genome_index)

	results = np.zeros(shape = target_len, dtype = np.float64)
	#row = 0

	shared_acc_counts = np.zeros(target_len, dtype = np.int16)

	for accession in acc_kmers:
		acc_index = accession_index[accession]
		sql_friendly_accession = accession.replace(".", "_")
		if acc_index in database.accessions:
			#The accession was found for this target genome, for each tgt genome.
			shared_acc_counts[np.nonzero(target_kmer_cts[acc_index])] += 1
			these_kmers = [int(kmer) for kmer in acc_kmers[accession]]
			these_intersections = np.zeros(target_len, dtype = np.int16)
			sql_query = "SELECT genomes FROM " + sql_friendly_accession + " WHERE kmer in ({kmers})".format(kmers=','.join(['?']*len(these_kmers)))
			for result in database.cursor.execute(sql_query, these_kmers):
				these_intersections[result] += 1

			results += np.divide(these_intersections, np.subtract(np.add(acc_kmers[accession].shape[0], target_kmer_cts[acc_index]), these_intersections))

	database.close_connection()

	#These are the jaccard averages.
	jaccard_averages = np.divide(results, shared_acc_counts)
	del results

	aai_est = numpy_kaai_to_aai(jaccard_averages)

	no_hit = np.where(shared_acc_counts == 0)

	possible_hits = np.minimum(len(acc_kmers), target_protein_counts).astype(str)

	jaccard_averages = np.round(jaccard_averages, 4).astype(str)

	shared_acc_counts = shared_acc_counts.astype(str)

	jaccard_averages[no_hit] = "N/A"
	aai_est[no_hit] = "N/A"
	shared_acc_counts[no_hit] = "N/A"
	possible_hits[no_hit] = "N/A"

	output_file = temp_out +"/"+name+"_results.txt"
	fh = open(output_file, "w")

	for target in range(0, target_len):
		target_name = database.reverse_genome_index[target]
		if target_name == name:
			fh.write(name+"\t"+target_name+"\t"+"100.0"+"\t"+"0.0"+"\t"+shared_acc_counts[target]+"\t"+possible_hits[target]+"\t"+"100.0"+"\n")
		else:
			fh.write(name+"\t"+target_name+"\t"+jaccard_averages[target]+"\t"+"N/A"+"\t"+shared_acc_counts[target]+"\t"+possible_hits[target]+"\t"+aai_est[target]+"\n")

	fh.close()

	return output_file

def numpy_kaai_to_aai(kaai_array):
	#aai_hat = (-0.3087057 + 1.810741 * (np.exp(-(-0.2607023 * np.log(kaai))**(1/3.435))))*100

	#Protect the original jaccard averages memory item.
	aai_hat_array = kaai_array.copy()

	non_zero = np.where(aai_hat_array > 0)
	is_zero = np.where(aai_hat_array <= 0)

	#I broke this down into its original components.
	#Avoid zeroes in log - still actually works, but it produces warnings I don't want to see.
	aai_hat_array[non_zero] = np.log(aai_hat_array[non_zero])

	aai_hat_array = np.multiply(np.subtract(np.multiply(np.exp(np.negative(np.power(np.multiply(aai_hat_array, -0.2607023), (1/3.435)))), 1.810741), 0.3087057), 100)
	'''
	Same as the above, broken down into easier-to-follow steps.
	aai_hat_array = np.multiply(aai_hat_array, -0.2607023)
	aai_hat_array = np.power(aai_hat_array, (1/3.435))
	aai_hat_array = np.negative(aai_hat_array)
	aai_hat_array = np.exp(aai_hat_array)
	aai_hat_array = np.multiply(aai_hat_array, 1.810741)
	aai_hat_array = np.subtract(aai_hat_array, 0.3087057)
	aai_hat_array = np.multiply(aai_hat_array, 100)
	'''

	#<30 and >90 values.
	smol = np.where(aai_hat_array < 30)
	big = np.where(aai_hat_array > 90)

	aai_hat_array = np.round(aai_hat_array, 2)

	#Convert to final printables.
	aai_hat_array = aai_hat_array.astype(str)
	aai_hat_array[smol] = "<30%"
	aai_hat_array[big] = ">90%"
	#The math of the above ends up with zero values being big, so we fix those.
	aai_hat_array[is_zero] = "<30%"

	return aai_hat_array

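#Illustrative sketch (not part of the original FastAAI source): the transform
#above applied to a single jaccard value. For kaai = 0.5, ln(0.5) is about
#-0.693, so -0.2607023 * ln(0.5) is about 0.1807, 0.1807**(1/3.435) is about
#0.608, and (-0.3087057 + 1.810741 * exp(-0.608)) * 100 is roughly 67.7%
#estimated AAI.
def _demo_scalar_kaai_to_aai(kaai = 0.5):
	return (-0.3087057 + 1.810741 * np.exp(-(-0.2607023 * np.log(kaai))**(1/3.435))) * 100
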
def curtime():
	time_format = "%d/%m/%Y %H:%M:%S"
	timer = datetime.datetime.now()
	time = timer.strftime(time_format)
	return time

#Manages the query process.
def db_query_opts():
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''
	This FastAAI module takes two FastAAI databases and searches all of the genomes in the QUERY against all of the genomes in the TARGET.

	If you have many genomes (more than 1000), it will be faster to create the query database using FastAAI build_db,
	then search it against an existing target using this module than it is to do the same thing with an SQL query.

	If you give the same database as query and target, a special all vs. all search of the genomes in the database will be done.
	''')
	parser.add_argument('-q', '--query', dest = 'query', default = None, help = 'Path to the query database. The genomes FROM the query will be searched against the genomes in the target database.')
	parser.add_argument('-t', '--target', dest = 'target', default = None, help = 'Path to the target database.')

	parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory where FastAAI will place the result of this query. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

	parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

	parser.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indices. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')
	parser.add_argument('--unlimited_resources', dest = "large_mem", action = 'store_true', help = 'Off by default. Use a faster algorithm that consumes more RAM. FastAAI cannot calculate std. deviations with this algorithm, so they will automatically be skipped.')
	parser.add_argument('--mem', dest = "precision", default = "med", help = 'One of low/med/high. Medium by default. Save RAM in return for slightly rounded AAI estimates. Only affects FastAAI if you are also using the "--unlimited_resources" flag.')

	args, unknown = parser.parse_known_args()

	return parser, args

#Control the query process for any DB-first query.
def db_query(query, target, verbose, output, threads, do_stdev, precision, memory_efficient):
	print("")

	#Sanity checks.
	if not os.path.exists(target):
		print("Target database not found. Exiting FastAAI")
		sys.exit()

	if not os.path.exists(query):
		print("Query database not found. Exiting FastAAI")
		sys.exit()

	#status = "exists"
	query_ok = assess_db(query)
	target_ok = assess_db(target)

	if query_ok != "exists":
		print("Query database improperly formatted. Exiting FastAAI")
		sys.exit()

	if target_ok != "exists":
		print("Target database improperly formatted. Exiting FastAAI")
		sys.exit()

	#Check if the database is querying against itself.
	if target is None or query is None:
		print("I require both a query and a target database. FastAAI exiting.")
		sys.exit()

	if query == target:
		print("Performing an all vs. all query on", query)
		#all_vs_all = True
	else:
		print("Querying", query, "against", target)
		#all_vs_all = False

	#Ready the output directories as needed.
	#The databases are already created; the only state they can be in is P+H.
	good_to_go = prepare_directories(output, "protein and HMM", "query")
	if not good_to_go:
		print("Exiting FastAAI")
		sys.exit()

	if precision not in ["high", "med", "low"]:
		print("Selected memory usage setting not found. Defaulting to med. Select one with --mem high/med/low.")
		precision = 'med'

	#Default.
	if (not memory_efficient) or do_stdev:
		do_query_vs_target_sql(query, target, threads, output, verbose, do_stdev)
	#Not default.
	else:
		do_query_vs_target_aai_only(query, target, threads, output, precision, verbose)

	print("")


#Perform a minimal-memory query of a target database from input files. Lighter-weight function for low memory.
def sql_query_opts():
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''
	This FastAAI module takes one or many genomes, proteins, or proteins and HMMs as a QUERY and searches them against an existing FastAAI database TARGET using SQL.
	If you only have a few genomes - or not enough RAM to hold the entire target database in memory - this is probably the best option for you.

	If you provide FastAAI with genomes or only proteins (not proteins and HMMs), this FastAAI module will produce the required protein and HMM files as needed
	and place them in the output directory, just like it does while building a database.

	Once these inputs are ready to be queried against the database (each has both a protein and HMM file), they will be processed independently, 1 per thread at a time.

	Note: Protein and HMM files generated during this query can be supplied to build a FastAAI database from proteins and HMMs using the build_db module, without redoing preprocessing.
	''')

	parser.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'A directory containing genomes in FASTA format.')
	parser.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'A directory containing protein amino acids in FASTA format.')
	parser.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'A directory containing the results of an HMM search on a set of proteins.')

	parser.add_argument('--target', dest = 'target', default = None, help = 'A path to the FastAAI database you wish to use as the target.')

	parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory where FastAAI will place the result of this query and any protein or HMM files it has to generate. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

	parser.add_argument('--genome_file', dest = 'gf', default = None, help = 'Alternative way to supply genomes. A file containing paths to your genome files, 1 per line.')
	parser.add_argument('--protein_file', dest = 'pf', default = None, help = 'Alternative way to supply proteins. A file containing paths to your protein files, 1 per line.')
	parser.add_argument('--hmm_file', dest = 'hf', default = None, help = 'Alternative way to supply HMMs. A file containing paths to your HMM files, 1 per line.')

	parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

	parser.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indices. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')

	args, unknown = parser.parse_known_args()

	return parser, args

def sql_query_thread_starter(kmer_cts, protein_cts):
	global target_kmer_cts
	global target_protein_counts
	target_kmer_cts = kmer_cts
	target_protein_counts = protein_cts


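#Illustrative note (not part of the original FastAAI source):
#sql_query_thread_starter is used as a multiprocessing.Pool initializer. It
#runs once inside each worker process, so the large, read-only target arrays
#are installed as globals there instead of being pickled into every imap task,
#as in the pool construction used below:
#	pool = multiprocessing.Pool(threads,
#		initializer = sql_query_thread_starter,
#		initargs = (target_kmer_cts, target_protein_counts,))
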
def sql_query(genomes, proteins, hmms, gf, pf, hf, db_name, output, threads, verbose, do_stdev):

	if not os.path.exists(db_name):
		print("")
		print("FastAAI can't find your database:", db_name)
		print("Are you sure that the path you've given to the database is correct and that the database exists?")
		print("FastAAI exiting.")
		print("")
		sys.exit()

	start = check_out_input_files(genomes, proteins, hmms, gf, pf, hf)

	#If something failed, we stop.
	if start is None:
		sys.exit()

	good_to_go = prepare_directories(output, start, "query")

	if not good_to_go:
		print("Exiting FastAAI")
		sys.exit()

	#global kmer_index
	#kmer_index = create_kmer_index()

	print("")
	print("Preparing inputs for querying...")

	prepared_files = advance_inputs(genomes = genomes, proteins = proteins, hmms = hmms, genomes_file = gf, proteins_file = pf, hmms_file = hf, output = output, threads = threads, verbose = verbose, db_name = db_name)

	if prepared_files is None:
		print("Exiting FastAAI")
		return None

	query_accessions_detected = set()
	for file in prepared_files:
		query_accessions_detected = query_accessions_detected.union(file.best_hits.values())

	#We don't want to get more than we have to.
	query_accessions_detected = list(query_accessions_detected)

	if verbose:
		print("")
		print("Gathering database information...")

	database = fastaai_database(db_name)
	database.activate_connection()
	database.load_genome_index()
	database.load_accessions()
	database.close_connection()

	#formatted_dataset = [(database, file.basename, file.best_hits_kmers, os.path.normpath(output+"/results")) for file in prepared_files]

	#global accession_index
	accession_index = generate_accessions_index()

	#Translate to indices.
	query_accessions_detected = [accession_index[a] for a in query_accessions_detected]

	#global target_kmer_cts
	target_kmer_cts = {}

	for accession in np.intersect1d(database.accessions, query_accessions_detected):
		target_kmer_cts[accession] = np.zeros(len(database.genome_index), dtype = np.int16)
		for g in database.gak:
			if accession in database.gak[g]:
				target_kmer_cts[accession][g] = database.gak[g][accession]

	#global target_protein_counts
	target_protein_counts = np.zeros(len(database.gak), dtype = np.int16)
	for g in database.gak:
		target_protein_counts[g] = len(database.gak[g])

	database.gak = None

	formatted_dataset = [(database, file.basename, file.best_hits_kmers, os.path.normpath(output+"/results")) for file in prepared_files]

	if verbose:
		print("")
		print("-"*100)
		print("")

	count = 0
	total = len(formatted_dataset)

	print("Beginning AAI calculation")

	#globals to pass... target_kmer_cts target_protein_counts
	#Just remake these in the procs. kmer_index accession_index

	if verbose:
		print("")
		#progress bar - possible dangerous use of the return to line start sequence.
		try:
			percentage = 0
			sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Query genome ' + str(count) + " of " + str(total) + ' done at '+curtime()+' )\n')
			sys.stdout.flush()
			last_pct = 0
		except:
			#It's not really a big deal if the progress bar cannot be printed.
			pass

	#If parallelized, do parallel

	pool = multiprocessing.Pool(threads, initializer = sql_query_thread_starter, initargs = (target_kmer_cts, target_protein_counts,))

	#Process as we go.
	if do_stdev:
		for file in pool.imap(do_sql_query, formatted_dataset):
			'''
			handle = open(file, "r")

			for line in handle:
				final_result.write(line)

			handle.close()
			os.remove(file)
			'''
			if verbose:
				#progress bar - possible dangerous use of the return to line start sequence.
				try:
					count += 1
					percentage = (count/total)*100
					if int(percentage/2) > last_pct or count == total:
						sys.stdout.write('\033[A')
						sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Query genome ' + str(count) + " of " + str(total) + ' done at '+curtime()+' )\n')
						sys.stdout.flush()
						last_pct = int(percentage/2)
				except:
					#It's not really a big deal if the progress bar cannot be printed.
					pass

		pool.close()
		pool.join()
	else:
		for file in pool.imap(do_sql_query_no_SD, formatted_dataset):
			'''
			handle = open(file, "r")

			for line in handle:
				final_result.write(line)

			handle.close()
			os.remove(file)
			'''
			if verbose:
				#progress bar - possible dangerous use of the return to line start sequence.
				try:
					count += 1
					percentage = (count/total)*100
					if int(percentage/2) > last_pct or count == total:
						sys.stdout.write('\033[A')
						sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Query genome ' + str(count) + " of " + str(total) + ' done at '+curtime()+' )\n')
						sys.stdout.flush()
						last_pct = int(percentage/2)
				except:
					#It's not really a big deal if the progress bar cannot be printed.
					pass

		pool.close()
		pool.join()

	if verbose:
		print("")
		print("-"*100)
		print("")

	if os.path.exists(output+"/temp"):
		os.rmdir(output+"/temp")

	print("FastAAI query complete! Results at:", os.path.normpath(output + "/results"))
	return None

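#The staging step in sql_query flattens the database's per-genome dictionaries into
#dense numpy arrays so workers can do vectorized lookups. A toy illustration of the
#same translation (illustrative only; real gak tables come from the database):
#
#	gak = {0: {7: 120}, 1: {7: 95}}        #genome index -> {accession index: kmer count}
#	cts = np.zeros(2, dtype = np.int16)    #one slot per genome
#	for g in gak:
#		if 7 in gak[g]:
#			cts[g] = gak[g][7]             #cts is now array([120, 95], dtype=int16)
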
#Check to see if the file exists and is a valid fastAAI db
def assess_db(path):
	status = None
	if os.path.exists(path):
		db = fastaai_database(path)
		try:
			db.activate_connection()
			sql = "SELECT name FROM sqlite_master WHERE type='table'"

			db.cursor.row_factory = lambda cursor, row: row[0]
			tables = db.cursor.execute(sql).fetchall()
			db.cursor.row_factory = None

			db.close_connection()

			if len(tables) > 2 and "genome_index" in tables and "genome_acc_kmer_counts" in tables:
				status = "exists"
			else:
				status = "wrong format"

		except:
			status = "wrong format"

	else:
		try:
			db = fastaai_database(path)
			db.activate_connection()
			db.initialize_parent_database()
			db.close_connection()
			status = "created"
		except:
			status = "unable to create"

	return status

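#Usage sketch for assess_db (illustrative; the path is hypothetical):
#
#	status = assess_db("databases/db1.db")
#	#Possible values: "exists" (a valid FastAAI db), "wrong format" (the file is not
#	#a FastAAI db), "created" (the path was empty and a new empty db was initialized
#	#there), or "unable to create" (the path could not be written).
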
#Add one FastAAI DB to another FastAAI DB
def merge_db_opts():
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''
	This FastAAI module allows you to add the contents of one or more FastAAI databases to another.
	You must have at least two already-created FastAAI databases using the build_db module before this module can be used.

	Supply a comma-separated list of at least one donor database and a single recipient database.
	If the recipient already exists, then genomes in all the donors will be added to the recipient.
	If the recipient does not already exist, a new database will be created, and the contents of all the donors will be added to it.

	Example:
	FastAAI.py merge_db --donors databases/db1.db,databases/db2.db --recipient databases/db3.db --threads 3
	This command will create a new database called "db3.db", merge the data in db1.db and db2.db, and then add the merged data into db3.db.

	Only the recipient database will be modified; the donors will be left exactly as they were before running this module.
	''')

	parser.add_argument('-d', '--donors', dest = 'donors', default = None, help = 'Comma-separated string of paths to one or more donor databases. The genomes FROM the donors will be added TO the recipient and the donors will be unaltered.')

	parser.add_argument('--donor_file', dest = 'donor_file', default = None, help = 'Alternative way to supply donors. A file containing paths to the donor databases, 1 per line.')

	parser.add_argument('-r', '--recipient', dest = 'recipient', default = None, help = 'Path to the recipient database. Any genomes FROM the donor databases not already in the recipient will be added to this database.')

	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

	parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')

	args, unknown = parser.parse_known_args()

	return parser, args

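#Example invocation of the merge_db module (illustrative; database paths are hypothetical):
#
#	FastAAI merge_db --donors dbs/a.db,dbs/b.db --recipient dbs/combined.db --threads 4
#
#Only the recipient is modified; the donors are read and left as they were.
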
def merge_db_thread_starter(rev_index, per_db_accs):
	global reverse_genome_indicies
	global accs_per_db
	reverse_genome_indicies = rev_index
	accs_per_db = per_db_accs


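#merge_db below unifies the genome numbering of several databases. Each database
#numbers its genomes 0..N-1 independently, so a per-database translation array is
#built: position i holds genome i's index in the merged numbering. A toy sketch
#(illustrative only, not taken from a real database):
#
#	#Recipient already holds genomes A,B -> merged indices 0,1.
#	#A donor holds genomes B,C -> local indices 0,1.
#	#The donor's translation array becomes np.array([1, 2], dtype = np.int32):
#	#its B (local 0) maps to merged 1; its C (local 1) gets the next free index, 2.
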
def merge_db(recipient, donors, donor_file, verbose, threads):
	#Prettier on the CLI

	if donor_file is not None:
		fh = agnostic_reader(donor_file)
		donors = [line.strip() for line in fh]
		fh.close()

	if donors is None or recipient is None:
		print("Either donors or recipient not given. FastAAI is exiting.")
		return None

	print("")

	if donor_file is None:
		donors = donors.split(",")

	valid_donors = []
	for d in donors:
		if os.path.exists(d):
			if d == recipient:
				print("Donor database", d, "is the same as the recipient. This database will be skipped.")
			else:
				check = assess_db(d)
				if check == "exists":
					if d not in valid_donors:
						valid_donors.append(d)
					else:
						print("It appears that database", d, "was already added to the list of donors. Did you type it twice in the list of donors? Skipping it.")
				else:
					if check == "created":
						print("Donor database", d, "not found! Skipping.")
					else:
						print("Something was wrong with supplied database:", d+". A status check found:", check)
		else:
			print("Donor database", d, "not found! Are you sure the path is correct and this donor exists? This database will be skipped.")

	if len(valid_donors) == 0:
		print("None of the supplied donor databases were able to be accessed. FastAAI cannot continue if none of these databases are valid. Exiting.")
		sys.exit()

	recip_check = assess_db(recipient)

	if recip_check == "created" or recip_check == "exists":
		for donor in valid_donors:
			print("Donor database:", donor, "will be added to recipient database:", recipient)

		recipient = fastaai_database(recipient)
	else:
		print("I couldn't find or create the recipient database at", recipient+".", "Does the folder you're trying to place this database in exist, and do you have permission to write files to it? FastAAI exiting.")
		sys.exit()

	if recipient is None or len(valid_donors) == 0:
		print("I require both a valid donor and a recipient database. FastAAI exiting.")
		sys.exit()

	donor_dbs = []
	for d in valid_donors:
		donor_dbs.append(fastaai_database(d))

	all_accessions = set()
	#global joint_genome_index
	joint_genome_index = {}
	joint_genome_counts = {}
	max_index = 0
	#The idea here is to create a set of arrays whose values span the range of each donor's genomes and translate those into an overall list, in order.

	#global reverse_genome_indicies
	reverse_genome_indices = {}

	#global accs_per_db
	accs_per_db = {}

	#Load recipient data, if any.
	if recip_check == "exists":
		recipient.activate_connection()
		recipient.just_accessions()
		recipient.load_genome_index()
		recipient.close_connection()

		all_accessions = all_accessions.union(recipient.accessions)
		accs_per_db[recipient.path] = recipient.accessions
		recipient.accessions = None
		max_index = len(recipient.genome_index)

		joint_genome_index = dict(zip(recipient.genome_index.keys(), recipient.genome_index.values()))
		joint_genome_counts = dict(zip(recipient.protein_counts_by_genome.keys(), recipient.protein_counts_by_genome.values()))

		#reverse_genome_index = dict(zip(joint_genome_index.values(),joint_genome_index.keys()))
		#The keys are the genome indices of the recipient. These shouldn't need any updates. Only the donors need to match.
		path = recipient.path
		reverse_genome_indices[path] = []
		for idx in sorted(recipient.genome_index.values()):
			reverse_genome_indices[path].append(idx)
		reverse_genome_indices[path] = np.array(reverse_genome_indices[path], dtype = np.int32)
		recipient.genome_index = None

	#Donors should always exist, never be created.
	for d in donor_dbs:
		d.activate_connection()
		d.just_accessions()
		d.load_genome_index()
		d.close_connection()
		accs_per_db[d.path] = d.accessions
		all_accessions = all_accessions.union(d.accessions)
		d.accessions = None
		reverse_genome_indices[d.path] = []
		#Database construction indicates this should always be 0-COUNT
		for g in sorted(d.genome_index.keys()):
			if g not in joint_genome_index:
				reverse_genome_indices[d.path].append(max_index)
				joint_genome_index[g] = max_index
				#Map the counts on.
				joint_genome_counts[max_index] = d.protein_counts_by_genome[d.genome_index[g]]
				#reverse_genome_index[max_index] = g
				max_index += 1
			else:
				reverse_genome_indices[d.path].append(joint_genome_index[g])
		#Make it an array, now
		reverse_genome_indices[d.path] = np.array(reverse_genome_indices[d.path], dtype = np.int32)
		d.genome_index = None

	#global accession_index
	accession_index = generate_accessions_index()

	#global accession_inverter
	accession_inverter = {}
	for acc in accession_index:
		sql_friendly_accession = acc.replace(".", "_")
		accession_inverter[accession_index[acc]] = sql_friendly_accession

	all_accessions = list(all_accessions)

	print("")
	print("Formatting data to add to database. Started at", curtime())

	temp_dir = tempfile.mkdtemp()
	try:
		acc_args = [(acc, donor_dbs, recipient, temp_dir) for acc in all_accessions]

		if verbose:
			print("")
			count = 0
			total_counts = len(acc_args)
			try:
				percentage = (count/total_counts)*100
				sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(count) + " of " + str(total_counts) + ' done at ' + curtime() + " )\n")
				sys.stdout.flush()
			except:
				#It's not really a big deal if the progress bar cannot be printed.
				pass

		last_pct = 0

		pool = multiprocessing.Pool(threads, initializer=merge_db_thread_starter, initargs = (reverse_genome_indices, accs_per_db,))

		quiverfull = []
		for result in pool.imap_unordered(pull_and_merge_accession, acc_args):
			acc = result[0]
			child = result[1]
			#sub_gak = result[2]

			quiverfull.append([acc, child])
			#gaks.extend(sub_gak)

			if verbose:
				count += 1
				try:
					percentage = (count/total_counts)*100
					sys.stdout.write('\033[A')
					sys.stdout.flush()
					sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(count) + " of " + str(total_counts) + ' done at ' + curtime() + " )\n")
					sys.stdout.flush()
				except:
					#It's not really a big deal if the progress bar cannot be printed.
					pass

		pool.close()
		pool.join()

		print("")
		print("Adding data to final database. Started at", curtime())

		if verbose:
			print("")

			count = 0
			total_counts = len(acc_args)
			try:
				percentage = (count/total_counts)*100
				sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(count) + " of " + str(total_counts) + ' done at ' + curtime() + " )\n")
				sys.stdout.flush()
			except:
				#It's not really a big deal if the progress bar cannot be printed.
				pass

		last_pct = 0

		recipient.activate_connection()
		genome_list_update_sql = "INSERT OR REPLACE INTO genome_index VALUES (?, ?, ?)"
		genome_reindex = []
		for g in joint_genome_index:
			genome_reindex.append((g, joint_genome_index[g], joint_genome_counts[joint_genome_index[g]]))

		recipient.cursor.executemany(genome_list_update_sql, genome_reindex)
		recipient.connection.commit()

		del genome_reindex

		for result in quiverfull:
			acc = result[0]
			child = result[1]

			recipient.add_child_to_parent(acc, child, genomes_too = True, update_gak = True)

			if verbose:
				count += 1
				try:
					percentage = (count/total_counts)*100
					sys.stdout.write('\033[A')
					sys.stdout.flush()
					sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(count) + " of " + str(total_counts) + ' done at ' + curtime() + " )\n")
					sys.stdout.flush()
				except:
					#It's not really a big deal if the progress bar cannot be printed.
					pass
	except:
		#Error: make sure the temp directory doesn't linger.
		if os.path.exists(temp_dir):
			shutil.rmtree(temp_dir)
	finally:
		#Clean up the temp directory whether the merge succeeded or not.
		if os.path.exists(temp_dir):
			shutil.rmtree(temp_dir)

	print("\nDatabases merged!")

	return None

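#pull_and_merge_accession below unions the per-kmer genome lists ("posting lists")
#for one accession across all source databases, after translating each database's
#local genome numbers into the merged numbering. A toy sketch (illustrative only):
#
#	donor_list = np.array([0, 1], dtype = np.int32)     #local genome indices for one kmer
#	translation = np.array([5, 9], dtype = np.int32)    #local index -> merged index
#	merged = translation[donor_list]                    #array([5, 9], dtype=int32)
#	combined = np.union1d(merged, np.array([5, 7]))     #array([5, 7, 9]), sorted and deduplicated
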
def pull_and_merge_accession(args):
	accession_index = generate_accessions_index()

	#global accession_inverter
	accession_inverter = {}
	for acc in accession_index:
		sql_friendly_accession = acc.replace(".", "_")
		accession_inverter[accession_index[acc]] = sql_friendly_accession

	#joint_genome_index, accession_index, accession_inverter, accs_per_db are global already.
	acc, donor_dbs, recipient, temp = args[0], args[1], args[2], args[3]

	acc_name = accession_inverter[acc]
	acc_name_gens = acc_name + "_genomes"

	query_sql = "SELECT * FROM " + acc_name

	temp_db = fastaai_database(os.path.normpath(temp+"/"+acc_name+".db"))
	temp_db.activate_connection()

	create_command = "CREATE TABLE IF NOT EXISTS " + acc_name + " (kmer INTEGER PRIMARY KEY, genomes array)"
	temp_db.cursor.execute(create_command)
	temp_db.connection.commit()

	create_command = "CREATE TABLE IF NOT EXISTS " + acc_name + "_genomes (genome INTEGER PRIMARY KEY, kmers array)"
	temp_db.cursor.execute(create_command)
	temp_db.connection.commit()

	query_lists = {}
	for db in donor_dbs:
		if acc in accs_per_db[db.path]:
			db.activate_connection()

			for result in db.cursor.execute(query_sql).fetchall():
				kmer = result[0]
				genomes = result[1]
				translated_genomes = reverse_genome_indicies[db.path][genomes]

				if kmer in query_lists:
					query_lists[kmer] = np.union1d(query_lists[kmer], translated_genomes)
				else:
					query_lists[kmer] = translated_genomes

			db.close_connection()

	#Recipient is not guaranteed to be in the accs per db - if it was created anew, it wouldn't be.
	if recipient.path in accs_per_db:
		if acc in accs_per_db[recipient.path]:
			recipient.activate_connection()

			for result in recipient.cursor.execute(query_sql).fetchall():
				kmer = result[0]
				genomes = result[1]
				translated_genomes = reverse_genome_indicies[recipient.path][genomes]
				if kmer in query_lists:
					query_lists[kmer] = np.union1d(query_lists[kmer], translated_genomes)
				else:
					query_lists[kmer] = translated_genomes

			recipient.close_connection()

	#Byte-string these.
	for kmer in query_lists:
		query_lists[kmer] = query_lists[kmer].tobytes()

	temp_db.cursor.executemany("INSERT INTO " + acc_name + " VALUES (?,?)", zip(query_lists.keys(), query_lists.values()))
	temp_db.connection.commit()

	del query_lists

	#Reset. Do genomes
	query_genomes_sql = "SELECT * FROM " + acc_name_gens
	query_lists = {}
	for db in donor_dbs:
		if acc in accs_per_db[db.path]:
			db.activate_connection()

			for result in db.cursor.execute(query_genomes_sql).fetchall():
				genome = result[0]
				kmers = result[1]
				translated_genome = int(reverse_genome_indicies[db.path][genome])
				#Each genome gets added only once, no dupes.
				if translated_genome not in query_lists:
					query_lists[translated_genome] = kmers

			db.close_connection()

	if recipient.path in accs_per_db:
		if acc in accs_per_db[recipient.path]:
			recipient.activate_connection()

			for result in recipient.cursor.execute(query_genomes_sql).fetchall():
				genome = result[0]
				kmers = result[1]
				translated_genome = int(reverse_genome_indicies[recipient.path][genome])
				#Each genome gets added only once, no dupes.
				if translated_genome not in query_lists:
					query_lists[translated_genome] = kmers

			recipient.close_connection()

	#Byte-string these.
	#gak = []
	for g in query_lists:
		#gak.append((g, acc, query_lists[g].shape[0]))
		query_lists[g] = query_lists[g].tobytes()

	temp_db.cursor.executemany("INSERT INTO " + acc_name_gens + " VALUES (?,?)", zip(query_lists.keys(), query_lists.values()))
	temp_db.connection.commit()

	temp_db.close_connection()

	return [acc_name, temp_db.path]

#Query 1 genome vs. 1 target using Carlos' method - just needs query, target, threads
def single_query_opts():
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''
	This FastAAI module takes a single query genome, protein, or protein and HMM pair and a single target genome, protein, or protein and HMM pair as inputs and calculates AAI between the two.

	If you supply a genome as either query or target, a protein and HMM file will be made for the genome.
	If you supply a protein as either query or target, an HMM file will be made for it.
	If you supply both an HMM and protein, the search will start right away. You cannot provide only an HMM.

	No database will be built, and you cannot query multiple genomes with this module.

	If you wish to query multiple genomes against themselves in an all vs. all AAI search, use aai_index instead.
	If you wish to query multiple genomes against multiple targets, use multi_query instead.
	''')
	parser.add_argument('-qg', '--query_genome', dest = 'query_genome', default = None, help = 'Query genome')
	parser.add_argument('-tg', '--target_genome', dest = 'target_genome', default = None, help = 'Target genome')

	parser.add_argument('-qp', '--query_protein', dest = 'query_protein', default = None, help = 'Query protein')
	parser.add_argument('-tp', '--target_protein', dest = 'target_protein', default = None, help = 'Target protein')

	parser.add_argument('-qh', '--query_hmm', dest = 'query_hmm', default = None, help = 'Query HMM')
	parser.add_argument('-th', '--target_hmm', dest = 'target_hmm', default = None, help = 'Target HMM')

	parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory where FastAAI will place the result of this query. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

	parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

	#Alternative file input

	args, unknown = parser.parse_known_args()

	return parser, args

def do_single_query(input_file):
	input_file.preprocess()
	return input_file

def intersect_kmer_lists(pair):
	intersection = np.intersect1d(pair[0], pair[1]).shape[0]
	union = pair[0].shape[0] + pair[1].shape[0] - intersection
	return (intersection/union)

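#intersect_kmer_lists computes the Jaccard index |A ∩ B| / |A ∪ B| of two sorted,
#deduplicated kmer arrays. A quick worked example (illustrative only):
#
#	a = np.array([1, 2, 3, 4])
#	b = np.array([3, 4, 5, 6])
#	#intersection = 2, union = 4 + 4 - 2 = 6, so intersect_kmer_lists((a, b)) ≈ 0.333
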
def kaai_to_aai(kaai):
	# Transform the kAAI into estimated AAI values
	aai_hat = (-0.3087057 + 1.810741 * (np.exp(-(-0.2607023 * np.log(kaai))**(1/3.435))))*100

	return aai_hat

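#In plain math, the fitted transform above is
#
#	AAI ≈ (-0.3087057 + 1.810741 * exp(-(-0.2607023 * ln(J))^(1/3.435))) * 100
#
#where J is the mean Jaccard index over shared accessions. For example, J = 0.5
#yields an estimated AAI of roughly 67.7% (value computed from the formula above,
#not taken from FastAAI's documentation).
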
#This one's unique. It doesn't do anything with the DB, which means it doesn't access any other functionality outside of the input_file class. It just advances a pair of inputs in parallel and does intersections.
def single_query(query_args, target_args, shared_args):

	output, threads, verbose = shared_args[0], shared_args[1], shared_args[2]

	genomes, proteins, hmms = query_args[0], query_args[1], query_args[2]

	if genomes is None and proteins is None and hmms is None:
		print("Please supply a query genome, protein, or protein and HMM pair.")
		sys.exit()

	query = None

	if genomes is not None:
		query = input_file(genomes, output, verbose)
		query.set_genome(genomes)
	if proteins is not None:
		if query is not None:
			print("If you supply a genome for either query or target, you must supply ONLY the genome, not a genome and either a protein or HMM.")
			sys.exit()
		else:
			query = input_file(proteins, output, verbose)
			query.set_protein(proteins)
	if hmms is not None:
		if query is None:
			print("If you supply an HMM for either query or target, you must also supply the protein from which the HMM was generated.")
			sys.exit()
		else:
			query.set_hmm(hmms)

	genomes, proteins, hmms = target_args[0], target_args[1], target_args[2]

	if genomes is None and proteins is None and hmms is None:
		print("Please supply a target genome, protein, or protein and HMM pair.")
		sys.exit()

	target = None

	if genomes is not None:
		target = input_file(genomes, output, verbose)
		target.set_genome(genomes)
	if proteins is not None:
		if target is not None:
			print("If you supply a genome for either query or target, you must supply ONLY the genome, not a genome and either a protein or HMM.")
			sys.exit()
		else:
			target = input_file(proteins, output, verbose)
			target.set_protein(proteins)
	if hmms is not None:
		if target is None:
			print("If you supply an HMM for either query or target, you must also supply the protein from which the HMM was generated.")
			sys.exit()
		else:
			target.set_hmm(hmms)

	if query.basename == target.basename:
		print("You've selected the same query and target genome. The AAI is 100%.")
		print("FastAAI exiting.")
		return None

	statuses = ["genome", "protein", "protein and hmm"]
	query_stat = statuses.index(query.status)
	target_stat = statuses.index(target.status)
	minimum_status = statuses[min(query_stat, target_stat)]

	start_printouts = ["[Genome] Protein Protein+HMM", " Genome [Protein] Protein+HMM", "Genome Protein [Protein+HMM]"]

	print("")
	print("Query start: ", start_printouts[query_stat])
	print("Target start:", start_printouts[target_stat])
	print("")

	good_to_go = prepare_directories(output, minimum_status, "build")

	if not good_to_go:
		print("Exiting FastAAI")
		sys.exit()

	qname = query.basename
	tname = target.basename

	name = qname + "_vs_" + tname + ".aai.txt"
	print("Output will be located at", os.path.normpath(output) + "/results/" + name)

	#Give the data for kmer indexing to the parallel processes
	global kmer_index
	kmer_index = create_kmer_index()

	advance_me = [query, target]
	#All we need to do this.
	pool = multiprocessing.Pool(min(threads, 2))

	results = pool.map(do_single_query, advance_me)

	pool.close()
	pool.join()

	query = results[0]
	target = results[1]

	#One of the printouts
	max_poss_prots = max(len(query.best_hits_kmers), len(target.best_hits_kmers))

	accs_to_view = set(query.best_hits_kmers.keys()).intersection(set(target.best_hits_kmers.keys()))

	seq_pairs = [[query.best_hits_kmers[acc], target.best_hits_kmers[acc]] for acc in accs_to_view]

	pool = multiprocessing.Pool(min(threads, len(accs_to_view)))

	results = np.array(pool.map(intersect_kmer_lists, seq_pairs))

	pool.close()
	pool.join()

	jacc_mean = np.mean(results)
	jacc_std = np.std(results)
	actual_prots = len(results)
	aai_est = round(kaai_to_aai(jacc_mean), 2)

	if aai_est > 90:
		aai_est = "> 90%"
	else:
		if aai_est < 30:
			aai_est = "< 30%"

	#Write the result where the printout above says it will be.
	handle = open(os.path.normpath(output + "/results/" + name), "w")

	print(qname, tname, round(jacc_mean, 4), round(jacc_std, 4), actual_prots, aai_est, file = handle)

	handle.close()

	print("FastAAI single query done! Estimated AAI:", aai_est)

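#Example invocation of the single_query module (illustrative; file names are hypothetical):
#
#	FastAAI single_query -qg query_genome.fna -tg target_genome.fna -o FastAAI_out --threads 2
#
#Supplying proteins (-qp/-tp), or proteins plus HMM search results (-qh/-th),
#skips the corresponding preprocessing steps.
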
def aai_index_opts():
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''
	This FastAAI module takes a set of genomes, proteins, or proteins and HMMs, creates a FastAAI database from them, and then executes an all vs. all AAI search of the genomes in the database.
	''')

	parser.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'A directory containing genomes in FASTA format.')
	parser.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'A directory containing protein amino acids in FASTA format.')
	parser.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'A directory containing the results of an HMM search on a set of proteins.')

	parser.add_argument('-d', '--database', dest = 'db_name', default = "FastAAI_database.sqlite.db", help = 'The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory.')

	parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

	parser.add_argument('--genome_file', dest = 'gf', default = None, help = 'Alternative way to supply genomes. A file containing paths to your genome files, 1 per line.')
	parser.add_argument('--protein_file', dest = 'pf', default = None, help = 'Alternative way to supply proteins. A file containing paths to your protein files, 1 per line.')
	parser.add_argument('--hmm_file', dest = 'hf', default = None, help = 'Alternative way to supply HMMs. A file containing paths to your HMM files, 1 per line.')

	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
	parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')

	parser.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indices. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')
	parser.add_argument('--unlimited_resources', dest = "large_mem", action = 'store_true', help = 'Off by default. Use a faster algorithm that consumes more RAM. FastAAI cannot calculate std. deviations with this algorithm, so they will automatically be skipped.')
	parser.add_argument('--mem', dest = "precision", default = "med", help = 'One of low/med/high. Medium by default. Save RAM in return for slightly rounded AAI estimates. Only affects FastAAI if you are also using the "--unlimited_resources" flag.')

	args, unknown = parser.parse_known_args()

	return parser, args

#Build a DB and query a dataset vs. self
def aai_index(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose, do_stdev, memory_use, unlimited_resources):
	#run build DB and then db_query with the fresh DB
	success = build_db(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose)
	if success:
		accessible_name = os.path.normpath(output + "/database/" + db_name)
		db_query(accessible_name, accessible_name, verbose, output, threads, do_stdev, memory_use, unlimited_resources)
	else:
		print("Database could not be built. FastAAI exiting.")

	return None

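#Example invocation of the aai_index module (illustrative; the directory and database
#name are hypothetical):
#
#	FastAAI aai_index -g genomes/ -d my_genomes.sqlite.db -o FastAAI_out --threads 8
#
#This builds FastAAI_out/database/my_genomes.sqlite.db and then queries it against itself.
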
#Build 2 DBs and query query DB vs target DB
def multi_query_opts():
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''
	This FastAAI module takes a set of query genomes/proteins/proteins+HMMs and a set of target genomes/proteins/proteins+HMMs.
	Two FastAAI databases will be created, one for the query and one for the target, then the query database will have AAI calculated against the target database.
	''')

	parser.add_argument('-qg', '--query_genomes', dest = 'query_genomes', default = None, help = 'A directory containing query genomes in FASTA format.')
	parser.add_argument('-qp', '--query_proteins', dest = 'query_proteins', default = None, help = 'A directory containing query protein amino acids in FASTA format.')
	parser.add_argument('-qm', '--query_hmms', dest = 'query_hmms', default = None, help = 'A directory containing the results of an HMM search on the set of query proteins.')

	parser.add_argument('-tg', '--target_genomes', dest = 'target_genomes', default = None, help = 'A directory containing target genomes in FASTA format.')
	parser.add_argument('-tp', '--target_proteins', dest = 'target_proteins', default = None, help = 'A directory containing target protein amino acids in FASTA format.')
	parser.add_argument('-tm', '--target_hmms', dest = 'target_hmms', default = None, help = 'A directory containing the results of an HMM search on the set of target proteins.')

	parser.add_argument('-qd', '--query_database', dest = 'query_db_name', default = "FastAAI_query_database.sqlite.db", help = 'The name of the query database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory.')
	parser.add_argument('-td', '--target_database', dest = 'target_db_name', default = "FastAAI_target_database.sqlite.db", help = 'The name of the target database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory.')

	parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

	parser.add_argument('--query_genome_file', dest = 'qgf', default = None, help = 'Alternative way to supply genomes. A file containing paths to your query genome files, 1 per line.')
	parser.add_argument('--query_protein_file', dest = 'qpf', default = None, help = 'Alternative way to supply proteins. A file containing paths to your query protein files, 1 per line.')
	parser.add_argument('--query_hmm_file', dest = 'qhf', default = None, help = 'Alternative way to supply HMMs. A file containing paths to your query HMM files, 1 per line.')

	parser.add_argument('--target_genome_file', dest = 'tgf', default = None, help = 'Alternative way to supply genomes. A file containing paths to your target genome files, 1 per line.')
	parser.add_argument('--target_protein_file', dest = 'tpf', default = None, help = 'Alternative way to supply proteins. A file containing paths to your target protein files, 1 per line.')
	parser.add_argument('--target_hmm_file', dest = 'thf', default = None, help = 'Alternative way to supply HMMs. A file containing paths to your target HMM files, 1 per line.')

	parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

	parser.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indices. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')
	parser.add_argument('--unlimited_resources', dest = "large_mem", action = 'store_true', help = 'Off by default. Use a faster algorithm that consumes more RAM. FastAAI cannot calculate std. deviations with this algorithm, so they will automatically be skipped.')
	parser.add_argument('--mem', dest = "precision", default = "med", help = 'One of low/med/high. Medium by default. Save RAM in return for slightly rounded AAI estimates. Only affects FastAAI if you are also using the "--unlimited_resources" flag.')

	args, unknown = parser.parse_known_args()

	return parser, args

#Build 2 DBs and query query DB vs target DB
def multi_query(query_arg_list, target_arg_list, shared_args):
	output, threads, verbose, do_stdev, mem, efficient = shared_args[0], shared_args[1], shared_args[2], shared_args[3], shared_args[4], shared_args[5]

	genomes, proteins, hmms, gf, pf, hf, db_name = query_arg_list[0], query_arg_list[1], query_arg_list[2], query_arg_list[3], query_arg_list[4], query_arg_list[5], query_arg_list[6]
	accessible_name_query = os.path.normpath(output + "/database/" + db_name)
	build_db(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose)

	genomes, proteins, hmms, gf, pf, hf, db_name = target_arg_list[0], target_arg_list[1], target_arg_list[2], target_arg_list[3], target_arg_list[4], target_arg_list[5], target_arg_list[6]
	accessible_name_target = os.path.normpath(output + "/database/" + db_name)
	build_db(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose)

	db_query(accessible_name_query, accessible_name_target, verbose, output, threads, do_stdev, mem, efficient)

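#Example invocation of the multi_query module (illustrative; directories are hypothetical):
#
#	FastAAI multi_query -qg query_genomes/ -tg target_genomes/ -o FastAAI_out --threads 8
#
#Both databases are written under FastAAI_out/database/ before the query runs.
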
'''
Main
'''
def print_module_menu():
	#Shared menu text; shown both on a bare 'FastAAI' call and on an unrecognized module name.
	print("------------------------------------------- Quick Usage Options -------------------------------------------")
	print("")
	print("   single_query   |" + "  Quickly query ONE query genome against ONE target genome")
	print("   multi_query    |" + "  Create a query DB and a target DB, then calculate query vs. target AAI")
	print("   aai_index      |" + "  Create a database from multiple genomes and do an all vs. all AAI index of the genomes")
	print("")
	print("-------------------------------------- Database Construction Options --------------------------------------")
	print("")
	print("   build_db       |" + "  Create or add to a FastAAI database from genomes, proteins, or proteins and HMMs")
	print("   merge_db       |" + "  Add the contents of one FastAAI DB to another")
	print("")
	print("---------------------------------------------- Query Options ----------------------------------------------")
	print("")
	print("   simple_query   |" + "  Query a genome or protein (one or many) against an existing FastAAI database")
	print("   db_query       |" + "  Query the genomes in one FastAAI database against the genomes in another FastAAI database")
	print("")
	print("-----------------------------------------------------------------------------------------------------------")
	print("")
	print("   To select a module, enter 'FastAAI [module]' into the command line!")
	print("")

def main():
	#The currently supported modules.
	modules = ["build_db", "merge_db", "simple_query", "db_query", "single_query", "aai_index", "multi_query"]

	#Print modules if someone just types FastAAI
	if len(sys.argv) < 2:
		print("")
		print("   Welcome to FastAAI")
		print("")
		print("")
		print("   Please select one of the following modules:")
		print("")
		print_module_menu()
		sys.exit()

	#This is the module selection
	selection = sys.argv[1]

	if selection not in modules:
		print("")
		print("   I couldn't find the module you specified. Please select one of the following modules:")
		print("")
		print_module_menu()
		sys.exit()

	#################### Database build or add ########################

	if selection == "build_db":
		parser, opts = build_db_opts()

		#module name only
		if len(sys.argv) < 3:
			parser.print_help()
			sys.exit()

		#Directory based
		genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms

		#Input list based
		gf, pf, hf = opts.gf, opts.pf, opts.hf

		output = os.path.normpath(opts.output)

		threads = opts.threads
		verbose = opts.verbose

		#Database handle
		db_name = opts.db_name

		#genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose
		build_db(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose)

	#################### Add two DBs ########################

	if selection == "merge_db":
		parser, opts = merge_db_opts()
		if len(sys.argv) < 3:
			parser.print_help()
			sys.exit()

		recipient = opts.recipient
		donors = opts.donors
		donor_file = opts.donor_file
		verbose = opts.verbose
		threads = opts.threads

		merge_db(recipient, donors, donor_file, verbose, threads)

	#################### Query files vs DB ########################

	if selection == "simple_query":
		parser, opts = sql_query_opts()

		if len(sys.argv) < 3:
			parser.print_help()
			sys.exit()

		#directory based
		genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms

		#Input list based
		gf, pf, hf = opts.gf, opts.pf, opts.hf

		db_name = opts.target

		output = opts.output
		threads = opts.threads
		verbose = opts.verbose

		do_stdev = opts.do_stdev

		sql_query(genomes, proteins, hmms, gf, pf, hf, db_name, output, threads, verbose, do_stdev)

	#################### Query DB vs DB ###########################
	if selection == "db_query":
		parser, opts = db_query_opts()
		#module name only

		if len(sys.argv) < 3:
			parser.print_help()
			sys.exit()

		query = opts.query
		target = opts.target
		verbose = opts.verbose

		do_stdev = opts.do_stdev
		#massive = opts.massive

		mem = opts.precision
		efficient = opts.large_mem

		output = opts.output
		threads = opts.threads

		db_query(query, target, verbose, output, threads, do_stdev, mem, efficient)

	#################### One-pass functions #######################
	if selection == "single_query":
		parser, opts = single_query_opts()
		#module name only

		if len(sys.argv) < 3:
			parser.print_help()
			sys.exit()

		shared_opts = []
		output = os.path.normpath(opts.output)
		threads = opts.threads
		verbose = opts.verbose

		shared_opts.append(output)
		shared_opts.append(threads)
		shared_opts.append(verbose)

		query_opts = []

		query_genome = opts.query_genome
		query_protein = opts.query_protein
		query_hmm = opts.query_hmm

		query_opts.append(query_genome)
		query_opts.append(query_protein)
		query_opts.append(query_hmm)

		target_opts = []

		target_genome = opts.target_genome
		target_protein = opts.target_protein
		target_hmm = opts.target_hmm

		#tg = opts.target_genome_file
		#tp = opts.target_protein_file
		#th = opts.target_hmm_file

		target_opts.append(target_genome)
		target_opts.append(target_protein)
		target_opts.append(target_hmm)

		single_query(query_opts, target_opts, shared_opts)

	if selection == "aai_index":
		parser, opts = aai_index_opts()
		#module name only

		if len(sys.argv) < 3:
			parser.print_help()
			sys.exit()

		genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms
		#Text file versions of genomes/proteins/hmms
		gf, pf, hf = opts.gf, opts.pf, opts.hf

		db_name = opts.db_name

		output = opts.output
		threads = opts.threads
		verbose = opts.verbose

		do_stdev = opts.do_stdev
		#massive = opts.massive

		mem = opts.precision
		efficient = opts.large_mem

		aai_index(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose, do_stdev, mem, efficient)

	if selection == "multi_query":
		parser, opts = multi_query_opts()
		#module name only

		if len(sys.argv) < 3:
			parser.print_help()
			sys.exit()

		shared_arg_list = []
		output = os.path.normpath(opts.output)
		threads = opts.threads
		verbose = opts.verbose

		do_stdev = opts.do_stdev
		#massive = opts.massive

		mem = opts.precision
		efficient = opts.large_mem

		shared_arg_list.append(output)
		shared_arg_list.append(threads)
		shared_arg_list.append(verbose)
		shared_arg_list.append(do_stdev)
		shared_arg_list.append(mem)
		shared_arg_list.append(efficient)

		query_arg_list = []
		genomes, proteins, hmms = opts.query_genomes, opts.query_proteins, opts.query_hmms
		#Text file versions of genomes/proteins/hmms
		gf, pf, hf = opts.qgf, opts.qpf, opts.qhf
		query_db_name = opts.query_db_name

		query_arg_list.append(genomes)
		query_arg_list.append(proteins)
		query_arg_list.append(hmms)
		query_arg_list.append(gf)
		query_arg_list.append(pf)
		query_arg_list.append(hf)
		query_arg_list.append(query_db_name)

		target_arg_list = []
		genomes, proteins, hmms = opts.target_genomes, opts.target_proteins, opts.target_hmms
		#Text file versions of genomes/proteins/hmms
		gf, pf, hf = opts.tgf, opts.tpf, opts.thf
		target_db_name = opts.target_db_name

		target_arg_list.append(genomes)
		target_arg_list.append(proteins)
		target_arg_list.append(hmms)
		target_arg_list.append(gf)
		target_arg_list.append(pf)
		target_arg_list.append(hf)
		target_arg_list.append(target_db_name)

		multi_query(query_arg_list, target_arg_list, shared_arg_list)

	return None


if __name__ == "__main__":
	main()