miga-base 1.0.5.2 → 1.1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3630 @@
1
+ #!/usr/bin/env python3
2
+
3
+ ################################################################################
4
+ """---0.0 Import Modules---"""
5
+ import subprocess
6
+ import argparse
7
+ import datetime
8
+ import shutil
9
+ import textwrap
10
+ import multiprocessing
11
+ import pickle
12
+ import gzip
13
+ import tempfile
14
+ #Shouldn't play any role.
15
+ #from random import randint
16
+
17
+ #We could probably remove Path, too.
18
+ from pathlib import Path
19
+ #This as well
20
+ from functools import partial
21
+ import time
22
+ from collections import defaultdict
23
+ import sys
24
+ import os
25
+ from math import floor
26
+ import sqlite3
27
+ #numpy dependency
28
+ import numpy as np
29
+ import io
30
+ import random
31
+
32
+
33
+ #Takes a bytestring from the SQL database and converts it to a numpy array.
34
def convert_array(bytestring):
    """Decode a byte blob read from the SQL database as a numpy int32 array."""
    int32_values = np.frombuffer(bytestring, dtype=np.int32)
    return int32_values
36
+
37
def convert_float_array_16(bytestring):
    """Decode a byte blob as a numpy float16 array."""
    half_precision = np.frombuffer(bytestring, dtype=np.float16)
    return half_precision
39
+
40
def convert_float_array_32(bytestring):
    """Decode a byte blob as a numpy float32 array."""
    single_precision = np.frombuffer(bytestring, dtype=np.float32)
    return single_precision
42
+
43
def convert_float_array_64(bytestring):
    """Decode a byte blob as a numpy float64 array."""
    double_precision = np.frombuffer(bytestring, dtype=np.float64)
    return double_precision
45
+
46
+
47
+ #Iterator for agnostic reader
48
class agnostic_reader_iterator:
    """Line-by-line iterator over the file handle held by a reader object.

    The reader must expose ``handle`` (an open file object) and ``is_gz``
    (truthy when the handle yields bytes that need decoding).
    """

    def __init__(self, reader):
        self.handle_ = reader.handle
        self.is_gz_ = reader.is_gz

    def __iter__(self):
        # An iterator must return itself from __iter__; without this the
        # object cannot be used directly in a for-loop or passed to list().
        return self

    def __next__(self):
        """Return the next line as str; raise StopIteration at end of file."""
        line = self.handle_.readline()
        if self.is_gz_:
            # The gzip handle is read as bytes, so decode each line to text.
            line = line.decode()

        # readline() returns an empty value only at EOF.
        if not line:
            raise StopIteration
        return line
64
+
65
+ #File reader that doesn't care if you give it a gzipped file or not.
66
class agnostic_reader:
    """File reader that accepts either a plain-text or a gzipped file.

    The first two bytes of the file are compared against the gzip magic
    number to decide how to open it. The open handle is stored on
    ``self.handle``; gzipped files are opened in gzip's default (binary)
    mode, so lines read from them are bytes and are decoded by the iterator.

    The reader can be used as a context manager so that the handle is closed
    even if an exception is raised while reading.
    """

    def __init__(self, file):
        self.path = file

        with open(file, 'rb') as test_gz:
            # Gzip magic number
            is_gz = (test_gz.read(2) == b'\x1f\x8b')

        self.is_gz = is_gz

        if is_gz:
            self.handle = gzip.open(self.path)
        else:
            self.handle = open(self.path)

    def __iter__(self):
        # Every iterator created here shares this reader's single handle.
        return agnostic_reader_iterator(self)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the with-block.
        return False

    def close(self):
        """Close the underlying file handle."""
        self.handle.close()
86
+
87
+ #FastAAI database class. This is the final database
88
class fastaai_database:
    """Wrapper around the final (parent) FastAAI SQLite database.

    Holds the connection/cursor to the database at ``path``, an optional
    connection to a child database, and in-memory copies of the metadata
    tables (``genome_index`` and ``genome_acc_kmer_counts``).
    """

    # Upper bound on bound parameters per statement. SQLite rejects statements
    # with more host parameters than its compile-time limit (999 on older
    # builds), so large IN (...) lists are issued in chunks of this size.
    _MAX_SQL_PARAMS = 900

    def __init__(self, path):
        self.path = path
        self.exists = os.path.exists(path)

        self.child = None
        self.connection = None
        self.cursor = None

        self.child_connection = None
        self.child_cursor = None

        self.accessions = None

        # gak stands for 'genome_accession_kmer_counts'
        self.gak = None
        self.genome_index = None
        # Go from index to name
        self.reverse_genome_index = None
        self.protein_counts_by_genome = None

        self.verbosity = False

    @staticmethod
    def _chunks(values, size):
        """Yield consecutive lists of at most ``size`` items from ``values``."""
        values = list(values)
        for start in range(0, len(values), size):
            yield values[start:start + size]

    def activate_connection(self, with_converter = True):
        """Open the SQL connection and cursor for ``self.path``.

        When ``with_converter`` is true, columns declared as ``array`` are
        converted with ``convert_array`` when read from the database.
        """
        if with_converter:
            sqlite3.register_converter("array", convert_array)
            self.connection = sqlite3.connect(self.path, detect_types=sqlite3.PARSE_DECLTYPES)
        else:
            self.connection = sqlite3.connect(self.path)

        self.cursor = self.connection.cursor()
        self.exists = True

    def close_connection(self):
        """Close the SQL connection; safe to call when nothing is open."""
        if self.cursor is not None:
            self.cursor.close()
        if self.connection is not None:
            self.connection.close()
        # True cleanup - even a closed SQL connection obj cannot be passed to
        # multiple processors, but a nonetype can.
        self.cursor = None
        self.connection = None

    def initialize_parent_database(self):
        """Create the two metadata tables if they are not already present."""
        # self.exists is also true when the file merely exists on disk, so the
        # open cursor is what actually tells us the connection was activated.
        if self.cursor is None:
            print("I need to be activated first!")
            return

        self.cursor.execute('''CREATE TABLE IF NOT EXISTS genome_index
            (genome text, gen_id INTEGER PRIMARY KEY, protein_count INTEGER)''')
        self.connection.commit()

        self.cursor.execute('''CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts
            (genome INTEGER, accession INTEGER, count INTEGER)''')
        self.connection.commit()

    def activate_child_connection(self, child):
        """Open a connection to the child database at ``child`` if it exists."""
        # Don't try to connect unless it exists. This should never fail.
        if os.path.exists(child):
            self.child = child
            self.child_connection = sqlite3.connect(self.child, detect_types=sqlite3.PARSE_DECLTYPES)
            self.child_cursor = self.child_connection.cursor()
        else:
            print("Child database:", child, "not found!")

    def close_child_connection(self):
        """Close the child connection, if one is open."""
        if self.child_cursor is not None:
            self.child_cursor.close()
            self.child_connection.close()
            self.child_cursor = None
            self.child_connection = None
            self.child = None

    def add_child_to_parent(self, acc, child_db, remove = True, selected_kmers = None, genomes_too = False, just_genomes = False, update_gak = False):
        """Merge the tables for accession ``acc`` from ``child_db`` into this database.

        acc            -- SQL-friendly accession name, used as the table name.
        child_db       -- path of the child SQLite file to attach and copy from.
        remove         -- delete ``child_db`` from disk once merged.
        selected_kmers -- if given, only rows of the kmer table whose kmer is
                          in this collection are copied.
        genomes_too    -- also copy the ``<acc>_genomes`` table.
        just_genomes   -- copy only the ``<acc>_genomes`` table.
        update_gak     -- when genomes are copied, also refresh
                          ``genome_acc_kmer_counts`` for this accession.
        """
        copy_kmers = not just_genomes
        copy_genomes = genomes_too or just_genomes

        if copy_kmers:
            self.cursor.execute("CREATE TABLE IF NOT EXISTS " + acc + " (kmer INTEGER PRIMARY KEY, genomes array)")
            self.connection.commit()

        if copy_genomes:
            self.cursor.execute("CREATE TABLE IF NOT EXISTS " + acc + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")
            self.connection.commit()

        gak_sql = None
        if copy_genomes and update_gak:
            # The accession index is only needed on this path, so it is only built here.
            accession_index = generate_accessions_index()
            sql_acc_num = accession_index[acc.replace("_", ".")]
            # Return num bytes, which is always 4*as many as there are entries, as the dtype is int32. See unique_kmers.
            gak_sql = 'INSERT OR REPLACE INTO genome_acc_kmer_counts SELECT genome, ' + str(sql_acc_num) + ', length(kmers)/4 FROM toMerge.' + acc + '_genomes'

        # Bind the path as a parameter so quotes in the file name cannot break the statement.
        self.cursor.execute("ATTACH DATABASE ? AS toMerge", (str(child_db),))
        self.connection.commit()

        if copy_kmers:
            add = "INSERT OR REPLACE INTO " + acc + " SELECT * FROM toMerge." + acc
            if selected_kmers is None:
                self.cursor.execute(add)
            else:
                # Chunked so the number of placeholders never exceeds SQLite's limit.
                for chunk in self._chunks(selected_kmers, self._MAX_SQL_PARAMS):
                    placeholders = ','.join('?' * len(chunk))
                    self.cursor.execute(add + " WHERE kmer in (" + placeholders + ")", chunk)

            self.connection.commit()

        if copy_genomes:
            self.cursor.execute("INSERT OR REPLACE INTO " + acc + "_genomes" + " SELECT * FROM toMerge." + acc + "_genomes")
            self.connection.commit()
            if gak_sql is not None:
                self.cursor.execute(gak_sql)
                self.connection.commit()

        self.cursor.execute("DETACH DATABASE toMerge")
        self.connection.commit()

        if remove:
            os.remove(child_db)

    def add_genomes_first(self, accession, kmer_dict):
        """Insert or replace ``genome -> kmers`` rows for one accession.

        ``kmer_dict`` maps a genome id to an object with a ``tobytes()``
        method (the rest of the file uses numpy arrays). Returns the
        SQL-friendly accession name ('.' replaced by '_').
        """
        kmer_lists = [(genome, kmers.tobytes()) for genome, kmers in kmer_dict.items()]

        sql_friendly_accession = accession.replace(".", "_")

        self.cursor.execute("CREATE TABLE IF NOT EXISTS " + sql_friendly_accession + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")
        self.connection.commit()

        self.cursor.executemany("INSERT OR REPLACE INTO " + sql_friendly_accession + "_genomes VALUES (?, ?) ", kmer_lists)
        self.connection.commit()

        return sql_friendly_accession

    def load_genome_index(self):
        """Load the ``genome_index`` table into three lookup dictionaries."""
        self.genome_index = {}
        self.reverse_genome_index = {}
        self.protein_counts_by_genome = {}

        sql_command = "SELECT genome, gen_id, protein_count FROM genome_index"

        for genome_name, genome_id, protein_count in self.cursor.execute(sql_command).fetchall():
            self.genome_index[genome_name] = genome_id
            self.reverse_genome_index[genome_id] = genome_name
            self.protein_counts_by_genome[genome_id] = protein_count

    def _collect_gak_rows(self, column, wanted, into):
        """Add ``genome_acc_kmer_counts`` rows whose ``column`` is in ``wanted`` to ``into``."""
        for chunk in self._chunks(wanted, self._MAX_SQL_PARAMS):
            sql_command = "SELECT * FROM genome_acc_kmer_counts WHERE " + column + " IN (" + ','.join('?' * len(chunk)) + ")"
            for genome, accession, kmer_ct in self.cursor.execute(sql_command, chunk).fetchall():
                into.setdefault(genome, {})[accession] = kmer_ct

    def load_accessions(self, permitted_genomes = None, permitted_accessions = None):
        """Load ``genome_acc_kmer_counts`` into ``self.gak`` and ``self.accessions``.

        ``self.gak`` becomes ``{genome: {accession: kmer_count}}`` and
        ``self.accessions`` a tuple of the distinct accession ids seen.
        Either filter restricts the rows that are read; with no filter the
        whole table is loaded.
        """
        gak = {}

        # It's possible to do both of these. Don't.
        if permitted_genomes is not None:
            # data type is very important to SQL
            self._collect_gak_rows("genome", [int(genome) for genome in permitted_genomes], gak)

        if permitted_accessions is not None:
            self._collect_gak_rows("accession", permitted_accessions, gak)

        # Normal case
        if permitted_accessions is None and permitted_genomes is None:
            for genome, accession, kmer_ct in self.cursor.execute("SELECT * FROM genome_acc_kmer_counts").fetchall():
                gak.setdefault(genome, {})[accession] = kmer_ct

        self.gak = gak

        seen_accessions = set()
        for per_genome in gak.values():
            seen_accessions.update(per_genome)

        self.accessions = tuple(seen_accessions)

    def just_accessions(self):
        """Set ``self.accessions`` from the accession tables present in the database.

        Every table that is neither a ``*_genomes`` table nor one of the two
        metadata tables is treated as an accession table, and its name is
        mapped back to an accession index.
        """
        converter = generate_accessions_index()
        acc_sql = "SELECT name FROM sqlite_master WHERE type='table'"
        tables = [item[0] for item in self.cursor.execute(acc_sql).fetchall()]

        metadata_tables = {'genome_acc_kmer_counts', 'genome_index'}
        accession_tables = [
            table for table in tables
            if not table.endswith('_genomes') and table not in metadata_tables
        ]

        # Back to indicies.
        self.accessions = tuple(converter[table.replace('_', '.')] for table in accession_tables)

    def unload_genomes_and_accessions(self):
        """Drop the in-memory metadata copies; ``self.accessions`` is left as it is."""
        self.gak = None
        self.genome_index = None
        # Go from index to name
        self.reverse_genome_index = None
        self.protein_counts_by_genome = None
343
+
344
+ #Child database class. This is only used during database builds and merges. Designed to take one single accession at a time and produce a correctly formatted table of kmers and accessions.
345
class child_database:
    """Temporary per-accession database used during builds and merges.

    Takes one accession at a time and writes a table of kmers and the
    genomes containing them, merging in rows that already exist for the
    same accession in the parent database.
    """

    # Upper bound on bound parameters per statement. SQLite rejects statements
    # with more host parameters than its compile-time limit (999 on older
    # builds), so large IN (...) lists are issued in chunks of this size.
    _MAX_SQL_PARAMS = 900

    def __init__(self, path, parent):
        self.path = path
        self.exists = False

        self.parent = parent
        self.parent_exists = os.path.exists(parent)

        self.connection = None
        self.cursor = None

        self.parent_connection = None
        self.parent_cursor = None

        self.verbosity = False

    @staticmethod
    def _chunks(values, size):
        """Yield consecutive lists of at most ``size`` items from ``values``."""
        values = list(values)
        for start in range(0, len(values), size):
            yield values[start:start + size]

    def activate_child_connection(self):
        """Open the SQL connection to this child database."""
        # Columns declared as 'array' are converted with convert_array on read.
        sqlite3.register_converter("array", convert_array)

        self.connection = sqlite3.connect(self.path, detect_types=sqlite3.PARSE_DECLTYPES)
        self.cursor = self.connection.cursor()
        self.exists = True

    def close_child_connection(self):
        """Close the child connection; safe to call when nothing is open."""
        if self.cursor is not None:
            self.cursor.close()
        if self.connection is not None:
            self.connection.close()
        # True cleanup - even a closed SQL connection obj cannot be passed to
        # multiple processors, but a nonetype can.
        self.cursor = None
        self.connection = None

    def initialize_child_database(self):
        """Create the two metadata tables if they are not already present."""
        if self.cursor is None:
            print("I need to be activated first!")
            return

        self.cursor.execute('''CREATE TABLE IF NOT EXISTS genome_index
            (genome text, gen_id integer, protein_count integer)''')
        self.connection.commit()

        self.cursor.execute('''CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts
            (genome integer, accession integer, count integer)''')
        self.connection.commit()

    def activate_parent_connection(self):
        """Open a connection to the parent database if its file exists."""
        if os.path.exists(self.parent):
            self.parent_exists = True
            sqlite3.register_converter("array", convert_array)
            self.parent_connection = sqlite3.connect(self.parent, detect_types=sqlite3.PARSE_DECLTYPES)
            self.parent_cursor = self.parent_connection.cursor()

    def close_parent_connection(self):
        """Close the parent connection, if one is open."""
        if self.parent_cursor is not None:
            self.parent_cursor.close()
            self.parent_connection.close()
            self.parent_cursor = None
            self.parent_connection = None

    def add_genomes_first(self, accession, kmer_lists):
        """Recreate ``<accession>_genomes`` and fill it from ``kmer_lists``.

        ``kmer_lists`` is a sequence of ``(genome, kmers)`` rows ready for
        insertion. Any existing table of the same name is dropped first.
        Returns the SQL-friendly accession name ('.' replaced by '_').
        """
        sql_friendly_accession = accession.replace(".", "_")

        self.cursor.execute(" DROP TABLE IF EXISTS " + sql_friendly_accession + "_genomes")

        self.cursor.execute("CREATE TABLE " + sql_friendly_accession + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")
        self.connection.commit()

        self.cursor.executemany(" INSERT INTO " + sql_friendly_accession + "_genomes VALUES (?, ?) ", kmer_lists)
        self.connection.commit()

        return sql_friendly_accession

    def add_accession(self, accession, insert_kmers):
        """Write the kmer table for one accession into this child database.

        ``insert_kmers`` maps a kmer to a numpy array of genomes. If the
        parent database already has a table for this accession, the parent's
        genomes for every kmer in ``insert_kmers`` are merged into the
        corresponding array (``insert_kmers`` is updated in place) so the row
        written here can replace the parent's row. Returns the SQL-friendly
        accession name ('.' replaced by '_').
        """
        sql_friendly_accession = accession.replace(".", "_")

        if self.parent_exists:
            # Check to see if this acc. is already in parent DB
            table_exists = (self.parent_cursor.execute(" SELECT count(name) FROM sqlite_master WHERE type='table' AND name=(?)", (sql_friendly_accession,)).fetchone()[0] == 1)
            if table_exists:
                # Convert the kmers to plain ints so SQL can match them.
                selection = [int(key) for key in insert_kmers]

                # Only parent rows whose kmer is about to be re-inserted matter.
                # The lookup is chunked because one placeholder per kmer in a
                # single statement exceeds SQLite's parameter limit for large
                # kmer sets.
                for chunk in self._chunks(selection, self._MAX_SQL_PARAMS):
                    search_command = "SELECT * FROM " + sql_friendly_accession + " WHERE kmer IN (" + ','.join('?' * len(chunk)) + ")"
                    for item in self.parent_cursor.execute(search_command, chunk).fetchall():
                        k = item[0]
                        # Combine the to-add genomes with the parent's existing ones.
                        if k in insert_kmers:
                            insert_kmers[k] = np.union1d(insert_kmers[k], item[1])

        # Translate each ndarray into its constituent byte data
        formatted_kmers = [(int(kmer), genomes.tobytes()) for kmer, genomes in insert_kmers.items()]

        # Remove the table if it exists - it shouldn't, because child DBs are
        # deleted upon being added to the parent, but might if a run was
        # stopped halfway.
        self.cursor.execute(" DROP TABLE IF EXISTS " + sql_friendly_accession)

        self.cursor.execute("CREATE TABLE " + sql_friendly_accession + " (kmer INTEGER PRIMARY KEY, genomes array)")
        self.connection.commit()

        self.cursor.executemany(" INSERT INTO " + sql_friendly_accession + " VALUES (?, ?) ", formatted_kmers)
        self.connection.commit()

        return sql_friendly_accession
489
+
490
+
491
+ #Holds partial results for calculating AAI.
492
class calculation_database:
    """SQLite-backed store for partial results produced while calculating AAI."""

    def __init__(self, path, precision):
        self.path, self.precision = path, precision

        # Nothing is opened until activate_connection() is called.
        self.exists = False
        self.connection = self.cursor = None

        self.genomes = None
        self.verbosity = False

    def activate_connection(self):
        """Open the SQL connection, decoding 'array' columns per ``self.precision``.

        "low", "med" and "high" register the float16, float32 and float64
        converters respectively; any other value registers nothing.
        """
        level = self.precision
        if level == "low":
            sqlite3.register_converter("array", convert_float_array_16)
        elif level == "med":
            sqlite3.register_converter("array", convert_float_array_32)
        elif level == "high":
            sqlite3.register_converter("array", convert_float_array_64)

        opened = sqlite3.connect(self.path, detect_types=sqlite3.PARSE_DECLTYPES)
        self.connection = opened
        self.cursor = opened.cursor()
        self.exists = True

    def close_connection(self):
        """Close cursor and connection, then replace both with None."""
        self.cursor.close()
        self.connection.close()
        # A closed SQL connection object still cannot be handed to multiple
        # processors, but None can.
        self.cursor = self.connection = None

    def initialize_database(self):
        """Drop and recreate the ``jaccards`` table on an activated connection."""
        if not self.exists:
            print("I need to be activated first!")
            return

        # Each statement is committed individually.
        for statement in (
            "DROP TABLE IF EXISTS jaccards",
            "CREATE TABLE jaccards (genome INTEGER PRIMARY KEY, jaccards array)",
        ):
            self.cursor.execute(statement)
            self.connection.commit()
542
+
543
+ '''
544
+ Class for handling all of the raw genome/protein/protein+HMM file inputs when building a database.
545
+
546
+ Takes a file or files and processes them from genome -> protein, protein -> hmm, prot+HMM -> kmerized protein best hits as numpy int arrays according to the kmer_index
547
+
548
+ '''
549
+ class input_file:
550
+ def __init__(self, input_path, output, verbosity):
551
+ #starting path for the file; irrelevant for protein and hmm, but otherwise useful for keeping track.
552
+ self.path = input_path
553
+ #Output directory starts with this
554
+ self.output = os.path.normpath(os.path.basename(output) + "/")
555
+ #For printing file updates, this is the input name
556
+ self.name = os.path.basename(input_path)
557
+ #original name is the key used for the genomes index later on.
558
+ self.original_name = os.path.basename(input_path)
559
+ #This is the name that can be used for building files with new extensions.
560
+ if input_path.endswith(".gz"):
561
+ #Remove .gz first to make names consistent.
562
+ self.basename = os.path.splitext(os.path.basename(input_path[:-3]))[0]
563
+ else:
564
+ self.basename = os.path.splitext(os.path.basename(input_path))[0]
565
+ #'genome' or 'protein' or 'protein and HMM'
566
+ self.status = None
567
+ #These will keep track of paths for each stage of file for us.
568
+ self.genome = None
569
+ self.protein = None
570
+ self.hmm = None
571
+
572
+ self.best_hits = None
573
+ self.best_hits_kmers = None
574
+
575
+ self.protein_count = 0
576
+ self.protein_kmer_count = {}
577
+
578
+ self.trans_table = None
579
+ self.start_time = None
580
+ self.end_time = None
581
+ self.err_log = ""
582
+ #doesn't get updated otw.
583
+ self.initial_state = "protein+HMM"
584
+
585
+ self.verbose = verbosity
586
+
587
+ #Functions for externally setting status and file paths of particular types
588
def set_genome(self, path):
    """Record ``path`` as the genome file and mark the status as 'genome'."""
    self.genome = path
    self.status = 'genome'
591
+
592
def set_protein(self, path):
    """Record ``path`` as the protein file and mark the status as 'protein'."""
    self.protein = path
    self.status = 'protein'
595
+
596
def set_hmm(self, path):
    """Record ``path`` as the HMM file and mark the status as 'protein and hmm'.

    A warning is printed when no protein file has been set yet; the HMM path
    and status are stored regardless.
    """
    protein_missing = self.protein is None
    if protein_missing:
        print("Warning! I don't have a protein yet, so this HMM will be useless to me until I do!")
    self.hmm = path
    self.status = 'protein and hmm'
601
+
602
+ #Runs prodigal, compares translation tables and stores faa files
603
def genome_to_protein(self):
    """Predict proteins from the genome with prodigal and keep the better translation table.

    Prodigal is run twice, once with its default settings (output
    ``.faa.11``) and once with ``-g 4`` (output ``.faa.4``). Table 4 is
    chosen when its total predicted amino-acid length is at least 1.1 times
    that of table 11; otherwise table 11 is chosen. The winning output is
    copied to ``<basename>.faa`` with '*' characters removed from sequence
    lines, both intermediates are deleted, and the result is registered via
    ``set_protein``. ``self.trans_table`` is set to "4" or "11".
    """
    if self.genome is None:
        print(self.name, "wasn't a declared as a genome! I can't make this into a protein!")
        return

    folder = Path(self.output + "/predicted_proteins")
    protein_output = folder / (self.basename + '.faa')
    output_11 = folder / (self.basename + '.faa.11')
    output_4 = folder / (self.basename + '.faa.4')
    temp_output = folder / (self.basename + '.temp')

    intermediate = folder / (self.basename + '_genome_intermediate.fasta')

    genome_parser = agnostic_reader(self.genome)
    was_gzipped = genome_parser.is_gz
    try:
        if was_gzipped:
            # File was a gzip; write an unzipped copy for prodigal's sake.
            with open(intermediate, "w") as midpoint:
                for line in genome_parser:
                    midpoint.write(line)
        else:
            # File is already unzipped, just point to it.
            intermediate = self.genome
    finally:
        genome_parser.close()

    subprocess.call(["prodigal", "-i", str(intermediate), "-a", str(output_11), "-q", "-o", str(temp_output)])
    subprocess.call(["prodigal", "-i", str(intermediate), "-a", str(output_4), "-g", "4", "-q", "-o", str(temp_output)])

    # We can get rid of the temp file immediately, we won't be using it.
    temp_output.unlink()
    if was_gzipped:
        # Only delete the copy we made; otherwise this would delete the input.
        intermediate.unlink()

    def total_residues(fasta_path):
        # Sum of sequence-line lengths, ignoring header lines.
        with open(fasta_path, 'r') as fasta:
            return sum(len(line.strip()) for line in fasta if not line.startswith(">"))

    # Compare translation tables
    length_4 = total_residues(output_4)
    length_11 = total_residues(output_11)

    if length_11 == 0:
        # Avoid dividing by zero when table 11 predicted nothing: table 4
        # wins only if it actually produced sequence.
        use_table_4 = length_4 > 0
    else:
        use_table_4 = (length_4 / length_11) >= 1.1

    # Select the winning translation table and remove the other.
    if use_table_4:
        winner, loser = output_4, output_11
        self.trans_table = "4"
    else:
        winner, loser = output_11, output_4
        self.trans_table = "11"
    loser.unlink()

    # Clean the winning output.
    with open(winner, 'r') as chosen_protein, open(protein_output, "w") as destination:
        for line in chosen_protein:
            if line.startswith(">"):
                destination.write(line)
            else:
                destination.write(line.replace('*', ''))

    # Remove the winning intermediate file, since we have the cleaned output.
    winner.unlink()

    self.set_protein(str(protein_output))
710
+
711
+ #run hmmsearch on a protein
712
def protein_to_hmm(self):
    """Run hmmsearch on this input's protein file and register the table output.

    The protein file (plain or gzipped) is first copied to an intermediate
    FASTA, dropping any protein with 100,000 or more amino acids and noting
    each dropped protein in ``self.err_log``. hmmsearch is then run against
    ``00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm`` next to this script, the
    intermediate files are deleted, and the ``--tblout`` file is registered
    via ``set_hmm``.
    """
    if self.protein is None:
        print(self.name, "wasn't a declared as a protein! I can't make this into an HMM!")
        return

    folder = Path(self.output + "/hmms")

    hmm_output = folder / (self.basename + '.hmm')
    temp_output = folder / (self.basename + '.temp')

    intermediate = folder / (self.basename + '_protein_intermediate.faa')

    max_residues = 100000

    def write_record(destination, header, seq_lines):
        # Write one protein, or log it as skipped when it is too long.
        if not seq_lines:
            return
        # Count amino acids only; line terminators must not push a protein
        # over the limit, and the logged count should match what was tested.
        residues = sum(len(piece.strip()) for piece in seq_lines)
        if residues < max_residues:
            destination.write(header)
            destination.write("".join(seq_lines))
        else:
            name_tokens = header.strip().split()
            protein_name = name_tokens[0][1:] if name_tokens else ""
            self.err_log += "Protein " + protein_name + " was observed to have >100K amino acids ( " + str(residues) + " AA found ). It was skipped. "

    current_protein = ""
    seq_lines = []

    protein_parser = agnostic_reader(self.protein)
    try:
        with open(intermediate, "w") as midpoint:
            for line in protein_parser:
                if line.startswith(">"):
                    write_record(midpoint, current_protein, seq_lines)
                    current_protein = line
                    seq_lines = []
                else:
                    seq_lines.append(line)

            # Finally, last prot
            write_record(midpoint, current_protein, seq_lines)
    finally:
        protein_parser.close()

    # Should locate the DBs regardless of path.
    script_path = Path(__file__)
    script_dir = script_path.parent
    hmm_complete_model = script_dir / "00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm"

    subprocess.call(["hmmsearch", "--tblout", str(hmm_output), "-o", str(temp_output), "--cut_tc", "--cpu", "1",
                    str(hmm_complete_model), str(intermediate)])

    temp_output.unlink()
    intermediate.unlink()

    self.set_hmm(str(hmm_output))
780
+
781
def prot_and_hmm_to_besthits(self):
    """Pick the best HMM hit per protein/accession and kmerize those proteins.

    Reads columns 0, 3 and 8 of every non-comment line of ``self.hmm``
    (used below as protein name, accession and score), keeps the
    highest-scoring accession for each protein and then one protein per
    accession, and stores the result in ``self.best_hits`` as
    ``{protein: accession}``. The protein file is then scanned and, for each
    best-hit protein, the result of ``unique_kmers(sequence, 4)`` is stored in
    ``self.best_hits_kmers[accession]`` with its size in
    ``self.protein_kmer_count[accession]``. Sets ``self.status`` to
    "finished preprocessing".
    """
    prots = []
    accs = []
    scores = []
    f = agnostic_reader(self.hmm)
    try:
        for line in f:
            if line.startswith("#"):
                continue
            segs = line.strip().split()
            if not segs:
                # Blank line: nothing to parse.
                continue
            prots.append(segs[0])
            accs.append(segs[3])
            scores.append(segs[8])
    finally:
        f.close()

    hmm_file = np.transpose(np.array([prots, accs, scores]))

    # Sort the hmm file based on the score column in descending order.
    hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]

    # Identify the first row where each gene name appears, after sorting by score;
    # in effect, return the highest scoring assignment per gene name.
    # Sort the indices of the result to match the score-sorted table instead of alphabetical order of gene names.
    hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]

    # Filter again for the unique ACCESSION names, since only one gene per accession is kept.
    # Don't sort the indices, the scores no longer matter.
    hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]

    self.best_hits = dict(zip(hmm_file[:,0], hmm_file[:,1]))

    self.best_hits_kmers = {}

    def record(accession, sequence):
        # Store the kmer set of one best-hit protein under its accession.
        if len(sequence) == 0:
            return
        kmer_set = unique_kmers(sequence, 4)
        self.protein_kmer_count[accession] = kmer_set.shape[0]
        self.protein_count += 1
        self.best_hits_kmers[accession] = kmer_set

    current_accession = None
    seq_parts = []
    is_besthit = False

    prot = agnostic_reader(self.protein)
    try:
        for line in prot:
            if line.startswith(">"):
                # seq_parts is only ever filled for best-hit proteins.
                record(current_accession, "".join(seq_parts))
                seq_parts = []

                # Select the best hit accession for this protein and just record that.
                # We do not care about the names of the proteins.
                protein_name = line[1:].strip().split(" ")[0]
                if protein_name in self.best_hits:
                    current_accession = self.best_hits[protein_name]
                    is_besthit = True
                else:
                    current_accession = None
                    is_besthit = False
            elif is_besthit:
                seq_parts.append(line.strip())
    finally:
        prot.close()

    # Flush the final protein. The accession (not the protein name) is what
    # is being tracked at this point, so the decision has to rest on
    # is_besthit rather than on a lookup in self.best_hits, whose keys are
    # protein names.
    if is_besthit:
        record(current_accession, "".join(seq_parts))

    self.status = "finished preprocessing"
852
+
853
def preprocess(self):
    """Advance this input through every remaining preprocessing stage.

    Stages are checked in order against ``self.status``; each stage that runs
    is expected to move the status on so the next check can fire. The first
    stage that runs stamps ``self.start_time``, and the genome and protein
    stages also record themselves as ``self.initial_state`` if it still holds
    its default.
    """
    # (status that triggers the stage, initial_state label or None, method name)
    pipeline = (
        ('genome', "genome", "genome_to_protein"),
        ('protein', "protein", "protein_to_hmm"),
        ('protein and hmm', None, "prot_and_hmm_to_besthits"),
    )

    for required_status, entry_label, step_name in pipeline:
        if self.status != required_status:
            continue

        stage_begin = curtime()
        if self.start_time is None:
            self.start_time = stage_begin

        # There is no entry label for the protein+HMM stage; that is the default.
        if entry_label is not None and self.initial_state == "protein+HMM":
            self.initial_state = entry_label

        getattr(self, step_name)()

    if self.start_time is None:
        # No stage ran, so there is no runtime to report.
        self.start_time = "N/A"
        self.end_time = "N/A"
    else:
        self.end_time = curtime()

    # Protein not generated on this run.
    if self.trans_table is None:
        self.trans_table = "unknown"
899
+
900
+ '''
901
+ Viral functions
902
+ '''
903
+ #No translation table comparison for viruses. Slightly reduced logic.
904
def viral_genome_to_protein(self):
    """Predict proteins for a viral genome with Prodigal in "meta" mode.

    Prodigal is run once (no translation-table comparison). Its protein
    FASTA is rewritten with every '*' removed from sequence lines and
    stored as ``<output>/predicted_proteins/<basename>.faa``. On success
    ``self.protein`` points to that file and ``self.status`` becomes
    'protein'. If no genome was declared, a message is printed and nothing
    else happens.
    """
    if self.genome is None:
        print(self.name, "wasn't a declared as a genome! I can't make this into a protein!")
        return

    folder = Path(self.output + "/predicted_proteins")
    intermediate_protein_output = folder / (self.basename + '.intermediate.faa')
    final_protein_output = folder / (self.basename + '.faa')
    temp_output = folder / (self.basename + '.temp')

    subprocess.call(["prodigal", "-i", str(self.genome), "-a", str(intermediate_protein_output), "-p", "meta", "-q", "-o", str(temp_output)])

    # Prodigal's primary (-o) output is not needed, only the protein FASTA.
    temp_output.unlink()

    # Context managers guarantee both handles are closed even if a write fails.
    with open(intermediate_protein_output, 'r') as chosen_protein, \
            open(final_protein_output, "w") as destination:
        for line in chosen_protein:
            # Headers are copied verbatim; sequence lines lose their '*'.
            if not line.startswith(">"):
                line = line.replace('*', '')
            destination.write(line)

    intermediate_protein_output.unlink()

    # The original assigned str(protein_output), a name that does not exist
    # in this method, so every successful run ended in a NameError.
    self.protein = str(final_protein_output)
    self.status = 'protein'
935
+
936
+
937
+ '''
938
+ Preprocessing functions
939
+
940
+ Read directories, advance files to hmms as needed.
941
+ '''
942
+ #Toy function for passing to a pool
943
def do_advance(input_file_object):
    """Pool worker: preprocess one input object and hand it back.

    The object is returned (rather than None) so that the state mutated in
    the worker process travels back to the parent through the pool.
    """
    run_preprocessing = input_file_object.preprocess
    run_preprocessing()
    return input_file_object
946
+
947
def initialize_preproc(index):
    """Pool initializer: publish the kmer lookup table as a module global.

    Each worker process calls this once so that functions reading the
    module-level ``kmer_index`` can find it without it being passed along
    with every task.
    """
    global kmer_index
    kmer_index = index
950
+
951
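+ #Collects the input files (from directories or from list files), preprocesses them in a multiprocessing pool, writes the preprocessing log, and returns the processed inputs.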
952
def _existing_paths_from_list_file(list_file):
    """Return the paths listed one per line in list_file that exist on disk.

    Paths that cannot be found are reported on stdout and skipped.
    """
    found = []
    fh = agnostic_reader(list_file)
    try:
        for line in fh:
            clean = line.strip()
            if os.path.exists(clean):
                found.append(clean)
            else:
                print("I can't find file", clean, "Are you sure this file exists and can be found from your current directory using the path you supplied in the input file?")
    finally:
        fh.close()
    return found


def _existing_paths_from_directory(directory):
    """Return normalized paths of the entries of directory, sorted by name.

    Sorting gives a lexicographic order, which is what pairs proteins with
    their HMMs when both come from directories.
    """
    found = []
    for entry in sorted(os.listdir(directory)):
        path = os.path.normpath(directory + "/" + entry)
        if os.path.exists(path):
            found.append(path)
        else:
            print("I can't find", path, "Are you sure this file exists in the directory you supplied?")
    return found


def _write_preproc_progress(percentage, count, total_counts, redraw):
    """Draw the preprocessing progress bar; a failure to draw is ignored."""
    try:
        if redraw:
            # Move the cursor up one line so the bar overwrites itself.
            sys.stdout.write('\033[A')
            sys.stdout.flush()
        sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Genome ' + str(count) + " of " + str(total_counts) + ') at ' + curtime()+"\n")
        sys.stdout.flush()
    except (OSError, ValueError):
        # It's not really a big deal if the progress bar cannot be printed.
        pass


def advance_inputs(genomes = None, proteins = None, hmms = None, genomes_file = None, proteins_file = None, hmms_file = None, output = "FastAAI", threads = 1, verbose = False, db_name = ""):
    """Collect input files, preprocess them in parallel and log the run.

    Inputs may be given as directories (genomes / proteins / hmms) or as
    text files listing one path per line (genomes_file / proteins_file /
    hmms_file). HMM files are paired positionally with the inputs already
    collected, so their count must match.

    Parameters:
        genomes, proteins, hmms: directories of input files, or None.
        genomes_file, proteins_file, hmms_file: list files, or None.
        output: output directory; the log goes to <output>/logs/.
        threads: number of worker processes.
        verbose: draw a progress bar while preprocessing.
        db_name: database name; its basename prefixes the log file name.

    Returns:
        The list of preprocessed input objects, or None when the HMM and
        protein counts do not match.
    """
    inputs = []
    hmm_broke = False

    def add_genome(path):
        current_file = input_file(path, output, verbose)
        current_file.set_genome(path)
        inputs.append(current_file)

    def add_protein(path):
        current_file = input_file(path, output, verbose)
        current_file.set_protein(path)
        inputs.append(current_file)

    def pair_hmms(hmm_pairs, mismatch_message):
        # One HMM per input, matched purely by position.
        if len(hmm_pairs) != len(inputs):
            print(mismatch_message)
            return False
        for h, i in zip(hmm_pairs, inputs):
            i.set_hmm(h)
        return True

    if genomes_file is not None:
        for path in _existing_paths_from_list_file(genomes_file):
            add_genome(path)

    if proteins_file is not None:
        # A leftover debug print of every line read used to live here.
        for path in _existing_paths_from_list_file(proteins_file):
            add_protein(path)

    if hmms_file is not None:
        if not pair_hmms(_existing_paths_from_list_file(hmms_file),
                         "Protein and HMM file counts differ! There must be one HMM per protein, generated from its paired protein! These pairs must be in the same order in your input file!"):
            hmm_broke = True

    if genomes is not None:
        for path in _existing_paths_from_directory(genomes):
            add_genome(path)

    if proteins is not None:
        for path in _existing_paths_from_directory(proteins):
            add_protein(path)

    if hmms is not None:
        if not pair_hmms(_existing_paths_from_directory(hmms),
                         "Protein and HMM file counts differ! There must be one HMM per protein, generated from its paired protein! These must be in the same alphabetical order in their respective directories!"):
            hmm_broke = True

    if hmm_broke:
        print("FastAAI can't proceed without matching HMM and protein pairs.")
        return None

    total_counts = len(inputs)
    count = 0
    last_pct = 0

    if verbose:
        print("")
        _write_preproc_progress(0, count, total_counts, redraw=False)

    results = []

    kmer_index_ = create_kmer_index()
    pool = multiprocessing.Pool(threads, initializer=initialize_preproc, initargs = (kmer_index_,))
    try:
        for res in pool.imap(do_advance, inputs):
            results.append(res)
            count += 1
            if verbose:
                percentage = (count/total_counts)*100
                # Redraw when the bar gains a '#' and always on the last item.
                # The original tested undefined names here, which raised a
                # NameError that a bare except silently swallowed.
                if int(percentage/2) > last_pct or count == total_counts:
                    _write_preproc_progress(percentage, count, total_counts, redraw=True)
                    last_pct = int(percentage/2)
    finally:
        pool.close()
        pool.join()

    inputs = results

    log_time = curtime()
    log_path = os.path.normpath(output + "/logs/" + os.path.splitext(os.path.basename(db_name))[0] + "_preprocessing_log.txt")
    # The header is only written when the log is created; later runs append.
    is_new_log = not os.path.exists(log_path)

    with open(log_path, "w" if is_new_log else "a") as preproc_log:
        if is_new_log:
            print("log_date", "genome_name", "started_as_a", "start_time", "end_time", "protein_translation_table", "errors", sep = "\t", file = preproc_log)
        for i in inputs:
            print(log_time, i.basename, i.initial_state, i.start_time, i.end_time, i.trans_table, i.err_log, sep = "\t", file = preproc_log)

    return inputs
1120
+
1121
+ '''
1122
+ Utility functions
1123
+ '''
1124
def prepare_directories(output, status, build_or_query):
    """Create the output directory and the subdirectories a run needs.

    Parameters:
        output: path of the output directory; created if missing (its
            parent must already exist).
        status: starting point of the run. 'genome' needs
            "predicted_proteins" and "hmms"; 'protein' needs "hmms".
        build_or_query: "build" adds "database", "query" adds "results".
            A "logs" directory is always created.

    Returns:
        True when every directory exists afterwards, False otherwise. A
        troubleshooting message is printed on failure.
    """
    if not os.path.exists(output):
        try:
            os.mkdir(output)
        except OSError:
            print("")
            print("FastAAI tried to make output directory: '"+ output + "' but failed.")
            print("")
            print("Troubleshooting:")
            print("")
            print(" (1) Do you have permission to create directories in the location you specified?")
            print(" (2) Did you make sure that all directories other than", os.path.basename(output), "already exist?")
            print("")
            return False

    subdirectories = []
    if status == 'genome':
        subdirectories.extend(["predicted_proteins", "hmms"])
    if status == 'protein':
        subdirectories.append("hmms")
    subdirectories.append("logs")
    if build_or_query == "build":
        subdirectories.append("database")
    if build_or_query == "query":
        subdirectories.append("results")

    try:
        for name in subdirectories:
            os.makedirs(os.path.normpath(output + "/" + name), exist_ok=True)
    except OSError:
        print("FastAAI was able to create or find", output, "but couldn't make directories there.")
        print("")
        print("This shouldn't happen. Do you have permission to write to that directory?")
        # The original printed this message but still reported success.
        return False

    return True
1172
+
1173
def check_out_input_files(genomes, proteins, hmms, gf, pf, hf):
    """Validate the combination of supplied inputs and name the start point.

    Each input type can come from a directory (genomes / proteins / hmms)
    or from a list file (gf / pf / hf), but not both. Genomes and proteins
    are mutually exclusive, and HMMs require proteins.

    Returns:
        'genome', 'protein and HMM' or 'protein' for a valid combination,
        or None (after printing the reason) for an invalid one.
    """
    genome_dir_given = genomes is not None
    genome_list_given = gf is not None
    protein_dir_given = proteins is not None
    protein_list_given = pf is not None
    hmm_dir_given = hmms is not None
    hmm_list_given = hf is not None

    # Only one method of supply per file type.
    if genome_dir_given and genome_list_given:
        print("Supply genomes either by directory or by file, not both.")
        return None
    if protein_dir_given and protein_list_given:
        print("Supply proteins either by directory or by file, not both.")
        return None
    if hmm_dir_given and hmm_list_given:
        print("Supply HMMs either by directory or by file, not both.")
        return None

    have_genomes = genome_dir_given or genome_list_given
    have_proteins = protein_dir_given or protein_list_given
    have_hmms = hmm_dir_given or hmm_list_given

    # Genomes and proteins may not be combined, whichever way they arrive.
    if have_genomes and have_proteins:
        print("Supply either genomes or proteins, not both. You can supply proteins and HMMs, but not genomes and proteins.")
        return None

    # HMMs are meaningless without the proteins they were generated from.
    if have_hmms and not have_proteins:
        print("If you supply HMMs, you also have to supply the proteins from which they were generated.")
        return None

    if have_genomes:
        print("Starting from genomes")
        return 'genome'

    if have_hmms:
        print("Starting from proteins and HMMs")
        return 'protein and HMM'

    print("Starting from proteins")
    return 'protein'
1214
+
1215
+
1216
+ #Build DB from genomes
1217
+
1218
def unique_kmers(seq, ksize):
    """Return the sorted, de-duplicated kmer indices found in seq.

    Every window of length ksize is translated through the module-level
    ``kmer_index`` mapping (a KeyError is raised for a kmer that is not in
    it). A sequence shorter than ksize yields an empty array.

    Returns:
        A 1-D numpy int32 array. The dtype matters because the array is
        stored as raw bytes later.
    """
    window_count = len(seq) - ksize + 1
    encoded = [kmer_index[seq[start:start + ksize]] for start in range(window_count)]
    return np.unique(encoded).astype(np.int32)
1225
+
1226
+ #Quickly creates a dict of all poss. tetramers in a fixed, alphabetical order.
1227
+ #This can be used to index kmers so that the indices are identical (and thus interchangable) on separate runs of this program.
1228
def create_kmer_index():
    """Build the fixed tetramer -> integer index used to encode kmers.

    All 22**4 tetramers over the alphabet below are numbered 0..22**4-1 in
    an order that is identical on every run, so indices stored by one run
    can be compared with those of another.

    Returns:
        dict mapping each 4-character string to a numpy int32 index.
    """
    residues = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', '*']

    # Enumeration order: the SECOND character varies slowest, then the
    # first, then the third, and the fourth varies fastest. This mirrors
    # what a default ('xy'-indexed) numpy meshgrid over four copies of the
    # alphabet produces once flattened, so "AAAA" is 0 and "****" is last.
    tetramers = [
        first + second + third + fourth
        for second in residues
        for first in residues
        for third in residues
        for fourth in residues
    ]

    # int32 comfortably covers the 234,256 possible tetramers.
    codes = np.arange(len(tetramers), dtype = np.int32)

    return dict(zip(tetramers, codes))
1237
+
1238
def split_seq(seq, num_grps):
    """Cut seq into num_grps consecutive slices of near-equal length.

    Slice boundaries are the rounded multiples of len(seq)/num_grps, so
    some slices may be one element longer than others (or empty when there
    are more groups than elements).
    """
    stride = 1.0/num_grps*len(seq)
    return [
        seq[int(round(grp*stride)):int(round((grp+1)*stride))]
        for grp in range(num_grps)
    ]
1244
+
1245
+ #Gives the (start, end) index pairs needed to split a list of (max_val) genomes into (num_grps) consecutive groups of near-equal size.
1246
def split_indicies(max_val, num_grps):
    """Return num_grps (start, end) index pairs that tile range(max_val).

    Boundaries are the rounded multiples of max_val/num_grps; each pair's
    end is the next pair's start.
    """
    stride = 1.0/num_grps*max_val
    return [
        (round(grp*stride), round((grp+1)*stride))
        for grp in range(num_grps)
    ]
1252
+
1253
def list_to_index_dict(list):
    """Map each item to its position in the given sequence.

    When an item occurs more than once, its last position wins.
    """
    # The parameter name shadows the builtin but is kept for callers that
    # pass it by keyword.
    return {item: position for position, item in enumerate(list)}
1260
+
1261
def generate_accessions_index():
    """Return the fixed accession -> integer index used by the database.

    The numbering is simply each accession's position in the list below,
    so the list order must never change.
    """
    possible_accessions = (
        'PF01780.19', 'PF03948.14', 'PF17144.4', 'PF00830.19', 'PF00347.23', 'PF16906.5', 'PF13393.6',
        'PF02565.15', 'PF01991.18', 'PF01984.20', 'PF00861.22', 'PF13656.6', 'PF00368.18', 'PF01142.18', 'PF00312.22', 'PF02367.17',
        'PF01951.16', 'PF00749.21', 'PF01655.18', 'PF00318.20', 'PF01813.17', 'PF01649.18', 'PF01025.19', 'PF00380.19', 'PF01282.19',
        'PF01864.17', 'PF01783.23', 'PF01808.18', 'PF01982.16', 'PF01715.17', 'PF00213.18', 'PF00119.20', 'PF00573.22', 'PF01981.16',
        'PF00281.19', 'PF00584.20', 'PF00825.18', 'PF00406.22', 'PF00177.21', 'PF01192.22', 'PF05833.11', 'PF02699.15', 'PF01016.19',
        'PF01765.19', 'PF00453.18', 'PF01193.24', 'PF05221.17', 'PF00231.19', 'PF00416.22', 'PF02033.18', 'PF01668.18', 'PF00886.19',
        'PF00252.18', 'PF00572.18', 'PF00366.20', 'PF04104.14', 'PF04919.12', 'PF01912.18', 'PF00276.20', 'PF00203.21', 'PF00889.19',
        'PF02996.17', 'PF00121.18', 'PF01990.17', 'PF00344.20', 'PF00297.22', 'PF01196.19', 'PF01194.17', 'PF01725.16', 'PF00750.19',
        'PF00338.22', 'PF00238.19', 'PF01200.18', 'PF00162.19', 'PF00181.23', 'PF01866.17', 'PF00709.21', 'PF02006.16', 'PF00164.25',
        'PF00237.19', 'PF01139.17', 'PF01351.18', 'PF04010.13', 'PF06093.13', 'PF00828.19', 'PF02410.15', 'PF01176.19', 'PF02130.17',
        'PF01948.18', 'PF01195.19', 'PF01746.21', 'PF01667.17', 'PF03874.16', 'PF01090.19', 'PF01198.19', 'PF01250.17', 'PF17136.4',
        'PF06026.14', 'PF03652.15', 'PF04019.12', 'PF01201.22', 'PF00832.20', 'PF01264.21', 'PF03840.14', 'PF00831.23', 'PF00189.20',
        'PF02601.15', 'PF01496.19', 'PF00411.19', 'PF00334.19', 'PF00687.21', 'PF01157.18', 'PF01245.20', 'PF01994.16', 'PF01632.19',
        'PF00827.17', 'PF01015.18', 'PF00829.21', 'PF00410.19', 'PF00833.18', 'PF00935.19', 'PF01992.16',
    )

    return {accession: position for position, accession in enumerate(possible_accessions)}
1278
+
1279
+ #Master function for building or adding to a DB with genomes.
1280
def _write_add_progress(count, total_counts, redraw, initial_style = False):
    """Draw the add_inputs progress bar; a failure to draw is ignored.

    redraw moves the cursor up one line first so the bar overwrites the
    previous one. initial_style selects the "( x of y ) at <time>" wording
    used for the very first bar instead of "( x of y done at <time> )".
    """
    try:
        percentage = (count/total_counts)*100
        if redraw:
            sys.stdout.write('\033[A')
            sys.stdout.flush()
        bar = "Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(count) + " of " + str(total_counts)
        if initial_style:
            bar += ' ) at ' + curtime() + "\n"
        else:
            bar += ' done at '+ curtime() + " )\n"
        sys.stdout.write(bar)
        sys.stdout.flush()
    except (ZeroDivisionError, OSError, ValueError):
        # It's not really a big deal if the progress bar cannot be printed.
        pass


def add_inputs(output_path, parent_path, existing_index, threads, verbose, prep_args):
    """Master function for building or adding to a database.

    Preprocesses the inputs described by prep_args, stores their per-genome
    kmers in the parent database, builds one child database per accession
    in a pool of worker processes, and merges the children into the parent.

    Parameters:
        output_path: output directory; a "temp" directory is made inside it.
        parent_path: path of the parent (final) database.
        existing_index: genome name -> index mapping of an existing
            database to extend, or None for a new database. It is updated
            in place with any newly added genomes.
        threads: number of worker processes.
        verbose: draw progress bars.
        prep_args: [genomes, proteins, hmms, gf, pf, hf, db_name].

    Returns:
        True on success; False when the inputs could not be prepared or the
        temp directory could not be created.
    """
    genomes, proteins, hmms, gf, pf, hf, db_name = prep_args[0], prep_args[1], prep_args[2], prep_args[3], prep_args[4], prep_args[5], prep_args[6]

    print("")
    print("FastAAI is formatting your files to be saved to your database.")

    inputs = advance_inputs(genomes = genomes, proteins = proteins, hmms = hmms, genomes_file = gf, proteins_file = pf, hmms_file = hf, output = output_path, threads = threads, verbose = verbose, db_name = db_name)

    if inputs is None:
        return False

    genome_index = {}
    next_index = 0

    # Build upon the genome indexing of an existing DB.
    if existing_index is not None:
        genome_index = existing_index
        # Zero indexing makes this the next number to add.
        next_index = len(existing_index)

    final_db = fastaai_database(parent_path)
    final_db.activate_connection()
    final_db.initialize_parent_database()

    # Rows destined for the genome_index and genome_acc_kmer_counts tables.
    protein_counts_to_add = []
    genome_acc_kmer_counts_to_add = []

    acc_index = generate_accessions_index()

    # accession -> {genome index -> kmer array}
    readied_kmers_by_acc = defaultdict(lambda: defaultdict(lambda: None))

    for file in inputs:
        genome = file.basename

        # Avoid adding duplicate genomes.
        if genome not in genome_index:
            protein_counts_to_add.append((genome, next_index, file.protein_count))
            for prot in file.protein_kmer_count:
                genome_acc_kmer_counts_to_add.append((next_index, acc_index[prot], file.protein_kmer_count[prot]))
            genome_index[genome] = next_index
            next_index += 1

        this_index = genome_index[genome]
        for acc in file.best_hits_kmers:
            readied_kmers_by_acc[acc][this_index] = file.best_hits_kmers[acc]
        # Clean up space.
        file.best_hits_kmers = None

    inputs = None

    # Default dicts can't be pickled.
    readied_kmers_by_acc = dict(readied_kmers_by_acc)

    genomes_per_acc = {}
    for acc in readied_kmers_by_acc:
        readied_kmers_by_acc[acc] = dict(readied_kmers_by_acc[acc])
        genomes_per_acc[acc] = list(readied_kmers_by_acc[acc].keys())
        final_db.add_genomes_first(acc, readied_kmers_by_acc[acc])
        readied_kmers_by_acc[acc] = None

    readied_kmers_by_acc = None

    add_genomes = "INSERT OR REPLACE INTO genome_index VALUES (?, ?, ?)"
    add_proteins = "INSERT OR REPLACE INTO genome_acc_kmer_counts VALUES (?, ?, ?)"

    final_db.cursor.executemany(add_genomes, protein_counts_to_add)
    final_db.cursor.executemany(add_proteins, genome_acc_kmer_counts_to_add)
    final_db.connection.commit()

    final_db.cursor.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")
    final_db.connection.commit()

    protein_counts_to_add = None
    genome_acc_kmer_counts_to_add = None

    temp_dir = os.path.normpath(output_path+"/temp")

    child_args = []
    for i, accession in enumerate(genomes_per_acc):
        name = "accession_" + accession + "_partition_" + str(i)
        child_args.append([accession, name, temp_dir, parent_path, genomes_per_acc[accession], genome_index])

    print("")
    print("Formatting data to add to database at", curtime())

    # The children are written into the temp directory.
    if not os.path.exists(temp_dir):
        try:
            os.mkdir(temp_dir)
        except OSError:
            print("Output directory failed to create! Cannot continue.")
            return False

    total_counts = len(child_args)
    count = 0

    if verbose:
        print("")
        _write_add_progress(count, total_counts, redraw = False, initial_style = True)

    quiverfull = []

    pool = multiprocessing.Pool(threads)
    try:
        for result in pool.imap_unordered(produce_children, child_args):
            quiverfull.append([result[0], result[1]])

            if verbose:
                count += 1
                _write_add_progress(count, total_counts, redraw = True)
    finally:
        pool.close()
        pool.join()

    print("")
    print("Adding data to final database.")

    count = 0

    if verbose:
        print("")
        # The original built this bar but never wrote it, so the first
        # redraw below climbed onto the line above and overwrote it.
        _write_add_progress(count, total_counts, redraw = False)

    for acc, child in quiverfull:
        final_db.add_child_to_parent(acc, child)

        if verbose:
            count += 1
            _write_add_progress(count, total_counts, redraw = True)

    print("")

    final_db.close_connection()

    os.rmdir(temp_dir)

    return True
1472
+
1473
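+ #Pool worker for add_inputs. The genome index is passed in as args[5]; it is not read from a global, and this function does not use it.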
1474
def produce_children(args):
    """Build the child database for one accession (pool worker).

    args is [accession, partition name, output directory, parent database
    path, genome indices holding this accession, genome index]. The
    per-genome kmer arrays are read back from the parent database, inverted
    into kmer -> genomes, and handed to the child database.

    Returns:
        [value returned by the child's add_accession, path of the child db]
    """
    acc, partition, output_base, parent_db, genomes_in_this_acc, genome_index = args[:6]

    parental_database = fastaai_database(parent_db)

    # Table names cannot contain '.', so the accession is made SQL friendly.
    source_table = acc.replace('.', '_') + "_genomes"
    placeholders = ','.join(['?']*len(genomes_in_this_acc))
    read_parent_sql = "SELECT * FROM " + source_table + " WHERE genome IN (" + placeholders + ")"

    parental_database.activate_connection()
    rows = parental_database.cursor.execute(read_parent_sql, genomes_in_this_acc).fetchall()
    # Each row is a (genome, kmers) pair.
    genomes_for_this_acc = dict(rows)
    parental_database.close_connection()

    child_db = os.path.normpath(output_base + "/" + partition + ".db")

    this_child = child_database(child_db, parent_db)
    this_child.activate_child_connection()
    this_child.activate_parent_connection()

    # Invert genome -> kmers into kmer -> genomes.
    readied_kmers = defaultdict(list)
    for genome in genomes_for_this_acc:
        for kmer in genomes_for_this_acc[genome]:
            readied_kmers[kmer].append(genome)
        # Release each genome's kmers as soon as they have been consumed.
        genomes_for_this_acc[genome] = None

    del genomes_for_this_acc

    # Converted in place, one kmer at a time, to keep peak memory down.
    readied_kmers = dict(readied_kmers)
    for kmer in readied_kmers:
        readied_kmers[kmer] = np.array(readied_kmers[kmer], dtype = np.int32)

    sql_friendly_accession = this_child.add_accession(acc, readied_kmers)

    this_child.close_parent_connection()
    this_child.close_child_connection()

    del readied_kmers

    return [sql_friendly_accession, child_db]
1527
+
1528
+ #Build or add to a FastAAI DB
1529
def build_db_opts():
    """Define and parse the command-line options of the database build module.

    Returns:
        (parser, args): the configured ArgumentParser and the namespace
        parsed from the command line. Unrecognised arguments are ignored
        because parse_known_args is used.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
            description='''
	This FastAAI module allows you to create a FastAAI database from one or many genomes, proteins, or proteins and HMMs, or add these files to an existing one.

	Supply genomes OR proteins OR proteins AND HMMs as inputs.

	If you supply genomes, FastAAI will predict proteins from them, and HMMs will be created from those proteins
	If you supply only proteins, FastAAI will create HMM files from them, searching against FastAAI's internal database
	If you supply proteins AND HMMs, FastAAI will directly use them to build the database.\n
	You cannot supply both genomes and proteins
	''')

    # Inputs supplied as directories, and the database to create or extend.
    parser.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'A directory containing genomes in FASTA format.')
    parser.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'A directory containing protein amino acids in FASTA format.')
    parser.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'A directory containing the results of an HMM search on a set of proteins.')
    parser.add_argument('-d', '--database', dest = 'db_name', default = "FastAAI_database.sqlite.db", help = 'The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory. FastAAI_database.sqlite.db by default.')

    parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

    # Alternative way to supply inputs: a text file listing one path per line.
    parser.add_argument('--genome_file', dest = 'gf', default = None, help = 'Alternative way to supply genomes. A file containing paths to your genome files, 1 per line.')
    parser.add_argument('--protein_file', dest = 'pf', default = None, help = 'Alternative way to supply proteins. A file containing paths to your protein files, 1 per line.')
    parser.add_argument('--hmm_file', dest = 'hf', default = None, help = 'Alternative way to supply HMMs. A file containing paths to your HMM files, 1 per line.')

    parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
    parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

    # parse_known_args tolerates arguments this parser does not define.
    args, unknown = parser.parse_known_args()

    return parser, args
1559
+
1560
def build_db(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose):
    """Build a new FastAAI database or add genomes to an existing one.

    Parameters:
        genomes, proteins, hmms: input directories, or None.
        db_name: database file name. A bare name is placed under
            <output>/database/; a name containing a path separator is
            used as given.
        output: output directory.
        threads: number of worker processes.
        gf, pf, hf: list files for genomes / proteins / HMMs, or None.
        verbose: print progress bars.

    Returns:
        True when the database was built, False when the inputs were
        invalid, the directories could not be prepared, an existing file
        is not a FastAAI database, or adding the inputs failed.
    """
    start = check_out_input_files(genomes, proteins, hmms, gf, pf, hf)

    # If something failed, we stop.
    if start is None:
        return False

    if not prepare_directories(output, start, "build"):
        return False

    # Check if the db name contains path info. Incl. windows version.
    if "/" not in db_name and "\\" not in db_name:
        final_database = os.path.normpath(output + "/database/" + db_name)
    else:
        # If the person insists that the db has a path, let them.
        final_database = db_name

    # Only an already-existing database has genome IDs to read back.
    existing_genome_IDs = None
    if os.path.exists(final_database):
        try:
            parent = fastaai_database(final_database)
            parent.activate_connection()

            rows = parent.cursor.execute("SELECT genome, gen_id FROM genome_index").fetchall()
            existing_genome_IDs = {row[0]: int(row[1]) for row in rows}

            parent.close_connection()
        except Exception:
            # Any failure here means the file is not a usable FastAAI
            # database. Exception (not a bare except) is caught so that
            # Ctrl-C and SystemExit are not mistaken for a bad database.
            print("You specified an existing file to be a database, but it does not appear to be a FastAAI database.")
            print("FastAAI will not be able to continue. Please give FastAAI a different database name and continue.")
            print("Exiting.")
            return False

    prep_args = [genomes, proteins, hmms, gf, pf, hf, db_name]

    success = add_inputs(output, final_database, existing_genome_IDs, threads, verbose, prep_args)

    if success:
        print("Database build complete!")

    return success
1611
+
1612
+
1613
#DB query functionality - unlimited version
def do_query_vs_target_aai_only(query_name, target_name, threads, output, precision, verbose):
    """Run the high-memory database-vs-database AAI search.

    The work is done in three stages:
      1. For every accession present in both databases, estimate the cost of
         comparing it (load_getter) and greedily spread the accessions over
         `threads` workers, heaviest first.
      2. Start `threads` accession_worker processes that stream per-genome
         Jaccard sums back through a queue; the sums are accumulated into one
         (query genomes x target genomes) matrix.
      3. Store the matrix in a temporary calculation database and let a pool
         of finish_jaccards workers write the per-genome result files.

    Parameters:
        query_name, target_name: paths of the query and target databases.
        threads: number of worker processes to use in every stage.
        output: output directory; "<output>/temp" is created if needed.
        precision: "low", "med" or "high" (float16/32/64 accumulator).
            Anything else is treated as "med".
        verbose: print the per-worker load distribution.

    Returns:
        None. Results are written under "<output>/results".
    """
    #exist_ok also covers a missing parent directory and a concurrent creator.
    os.makedirs(os.path.normpath(output + "/temp"), exist_ok=True)

    precisions = {"low": np.float16, "med": np.float32, "high": np.float64}
    if precision not in precisions:
        #Previously an unknown value left jacc_precision unbound (NameError).
        precision = "med"
    jacc_precision = precisions[precision]

    #Save the file paths.
    query = fastaai_database(query_name)
    target = fastaai_database(target_name)

    query.activate_connection()
    query.just_accessions()
    query_len = query.cursor.execute("SELECT Count(*) FROM genome_index").fetchall()[0][0]
    target.activate_connection()
    target.just_accessions()
    target_len = target.cursor.execute("SELECT Count(*) FROM genome_index").fetchall()[0][0]

    print("FastAAI will search", query_len, "query genomes against", target_len, "target genomes.")

    print("")
    print("FastAAI is preparing your AAI search... ", end = '', flush = True)

    accessions_in_common = list(set(query.accessions).intersection(target.accessions))

    query.accessions = None
    target.accessions = None

    query.close_connection()
    target.close_connection()

    load_args = [(query, target, acc) for acc in accessions_in_common]

    loads = []
    ordered_accs = []

    pool = multiprocessing.Pool(threads)
    try:
        for load, acc in pool.imap(load_getter, load_args):
            #Load will be None if the accession is in both query and target, but they
            #still don't share even a single Kmer. Unlikely, but it happened once.
            if load is not None:
                loads.append(load)
                ordered_accs.append(acc)
    finally:
        pool.close()
        pool.join()

    #Heaviest accession first, so the greedy assignment below stays balanced.
    loads = np.array(loads)
    ordered_accs = np.array(ordered_accs)
    order = loads.argsort()[::-1]
    loads = loads[order]
    ordered_accs = ordered_accs[order]

    load_balancer = [0] * threads
    accs_per_load = {i: [] for i in range(threads)}
    for load, acc in zip(loads, ordered_accs):
        #Always hand the next accession to the currently lightest worker.
        lightest = load_balancer.index(min(load_balancer))
        load_balancer[lightest] += int(load)
        accs_per_load[lightest].append(int(acc))

    del loads
    del ordered_accs
    del load_balancer

    print("done!")
    if verbose:
        print("FastAAI has balanced the workload of calculating AAI from your data.")
        for index in accs_per_load:
            print("Thread", index, "will handle", len(accs_per_load[index]), "accessions.")
    print("FastAAI is beginning the calculation of AAI between your query and target genomes.")

    input_queue = multiprocessing.Queue()
    output_queue = multiprocessing.Queue()

    for thread in accs_per_load:
        #An empty work list carries no information; skip it instead of relying on
        #how load_accessions() interprets an empty filter.
        if accs_per_load[thread]:
            input_queue.put(accs_per_load[thread])

    #One stop marker per worker.
    for _ in range(threads):
        input_queue.put('STOP')

    workers = []
    for _ in range(threads):
        worker = multiprocessing.Process(target=accession_worker, args=(input_queue, output_queue, query, target, query_len, target_len, jacc_precision))
        worker.start()
        workers.append(worker)

    print("")

    results = np.zeros(shape = (query_len, target_len), dtype = jacc_precision)

    #Each worker sends [genome, jaccards] rows and finally one string marker.
    #Count the markers explicitly rather than treating any exception as "done".
    still_running = threads
    while still_running > 0:
        row = output_queue.get()
        if isinstance(row, str):
            still_running -= 1
        else:
            results[row[0]] += row[1]

    #Every worker has sent its final marker, so joining cannot block on the queue.
    for worker in workers:
        worker.join()

    print("AAI calculations complete. Formatting results for writing.")

    rdb_name = os.path.normpath(output+"/temp/aai_calc_db.db")
    rdb = calculation_database(rdb_name, precision)
    rdb.activate_connection()
    rdb.initialize_database()

    #Stream one (genome, raw row bytes) tuple at a time; no second copy of the matrix.
    rdb.cursor.executemany("INSERT INTO jaccards VALUES (?, ?)", ((i, results[i].tobytes()) for i in range(query_len)))
    rdb.connection.commit()

    rdb.close_connection()

    del results

    #Now we split the query genomes into chunks and have threads process each chunk in parallel with its respective shared prot counts.
    query_chunks = split_indicies(query_len, threads)
    query_args = [([rdb_name], query_chunks[i], output, query, target, precision) for i in range(0, threads)]

    print("Results formatted. Writing results starting at", curtime())

    pool = multiprocessing.Pool(threads)
    try:
        pool.map(finish_jaccards, query_args)
    finally:
        pool.close()
        pool.join()

    os.remove(rdb_name)

    print("FastAAI complete! Results at:", os.path.normpath(output+"/results/"))

    return None
1772
+
1773
#Assess the number of comparisons that will have to be made to complete an accession so that balanced loads can be passed to threads
def load_getter(args):
    """Estimate the comparison workload of one accession.

    Parameters:
        args: (query, target, accession) where query/target are database
            objects offering activate_connection(), close_connection() and a
            `cursor`, and accession is the numeric accession index.

    Returns:
        [load, accession]. load is an int derived from the summed byte
        lengths of the `genomes` column for the kmers both databases share,
        or None when the two databases share no kmer for this accession.
    """
    query, target, accession = args[0], args[1], args[2]

    #SQLite caps the number of "?" parameters per statement (999 in builds
    #older than 3.32.0), so the shared kmers are sent in batches of this size.
    max_sql_variables = 999

    query.activate_connection()
    target.activate_connection()
    try:
        #Map the numeric accession index back to its SQL-safe table name.
        original_index = generate_accessions_index()
        accession_inverter = {original_index[acc]: acc.replace(".", "_") for acc in original_index}
        sql_friendly_accession = accession_inverter[accession]

        sql = "SELECT kmer FROM " + sql_friendly_accession
        #Temporarily return bare values instead of 1-tuples.
        query.cursor.row_factory = lambda cursor, row: row[0]
        target.cursor.row_factory = lambda cursor, row: row[0]

        shared_kmers = list(set(query.cursor.execute(sql).fetchall()).intersection(target.cursor.execute(sql).fetchall()))

        query.cursor.row_factory = None
        target.cursor.row_factory = None

        tgt_res = None
        query_res = None
        for start in range(0, len(shared_kmers), max_sql_variables):
            batch = shared_kmers[start:start + max_sql_variables]
            bytes_sql = "SELECT sum(length(genomes)) FROM " + sql_friendly_accession + " WHERE kmer IN ({kmers})".format(kmers=','.join(['?']*len(batch)))

            tgt_part = target.cursor.execute(bytes_sql, batch).fetchone()[0]
            query_part = query.cursor.execute(bytes_sql, batch).fetchone()[0]

            #sum() yields NULL/None when no row matched; keep None until a real value shows up.
            if tgt_part is not None:
                tgt_res = tgt_part if tgt_res is None else tgt_res + tgt_part
            if query_part is not None:
                query_res = query_part if query_res is None else query_res + query_part

        #Both stay None when there are no shared kmers at all.
        if tgt_res is not None and query_res is not None:
            load = int(tgt_res/(4096) * query_res/(4096))
        else:
            load = None
    finally:
        query.close_connection()
        target.close_connection()

    return [load, accession]
1813
+
1814
def accession_worker(in_queue, out_queue, query, target, qlen, tlen, prec):
    """Worker process: sum per-accession Jaccard indices for every query genome.

    Work lists (lists of numeric accession indices) are read from in_queue
    until the string 'STOP' is received. For each list, and for every query
    genome that has at least one of those accessions, a float64 vector of
    length tlen holding the summed Jaccard index against every target genome
    is put on out_queue as [genome, vector]. When the worker is done it puts
    the string "Based" on out_queue as its completion marker.

    Parameters:
        in_queue, out_queue: multiprocessing queues as described above.
        query, target: database objects (connection, genome index, `gak`).
        qlen: unused here; kept for the caller's positional signature.
        tlen: number of target genomes (length of every output vector).
        prec: unused here; kept for the caller's positional signature.
    """
    #SQLite caps the number of "?" parameters per statement (999 in builds
    #older than 3.32.0), so kmer lookups are sent in batches of this size.
    max_sql_variables = 999

    #Numeric accession index -> SQL-safe table name.
    original_index = generate_accessions_index()
    accession_inverter = {original_index[acc]: acc.replace(".", "_") for acc in original_index}

    query.activate_connection()
    target.activate_connection()
    query.load_genome_index()
    target.load_genome_index()

    for my_accessions in iter(in_queue.get, 'STOP'):

        target.load_accessions(permitted_accessions = my_accessions)
        query.load_accessions(permitted_accessions = my_accessions)

        query_data = {}
        target_data = {}

        for acc in my_accessions:
            sql_friendly_accession = accession_inverter[acc]

            #First column -> second column of the "<accession>_genomes" table;
            #used below as genome -> kmers of that genome.
            query_data[acc] = dict(query.cursor.execute("SELECT * FROM "+sql_friendly_accession+"_genomes").fetchall())

            query.cursor.row_factory = lambda cursor, row: row[0]
            selected_kmers = list(query.cursor.execute("SELECT kmer FROM "+sql_friendly_accession).fetchall())
            query.cursor.row_factory = None

            #Only the target rows whose kmer also occurs in the query are needed.
            matching_rows = {}
            for start in range(0, len(selected_kmers), max_sql_variables):
                batch = selected_kmers[start:start + max_sql_variables]
                target_sql = "SELECT * FROM " + sql_friendly_accession + " WHERE kmer in ({kmers})".format(kmers=','.join(['?']*len(batch)))
                matching_rows.update(target.cursor.execute(target_sql, batch).fetchall())
            target_data[acc] = matching_rows

        #Kmer count of every target genome, per accession (0 = accession absent).
        target_kmer_cts_by_acc = {acc: np.zeros(tlen, dtype = np.int16) for acc in my_accessions}

        for genome in target.gak:
            for acc in target.gak[genome]:
                target_kmer_cts_by_acc[acc][genome] = target.gak[genome][acc]

        #No longer needed.
        target.gak = None

        #We want each thread to report every single genome
        for genome in query.gak:
            these_jaccards = np.zeros(tlen, dtype = np.float64)
            for acc in query.gak[genome]:
                these_intersections = np.zeros(tlen, dtype = np.int16)
                query_kmers = query_data[acc][genome]
                kmer_hits = target_data[acc]
                for kmer in query_kmers:
                    if kmer in kmer_hits:
                        #Every target genome listed for this kmer shares it with the query.
                        these_intersections[kmer_hits[kmer]] += 1

                #|A u B| = |A| + |B| - |A n B|, computed in int64 so the int16 counts cannot wrap.
                union = query_kmers.shape[0] + target_kmer_cts_by_acc[acc].astype(np.int64) - these_intersections
                these_jaccards += np.divide(these_intersections, union)

            out_queue.put([genome, these_jaccards])

    target.close_connection()
    query.close_connection()
    #Completion marker counted by the parent process.
    out_queue.put("Based")

    return None
1881
+
1882
def finish_jaccards(args):
    """Turn stored Jaccard sums into per-genome result files for one query chunk.

    Parameters:
        args: (partial_dbs, my_query_genomes, output, query, target, prec)
            partial_dbs: paths of calculation databases holding a `jaccards`
                table; their rows are summed per genome.
            my_query_genomes: (start, stop) half-open range of query genome
                indices handled by this call.
            output: output directory; files go to "<output>/results/".
            query, target: database objects (genome index, `gak`).
            prec: precision label passed on to calculation_database().

    Returns:
        None. One "<name>_results.txt" tab-separated file is written per
        query genome, with one line per target genome.
    """
    partial_dbs, my_query_genomes, output, query, target, prec = args[0], args[1], args[2], args[3], args[4], args[5]

    query.activate_connection()
    target.activate_connection()
    try:
        query.load_genome_index()
        target.load_genome_index()

        selected_query_genomes = range(my_query_genomes[0], my_query_genomes[1])

        #Row 0 of the local matrices corresponds to the first genome of this chunk.
        offset = my_query_genomes[0]

        target_len = len(target.genome_index)
        query_len = my_query_genomes[1] - my_query_genomes[0]

        #get shared protein counts
        query.load_accessions(permitted_genomes = selected_query_genomes)

        #NOTE(review): hard-coded width of the accession axis; presumably the
        #number of accessions known to FastAAI - confirm against generate_accessions_index().
        max_acc = 122

        #Presence/absence matrices: genome x accession and accession x genome.
        query_set = np.zeros(shape = (query_len, max_acc), dtype = np.int16)
        for g in query.gak:
            query_set[(g-offset), list(query.gak[g])] += 1

        target_set = np.zeros(shape = (max_acc, target_len), dtype = np.int16)

        target.load_accessions()

        target_protein_counts = np.zeros(target_len, dtype = np.int16)
        for t in target.gak:
            target_set[list(target.gak[t]), t] += 1
            target_protein_counts[t] = len(target.gak[t])

        #Entry [q, t] = number of accessions genome q and genome t both carry.
        shared_prot_counts_by_genome = np.dot(query_set, target_set)

        del query_set
        del target_set

        target.gak = None
    finally:
        query.close_connection()
        target.close_connection()

    activated_DBs = []
    try:
        for db in partial_dbs:
            calc_db = calculation_database(db, prec)
            calc_db.activate_connection()
            activated_DBs.append(calc_db)

        for genome in selected_query_genomes:
            total_jaccs = np.zeros(target_len, dtype = np.float64)
            shared_acc_counts = shared_prot_counts_by_genome[genome - offset]
            for db in activated_DBs:
                result = db.cursor.execute("SELECT jaccards FROM jaccards WHERE genome=?", (genome,)).fetchone()[0]
                total_jaccs += result

            #Targets sharing no accession divide by zero; those cells are
            #overwritten with "N/A" below, so only the warning is suppressed.
            with np.errstate(divide = "ignore", invalid = "ignore"):
                total_jaccs = np.divide(total_jaccs, shared_acc_counts)

            aai_est = numpy_kaai_to_aai(total_jaccs)

            no_hit = np.where(shared_acc_counts == 0)
            #Actual hits is already stored in shared_acc_counts
            possible_hits = np.minimum(len(query.gak[genome]), target_protein_counts).astype(str)

            total_jaccs = np.round(total_jaccs, 4).astype(str)

            shared_acc_counts = shared_acc_counts.astype(str)

            total_jaccs[no_hit] = "N/A"
            aai_est[no_hit] = "N/A"
            shared_acc_counts[no_hit] = "N/A"
            possible_hits[no_hit] = "N/A"

            name = query.reverse_genome_index[genome]

            output_file = output +"/results/"+name+"_results.txt"
            #The context manager closes the file even if a write fails.
            with open(output_file, "w") as fh:
                for tgt in range(0, target_len):
                    target_name = target.reverse_genome_index[tgt]
                    if target_name == name:
                        #A genome compared with itself is reported as a perfect match.
                        fh.write(name+"\t"+target_name+"\t"+"100.0"+"\t"+"0.0"+"\t"+shared_acc_counts[tgt]+"\t"+possible_hits[tgt]+"\t"+"100.0"+"\n")
                    else:
                        fh.write(name+"\t"+target_name+"\t"+total_jaccs[tgt]+"\t"+"N/A"+"\t"+shared_acc_counts[tgt]+"\t"+possible_hits[tgt]+"\t"+aai_est[tgt]+"\n")
    finally:
        for db in activated_DBs:
            db.close_connection()

    return None
1982
+
1983
+
1984
#Here's the DB SQL querying functionality/limited version.
def do_query_vs_target_sql(query, target, threads, output, verbose, do_stdev):
    """Run the default (memory-limited) database-vs-database AAI search.

    All query kmers are loaded into memory, grouped per genome, and each
    query genome is then searched against the target database with SQL by a
    pool worker (do_sql_query when do_stdev is set, do_sql_query_no_SD
    otherwise).

    Parameters:
        query, target: paths of the query and target databases.
        threads: size of the worker pool.
        output: output directory; results go to "<output>/results".
        verbose: show a progress bar on stdout.
        do_stdev: also compute standard deviations of the Jaccard indices.

    Returns:
        None.
    """
    #Save the file paths.
    query_name, target_name = query, target

    query = fastaai_database(query_name)
    query.activate_connection()
    query.load_genome_index()
    query.just_accessions()

    #Every "<accession>_genomes" table maps to the accession name with dots restored.
    acc_sql = "SELECT name FROM sqlite_master WHERE type='table'"
    tables = [item[0] for item in query.cursor.execute(acc_sql).fetchall()]
    cleaned_tables = [(table, table.split("_genomes")[0].replace("_", ".")) for table in tables if table.endswith("_genomes")]

    del tables

    #Go through tables and load data: genome -> accession -> second column of the row.
    query_acc_kmers = defaultdict(dict)

    sys.stdout.write("\n")
    sys.stdout.write("Loading query data at " + curtime() + " ...\n")
    sys.stdout.flush()

    for table, accession in cleaned_tables:
        for result in query.cursor.execute("SELECT * FROM " + table).fetchall():
            query_acc_kmers[result[0]][accession] = result[1]

    query.close_connection()

    sys.stdout.write("\n")
    sys.stdout.write("Loading target data at " + curtime() + " ...\n")
    sys.stdout.flush()

    target = fastaai_database(target_name)
    target.activate_connection()
    target.load_genome_index()
    target.load_accessions()
    target.close_connection()

    results_dir = os.path.normpath(output+"/results")
    query_args = [(target, query.reverse_genome_index[genome], query_acc_kmers[genome], results_dir) for genome in query_acc_kmers]

    detected_query_accs = query.accessions
    query_length = len(query.genome_index)

    #Cleanup
    del query
    del query_acc_kmers

    target_len = len(target.gak)

    #Per shared accession: kmer count of every target genome (0 = accession absent).
    target_kmer_cts = {}
    for accession in np.intersect1d(detected_query_accs, target.accessions):
        target_kmer_cts[accession] = np.zeros(target_len, dtype = np.int16)
        for g in target.gak:
            if accession in target.gak[g]:
                target_kmer_cts[accession][g] = target.gak[g][accession]

    #Number of accessions carried by every target genome.
    target_protein_counts = np.zeros(target_len, dtype = np.int16)
    for g in target.gak:
        target_protein_counts[g] = len(target.gak[g])

    target_length = len(target.gak)

    target.gak = None

    sys.stdout.write("\n")
    sys.stdout.write("FastAAI will search "+ str(query_length) + " query genomes against " + str(target_length) + " target genomes.\n")
    sys.stdout.write("\n")

    count = 0
    total = len(query_args)

    #Always terminate this line; without it the final message was glued onto it
    #whenever verbose was off.
    sys.stdout.write("Beginning AAI calculation at " + curtime() + "\n")

    def write_progress(done, percentage, overwrite):
        #Progress bar - possible dangerous use of the cursor-up sequence, so a
        #failure to draw it is deliberately ignored.
        try:
            if overwrite:
                sys.stdout.write('\033[A')
            sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/2)).ljust(50)+'| ' + ('%.2f'%percentage).rjust(7)+'% (Query genome ' + str(done) + " of " + str(total) + ' done at '+curtime()+')\n')
            sys.stdout.flush()
        except (OSError, ValueError):
            #It's not really a big deal if the progress bar cannot be printed.
            pass

    last_pct = 0
    if verbose:
        write_progress(count, 0, False)

    #Both modes share the same loop; only the worker function differs.
    worker = do_sql_query if do_stdev else do_sql_query_no_SD

    pool = multiprocessing.Pool(threads, initializer = sql_query_thread_starter, initargs = (target_kmer_cts, target_protein_counts,))
    try:
        #Process as we go.
        for _ in pool.imap(worker, query_args):
            if not verbose:
                continue
            count += 1
            percentage = (count/total)*100
            #Redraw only when another 2% step is reached, or at the very end.
            if int(percentage/2) > last_pct or count == total:
                write_progress(count, percentage, True)
            last_pct = int(percentage/2)
    finally:
        pool.close()
        pool.join()

    print("AAI calculation complete! Results at:", os.path.normpath(output+"/results"))

    return None
2131
+
2132
#This can also take the genomes-first formatted prots in the DB and search them memory-efficiently, if not time efficiently.
def do_sql_query(args):
    """Search one query genome against the target database, with std. deviations.

    Relies on the module globals target_kmer_cts and target_protein_counts,
    which are installed in each pool worker by sql_query_thread_starter.

    Parameters:
        args: (database, name, acc_kmers, temp_out)
            database: target database object.
            name: query genome name (also used for the output file name).
            acc_kmers: accession name -> array of the query's kmers.
            temp_out: directory the result file is written to.

    Returns:
        Path of the written "<name>_results.txt" file.
    """
    accession_index = generate_accessions_index()
    database, name, acc_kmers, temp_out = args[0],args[1],args[2],args[3]

    #SQLite caps the number of "?" parameters per statement (999 in builds
    #older than 3.32.0), so kmer lookups are sent in batches of this size.
    max_sql_variables = 999

    database.activate_connection()
    try:
        target_len = len(database.genome_index)

        #One row per query accession, one column per target genome.
        results = np.zeros(shape = (len(acc_kmers), target_len), dtype = np.float64)
        #True where the target genome actually carries the accession of that row.
        is_shared = np.zeros(shape = (len(acc_kmers), target_len), dtype = bool)

        shared_acc_counts = np.zeros(target_len, dtype = np.int16)

        for row, accession in enumerate(acc_kmers):
            acc_index = accession_index[accession]
            if acc_index not in database.accessions:
                continue

            sql_friendly_accession = accession.replace(".", "_")
            target_counts = target_kmer_cts[acc_index]

            #The accession was found for this target genome, for each tgt genome.
            is_shared[row] = target_counts != 0
            shared_acc_counts[is_shared[row]] += 1

            these_kmers = [int(kmer) for kmer in acc_kmers[accession]]
            these_intersections = np.zeros(target_len, dtype = np.int32)
            for start in range(0, len(these_kmers), max_sql_variables):
                batch = these_kmers[start:start + max_sql_variables]
                sql_query = "SELECT genomes FROM " + sql_friendly_accession + " WHERE kmer in ({kmers})".format(kmers=','.join(['?']*len(batch)))
                for result in database.cursor.execute(sql_query, batch):
                    #Each row lists the target genomes containing one of the query kmers.
                    these_intersections[result] += 1

            #|A u B| = |A| + |B| - |A n B|, in int64 so the int16 counts cannot wrap.
            union = acc_kmers[accession].shape[0] + target_counts.astype(np.int64) - these_intersections
            #Cells with an empty union keep their initial 0.
            np.divide(these_intersections, union, out = results[row], where = union > 0)
    finally:
        database.close_connection()

    #Targets without any shared accession divide by zero; those cells are
    #replaced by "N/A" below, so only the warnings are suppressed.
    with np.errstate(divide = "ignore", invalid = "ignore"):
        #These are the jacc averages
        jaccard_averages = np.divide(np.sum(results, axis = 0), shared_acc_counts)

        #Get the differences from the mean per hit. Cells where the target lacks the
        #accession are not hits and must not contribute to the spread.
        deviations = np.where(is_shared, results - jaccard_averages, 0.0)
        #Sum squares and divide by shared acc. count, the sqrt to get SD.
        jaccard_SDs = np.sqrt(np.divide(np.sum(np.square(deviations), axis = 0), shared_acc_counts))

    del results
    del deviations

    aai_est = numpy_kaai_to_aai(jaccard_averages)

    no_hit = np.where(shared_acc_counts == 0)
    #Actual hits is already stored in shared_acc_counts
    possible_hits = np.minimum(len(acc_kmers), target_protein_counts).astype(str)

    jaccard_averages = np.round(jaccard_averages, 4).astype(str)
    jaccard_SDs = np.round(jaccard_SDs, 4).astype(str)

    shared_acc_counts = shared_acc_counts.astype(str)

    jaccard_averages[no_hit] = "N/A"
    aai_est[no_hit] = "N/A"
    jaccard_SDs[no_hit] = "N/A"
    shared_acc_counts[no_hit] = "N/A"
    possible_hits[no_hit] = "N/A"

    output_file = temp_out +"/"+name+"_results.txt"
    #The context manager closes the file even if a write fails.
    with open(output_file, "w") as fh:
        for tgt in range(0, target_len):
            target_name = database.reverse_genome_index[tgt]
            if target_name == name:
                #A genome compared with itself is reported as a perfect match.
                fh.write(name+"\t"+target_name+"\t"+"100.0"+"\t"+"0.0"+"\t"+shared_acc_counts[tgt]+"\t"+possible_hits[tgt]+"\t"+"100.0"+"\n")
            else:
                fh.write(name+"\t"+target_name+"\t"+jaccard_averages[tgt]+"\t"+jaccard_SDs[tgt]+"\t"+shared_acc_counts[tgt]+"\t"+possible_hits[tgt]+"\t"+aai_est[tgt]+"\n")

    return output_file
2208
+
2209
#This can also take the genomes-first formatted prots in the DB and search them memory-efficiently, if not time efficiently.
def do_sql_query_no_SD(args):
    """Search one query genome against the target database, without std. deviations.

    Relies on the module globals target_kmer_cts and target_protein_counts,
    which are installed in each pool worker by sql_query_thread_starter.

    Parameters:
        args: (database, name, acc_kmers, temp_out)
            database: target database object.
            name: query genome name (also used for the output file name).
            acc_kmers: accession name -> array of the query's kmers.
            temp_out: directory the result file is written to.

    Returns:
        Path of the written "<name>_results.txt" file.
    """
    accession_index = generate_accessions_index()
    database, name, acc_kmers, temp_out = args[0],args[1],args[2],args[3]

    #SQLite caps the number of "?" parameters per statement (999 in builds
    #older than 3.32.0), so kmer lookups are sent in batches of this size.
    max_sql_variables = 999

    database.activate_connection()
    try:
        target_len = len(database.genome_index)

        #Running sum of the per-accession Jaccard indices for every target genome.
        results = np.zeros(shape = target_len, dtype = np.float64)

        shared_acc_counts = np.zeros(target_len, dtype = np.int16)

        for accession in acc_kmers:
            acc_index = accession_index[accession]
            if acc_index not in database.accessions:
                continue

            sql_friendly_accession = accession.replace(".", "_")
            target_counts = target_kmer_cts[acc_index]

            #The accession was found for this target genome, for each tgt genome.
            shared_acc_counts[np.nonzero(target_counts)] += 1

            these_kmers = [int(kmer) for kmer in acc_kmers[accession]]
            these_intersections = np.zeros(target_len, dtype = np.int32)
            for start in range(0, len(these_kmers), max_sql_variables):
                batch = these_kmers[start:start + max_sql_variables]
                sql_query = "SELECT genomes FROM " + sql_friendly_accession + " WHERE kmer in ({kmers})".format(kmers=','.join(['?']*len(batch)))
                for result in database.cursor.execute(sql_query, batch):
                    #Each row lists the target genomes containing one of the query kmers.
                    these_intersections[result] += 1

            #|A u B| = |A| + |B| - |A n B|, in int64 so the int16 counts cannot wrap.
            union = acc_kmers[accession].shape[0] + target_counts.astype(np.int64) - these_intersections
            #Cells with an empty union contribute 0.
            jaccards = np.zeros(target_len, dtype = np.float64)
            np.divide(these_intersections, union, out = jaccards, where = union > 0)
            results += jaccards
    finally:
        database.close_connection()

    #Targets without any shared accession divide by zero; those cells are
    #replaced by "N/A" below, so only the warnings are suppressed.
    with np.errstate(divide = "ignore", invalid = "ignore"):
        #These are the jacc averages
        jaccard_averages = np.divide(results, shared_acc_counts)
    del results

    aai_est = numpy_kaai_to_aai(jaccard_averages)

    no_hit = np.where(shared_acc_counts == 0)

    possible_hits = np.minimum(len(acc_kmers), target_protein_counts).astype(str)

    jaccard_averages = np.round(jaccard_averages, 4).astype(str)

    shared_acc_counts = shared_acc_counts.astype(str)

    jaccard_averages[no_hit] = "N/A"
    aai_est[no_hit] = "N/A"
    shared_acc_counts[no_hit] = "N/A"
    possible_hits[no_hit] = "N/A"

    output_file = temp_out +"/"+name+"_results.txt"
    #The context manager closes the file even if a write fails.
    with open(output_file, "w") as fh:
        for tgt in range(0, target_len):
            target_name = database.reverse_genome_index[tgt]
            if target_name == name:
                #A genome compared with itself is reported as a perfect match.
                fh.write(name+"\t"+target_name+"\t"+"100.0"+"\t"+"0.0"+"\t"+shared_acc_counts[tgt]+"\t"+possible_hits[tgt]+"\t"+"100.0"+"\n")
            else:
                fh.write(name+"\t"+target_name+"\t"+jaccard_averages[tgt]+"\t"+"N/A"+"\t"+shared_acc_counts[tgt]+"\t"+possible_hits[tgt]+"\t"+aai_est[tgt]+"\n")

    return output_file
2274
+
2275
def numpy_kaai_to_aai(kaai_array):
    """Convert an array of average Jaccard indices into printable AAI estimates.

    The transformation applied to every positive entry is
        aai_hat = (-0.3087057 + 1.810741 * exp(-(-0.2607023 * ln(kaai))**(1/3.435))) * 100

    Parameters:
        kaai_array: NumPy float array; it is copied, never modified.

    Returns:
        A string array of the same shape. Estimates below 30 become "<30%",
        estimates above 90 become ">90%", entries whose input was <= 0 become
        "<30%", and everything else is the estimate rounded to 2 decimals.
    """
    #Protect the original jaccard averages memory item
    estimate = kaai_array.copy()

    positive = estimate > 0
    not_positive = estimate <= 0

    #Logs are only taken where the input is positive, which avoids log(0) warnings.
    estimate[positive] = np.log(estimate[positive])

    #The model, one step at a time.
    estimate = estimate * -0.2607023
    estimate = estimate ** (1/3.435)
    estimate = np.exp(-estimate)
    estimate = estimate * 1.810741
    estimate = estimate - 0.3087057
    estimate = estimate * 100

    #Out-of-range estimates are flagged before rounding.
    too_low = estimate < 30
    too_high = estimate > 90

    #Convert to final printables
    printable = np.round(estimate, 2).astype(str)
    printable[too_low] = "<30%"
    printable[too_high] = ">90%"
    #The math of the above ends up with zero values being big, so we fix those.
    printable[not_positive] = "<30%"

    return printable
2314
+
2315
def curtime():
    """Return the current local time formatted as "DD/MM/YYYY HH:MM:SS"."""
    return datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
2320
+
2321
#Manages the query process.
def db_query_opts():
    """Build the argument parser of the database-vs-database query module.

    Returns:
        (parser, args): the ArgumentParser and the namespace parsed from the
        command line. Unrecognised arguments are ignored.
    """
    module_description = '''
This FastAAI module takes two FastAAI databases and searches all of the genomes in the QUERY against all of the genomes in the TARGET

If you have many genomes (more than 1000), it will be faster to create the query database using FastAAI build_db,
then search it against an existing target using this module than it is to do the same thing with an SQL query.

If you give the same database as query and target, a special all vs. all search of the genomes in the database will be done.
'''

    #(flags, keyword arguments) in the order they should appear in the help text.
    option_table = [
        (('-q', '--query'), {'dest': 'query', 'default': None, 'help': 'Path to the query database. The genomes FROM the query will be searched against the genomes in the target database'}),
        (('-t', '--target'), {'dest': 'target', 'default': None, 'help': 'Path to the target database.'}),
        (('-o', '--output'), {'dest': 'output', 'default': "FastAAI", 'help': 'The directory where FastAAI will place the result of this query. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.'}),
        (('--threads',), {'dest': 'threads', 'type': int, 'default': 1, 'help': 'The number of processors to use. Default 1.'}),
        (('--verbose',), {'dest': 'verbose', 'action': 'store_true', 'help': 'Print minor updates to console. Major updates are printed regardless.'}),
        (('--do_stdev',), {'dest': "do_stdev", 'action': 'store_true', 'help': 'Off by default. Calculate std. deviations on Jaccard indicies. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.'}),
        (('--unlimited_resources',), {'dest': "large_mem", 'action': 'store_true', 'help': 'Off by default. Use a faster algorithm that consumes more RAM. FastAAI cannot calculate std. deviations with this algorithm, so they will automatically be skipped.'}),
        (('--mem',), {'dest': "precision", 'default': "med", 'help': 'One of low/med/high. Medium by default. Save RAM in return for slightly rounded AAI estimates. Only affects FastAAI if you are also using the "--unlimited_resources" flag.'}),
    ]

    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=module_description)
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    #Unknown arguments are tolerated and dropped.
    parsed, _ignored = parser.parse_known_args()

    return parser, parsed
2347
+
2348
#Control the query process for any DB-first query.
def db_query(query, target, verbose, output, threads, do_stdev, precision, memory_efficient):
    """Validate the inputs of a database-vs-database query and dispatch it.

    Parameters:
        query, target: paths of the query and target databases.
        verbose: passed through to the search routine.
        output: output directory, prepared with prepare_directories().
        threads: number of processes to use.
        do_stdev: compute standard deviations; forces the SQL search.
        precision: "low", "med" or "high"; anything else falls back to "med".
        memory_efficient: when true (and do_stdev is false) the
            do_query_vs_target_aai_only search is used, otherwise
            do_query_vs_target_sql.

    Exits the program via sys.exit() when an input is missing or invalid.
    """
    print("")

    #Sanity checks.
    #This has to come first: os.path.exists(None) raises TypeError.
    if target is None or query is None:
        print("I require both a query and a target database. FastAAI exiting.")
        sys.exit()

    if not os.path.exists(target):
        print("Target database not found. Exiting FastAAI")
        sys.exit()

    if not os.path.exists(query):
        print("Query database not found. Exiting FastAAI")
        sys.exit()

    #status = "exists"
    query_ok = assess_db(query)
    target_ok = assess_db(target)

    if query_ok != "exists":
        print("Query database improperly formatted. Exiting FastAAI")
        sys.exit()

    if target_ok != "exists":
        print("Target database improperly formatted. Exiting FastAAI")
        sys.exit()

    #Check if the database is querying against itself.
    if query == target:
        print("Performing an all vs. all query on", query)
    else:
        print("Querying", query, "against", target)

    #Ready the output directories as needed.
    #The databases are already created, the only state they can be in in P+H
    good_to_go = prepare_directories(output, "protein and HMM", "query")
    if not good_to_go:
        print("Exiting FastAAI")
        sys.exit()

    if precision not in ["high", "med", "low"]:
        print("Selected memory usage setting not found. Defaulting to med. Select one with --mem high/med/low.")
        precision = 'med'

    #Default
    if (not memory_efficient) or do_stdev:
        do_query_vs_target_sql(query, target, threads, output, verbose, do_stdev)
    #Not default.
    else:
        do_query_vs_target_aai_only(query, target, threads, output, precision, verbose)

    print("")
2404
+
2405
+
2406
#Perform a minimal-memory query of a target database from input files. Lighter weight function for low memory
def sql_query_opts():
    """Build the argument parser of the file-vs-database SQL query module.

    Returns:
        (parser, args): the ArgumentParser and the namespace parsed from the
        command line. Unrecognised arguments are ignored.
    """
    module_description = '''
This FastAAI module takes one or many genomes, proteins, or proteins and HMMs as a QUERY and searches them against an existing FastAAI database TARGET using SQL
If you only have a few genomes - or not enough RAM to hold the entire target database in memory - this is the probably the best option for you.

If you provide FastAAI with genomes or only proteins (not proteins and HMMs), this FastAAI module will produce the required protein and HMM files as needed
and place them in the output directory, just like it does while building a database.

Once these inputs are ready to be queried against the database (each has both a protein and HMM file), they will be processed independently, 1 per thread at a time.

Note: Protein and HMM files generated during this query can be supplied to build a FastAAI database from proteins and HMMs using the build_db module, without redoing preprocessing.
'''

    #(flags, keyword arguments) in the order they should appear in the help text.
    option_table = [
        (('-g', '--genomes'), {'dest': 'genomes', 'default': None, 'help': 'A directory containing genomes in FASTA format.'}),
        (('-p', '--proteins'), {'dest': 'proteins', 'default': None, 'help': 'A directory containing protein amino acids in FASTA format.'}),
        (('-m', '--hmms'), {'dest': 'hmms', 'default': None, 'help': 'A directory containing the results of an HMM search on a set of proteins.'}),
        (('--target',), {'dest': 'target', 'default': None, 'help': 'A path to the FastAAI database you wish to use as the target'}),
        (('-o', '--output'), {'dest': 'output', 'default': "FastAAI", 'help': 'The directory where FastAAI will place the result of this query and any protein or HMM files it has to generate. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.'}),
        (('--genome_file',), {'dest': 'gf', 'default': None, 'help': 'Alternative way to supply genomes. A file containing paths to your genome files, 1 per line.'}),
        (('--protein_file',), {'dest': 'pf', 'default': None, 'help': 'Alternative way to supply proteins. A file containing paths to your protein files, 1 per line.'}),
        (('--hmm_file',), {'dest': 'hf', 'default': None, 'help': 'Alternative way to supply HMMs. A file containing paths to your HMM files, 1 per line.'}),
        (('--threads',), {'dest': 'threads', 'type': int, 'default': 1, 'help': 'The number of processors to use. Default 1.'}),
        (('--verbose',), {'dest': 'verbose', 'action': 'store_true', 'help': 'Print minor updates to console. Major updates are printed regardless.'}),
        (('--do_stdev',), {'dest': "do_stdev", 'action': 'store_true', 'help': 'Off by default. Calculate std. deviations on Jaccard indicies. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.'}),
    ]

    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=module_description)
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    #Unknown arguments are tolerated and dropped.
    parsed, _ignored = parser.parse_known_args()

    return parser, parsed
2441
+
2442
def sql_query_thread_starter(kmer_cts, protein_cts):
    """Pool initializer: publish the target count tables as module-level globals.

    Stores kmer_cts as the global target_kmer_cts and protein_cts as the
    global target_protein_counts in the process that runs this function.
    """
    global target_kmer_cts, target_protein_counts
    target_kmer_cts, target_protein_counts = kmer_cts, protein_cts
2447
+
2448
+
2449
def _sql_query_progress(count, total, redraw=False):
    """Print the sql_query progress bar on a best-effort basis.

    count/total are the number of finished and total query genomes. When
    redraw is True the cursor is first moved up one line so the previous bar
    is overwritten.
    """
    percentage = (count / total) * 100 if total else 0
    try:
        if redraw:
            sys.stdout.write('\033[A')
        sys.stdout.write("Completion" + ' |' + ('#' * int(percentage / 2)).ljust(50) + '| ' + ('%.2f' % percentage).rjust(7) + '% (Query genome ' + str(count) + " of " + str(total) + ' done at ' + curtime() + ' )\n')
        sys.stdout.flush()
    except Exception:
        # It's not really a big deal if the progress bar cannot be printed.
        pass


def sql_query(genomes, proteins, hmms, gf, pf, hf, db_name, output, threads, verbose, do_stdev):
    """Query prepared inputs against an existing FastAAI database.

    Parameters:
        genomes, proteins, hmms: input directories (or None), passed to
            check_out_input_files and advance_inputs.
        gf, pf, hf: files listing input paths (or None), passed the same way.
        db_name: path to the target database; the process exits if it is missing.
        output: output directory; results are reported under <output>/results.
        threads: number of worker processes in the multiprocessing pool.
        verbose: when True, print minor updates and a progress bar.
        do_stdev: when True the workers run do_sql_query, otherwise
            do_sql_query_no_SD.

    Returns:
        None. Calls sys.exit() when the database is missing, the inputs cannot
        be interpreted, or the output directories cannot be prepared.
    """
    if not os.path.exists(db_name):
        print("")
        print("FastAAI can't find your database:", db_name)
        print("Are you sure that the path you've given to the database is correct and that the database exists?")
        print("FastAAI exiting.")
        print("")
        sys.exit()

    start = check_out_input_files(genomes, proteins, hmms, gf, pf, hf)

    # If something failed, we stop.
    if start is None:
        sys.exit()

    good_to_go = prepare_directories(output, start, "query")

    if not good_to_go:
        print("Exiting FastAAI")
        sys.exit()

    print("")
    print("Preparing inputs for querying...")

    prepared_files = advance_inputs(genomes = genomes, proteins = proteins, hmms = hmms, genomes_file = gf, proteins_file = pf, hmms_file = hf, output = output, threads = threads, verbose = verbose, db_name = db_name)

    if prepared_files is None:
        return None

    # We don't want to get more than we have to: only accessions seen in a query matter.
    query_accessions_detected = set()
    for file in prepared_files:
        query_accessions_detected.update(file.best_hits.values())
    query_accessions_detected = list(query_accessions_detected)

    if verbose:
        print("")
        print("Gathering database information...")

    database = fastaai_database(db_name)
    database.activate_connection()
    database.load_genome_index()
    database.load_accessions()
    database.close_connection()

    accession_index = generate_accessions_index()

    # Translate accession names to their indices.
    query_accessions_detected = [accession_index[a] for a in query_accessions_detected]

    # Per-accession array of k-mer counts, indexed by target genome.
    target_kmer_cts = {}
    for accession in np.intersect1d(database.accessions, query_accessions_detected):
        target_kmer_cts[accession] = np.zeros(len(database.genome_index), dtype = np.int16)
        for g in database.gak:
            if accession in database.gak[g]:
                target_kmer_cts[accession][g] = database.gak[g][accession]

    # Number of accessions recorded for each target genome.
    target_protein_counts = np.zeros(len(database.gak), dtype = np.int16)
    for g in database.gak:
        target_protein_counts[g] = len(database.gak[g])

    # The count tables above replace this; drop it before handing the database to workers.
    database.gak = None

    formatted_dataset = [(database, file.basename, file.best_hits_kmers, os.path.normpath(output+"/results")) for file in prepared_files]

    if verbose:
        print("")
        print("-"*100)
        print("")

    count = 0
    total = len(formatted_dataset)
    last_pct = 0

    print("Beginning AAI calculation")

    if verbose:
        print("")
        _sql_query_progress(count, total)

    # The two workers differ only in whether standard deviations are computed.
    worker = do_sql_query if do_stdev else do_sql_query_no_SD

    pool = multiprocessing.Pool(threads, initializer = sql_query_thread_starter, initargs = (target_kmer_cts, target_protein_counts,))

    # Process as we go; the results themselves are not used here, only counted.
    for _finished in pool.imap(worker, formatted_dataset):
        count += 1
        if verbose:
            filled = int(((count / total) * 100) / 2)
            # Only redraw when the bar visibly advances or the run is complete.
            if filled > last_pct or count == total:
                _sql_query_progress(count, total, redraw=True)
                last_pct = filled

    pool.close()
    pool.join()

    if verbose:
        print("")
        print("-"*100)
        print("")

    if os.path.exists(output+"/temp"):
        os.rmdir(output+"/temp")

    print("FastAAI query complete! Results at:", os.path.normpath(output + "/results"))
    return None
2630
+
2631
+
2632
+ #Check to see if the file exists and is a valid fastAAI db
2633
def assess_db(path):
    """Check whether path holds a usable FastAAI database, creating one if absent.

    Returns one of:
        "exists"           - the file exists, has more than two tables, and
                             includes "genome_index" and "genome_acc_kmer_counts".
        "wrong format"     - the file exists but failed that check or could
                             not be read.
        "created"          - nothing existed at path and a new parent database
                             was initialized there.
        "unable to create" - nothing existed at path and creation failed.
    """
    if not os.path.exists(path):
        try:
            db = fastaai_database(path)
            db.activate_connection()
            db.initialize_parent_database()
            db.close_connection()
        except Exception:
            return "unable to create"
        return "created"

    db = fastaai_database(path)
    try:
        db.activate_connection()
        sql = "SELECT name FROM sqlite_master WHERE type='table'"

        # Return bare table names instead of 1-tuples for this query only.
        db.cursor.row_factory = lambda cursor, row: row[0]
        tables = db.cursor.execute(sql).fetchall()
        db.cursor.row_factory = None

        db.close_connection()
    except Exception:
        # Best-effort cleanup so a failed check does not leave the file open.
        try:
            db.close_connection()
        except Exception:
            pass
        return "wrong format"

    if len(tables) > 2 and "genome_index" in tables and "genome_acc_kmer_counts" in tables:
        return "exists"

    return "wrong format"
2666
+
2667
+ #Add one FastAAI DB to another FastAAI DB
2668
def merge_db_opts():
    """Build the command-line parser for the merge_db module and parse sys.argv.

    Returns:
        A (parser, args) pair: the configured ArgumentParser and the namespace
        returned by parse_known_args(). Unrecognized arguments are ignored.
    """
    # The example uses --recipient: a single-dash "-recipient" would be read
    # by argparse as the short option -r with the attached value "ecipient".
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
            description='''
This FastAAI module allows you to add the contents of one or more FastAAI databases to another.
You must have at least two already-created FastAAI databases using the build_db module before this module can be used.

Supply a comma-separated list of at least one donor database and a single recipient database.
If the recipient already exists, then genomes in all the donors will be added to the recipient.
If the recipient does not already exist, a new database will be created, and the contents of all the donors will be added to it.

Example:
FastAAI.py merge_db --donors databases/db1.db,databases/db2.db --recipient databases/db3.db --threads 3
This command will create a new database called "db3.db", merge the data in db1.db and db2.db, and then add the merged data into db3.db

Only the recipient database will be modified; the donors will be left exactly as they were before running this module.
''')

    parser.add_argument('-d', '--donors', dest = 'donors', default = None, help = 'Comma-separated string of paths to one or more donor databases. The genomes FROM the donors will be added TO the recipient and the donors will be unaltered')

    parser.add_argument('-r', '--recipient', dest = 'recipient', default = None, help = 'Path to the recipient database. Any genomes FROM the donor database not already in the recipient will be added to this database.')

    parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

    parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')

    args, unknown = parser.parse_known_args()

    return parser, args
2696
+
2697
def merge_db_thread_starter(rev_index, per_db_accs):
    """Pool initializer: publish the merge lookup tables as module-level globals.

    Stores rev_index as the global reverse_genome_indicies and per_db_accs as
    the global accs_per_db in the process that runs this function.
    """
    global reverse_genome_indicies, accs_per_db
    reverse_genome_indicies, accs_per_db = rev_index, per_db_accs
2702
+
2703
+
2704
def _merge_db_progress(count, total, redraw=False):
    """Print the merge_db progress bar on a best-effort basis.

    count/total are the number of finished and total accessions. When redraw
    is True the cursor is first moved up one line so the previous bar is
    overwritten.
    """
    try:
        percentage = (count / total) * 100
        if redraw:
            sys.stdout.write('\033[A')
            sys.stdout.flush()
        sys.stdout.write("Completion" + ' |' + ('#' * int(percentage / 2)).ljust(50) + '| ' + ('%.2f' % percentage).rjust(7) + '% ( ' + str(count) + " of " + str(total) + ' done at ' + curtime() + " )\n")
        sys.stdout.flush()
    except Exception:
        # It's not really a big deal if the progress bar cannot be printed
        # (this also covers total == 0).
        pass


def merge_db(recipient, donors, verbose, threads):
    """Merge one or more donor FastAAI databases into a recipient database.

    Parameters:
        recipient: path to the recipient database; assess_db creates it when
            it does not exist yet.
        donors: comma-separated string of donor database paths.
        verbose: when True, print progress bars for both phases.
        threads: number of worker processes used to format per-accession data.

    Returns:
        None. Returns early when donors or recipient is None; calls sys.exit()
        when no donor is valid or the recipient cannot be found or created.
    """
    if donors is None or recipient is None:
        print("Either donor or target not given. FastAAI is exiting.")
        return None

    # Prettier on the CLI
    print("")

    valid_donors = []
    for d in donors.split(","):
        if not os.path.exists(d):
            print("Donor database", d, "not found! Are you sure the path is correct and this donor exists? This database will be skipped.")
            continue

        if d == recipient:
            print("Donor database", d, "is the same as the recipient. This database will be skipped.")
            continue

        check = assess_db(d)
        if check == "exists":
            if d not in valid_donors:
                valid_donors.append(d)
            else:
                print("It appears that database", d, "was already added to the list of donors. Did you type it twice in the list of donors? Skipping it.")
        elif check == "created":
            print("Donor database", d, "not found! Skipping.")
        else:
            print("Something was wrong with supplied database:", d+". A status check found:", check)

    if len(valid_donors) == 0:
        print("None of the supplied donor databases were able to be accessed. FastAAI cannot continue if none of these databases are valid. Exiting.")
        sys.exit()

    recip_check = assess_db(recipient)

    if recip_check == "created" or recip_check == "exists":
        for donor in valid_donors:
            print("Donor database:", donor, "will be added to recipient database:", recipient)

        recipient = fastaai_database(recipient)
    else:
        print("I couldn't find or create the recipient database at", recipient+".", "Does the folder you're trying to place this database in exist, and do you have permission to write files to it? FastAAI exiting.")
        sys.exit()

    donor_dbs = [fastaai_database(d) for d in valid_donors]

    all_accessions = set()
    joint_genome_index = {}
    joint_genome_counts = {}
    max_index = 0

    # The idea here is to create a set of arrays whose values span the range of
    # each donor's genomes and translate those into an overall list, in order.
    reverse_genome_indices = {}
    accs_per_db = {}

    # Load recipient data, if any.
    if recip_check == "exists":
        recipient.activate_connection()
        recipient.just_accessions()
        recipient.load_genome_index()
        recipient.close_connection()

        all_accessions = all_accessions.union(recipient.accessions)
        accs_per_db[recipient.path] = recipient.accessions
        recipient.accessions = None
        max_index = len(recipient.genome_index)

        joint_genome_index = dict(zip(recipient.genome_index.keys(), recipient.genome_index.values()))
        joint_genome_counts = dict(zip(recipient.protein_counts_by_genome.keys(), recipient.protein_counts_by_genome.values()))

        # The recipient's own genome indices need no translation; only the donors need to match.
        reverse_genome_indices[recipient.path] = np.array(sorted(recipient.genome_index.values()), dtype = np.int32)
        recipient.genome_index = None

    # Donors should always exist, never be created.
    for d in donor_dbs:
        d.activate_connection()
        d.just_accessions()
        d.load_genome_index()
        d.close_connection()
        accs_per_db[d.path] = d.accessions
        all_accessions = all_accessions.union(d.accessions)
        d.accessions = None

        translated = []
        # Database construction indicates this should always be 0-COUNT
        for g in sorted(d.genome_index.keys()):
            if g not in joint_genome_index:
                translated.append(max_index)
                joint_genome_index[g] = max_index
                # Map the counts on.
                joint_genome_counts[max_index] = d.protein_counts_by_genome[d.genome_index[g]]
                max_index += 1
            else:
                translated.append(joint_genome_index[g])

        reverse_genome_indices[d.path] = np.array(translated, dtype = np.int32)
        d.genome_index = None

    all_accessions = list(all_accessions)

    acc_args = [(acc, donor_dbs, recipient) for acc in all_accessions]
    total_counts = len(acc_args)

    if not os.path.exists("FastAAI_temp"):
        os.mkdir("FastAAI_temp")

    print("")
    print("Formatting data to add to database. Started at", curtime())

    count = 0
    if verbose:
        print("")
        _merge_db_progress(count, total_counts)

    pool = multiprocessing.Pool(threads, initializer=merge_db_thread_starter, initargs = (reverse_genome_indices, accs_per_db,))

    # Each result is [accession table name, path of the temporary database holding it].
    quiverfull = []
    for result in pool.imap_unordered(pull_and_merge_accession, acc_args):
        quiverfull.append([result[0], result[1]])

        if verbose:
            count += 1
            _merge_db_progress(count, total_counts, redraw=True)

    pool.close()
    pool.join()

    print("")
    print("Adding data to final database. Started at", curtime())

    count = 0
    if verbose:
        print("")
        _merge_db_progress(count, total_counts)

    recipient.activate_connection()
    genome_list_update_sql = "INSERT OR REPLACE INTO genome_index VALUES (?, ?, ?)"
    genome_reindex = [(g, joint_genome_index[g], joint_genome_counts[joint_genome_index[g]]) for g in joint_genome_index]

    recipient.cursor.executemany(genome_list_update_sql, genome_reindex)
    recipient.connection.commit()

    del genome_reindex

    for acc, child in quiverfull:
        recipient.add_child_to_parent(acc, child, genomes_too = True, update_gak = True)

        if verbose:
            count += 1
            _merge_db_progress(count, total_counts, redraw=True)

    os.rmdir("FastAAI_temp")

    # We're only ever increasing the DB size, so we don't actually need to vacuum it.
    print("\nDatabases merged!")

    return None
2939
+
2940
def pull_and_merge_accession(args):
    """Collect one accession's tables from every database into a temporary database.

    args is (accession index, list of donor databases, recipient database).
    Reads the globals reverse_genome_indicies and accs_per_db, which are set
    by merge_db_thread_starter.

    Returns:
        [accession table name, path of the temporary database written].
    """
    acc, donor_dbs, recipient = args[0], args[1], args[2]

    # Map accession index -> SQL-friendly table name ("." replaced by "_").
    accession_index = generate_accessions_index()
    accession_inverter = {accession_index[name]: name.replace(".", "_") for name in accession_index}

    acc_name = accession_inverter[acc]
    acc_name_gens = acc_name + "_genomes"

    temp_db = fastaai_database("FastAAI_temp/"+acc_name+".db")
    temp_db.activate_connection()

    for create_command in (
        "CREATE TABLE IF NOT EXISTS " + acc_name + " (kmer INTEGER PRIMARY KEY, genomes array)",
        "CREATE TABLE IF NOT EXISTS " + acc_name + "_genomes (genome INTEGER PRIMARY KEY, kmers array)",
    ):
        temp_db.cursor.execute(create_command)
        temp_db.connection.commit()

    # Databases that hold this accession: donors first, then the recipient.
    # Recipient is not guaranteed to be in accs_per_db - if it was created anew, it wouldn't be.
    sources = [db for db in donor_dbs if acc in accs_per_db[db.path]]
    if recipient.path in accs_per_db and acc in accs_per_db[recipient.path]:
        sources.append(recipient)

    # Pass 1: kmer -> genomes, with genome indices translated to the joint index.
    kmer_select = "SELECT * FROM " + acc_name
    merged_kmers = {}
    for source in sources:
        source.activate_connection()

        for row in source.cursor.execute(kmer_select).fetchall():
            kmer, genomes = row[0], row[1]
            translated_genomes = reverse_genome_indicies[source.path][genomes]

            if kmer in merged_kmers:
                merged_kmers[kmer] = np.union1d(merged_kmers[kmer], translated_genomes)
            else:
                merged_kmers[kmer] = translated_genomes

        source.close_connection()

    # Byte-string these.
    kmer_rows = [(kmer, members.tobytes()) for kmer, members in merged_kmers.items()]
    del merged_kmers

    temp_db.cursor.executemany("INSERT INTO " + acc_name + " VALUES (?,?)", kmer_rows)
    temp_db.connection.commit()

    del kmer_rows

    # Pass 2: genome -> kmers.
    genome_select = "SELECT * FROM " + acc_name_gens
    merged_genomes = {}
    for source in sources:
        source.activate_connection()

        for row in source.cursor.execute(genome_select).fetchall():
            genome, kmers = row[0], row[1]
            translated_genome = int(reverse_genome_indicies[source.path][genome])
            # Each genome gets added only once, no dupes: first occurrence wins.
            merged_genomes.setdefault(translated_genome, kmers)

        source.close_connection()

    # Byte-string these.
    genome_rows = [(genome, kmers.tobytes()) for genome, kmers in merged_genomes.items()]

    temp_db.cursor.executemany("INSERT INTO " + acc_name_gens + " VALUES (?,?)", genome_rows)
    temp_db.connection.commit()

    temp_db.close_connection()

    return [acc_name, temp_db.path]
3054
+
3055
+ #Query 1 genome vs. 1 target using Carlos' method - just needs query, target, threads
3056
def single_query_opts():
    """Build the command-line parser for the single_query module and parse sys.argv.

    Returns:
        A (parser, args) pair: the configured ArgumentParser and the namespace
        returned by parse_known_args(). Unrecognized arguments are ignored.
    """
    module_description = '''
This FastAAI module takes a single query genome, protein, or protein and HMM pair and a single target genome, protein, or protein and HMM pair as inputs and calculates AAI between the two.

If you supply a genome as either query or target, a protein and HMM file will be made for the genome.
If you supply a protein as either query or target, an HMM file will be made for it.
If you supply both an HMM and protein, the search will start right away. You cannot provide only an HMM.

No database will be built, and you cannot query multiple genomes with this module.

If you wish to query multiple genomes against themselves in all vs. all AAI search, use aai_index instead.
If you wish to query multiple genomes against multiple targets, use multi_query instead.
'''

    cli = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                  description=module_description)

    # (option strings, add_argument keyword settings), registered in this order.
    option_table = [
        (('-qg', '--query_genome'), dict(dest='query_genome', default=None, help='Query genome')),
        (('-tg', '--target_genome'), dict(dest='target_genome', default=None, help='Target genome')),
        (('-qp', '--query_protein'), dict(dest='query_protein', default=None, help='Query protein')),
        (('-tp', '--target_protein'), dict(dest='target_protein', default=None, help='Target protein')),
        (('-qh', '--query_hmm'), dict(dest='query_hmm', default=None, help='Query HMM')),
        (('-th', '--target_hmm'), dict(dest='target_hmm', default=None, help='Target HMM')),
        (('-o', '--output'), dict(dest='output', default="FastAAI", help='The directory where FastAAI will place the result of this query. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')),
        (('--threads',), dict(dest='threads', type=int, default=1, help='The number of processors to use. Default 1.')),
        (('--verbose',), dict(dest='verbose', action='store_true', help='Print minor updates to console. Major updates are printed regardless.')),
    ]
    for option_strings, settings in option_table:
        cli.add_argument(*option_strings, **settings)

    parsed, _unrecognized = cli.parse_known_args()

    return cli, parsed
3089
+
3090
def do_single_query(input_file):
    """Run preprocess() on the given input and hand the same object back."""
    prepared = input_file
    prepared.preprocess()
    return prepared
3093
+
3094
def intersect_kmer_lists(pair):
    """Return the Jaccard index of the two k-mer arrays in pair.

    pair[0] and pair[1] are numpy arrays. The union size is derived as
    len(a) + len(b) - |intersection|, so each array is expected to hold
    unique values. Returns 0.0 when both arrays are empty rather than
    raising ZeroDivisionError.
    """
    first, second = pair[0], pair[1]
    intersection = np.intersect1d(first, second).shape[0]
    union = first.shape[0] + second.shape[0] - intersection
    if union == 0:
        # Two empty k-mer sets share nothing; avoid dividing by zero.
        return 0.0
    return intersection / union
3098
+
3099
def kaai_to_aai(kaai):
    """Transform a kAAI value (mean Jaccard index) into an estimated AAI percentage."""
    scaled_log = -0.2607023 * np.log(kaai)
    decay = np.exp(-(scaled_log ** (1/3.435)))
    aai_hat = (-0.3087057 + 1.810741 * decay) * 100

    return aai_hat
3104
+
3105
+ #This one's unique. It doesn't do anything with the DB, which means it doesn't access any other functionality outside of the input_file class. It just advances a pair of inputs in parallel and does intersections.
3106
def _single_query_input(genome, protein, hmm, role, output, verbose):
    """Build the input_file for one side of a single query; role is "query" or "target".

    Exits the program when nothing was supplied, when a genome is combined
    with a protein, or when an HMM is given without a genome or protein.
    """
    if genome is None and protein is None and hmm is None:
        print("Please supply a " + role + " genome, protein, or protein and HMM pair.")
        sys.exit()

    prepared = None

    if genome is not None:
        prepared = input_file(genome, output, verbose)
        prepared.set_genome(genome)

    if protein is not None:
        if prepared is not None:
            print("If you supply a genome for either query or target, you must supply ONLY the genome, not a genome and either a protein or HMM.")
            sys.exit()
        prepared = input_file(protein, output, verbose)
        prepared.set_protein(protein)

    if hmm is not None:
        if prepared is None:
            print("If you supply an HMM for either query or target, you must also supply the protein from which the HMM was generated.")
            sys.exit()
        prepared.set_hmm(hmm)

    return prepared


def single_query(query_args, target_args, shared_args):
    """Estimate AAI between one query and one target without building a database.

    Parameters:
        query_args: (genome, protein, hmm) paths for the query; unused ones are None.
        target_args: (genome, protein, hmm) paths for the target; unused ones are None.
        shared_args: (output directory, threads, verbose).

    Writes one line (query name, target name, mean Jaccard, Jaccard std. dev.,
    number of shared accessions, estimated AAI) to
    <output>/results/<query>_vs_<target>.aai.txt.

    Returns:
        None. Calls sys.exit() on invalid input combinations or when the
        output directories cannot be prepared.
    """
    output, threads, verbose = shared_args[0], shared_args[1], shared_args[2]

    query = _single_query_input(query_args[0], query_args[1], query_args[2], "query", output, verbose)
    target = _single_query_input(target_args[0], target_args[1], target_args[2], "target", output, verbose)

    if query.basename == target.basename:
        print("You've selected the same query and target genome. The AAI is 100%.")
        print("FastAAI exiting.")
        return None

    # Ordered from least to most preprocessed; the earlier of the two starting
    # points decides which directories have to be prepared.
    statuses = ["genome", "protein", "protein and hmm"]
    query_stat = statuses.index(query.status)
    target_stat = statuses.index(target.status)
    minimum_status = statuses[min(query_stat, target_stat)]

    start_printouts = ["[Genome] Protein Protein+HMM", " Genome [Protein] Protein+HMM", "Genome Protein [Protein+HMM]"]

    print("")
    print("Query start: ", start_printouts[query_stat])
    print("Target start:", start_printouts[target_stat])
    print("")

    good_to_go = prepare_directories(output, minimum_status, "build")

    if not good_to_go:
        print("Exiting FastAAI")
        sys.exit()

    qname = query.basename
    tname = target.basename

    name = qname + "_vs_" + tname + ".aai.txt"
    results_dir = os.path.normpath(output) + "/results"
    result_path = results_dir + "/" + name
    print("Output will be located at", result_path)

    # Give the data for kmer indexing to the parallel processes
    global kmer_index
    kmer_index = create_kmer_index()

    advance_me = [query, target]
    # Two inputs, so at most two workers are useful.
    pool = multiprocessing.Pool(min(threads, 2))

    results = pool.map(do_single_query, advance_me)

    pool.close()
    pool.join()

    query = results[0]
    target = results[1]

    accs_to_view = set(query.best_hits_kmers.keys()).intersection(set(target.best_hits_kmers.keys()))

    if not accs_to_view:
        # Nothing to compare, and a pool of zero processes is an error.
        print("The query and target share no detected proteins, so AAI cannot be estimated.")
        print("FastAAI exiting.")
        return None

    seq_pairs = [[query.best_hits_kmers[acc], target.best_hits_kmers[acc]] for acc in accs_to_view]

    pool = multiprocessing.Pool(min(threads, len(accs_to_view)))

    results = np.array(pool.map(intersect_kmer_lists, seq_pairs))

    pool.close()
    pool.join()

    jacc_mean = np.mean(results)
    jacc_std = np.std(results)
    actual_prots = len(results)
    aai_est = round(kaai_to_aai(jacc_mean), 2)

    # Estimates outside 30-90% are reported only as bounds.
    if aai_est > 90:
        aai_est = "> 90%"
    elif aai_est < 30:
        aai_est = "< 30%"

    # Write where the message above says the output will be; make sure the
    # results directory is there before opening the file.
    os.makedirs(results_dir, exist_ok=True)
    with open(result_path, "w") as result_handle:
        print(qname, tname, round(jacc_mean, 4), round(jacc_std, 4), actual_prots, aai_est, file = result_handle)

    print("FastAAI single query done! Estimated AAI:", aai_est)
3237
+
3238
def aai_index_opts():
    """Create the command-line parser for the aai_index module and parse sys.argv.

    Arguments the parser does not recognise (such as the module name itself)
    are tolerated and discarded, because parsing is done with
    parse_known_args().

    Returns:
        A (parser, args) tuple: the configured ArgumentParser and the parsed
        argument namespace.
    """
    description = "\nThis FastAAI module takes a set of genomes, proteins, or proteins and HMMs, creates a FastAAI database from them, and then executes an all vs. all AAI search of the genomes in the database\n"

    cli = argparse.ArgumentParser(description=description,
                                  formatter_class=argparse.RawTextHelpFormatter)

    # Each entry is (option strings, add_argument keyword settings); the order
    # here is the order shown in the help output.
    option_table = (
        # Directory-based inputs.
        (('-g', '--genomes'), dict(dest='genomes', default=None, help='A directory containing genomes in FASTA format.')),
        (('-p', '--proteins'), dict(dest='proteins', default=None, help='A directory containing protein amino acids in FASTA format.')),
        (('-m', '--hmms'), dict(dest='hmms', default=None, help='A directory containing the results of an HMM search on a set of proteins.')),
        # Database name and output location.
        (('-d', '--database'), dict(dest='db_name', default="FastAAI_database.sqlite.db", help='The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory.')),
        (('-o', '--output'), dict(dest='output', default="FastAAI", help='The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')),
        # File-of-paths inputs.
        (('--genome_file',), dict(dest='gf', default=None, help='Alternative way to supply genomes. A file containing paths to your genome files, 1 per line.')),
        (('--protein_file',), dict(dest='pf', default=None, help='Alternative way to supply proteins. A file containing paths to your protein files, 1 per line.')),
        (('--hmm_file',), dict(dest='hf', default=None, help='Alternative way to supply HMMs. A file containing paths to your HMM files, 1 per line.')),
        # Run-time behaviour.
        (('--verbose',), dict(dest='verbose', action='store_true', help='Print minor updates to console. Major updates are printed regardless.')),
        (('--threads',), dict(dest='threads', type=int, default=1, help='The number of processors to use. Default 1.')),
        # Query tuning.
        (('--do_stdev',), dict(dest="do_stdev", action='store_true', help='Off by default. Calculate std. deviations on Jaccard indicies. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')),
        (('--unlimited_resources',), dict(dest="large_mem", action='store_true', help='Off by default. Use a faster algorithm that consumes more RAM. FastAAI cannot calculate std. deviations with this algorithm, so they will automatically be skipped.')),
        (('--mem',), dict(dest="precision", default="med", help='One of low/med/high. Medium by default. Save RAM in return for slightly rounded AAI estimates. Only affects FastAAI if you are also using the "--unlimited_resources" flag.')),
    )

    for option_strings, settings in option_table:
        cli.add_argument(*option_strings, **settings)

    parsed, _unrecognised = cli.parse_known_args()

    return cli, parsed
3267
+
3268
+ #Build a DB and query a dataset vs. self
3269
def aai_index(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose, do_stdev, memory_use, unlimited_resources):
    """Build a database, then query that database against itself.

    The build arguments are forwarded unchanged to build_db(). When the build
    reports success, the database path <output>/database/<db_name> is passed to
    db_query() as both the query and the target.

    Always returns None.
    """
    built = build_db(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose)

    if not built:
        print("Database could not be built. FastAAI exiting.")
        return None

    # The same path serves as query and target for the all vs. all search.
    db_path = os.path.normpath(output + "/database/" + db_name)
    db_query(db_path, db_path, verbose, output, threads, do_stdev, memory_use, unlimited_resources)

    return None
3279
+
3280
+ #Command-line options for multi_query (build 2 DBs, then query the query DB vs. the target DB)
3281
def multi_query_opts():
    """Create the command-line parser for the multi_query module and parse sys.argv.

    Arguments the parser does not recognise (such as the module name itself)
    are tolerated and discarded, because parsing is done with
    parse_known_args().

    Returns:
        A (parser, args) tuple: the configured ArgumentParser and the parsed
        argument namespace.
    """
    description = (
        "\n"
        "This FastAAI module takes a set of query genomes/proteins/proteins+HMMs and a set of target genomes/proteins/proteins+HMMs.\n"
        "Two FastAAI databases will be created, one for the query and one for the target, then the query database will have AAI calculated against the target database\n"
    )

    cli = argparse.ArgumentParser(description=description,
                                  formatter_class=argparse.RawTextHelpFormatter)

    # Each entry is (option strings, add_argument keyword settings); the order
    # here is the order shown in the help output.
    option_table = (
        # Query inputs, directory based.
        (('-qg', '--query_genomes'), dict(dest='query_genomes', default=None, help='A directory containing query genomes in FASTA format.')),
        (('-qp', '--query_proteins'), dict(dest='query_proteins', default=None, help='A directory containing query protein amino acids in FASTA format.')),
        (('-qm', '--query_hmms'), dict(dest='query_hmms', default=None, help='A directory containing the results of an HMM search on the set of query proteins.')),
        # Target inputs, directory based.
        (('-tg', '--target_genomes'), dict(dest='target_genomes', default=None, help='A directory containing target genomes in FASTA format.')),
        (('-tp', '--target_proteins'), dict(dest='target_proteins', default=None, help='A directory containing target protein amino acids in FASTA format.')),
        (('-tm', '--target_hmms'), dict(dest='target_hmms', default=None, help='A directory containing the results of an HMM search on the set of target proteins.')),
        # Database names.
        (('-qd', '--query_database'), dict(dest='query_db_name', default="FastAAI_query_database.sqlite.db", help='The name of the query database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory.')),
        (('-td', '--target_database'), dict(dest='target_db_name', default="FastAAI_target_database.sqlite.db", help='The name of the target database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory.')),
        # Output location.
        (('-o', '--output'), dict(dest='output', default="FastAAI", help='The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')),
        # Query inputs, file-of-paths based.
        (('--query_genome_file',), dict(dest='qgf', default=None, help='Alternative way to supply genomes. A file containing paths to your query genome files, 1 per line.')),
        (('--query_protein_file',), dict(dest='qpf', default=None, help='Alternative way to supply proteins. A file containing paths to your query protein files, 1 per line.')),
        (('--query_hmm_file',), dict(dest='qhf', default=None, help='Alternative way to supply HMMs. A file containing paths to your query HMM files, 1 per line.')),
        # Target inputs, file-of-paths based.
        (('--target_genome_file',), dict(dest='tgf', default=None, help='Alternative way to supply genomes. A file containing paths to your target genome files, 1 per line.')),
        (('--target_protein_file',), dict(dest='tpf', default=None, help='Alternative way to supply proteins. A file containing paths to your target protein files, 1 per line.')),
        (('--target_hmm_file',), dict(dest='thf', default=None, help='Alternative way to supply HMMs. A file containing paths to your target HMM files, 1 per line.')),
        # Run-time behaviour.
        (('--threads',), dict(dest='threads', type=int, default=1, help='The number of processors to use. Default 1.')),
        (('--verbose',), dict(dest='verbose', action='store_true', help='Print minor updates to console. Major updates are printed regardless.')),
        # Query tuning.
        (('--do_stdev',), dict(dest="do_stdev", action='store_true', help='Off by default. Calculate std. deviations on Jaccard indicies. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')),
        (('--unlimited_resources',), dict(dest="large_mem", action='store_true', help='Off by default. Use a faster algorithm that consumes more RAM. FastAAI cannot calculate std. deviations with this algorithm, so they will automatically be skipped.')),
        (('--mem',), dict(dest="precision", default="med", help='One of low/med/high. Medium by default. Save RAM in return for slightly rounded AAI estimates. Only affects FastAAI if you are also using the "--unlimited_resources" flag.')),
    )

    for option_strings, settings in option_table:
        cli.add_argument(*option_strings, **settings)

    parsed, _unrecognised = cli.parse_known_args()

    return cli, parsed
3320
+
3321
+ #Build 2 DBs and query query DB vs target DB
3322
def multi_query(query_arg_list, target_arg_list, shared_args):
    """Build a query database and a target database, then query one against the other.

    Args:
        query_arg_list: [genomes, proteins, hmms, gf, pf, hf, db_name] for the
            query database, in that order.
        target_arg_list: the same seven items for the target database.
        shared_args: [output, threads, verbose, do_stdev, mem, efficient], in
            that order.

    Both databases are placed at <output>/database/<db_name>. If either build
    reports failure, a message is printed and the query step is skipped.

    Always returns None.
    """
    output, threads, verbose, do_stdev, mem, efficient = shared_args[:6]

    db_paths = []
    for label, arg_list in (("Query", query_arg_list), ("Target", target_arg_list)):
        genomes, proteins, hmms, gf, pf, hf, db_name = arg_list[:7]

        built = build_db(genomes, proteins, hmms, db_name, output, threads, gf, pf, hf, verbose)
        # aai_index treats a falsy build_db result as a failed build; do the
        # same here rather than querying a database that was never created.
        if not built:
            print(label + " database could not be built. FastAAI exiting.")
            return None

        db_paths.append(os.path.normpath(output + "/database/" + db_name))

    query_db_path, target_db_path = db_paths
    db_query(query_db_path, target_db_path, verbose, output, threads, do_stdev, mem, efficient)

    return None
3335
+
3336
+ '''
3337
+ Main
3338
+ '''
3339
# Module menu shared by the "no module given" and "unknown module" screens.
_MODULE_MENU = (
    "------------------------------------------- Quick Usage Options -------------------------------------------",
    "",
    " single_query | Quickly query ONE query genome against ONE target genome",
    " multi_query | Create a query DB and a target DB, then calculate query vs. target AAI",
    " aai_index | Create a database from multiple genomes and do an all vs. all AAI index of the genomes",
    "",
    "-------------------------------------- Database Construction Options --------------------------------------",
    "",
    " build_db | Create or add to a FastAAI database from genomes, proteins, or proteins and HMMs",
    " merge_db | Add the contents of one FastAAI DB to another",
    "",
    "---------------------------------------------- Query Options ----------------------------------------------",
    "",
    " simple_query | Query a genome or protein (one or many) against an existing FastAAI database",
    " db_query | Query the genomes in one FastAAI database against the genomes in another FastAAI database",
    "",
    "-----------------------------------------------------------------------------------------------------------",
    "",
    " To select a module, enter 'FastAAI [module]' into the command line!",
    "",
)


def _print_module_menu_and_exit(intro_lines):
    """Print intro_lines followed by the shared module menu, then exit."""
    for line in intro_lines:
        print(line)
    for line in _MODULE_MENU:
        print(line)
    sys.exit()


def _exit_with_help_if_no_options(parser):
    """Show the module's help and exit when only the module name was given."""
    if len(sys.argv) < 3:
        # print_help() writes the help text itself and returns None, so it
        # must not be wrapped in print() (that emitted a stray "None" line).
        parser.print_help()
        sys.exit()


def main():
    """Command-line entry point: pick a FastAAI module from sys.argv[1] and run it.

    With no module name, or with a name that is not a supported module, the
    module menu is printed and the program exits. With a module name but no
    further arguments, that module's help is printed and the program exits.
    Otherwise the module's options are parsed and handed to the matching
    function.

    Always returns None.
    """
    #The currently supported modules.
    modules = ["build_db", "merge_db", "simple_query", "db_query", "single_query", "aai_index", "multi_query"]

    #Print modules if someone just types FastAAI
    if len(sys.argv) < 2:
        _print_module_menu_and_exit((
            "",
            " Welcome to FastAAI",
            "",
            "",
            " Please select one of the following modules:",
            "",
        ))

    #This is the module selection
    selection = sys.argv[1]

    if selection not in modules:
        _print_module_menu_and_exit((
            "",
            " I couldn't find the module you specified. Please select one of the following modules:",
            "",
        ))

    #################### Database build or add ########################
    if selection == "build_db":
        parser, opts = build_db_opts()
        _exit_with_help_if_no_options(parser)

        #Directory based
        genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms
        #Input list based
        gf, pf, hf = opts.gf, opts.pf, opts.hf

        output = os.path.normpath(opts.output)

        build_db(genomes, proteins, hmms, opts.db_name, output, opts.threads, gf, pf, hf, opts.verbose)

    #################### Add two DBs ########################
    elif selection == "merge_db":
        parser, opts = merge_db_opts()
        _exit_with_help_if_no_options(parser)

        merge_db(opts.recipient, opts.donors, opts.verbose, opts.threads)

    #################### Query files vs DB ########################
    elif selection == "simple_query":
        parser, opts = sql_query_opts()
        _exit_with_help_if_no_options(parser)

        #Directory based
        genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms
        #Input list based
        gf, pf, hf = opts.gf, opts.pf, opts.hf

        sql_query(genomes, proteins, hmms, gf, pf, hf, opts.target, opts.output, opts.threads, opts.verbose, opts.do_stdev)

    #################### Query DB vs DB ###########################
    elif selection == "db_query":
        parser, opts = db_query_opts()
        _exit_with_help_if_no_options(parser)

        db_query(opts.query, opts.target, opts.verbose, opts.output, opts.threads, opts.do_stdev, opts.precision, opts.large_mem)

    #################### One-pass functions #######################
    elif selection == "single_query":
        parser, opts = single_query_opts()
        _exit_with_help_if_no_options(parser)

        shared_opts = [os.path.normpath(opts.output), opts.threads, opts.verbose]
        query_opts = [opts.query_genome, opts.query_protein, opts.query_hmm]
        target_opts = [opts.target_genome, opts.target_protein, opts.target_hmm]

        single_query(query_opts, target_opts, shared_opts)

    elif selection == "aai_index":
        parser, opts = aai_index_opts()
        _exit_with_help_if_no_options(parser)

        genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms
        #Text file versions of genomes/proteins/hmms
        gf, pf, hf = opts.gf, opts.pf, opts.hf

        aai_index(genomes, proteins, hmms, opts.db_name, opts.output, opts.threads, gf, pf, hf, opts.verbose, opts.do_stdev, opts.precision, opts.large_mem)

    elif selection == "multi_query":
        parser, opts = multi_query_opts()
        _exit_with_help_if_no_options(parser)

        shared_arg_list = [
            os.path.normpath(opts.output),
            opts.threads,
            opts.verbose,
            opts.do_stdev,
            opts.precision,
            opts.large_mem,
        ]

        #Directories first, then the text file versions, then the database name.
        query_arg_list = [
            opts.query_genomes, opts.query_proteins, opts.query_hmms,
            opts.qgf, opts.qpf, opts.qhf,
            opts.query_db_name,
        ]

        target_arg_list = [
            opts.target_genomes, opts.target_proteins, opts.target_hmms,
            opts.tgf, opts.tpf, opts.thf,
            opts.target_db_name,
        ]

        multi_query(query_arg_list, target_arg_list, shared_arg_list)

    return None
3625
+
3626
+
3627
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
3629
+
3630
+