miga-base 1.2.18.2 → 1.3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/doctor/base.rb +2 -1
- data/lib/miga/cli/action/init.rb +1 -1
- data/lib/miga/dataset/result/add.rb +3 -2
- data/lib/miga/version.rb +2 -2
- data/scripts/essential_genes.bash +4 -8
- data/utils/FastAAI/LICENSE +8 -0
- data/utils/FastAAI/README.md +151 -40
- data/utils/FastAAI/__init__.py +1 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz +0 -0
- data/utils/FastAAI/fastaai/__init__.py +1 -0
- data/utils/FastAAI/fastaai/fastaai +4805 -0
- data/utils/FastAAI/fastaai/fastaai.py +4805 -0
- data/utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py +297 -0
- data/utils/FastAAI/fastaai/fastaai_miga_preproc.py +931 -0
- data/utils/FastAAI/metadata/Accession_names_and_IDs.txt +122 -0
- data/utils/distance/commands.rb +51 -23
- metadata +23 -6
- data/utils/FastAAI/FastAAI +0 -3659
- /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Archaea_SCG.hmm +0 -0
- /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Bacteria_SCG.hmm +0 -0
- /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -0
@@ -0,0 +1,4805 @@
#!/usr/bin/env python3

################################################################################
"""---0.0 Import Modules---"""
import subprocess
import argparse
import datetime
import shutil
import textwrap
import multiprocessing
import pickle
import gzip
import tempfile
#Shouldn't play any role.
#from random import randint

#We could probably remove Path, too.
#This as well
import time
from collections import defaultdict
import sys
import os
from math import floor
import sqlite3
#numpy dependency
import numpy as np
import io
import random

import pyrodigal as pd
import pyhmmer

from collections import namedtuple

from math import ceil

import re


class progress_tracker:
    def __init__(self, total, step_size = 2, message = None, one_line = True):
        self.current_count = 0
        self.max_count = total
        #Book keeping.
        self.start_time = None
        self.end_time = None
        #Show progress every [step] percent
        self.step = step_size
        self.justify_size = ceil(100/self.step)
        self.last_percent = 0
        self.message = message

        self.pretty_print = one_line

        self.start()

    def curtime(self):
        time_format = "%d/%m/%Y %H:%M:%S"
        timer = datetime.datetime.now()
        time = timer.strftime(time_format)
        return time

    def start(self):
        print("")
        if self.message is not None:
            print(self.message)

        try:
            percentage = (self.current_count/self.max_count)*100
            sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/self.step)).ljust(self.justify_size)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(self.current_count) + " of " + str(self.max_count) + ' ) at ' + self.curtime() + "\n")
            sys.stdout.flush()

        except:
            #It's not really a big deal if the progress bar cannot be printed.
            pass

    def update(self):
        self.current_count += 1
        percentage = (self.current_count/self.max_count)*100
        try:
            if percentage // self.step > self.last_percent:
                if self.pretty_print:
                    sys.stdout.write('\033[A')
                sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*int(percentage/self.step)).ljust(self.justify_size)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(self.current_count) + " of " + str(self.max_count) + ' ) at ' + self.curtime() + "\n")
                sys.stdout.flush()
                self.last_percent = percentage // self.step
            #Bar is always full at the end.
            if self.current_count == self.max_count:
                if self.pretty_print:
                    sys.stdout.write('\033[A')
                sys.stdout.write("Completion".rjust(3)+ ' |'+('#'*self.justify_size).ljust(self.justify_size)+'| ' + ('%.2f'%percentage).rjust(7)+'% ( ' + str(self.current_count) + " of " + str(self.max_count) + ' ) at ' + self.curtime() + "\n")
                sys.stdout.flush()
                #Add space at end.
                print("")
        except:
            #It's not really a big deal if the progress bar cannot be printed.
            pass

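# Example usage (hypothetical; items and process() are stand-ins): track a loop
# of 200 work items, redrawing the bar on a single line every 2 percent.
#
#	tracker = progress_tracker(total = 200, step_size = 2, message = "Processing genomes")
#	for item in items:
#		process(item)
#		tracker.update()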
#Takes a bytestring from the SQL database and converts it to a numpy array.
def convert_array(bytestring):
    return np.frombuffer(bytestring, dtype = np.int32)

def convert_float_array_16(bytestring):
    return np.frombuffer(bytestring, dtype = np.float16)

def convert_float_array_32(bytestring):
    return np.frombuffer(bytestring, dtype = np.float32)

def convert_float_array_64(bytestring):
    return np.frombuffer(bytestring, dtype = np.float64)

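# These converters are intended for registration with sqlite3, so that BLOB
# columns come back as numpy arrays. A minimal sketch; the declared type name
# "array" is an assumption for illustration, not taken from this file:
#
#	sqlite3.register_converter("array", convert_array)
#	conn = sqlite3.connect("FastAAI_database.sqlite.db", detect_types = sqlite3.PARSE_DECLTYPES)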
def read_fasta(file):
    cur_seq = ""
    cur_prot = ""

    contents = {}
    deflines = {}

    fasta = agnostic_reader(file)
    for line in fasta:
        if line.startswith(">"):
            if len(cur_seq) > 0:
                contents[cur_prot] = cur_seq
                deflines[cur_prot] = defline

            cur_seq = ""
            cur_prot = line.strip().split()[0][1:]
            defline = line.strip()[len(cur_prot)+1 :].strip()

        else:
            cur_seq += line.strip()

    fasta.close()

    #Final iter
    if len(cur_seq) > 0:
        contents[cur_prot] = cur_seq
        deflines[cur_prot] = defline

    return contents, deflines

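# Example usage (hypothetical file name): read a plain or gzipped FASTA into
# two dicts keyed by sequence ID.
#
#	contents, deflines = read_fasta("proteins.faa.gz")
#	for seqid in contents:
#		print(seqid, deflines[seqid], len(contents[seqid]))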
class fasta_file:
    def __init__(self, file, type = "genome"):
        self.file_path = os.path.abspath(file)
        self.name = os.path.basename(file)
        self.no_ext = os.path.splitext(self.name)[0]
        self.type = type

        self.tuple_structure = namedtuple("fasta", ["seqid", "description", "sequence"])
        self.contents = {}

    def convert(self, contents, descriptions):
        for protein in contents:
            #Store one named tuple per sequence, keyed by sequence ID.
            self.contents[protein] = self.tuple_structure(seqid = protein, description = descriptions[protein], sequence = contents[protein])


    def def_import_file(self):
        contents, descriptions = read_fasta(self.file_path)
        self.convert(contents, descriptions)

class pyhmmer_manager:
    def __init__(self, do_compress):
        self.hmm_model = []
        self.hmm_model_optimized = None

        self.proteins_to_search = []
        self.protein_descriptions = None

        self.hmm_result_proteins = []
        self.hmm_result_accessions = []
        self.hmm_result_scores = []

        self.printable_lines = []

        self.bacterial_SCPs = None
        self.archaeal_SCPs = None
        self.assign_hmm_sets()
        self.domain_counts = {"Bacteria" : 0, "Archaea": 0}
        self.voted_domain = {"Bacteria" : len(self.bacterial_SCPs), "Archaea" : len(self.archaeal_SCPs)}

        self.bacterial_fraction = None
        self.archaeal_fraction = None

        self.best_hits = None

        self.do_compress = do_compress

    def optimize_models(self):
        try:
            self.hmm_model_optimized = []

            for hmm in self.hmm_model:
                prof = pyhmmer.plan7.Profile(M = hmm.insert_emissions.shape[0], alphabet = pyhmmer.easel.Alphabet.amino())
                prof.configure(hmm = hmm, background = pyhmmer.plan7.Background(alphabet = pyhmmer.easel.Alphabet.amino()), L = hmm.insert_emissions.shape[0]-1)
                optim = prof.optimized()
                self.hmm_model_optimized.append(optim)

            #Clean up.
            self.hmm_model = None
        except:
            #Quiet fail condition - fall back on default model.
            self.hmm_model_optimized = None

    #Load HMM and try to optimize.
    def load_hmm_from_file(self, hmm_path):
        hmm_set = pyhmmer.plan7.HMMFile(hmm_path)
        for hmm in hmm_set:
            self.hmm_model.append(hmm)

        #This doesn't seem to be improving performance currently.
        self.optimize_models()

    #Set archaeal and bacterial HMM sets.
    def assign_hmm_sets(self):
        self.bacterial_SCPs = {'PF00709_21': 'Adenylsucc_synt', 'PF00406_22': 'ADK', 'PF01808_18': 'AICARFT_IMPCHas', 'PF00231_19': 'ATP-synt',
            'PF00119_20': 'ATP-synt_A', 'PF01264_21': 'Chorismate_synt', 'PF00889_19': 'EF_TS', 'PF01176_19': 'eIF-1a',
            'PF02601_15': 'Exonuc_VII_L', 'PF01025_19': 'GrpE', 'PF01725_16': 'Ham1p_like', 'PF01715_17': 'IPPT',
            'PF00213_18': 'OSCP', 'PF01195_19': 'Pept_tRNA_hydro', 'PF00162_19': 'PGK', 'PF02033_18': 'RBFA', 'PF02565_15': 'RecO_C',
            'PF00825_18': 'Ribonuclease_P', 'PF00687_21': 'Ribosomal_L1', 'PF00572_18': 'Ribosomal_L13',
            'PF00238_19': 'Ribosomal_L14', 'PF00252_18': 'Ribosomal_L16', 'PF01196_19': 'Ribosomal_L17',
            'PF00861_22': 'Ribosomal_L18p', 'PF01245_20': 'Ribosomal_L19', 'PF00453_18': 'Ribosomal_L20',
            'PF00829_21': 'Ribosomal_L21p', 'PF00237_19': 'Ribosomal_L22', 'PF00276_20': 'Ribosomal_L23',
            'PF17136_4': 'ribosomal_L24', 'PF00189_20': 'Ribosomal_S3_C', 'PF00281_19': 'Ribosomal_L5', 'PF00181_23': 'Ribosomal_L2',
            'PF01016_19': 'Ribosomal_L27', 'PF00828_19': 'Ribosomal_L27A', 'PF00830_19': 'Ribosomal_L28',
            'PF00831_23': 'Ribosomal_L29', 'PF00297_22': 'Ribosomal_L3', 'PF01783_23': 'Ribosomal_L32p',
            'PF01632_19': 'Ribosomal_L35p', 'PF00573_22': 'Ribosomal_L4', 'PF00347_23': 'Ribosomal_L6',
            'PF03948_14': 'Ribosomal_L9_C', 'PF00338_22': 'Ribosomal_S10', 'PF00411_19': 'Ribosomal_S11',
            'PF00416_22': 'Ribosomal_S13', 'PF00312_22': 'Ribosomal_S15', 'PF00886_19': 'Ribosomal_S16',
            'PF00366_20': 'Ribosomal_S17', 'PF00203_21': 'Ribosomal_S19', 'PF00318_20': 'Ribosomal_S2',
            'PF01649_18': 'Ribosomal_S20p', 'PF01250_17': 'Ribosomal_S6', 'PF00177_21': 'Ribosomal_S7',
            'PF00410_19': 'Ribosomal_S8', 'PF00380_19': 'Ribosomal_S9', 'PF00164_25': 'Ribosom_S12_S23',
            'PF01193_24': 'RNA_pol_L', 'PF01192_22': 'RNA_pol_Rpb6', 'PF01765_19': 'RRF', 'PF02410_15': 'RsfS',
            'PF03652_15': 'RuvX', 'PF00584_20': 'SecE', 'PF03840_14': 'SecG', 'PF00344_20': 'SecY', 'PF01668_18': 'SmpB',
            'PF00750_19': 'tRNA-synt_1d', 'PF01746_21': 'tRNA_m1G_MT', 'PF02367_17': 'TsaE', 'PF02130_17': 'UPF0054',
            'PF02699_15': 'YajC'}

        self.archaeal_SCPs = {'PF00709_21': 'Adenylsucc_synt', 'PF05221_17': 'AdoHcyase', 'PF01951_16': 'Archease', 'PF01813_17': 'ATP-synt_D',
            'PF01990_17': 'ATP-synt_F', 'PF01864_17': 'CarS-like', 'PF01982_16': 'CTP-dep_RFKase', 'PF01866_17': 'Diphthamide_syn',
            'PF04104_14': 'DNA_primase_lrg', 'PF01984_20': 'dsDNA_bind', 'PF04010_13': 'DUF357', 'PF04019_12': 'DUF359',
            'PF04919_12': 'DUF655', 'PF01912_18': 'eIF-6', 'PF05833_11': 'FbpA', 'PF01725_16': 'Ham1p_like',
            'PF00368_18': 'HMG-CoA_red', 'PF00334_19': 'NDK', 'PF02006_16': 'PPS_PS', 'PF02996_17': 'Prefoldin',
            'PF01981_16': 'PTH2', 'PF01948_18': 'PyrI', 'PF00687_21': 'Ribosomal_L1', 'PF00572_18': 'Ribosomal_L13',
            'PF00238_19': 'Ribosomal_L14', 'PF00827_17': 'Ribosomal_L15e', 'PF00252_18': 'Ribosomal_L16',
            'PF01157_18': 'Ribosomal_L21e', 'PF00237_19': 'Ribosomal_L22', 'PF00276_20': 'Ribosomal_L23',
            'PF16906_5': 'Ribosomal_L26', 'PF00831_23': 'Ribosomal_L29', 'PF00297_22': 'Ribosomal_L3',
            'PF01198_19': 'Ribosomal_L31e', 'PF01655_18': 'Ribosomal_L32e', 'PF01780_19': 'Ribosomal_L37ae',
            'PF00832_20': 'Ribosomal_L39', 'PF00573_22': 'Ribosomal_L4', 'PF00935_19': 'Ribosomal_L44', 'PF17144_4': 'Ribosomal_L5e',
            'PF00347_23': 'Ribosomal_L6', 'PF00411_19': 'Ribosomal_S11', 'PF00416_22': 'Ribosomal_S13',
            'PF00312_22': 'Ribosomal_S15', 'PF00366_20': 'Ribosomal_S17', 'PF00833_18': 'Ribosomal_S17e',
            'PF00203_21': 'Ribosomal_S19', 'PF01090_19': 'Ribosomal_S19e', 'PF00318_20': 'Ribosomal_S2',
            'PF01282_19': 'Ribosomal_S24e', 'PF01667_17': 'Ribosomal_S27e', 'PF01200_18': 'Ribosomal_S28e',
            'PF01015_18': 'Ribosomal_S3Ae', 'PF00177_21': 'Ribosomal_S7', 'PF00410_19': 'Ribosomal_S8',
            'PF01201_22': 'Ribosomal_S8e', 'PF00380_19': 'Ribosomal_S9', 'PF00164_25': 'Ribosom_S12_S23',
            'PF06026_14': 'Rib_5-P_isom_A', 'PF01351_18': 'RNase_HII', 'PF13656_6': 'RNA_pol_L_2',
            'PF01194_17': 'RNA_pol_N', 'PF03874_16': 'RNA_pol_Rpb4', 'PF01192_22': 'RNA_pol_Rpb6',
            'PF01139_17': 'RtcB', 'PF00344_20': 'SecY', 'PF06093_13': 'Spt4', 'PF00121_18': 'TIM', 'PF01994_16': 'Trm56',
            'PF00749_21': 'tRNA-synt_1c', 'PF00750_19': 'tRNA-synt_1d', 'PF13393_6': 'tRNA-synt_His',
            'PF01142_18': 'TruD', 'PF01992_16': 'vATP-synt_AC39', 'PF01991_18': 'vATP-synt_E', 'PF01496_19': 'V_ATPase_I'}

    #Convert passed sequences.
    def convert_protein_seqs_in_mem(self, contents):
        #Clean up.
        self.proteins_to_search = []

        for protein in contents:
            #Skip a protein if it's longer than 100k AA.
            if len(contents[protein]) >= 100000:
                continue
            as_bytes = protein.encode()
            #Pyhmmer digitization of sequences for searching.
            easel_seq = pyhmmer.easel.TextSequence(name = as_bytes, sequence = contents[protein])
            easel_seq = easel_seq.digitize(pyhmmer.easel.Alphabet.amino())
            self.proteins_to_search.append(easel_seq)

        easel_seq = None

    def load_protein_seqs_from_file(self, prots_file):
        #Pyhmmer has a method for loading a fasta file, but we need to support gzipped inputs, so we do it manually.
        contents, deflines = read_fasta(prots_file)
        self.protein_descriptions = deflines
        self.convert_protein_seqs_in_mem(contents)

    def execute_search(self):
        if self.hmm_model_optimized is None:
            top_hits = list(pyhmmer.hmmsearch(self.hmm_model, self.proteins_to_search, cpus=1, bit_cutoffs="trusted"))
        else:
            top_hits = list(pyhmmer.hmmsearch(self.hmm_model_optimized, self.proteins_to_search, cpus=1, bit_cutoffs="trusted"))

        self.printable_lines = []

        self.hmm_result_proteins = []
        self.hmm_result_accessions = []
        self.hmm_result_scores = []

        for model in top_hits:
            for hit in model:
                target_name = hit.name.decode()
                target_acc = hit.accession
                if target_acc is None:
                    target_acc = "-"
                else:
                    target_acc = target_acc.decode()

                query_name = hit.best_domain.alignment.hmm_name.decode()
                query_acc = hit.best_domain.alignment.hmm_accession.decode()

                full_seq_evalue = "%.2g" % hit.evalue
                full_seq_score = round(hit.score, 1)
                full_seq_bias = round(hit.bias, 1)

                best_dom_evalue = "%.2g" % hit.best_domain.alignment.domain.i_evalue
                best_dom_score = round(hit.best_domain.alignment.domain.score, 1)
                best_dom_bias = round(hit.best_domain.alignment.domain.bias, 1)

                #I don't know how to get most of these values.
                exp = 0
                reg = 0
                clu = 0
                ov = 0
                env = 0
                dom = len(hit.domains)
                rep = 0
                inc = 0

                try:
                    description = self.protein_descriptions[target_name]
                except:
                    description = ""

                writeout = [target_name, target_acc, query_name, query_acc, full_seq_evalue, \
                    full_seq_score, full_seq_bias, best_dom_evalue, best_dom_score, best_dom_bias, \
                    exp, reg, clu, ov, env, dom, rep, inc, description]

                #Format and join.
                writeout = [str(i) for i in writeout]
                writeout = '\t'.join(writeout)

                self.printable_lines.append(writeout)

                self.hmm_result_proteins.append(target_name)
                self.hmm_result_accessions.append(query_acc)
                self.hmm_result_scores.append(best_dom_score)

    def filter_to_best_hits(self):
        hmm_file = np.transpose(np.array([self.hmm_result_proteins, self.hmm_result_accessions, self.hmm_result_scores]))

        #hmm_file = np.loadtxt(hmm_file_name, comments = '#', usecols = (0, 3, 8), dtype=(str))
        #Sort the hmm file based on the score column in descending order.
        hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]

        #Identify the first row where each gene name appears, after sorting by score;
        #in effect, return the highest scoring assignment per gene name
        #Sort the indices of the result to match the score-sorted table instead of alphabetical order of gene names
        hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]

        #Filter the file again for the unique ACCESSION names, since we're only allowed one gene per accession, I guess?
        #Don't sort the indices, we don't care about the scores anymore.
        hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]

        sql_friendly_names = [i.replace(".", "_") for i in hmm_file[:,1]]

        self.best_hits = dict(zip(hmm_file[:,0], sql_friendly_names))

        hmm_file = None

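# Worked example of the filtering above (made-up rows): given hits
#	[("p1", "PF00709_21", "10.0"), ("p1", "PF00406_22", "8.0"), ("p2", "PF00709_21", "9.0")]
# sorting by score (descending) orders them p1/PF00709_21, p2/PF00709_21,
# p1/PF00406_22; keeping the first row per protein drops the weaker p1 hit,
# and keeping the first row per accession then drops p2, leaving
# {"p1": "PF00709_21"} as the best-hit assignment.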
    #Count per-domain occurrences.
    def assign_domain(self):
        for prot in self.best_hits.values():
            if prot in self.bacterial_SCPs:
                self.domain_counts["Bacteria"] += 1
            if prot in self.archaeal_SCPs:
                self.domain_counts["Archaea"] += 1

        self.bacterial_fraction = self.domain_counts["Bacteria"] / self.voted_domain["Bacteria"]
        self.archaeal_fraction = self.domain_counts["Archaea"] / self.voted_domain["Archaea"]

        if self.bacterial_fraction >= self.archaeal_fraction:
            self.voted_domain = "Bacteria"
        else:
            self.voted_domain = "Archaea"

        pop_keys = list(self.best_hits.keys())
        for key in pop_keys:
            if self.voted_domain == "Bacteria":
                if self.best_hits[key] not in self.bacterial_SCPs:
                    self.best_hits.pop(key)
            if self.voted_domain == "Archaea":
                if self.best_hits[key] not in self.archaeal_SCPs:
                    self.best_hits.pop(key)

    def to_hmm_file(self, output):
        #PyHMMER data is a bit hard to parse, so the per-result lines were already formatted in execute_search.

        content = '\n'.join(self.printable_lines) + '\n'

        if self.do_compress:
            #Clean
            if os.path.exists(output):
                os.remove(output)

            content = content.encode()

            fh = gzip.open(output+".gz", "wb")
            fh.write(content)
            fh.close()
            content = None

        else:
            #Clean
            if os.path.exists(output+".gz"):
                os.remove(output+".gz")

            fh = open(output, "w")

            fh.write(content)

            fh.close()

            content = None

    #If we're doing this step at all, we've either loaded the seqs into mem by reading the prot file
    #or have them in mem thanks to pyrodigal.
    def run_for_fastaai(self, prots, hmm_output):
        try:
            self.convert_protein_seqs_in_mem(prots)
            self.execute_search()
            self.filter_to_best_hits()
            try:
                self.to_hmm_file(hmm_output)
            except:
                print(hmm_output, "cannot be created. HMM search failed. This file will be skipped.")

        except:
            print(hmm_output, "failed to run through HMMER!")
            self.best_hits = None


def hmm_preproc_initializer(hmm_file, do_compress = False):
    global hmm_manager
    hmm_manager = pyhmmer_manager(do_compress)
    hmm_manager.load_hmm_from_file(hmm_file)

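# hmm_preproc_initializer is written as a multiprocessing pool initializer:
# each worker builds its own pyhmmer_manager as a module-level global. A
# minimal sketch of how it could be wired up (run_one_file is a hypothetical
# worker function, not defined in this hunk):
#
#	pool = multiprocessing.Pool(threads, initializer = hmm_preproc_initializer,
#		initargs = (hmm_file, do_compress))
#	results = pool.map(run_one_file, input_files)
#	pool.close()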
class pyrodigal_manager:
    def __init__(self, file = None, aa_out = None, nt_out = None, is_meta = False, full_headers = True, trans_table = 11,
        num_bp_fmt = True, verbose = True, do_compress = "0", compare_against = None):
        #Input NT sequences
        self.file = file

        #List of seqs read from input file.
        self.sequences = None
        #Concatenation of up to first 32 million bp in self.sequences - prodigal caps at this point.
        self.training_seq = None

        #Predicted genes go here
        self.predicted_genes = None
        #Record the translation table used.
        self.trans_table = trans_table

        #This is the pyrodigal manager - this does the gene predicting.
        self.manager = pd.OrfFinder(meta=is_meta)
        self.is_meta = is_meta

        #Full prodigal header information includes more than just a protein number.
        #If full_headers is true, protein deflines will match prodigal; else, just protein ID.
        self.full_headers = full_headers

        #Prodigal prints info to console. I enhanced the info and made printing default, but also allow them to be totally turned off.
        self.verbose = verbose

        #Prodigal formats outputs with 70 bases per line max
        self.num_bp_fmt = num_bp_fmt

        #File names for outputs
        self.aa_out = aa_out
        self.nt_out = nt_out

        #List of proteins in excess of 100K base pairs (HMMER's limit) and their lengths. This is also fastAAI specific.
        self.excluded_seqs = {}

        #Gzip outputs if asked.
        self.compress = do_compress

        self.labeled_proteins = None

        #Normally, we don't need to keep an input sequence after it's had proteins predicted for it - however
        #For FastAAI and MiGA's purposes, comparisons of two translation tables is necessary.
        #Rather than re-importing sequences and reconstructing the training sequences,
        #keep them for faster repredict with less I/O
        self.compare_to = compare_against
        if self.compare_to is not None:
            self.keep_seqs = True
            self.keep_after_train = True
        else:
            self.keep_seqs = False
            self.keep_after_train = False

    #Imports a fasta as binary.
    def import_sequences(self):
        if self.sequences is None:
            self.sequences = {}

        #check for zipped and import as needed.
        with open(self.file, 'rb') as test_gz:
            #Gzip magic number
            is_gz = (test_gz.read(2) == b'\x1f\x8b')

        if is_gz:
            fh = gzip.open(self.file)
        else:
            fh = open(self.file, "rb")

        imp = fh.readlines()

        fh.close()

        cur_seq = None
        for s in imp:
            s = s.decode().strip()
            #> is 62 in ascii. This is asking if the first character is '>'
            if s.startswith(">"):
                #Skip first cycle, then do for each after
                if cur_seq is not None:
                    self.sequences[cur_seq] = ''.join(self.sequences[cur_seq])
                    self.sequences[cur_seq] = self.sequences[cur_seq].encode()
                    #print(cur_seq, len(self.sequences[cur_seq]))
                cur_seq = s[1:]
                cur_seq = cur_seq.split()[0]
                cur_seq = cur_seq.encode('utf-8')
                self.sequences[cur_seq] = []
            else:
                #Remove the newline character.
                #bases = s[:-1]
                self.sequences[cur_seq].append(s)

        #Final set
        self.sequences[cur_seq] = ''.join(self.sequences[cur_seq])
        self.sequences[cur_seq] = self.sequences[cur_seq].encode()

        #Now we have the data, go to training.
        if not self.is_meta:
            self.train_manager()

    #Collect up to the first 32 million bases for use in training seq.
    def train_manager(self):
        running_sum = 0
        seqs_added = 0
        if self.training_seq is None:
            self.training_seq = []
            for seq in self.sequences:
                running_sum += len(self.sequences[seq])
                if seqs_added > 0:
                    #Prodigal interleaving logic - add this breaker between sequences, starting at sequence 2
                    self.training_seq.append(b'TTAATTAATTAA')
                    running_sum += 12

                seqs_added += 1

                #Handle excessive size
                if running_sum >= 32000000:
                    print("Warning: Sequence is long (max 32000000 for training).")
                    print("Training on the first 32000000 bases.")

                    to_remove = running_sum - 32000000

                    #Remove excess characters
                    cut_seq = self.sequences[seq][:-to_remove]
                    #Add the partial seq
                    self.training_seq.append(cut_seq)

                    #Stop the loop and move to training
                    break

                #add in a full sequence
                self.training_seq.append(self.sequences[seq])

            if seqs_added > 1:
                self.training_seq.append(b'TTAATTAATTAA')

            self.training_seq = b''.join(self.training_seq)

        if len(self.training_seq) < 20000:
            if self.verbose:
                print("Can't train on fewer than 20,000 characters. Switching to meta mode.")
            self.manager = pd.OrfFinder(meta=True)
            self.is_meta = True
        else:
            if self.verbose:
                print("")
                #G is 71, C is 67; we're counting G + C and dividing by the total.
                gc = round(((self.training_seq.count(67) + self.training_seq.count(71))/ len(self.training_seq)) * 100, 2)
                print(len(self.training_seq), "bp seq created,", gc, "pct GC")

            #Train
            self.manager.train(self.training_seq, translation_table = self.trans_table)

            if not self.keep_after_train:
                #Clean up
                self.training_seq = None

    def predict_genes(self):
        if self.is_meta:
            if self.verbose:
                print("Finding genes in metagenomic mode")
        else:
            if self.verbose:
                print("Finding genes with translation table", self.trans_table)
                print("")

        self.predicted_genes = {}
        for seq in self.sequences:

            if self.verbose:
                print("Finding genes in sequence", seq.decode(), "("+str(len(self.sequences[seq]))+ " bp)... ", end = '')

            self.predicted_genes[seq] = self.manager.find_genes(self.sequences[seq])

            #If we're comparing multiple tables, then we want to keep these for re-prediction.
            if not self.keep_seqs:
                #Clean up
                self.sequences[seq] = None

            if self.verbose:
                print("done!")

    #Predict genes with an alternative table, compare results, and keep the winner.
    def compare_alternative_table(self, table):
        if table == self.trans_table:
            print("You're trying to compare table", table, "with itself.")
        else:
            if self.verbose:
                print("Comparing translation table", self.trans_table, "against table", table)
            old_table = self.trans_table
            old_genes = self.predicted_genes
            old_size = 0
            for seq in self.predicted_genes:
                for gene in self.predicted_genes[seq]:
                    old_size += (gene.end - gene.begin)

            self.trans_table = table
            self.train_manager()
            self.predict_genes()

            new_size = 0
            for seq in self.predicted_genes:
                for gene in self.predicted_genes[seq]:
                    new_size += (gene.end - gene.begin)

            #Keep the new table only if it predicts substantially more coding sequence.
            if (new_size / old_size) > 1.1:
                if self.verbose:
                    print("Translation table", self.trans_table, "performed better than table", old_table, "and will be used instead.")
            else:
                if self.verbose:
                    print("Translation table", self.trans_table, "did not perform significantly better than table", old_table, "and will not be used.")
                self.trans_table = old_table
                self.predicted_genes = old_genes

            #cleanup
            old_table = None
            old_genes = None
            old_size = None
            new_size = None

    def predict_and_compare(self):
        self.predict_genes()

        #Run alt comparisons in gene predict.
        if self.compare_to is not None:
            while len(self.compare_to) > 0:
                try:
                    next_table = int(self.compare_to.pop(0))

                    if len(self.compare_to) == 0:
                        #Ready to clean up.
                        self.keep_after_train = False
                        self.keep_seqs = False

                    self.compare_alternative_table(next_table)
                except:
                    print("Alternative table comparison failed! Skipping.")

    #Break lines into size base pairs per line. Prodigal's default for bp is 70, aa is 60.
    def num_bp_line_format(self, string, size = 70):
        #ceiling function without the math module
        ceiling = int(round((len(string)/size)+0.5, 0))
        formatted = '\n'.join([string[(i*size):(i+1)*size] for i in range(0, ceiling)])
        return formatted

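# Worked example: a 150-character sequence wrapped at size = 70 gives
# ceiling = int(round(150/70 + 0.5, 0)) = 3 slices of 70, 70, and 10
# characters, joined with newlines.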
    #Writeouts
    def write_nt(self):
        if self.nt_out is not None:
            if self.verbose:
                print("Writing nucleotide sequences... ")
            if self.compress == '1' or self.compress == '2':
                out_writer = gzip.open(self.nt_out+".gz", "wb")

                content = b''

                for seq in self.predicted_genes:
                    seqname = b">"+ seq + b"_"
                    #Gene counter
                    count = 1
                    for gene in self.predicted_genes[seq]:
                        #Full header lines
                        if self.full_headers:
                            content += b' # '.join([seqname + str(count).encode(), str(gene.begin).encode(), str(gene.end).encode(), str(gene.strand).encode(), gene._gene_data.encode()])
                        else:
                            #Reduced headers if we don't care.
                            content += seqname + str(count).encode()

                        content += b'\n'

                        if self.num_bp_fmt:
                            #70 bp cap per line
                            content += self.num_bp_line_format(gene.sequence(), size = 70).encode()
                        else:
                            #One-line sequence.
                            content += gene.sequence().encode()

                        content += b'\n'
                        count += 1

                out_writer.write(content)
                out_writer.close()

            if self.compress == '0' or self.compress == '2':
                out_writer = open(self.nt_out, "w")

                for seq in self.predicted_genes:
                    #Only do this decode once.
                    seqname = ">"+ seq.decode() +"_"
                    #Gene counter
                    count = 1

                    for gene in self.predicted_genes[seq]:
                        #Full header lines
                        if self.full_headers:
                            #Standard prodigal header
                            print(seqname + str(count), gene.begin, gene.end, gene.strand, gene._gene_data, sep = " # ", file = out_writer)
                        else:
                            #Reduced headers if we don't care.
                            print(seqname + str(count), file = out_writer)

                        if self.num_bp_fmt:
                            #70 bp cap per line
                            print(self.num_bp_line_format(gene.sequence(), size = 70), file = out_writer)
                        else:
                            #One-line sequence.
                            print(gene.sequence(), file = out_writer)

                        count += 1

                out_writer.close()

    def write_aa(self):
        if self.aa_out is not None:
            if self.verbose:
                print("Writing amino acid sequences...")

            self.labeled_proteins = {}
            content = ''
            for seq in self.predicted_genes:
                count = 1
                seqname = ">"+ seq.decode() + "_"
                for gene in self.predicted_genes[seq]:
                    prot_name = seqname + str(count)
                    translation = gene.translate()
                    self.labeled_proteins[prot_name[1:]] = translation
                    defline = " # ".join([prot_name, str(gene.begin), str(gene.end), str(gene.strand), str(gene._gene_data)])
                    content += defline
                    content += "\n"
                    count += 1
                    content += self.num_bp_line_format(translation, size = 60)
                    content += "\n"

            if self.compress == '0' or self.compress == '2':
                out_writer = open(self.aa_out, "w")
                out_writer.write(content)
                out_writer.close()

            if self.compress == '1' or self.compress == '2':
                content = content.encode()
                out_writer = gzip.open(self.aa_out+".gz", "wb")
                out_writer.write(content)
                out_writer.close()

    def run_for_fastaai(self):
        self.verbose = False
        self.import_sequences()
        self.train_manager()
        self.predict_and_compare()
        self.write_aa()

#Iterator for agnostic reader
class agnostic_reader_iterator:
    def __init__(self, reader):
        self.handle_ = reader.handle
        self.is_gz_ = reader.is_gz

    def __next__(self):
        if self.is_gz_:
            line = self.handle_.readline().decode()
        else:
            line = self.handle_.readline()

        #Ezpz EOF check
        if line:
            return line
        else:
            raise StopIteration

#File reader that doesn't care if you give it a gzipped file or not.
class agnostic_reader:
    def __init__(self, file):
        self.path = file

        with open(file, 'rb') as test_gz:
            #Gzip magic number
            is_gz = (test_gz.read(2) == b'\x1f\x8b')

        self.is_gz = is_gz

        if is_gz:
            self.handle = gzip.open(self.path)
        else:
            self.handle = open(self.path)

    def __iter__(self):
        return agnostic_reader_iterator(self)

    def close(self):
        self.handle.close()

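# Example usage (hypothetical path): iterate a file line by line without
# caring whether it is gzipped.
#
#	reader = agnostic_reader("genome.fna.gz")
#	for line in reader:
#		handle(line)
#	reader.close()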
'''
Class for handling all of the raw genome/protein/protein+HMM file inputs when building a database.

Takes a file or files and processes them from genome -> protein, protein -> hmm, prot+HMM -> kmerized protein best hits as numpy int arrays according to the kmer_index

'''

class input_file:
    def __init__(self, input_path, output = "", verbosity = False, do_compress = False,
        make_crystal = False):
        #starting path for the file; irrelevant for protein and hmm, but otherwise useful for keeping track.
        self.path = input_path
        #Output directory starts with this
        self.output = os.path.normpath(output + "/")
        #For printing file updates, this is the input name
        self.name = os.path.basename(input_path)
        #original name is the key used for the genomes index later on.
        self.original_name = os.path.basename(input_path)
        #This is the name that can be used for building files with new extensions.
        if input_path.endswith(".gz"):
            #Remove .gz first to make names consistent.
            self.basename = os.path.splitext(os.path.basename(input_path[:-3]))[0]
        else:
            self.basename = os.path.splitext(os.path.basename(input_path))[0]

        #Sanitize for SQL
        #These are chars safe for sql
        sql_safe = set('_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
        current_chars = set(self.basename)
        #self.sql_name = self.basename
        #Identify SQL-unsafe characters as those outside the permissible set and replace all with underscores.
        for char in current_chars - sql_safe:
            self.basename = self.basename.replace(char, "_")

        #'genome' or 'protein' or 'protein and HMM'
        self.status = None
        #These will keep track of paths for each stage of file for us.
        self.genome = None
        self.protein = None
        self.hmm = None

        self.ran_hmmer = False

        #If pyrodigal is run, then the protein sequences are already loaded into memory.
        #We reuse them in kmer extraction instead of another I/O
        self.prepared_proteins = None

        self.intermediate = None

        self.crystalize = make_crystal
        self.best_hits = None
        self.best_hits_kmers = None

        self.protein_count = 0
        self.protein_kmer_count = {}

        self.trans_table = None
        self.start_time = None
        self.end_time = None
        self.err_log = ""
        #doesn't get updated otw.
        self.initial_state = "protein+HMM"

        self.verbose = verbosity

        #Check if the file failed to produce ANY SCP HMM hits.
        self.is_empty = False

        self.do_compress = do_compress

        self.crystal = None

        self.init_time = None
        #default to 0 time.
        self.prot_pred_time = None
        self.hmm_search_time = None
        self.besthits_time = None

    def curtime(self):
        time_format = "%d/%m/%Y %H:%M:%S"
        timer = datetime.datetime.now()
        time = timer.strftime(time_format)
        return time

    def partial_timings(self):
        protein_pred = self.prot_pred_time-self.init_time
        hmm_search = self.hmm_search_time-self.prot_pred_time
        besthits = self.besthits_time-self.hmm_search_time

        protein_pred = protein_pred.total_seconds()
        hmm_search = hmm_search.total_seconds()
        besthits = besthits.total_seconds()

        self.prot_pred_time = protein_pred
        self.hmm_search_time = hmm_search
        self.besthits_time = besthits

    #Functions for externally setting status and file paths of particular types
    def set_genome(self, path):
        self.status = 'genome'
        self.genome = path

    def set_protein(self, path):
        self.status = 'protein'
        self.protein = path

    def set_hmm(self, path):
        if self.protein is None:
            print("Warning! I don't have a protein yet, so this HMM will be useless to me until I do!")
        self.status = 'protein and hmm'
        self.hmm = path

    def set_crystal(self, path):
        self.status = 'crystal'
        self.crystal = path

    #Runs prodigal, compares translation tables and stores faa files
    def genome_to_protein(self):
        if self.genome is None:
            print(self.name, "wasn't declared as a genome! I can't make this into a protein!")
        else:
            protein_output = os.path.normpath(self.output + "/predicted_proteins/" + self.basename + '.faa')

            if self.do_compress:
                compress_level = "1"
            else:
                compress_level = "0"

            mn = pyrodigal_manager(file = self.genome, aa_out = protein_output, compare_against = [4], do_compress = compress_level)
            mn.run_for_fastaai()

            self.trans_table = str(mn.trans_table)

            for prot in mn.excluded_seqs:
                self.err_log += "Protein " + prot + " was observed to have >100K amino acids ( " + str(mn.excluded_seqs[prot]) + " AA found ). It will not be included in predicted proteins for this genome;"

            self.prepared_proteins = mn.labeled_proteins

            del mn

            #If there are zipped files leftover and we didn't want them, clean them up.
            if self.do_compress:
                self.set_protein(str(protein_output)+".gz")
                #Clean up unzipped version on reruns
                if os.path.exists(str(protein_output)):
                    os.remove(str(protein_output))
            else:
                self.set_protein(str(protein_output))
                #Clean up a zipped version on reruns
                if os.path.exists(str(protein_output)+".gz"):
                    os.remove(str(protein_output)+".gz")

            self.prot_pred_time = datetime.datetime.now()

    #run hmmsearch on a protein
    def protein_to_hmm(self):
        if self.protein is None:
            print(self.basename, "wasn't declared as a protein! I can't make this into an HMM!")
        else:

            folder = os.path.normpath(self.output + "/hmms")

            hmm_output = os.path.normpath(folder +"/"+ self.basename + '.hmm')

            if self.prepared_proteins is None:
                self.prepared_proteins, deflines = read_fasta(self.protein)

            hmm_manager.run_for_fastaai(self.prepared_proteins, hmm_output)

            self.ran_hmmer = True

            if self.do_compress:
                self.set_hmm(str(hmm_output)+".gz")
                if os.path.exists(str(hmm_output)):
                    os.remove(str(hmm_output))
            else:
                self.set_hmm(str(hmm_output))
                if os.path.exists(str(hmm_output)+".gz"):
                    os.remove(str(hmm_output)+".gz")

            self.hmm_search_time = datetime.datetime.now()

    #Translate tetramers to unique int32 indices.
    def unique_kmer_simple_key(self, seq):
        #num tetramers = len(seq) - 4 + 1, just make it -3.
        n_kmers = len(seq) - 3

        #Converts the characters in a sequence into their ascii int value
        as_ints = np.array([ord(i) for i in seq], dtype = np.int32)

        #create seq like 0,1,2,3; 1,2,3,4; 2,3,4,5... for each tetramer that needs a value
        kmers = np.arange(4*n_kmers)
        kmers = kmers % 4 + kmers // 4

        #Select the characters (as ints) corresponding to each tetramer all at once and reshape into rows of 4,
        #each row corresp. to a successive tetramer
        kmers = as_ints[kmers].reshape((n_kmers, 4))

        #Given four 2-digit numbers, these multipliers work as offsets so that all digits are preserved in order when summed
        mult = np.array([1000000, 10000, 100, 1], dtype = np.int32)

        #the fixed values effectively offset the successive chars of the tetramer by 2 positions each time;
        #practically, this is concatenation of numbers
        #Matrix mult does this for all values at once.
        return np.unique(np.dot(kmers, mult))

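# Worked example for one tetramer: "MKVA" has ord values [77, 75, 86, 65], and
# np.dot([77, 75, 86, 65], [1000000, 10000, 100, 1]) = 77758665 -- the four
# 2-digit ASCII codes concatenated into a single int32 key.
# The index trick: for n_kmers = 3, np.arange(12) % 4 + np.arange(12) // 4
# yields [0,1,2,3, 1,2,3,4, 2,3,4,5], i.e. the character positions of each
# successive, overlapping tetramer.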
    def load_hmm_and_filter_from_file(self):
        prots = []
        accs = []
        scores = []
        f = agnostic_reader(self.hmm)
        for line in f:
            if line.startswith("#"):
                continue
            else:
                segs = line.strip().split()

                if len(segs) < 9:
                    continue

                prots.append(segs[0])
                accs.append(segs[3])
                scores.append(segs[8])

        f.close()

        if len(prots) < 1:
            #No hits at all; nothing to filter.
            self.best_hits = {}
            return

        hmm_file = np.transpose(np.array([prots, accs, scores]))

        #hmm_file = np.loadtxt(hmm_file_name, comments = '#', usecols = (0, 3, 8), dtype=(str))
        #Sort the hmm file based on the score column in descending order.
        hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]

        #Identify the first row where each gene name appears, after sorting by score;
        #in effect, return the highest scoring assignment per gene name
        #Sort the indices of the result to match the score-sorted table instead of alphabetical order of gene names
        hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]

        #Filter the file again for the unique ACCESSION names, since we're only allowed one gene per accession, I guess?
        #Don't sort the indices, we don't care about the scores anymore.
        hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]

        sql_friendly_names = [i.replace(".", "_") for i in hmm_file[:,1]]
        self.best_hits = dict(zip(hmm_file[:,0], sql_friendly_names))

    #This should consider the domain by majority vote...
    def prot_and_hmm_to_besthits(self):
        if self.ran_hmmer:
            #Manager has a filter built in.
            self.best_hits = hmm_manager.best_hits
        else:
            #Load the best hits file via old numpy method.
            self.load_hmm_and_filter_from_file()

        hit_count = 0

        #from pyrodigal predictions or HMM intermediate production, the sequences are already in mem and don't need read in.
        if self.prepared_proteins is None:
            #But otherwise, we need to read them in.
            self.prepared_proteins, deflines = read_fasta(self.protein)

        self.protein_kmer_count = {}
        self.best_hits_kmers = {}

        if self.crystalize:
            crystal_record = []

        #Kmerize proteins and record metadata
        for protein in self.prepared_proteins:
            if protein in self.best_hits:
                accession = self.best_hits[protein]

                if self.crystalize:
                    crystal_record.append(str(protein)+"\t"+str(accession)+"\t"+str(self.prepared_proteins[protein])+"\n")

                kmer_set = self.unique_kmer_simple_key(self.prepared_proteins[protein])
                self.protein_kmer_count[accession] = kmer_set.shape[0]
                self.protein_count += 1
                self.best_hits_kmers[accession] = kmer_set
                hit_count += 1

            #Free the space either way
            self.prepared_proteins[protein] = None

        if self.crystalize:
            #only make a crystal if it actually has content.
            if len(crystal_record) > 0:
                crystal_path = os.path.normpath(self.output + "/crystals/" + self.basename + '_faai_crystal.txt')
                crystal_record = "".join(crystal_record)

                if self.do_compress:
                    crystal_record = crystal_record.encode()
                    crystal_writer = gzip.open(crystal_path+".gz", "wb")
                    crystal_writer.write(crystal_record)
                    crystal_writer.close()
                else:
                    crystal_writer = open(crystal_path, "w")
                    crystal_writer.write(crystal_record)
                    crystal_writer.close()

        #Final free.
        self.prepared_proteins = None

        #No HMM hits.
        if hit_count == 0:
            self.is_empty = True

        self.besthits_time = datetime.datetime.now()
        self.status = "best hits found"

    def preprocess(self):
        self.init_time = datetime.datetime.now()
        #default to 0 time.
        self.prot_pred_time = self.init_time
        self.hmm_search_time = self.init_time
        self.besthits_time = self.init_time

        #There's no advancement stage for protein and HMM
        if self.status == 'genome':
            start_time = self.curtime()
            #report = True
            if self.start_time is None:
                self.start_time = start_time

            if self.initial_state == "protein+HMM":
                self.initial_state = "genome"

            self.genome_to_protein()

        if self.status == 'protein':
            start_time = self.curtime()
            #report = True
            if self.start_time is None:
                self.start_time = start_time

            if self.initial_state == "protein+HMM":
                self.initial_state = "protein"

            self.protein_to_hmm()

        if self.status == 'protein and hmm':
            start_time = self.curtime()

            if self.start_time is None:
                self.start_time = start_time

            self.prot_and_hmm_to_besthits()

        #Add an end time if either genome -> protein -> HMM or protein -> HMM happened.
        if self.start_time is not None:
            end_time = self.curtime()
            self.end_time = end_time
        else:
            #Start was protein+HMM. There was no runtime, and initial state is p+hmm
            #self.initial_state = "protein+HMM"
            self.start_time = "N/A"
            self.end_time = "N/A"

        #Protein not generated on this run.
        if self.trans_table is None:
            self.trans_table = "unknown"

        self.partial_timings()

'''
Utility functions
'''
def prepare_directories(output, status, build_or_query, make_crystals = False):
    preparation_successful = True

    if not os.path.exists(output):
        try:
            os.mkdir(output)
        except:
            print("")
            print("FastAAI tried to make output directory: '"+ output + "' but failed.")
            print("")
            print("Troubleshooting:")
            print("")
            print("    (1) Do you have permission to create directories in the location you specified?")
            print("    (2) Did you make sure that all directories other than", os.path.basename(output), "already exist?")
            print("")
            preparation_successful = False

    if preparation_successful:
        try:
            if status == 'genome':
                if not os.path.exists(os.path.normpath(output + "/" + "predicted_proteins")):
                    os.mkdir(os.path.normpath(output + "/" + "predicted_proteins"))
                if not os.path.exists(os.path.normpath(output + "/" + "hmms")):
                    os.mkdir(os.path.normpath(output + "/" + "hmms"))

            if status == 'protein':
                if not os.path.exists(os.path.normpath(output + "/" + "hmms")):
                    os.mkdir(os.path.normpath(output + "/" + "hmms"))

            if make_crystals:
                if not os.path.exists(os.path.normpath(output + "/" + "crystals")):
                    os.mkdir(os.path.normpath(output + "/" + "crystals"))

            if build_or_query == "build":
                if not os.path.exists(os.path.normpath(output + "/" + "database")):
                    os.mkdir(os.path.normpath(output + "/" + "database"))

            if build_or_query == "query":
                if not os.path.exists(os.path.normpath(output + "/" + "results")):
                    os.mkdir(os.path.normpath(output + "/" + "results"))


        except:
            print("FastAAI was able to create or find", output, "but couldn't make directories there.")
            print("")
            print("This shouldn't happen. Do you have permission to write to that directory?")


    return preparation_successful

def find_hmm():
    hmm_path = None
    try:
        #Try to locate the data bundled as it would be with a pip/conda install.
        script_path = os.path.dirname(sys.modules['fastAAI_HMM_models'].__file__)
        if len(script_path) == 0:
            script_path = "."
        hmm_complete_model = os.path.abspath(os.path.normpath(script_path + '/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm'))
        hmm_path = str(hmm_complete_model)
        #Check that the file exists or fail to the except.
        fh = open(hmm_path)
        fh.close()
    except:
        #Look in the same dir as the script; old method/MiGA friendly
        script_path = os.path.dirname(__file__)
        if len(script_path) == 0:
            script_path = "."
        hmm_complete_model = os.path.abspath(os.path.normpath(script_path +"/"+ "00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm"))
        hmm_path = str(hmm_complete_model)

    return hmm_path

#Build DB from genomes

def unique_kmers(seq, ksize):
    n_kmers = len(seq) - ksize + 1
    kmers = []
    for i in range(n_kmers):
        kmers.append(kmer_index[seq[i:i + ksize]])
    #We care about the type because we're working with bytes later.
    return np.unique(kmers).astype(np.int32)

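# unique_kmers relies on a module-level kmer_index mapping k-mer strings to
# integer IDs; it is presumably defined later in the full file (this hunk shows
# only the first ~1,370 of its 4,805 lines). A minimal sketch of the idea for
# ksize = 4 over the 20-letter amino acid alphabet (an illustration, not the
# file's actual definition):
#
#	from itertools import product
#	kmer_index = {''.join(k): i for i, k in enumerate(product("ACDEFGHIKLMNPQRSTVWY", repeat = 4))}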
def split_seq(seq, num_grps):
    newseq = []
    splitsize = 1.0/num_grps*len(seq)
    for i in range(num_grps):
        newseq.append(seq[int(round(i*splitsize)):int(round((i+1)*splitsize))])
    return newseq

#gives the max and min index needed to split a list of (max_val) genomes into num_grps groups
def split_indicies(max_val, num_grps):
    newseq = []
    splitsize = 1.0/num_grps*max_val
    for i in range(num_grps):
        newseq.append(((round(i*splitsize)), round((i+1)*splitsize)))
    return newseq

def split_seq_indices(seq, num_grps):
    newseq = []
    splitsize = 1.0/num_grps*len(seq)
    for i in range(num_grps):
        newseq.append((int(round(i*splitsize)), int(round((i+1)*splitsize)),))
    return newseq

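# Worked example: split_indicies(10, 3) -> [(0, 3), (3, 7), (7, 10)], since
# splitsize = 10/3 and round(i * splitsize) lands at 0, 3, 7, 10; each pair is
# a (start, stop) slice, together covering all 10 items with no overlap.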
def list_to_index_dict(list):
    result = {}
    counter = 0
    for item in list:
        result[item] = counter
        counter += 1
    return result


def rev_list_to_index_dict(list):
    result = {}
    counter = 0
    for item in list:
        result[counter] = item
        counter += 1
    return result

def generate_accessions_index(forward = True):
    acc_list = ['PF01780_19', 'PF03948_14', 'PF17144_4', 'PF00830_19', 'PF00347_23', 'PF16906_5', 'PF13393_6',
        'PF02565_15', 'PF01991_18', 'PF01984_20', 'PF00861_22', 'PF13656_6', 'PF00368_18', 'PF01142_18', 'PF00312_22', 'PF02367_17',
        'PF01951_16', 'PF00749_21', 'PF01655_18', 'PF00318_20', 'PF01813_17', 'PF01649_18', 'PF01025_19', 'PF00380_19', 'PF01282_19',
        'PF01864_17', 'PF01783_23', 'PF01808_18', 'PF01982_16', 'PF01715_17', 'PF00213_18', 'PF00119_20', 'PF00573_22', 'PF01981_16',
        'PF00281_19', 'PF00584_20', 'PF00825_18', 'PF00406_22', 'PF00177_21', 'PF01192_22', 'PF05833_11', 'PF02699_15', 'PF01016_19',
        'PF01765_19', 'PF00453_18', 'PF01193_24', 'PF05221_17', 'PF00231_19', 'PF00416_22', 'PF02033_18', 'PF01668_18', 'PF00886_19',
        'PF00252_18', 'PF00572_18', 'PF00366_20', 'PF04104_14', 'PF04919_12', 'PF01912_18', 'PF00276_20', 'PF00203_21', 'PF00889_19',
        'PF02996_17', 'PF00121_18', 'PF01990_17', 'PF00344_20', 'PF00297_22', 'PF01196_19', 'PF01194_17', 'PF01725_16', 'PF00750_19',
        'PF00338_22', 'PF00238_19', 'PF01200_18', 'PF00162_19', 'PF00181_23', 'PF01866_17', 'PF00709_21', 'PF02006_16', 'PF00164_25',
        'PF00237_19', 'PF01139_17', 'PF01351_18', 'PF04010_13', 'PF06093_13', 'PF00828_19', 'PF02410_15', 'PF01176_19', 'PF02130_17',
        'PF01948_18', 'PF01195_19', 'PF01746_21', 'PF01667_17', 'PF03874_16', 'PF01090_19', 'PF01198_19', 'PF01250_17', 'PF17136_4',
        'PF06026_14', 'PF03652_15', 'PF04019_12', 'PF01201_22', 'PF00832_20', 'PF01264_21', 'PF03840_14', 'PF00831_23', 'PF00189_20',
        'PF02601_15', 'PF01496_19', 'PF00411_19', 'PF00334_19', 'PF00687_21', 'PF01157_18', 'PF01245_20', 'PF01994_16', 'PF01632_19',
        'PF00827_17', 'PF01015_18', 'PF00829_21', 'PF00410_19', 'PF00833_18', 'PF00935_19', 'PF01992_16']
    if forward:
        list_of_poss_accs = list_to_index_dict(acc_list)
    else:
        list_of_poss_accs = rev_list_to_index_dict(acc_list)

    return list_of_poss_accs

|
1347
|
+
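#Usage note: the forward index maps accession names to integer IDs in list
#order, so generate_accessions_index()['PF01780_19'] == 0, and
#generate_accessions_index(forward = False)[0] == 'PF01780_19' inverts it.
#These integer IDs are what the database tables store in place of names.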
#Build or add to a FastAAI DB
def build_db_opts():
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
	description='''
	This FastAAI module allows you to create a FastAAI database from one or many genomes, proteins, or proteins and HMMs, or add these files to an existing one.

	Supply genomes OR proteins OR proteins AND HMMs as inputs.

	If you supply genomes, FastAAI will predict proteins from them, and HMMs will be created from those proteins.
	If you supply only proteins, FastAAI will create HMM files from them, searching against FastAAI's internal database.
	If you supply proteins AND HMMs, FastAAI will directly use them to build the database.\n
	You cannot supply both genomes and proteins.
	''')

	parser.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'A directory containing genomes in FASTA format.')
	parser.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'A directory containing protein amino acids in FASTA format.')
	parser.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'A directory containing the results of an HMM search on a set of proteins.')
	parser.add_argument('-d', '--database', dest = 'db_name', default = "FastAAI_database.sqlite.db", help = 'The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory. FastAAI_database.sqlite.db by default.')

	parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

	parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
	parser.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')

	args, unknown = parser.parse_known_args()

	return parser, args
def run_build(input_file):
	input_file.preprocess()
	if len(input_file.best_hits_kmers) < 1:
		input_file.best_hits_kmers = None
		input_file.err_log += " This file did not successfully complete. No SCPs could be found."

	return input_file
def acc_transformer_init(db, tempdir_path):
	sqlite3.register_converter("array", convert_array)
	global indb
	indb = db
	global temp_dir
	temp_dir = tempdir_path
	global ok
	ok = generate_accessions_index()
def acc_transformer(acc_name):
	source = sqlite3.connect(indb)
	scurs = source.cursor()

	data = scurs.execute("SELECT * FROM {acc}_genomes".format(acc=acc_name)).fetchall()

	scurs.close()
	source.close()

	reformat = {}

	for row in data:
		genome, kmers = row[0], np.frombuffer(row[1], dtype=np.int32)
		for k in kmers:
			if k not in reformat:
				reformat[k] = []
			reformat[k].append(genome)

	data = None

	to_add = []
	for k in reformat:
		as_bytes = np.array(reformat[k], dtype = np.int32)
		as_bytes = as_bytes.tobytes()
		reformat[k] = None
		to_add.append((int(k), as_bytes,))

	my_acc_db = os.path.normpath(temp_dir + "/"+acc_name+".db")

	if os.path.exists(my_acc_db):
		os.remove(my_acc_db)

	my_db = sqlite3.connect(my_acc_db)
	curs = my_db.cursor()
	curs.execute("CREATE TABLE {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=acc_name))
	my_db.commit()

	curs.executemany("INSERT INTO {acc} VALUES (?, ?)".format(acc = acc_name), to_add)

	my_db.commit()

	to_add = None

	curs.execute("CREATE INDEX {acc}_index ON {acc} (kmer)".format(acc=acc_name))
	my_db.commit()

	curs.close()
	my_db.close()

	return [my_acc_db, acc_name]
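#What acc_transformer() computes, on a tiny example: an {acc}_genomes table
#mapping genomes to k-mer sets, e.g. {g0: [5, 9], g1: [5]}, is inverted into a
#k-mer -> genomes index, {5: [g0, g1], 9: [g0]}, and written to a per-accession
#temp database so the parent process can merge it into the final database.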
def build_db(genomes, proteins, hmms, db_name, output, threads, verbose, do_compress):
	success = True

	imported_files = fastaai_file_importer(genomes = genomes, proteins = proteins, hmms = hmms, output = output, compress = do_compress)
	imported_files.determine_inputs()

	if imported_files.error:
		print("Exiting FastAAI due to input file error.")
		quit()

	good_to_go = prepare_directories(output, imported_files.status, "query")

	db_path = os.path.normpath(output + "/database")
	if not os.path.exists(db_path):
		os.mkdir(db_path)

	if not good_to_go:
		print("Exiting FastAAI")
		sys.exit()

	print("")

	hmm_path = find_hmm()

	#Check if the db contains path info. Incl. windows version.
	if "/" not in db_name and "\\" not in db_name:
		final_database = os.path.normpath(output + "/database/" + db_name)
	else:
		#If the person insists that the db has a path, let them.
		final_database = db_name

	#We'll skip trying this if the file already exists.
	existing_genome_IDs = None
	try:
		if os.path.exists(final_database):
			parent = sqlite3.connect(final_database)
			curs = parent.cursor()

			existing_genome_IDs = {}
			sql_command = "SELECT genome, gen_id FROM genome_index"
			for result in curs.execute(sql_command).fetchall():
				genome = result[0]
				id = int(result[1])
				existing_genome_IDs[genome] = id

			curs.close()
			parent.close()
	except:
		print("You specified an existing file to be a database, but it does not appear to be a FastAAI database.")
		print("FastAAI will not be able to continue. Please give FastAAI a different database name and continue.")
		print("Exiting.")
		success = False
	if success:
		hmm_file = find_hmm()
		if existing_genome_IDs is not None:
			genome_idx = max(list(existing_genome_IDs.values()))+1
		else:
			existing_genome_IDs = {}
			genome_idx = 0

		#return_to
		td = tempfile.mkdtemp()
		#if not os.path.exists(td):
		#	os.mkdir(td)

		temp_db = os.path.normpath(td+"/FastAAI_temp_db.db")

		if os.path.exists(temp_db):
			os.remove(temp_db)

		sqlite3.register_converter("array", convert_array)
		worker = sqlite3.connect(temp_db)
		wcurs = worker.cursor()
		wcurs.execute("CREATE TABLE genome_index (genome text, gen_id integer, protein_count integer)")
		wcurs.execute("CREATE TABLE genome_acc_kmer_counts (genome integer, accession integer, count integer)")
		ok = generate_accessions_index()
		for t in ok:
			wcurs.execute("CREATE TABLE " + t + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")

		worker.commit()

		new_gens = []
		new_gak = []
		accs_seen = {}
		if verbose:
			tracker = progress_tracker(total = len(imported_files.in_files), message = "Processing inputs")
		else:
			print("Processing inputs")

		#Only build_db makes a log.
		if not os.path.exists(os.path.normpath(output + "/" + "logs")):
			os.mkdir(os.path.normpath(output + "/" + "logs"))

		logger = open(os.path.normpath(output+"/logs/"+"FastAAI_preprocessing_log.txt"), "a")
		print("file", "start_date", "end_date", "starting_format",
		"prot_prediction_time", "trans_table", "hmm_search_time", "besthits_time",
		"errors", sep = "\t", file = logger)

		pool = multiprocessing.Pool(threads, initializer = hmm_preproc_initializer,
		initargs = (hmm_file, do_compress,))
		for result in pool.imap(run_build, imported_files.in_files):
			#log data, regardless of kind
			print(result.basename, result.start_time, result.end_time, result.initial_state,
			result.prot_pred_time, result.trans_table, result.hmm_search_time, result.besthits_time,
			result.err_log, sep = "\t", file = logger)

			if result.best_hits_kmers is not None:
				genome_name = result.original_name

				if genome_name in existing_genome_IDs:
					print(genome_name, "Already present in final database and will be skipped.")
					print("")
				else:
					protein_count = result.protein_count
					for acc_name in result.best_hits_kmers:
						if acc_name not in accs_seen:
							accs_seen[acc_name] = 0
						acc_id = ok[acc_name]
						kmer_ct = result.protein_kmer_count[acc_name]
						kmers = result.best_hits_kmers[acc_name]
						kmers = kmers.tobytes()
						wcurs.execute("INSERT INTO {acc}_genomes VALUES (?, ?)".format(acc=acc_name), (genome_idx, kmers,))
						new_gak.append((genome_idx, acc_id, kmer_ct,))

					new_gens.append((genome_name, genome_idx, protein_count,))
					genome_idx += 1

					worker.commit()

			if verbose:
				tracker.update()

		pool.close()

		logger.close()

		wcurs.executemany("INSERT INTO genome_index VALUES (?,?,?)", new_gens)
		wcurs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?,?,?)", new_gak)
		worker.commit()

		wcurs.close()
		worker.close()

		accs_seen = list(accs_seen.keys())
		parent = sqlite3.connect(final_database)
		curs = parent.cursor()

		curs.execute("attach '" + temp_db + "' as worker")
		#initialize if needed.
		curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
		curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")

		curs.execute("INSERT INTO genome_index SELECT * FROM worker.genome_index")
		curs.execute("INSERT INTO genome_acc_kmer_counts SELECT * FROM worker.genome_acc_kmer_counts")
		curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")
		parent.commit()

		if verbose:
			tracker = progress_tracker(total = len(accs_seen), message = "Collecting results")
		else:
			print("Collecting results")

		pool = multiprocessing.Pool(threads, initializer = acc_transformer_init,
		initargs = (temp_db, td,))

		for result in pool.imap_unordered(acc_transformer, accs_seen):
			database, accession = result[0], result[1]
			curs.execute("CREATE TABLE IF NOT EXISTS {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=accession))
			curs.execute("CREATE TABLE IF NOT EXISTS {acc}_genomes (genome INTEGER PRIMARY KEY, kmers array)".format(acc=accession))
			curs.execute("CREATE INDEX IF NOT EXISTS {acc}_index ON {acc}(kmer)".format(acc=accession))

			#Get the genomes from worker db.
			curs.execute("INSERT INTO {acc}_genomes SELECT * FROM worker.{acc}_genomes".format(acc=accession))

			parent.commit()

			accdb = sqlite3.connect(database)
			acc_curs = accdb.cursor()

			#genomes is selected twice on purpose: the first copy supplies the
			#INSERT values, the second feeds the concatenation in the upsert below.
			to_update = acc_curs.execute("SELECT kmer, genomes, genomes FROM {acc}".format(acc=accession)).fetchall()

			acc_curs.close()
			accdb.close()

			update_concat_sql = "INSERT INTO {acc} VALUES (?,?) ON CONFLICT(kmer) DO UPDATE SET genomes=genomes || (?)".format(acc=accession)
			#ON CONFLICT(kmer) DO UPDATE SET genomes=genomes || acc.{acc}.genomes;".format(acc=accession)
			#print(update_concat_sql)
			curs.executemany(update_concat_sql, to_update)

			parent.commit()

			os.remove(database)

			if verbose:
				tracker.update()

		pool.close()

		curs.execute("detach worker")

		parent.commit()

		curs.close()
		parent.close()

		os.remove(temp_db)
		try:
			if len(os.listdir(td)) == 0:
				shutil.rmtree(td)
		except:
			pass

	if success:
		print("Database build complete!")

	return success
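#Example (hypothetical direct call; in normal use these arguments come from the
#command line via build_db_opts()):
#
#	build_db(genomes = "my_genomes_dir", proteins = None, hmms = None,
#		db_name = "FastAAI_database.sqlite.db", output = "FastAAI",
#		threads = 4, verbose = True, do_compress = False)
#
#"my_genomes_dir" is a placeholder for a directory of FASTA genomes; all other
#values shown are the module's documented defaults apart from threads and verbose.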
def file_v_db_initializer(tgak, tgt_names, tgt_cts, hmm_file, do_compress, tgt_ct, sd, out, style, in_mem, build_q, tdb):
	#num_tgts, self.do_sd, self.output, self.style, self.as_mem_db, self.do_db_build
	global _tdb
	_tdb = tdb

	global _tgt_gak
	_tgt_gak = tgak

	global _tname
	_tname = tgt_names

	global _tct
	_tct = tgt_cts

	global hmm_manager
	hmm_manager = pyhmmer_manager(do_compress)
	hmm_manager.load_hmm_from_file(hmm_file)

	global num_tgts
	num_tgts = tgt_ct

	global _do_sd
	_do_sd = sd

	global out_style
	out_style = style

	global out_base
	out_base = out

	global db_is_in_mem
	db_is_in_mem = in_mem

	global make_query_db
	make_query_db = build_q

	return _tdb, _tgt_gak, _tname, _tct, hmm_manager, num_tgts, _do_sd, out_base, out_style, db_is_in_mem, make_query_db
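#Design note: file_v_db_initializer() uses the multiprocessing initializer
#pattern: shared, read-only state (target k-mer counts, names, the HMM model,
#output settings) is installed once as module-level globals in each worker
#process, rather than being re-pickled and shipped with every task that
#pool.imap() dispatches to file_v_db_worker().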
|
1702
|
+
def file_v_db_worker(query_args):
|
1703
|
+
#query info for this particular query
|
1704
|
+
in_file = query_args[0]
|
1705
|
+
|
1706
|
+
in_file.preprocess()
|
1707
|
+
|
1708
|
+
qname = in_file.basename
|
1709
|
+
|
1710
|
+
do_sd = _do_sd
|
1711
|
+
|
1712
|
+
#std dev. calcs are not meaningful with matrix style output.
|
1713
|
+
if out_style == "matrix":
|
1714
|
+
do_sd = False
|
1715
|
+
|
1716
|
+
if do_sd:
|
1717
|
+
results = []
|
1718
|
+
shared_acc_counts = []
|
1719
|
+
else:
|
1720
|
+
results = np.zeros(shape = num_tgts, dtype = np.float_)
|
1721
|
+
shared_acc_counts = np.zeros(shape = num_tgts, dtype = np.int32)
|
1722
|
+
|
1723
|
+
if db_is_in_mem:
|
1724
|
+
#The connection is already given as MDB if the db is in mem
|
1725
|
+
tconn = _tdb
|
1726
|
+
else:
|
1727
|
+
#db is on disk and the connection has to be established.
|
1728
|
+
tconn = sqlite3.connect(_tdb)
|
1729
|
+
|
1730
|
+
tcurs = tconn.cursor()
|
1731
|
+
|
1732
|
+
#This is a difference from the DB-first method.
|
1733
|
+
acc_idx = generate_accessions_index(forward = True)
|
1734
|
+
|
1735
|
+
genome_lists = {}
|
1736
|
+
|
1737
|
+
tcurs.row_factory = lambda cursor, row: row[0]
|
1738
|
+
|
1739
|
+
|
1740
|
+
if make_query_db:
|
1741
|
+
ret = [qname, None, []]
|
1742
|
+
else:
|
1743
|
+
ret = [qname, None, None]
|
1744
|
+
|
1745
|
+
#We need to purge accsessions not in tgt.
|
1746
|
+
for acc in in_file.best_hits_kmers:
|
1747
|
+
one = in_file.best_hits_kmers[acc]
|
1748
|
+
acc_id = acc_idx[acc]
|
1749
|
+
|
1750
|
+
if make_query_db:
|
1751
|
+
ret[2].append((qname, acc_id, one.tobytes(),))
|
1752
|
+
|
1753
|
+
#Check working.
|
1754
|
+
if acc_id in _tgt_gak:
|
1755
|
+
|
1756
|
+
kmer_ct = one.shape[0]
|
1757
|
+
|
1758
|
+
if do_sd:
|
1759
|
+
hits = np.zeros(shape = num_tgts, dtype = np.int32)
|
1760
|
+
hits[np.nonzero(_tgt_gak[acc_id])] = 1
|
1761
|
+
shared_acc_counts.append(hits)
|
1762
|
+
else:
|
1763
|
+
shared_acc_counts[np.nonzero(_tgt_gak[acc_id])] += 1
|
1764
|
+
|
1765
|
+
			#SQLite's default limit on bound parameters per statement is 999,
			#so longer k-mer lists go through a temp-table join instead of IN (...).
			if kmer_ct > 998:
				#Each kmer needs to be a tuple.
				these_kmers = [(int(kmer),) for kmer in one]

				temp_name = "_" + qname +"_" + acc
				temp_name = temp_name.replace(".", "_")

				tcurs.execute("CREATE TEMP TABLE " + temp_name + " (kmer INTEGER)")
				tconn.commit()
				insert_table = "INSERT INTO " + temp_name + " VALUES (?)"
				tcurs.executemany(insert_table, these_kmers)
				tconn.commit()
				join_and_select_sql = "SELECT genomes FROM " + temp_name + " INNER JOIN " + acc + " ON "+ temp_name+".kmer = " + acc+".kmer;"

				set = tcurs.execute(join_and_select_sql).fetchall()
			else:
				#kmers must be a list, not a tuple.
				these_kmers = [int(kmer) for kmer in one]
				select = "SELECT genomes FROM " + acc + " WHERE kmer IN ({kmers})".format(kmers=','.join(['?']*len(these_kmers)))

				set = tcurs.execute(select, these_kmers).fetchall()

			#join results into one bytestring.
			set = b''.join(set)

			these_intersections = np.bincount(np.frombuffer(set, dtype = np.int32), minlength = num_tgts)
			set = None
			#Add tgt kmer counts to query kmer counts, find union size based on intersection size, calc Jaccard:
			#jaccard = intersection / (|query| + |target| - intersection)
			jacc = np.divide(these_intersections, np.subtract(np.add(_tgt_gak[acc_id], kmer_ct), these_intersections))

			if do_sd:
				results.append(jacc)
			else:
				results += jacc

	tcurs.row_factory = None
	tcurs.close()
	if do_sd:
		results = np.vstack(results)
		has_accs = np.vstack(shared_acc_counts)

		shared_acc_counts = np.sum(has_accs, axis = 0)

		#final jacc_means
		jaccard_averages = np.divide(np.sum(results, axis = 0), shared_acc_counts)

		aai_ests = numpy_kaai_to_aai(jaccard_averages)

		#find diffs from means; this includes indices corresponding to unshared SCPs that should not be included.
		results = results - jaccard_averages

		#fix those corresponding indices to not contribute to the final SD.
		results[np.nonzero(has_accs == 0)] = 0

		#Square them
		results = np.square(results)
		#Sum squares, divide by shared acc. count, then sqrt to get SD.
		jaccard_SDs = np.sqrt(np.divide(np.sum(results, axis = 0), shared_acc_counts))
		jaccard_SDs = np.round(jaccard_SDs, 4).astype(str)

	else:
		#other condition.
		jaccard_SDs = None
		jaccard_averages = np.divide(results, shared_acc_counts)
		#we don't want to pass char arrays to main, so skip this here and do it in main instead.
		if out_style != "matrix":
			aai_ests = numpy_kaai_to_aai(jaccard_averages)

	del results

	#Since the outputs go to separate files, it makes more sense to do them within the worker processes instead of in main.
	if out_style == "tsv":
		no_hit = np.where(shared_acc_counts == 0)

		possible_hits = np.minimum(len(in_file.best_hits_kmers), _tct).astype(str)
		jaccard_averages = np.round(jaccard_averages, 4).astype(str)
		shared_acc_counts = shared_acc_counts.astype(str)

		jaccard_averages[no_hit] = "N/A"
		aai_ests[no_hit] = "N/A"
		shared_acc_counts[no_hit] = "N/A"
		possible_hits[no_hit] = "N/A"

		output_name = os.path.normpath(out_base + "/"+qname+"_results.txt")

		out = open(output_name, "w")
		out.write("query\ttarget\tavg_jacc_sim\tjacc_SD\tnum_shared_SCPs\tposs_shared_SCPs\tAAI_estimate\n")
		if do_sd:
			jaccard_SDs[no_hit] = "N/A"
			for i in range(0, len(aai_ests)):
				out.write(qname+"\t"+_tname[i]+"\t"+jaccard_averages[i]+"\t"+jaccard_SDs[i]+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
		else:
			for i in range(0, len(aai_ests)):
				out.write(qname+"\t"+_tname[i]+"\t"+jaccard_averages[i]+"\t"+"N/A"+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
		out.close()

	#We're just gonna pass this back to the main to print.
	if out_style == "matrix":
		ret[1] = jaccard_averages

	return ret
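#Return contract of file_v_db_worker(): ret[0] is the query name, ret[1] holds
#the per-target Jaccard averages when the output style is "matrix" (None for
#tsv, where the worker writes its own results file), and ret[2] holds
#(name, acc_id, kmer bytes) tuples when a query database is being built, else None.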
#Handles both query and target types for a db vs db query
class file_vs_db_query:
	def __init__(self, in_memory = False, input_file_objects = None,
	target = None, threads = 1, do_sd = False, output_base = "FastAAI", output_style = "tsv",
	build_db_from_queries = True, qdb_name = "Query_FastAAI_database.db", hmm_path = None,
	do_comp = True, verbose = True):
		#files to work with
		self.queries = input_file_objects
		self.do_db_build = build_db_from_queries
		self.dbname = qdb_name

		self.t = target
		self.valids = None

		#Originally written as an in-memory-database-only block of code, but a single if/else change makes it work on disk too, so it needed no rework.
		self.as_mem_db = in_memory

		self.t_conn = None
		self.t_curs = None

		self.threads = threads
		self.do_sd = do_sd

		self.output_base = output_base
		self.output = os.path.normpath(output_base + "/results")
		self.style = output_style

		if hmm_path is not None:
			self.hmm_path = hmm_path
		else:
			self.hmm_path = find_hmm()

		self.do_comp = do_comp

		self.verbose = verbose

	'''
	Workflow is:
		load target db as mem (optional)
		assess valid targets
		create query db output (optional)
		pass query args to workers
			preproc query args
			write results
		fill query_db_out (optional)
	'''
	def open(self):
		if self.as_mem_db:
			self.t_conn = sqlite3.connect(':memory:')
		else:
			self.t_conn = sqlite3.connect(self.t)

		self.t_curs = self.t_conn.cursor()

		if self.as_mem_db:
			self.t_curs.execute("attach '" + self.t + "' as targets")

			self.t_curs.execute("CREATE TABLE genome_index AS SELECT * FROM targets.genome_index")
			self.t_curs.execute("CREATE TABLE genome_acc_kmer_counts AS SELECT * FROM targets.genome_acc_kmer_counts")
			self.t_curs.execute("CREATE INDEX t_gi ON genome_index (gen_id)")
			self.t_curs.execute("CREATE INDEX t_gak ON genome_acc_kmer_counts (accession)")

		if self.as_mem_db:
			table_sql = "SELECT name FROM targets.sqlite_master"
		else:
			table_sql = "SELECT name FROM sqlite_master"

		ok = generate_accessions_index()
		ok_names = set(list(ok.keys()))
		successful_tables = []

		for name in self.t_curs.execute(table_sql).fetchall():
			name = name[0]
			if name in ok_names:
				successful_tables.append(ok[name])
				if self.as_mem_db:
					self.t_curs.execute("CREATE TABLE " + name + " AS SELECT * FROM targets."+name)
					self.t_curs.execute("CREATE INDEX "+name+"_index ON " + name+" (kmer)" )

		if self.as_mem_db:
			self.t_conn.commit()
			self.t_curs.execute("detach targets")

		self.valids = tuple(successful_tables)
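	#Design note: with in_memory = True, open() ATTACHes the on-disk target and
	#clones genome_index, genome_acc_kmer_counts, and every recognized accession
	#table into the :memory: connection via CREATE TABLE ... AS SELECT, then
	#rebuilds the k-mer indexes there, so per-query lookups hit RAM rather than disk.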
	def close(self):
		self.t_curs.close()
		self.t_curs = None

	def clean_up(self):
		self.t_conn.close()
		self.t_conn = None

	def sqlite_table_schema(self, conn, name):
		"""Return a string representing the table's CREATE"""
		cursor = conn.execute("SELECT sql FROM sqlite_master WHERE name=?;", [name])
		sql = cursor.fetchone()[0]
		cursor.close()
		return sql
	def execute(self):
		print("FastAAI is running.")
		tgt_id_res = self.t_curs.execute("SELECT * FROM genome_index ORDER BY gen_id").fetchall()

		tgt_ids = []
		tgt_naming = []
		tgt_counts = []
		for r in tgt_id_res:
			genome, id, prot_ct = r[0], r[1], r[2]
			tgt_ids.append(genome)
			tgt_naming.append(genome)
			tgt_counts.append(prot_ct)

		num_tgts = len(tgt_ids)
		tgt_counts = np.array(tgt_counts, dtype = np.int32)

		tgts_gak = {}
		gak_sql = "SELECT * FROM genome_acc_kmer_counts WHERE accession in ({accs})".format(accs=','.join(['?']*len(self.valids)))

		for result in self.t_curs.execute(gak_sql, self.valids).fetchall():
			genome, acc, ct = result[0], result[1], result[2]
			if acc not in tgts_gak:
				tgts_gak[acc] = np.zeros(num_tgts, dtype = np.int32)
			tgts_gak[acc][genome] += ct

		#If the DB is a memory DB, the connection must stay alive, but main no longer needs the cursor in either case.
		self.close()

		query_groups = []

		for query_input in self.queries:
			query_groups.append((query_input,))

		#And if it's a physical database, we do want to close it.
		if not self.as_mem_db:
			self.t_conn.close()

		num_queries = len(query_groups)
		if self.do_db_build:
			sqlite3.register_converter("array", convert_array)
			qdb_path = os.path.normpath(self.output_base + "/database/"+self.dbname)
			if not os.path.exists(os.path.normpath(self.output_base + "/database")):
				try:
					os.mkdir(os.path.normpath(self.output_base + "/database"))
				except:
					print("Couldn't make database at", qdb_path)
					self.do_db_build = False

			if os.path.exists(qdb_path):
				print("Database for queries already exists. I can't make one at:", qdb_path)
				self.do_db_build = False
			else:
				query_db_conn = sqlite3.connect(qdb_path)
				q_curs = query_db_conn.cursor()
				q_curs.execute("CREATE TABLE storage (genome INTEGER, accession INTEGER, kmers array)")
				q_curs.execute("CREATE INDEX store_idx ON storage (genome, accession)")
				query_genome_index = []
				qgi_ct = 0
				qg_gak = []
		if self.verbose:
			tracker = progress_tracker(total = num_queries, message = "Calculating AAI...", one_line = True)

		if self.style == "matrix":
			output_name = os.path.normpath(self.output + "/FastAAI_matrix.txt")
			output = open(output_name, "w")
			#needs target names.
			print("query_genome", *tgt_ids, sep = "\t", file = output)

		#Need to pass these to the worker initializers.

		#both initializers will share this.
		shared_args = [tgts_gak, tgt_naming, tgt_counts, self.hmm_path, self.do_comp, num_tgts, self.do_sd, self.output,
		self.style, self.as_mem_db, self.do_db_build]

		if self.as_mem_db:
			shared_args.append(self.t_conn)
			shared_args = tuple(shared_args)
			pool = multiprocessing.Pool(self.threads, initializer = file_v_db_initializer,
			initargs = shared_args)
		else:
			#db is on disk.
			shared_args.append(self.t)
			shared_args = tuple(shared_args)
			pool = multiprocessing.Pool(self.threads, initializer = file_v_db_initializer,
			initargs = shared_args)
		for result in pool.imap(file_v_db_worker, query_groups):
			if self.verbose:
				tracker.update()
			qname = result[0]
			if self.style == "matrix":
				printout = numpy_kaai_to_aai(result[1])
				print(qname, *printout, sep = "\t", file = output)

			if self.do_db_build:
				query_genome_index.append((qname, qgi_ct, len(result[2]),))
				for row in result[2]:
					num_kmers = int(len(row[2])/4)
					qg_gak.append((qgi_ct, row[1], num_kmers,))
				qgi_ct += 1
				q_curs.executemany("INSERT INTO storage VALUES (?, ?, ?)", result[2])
				query_db_conn.commit()

		pool.close()

		if self.style == "matrix":
			output.close()
		if self.do_db_build:
			q_curs.execute("CREATE TABLE genome_index (genome text, gen_id integer, protein_count integer)")
			q_curs.execute("CREATE TABLE genome_acc_kmer_counts (genome integer, accession integer, count integer)")
			q_curs.executemany("INSERT INTO genome_index VALUES (?,?,?)", query_genome_index)
			q_curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?,?,?)", qg_gak)
			query_db_conn.commit()

			acc_id_to_name = generate_accessions_index(forward = False)
			qgi_dict = {}
			for tup in query_genome_index:
				qgi_dict[tup[0]] = tup[1]

			accs_in_db = q_curs.execute("SELECT DISTINCT(accession) FROM genome_acc_kmer_counts").fetchall()
			if self.verbose:
				tracker = progress_tracker(total = len(accs_in_db), message = "Crafting database from query outputs.", one_line = True)

			for acc in accs_in_db:
				acc = acc[0]
				acc_name = acc_id_to_name[acc]
				q_curs.execute("CREATE TABLE " + acc_name + " (kmer INTEGER PRIMARY KEY, genomes array)")
				q_curs.execute("CREATE TABLE " + acc_name + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")
				data = q_curs.execute("SELECT genome, kmers FROM storage WHERE accession = ?", (acc,)).fetchall()

				ins = []
				#group by kmer
				kmers_by_gen = {}
				for row in data:
					gen = row[0]
					gen = qgi_dict[gen]
					kmers = np.frombuffer(row[1], dtype = np.int32)
					ins.append((gen, kmers,))
					for k in kmers:
						#typecast
						k = int(k)
						if k not in kmers_by_gen:
							kmers_by_gen[k] = []
						kmers_by_gen[k].append(gen)

				data = None

				q_curs.executemany("INSERT INTO "+ acc_name + "_genomes VALUES (?,?)", ins)

				ins = []
				for k in kmers_by_gen:
					dat = kmers_by_gen[k]
					dat = np.sort(np.array(dat, dtype = np.int32))
					ins.append((k, dat.tobytes()))

				q_curs.executemany("INSERT INTO "+ acc_name + " VALUES (?,?)", ins)

				ins = None

				query_db_conn.commit()

				q_curs.execute("CREATE INDEX IF NOT EXISTS " + acc_name + "_index ON " + acc_name + " (kmer)")

				if self.verbose:
					tracker.update()

			q_curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")
			q_curs.execute("DROP INDEX store_idx")
			q_curs.execute("DROP TABLE storage")
			query_db_conn.commit()
			q_curs.execute("VACUUM")
			query_db_conn.commit()
			q_curs.close()
			query_db_conn.close()
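	#Design note: the query database is built in two passes. While queries run,
	#raw (genome, accession, kmers) rows accumulate in the flat "storage" table;
	#the loop above then regroups them into the same per-accession table pair
	#(kmer -> genomes and genome -> kmers) that build_db() produces, before
	#dropping "storage" and compacting the file with VACUUM.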
	#Actually run the thing.
	def run(self):
		self.open()
		self.execute()
		#Clean up the db connections; free the mem.
		self.clean_up()
def numpy_kaai_to_aai(kaai_array):
	#aai_hat = (-0.3087057 + 1.810741 * (np.exp(-(-0.2607023 * np.log(kaai))**(1/3.435))))*100

	#Protect the original jaccard averages memory item
	aai_hat_array = kaai_array.copy()

	non_zero = np.where(aai_hat_array > 0)
	is_zero = np.where(aai_hat_array <= 0)

	#I broke this down into its original components
	#Avoid zeroes in log - still actually works, but it produces warnings I don't want to see.
	aai_hat_array[non_zero] = np.log(aai_hat_array[non_zero])

	aai_hat_array = np.multiply(np.subtract(np.multiply(np.exp(np.negative(np.power(np.multiply(aai_hat_array, -0.2607023), (1/3.435)))), 1.810741), 0.3087057), 100)
	'''
	Same as the above, broken down into easier-to-follow steps.
	aai_hat_array = np.multiply(aai_hat_array, -0.2607023)
	aai_hat_array = np.power(aai_hat_array, (1/3.435))
	aai_hat_array = np.negative(aai_hat_array)
	aai_hat_array = np.exp(aai_hat_array)
	aai_hat_array = np.multiply(aai_hat_array, 1.810741)
	aai_hat_array = np.subtract(aai_hat_array, 0.3087057)
	aai_hat_array = np.multiply(aai_hat_array, 100)
	'''

	#<30 and >90 values
	smol = np.where(aai_hat_array < 30)
	big = np.where(aai_hat_array > 90)

	aai_hat_array = np.round(aai_hat_array, 2)

	#Convert to final printables
	aai_hat_array = aai_hat_array.astype(str)
	aai_hat_array[smol] = "<30%"
	aai_hat_array[big] = ">90%"
	#The math of the above ends up with zero values being big, so we fix those.
	aai_hat_array[is_zero] = "<30%"

	return aai_hat_array
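#Worked example of the transform above: for a mean Jaccard of 0.5,
#np.log(0.5) is about -0.693, so the estimate is
#(-0.3087057 + 1.810741 * exp(-(-0.2607023 * -0.693)**(1/3.435))) * 100,
#roughly 67.7% AAI. Estimates outside the model's reliable range are then
#masked to "<30%" or ">90%" rather than reported as point values.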
#Also includes a multiply by 100 and type conversion compared to original - this is some silliness for saving memory.
def numpy_kaai_to_aai_just_nums(kaai_array, as_float = False):
	#aai_hat = (-0.3087057 + 1.810741 * (np.exp(-(-0.2607023 * np.log(kaai))**(1/3.435))))*100

	#Protect the original jaccard averages memory item
	aai_hat_array = kaai_array.copy()

	non_zero = np.where(aai_hat_array > 0)
	is_zero = np.where(aai_hat_array <= 0)

	#I broke this down into its original components
	#Avoid zeroes in log - still actually works, but it produces warnings I don't want to see.
	aai_hat_array[non_zero] = np.log(aai_hat_array[non_zero])

	aai_hat_array = np.multiply(np.subtract(np.multiply(np.exp(np.negative(np.power(np.multiply(aai_hat_array, -0.2607023), (1/3.435)))), 1.810741), 0.3087057), 100)
	'''
	Same as the above, broken down into easier-to-follow steps.
	aai_hat_array = np.multiply(aai_hat_array, -0.2607023)
	aai_hat_array = np.power(aai_hat_array, (1/3.435))
	aai_hat_array = np.negative(aai_hat_array)
	aai_hat_array = np.exp(aai_hat_array)
	aai_hat_array = np.multiply(aai_hat_array, 1.810741)
	aai_hat_array = np.subtract(aai_hat_array, 0.3087057)
	aai_hat_array = np.multiply(aai_hat_array, 100)
	'''

	aai_hat_array = np.round(aai_hat_array, 2)

	#<30 and >90 values
	smol = np.where(aai_hat_array < 30)
	big = np.where(aai_hat_array > 90)

	#We can find these later.
	aai_hat_array[smol] = 15
	aai_hat_array[big] = 95

	if as_float:
		aai_hat_array = np.round(aai_hat_array, 2)
	else:
		aai_hat_array = np.multiply(aai_hat_array, 100)
		aai_hat_array = np.round(aai_hat_array, 2)
		aai_hat_array = aai_hat_array.astype(np.int16)

	return aai_hat_array

def curtime():
	time_format = "%d/%m/%Y %H:%M:%S"
	timer = datetime.datetime.now()
	time = timer.strftime(time_format)
	return time
#Perform a minimal-memory query of a target database from input files. A lighter-weight option for low-memory systems.
def sql_query_opts():
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
	description='''
	This FastAAI module takes one or many genomes, proteins, or proteins and HMMs as a QUERY and searches them against an existing FastAAI database TARGET using SQL.
	If you only have a few genomes - or not enough RAM to hold the entire target database in memory - this is probably the best option for you.

	To provide files, supply either a directory containing only one type of file (e.g. only genomes in FASTA format), a file containing paths to files of a type, 1 per line,
	or a comma-separated list of files of a single type (no spaces).

	If you provide FastAAI with genomes or only proteins (not proteins and HMMs), this FastAAI module will produce the required protein and HMM files as needed
	and place them in the output directory, just like it does while building a database.

	Once these inputs are ready to be queried against the database (each has both a protein and HMM file), they will be processed independently, 1 per thread at a time.

	Note: Protein and HMM files generated during this query can be supplied to build a FastAAI database from proteins and HMMs using the build_db module, without redoing preprocessing.
	''')

	parser.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'Genomes in FASTA format.')
	parser.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'Protein amino acids in FASTA format.')
	parser.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'HMM search files produced by FastAAI on a set of proteins.')

	parser.add_argument('--target', dest = 'target', default = None, help = 'A path to the FastAAI database you wish to use as the target')

	parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory where FastAAI will place the result of this query and any protein or HMM files it has to generate. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')
	parser.add_argument('--output_style', dest = "style", default = 'tsv', help = "Either 'tsv' or 'matrix'. Matrix produces a simplified output of only AAI estimates.")
	parser.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indices. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')

	parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

	parser.add_argument('--in_memory', dest = "in_mem", action = 'store_true', help = 'Load the target database into memory before querying. Consumes more RAM, but is faster and reduces file I/O substantially.')

	parser.add_argument('--create_query_db', dest = "make_db", action = 'store_true', help = 'Create a query database from the genomes.')
	parser.add_argument('--query_db_name', dest = "qdb_name", default = "Query_FastAAI_db.db", help = 'Name the query database. This file must not already exist.')

	parser.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')

	args, unknown = parser.parse_known_args()

	return parser, args
def sql_query_thread_starter(kmer_cts, protein_cts):
	global target_kmer_cts
	global target_protein_counts
	target_kmer_cts = kmer_cts
	target_protein_counts = protein_cts
#Function borrowed from FastAAI 2.0.
class fastaai_file_importer:
	def __init__(self, genomes = None, proteins = None, hmms = None, crystals = None,
	output = "FastAAI", compress = False, crystalize = False):
		#genomes, prots, hmms can be supplied as either a directory, a file with paths 1/line, or comma-sep paths. Type is determined automatically.
		self.genomes = genomes
		self.proteins = proteins
		self.hmms = hmms
		self.crystals = crystals

		self.genome_list = None
		self.protein_list = None
		self.hmm_list = None
		self.crystal_list = None

		self.crystalize = crystalize

		#file base names.
		self.identifiers = None

		self.error = False

		self.in_files = None

		self.status = "genome"
		self.output = output

		self.do_comp = compress
	def retrieve_files(self, arg):
		done = False
		files = []
		names = []
		#Case where a directory is supplied.
		if os.path.isdir(arg):
			for file in sorted(os.listdir(arg)):
				#Retrieve file name
				if file.endswith(".gz"):
					name = os.path.splitext(os.path.basename(file[:-3]))[0]
				else:
					name = os.path.splitext(os.path.basename(file))[0]

				names.append(name)
				files.append(os.path.abspath(os.path.normpath(arg + '/' +file)))

			done = True

		#Case where a file containing paths is supplied.
		if os.path.isfile(arg):
			handle = agnostic_reader(arg)
			for line in handle:
				file = line.strip()
				if os.path.exists(file):
					if file.endswith(".gz"):
						name = os.path.splitext(os.path.basename(file[:-3]))[0]
					else:
						name = os.path.splitext(os.path.basename(file))[0]

					names.append(name)
					files.append(os.path.abspath(os.path.normpath(file)))

			handle.close()
			done = True

			if len(names) == 0 and len(files) == 0:
				#Try interpreting the file as a singular path.
				done = False

		#Last check.
		if not done:
			for file in arg.split(","):
				if os.path.exists(file):
					if file.endswith(".gz"):
						name = os.path.splitext(os.path.basename(file[:-3]))[0]
					else:
						name = os.path.splitext(os.path.basename(file))[0]

					names.append(name)
					files.append(os.path.abspath(os.path.normpath(file)))

		return files, names
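	#Input resolution order, as implemented above: (1) a directory, from which
	#every file is taken; (2) a file listing one path per line; (3) a
	#comma-separated list of paths with no spaces. For example, "genomes/",
	#"paths.txt", and "a.fna.gz,b.fna" are all acceptable arguments, and ".gz"
	#suffixes are stripped when deriving basename identifiers.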
	#Check if g/p/h
	def determine_inputs(self):
		if self.genomes is not None:
			self.genome_list, self.identifiers = self.retrieve_files(self.genomes)
			if self.proteins is not None or self.hmms is not None:
				print("You can supply genomes or proteins or proteins and HMMs, but not genomes and anything else.")
				self.error = True

		#Proteins, but no HMMs
		if self.proteins is not None and self.hmms is None:
			self.protein_list, self.identifiers = self.retrieve_files(self.proteins)

		if self.proteins is not None and self.hmms is not None:
			self.protein_list, prot_names = self.retrieve_files(self.proteins)
			self.hmm_list, hmm_names = self.retrieve_files(self.hmms)

			if len(self.protein_list) != len(self.hmm_list):
				print("Different number of proteins and HMMs supplied. You must supply the same number of each, and they must be matched pairs.")
				self.error = True
			else:
				all_same = True
				for p, h in zip(prot_names, hmm_names):
					if p != h:
						all_same = False

				if all_same:
					self.identifiers = prot_names
					prot_names = None
					hmm_names = None
				else:
					self.error = True

		if self.crystals is not None:
			self.crystal_list, self.identifiers = self.retrieve_files(self.crystals)
			#The crystal naming scheme includes an identifier at the end. This removes it.
			self.identifiers = [id[:-13] for id in self.identifiers]

		if not self.error:
			self.prep_input_files()
	def prep_input_files(self):
		self.in_files = []
		if self.genome_list is not None:
			self.status = "genome"
			for g in self.genome_list:
				f = input_file(g, output = self.output, do_compress = self.do_comp, make_crystal = self.crystalize)
				f.set_genome(g)
				self.in_files.append(f)

		if self.protein_list is not None:
			self.status = "protein"
			for p in self.protein_list:
				f = input_file(p, output = self.output, do_compress = self.do_comp, make_crystal = self.crystalize)
				f.set_protein(p)
				self.in_files.append(f)

		if self.hmm_list is not None:
			self.status = "protein+HMM"
			for h, f in zip(self.hmm_list, self.in_files):
				f.set_hmm(h)
def sql_query(genomes, proteins, hmms, db_name, output, threads, verbose, do_stdev, style, in_mem, make_db, qdb_name, do_comp):

	if not os.path.exists(db_name):
		print("")
		print("FastAAI can't find your database:", db_name)
		print("Are you sure that the path you've given to the database is correct and that the database exists?")
		print("FastAAI exiting.")
		print("")
		sys.exit()

	#importer opts
	#genomes = None, proteins = None, hmms = None, crystals = None
	imported_files = fastaai_file_importer(genomes = genomes, proteins = proteins, hmms = hmms, output = output)
	imported_files.determine_inputs()

	if imported_files.error:
		print("Exiting FastAAI due to input file error.")
		quit()

	good_to_go = prepare_directories(output, imported_files.status, "query")

	if not good_to_go:
		print("Exiting FastAAI")
		sys.exit()

	print("")

	'''
	self, in_memory = False, input_file_objects = None,
	target = None, threads = 1, do_sd = False, output_base = "FastAAI", output_style = "tsv",
	build_db_from_queries = True, qdb_name = "Query_FastAAI_database.db", hmm_path = "00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm",
	do_comp = True, verbose = True
	'''
	hmm_path = find_hmm()

	mdb = file_vs_db_query(in_memory = in_mem, input_file_objects = imported_files.in_files, target=db_name,
	threads = threads, output_base = output, do_sd = do_stdev, output_style = style, do_comp = do_comp,
	build_db_from_queries = make_db, qdb_name = qdb_name, verbose = verbose, hmm_path = hmm_path)

	mdb.run()

	#Here's where the querying db comes in

	print("FastAAI query complete! Results at:", os.path.normpath(output + "/results"))
	return None
#Manages the query process.
def db_query_opts():
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
	description='''
	This FastAAI module takes two FastAAI databases and searches all of the genomes in the QUERY against all of the genomes in the TARGET.

	If you have many genomes (more than 1000), it will be faster to create the query database using FastAAI build_db,
	then search it against an existing target using this module than it is to do the same thing with an SQL query.

	If you give the same database as query and target, a special all vs. all search of the genomes in the database will be done.
	''')
	parser.add_argument('-q', '--query', dest = 'query', default = None, help = 'Path to the query database. The genomes FROM the query will be searched against the genomes in the target database')
	parser.add_argument('-t', '--target', dest = 'target', default = None, help = 'Path to the target database.')

	parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory where FastAAI will place the result of this query. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')
	parser.add_argument('--output_style', dest = "style", default = 'tsv', help = "Either 'tsv' or 'matrix'. Matrix produces a simplified output of only AAI estimates.")
	parser.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indices. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')

	parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
	parser.add_argument('--in_memory', dest = "in_mem", action = 'store_true', help = 'Load both databases into memory before querying. Consumes more RAM, but is faster and reduces file I/O substantially. Consider reducing number of threads')
	parser.add_argument('--store_results', dest = "storage", action = 'store_true', help = 'Keep partial results in memory. Only works with --in_memory. Fewer writes, but more RAM. Default off.')

	args, unknown = parser.parse_known_args()

	return parser, args

#db-db query; in-mem
def parse_db_init(query, target, outpath):
	global qdb
	qdb = query
	global tdb
	tdb = target
	global output_path
	output_path = outpath

	global query_gak
	global target_gak

	return qdb, tdb, output_path
def parse_accession(acc):
	tmp = sqlite3.connect(":memory:")
	curs = tmp.cursor()

	curs.execute("attach '" + qdb + "' as queries")
	curs.execute("attach '" + tdb + "' as targets")

	sql = '''
	SELECT queries.{acc}.genomes, targets.{acc}.genomes
	FROM queries.{acc} INNER JOIN targets.{acc}
	ON queries.{acc}.kmer=targets.{acc}.kmer
	'''.format(acc = acc)

	res = curs.execute(sql).fetchall()

	curs.execute("detach queries")
	curs.execute("detach targets")

	curs.close()
	tmp.close()

	tl = []
	ql = {}

	acc_id = generate_accessions_index()
	acc_id = acc_id[acc]

	indexer = 0
	for r in res:
		queries = np.frombuffer(r[0], dtype = np.int32)
		tgt = np.frombuffer(r[1], dtype = np.int32)
		tl.append(tgt)

		for q in queries:
			if q not in ql:
				ql[q] = {}
			if acc_id not in ql[q]:
				ql[q][acc_id] = []

			ql[q][acc_id].append(indexer)

		indexer += 1

	tl = np.array(tl, dtype = object)

	for q in ql:
		if acc_id in ql[q]:
			ql[q][acc_id] = np.array(ql[q][acc_id], dtype=np.int32)

	out_file = os.path.normpath(output_path+"/"+acc+".pickle")

	with open(out_file, "wb") as out:
		pickle.dump([ql, tl], out)

	return([acc, out_file])
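#Shape of the pickled payload: tl is an object array whose i-th entry holds the
#int32 target genome IDs sharing the i-th joined k-mer row, and ql maps each
#query genome ID to {accession ID: int32 indices into tl}. one_work() later
#concatenates the tl rows selected by ql and bincounts them to get per-target
#intersection sizes.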
#all of this is exclusive to the in-mem approach for db db query
|
2590
|
+
def one_init(ql, tl, num_tgt, qgak_queue, tgak, tpres, sd, sty, output_dir, store_results, progress_queue, qnames, tnames, temp_dir):
|
2591
|
+
global _ql
|
2592
|
+
_ql = ql
|
2593
|
+
global _tl
|
2594
|
+
_tl = tl
|
2595
|
+
global _nt
|
2596
|
+
_nt = num_tgt
|
2597
|
+
|
2598
|
+
qgak_data = qgak_queue.get()
|
2599
|
+
|
2600
|
+
global out_base
|
2601
|
+
out_base = output_dir
|
2602
|
+
|
2603
|
+
global group_id
|
2604
|
+
group_id = os.path.normpath(temp_dir + "/partial_results_group_" + str(qgak_data[0])+ ".txt")
|
2605
|
+
|
2606
|
+
global _qgak
|
2607
|
+
_qgak = qgak_data[1]
|
2608
|
+
|
2609
|
+
global query_grouping
|
2610
|
+
query_grouping = qgak_data[2]
|
2611
|
+
|
2612
|
+
qgak_data = None
|
2613
|
+
|
2614
|
+
global _tgak
|
2615
|
+
_tgak = tgak
|
2616
|
+
|
2617
|
+
global _tpres
|
2618
|
+
_tpres = tpres
|
2619
|
+
|
2620
|
+
global _tct
|
2621
|
+
_tct = np.sum(_tpres, axis = 0)
|
2622
|
+
|
2623
|
+
global do_sd
|
2624
|
+
do_sd = sd
|
2625
|
+
global style
|
2626
|
+
style = sty
|
2627
|
+
#Suppress div by zero warning - it's handled.
|
2628
|
+
np.seterr(divide='ignore')
|
2629
|
+
|
2630
|
+
global store
|
2631
|
+
store = store_results
|
2632
|
+
if store:
|
2633
|
+
global holder
|
2634
|
+
holder = []
|
2635
|
+
else:
|
2636
|
+
global outwriter
|
2637
|
+
outwriter = open(group_id, "w")
|
2638
|
+
|
2639
|
+
global prog_queue
|
2640
|
+
prog_queue = progress_queue
|
2641
|
+
|
2642
|
+
global _qnames
|
2643
|
+
_qnames = qnames
|
2644
|
+
|
2645
|
+
global _tnames
|
2646
|
+
_tnames = tnames
|
2647
|
+
|
2648
def one_work(placeholder):
	for q in query_grouping:
		results = []
		#We also need to count the accs in the query genome, but which are not part of the inner join.
		for acc in _qgak[q][0]:
			if acc in _ql[q]:
				#the bincount is intersections.
				these_intersections = np.bincount(np.concatenate(_tl[acc][_ql[q][acc]]), minlength = _nt)
			else:
				#there are no intersections even though this accession is shared with at least one target
				#number of intersects is all zeros
				these_intersections = np.zeros(_nt, dtype = np.int32)

			#Append the counts or zeros, either way.
			results.append(these_intersections)

		results = np.vstack(results)

		target_kmer_counts = _tgak[_qgak[q][0], :]

		#unions = size(A) + size(B) - size(intersections(A, B))
		#unions = target_kmer_counts + query_kmers_by_acc - intersections
		unions = np.subtract(np.add(target_kmer_counts, _qgak[q][1][:, None]), results)

		#These are now jaccards, not #intersections
		results = np.divide(results, unions)

		shared_acc_counts = np.sum(_tpres[_qgak[q][0], :], axis = 0)

		no_hit = np.where(shared_acc_counts == 0)

		jaccard_averages = np.divide(np.sum(results, axis = 0), shared_acc_counts)

		#Skip SD if output is matrix
		if style == "tsv":
			aai_ests = numpy_kaai_to_aai(jaccard_averages)

			if do_sd:
				#find diffs from means; this includes indices corresponding to unshared SCPs that should not be included.
				results = results - jaccard_averages

				#fix those corresponding indices to not contribute to the final SD.
				results[np.logical_not(_tpres[_qgak[q][0], :])] = 0
				#results[np.nonzero(has_accs == 0)] = 0

				#Square them; 0^2 = 0, so we don't have to think about the fixed indices any more.
				results = np.square(results)
				#Sum squares and divide by shared acc. count, then sqrt to get SD.
				jaccard_SDs = np.sqrt(np.divide(np.sum(results, axis = 0), shared_acc_counts))
				jaccard_SDs = np.round(jaccard_SDs, 4).astype(str)

			no_hit = np.where(shared_acc_counts == 0)

			#_qgak[q][0].shape[0] is the query acc count
			possible_hits = np.minimum(_qgak[q][0].shape[0], _tct).astype(str)

			jaccard_averages = np.round(jaccard_averages, 4).astype(str)
			shared_acc_counts = shared_acc_counts.astype(str)

			jaccard_averages[no_hit] = "N/A"
			aai_ests[no_hit] = "N/A"
			shared_acc_counts[no_hit] = "N/A"
			possible_hits[no_hit] = "N/A"

			qname = _qnames[q]

			output_name = os.path.normpath(out_base + "/results/"+qname+"_results.txt")

			out = open(output_name, "w")
			out.write("query\ttarget\tavg_jacc_sim\tjacc_SD\tnum_shared_SCPs\tposs_shared_SCPs\tAAI_estimate\n")
			if do_sd:
				jaccard_SDs[no_hit] = "N/A"
				for i in range(0, len(aai_ests)):
					out.write(qname+"\t"+_tnames[i]+"\t"+jaccard_averages[i]+"\t"+jaccard_SDs[i]+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
			else:
				for i in range(0, len(aai_ests)):
					out.write(qname+"\t"+_tnames[i]+"\t"+jaccard_averages[i]+"\t"+"N/A"+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
			out.close()

		else:
			if store:
				aai_ests = numpy_kaai_to_aai_just_nums(jaccard_averages, as_float = False)
				#add zeros at misses/NAs
				aai_ests[no_hit] = 0
				holder.append(aai_ests)
			else:
				aai_ests = numpy_kaai_to_aai_just_nums(jaccard_averages, as_float = True)
				aai_ests[no_hit] = 0
				print(*aai_ests, sep = "\t", file = outwriter)

		prog_queue.put(q)

	prog_queue.put("done")

	return None

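#Worked example of the counts-to-Jaccard step above (illustrative numbers only):
#if a query's copy of an SCP has 40 kmers, a target's copy has 50 kmers, and
#np.bincount found 30 shared kmers, then
#	union = 50 + 40 - 30 = 60
#	jaccard = 30 / 60 = 0.5
#Targets that share no SCPs with the query end up with shared_acc_counts of 0
#and are reported as "N/A" (tsv) or 0 (matrix).
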
def two_work(i):
	if store:
		hold_together = np.vstack(holder)
		np.savetxt(group_id, hold_together, delimiter = "\t", fmt='%4d')
	else:
		outwriter.close()

	return group_id

def on_disk_init(query_database_path, target_database_path, num_tgt, query_queue, target_gak, tpres, sd, sty, output_dir, progress_queue, qnames, tnames, valids, temp_dir):
	global database
	database = sqlite3.connect(":memory:")

	curs = database.cursor()
	curs.execute("attach '" + query_database_path + "' as queries")
	curs.execute("attach '" + target_database_path + "' as targets")
	curs.close()

	global _nt
	_nt = num_tgt

	qgak_data = query_queue.get()

	global out_base
	out_base = output_dir

	global group_id
	group_id = os.path.normpath(temp_dir + "/partial_results_group_" + str(qgak_data[0]) + ".txt")

	global _qgak
	_qgak = qgak_data[1]

	global query_grouping
	query_grouping = qgak_data[2]

	global _tgak
	_tgak = target_gak

	global _tpres
	_tpres = tpres

	global _tct
	_tct = np.sum(_tpres, axis = 0)

	global do_sd
	do_sd = sd
	global style
	style = sty
	#Suppress div by zero warning - it's handled.
	np.seterr(divide='ignore')

	if style == "matrix":
		global outwriter
		outwriter = open(group_id, "w")

	global prog_queue
	prog_queue = progress_queue

	global _qnames
	_qnames = qnames

	global _tnames
	_tnames = tnames

	global acc_indexer
	acc_indexer = generate_accessions_index(forward = False)

	global _valids
	_valids = valids

def on_disk_work_one(placeholder):
	curs = database.cursor()
	for q in query_grouping:
		results = []
		qname = _qnames[q]
		for acc in _qgak[q][0]:
			acc_name = acc_indexer[acc]

			if acc_name in _valids:

				one = curs.execute("SELECT kmers FROM queries."+acc_name+"_genomes WHERE genome=?", (str(q),)).fetchone()[0]
				one = np.frombuffer(one, dtype = np.int32)

				#SQLite allows only 999 bound parameters per statement by default,
				#so large kmer sets go through a temp table join instead of an IN clause.
				if one.shape[0] > 998:
					#Each kmer needs to be a tuple.
					these_kmers = [(int(kmer),) for kmer in one]

					temp_name = "_" + qname + "_" + acc_name
					temp_name = temp_name.replace(".", "_")

					curs.execute("CREATE TEMP TABLE " + temp_name + " (kmer INTEGER)")
					insert_table = "INSERT INTO " + temp_name + " VALUES (?)"
					curs.executemany(insert_table, these_kmers)

					join_and_select_sql = "SELECT genomes FROM " + temp_name + " INNER JOIN targets." + acc_name + " ON " + temp_name + ".kmer = targets." + acc_name + ".kmer;"

					matches = curs.execute(join_and_select_sql).fetchall()
				else:
					#kmers must be a list, not a tuple.
					these_kmers = [int(kmer) for kmer in one]
					select = "SELECT genomes FROM targets." + acc_name + " WHERE kmer IN ({kmers})".format(kmers=','.join(['?']*len(these_kmers)))
					matches = curs.execute(select, these_kmers).fetchall()

				genome_blobs = []
				for row in matches:
					genome_blobs.append(row[0])
				genome_blobs = b''.join(genome_blobs)

				matches = None
				these_intersections = np.bincount(np.frombuffer(genome_blobs, dtype = np.int32), minlength = _nt)
				genome_blobs = None
				results.append(these_intersections)

			else:
				results.append(np.zeros(_nt, dtype=np.int32))

		results = np.vstack(results)

		target_kmer_counts = _tgak[_qgak[q][0], :]

		#unions = size(A) + size(B) - size(intersections(A, B))
		#unions = target_kmer_counts + query_kmers_by_acc - intersections
		unions = np.subtract(np.add(target_kmer_counts, _qgak[q][1][:, None]), results)

		#These are now jaccards, not #intersections
		results = np.divide(results, unions)

		shared_acc_counts = np.sum(_tpres[_qgak[q][0], :], axis = 0)

		no_hit = np.where(shared_acc_counts == 0)

		jaccard_averages = np.divide(np.sum(results, axis = 0), shared_acc_counts)

		#Skip SD if output is matrix
		if style == "tsv":
			aai_ests = numpy_kaai_to_aai(jaccard_averages)

			if do_sd:
				#find diffs from means; this includes indices corresponding to unshared SCPs that should not be included.
				results = results - jaccard_averages

				#fix those corresponding indices to not contribute to the final SD.
				results[np.logical_not(_tpres[_qgak[q][0], :])] = 0
				#results[np.nonzero(has_accs == 0)] = 0

				#Square them; 0^2 = 0, so we don't have to think about the fixed indices any more.
				results = np.square(results)
				#Sum squares and divide by shared acc. count, then sqrt to get SD.
				jaccard_SDs = np.sqrt(np.divide(np.sum(results, axis = 0), shared_acc_counts))
				jaccard_SDs = np.round(jaccard_SDs, 4).astype(str)

			no_hit = np.where(shared_acc_counts == 0)

			#_qgak[q][0].shape[0] is the query acc count
			possible_hits = np.minimum(_qgak[q][0].shape[0], _tct).astype(str)

			jaccard_averages = np.round(jaccard_averages, 4).astype(str)
			shared_acc_counts = shared_acc_counts.astype(str)

			jaccard_averages[no_hit] = "N/A"
			aai_ests[no_hit] = "N/A"
			shared_acc_counts[no_hit] = "N/A"
			possible_hits[no_hit] = "N/A"

			output_name = os.path.normpath(out_base + "/results/"+qname+"_results.txt")

			out = open(output_name, "w")
			out.write("query\ttarget\tavg_jacc_sim\tjacc_SD\tnum_shared_SCPs\tposs_shared_SCPs\tAAI_estimate\n")
			if do_sd:
				jaccard_SDs[no_hit] = "N/A"
				for i in range(0, len(aai_ests)):
					out.write(qname+"\t"+_tnames[i]+"\t"+jaccard_averages[i]+"\t"+jaccard_SDs[i]+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
			else:
				for i in range(0, len(aai_ests)):
					out.write(qname+"\t"+_tnames[i]+"\t"+jaccard_averages[i]+"\t"+"N/A"+"\t"+shared_acc_counts[i]+"\t"+possible_hits[i]+"\t"+aai_ests[i]+"\n")
			out.close()

		else:
			aai_ests = numpy_kaai_to_aai_just_nums(jaccard_averages, as_float = True)
			aai_ests[no_hit] = 0
			print(*aai_ests, sep = "\t", file = outwriter)

		prog_queue.put(q)

	curs.close()
	prog_queue.put("done")

def on_disk_work_two(i):
	outwriter.close()
	return group_id

def sorted_nicely(l):
	convert = lambda text: int(text) if text.isdigit() else text
	alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
	return sorted(l, key = alphanum_key)

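#For example (illustrative file names), sorted_nicely ensures that
#	["partial_results_group_10.txt", "partial_results_group_2.txt"]
#sorts group_2 before group_10 rather than lexicographically, which keeps the
#per-thread partial matrices in query order when they are concatenated below.
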
class db_db_remake:
	def __init__(self, in_memory = False, store_mat_res = False,
		query = None, target = None, threads = 1, do_sd = False,
		output_base = "FastAAI", output_style = "tsv", verbose = True):

		#databases to eat
		self.q = query
		self.t = target

		#metadata
		self.ok = generate_accessions_index(forward = True)
		self.rev = generate_accessions_index(forward = False)
		self.valids = None

		#Originally written as an in-memory-only block of code, but a single if/else makes it work on disk too, so it needs no rework.
		self.as_mem_db = in_memory
		self.store_mat = store_mat_res

		#in-mem stuff
		self.conn = None
		self.curs = None

		self.threads = threads
		self.do_sd = do_sd

		self.output_base = output_base
		self.output = os.path.normpath(output_base + "/results")
		self.style = output_style

		self.query_names = None
		self.target_names = None

		self.num_queries = None
		self.num_targets = None

		self.query_gak = None
		self.target_gak = None
		self.target_presence = None

		self.query_dict = None
		self.target_dict = None

		self.verbose = verbose

	#getting the db metadata happens the same way in every case
	def open(self):
		if self.verbose:
			print("Perusing database metadata")

		self.conn = sqlite3.connect(":memory:")
		self.curs = self.conn.cursor()

		self.curs.execute("attach '" + self.q + "' as queries")
		self.curs.execute("attach '" + self.t + "' as targets")

		#Find the shared accessions for these databases
		shared_accs_sql = '''
		SELECT queries.sqlite_master.name
		FROM queries.sqlite_master INNER JOIN targets.sqlite_master
		ON queries.sqlite_master.name = targets.sqlite_master.name
		'''
		self.valids = {}
		for table in self.curs.execute(shared_accs_sql).fetchall():
			table = table[0]
			#Filter to tables named for a recognized SCP accession.
			if table in self.ok:
				self.valids[table] = self.ok[table]

		self.query_names = []
		for r in self.curs.execute("SELECT genome FROM queries.genome_index ORDER BY gen_id").fetchall():
			self.query_names.append(r[0])

		self.target_names = []
		for r in self.curs.execute("SELECT genome FROM targets.genome_index ORDER BY gen_id").fetchall():
			self.target_names.append(r[0])

		self.num_queries = len(self.query_names)
		self.num_targets = len(self.target_names)

		gak_sql = '''
		SELECT * FROM {db}.genome_acc_kmer_counts
		WHERE accession in ({accs})
		ORDER BY genome
		'''

		acc_ids = list(self.valids.values())
		acc_ids.sort()
		acc_ids = tuple(acc_ids)

		#query genome-acc-kmers (gak) is ordered by genome first, then accession
		self.query_gak = {}
		#for result in self.curs.execute(gak_sql.format(db = "queries", accs=','.join(['?']*len(self.valids))), acc_ids).fetchall():
		for result in self.curs.execute("SELECT * FROM queries.genome_acc_kmer_counts ORDER BY genome").fetchall():
			genome, accession, kmer_ct = result[0], result[1], result[2]
			if genome not in self.query_gak:
				self.query_gak[genome] = [[],[]]
			self.query_gak[genome][0].append(accession)
			self.query_gak[genome][1].append(kmer_ct)

		#refigure into numpy arrays for quicker array access later.
		for genome in self.query_gak:
			self.query_gak[genome] = (np.array(self.query_gak[genome][0], dtype = np.int32), np.array(self.query_gak[genome][1], dtype = np.int32))

		#Split these into ordered groups - this makes joining results at the end easier.
		qgak_queue = multiprocessing.Queue()
		groupings = split_seq_indices(np.arange(self.num_queries), self.threads)
		group_id = 0
		for group in groupings:
			next_set = {}
			for i in range(group[0], group[1]):
				next_set[i] = self.query_gak[i]
				self.query_gak[i] = None
			#this ensures that the selection of qgak and the query index range match
			qgak_queue.put((group_id, next_set, np.arange(group[0], group[1]),))
			group_id += 1

		self.query_gak = qgak_queue
		qgak_queue = None

		#tgt gak is organized by accession first, then genome
		self.target_gak = np.zeros(shape = (122, self.num_targets), dtype = np.int32)
		for result in self.curs.execute(gak_sql.format(db = "targets", accs=','.join(['?']*len(self.valids))), acc_ids).fetchall():
			genome, accession, kmer_ct = result[0], result[1], result[2]
			self.target_gak[accession, genome] += kmer_ct

		self.target_presence = self.target_gak > 0
		self.target_presence = self.target_presence.astype(bool)

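	#Shapes, for orientation (drawn from the code above; sizes illustrative):
	#	self.query_gak ends up as a queue of (group_id, {query_idx: (acc_ids, kmer_counts)}, index_range) tuples
	#	self.target_gak is a 122 x num_targets int32 matrix, one row per possible SCP accession
	#	self.target_presence is the boolean mask target_gak > 0, later used to count shared SCPs
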
	#This needs to have a TSV write method
	def load_in_mem(self):
		#tempdir_path = os.path.normpath(self.output_base+"/temp")
		tempdir_path = tempfile.mkdtemp()
		#if not os.path.exists(tempdir_path):
		#	os.mkdir(tempdir_path)

		ql = {}
		tl = {}
		for t in self.valids.values():
			tl[t] = None
		for i in range(0, self.num_queries):
			ql[i] = {}

		if self.verbose:
			tracker = progress_tracker(total = len(self.valids), message = "Loading data in memory.")
		else:
			print("\nLoading data in memory.")

		pool = multiprocessing.Pool(self.threads, initializer = parse_db_init,
			initargs = (self.q, #query
				self.t, #target
				tempdir_path,)) #outpath

		for result in pool.imap_unordered(parse_accession, self.valids.keys()):
			this_accession = result[0]

			this_acc_id = self.ok[this_accession]

			with open(result[1], "rb") as inp:
				this_acc_data = pickle.load(inp)
			os.remove(result[1])

			tl[this_acc_id] = this_acc_data[1]

			for q in this_acc_data[0]:
				#We know that this acc must be in every ql for this loaded data.
				ql[q][this_acc_id] = this_acc_data[0][q][this_acc_id]
			if self.verbose:
				tracker.update()

		pool.close()

		if self.verbose:
			tracker = progress_tracker(total = self.num_queries, message = "Calculating AAI")
		else:
			print("\nCalculating AAI.")

		query_groups = []
		for grouping in split_seq_indices(np.arange(self.num_queries), self.threads):
			query_groups.append(np.arange(grouping[0], grouping[1]))

		result_queue = multiprocessing.Queue()
		remaining_procs = self.threads
		still_going = True

		pool = multiprocessing.Pool(self.threads, initializer = one_init,
			initargs = (ql, #ql
				tl, #tl
				self.num_targets, #num_tgt
				self.query_gak, #qgak_queue
				self.target_gak, #tgak
				self.target_presence, #tpres
				self.do_sd, #sd
				self.style, #sty
				self.output_base, #output_dir
				self.store_mat, #store_results
				result_queue, #progress_queue
				self.query_names, #qnames
				self.target_names, #tnames
				tempdir_path,)) #temp_dir

		some_results = pool.imap(one_work, query_groups)

		while still_going:
			item = result_queue.get()
			if item == "done":
				remaining_procs -= 1
				if remaining_procs == 0:
					still_going = False
			else:
				if self.verbose:
					tracker.update()
				else:
					pass

		if self.style == "matrix":
			result_files = []

			for result in pool.map(two_work, range(0, self.threads)):
				result_files.append(result)

			pool.close()

			self.write_mat_from_files(result_files, tempdir_path)
		else:
			pool.close()

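	#Progress protocol used above: each worker puts one item on result_queue per
	#finished query and a final "done" sentinel when its groups are exhausted;
	#the parent loops until it has collected one "done" per worker process.
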
	#This needs to be implemented from existing code.
	def db_on_disk(self):
		tempdir_path = tempfile.mkdtemp()
		if self.style == "matrix":
			self.store_mat = False

		result_queue = multiprocessing.Queue()
		remaining_procs = self.threads
		still_going = True

		if self.verbose:
			tracker = progress_tracker(total = self.num_queries, message = "Calculating AAI")
		else:
			print("\nCalculating AAI")

		query_groups = []
		for grouping in split_seq_indices(np.arange(self.num_queries), self.threads):
			query_groups.append(np.arange(grouping[0], grouping[1]))

		#query_database_path, target_database_path, num_tgt, query_queue, target_gak, tpres, sd,
		#sty, output_dir, progress_queue, qnames, tnames, valids, temp_dir
		pool = multiprocessing.Pool(self.threads, initializer = on_disk_init,
			initargs = (self.q, #query_database_path
				self.t, #target_database_path
				self.num_targets, #num_tgt
				self.query_gak, #query_queue
				self.target_gak, #target_gak
				self.target_presence, #tpres
				self.do_sd, #sd
				self.style, #sty
				self.output_base, #output_dir
				result_queue, #progress_queue
				self.query_names, #qnames
				self.target_names, #tnames
				self.valids, #valids
				tempdir_path,)) #temp_dir

		some_results = pool.imap(on_disk_work_one, query_groups)

		while still_going:
			item = result_queue.get()
			if item == "done":
				remaining_procs -= 1
				if remaining_procs == 0:
					still_going = False
			else:
				if self.verbose:
					tracker.update()
				else:
					pass

		if self.style == "matrix":
			result_files = []
			for result in pool.map(on_disk_work_two, range(0, self.threads)):
				result_files.append(result)

		pool.close()

		if self.style == "matrix":
			self.write_mat_from_files(result_files, tempdir_path)

	def write_mat_from_files(self, result_files, tempdir_path):
		#tempdir_path = os.path.normpath(self.output_base+"/temp")

		result_files = sorted_nicely(result_files)

		#print("Combining:")
		#for f in result_files:
		#	print(f)

		if self.verbose:
			tracker = progress_tracker(total = self.threads, step_size = 2, message = "Finalizing results.")
		else:
			print("\nFinalizing results.")

		output_file = os.path.normpath(self.output+"/FastAAI_matrix.txt")
		final_outwriter = open(output_file, "w")
		print("query_genome\t"+'\t'.join(self.target_names), file = final_outwriter)

		row = 0

		for f in result_files:
			fh = open(f, "r")
			cur = fh.readlines()
			fh.close()

			for i in range(0, len(cur)):
				if self.store_mat:
					#Add the decimals - we don't need to do this if we've been writing line-wise.
					#values will ALWAYS be 4 digits in this method, so groups of 2 dec. works.
					cur[i] = re.sub(r"(\d{2})(\d{2})", "\\1.\\2", cur[i])
				#Add in the query name to the row
				cur[i] = self.query_names[row]+"\t"+cur[i]
				row += 1

			final_outwriter.write(''.join(cur))
			cur = None

			try:
				os.remove(f)
			except:
				pass

			if self.verbose:
				tracker.update()

		final_outwriter.close()

		try:
			if len(os.listdir(tempdir_path)) == 0:
				shutil.rmtree(tempdir_path)
		except:
			pass

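	#Example of the decimal restoration above: the stored-matrix path writes AAI
	#estimates as 4-digit integers (e.g. "8012"), and the regex re-inserts the
	#decimal point between the digit pairs to yield "80.12" in the final matrix.
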
	def close(self):
		self.curs.close()
		self.curs = None

	def clean_up(self):
		self.conn.close()
		self.conn = None

	def run(self):
		self.open()

		#work
		if self.as_mem_db:
			self.load_in_mem()
		else:
			self.db_on_disk()

		self.close()
		self.clean_up()

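#A minimal usage sketch of the class above (paths are hypothetical):
#	handler = db_db_remake(in_memory = True, query = "queries.db", target = "targets.db",
#		threads = 4, output_base = "FastAAI", output_style = "tsv")
#	handler.run()
#run() reads both databases' metadata, dispatches the in-memory or on-disk path,
#and closes its connections when the query completes. db_query() below performs
#the sanity checks and output directory setup before constructing the class.
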
#Control the query process for any DB-first query.
def db_query(query, target, verbose, output, threads, do_stdev, style, in_mem, store_results):
	print("")

	#Sanity checks.
	if target is None:
		print("You need to supply a database for --target")
		sys.exit()

	#Sanity checks.
	if query is None:
		print("You need to supply a database for --query")
		sys.exit()

	#Sanity checks.
	if not os.path.exists(target):
		print("Target database not found. Exiting FastAAI")
		sys.exit()

	if not os.path.exists(query):
		print("Query database not found. Exiting FastAAI")
		sys.exit()

	#status = "exists"
	query_ok = assess_db(query)
	target_ok = assess_db(target)

	if query_ok != "exists":
		print("Query database improperly formatted. Exiting FastAAI")
		sys.exit()

	if target_ok != "exists":
		print("Target database improperly formatted. Exiting FastAAI")
		sys.exit()

	#Check if the database is querying against itself.
	if target is None or query is None:
		print("I require both a query and a target database. FastAAI exiting.")
		sys.exit()

	if query == target:
		print("Performing an all vs. all query on", query)
		#all_vs_all = True
	else:
		print("Querying", query, "against", target)
		#all_vs_all = False

	#Ready the output directories as needed.
	#The databases are already created; "protein and HMM" is the only state they can be in.
	good_to_go = prepare_directories(output, "protein and HMM", "query")
	if not good_to_go:
		print("Exiting FastAAI")
		sys.exit()

	#todo
	mdb = db_db_remake(in_memory = in_mem, store_mat_res = store_results, query = query, target = target, threads = threads, do_sd = do_stdev, output_base = output, output_style = style, verbose = verbose)
	mdb.run()

	print("")

#Check to see if the file exists and is a valid FastAAI db
def assess_db(path):
	status = None
	if os.path.exists(path):
		conn = sqlite3.connect(path)
		curs = conn.cursor()
		try:
			sql = "SELECT name FROM sqlite_master WHERE type='table'"

			curs.row_factory = lambda cursor, row: row[0]
			tables = curs.execute(sql).fetchall()
			curs.row_factory = None

			curs.close()
			conn.close()

			if len(tables) > 2 and "genome_index" in tables and "genome_acc_kmer_counts" in tables:
				status = "exists"
			else:
				status = "wrong format"

		except:
			status = "wrong format"

	else:
		try:
			conn = sqlite3.connect(path)
			conn.close()
			status = "created"
		except:
			status = "unable to create"

	return status

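#assess_db return values, as relied on by the callers below (paths hypothetical):
#	assess_db("existing_fastaai.db")   -> "exists" (has the FastAAI index tables)
#	assess_db("some_other.sqlite")     -> "wrong format" (readable, but not FastAAI)
#	assess_db("brand_new.db")          -> "created" (path was absent; an empty db is made)
#	assess_db("/no/such/dir/x.db")     -> "unable to create"
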
#Add one FastAAI DB to another FastAAI DB
def merge_db_opts():
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''
	This FastAAI module allows you to add the contents of one or more FastAAI databases to another.
	You must have already created at least two FastAAI databases with the build_db module before this module can be used.

	Supply a comma-separated list of at least one donor database and a single recipient database.
	If the recipient already exists, then genomes in all the donors will be added to the recipient.
	If the recipient does not already exist, a new database will be created, and the contents of all the donors will be added to it.

	Example:
	FastAAI.py merge_db --donors databases/db1.db,databases/db2.db --recipient databases/db3.db --threads 3
	This command will create a new database called "db3.db", merge the data in db1.db and db2.db, and then add the merged data into db3.db

	Only the recipient database will be modified; the donors will be left exactly as they were before running this module.
	''')

	parser.add_argument('-d', '--donors', dest = 'donors', default = None, help = 'Comma-separated string of paths to one or more donor databases. The genomes FROM the donors will be added TO the recipient and the donors will be unaltered')
	parser.add_argument('--donor_file', dest = 'donor_file', default = None, help = 'File containing paths to one or more donor databases, one per line. Use EITHER this or --donors')

	parser.add_argument('-r', '--recipient', dest = 'recipient', default = None, help = 'Path to the recipient database. Any genomes FROM the donor database not already in the recipient will be added to this database.')

	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

	parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')

	args, unknown = parser.parse_known_args()

	return parser, args

def merge_db_init(indexer, table_record, donor_dbs, tempdir):
	global mgi
	mgi = indexer
	global accs_per_db
	accs_per_db = table_record
	global tdb_list
	tdb_list = donor_dbs
	global work_space
	work_space = tempdir

def acc_transformer_merge(acc_name_genomes):
	acc_name = acc_name_genomes.split("_genomes")[0]
	my_acc_db = os.path.normpath(work_space + "/"+acc_name+".db")
	if os.path.exists(my_acc_db):
		os.remove(my_acc_db)

	my_db = sqlite3.connect(my_acc_db)
	curs = my_db.cursor()
	curs.execute("CREATE TABLE {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=acc_name))
	curs.execute("CREATE TABLE {acc} (genome INTEGER PRIMARY KEY, kmers array)".format(acc=acc_name_genomes))
	my_db.commit()

	reformat = {}
	for d in tdb_list:
		simple_rows = []
		#do nothing if the acc is not in the donor.
		if acc_name_genomes in accs_per_db[d]:
			donor_conn = sqlite3.connect(d)
			dcurs = donor_conn.cursor()
			data = dcurs.execute("SELECT * FROM {acc}".format(acc=acc_name_genomes)).fetchall()
			dcurs.close()
			donor_conn.close()

			for row in data:
				genome, kmers = row[0], row[1]
				new_index = mgi[d][genome]
				#-1 is the value indicating an already-seen genome that should not be added.
				if new_index > -1:
					simple_rows.append((new_index, kmers,))
					kmers = np.frombuffer(kmers, dtype=np.int32)
					for k in kmers:
						if k not in reformat:
							reformat[k] = []
						reformat[k].append(new_index)

			if len(simple_rows) > 0:
				curs.executemany("INSERT INTO {acc} VALUES (?,?)".format(acc=acc_name_genomes), simple_rows)
				my_db.commit()

			simple_rows = None
			data = None

	to_add = []
	for k in reformat:
		as_bytes = np.array(reformat[k], dtype = np.int32)
		as_bytes = as_bytes.tobytes()
		reformat[k] = None
		to_add.append((int(k), as_bytes,))

	curs.executemany("INSERT INTO {acc} VALUES (?, ?)".format(acc = acc_name), to_add)

	my_db.commit()

	to_add = None

	curs.execute("CREATE INDEX {acc}_index ON {acc} (kmer)".format(acc=acc_name))
	my_db.commit()

	curs.close()
	my_db.close()

	return [my_acc_db, acc_name]

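#The per-accession worker databases produced above are folded into the recipient
#with an upsert that appends to the genomes blob. A minimal sketch of that
#pattern (table name and values hypothetical):
#	INSERT INTO SCP_X VALUES (?, ?)
#		ON CONFLICT(kmer) DO UPDATE SET genomes=genomes || (?)
#Because genome lists are stored as packed int32 bytes, SQLite's || blob
#concatenation appends the new genome ids to the existing array in place.
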
def merge_db(recipient, donors, donor_file, verbose, threads):
	#Prettier on the CLI
	if (donors is None and donor_file is None) or recipient is None:
		print("Either donor or target not given. FastAAI is exiting.")
		return None

	print("")

	if donors is not None:
		donors = donors.split(",")

	if donor_file is not None:
		try:
			donors = []
			fh = agnostic_reader(donor_file)
			for line in fh:
				line = line.strip()
				donors.append(line)
			fh.close()
		except:
			sys.exit("Could not parse your donor file.")

	valid_donors = []
	for d in donors:
		if os.path.exists(d):
			if d == recipient:
				print("Donor database", d, "is the same as the recipient. This database will be skipped.")
			else:
				check = assess_db(d)
				if check == "exists":
					if d not in valid_donors:
						valid_donors.append(d)
					else:
						print("It appears that database", d, "was already added to the list of donors. Did you type it twice in the list of donors? Skipping it.")
				else:
					if check == "created":
						print("Donor database", d, "not found! Skipping.")
					else:
						print("Something was wrong with supplied database:", d+". A status check found:", check)
		else:
			print("Donor database", d, "not found! Are you sure the path is correct and this donor exists? This database will be skipped.")

	if len(valid_donors) == 0:
		print("None of the supplied donor databases could be accessed. FastAAI cannot continue if none of these databases are valid. Exiting.")
		sys.exit()

	recip_check = assess_db(recipient)

	if recip_check == "created" or recip_check == "exists":
		print("Donor databases:")
		for donor in valid_donors:
			print("\t", donor)
		print("Will be added to recipient database:", recipient)
	else:
		print("I couldn't find or create the recipient database at", recipient+".", "Does the folder you're trying to place this database in exist, and do you have permission to write files to it? FastAAI exiting.")
		sys.exit()

	if recipient is None or len(valid_donors) == 0:
		print("I require both a valid donor and a recipient database. FastAAI exiting.")
		sys.exit()

	gen_counter = 0
	multi_gen_ids = {}
	all_gens = {}

	#Load recipient data, if any.
	if recip_check == "exists":
		conn = sqlite3.connect(recipient)
		curs = conn.cursor()
		data = curs.execute("SELECT genome, gen_id FROM genome_index").fetchall()
		tabs = curs.execute("SELECT name FROM sqlite_master").fetchall()
		curs.close()
		conn.close()

		multi_gen_ids[recipient] = {}
		for row in data:
			genome, index = row[0], row[1]
			all_gens[genome] = 0
			multi_gen_ids[recipient][genome] = index

		gen_counter = max(list(multi_gen_ids[recipient].values())) + 1

	genome_index_to_add = []
	gak_to_add = []
	tables = {}
	#Donors should always exist, never be created.
	for d in valid_donors:
		#load
		conn = sqlite3.connect(d)
		curs = conn.cursor()
		data = curs.execute("SELECT * FROM genome_index").fetchall()
		tabs = curs.execute("SELECT name FROM sqlite_master").fetchall()
		gak = curs.execute("SELECT * FROM genome_acc_kmer_counts").fetchall()
		curs.close()
		conn.close()
		multi_gen_ids[d] = {}
		for row in data:
			genome, index, prot_ct = row[0], row[1], row[2]
			if genome not in all_gens:
				all_gens[genome] = 0
				#We need to be able to map the donor's old genome index to the new one.
				multi_gen_ids[d][index] = gen_counter
				genome_index_to_add.append((genome, gen_counter, prot_ct,))
				gen_counter += 1
			else:
				#This is a remove condition for later.
				multi_gen_ids[d][index] = -1
		data = None

		for row in gak:
			genome_id, acc_id, kmer_ct = row[0], row[1], row[2]
			new_index = multi_gen_ids[d][genome_id]
			if new_index > -1:
				gak_to_add.append((new_index, acc_id, kmer_ct,))

		tables[d] = []
		for tab in tabs:
			tab = tab[0]
			if tab.endswith("_genomes"):
				tables[d].append(tab)
		tables[d] = set(tables[d])

	all_tabs = set()
	for t in tables:
		all_tabs = all_tabs.union(tables[t])

	all_tabs = list(all_tabs)


	temp_dir = tempfile.mkdtemp()
	try:
		if verbose:
			tracker = progress_tracker(len(all_tabs), message = "Formatting data to add to database")
		else:
			print("Formatting data to add to database")

		conn = sqlite3.connect(recipient)
		curs = conn.cursor()

		#indexer, table_record, donor_dbs, tempdir
		pool = multiprocessing.Pool(threads, initializer=merge_db_init, initargs = (multi_gen_ids, tables, valid_donors, temp_dir,))

		for result in pool.imap_unordered(acc_transformer_merge, all_tabs):
			db, accession = result[0], result[1]
			curs.execute("CREATE TABLE IF NOT EXISTS {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=accession))
			curs.execute("CREATE TABLE IF NOT EXISTS {acc}_genomes (genome INTEGER PRIMARY KEY, kmers array)".format(acc=accession))
			curs.execute("CREATE INDEX IF NOT EXISTS {acc}_index ON {acc}(kmer)".format(acc=accession))
			conn.commit()

			curs.execute("attach '" + db + "' as acc")
			conn.commit()

			#Get the genomes from worker db.
			curs.execute("INSERT INTO {acc}_genomes SELECT * FROM acc.{acc}_genomes".format(acc=accession))
			to_update = curs.execute("SELECT kmer, genomes, genomes FROM acc.{acc}".format(acc=accession)).fetchall()
			update_concat_sql = "INSERT INTO {acc} VALUES (?,?) ON CONFLICT(kmer) DO UPDATE SET genomes=genomes || (?)".format(acc=accession)
			curs.executemany(update_concat_sql, to_update)
			conn.commit()

			curs.execute("detach acc")
			conn.commit()

			os.remove(db)

			if verbose:
				tracker.update()

		pool.close()
		pool.join()

		curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
		curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")

		curs.executemany("INSERT INTO genome_index VALUES (?,?,?)", genome_index_to_add)
		curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?,?,?)", gak_to_add)

		curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")

		conn.commit()

	except:
		curs.close()
		conn.close()
		#Error
		shutil.rmtree(temp_dir)
		if recip_check == "created":
			print("Removing created database after failure.")
			os.remove(recipient)
	try:
		curs.close()
		conn.close()
		#Success
		shutil.rmtree(temp_dir)
	except:
		pass

	print("\nDatabases merged!")

	return None

#Query 1 genome vs. 1 target using Carlos' method - just needs query, target, threads
def single_query_opts():
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''
	This FastAAI module takes a single query genome, protein, or protein and HMM pair and a single target genome, protein, or protein and HMM pair as inputs and calculates AAI between the two.

	If you supply a genome as either query or target, a protein and HMM file will be made for the genome.
	If you supply a protein as either query or target, an HMM file will be made for it.
	If you supply both an HMM and protein, the search will start right away. You cannot provide only an HMM.

	No database will be built, and you cannot query multiple genomes with this module.

	If you wish to query multiple genomes against themselves in an all vs. all AAI search, use aai_index instead.
	If you wish to query multiple genomes against multiple targets, use multi_query instead.
	''')
	parser.add_argument('-qg', '--query_genome', dest = 'query_genome', default = None, help = 'Query genome')
	parser.add_argument('-tg', '--target_genome', dest = 'target_genome', default = None, help = 'Target genome')

	parser.add_argument('-qp', '--query_protein', dest = 'query_protein', default = None, help = 'Query protein')
	parser.add_argument('-tp', '--target_protein', dest = 'target_protein', default = None, help = 'Target protein')

	parser.add_argument('-qh', '--query_hmm', dest = 'query_hmm', default = None, help = 'Query HMM')
	parser.add_argument('-th', '--target_hmm', dest = 'target_hmm', default = None, help = 'Target HMM')

	parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory where FastAAI will place the result of this query. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

	parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
	parser.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')

	args, unknown = parser.parse_known_args()

	return parser, args

def kaai_to_aai(kaai):
	# Transform the kAAI into estimated AAI values
	aai_hat = (-0.3087057 + 1.810741 * (np.exp(-(-0.2607023 * np.log(kaai))**(1/3.435))))*100

	return aai_hat

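#As a rough worked example of the transform above (value computed here for
#illustration only): a mean Jaccard similarity of 0.5 maps to an AAI estimate
#of approximately 67.7%. single_query below reports estimates above 90% as
#">90%" and below 30% as "<30%" rather than as exact values.
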
#This one's unique. It doesn't do anything with the DB, which means it doesn't access any other functionality outside of the input_file class. It just advances a pair of inputs in parallel and does intersections.
def single_query(qf, tf, output, verbose, threads, do_compress):

	if qf.identifiers[0] == tf.identifiers[0]:
		print("You've selected the same query and target genome. The AAI is 100%.")
		print("FastAAI exiting.")
		return None

	statuses = ["genome", "protein", "protein and hmm"]
	query_stat = statuses.index(qf.status)
	target_stat = statuses.index(tf.status)
	minimum_status = statuses[min(query_stat, target_stat)]

	start_printouts = ["[Genome] Protein Protein+HMM", " Genome [Protein] Protein+HMM", "Genome Protein [Protein+HMM]"]

	print("")
	print("Query start: ", start_printouts[query_stat])
	print("Target start:", start_printouts[target_stat])
	print("")

	qname = qf.identifiers[0]
	tname = tf.identifiers[0]

	name = os.path.normpath(output + "/results/" + qname + "_vs_" + tname + ".aai.txt")
	print("Output will be located at", name)

	advance_me = [qf.in_files[0], tf.in_files[0]]
	#All we need to do this.
	hmm_file = find_hmm()
	pool = multiprocessing.Pool(min(threads, 2), initializer = hmm_preproc_initializer, initargs = (hmm_file, do_compress,))

	results = pool.map(run_build, advance_me)

	pool.close()
	pool.join()

	query = results[0]
	target = results[1]

	print(query.partial_timings())
	print(target.partial_timings())

	#One of the printouts
	max_poss_prots = max(len(query.best_hits_kmers), len(target.best_hits_kmers))

	accs_to_view = set(query.best_hits_kmers.keys()).intersection(set(target.best_hits_kmers.keys()))

	results = []
	for acc in accs_to_view:
		intersect = np.intersect1d(query.best_hits_kmers[acc], target.best_hits_kmers[acc])
		intersect = intersect.shape[0]
		union = query.best_hits_kmers[acc].shape[0] + target.best_hits_kmers[acc].shape[0] - intersect
		jacc = intersect/union
		results.append(jacc)

	results = np.array(results, dtype = np.float_)

	jacc_mean = np.mean(results)
	jacc_std = np.std(results)
	actual_prots = len(results)
	poss_prots = max(len(query.best_hits_kmers), len(target.best_hits_kmers))
	aai_est = round(kaai_to_aai(jacc_mean), 2)

	if aai_est > 90:
		aai_est = ">90%"
	else:
		if aai_est < 30:
			aai_est = "<30%"

	output = open(name, "w")

	print("query\ttarget\tavg_jacc_sim\tjacc_SD\tnum_shared_SCPs\tposs_shared_SCPs\tAAI_estimate", file = output)
	print(qname, tname, round(jacc_mean, 4), round(jacc_std, 4), actual_prots, poss_prots, aai_est, sep = "\t", file = output)

	output.close()

	print("query\ttarget\tavg_jacc_sim\tjacc_SD\tnum_shared_SCPs\tposs_shared_SCPs\tAAI_estimate")
	print(qname, tname, round(jacc_mean, 4), round(jacc_std, 4), actual_prots, poss_prots, aai_est, sep = "\t")


	print("FastAAI single query done! Estimated AAI:", aai_est)

def miga_merge_opts():
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''
	Hello, Miguel.

	Give one genome in nt, aa, or aa+hmm format and a database to create or add to.
	It'll add the genome as efficiently as possible.

	The normal merge command creates parallel processes and gathers data in
	one-SCP databases to add to the main DB. Great for many genomes. A lot of extra
	work for just one.

	This version skips the creation of subordinate DBs and just directly adds the genome.
	Faster, fewer writes, no parallel overhead.
	''')

	parser.add_argument('--genome', dest = 'gen', default = None, help = 'Path to one genome, FASTA format')
	parser.add_argument('--protein', dest = 'prot', default = None, help = 'Path to one protein, AA FASTA format')
	parser.add_argument('--hmm', dest = 'hmm', default = None, help = 'Path to one HMM file as predicted by FastAAI')

	parser.add_argument('--output', dest = 'output', default = "FastAAI", help = 'Place the partial output files into a directory with this base. Default "FastAAI"')
	parser.add_argument('--target', dest = 'database', default = None, help = 'Path to the target database. The genome supplied will be added to this. The DB will be created if needed.')

	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
	parser.add_argument('--compress', dest = 'compress', action='store_true', help = 'Compress generated file output')

	args, unknown = parser.parse_known_args()

	return parser, args

def miga_merge(infile, target_db, verbose, do_compress):
	status = assess_db(target_db)
	if status == "wrong format":
		print("The database", target_db, "exists, but appears not to be a FastAAI database.")
		print("FastAAI will not alter this file. Quitting.")
		return None

	if status == "unable to create":
		print("The database", target_db, "could not be created.")
		print("Are you sure that the path you gave is valid? Quitting.")
		return None

	if verbose:
		print("Processing genome")

	next_id = 0
	exist_gens = {}
	conn = sqlite3.connect(target_db)
	curs = conn.cursor()
	if status == 'exists':
		for row in curs.execute("SELECT * FROM genome_index ORDER BY gen_id").fetchall():
			genome, gid, prot_ct = row[0], row[1], row[2]
			exist_gens[genome] = gid
			next_id += 1

	if infile.basename in exist_gens:
		print("It looks like the file you're trying to add already exists in the database.")
		print("Adding it is too likely to corrupt the database. Quitting.")
		return None

	hmm_file = find_hmm()
	global hmm_manager

	hmm_manager = pyhmmer_manager(do_compress)
	hmm_manager.load_hmm_from_file(hmm_file)

	infile.preprocess()

	if len(infile.best_hits_kmers) > 0:

		ok = generate_accessions_index()
		gak_to_add = []

		gen_id = np.zeros(1, dtype = np.int32)
		gen_id[0] = next_id
		gen_id = gen_id.tobytes()

		for accession in infile.best_hits_kmers:
			acc_id = ok[accession]
			gak_to_add.append((next_id, acc_id, infile.best_hits_kmers[accession].shape[0],))

			curs.execute("CREATE TABLE IF NOT EXISTS {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=accession))
			curs.execute("CREATE TABLE IF NOT EXISTS {acc}_genomes (genome INTEGER PRIMARY KEY, kmers array)".format(acc=accession))
			curs.execute("CREATE INDEX IF NOT EXISTS {acc}_index ON {acc}(kmer)".format(acc=accession))

			gen_first = (next_id, infile.best_hits_kmers[accession].tobytes(),)
			curs.execute("INSERT INTO {acc}_genomes VALUES (?,?)".format(acc=accession), gen_first)

			kmers_first = []
			for k in infile.best_hits_kmers[accession]:
				#we know there's only one genome in these cases.
				kmers_first.append((int(k), gen_id, gen_id, ))

			update_concat_sql = "INSERT INTO {acc} VALUES (?,?) ON CONFLICT(kmer) DO UPDATE SET genomes=genomes || (?)".format(acc=accession)

			curs.executemany(update_concat_sql, kmers_first)

		#Safety checks.
		curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
		curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")

		gen_idx_to_add = (infile.basename, next_id, len(infile.best_hits_kmers))
		curs.execute("INSERT INTO genome_index VALUES (?, ?, ?)", gen_idx_to_add)
		#gak was made over the loops.
		curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?,?,?)", gak_to_add)
		curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")

		conn.commit()

	else:
		print("No proteins to add for this genome:", infile.basename, "Database will be unaltered. Exiting.")

	curs.close()
	conn.close()

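#Note on the gen_id bytes trick above: the new genome's id is packed once as a
#4-byte int32 buffer, so each per-kmer upsert can either insert it as a fresh
#genomes blob or append it to an existing one via SQLite's || concatenation.
#For instance (illustrative), id 7 becomes b'\x07\x00\x00\x00' on a
#little-endian machine.
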
def miga_dirs(output, subdir):
	preparation_successful = True

	if not os.path.exists(output):
		try:
			os.mkdir(output)
		except:
			print("")
			print("FastAAI tried to make output directory: '" + output + "' but failed.")
			print("")
			print("Troubleshooting:")
			print("")
			print("    (1) Do you have permission to create directories in the location you specified?")
			print("    (2) Did you make sure that all directories other than", os.path.basename(output), "already exist?")
			print("")
			preparation_successful = False

	if preparation_successful:
		try:
			if not os.path.exists(os.path.normpath(output + "/" + subdir)):
				os.mkdir(os.path.normpath(output + "/" + subdir))
		except:
			print("FastAAI was able to create or find", output, "but couldn't make directories there.")
			print("")
			print("This shouldn't happen. Do you have permission to write to that directory?")


	return preparation_successful

def miga_preproc_opts():
	parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
		description='''Build module intended for use by MiGA.

	Performs protein prediction, HMM searching, and best hit identification, but does NOT
	build a database. Produces instead "crystals," which are tab-sep files containing protein,
	HMM accession, and original protein sequences for the best hits. These crystals can be passed
	to the "miga_db_from_crystals" action later on to rapidly create a DB from many genomes.
	''')

	parser.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'A directory containing genomes in FASTA format.')
	parser.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'A directory containing protein amino acids in FASTA format.')
	parser.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'A directory containing the results of an HMM search on a set of proteins.')

	parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

	parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
	parser.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins, HMMs. Off by default.')

	args, unknown = parser.parse_known_args()

	return parser, args

|
3996
|
+
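# Illustrative sketch (not part of FastAAI): a "crystal" is a plain tab-separated
# text file with one best-hit protein per line, in the column order consumed by
# para_crystals_to_dbs() further below: protein name, HMM accession, protein
# sequence. The names and accessions here are hypothetical:
#
#   Xanthomonas_albilineans_000001<TAB>PF00119_20<TAB>MKVLST...
#   Xanthomonas_albilineans_000042<TAB>PF00164_25<TAB>MSLNVE...
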
def run_miga_preproc(input_file):
    input_file.crystalize = True
    input_file.preprocess()
    if len(input_file.best_hits_kmers) < 1:
        input_file.best_hits_kmers = None
        input_file.err_log += " This file did not successfully complete. No SCPs could be found."

    return input_file

#Produce FastAAI preprocessed files containing HMM accession and associated protein sequence
def miga_preproc(genomes, proteins, hmms, output, threads, verbose, do_compress):
    success = True

    imported_files = fastaai_file_importer(genomes = genomes, proteins = proteins, hmms = hmms, output = output, compress = do_compress)
    imported_files.determine_inputs()

    if imported_files.error:
        print("Exiting FastAAI due to input file error.")
        quit()

    #file make checks
    p, h, c, l = True, True, True, True

    if imported_files.status == "genome":
        p = miga_dirs(output, "predicted_proteins")
        h = miga_dirs(output, "hmms")
        c = miga_dirs(output, "crystals")

    if imported_files.status == "protein":
        h = miga_dirs(output, "hmms")
        c = miga_dirs(output, "crystals")

    if imported_files.status == "protein+HMM":
        c = miga_dirs(output, "crystals")

    #We always want this one.
    l = miga_dirs(output, "logs")

    print("")

    #Check that all directory creations were successful.
    success = p and h and c and l

    if success:
        hmm_file = find_hmm()

        if verbose:
            tracker = progress_tracker(total = len(imported_files.in_files), message = "Processing inputs")
        else:
            print("Processing inputs")

        #Only build_db makes a log.

        logger = open(os.path.normpath(output+"/logs/"+"FastAAI_preprocessing_log.txt"), "a")
        print("file", "start_date", "end_date", "starting_format",
            "prot_prediction_time", "trans_table", "hmm_search_time", "besthits_time",
            "errors", sep = "\t", file = logger)

        fail_log = open(os.path.normpath(output+"/logs/"+"FastAAI_genome_failures.txt"), "a")

        pool = multiprocessing.Pool(threads, initializer = hmm_preproc_initializer, initargs = (hmm_file, do_compress,))

        for result in pool.imap(run_miga_preproc, imported_files.in_files):
            #log data, regardless of kind
            print(result.basename, result.start_time, result.end_time, result.initial_state,
                result.prot_pred_time, result.trans_table, result.hmm_search_time, result.besthits_time,
                result.err_log, sep = "\t", file = logger)

            #run_miga_preproc sets best_hits_kmers to None when no SCPs were found.
            if result.best_hits_kmers is None or len(result.best_hits_kmers) < 1:
                print(result.basename, file = fail_log)

            if verbose:
                tracker.update()

        pool.close()
        logger.close()
        fail_log.close()

        print("FastAAI preprocessing complete!")

    return success

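# Illustrative sketch (not part of FastAAI): a typical miga_preproc invocation,
# assuming the fastaai entry point is on PATH and genomes/ holds FASTA files
# (paths are hypothetical):
#
#   fastaai miga_preproc --genomes genomes/ --output FastAAI_preproc --threads 4 --verbose
#
# Crystals land in FastAAI_preproc/crystals/, and per-genome failures are listed
# in FastAAI_preproc/logs/FastAAI_genome_failures.txt.
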
def miga_db_from_crystals_opts():
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
        description='''Takes a set of crystals produced with miga_preproc and makes a database from them.

Supply --crystals with a directory, file of paths, or list of paths, just like --genomes in a build command.''')

    parser.add_argument('-c', '--crystals', dest = 'crystals', default = None, help = 'A directory of crystal files produced by miga_preproc (or a file listing crystal paths, or a comma-separated list of paths).')
    parser.add_argument('-d', '--database', dest = 'db_name', default = "FastAAI_database.sqlite.db", help = 'The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory. FastAAI_database.sqlite.db by default.')

    parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

    parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
    parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
    args, unknown = parser.parse_known_args()

    return parser, args

#This is basically a copied function, but I'm going to ignore that for now.
def unique_kmer_miga(seq):
    #num tetramers = len(seq) - 4 + 1, so just use len(seq) - 3.
    n_kmers = len(seq) - 3

    #Convert the characters in a sequence into their ASCII int values
    as_ints = np.array([ord(i) for i in seq], dtype = np.int32)

    #Create indices like 0,1,2,3; 1,2,3,4; 2,3,4,5... one run of four per tetramer
    kmers = np.arange(4*n_kmers)
    kmers = kmers % 4 + kmers // 4

    #Select the characters (as ints) corresponding to each tetramer all at once
    #and reshape into rows of 4, each row corresponding to a successive tetramer
    kmers = as_ints[kmers].reshape((n_kmers, 4))

    #Given four 2-digit numbers, these multipliers work as offsets so that all digits are preserved in order when summed
    mult = np.array([1000000, 10000, 100, 1], dtype = np.int32)

    #The multipliers shift each successive character of the tetramer left by 2 decimal digits;
    #practically, this is concatenation of the four ASCII codes into one integer.
    #The matrix product computes this for all tetramers at once.
    return np.unique(np.dot(kmers, mult))

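# Illustrative sketch (not part of FastAAI): how unique_kmer_miga() packs a
# tetramer into a single integer. For "MKVL", ord() gives [77, 75, 86, 76];
# the dot product with [1000000, 10000, 100, 1] concatenates the ASCII codes:
#
#   77*1000000 + 75*10000 + 86*100 + 76*1 = 77758676   # digits read 77|75|86|76
#
# So unique_kmer_miga("MKVL") returns array([77758676]), and a longer protein
# yields one such integer per distinct overlapping tetramer. This works because
# every uppercase amino acid letter has a 2-digit ASCII code (65-90).
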
def para_crystal_init(tdb_queue):
    global tdb
    global td_name
    tdb = tdb_queue.get()
    td_name = tdb
    tdb = initialize_blank_db(tdb)
    global ok
    ok = generate_accessions_index()

def initialize_blank_db(path):
    sqlite3.register_converter("array", convert_array)
    worker = sqlite3.connect(path)
    wcurs = worker.cursor()
    wcurs.execute("CREATE TABLE genome_index (genome text, gen_id integer, protein_count integer)")
    wcurs.execute("CREATE TABLE genome_acc_kmer_counts (genome integer, accession integer, count integer)")
    ok = generate_accessions_index()
    for t in ok:
        wcurs.execute("CREATE TABLE " + t + "_genomes (genome INTEGER PRIMARY KEY, kmers array)")
        wcurs.execute("CREATE TABLE " + t + " (kmer INTEGER PRIMARY KEY, genomes array)")

    worker.commit()
    wcurs.close()
    return worker

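# Illustrative sketch (not part of FastAAI): the para_crystal_init/Queue pattern
# above hands each multiprocessing worker its own private SQLite file, so workers
# never contend for write locks. A minimal, generic version of the same idea
# (names and file paths are hypothetical):
#
#   import multiprocessing
#   import sqlite3
#
#   def _init(path_queue):
#       global conn                      # one connection per worker process
#       conn = sqlite3.connect(path_queue.get())
#       conn.execute("CREATE TABLE IF NOT EXISTS t (item INTEGER)")
#
#   def _work(item):
#       conn.execute("INSERT INTO t VALUES (?)", (item,))
#       conn.commit()
#
#   if __name__ == "__main__":
#       q = multiprocessing.Queue()
#       for i in range(4):
#           q.put("worker_%d.db" % i)
#       pool = multiprocessing.Pool(4, initializer = _init, initargs = (q,))
#       pool.map(_work, range(100))
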
def para_crystals_to_dbs(args):
    path, name, num = args[0], args[1], args[2]
    my_gak = []
    num_prots = 0
    curs = tdb.cursor()
    fh = agnostic_reader(path)
    for line in fh:
        segs = line.strip().split("\t")
        #prot_name = segs[0]
        acc_name = segs[1]
        prot_seq = segs[2]
        acc_id = ok[acc_name]
        tetramers = unique_kmer_miga(prot_seq)
        my_gak.append((num, acc_id, tetramers.shape[0]))
        tetramers = tetramers.tobytes()
        curs.execute("INSERT INTO " + acc_name + "_genomes VALUES (?,?)", (num, tetramers,))
        num_prots += 1

    fh.close()

    curs.execute("INSERT INTO genome_index VALUES (?, ?, ?)", (name, num, num_prots,))
    curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?, ?, ?)", my_gak)

    tdb.commit()
    curs.close()

    return None

def group_by_kmer(placeholder):
    curs = tdb.cursor()
    surviving_tables = []
    for acc in ok:
        collected_data = curs.execute("SELECT * FROM {acc}_genomes".format(acc=acc)).fetchall()
        rearrange = {}
        if len(collected_data) > 0:
            surviving_tables.append(acc)
            for row in collected_data:
                genome, tetramers = row[0], np.frombuffer(row[1], dtype = np.int32)
                for t in tetramers:
                    if t not in rearrange:
                        rearrange[t] = [genome]
                    else:
                        rearrange[t].append(genome)

            to_add = []
            for tetra in rearrange:
                as_bytes = np.array(rearrange[tetra], dtype = np.int32).tobytes()
                rearrange[tetra] = None
                to_add.append((int(tetra), as_bytes,))

            curs.executemany("INSERT INTO {acc} VALUES (?, ?)".format(acc=acc), to_add)
            to_add = None
        else:
            #Empty table/no genomes contained the relevant SCP
            curs.execute("DROP TABLE {acc}".format(acc = acc))
            curs.execute("DROP TABLE {acc}_genomes".format(acc = acc))

    tdb.commit()

    curs.close()

    tdb.close()

    return [td_name, surviving_tables]

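# Illustrative sketch (not part of FastAAI): group_by_kmer() inverts the
# per-genome mapping ("which kmers does genome g have?") into the per-kmer
# mapping ("which genomes have kmer t?"), which is what querying needs.
# The core of that inversion, in plain Python:
#
#   genome_to_kmers = {0: [11, 42], 1: [42]}
#   kmer_to_genomes = {}
#   for g, kmers in genome_to_kmers.items():
#       for t in kmers:
#           kmer_to_genomes.setdefault(t, []).append(g)
#   # kmer_to_genomes == {11: [0], 42: [0, 1]}
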
#Merge one or many crystals into a DB.
def miga_db_from_crystals(crystals, output, db_name, threads, verbose):
    success = True

    imported_files = fastaai_file_importer(genomes = None, proteins = None,
        hmms = None, crystals = crystals, output = output, compress = False)
    imported_files.determine_inputs()

    if imported_files.error:
        print("Exiting FastAAI due to input file error.")
        quit()

    #Skip directory creation if the database file already exists.
    existing_genome_IDs = None
    final_db_path = None
    try:
        if os.path.exists(db_name):
            if os.path.isfile(db_name):
                final_db_path = db_name
            else:
                success = miga_dirs(output, "database")
                final_db_path = os.path.normpath(output + "/database/" + db_name)

        else:
            success = miga_dirs(output, "database")
            final_db_path = os.path.normpath(output + "/database/" + db_name)
    except:
        print("You specified an existing file to be a database, but it does not appear to be a FastAAI database.")
        print("FastAAI will not be able to continue. Please give FastAAI a different database name and try again.")
        print("Exiting.")
        success = False

    if final_db_path is not None and os.path.exists(final_db_path):
        if os.path.isfile(final_db_path):
            parent = sqlite3.connect(final_db_path)
            curs = parent.cursor()
            existing_genome_IDs = {}
            sql_command = "SELECT genome, gen_id FROM genome_index"
            for result in curs.execute(sql_command).fetchall():
                genome = result[0]
                gen_id = int(result[1])
                existing_genome_IDs[genome] = gen_id

            curs.close()
            parent.close()

    if success:
        if existing_genome_IDs:
            genome_idx = max(list(existing_genome_IDs.values())) + 1
        else:
            existing_genome_IDs = {}
            genome_idx = 0

        cryst_args = []
        for crystal_path, crystal_name in zip(imported_files.crystal_list, imported_files.identifiers):
            #the genome is implicitly dropped if it's already in the target
            if crystal_name not in existing_genome_IDs:
                existing_genome_IDs[crystal_name] = genome_idx
                cryst_args.append((crystal_path, crystal_name, genome_idx,))
                genome_idx += 1

        final_conn = sqlite3.connect(final_db_path)
        final_curs = final_conn.cursor()

        final_curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
        final_curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")

        final_curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")

        final_conn.commit()

        temp_dir = tempfile.mkdtemp()

        temp_db_queue = multiprocessing.Queue()
        for i in range(0, threads):
            tdb_name = os.path.normpath(temp_dir + "/temp_db_" + str(i) + ".db")
            temp_db_queue.put(tdb_name)

        placeholder = [i for i in range(0, threads)]

        pool = multiprocessing.Pool(threads, initializer = para_crystal_init, initargs = (temp_db_queue,))

        if verbose:
            tracker = progress_tracker(total = len(cryst_args), message = "Importing data")
        else:
            print("Importing data")

        for result in pool.imap_unordered(para_crystals_to_dbs, cryst_args):
            if verbose:
                tracker.update()

        if verbose:
            tracker = progress_tracker(total = threads, message = "Formatting data")
        else:
            print("Formatting data")

        for result in pool.imap_unordered(group_by_kmer, placeholder):
            dbname, surviving_tables = result[0], result[1]

            new_conn = sqlite3.connect(dbname)
            new_curs = new_conn.cursor()

            ngak = new_curs.execute("SELECT * FROM genome_acc_kmer_counts").fetchall()
            ngi = new_curs.execute("SELECT * FROM genome_index").fetchall()

            final_curs.executemany("INSERT INTO genome_index VALUES (?, ?, ?)", ngi)
            final_curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?, ?, ?)", ngak)

            final_conn.commit()

            ngak = None
            ngi = None

            for acc in surviving_tables:
                final_curs.execute("CREATE TABLE IF NOT EXISTS {acc}_genomes (genome INTEGER PRIMARY KEY, kmers array)".format(acc=acc))
                final_curs.execute("CREATE TABLE IF NOT EXISTS {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=acc))
                final_curs.execute("CREATE INDEX IF NOT EXISTS {acc}_index ON {acc}(kmer)".format(acc=acc))

                curag = new_curs.execute("SELECT * FROM {acc}_genomes".format(acc=acc)).fetchall()
                final_curs.executemany("INSERT INTO {acc}_genomes VALUES (?, ?)".format(acc=acc), curag)
                curag = None

                curaac = new_curs.execute("SELECT kmer, genomes, genomes FROM {acc}".format(acc=acc)).fetchall()
                update_concat_sql = "INSERT INTO {acc} VALUES (?,?) ON CONFLICT(kmer) DO UPDATE SET genomes=genomes || (?)".format(acc=acc)
                final_curs.executemany(update_concat_sql, curaac)
                curaac = None

                final_conn.commit()

            new_curs.close()
            new_conn.close()

            if verbose:
                tracker.update()

        pool.close()

        final_curs.close()
        final_conn.close()

        shutil.rmtree(temp_dir)

'''
Main
'''

#Preprocess genomes, build a DB, then query the DB against itself (all vs. all).
def aai_index_opts():
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
        description='''FastAAI module for preprocessing a set of genomes, proteins, or proteins+HMMs
into a database, and then querying the database against itself.

Equivalent to running build_db and db_query in sequence. Check those modules for additional
details on inputs.''')

    parser.add_argument('-o', '--output', dest = 'output', default = "FastAAI", help = 'The directory to place the database and any protein or HMM files FastAAI creates. By default, a directory named "FastAAI" will be created in the current working directory and results will be placed there.')

    parser.add_argument('-g', '--genomes', dest = 'genomes', default = None, help = 'A directory containing genomes in FASTA format.')
    parser.add_argument('-p', '--proteins', dest = 'proteins', default = None, help = 'A directory containing protein amino acids in FASTA format.')
    parser.add_argument('-m', '--hmms', dest = 'hmms', default = None, help = 'A directory containing the results of an HMM search on a set of proteins.')

    parser.add_argument('-d', '--database', dest = 'db_name', default = "FastAAI_database.sqlite.db", help = 'The name of the database you wish to create or add to. The database will be created if it doesn\'t already exist and placed in the output directory. FastAAI_database.sqlite.db by default.')

    parser.add_argument('--output_style', dest = "style", default = 'tsv', help = "Either 'tsv' or 'matrix'. Matrix produces a simplified output of only AAI estimates.")
    parser.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indices. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')
    parser.add_argument('--in_memory', dest = "in_mem", action = 'store_true', help = 'Load both databases into memory before querying. Consumes more RAM, but is faster and reduces file I/O substantially. Consider reducing the number of threads.')
    parser.add_argument('--store_results', dest = "storage", action = 'store_true', help = 'Keep partial results in memory. Only works with --in_memory. Fewer writes, but more RAM. Default off.')

    parser.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins and HMMs. Off by default.')
    parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
    parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

    args, unknown = parser.parse_known_args()

    return parser, args

#Preprocess two sets of genomes A and B into two distinct databases Adb and Bdb, then query Adb against Bdb
def multi_query_opts():
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
        description='''FastAAI module for preprocessing two sets of input files into two separate DBs,
then querying the DBs against each other. Not for use with already-made FastAAI databases.

See the "build_db" action for details on file inputs.
See the "db_query" action for details on querying options.''')

    parser.add_argument('--query_output', dest = 'qoutput', default = "FastAAI_query", help = 'Output directory for query files. Default "FastAAI_query". FastAAI will work if this directory is the same as --target_output, but this is NOT a good idea.')
    parser.add_argument('--target_output', dest = 'toutput', default = "FastAAI_target", help = 'Output directory for target files. Default "FastAAI_target". AAI results will be placed in this directory.')

    parser.add_argument('--query_genomes', dest = 'qgenomes', default = None, help = 'Query genomes')
    parser.add_argument('--target_genomes', dest = 'tgenomes', default = None, help = 'Target genomes')

    parser.add_argument('--query_proteins', dest = 'qproteins', default = None, help = 'Query proteins')
    parser.add_argument('--target_proteins', dest = 'tproteins', default = None, help = 'Target proteins')

    parser.add_argument('--query_hmms', dest = 'qhmms', default = None, help = 'Query HMMs')
    parser.add_argument('--target_hmms', dest = 'thmms', default = None, help = 'Target HMMs')

    parser.add_argument('--query_database', dest = 'qdb_name', default = "FastAAI_query_database.sqlite.db", help = 'Query database name. Default "FastAAI_query_database.sqlite.db"')
    parser.add_argument('--target_database', dest = 'tdb_name', default = "FastAAI_target_database.sqlite.db", help = 'Target database name. Default "FastAAI_target_database.sqlite.db"')

    parser.add_argument('--output_style', dest = "style", default = 'tsv', help = "Either 'tsv' or 'matrix'. Matrix produces a simplified output of only AAI estimates.")
    parser.add_argument('--do_stdev', dest = "do_stdev", action='store_true', help = 'Off by default. Calculate std. deviations on Jaccard indices. Increases memory usage and runtime slightly. Does NOT change estimated AAI values at all.')
    parser.add_argument('--in_memory', dest = "in_mem", action = 'store_true', help = 'Load both databases into memory before querying. Consumes more RAM, but is faster and reduces file I/O substantially. Consider reducing the number of threads.')
    parser.add_argument('--store_results', dest = "storage", action = 'store_true', help = 'Keep partial results in memory. Only works with --in_memory. Fewer writes, but more RAM. Default off.')

    parser.add_argument('--compress', dest = "do_comp", action = 'store_true', help = 'Gzip compress generated proteins and HMMs. Off by default.')
    parser.add_argument('--threads', dest = 'threads', type=int, default = 1, help = 'The number of processors to use. Default 1.')
    parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')

    args, unknown = parser.parse_known_args()

    return parser, args


def main():
    #The currently supported modules.
    modules = ["build_db", "merge_db", "simple_query", "db_query", "single_query", "aai_index", "multi_query", "miga_merge", "miga_preproc", "miga_db_from_crystals"]

    def print_module_list(header):
        print("")
        print(header)
        print("")
        print("-------------------------------------- Database Construction Options --------------------------------------")
        print("")
        print("    build_db     | Create or add to a FastAAI database from genomes, proteins, or proteins and HMMs")
        print("    merge_db     | Add the contents of one FastAAI DB to another")
        print("")
        print("---------------------------------------------- Query Options ----------------------------------------------")
        print("")
        print("    simple_query | Query a genome or protein (one or many) against an existing FastAAI database")
        print("    db_query     | Query the genomes in one FastAAI database against the genomes in another FastAAI database")
        print("")
        print("---------------------------------------------- Other Options ----------------------------------------------")
        print("")
        print("    single_query | Query ONE query genome against ONE target genome")
        print("    multi_query  | Create a query DB and a target DB, then calculate query vs. target AAI")
        print("    aai_index    | Create a database from multiple genomes and do an all vs. all AAI index of the genomes")
        print("")
        print("-----------------------------------------------------------------------------------------------------------")
        print("    To select a module, enter 'FastAAI [module]' into the command line!")
        print("")

    #Print the module list if someone just types FastAAI with no module.
    if len(sys.argv) < 2:
        print_module_list("    No module selected. Please choose one of the following modules:")
        sys.exit()

    #This is the module selection
    selection = sys.argv[1]

    if selection == "version":
        sys.exit("FastAAI version=0.1.17")

    if selection not in modules:
        print_module_list("    I couldn't find the module you specified. Please select one of the following modules:")
        sys.exit()

    #################### Database build or add ########################

    if selection == "build_db":
        parser, opts = build_db_opts()

        #module name only
        if len(sys.argv) < 3:
            parser.print_help()
            sys.exit()

        #Directory based
        genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms

        output = os.path.normpath(opts.output)

        threads = opts.threads
        verbose = opts.verbose

        #Database handle
        db_name = opts.db_name

        do_comp = opts.do_comp

        build_db(genomes, proteins, hmms, db_name, output, threads, verbose, do_comp)


    #################### Merge two DBs ########################

    if selection == "merge_db":
        parser, opts = merge_db_opts()
        if len(sys.argv) < 3:
            parser.print_help()
            sys.exit()

        recipient = opts.recipient
        donors = opts.donors
        donor_file = opts.donor_file
        verbose = opts.verbose
        threads = opts.threads

        if donors is not None and donor_file is not None:
            sys.exit("You cannot specify both --donors and --donor_file.")

        merge_db(recipient, donors, donor_file, verbose, threads)

    #################### Query files vs. DB ########################

    if selection == "simple_query":
        parser, opts = sql_query_opts()

        if len(sys.argv) < 3:
            parser.print_help()
            sys.exit()

        genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms

        db_name = opts.target

        output = opts.output
        threads = opts.threads
        verbose = opts.verbose

        do_stdev = opts.do_stdev

        style, in_mem, make_db, qdb_name, do_comp = opts.style, opts.in_mem, opts.make_db, opts.qdb_name, opts.do_comp

        sql_query(genomes, proteins, hmms, db_name, output, threads, verbose, do_stdev, style, in_mem, make_db, qdb_name, do_comp)


    #################### Query DB vs. DB ###########################
    if selection == "db_query":
        parser, opts = db_query_opts()
        #module name only

        if len(sys.argv) < 3:
            parser.print_help()
            sys.exit()

        query = opts.query
        target = opts.target
        verbose = opts.verbose

        do_stdev = opts.do_stdev
        output = opts.output
        threads = opts.threads

        style, in_mem, store = opts.style, opts.in_mem, opts.storage

        db_query(query, target, verbose, output, threads, do_stdev, style, in_mem, store)

    #################### One-pass functions #######################
    if selection == "single_query":
        parser, opts = single_query_opts()
        #module name only

        if len(sys.argv) < 3:
            parser.print_help()
            sys.exit()

        output = os.path.normpath(opts.output)
        try:
            threads = int(opts.threads)
        except:
            print("Couldn't interpret your threads argument. Defaulting to 1.")
            threads = 1
        verbose = opts.verbose
        do_compress = opts.do_comp

        query_genome = opts.query_genome
        query_protein = opts.query_protein
        query_hmm = opts.query_hmm

        query_file = fastaai_file_importer(genomes = query_genome, proteins = query_protein, hmms = query_hmm, output = output, compress = do_compress)
        query_file.determine_inputs()

        target_genome = opts.target_genome
        target_protein = opts.target_protein
        target_hmm = opts.target_hmm

        target_file = fastaai_file_importer(genomes = target_genome, proteins = target_protein, hmms = target_hmm, output = output, compress = do_compress)
        target_file.determine_inputs()

        is_ok = True
        if len(query_file.in_files) != 1:
            print("Query genome unacceptable. Check your inputs.")
            is_ok = False

        if len(target_file.in_files) != 1:
            print("Target genome unacceptable. Check your inputs.")
            is_ok = False
        if is_ok:
            good_to_go = prepare_directories(output, query_file.status, "query")
            if good_to_go:
                good_to_go = prepare_directories(output, target_file.status, "query")
                if good_to_go:
                    single_query(query_file, target_file, output, verbose, threads, do_compress)


    if selection == "aai_index":
        parser, opts = aai_index_opts()

        if len(sys.argv) < 3:
            parser.print_help()
            sys.exit()

        genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms

        output = os.path.normpath(opts.output)

        threads = opts.threads
        verbose = opts.verbose

        #Database handle
        db_name = opts.db_name

        do_comp = opts.do_comp

        do_stdev = opts.do_stdev

        style, in_mem, store = opts.style, opts.in_mem, opts.storage

        #This is the same logic as in the build_db section; it's what we need for getting the DB name.
        #Check whether the db name contains path info, incl. the Windows separator.
        if "/" not in db_name and "\\" not in db_name:
            final_database = os.path.normpath(output + "/database/" + db_name)
        else:
            #If the person insists that the db has a path, let them.
            final_database = db_name

        build_db(genomes, proteins, hmms, db_name, output, threads, verbose, do_comp)

        query, target = final_database, final_database

        db_query(query, target, verbose, output, threads, do_stdev, style, in_mem, store)


    if selection == "multi_query":
        parser, opts = multi_query_opts()

        if len(sys.argv) < 3:
            parser.print_help()
            sys.exit()

        #Shared options
        threads = opts.threads
        verbose = opts.verbose

        #query options
        do_comp = opts.do_comp
        do_stdev = opts.do_stdev
        style, in_mem, store = opts.style, opts.in_mem, opts.storage

        #query inputs
        qgenomes, qproteins, qhmms = opts.qgenomes, opts.qproteins, opts.qhmms
        qoutput = os.path.normpath(opts.qoutput)
        qdb_name = opts.qdb_name
        #This is the same logic as in the build_db section; it's what we need for getting the DB name.
        #Check whether the db name contains path info, incl. the Windows separator.
        if "/" not in qdb_name and "\\" not in qdb_name:
            final_qdb = os.path.normpath(qoutput + "/database/" + qdb_name)
        else:
            #If the person insists that the db has a path, let them.
            final_qdb = qdb_name

        #target inputs
        tgenomes, tproteins, thmms = opts.tgenomes, opts.tproteins, opts.thmms
        toutput = os.path.normpath(opts.toutput)
        tdb_name = opts.tdb_name
        #This is the same logic as in the build_db section; it's what we need for getting the DB name.
        #Check whether the db name contains path info, incl. the Windows separator.
        if "/" not in tdb_name and "\\" not in tdb_name:
            final_tdb = os.path.normpath(toutput + "/database/" + tdb_name)
        else:
            #If the person insists that the db has a path other than output/database, let them.
            final_tdb = tdb_name

        #run query build
        build_db(qgenomes, qproteins, qhmms, qdb_name, qoutput, threads, verbose, do_comp)
        #run target build
        build_db(tgenomes, tproteins, thmms, tdb_name, toutput, threads, verbose, do_comp)
        #run query db against target db
        db_query(final_qdb, final_tdb, verbose, toutput, threads, do_stdev, style, in_mem, store)

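    # Illustrative sketch (not part of FastAAI): a multi_query run with
    # hypothetical paths, building a query DB and a target DB and then comparing
    # them. AAI results are written under the target output directory:
    #
    #   fastaai multi_query --query_genomes query_genomes/ --target_genomes target_genomes/ \
    #       --query_output FastAAI_query --target_output FastAAI_target --threads 4
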
    ############## MiGA module #################
    if selection == "miga_merge":
        parser, opts = miga_merge_opts()

        #module name only
        if len(sys.argv) < 3:
            parser.print_help()
            sys.exit()

        g, p, h = opts.gen, opts.prot, opts.hmm

        target = opts.database

        verbose = opts.verbose

        output_path = opts.output

        if target is None:
            target = os.path.normpath(output_path + "/database/FastAAI_database.sqlite.db")

        do_compress = opts.compress

        imported_files = fastaai_file_importer(genomes = g, proteins = p, hmms = h,
            output = output_path, compress = do_compress)

        imported_files.determine_inputs()

        if len(imported_files.in_files) == 0:
            print("Something was wrong with your input file.")
        else:
            input_genome = imported_files.in_files[0]

            good_to_go = prepare_directories(output_path, imported_files.status, "build")

            miga_merge(input_genome, target, verbose, do_compress)

            #This is where a new db would normally be created;
            #that doesn't happen when the supplied target is some other sort of path,
            #so remove the default database directory if it was left empty.
            output_default = os.path.normpath(output_path + "/database")
            if len(os.listdir(output_default)) == 0:
                os.rmdir(output_default)

    if selection == "miga_preproc":
        parser, opts = miga_preproc_opts()

        #module name only
        if len(sys.argv) < 3:
            parser.print_help()
            sys.exit()

        #Directory based
        genomes, proteins, hmms = opts.genomes, opts.proteins, opts.hmms

        output = os.path.normpath(opts.output)

        threads = opts.threads
        verbose = opts.verbose

        do_comp = opts.do_comp

        miga_preproc(genomes, proteins, hmms, output, threads, verbose, do_comp)

    if selection == "miga_db_from_crystals":
        parser, opts = miga_db_from_crystals_opts()

        #module name only
        if len(sys.argv) < 3:
            parser.print_help()
            sys.exit()

        crystals = opts.crystals

        if crystals is None:
            print("I need to be given crystals to proceed!")
            quit()

        db_name = opts.db_name
        try:
            threads = int(opts.threads)
        except:
            threads = 1
            print("Can't recognize the threads parameter:", str(opts.threads), "- defaulting to 1.")

        verbose = opts.verbose
        output_path = opts.output

        miga_db_from_crystals(crystals, output_path, db_name, threads, verbose)


    return None

if __name__ == "__main__":
    main()