miga-base 1.2.18.1 → 1.3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/doctor/base.rb +2 -1
- data/lib/miga/cli/action/init.rb +1 -1
- data/lib/miga/dataset/result/add.rb +3 -2
- data/lib/miga/lair.rb +9 -3
- data/lib/miga/version.rb +2 -2
- data/scripts/essential_genes.bash +4 -8
- data/utils/FastAAI/LICENSE +8 -0
- data/utils/FastAAI/README.md +151 -40
- data/utils/FastAAI/__init__.py +1 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz +0 -0
- data/utils/FastAAI/fastaai/__init__.py +1 -0
- data/utils/FastAAI/fastaai/fastaai +4805 -0
- data/utils/FastAAI/fastaai/fastaai.py +4805 -0
- data/utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py +297 -0
- data/utils/FastAAI/fastaai/fastaai_miga_preproc.py +931 -0
- data/utils/FastAAI/metadata/Accession_names_and_IDs.txt +122 -0
- data/utils/distance/commands.rb +51 -23
- metadata +23 -6
- data/utils/FastAAI/FastAAI +0 -3659
- /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Archaea_SCG.hmm +0 -0
- /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Bacteria_SCG.hmm +0 -0
- /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -0
@@ -0,0 +1,931 @@
|
|
1
|
+
import sys
|
2
|
+
import os
|
3
|
+
import pyrodigal as pd
|
4
|
+
import pyhmmer
|
5
|
+
|
6
|
+
import gzip
|
7
|
+
from collections import namedtuple
|
8
|
+
import argparse
|
9
|
+
import datetime
|
10
|
+
import json
|
11
|
+
|
12
|
+
import numpy as np
|
13
|
+
|
14
|
+
class fasta_file:
    """Minimal in-memory FASTA parser.

    Reads a (possibly gzipped) FASTA file into ``self.contents`` as a dict of
    {sequence_id: sequence}. The ID is the first whitespace-delimited token
    after '>'; the rest of the header line is kept in ``self.deflines``.
    """
    def __init__(self, file):
        self.file_path = os.path.abspath(file)

        #seqid -> full sequence string (newlines removed)
        self.contents = {}
        #seqid -> description text after the ID; previously computed but
        #discarded by read_fasta(), now retained (backward-compatible addition)
        self.deflines = {}

        self.read_fasta()

    def read_fasta(self):
        """Parse self.file_path and populate self.contents / self.deflines."""
        cur_seq = ""
        cur_prot = ""
        #Fix: initialize so a malformed file that starts with sequence data
        #(no '>' header) cannot raise NameError at the final flush below.
        defline = ""

        contents = {}
        deflines = {}

        fasta = agnostic_reader(self.file_path)
        for line in fasta:
            if line.startswith(">"):
                #Flush the previous record, if any
                if len(cur_seq) > 0:
                    contents[cur_prot] = cur_seq
                    deflines[cur_prot] = defline
                    cur_seq = ""
                cur_prot = line.strip().split()[0][1:]
                defline = line.strip()[len(cur_prot)+1 :].strip()
            else:
                cur_seq += line.strip()

        fasta.close()

        #Final record has no following '>' to trigger a flush
        if len(cur_seq) > 0:
            contents[cur_prot] = cur_seq
            deflines[cur_prot] = defline

        self.contents = contents
        self.deflines = deflines

        return None
class agnostic_reader_iterator:
    """Line iterator over an agnostic_reader.

    Decodes bytes when the underlying handle was opened by gzip, so callers
    always receive str lines regardless of compression.
    """
    def __init__(self, reader):
        self.handle_ = reader.handle
        self.is_gz_ = reader.is_gz

    def __next__(self):
        raw = self.handle_.readline()
        line = raw.decode() if self.is_gz_ else raw
        #readline() returns an empty string only at end of file
        if not line:
            raise StopIteration
        return line
#File reader that doesn't care if you give it a gzipped file or not.
class agnostic_reader:
    """Open a path for reading, transparently handling gzip compression.

    Compression is detected from the two-byte gzip magic number at the start
    of the file, never from the file extension.
    """
    def __init__(self, file):
        self.path = file

        #Sniff the first two bytes for the gzip magic number
        with open(file, 'rb') as probe:
            magic = probe.read(2)

        self.is_gz = (magic == b'\x1f\x8b')

        #gzip handles yield bytes; plain handles yield str (iterator decodes)
        self.handle = gzip.open(self.path) if self.is_gz else open(self.path)

    def __iter__(self):
        return agnostic_reader_iterator(self)

    def close(self):
        self.handle.close()
class pyrodigal_manager:
    #Wrapper around pyrodigal gene prediction: trains on the input contigs,
    #predicts genes under each candidate translation table, keeps the table
    #with the best coding density, and writes/converts the protein translations.
    def __init__(self, sequences = None,
        trans_tables = [11, 4], #translation tables to use - only relevant if training input is None and is_meta is False
        aa = None, compress = False):

        #NOTE(review): meta mode is hardcoded off here; train_manager() sets a
        #DIFFERENT attribute (self.is_meta) when it falls back to meta mode,
        #and the rest of the class only tests self.meta - confirm upstream.
        self.meta = False
        self.gene_predictors = {}  #translation table -> trained pd.OrfFinder

        self.sequences = sequences  #dict of seqid -> sequence str (encoded to bytes by sequence_handler)
        self.seqlens = {}  #seqid -> sequence length in bp

        self.training_seq = []
        self.running_sum = 0 #training sequence current length
        self.training_needs_join = False #needed if more than 1 seq added to training sequence
        self.training_done = False

        self.trans_tables = trans_tables
        self.training_data = {}  #translation table -> pyrodigal training_info

        self.aa = aa  #output path for the amino-acid FASTA, or None to skip writing

        #table -> seqid -> genes; collapsed to seqid -> genes after table selection
        self.predicted_genes = {}
        #table -> fraction of bases covered by predicted genes
        self.coding_densities = {}
        if self.meta:
            self.predicted_genes[-1] = {}
            self.coding_densities[-1] = {}
        else:
            for t in self.trans_tables:
                self.predicted_genes[t] = {}
                self.coding_densities[t] = {}

        self.log = []  #human-readable progress/warning messages
        self.do_compress = compress  #gzip the amino-acid output file

    def sequence_handler(self):
        #Encode all sequences to bytes, record their lengths, accumulate the
        #training-length budget, then make sure training has run.
        self.training_seq = []
        self.training_needs_join = False
        if not self.meta:
            for seqid in self.sequences:
                current_seqlen = len(self.sequences[seqid])
                self.seqlens[seqid] = current_seqlen #get length
                self.sequences[seqid] = self.sequences[seqid].encode() #to binary

                self.training_seq.append(self.sequences[seqid]) #add to training set
                self.running_sum += current_seqlen #running total of 32 million or less

                if self.training_needs_join:
                    #account for the 12 bp b'TTAATTAATTAA' joiner added at training time
                    self.running_sum += 12

                self.training_needs_join = True

            if self.running_sum > 32000000:
                #over pyrodigal's training cap; train_manager truncates to 32 Mbp
                self.train_manager()

        #Fallback: guarantees training happens exactly once per run
        if not self.training_done:
            self.train_manager()

    def convert_seq(self, this_seqid):
        #Join a list-of-strings sequence into one string, encode it to bytes
        #in place, and return its length in bp.
        self.sequences[this_seqid] = ''.join(self.sequences[this_seqid])
        seqlen = len(self.sequences[this_seqid])
        self.sequences[this_seqid] = self.sequences[this_seqid].encode()
        return seqlen

    def train_manager(self):
        if not self.training_done: #Make sure I don't happen twice
            self.training_done = True #Make sure I don't happen twice
            if self.running_sum < 20000:
                #pyrodigal cannot train on tiny inputs; fall back to metagenomic mode
                self.log.append("Can't train on 20 thousand or fewer characters. Switching to meta mode.")
                self.gene_predictor = pd.OrfFinder(meta=True)
                #NOTE(review): predict() iterates self.gene_predictors (plural),
                #which stays empty on this path - the meta fallback appears to
                #produce no predictions downstream; confirm upstream intent.
                self.is_meta = True
            else:
                #Collect sequences into a prodigal-formatted string
                self.training_seq = b'TTAATTAATTAA'.join(self.sequences.values())

                #Truncate to 32 million bp if needed
                if self.running_sum > 32000000:
                    self.log.append("Warning: Sequence is long (max 32000000 for training).")
                    self.log.append("Training on the first 32000000 bases.")
                    self.training_seq = self.training_seq[0:32000000]

                #G is 71, C is 67; we're counting G + C and dividing by the total.
                gc = round(((self.training_seq.count(67) + self.training_seq.count(71))/ len(self.training_seq)) * 100, 2)

                self.log.append(str(len(self.training_seq)) + " bp training seq created, " + str(gc) + " pct GC")

                #Intialize orffinder
                self.gene_predictor = pd.OrfFinder(meta=False)

                #Create training data on each sequence
                for i in range(0, len(self.trans_tables)):
                    #consumes self.trans_tables; the table keys live on in
                    #predicted_genes / coding_densities / gene_predictors
                    next_table = self.trans_tables.pop(0)

                    self.gene_predictors[next_table] = pd.OrfFinder(meta = False)
                    self.gene_predictors[next_table].train(self.training_seq, translation_table = next_table)

                    #Making another OrfFinder instance with this will allow quick swapping while comparing tables.
                    self.training_data[next_table] = self.gene_predictors[next_table].training_info

                #Clean up afterwards
                self.training_seq = None

    def predict(self):
        #Run gene prediction with every trained table, then keep the best table.
        #Eliminate sequence entries to prevent memory bloat.
        #Usually just grabs one sequence.
        remaining_sequence_ids = tuple(self.sequences.keys())
        for seqid in remaining_sequence_ids:
            sequence = self.sequences.pop(seqid)

            for tt in self.gene_predictors:
                #How do we get this working with the training data instances...
                next_genes = self.gene_predictors[tt].find_genes(sequence)

                #Keep internal copy if the goal is to reuse them in another program
                self.predicted_genes[tt][seqid] = next_genes #Easier to retain like this and call gene functions.

        self.compare_predicted_genes()

    def compare_predicted_genes(self):
        #Pick the translation table with the highest coding density and collapse
        #self.predicted_genes from {table: {seqid: genes}} to {seqid: genes}.
        if len(self.predicted_genes) == 1:
            #Only one candidate table - nothing to compare.
            #NOTE(review): this branch leaves the table->seqid nesting in place,
            #which write_aa_file() does not expect - confirm upstream.
            pass
        else:
            for tt in self.predicted_genes:
                total_seqlen = 0
                total_coding_bases = 0
                for seqid in self.predicted_genes[tt]:
                    seqlen = self.seqlens[seqid]
                    total_seqlen += seqlen
                    for gene in self.predicted_genes[tt][seqid]:
                        total_coding_bases += (gene.end - gene.begin + 1) #Sequence is 1 longer because it's inclusive

                self.coding_densities[tt] = total_coding_bases/total_seqlen

            tables_to_remove = list(self.coding_densities.keys())
            winning_table = None
            winning_density = 0
            for tt in self.coding_densities:
                #A challenger must beat the incumbent by >10% to take over, so
                #near-ties favor the earlier (preferred) translation table.
                if self.coding_densities[tt] > 1.1 * winning_density:
                    winning_density = self.coding_densities[tt]
                    winning_table = tt

            tables_to_remove.pop(tables_to_remove.index(winning_table)) #keep the winning table by removing all others

            self.log.append("Winning translation table was: " + str(winning_table) + " with coding density " + str(round(winning_density, 4)))
            for t in tables_to_remove:
                self.log.append("Losing translation table: " + str(t) + " had coding density" + str(round(self.coding_densities[t], 4)))

            self.predicted_genes = self.predicted_genes[winning_table] #keep the winning set.

    def format_seq(self, seq, num_chars = 60):
        #Wrap a sequence into num_chars-wide lines for FASTA output.
        #ceiling funciton without the math module
        ceiling = int(round((len(seq)/num_chars)+0.5, 0))
        formatted = '\n'.join([seq[(i*num_chars):(i+1)*num_chars] for i in range(0, ceiling)])
        formatted = formatted.strip()

        return formatted

    def write_aa_file(self):
        #Write predicted protein translations as (optionally gzipped) FASTA.
        #No-op when self.aa is None.
        if self.aa is not None:
            content = []

            seqnum = 1
            for seqid in self.predicted_genes:
                gene_num = 1
                for g in self.predicted_genes[seqid]:
                    #print(g)
                    #Headers follow prodigal's ">id_N # start # end # strand # annotation" convention
                    protein_name = ">" + seqid + "_" + str(gene_num)
                    #table = g.translation_table
                    start = str(g.begin)
                    end = str(g.end)
                    strand = str(g.strand)
                    #NOTE(review): _gene_data is private pyrodigal API - may break across versions
                    annotation = g._gene_data(seqnum)
                    translation = g.translate()
                    writeable_trans = self.format_seq(translation)
                    translation = None

                    header = " # ".join([protein_name, start, end, strand, annotation])

                    content.append(header)
                    content.append(writeable_trans)

                    gene_num += 1

                seqnum += 1

            content = "\n".join(content)
            content += "\n" #final newline

            if self.do_compress:
                if not self.aa.endswith(".gz"):
                    self.aa += ".gz"

                content = content.encode()

                output_writer = gzip.open(self.aa, "wb")
            else:
                output_writer = open(self.aa, "w")

            output_writer.write(content)

            output_writer.close()

            content = None

    def convert_to_internal_rep(self): #go from pyrodigal objects to protein name:translation dict
        conversion = {}
        for seqid in self.predicted_genes:
            gene_num = 1
            for g in self.predicted_genes[seqid]:
                #print(g)
                #Same "seqid_N" naming as write_aa_file, minus the '>' prefix
                protein_name = seqid + "_" + str(gene_num)
                translation = g.translate()
                conversion[protein_name] = translation
                gene_num += 1

        self.predicted_genes = conversion
        conversion = None

    def run(self):
        #Full pipeline: train, predict, write proteins, convert to plain dict.
        self.sequence_handler()
        self.predict()
        self.write_aa_file()
        self.convert_to_internal_rep()
class pyhmmer_manager:
    """Run pyhmmer searches of proteins against the FastAAI SCG HMM models,
    keep one best protein per accession, and vote Bacteria vs Archaea.

    Fixes over the original:
    - assign_domain() previously wrote the misspelled attribute
      'aechaeal_fraction', leaving self.archaeal_fraction permanently None.
    - run_for_fastaai() printed the undefined name 'output', raising a
      NameError that was swallowed by the outer except and took the wrong
      error path.
    """
    def __init__(self, do_compress):
        self.hmm_model = []  #list of plan7 HMMs loaded by load_hmm_from_file()

        self.proteins_to_search = []  #digitized easel sequences
        self.protein_descriptions = None  #optional seqid -> defline mapping

        #Parallel lists filled by execute_search()
        self.hmm_result_proteins = []
        self.hmm_result_accessions = []
        self.hmm_result_scores = []

        self.printable_lines = []  #tab-separated rows for the HMM output file

        self.bacterial_SCPs = None
        self.archaeal_SCPs = None
        self.assign_hmm_sets()
        self.domain_counts = {"Bacteria" : 0, "Archaea": 0}
        #Starts as {domain: SCP-set size}; assign_domain() replaces it with the
        #winning domain name string.
        self.voted_domain = {"Bacteria" : len(self.bacterial_SCPs), "Archaea" : len(self.archaeal_SCPs)}

        self.bacterial_fraction = None
        self.archaeal_fraction = None

        self.best_hits = None  #protein name -> SQL-safe accession

        self.do_compress = do_compress

    #Load HMM
    def load_hmm_from_file(self, hmm_path):
        """Append every model from an HMM file to self.hmm_model."""
        hmm_set = pyhmmer.plan7.HMMFile(hmm_path)
        for hmm in hmm_set:
            self.hmm_model.append(hmm)

    #Set archaeal and bacterial HMM sets.
    def assign_hmm_sets(self):
        """Define the Pfam accession -> SCP name maps for each domain."""
        self.bacterial_SCPs = {'PF00709_21': 'Adenylsucc_synt', 'PF00406_22': 'ADK', 'PF01808_18': 'AICARFT_IMPCHas', 'PF00231_19': 'ATP-synt',
        'PF00119_20': 'ATP-synt_A', 'PF01264_21': 'Chorismate_synt', 'PF00889_19': 'EF_TS', 'PF01176_19': 'eIF-1a',
        'PF02601_15': 'Exonuc_VII_L', 'PF01025_19': 'GrpE', 'PF01725_16': 'Ham1p_like', 'PF01715_17': 'IPPT',
        'PF00213_18': 'OSCP', 'PF01195_19': 'Pept_tRNA_hydro', 'PF00162_19': 'PGK', 'PF02033_18': 'RBFA', 'PF02565_15': 'RecO_C',
        'PF00825_18': 'Ribonuclease_P', 'PF00687_21': 'Ribosomal_L1', 'PF00572_18': 'Ribosomal_L13',
        'PF00238_19': 'Ribosomal_L14', 'PF00252_18': 'Ribosomal_L16', 'PF01196_19': 'Ribosomal_L17',
        'PF00861_22': 'Ribosomal_L18p', 'PF01245_20': 'Ribosomal_L19', 'PF00453_18': 'Ribosomal_L20',
        'PF00829_21': 'Ribosomal_L21p', 'PF00237_19': 'Ribosomal_L22', 'PF00276_20': 'Ribosomal_L23',
        'PF17136_4': 'ribosomal_L24', 'PF00189_20': 'Ribosomal_S3_C', 'PF00281_19': 'Ribosomal_L5', 'PF00181_23': 'Ribosomal_L2',
        'PF01016_19': 'Ribosomal_L27', 'PF00828_19': 'Ribosomal_L27A', 'PF00830_19': 'Ribosomal_L28',
        'PF00831_23': 'Ribosomal_L29', 'PF00297_22': 'Ribosomal_L3', 'PF01783_23': 'Ribosomal_L32p',
        'PF01632_19': 'Ribosomal_L35p', 'PF00573_22': 'Ribosomal_L4', 'PF00347_23': 'Ribosomal_L6',
        'PF03948_14': 'Ribosomal_L9_C', 'PF00338_22': 'Ribosomal_S10', 'PF00411_19': 'Ribosomal_S11',
        'PF00416_22': 'Ribosomal_S13', 'PF00312_22': 'Ribosomal_S15', 'PF00886_19': 'Ribosomal_S16',
        'PF00366_20': 'Ribosomal_S17', 'PF00203_21': 'Ribosomal_S19', 'PF00318_20': 'Ribosomal_S2',
        'PF01649_18': 'Ribosomal_S20p', 'PF01250_17': 'Ribosomal_S6', 'PF00177_21': 'Ribosomal_S7',
        'PF00410_19': 'Ribosomal_S8', 'PF00380_19': 'Ribosomal_S9', 'PF00164_25': 'Ribosom_S12_S23',
        'PF01193_24': 'RNA_pol_L', 'PF01192_22': 'RNA_pol_Rpb6', 'PF01765_19': 'RRF', 'PF02410_15': 'RsfS',
        'PF03652_15': 'RuvX', 'PF00584_20': 'SecE', 'PF03840_14': 'SecG', 'PF00344_20': 'SecY', 'PF01668_18': 'SmpB',
        'PF00750_19': 'tRNA-synt_1d', 'PF01746_21': 'tRNA_m1G_MT', 'PF02367_17': 'TsaE', 'PF02130_17': 'UPF0054',
        'PF02699_15': 'YajC'}

        self.archaeal_SCPs = {'PF00709_21': 'Adenylsucc_synt', 'PF05221_17': 'AdoHcyase', 'PF01951_16': 'Archease', 'PF01813_17': 'ATP-synt_D',
        'PF01990_17': 'ATP-synt_F', 'PF01864_17': 'CarS-like', 'PF01982_16': 'CTP-dep_RFKase', 'PF01866_17': 'Diphthamide_syn',
        'PF04104_14': 'DNA_primase_lrg', 'PF01984_20': 'dsDNA_bind', 'PF04010_13': 'DUF357', 'PF04019_12': 'DUF359',
        'PF04919_12': 'DUF655', 'PF01912_18': 'eIF-6', 'PF05833_11': 'FbpA', 'PF01725_16': 'Ham1p_like',
        'PF00368_18': 'HMG-CoA_red', 'PF00334_19': 'NDK', 'PF02006_16': 'PPS_PS', 'PF02996_17': 'Prefoldin',
        'PF01981_16': 'PTH2', 'PF01948_18': 'PyrI', 'PF00687_21': 'Ribosomal_L1', 'PF00572_18': 'Ribosomal_L13',
        'PF00238_19': 'Ribosomal_L14', 'PF00827_17': 'Ribosomal_L15e', 'PF00252_18': 'Ribosomal_L16',
        'PF01157_18': 'Ribosomal_L21e', 'PF00237_19': 'Ribosomal_L22', 'PF00276_20': 'Ribosomal_L23',
        'PF16906_5': 'Ribosomal_L26', 'PF00831_23': 'Ribosomal_L29', 'PF00297_22': 'Ribosomal_L3',
        'PF01198_19': 'Ribosomal_L31e', 'PF01655_18': 'Ribosomal_L32e', 'PF01780_19': 'Ribosomal_L37ae',
        'PF00832_20': 'Ribosomal_L39', 'PF00573_22': 'Ribosomal_L4', 'PF00935_19': 'Ribosomal_L44', 'PF17144_4': 'Ribosomal_L5e',
        'PF00347_23': 'Ribosomal_L6', 'PF00411_19': 'Ribosomal_S11', 'PF00416_22': 'Ribosomal_S13',
        'PF00312_22': 'Ribosomal_S15', 'PF00366_20': 'Ribosomal_S17', 'PF00833_18': 'Ribosomal_S17e',
        'PF00203_21': 'Ribosomal_S19', 'PF01090_19': 'Ribosomal_S19e', 'PF00318_20': 'Ribosomal_S2',
        'PF01282_19': 'Ribosomal_S24e', 'PF01667_17': 'Ribosomal_S27e', 'PF01200_18': 'Ribosomal_S28e',
        'PF01015_18': 'Ribosomal_S3Ae', 'PF00177_21': 'Ribosomal_S7', 'PF00410_19': 'Ribosomal_S8',
        'PF01201_22': 'Ribosomal_S8e', 'PF00380_19': 'Ribosomal_S9', 'PF00164_25': 'Ribosom_S12_S23',
        'PF06026_14': 'Rib_5-P_isom_A', 'PF01351_18': 'RNase_HII', 'PF13656_6': 'RNA_pol_L_2',
        'PF01194_17': 'RNA_pol_N', 'PF03874_16': 'RNA_pol_Rpb4', 'PF01192_22': 'RNA_pol_Rpb6',
        'PF01139_17': 'RtcB', 'PF00344_20': 'SecY', 'PF06093_13': 'Spt4', 'PF00121_18': 'TIM', 'PF01994_16': 'Trm56',
        'PF00749_21': 'tRNA-synt_1c', 'PF00750_19': 'tRNA-synt_1d', 'PF13393_6': 'tRNA-synt_His',
        'PF01142_18': 'TruD', 'PF01992_16': 'vATP-synt_AC39', 'PF01991_18': 'vATP-synt_E', 'PF01496_19': 'V_ATPase_I'}

    #Convert passed sequences.
    def convert_protein_seqs_in_mem(self, contents):
        """Digitize a {name: AA-sequence} dict into easel sequences for search."""
        #Clean up.
        self.proteins_to_search = []

        for protein in contents:
            #Skip a protein if it's longer than 100k AA.
            if len(contents[protein]) >= 100000:
                continue
            as_bytes = protein.encode()
            #Pyhmmer digitization of sequences for searching.
            easel_seq = pyhmmer.easel.TextSequence(name = as_bytes, sequence = contents[protein])
            easel_seq = easel_seq.digitize(pyhmmer.easel.Alphabet.amino())
            self.proteins_to_search.append(easel_seq)

        easel_seq = None

    def execute_search(self):
        """Search all digitized proteins against the loaded HMMs and collect
        per-hit rows (HMMER tblout-like) plus the parallel result lists."""
        top_hits = list(pyhmmer.hmmsearch(self.hmm_model, self.proteins_to_search, cpus=1, bit_cutoffs="trusted"))

        self.printable_lines = []

        self.hmm_result_proteins = []
        self.hmm_result_accessions = []
        self.hmm_result_scores = []

        for model in top_hits:
            for hit in model:
                target_name = hit.name.decode()
                target_acc = hit.accession
                if target_acc is None:
                    target_acc = "-"
                else:
                    target_acc = target_acc.decode()

                query_name = hit.best_domain.alignment.hmm_name.decode()
                query_acc = hit.best_domain.alignment.hmm_accession.decode()

                full_seq_evalue = "%.2g" % hit.evalue
                full_seq_score = round(hit.score, 1)
                full_seq_bias = round(hit.bias, 1)

                best_dom_evalue = "%.2g" % hit.best_domain.alignment.domain.i_evalue
                best_dom_score = round(hit.best_domain.alignment.domain.score, 1)
                best_dom_bias = round(hit.best_domain.alignment.domain.bias, 1)

                #I don't know how to get most of these values.
                #Placeholder zeroes keep the tblout column layout intact.
                exp = 0
                reg = 0
                clu = 0
                ov = 0
                env = 0
                dom = len(hit.domains)
                rep = 0
                inc = 0

                try:
                    description = self.protein_descriptions[target_name]
                except (KeyError, TypeError):
                    #missing key, or protein_descriptions was never supplied
                    description = ""

                writeout = [target_name, target_acc, query_name, query_acc, full_seq_evalue, \
                full_seq_score, full_seq_bias, best_dom_evalue, best_dom_score, best_dom_bias, \
                exp, reg, clu, ov, env, dom, rep, inc, description]

                #Format and join.
                writeout = [str(i) for i in writeout]
                writeout = '\t'.join(writeout)

                self.printable_lines.append(writeout)

                self.hmm_result_proteins.append(target_name)
                self.hmm_result_accessions.append(query_acc)
                self.hmm_result_scores.append(best_dom_score)

    def filter_to_best_hits(self):
        """Reduce raw hits to one best-scoring protein per accession, stored
        in self.best_hits as {protein: SQL-safe accession}."""
        hmm_file = np.transpose(np.array([self.hmm_result_proteins, self.hmm_result_accessions, self.hmm_result_scores]))

        #hmm_file = np.loadtxt(hmm_file_name, comments = '#', usecols = (0, 3, 8), dtype=(str))
        #Sort the hmm file based on the score column in descending order.
        hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]

        #Identify the first row where each gene name appears, after sorting by score;
        #in effect, return the highest scoring assignment per gene name
        #Sort the indices of the result to match the score-sorted table instead of alphabetical order of gene names
        hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]

        #Filter the file again for the unique ACCESSION names, since we're only allowed one gene per accession, I guess?
        #Don't sort the indices, we don't care about the scores anymore.
        hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]

        #SQLite-friendly accession names: 'PF00709.21' -> 'PF00709_21'
        sql_friendly_names = [i.replace(".", "_") for i in hmm_file[:,1]]

        self.best_hits = dict(zip(hmm_file[:,0], sql_friendly_names))

        hmm_file = None

    #Count per-dom occurs.
    def assign_domain(self):
        """Vote Bacteria vs Archaea from SCP hit fractions, then drop best
        hits that do not belong to the winning domain's SCP set."""
        for prot in self.best_hits.values():
            #An accession may legitimately belong to both SCP sets
            if prot in self.bacterial_SCPs:
                self.domain_counts["Bacteria"] += 1
            if prot in self.archaeal_SCPs:
                self.domain_counts["Archaea"] += 1

        #Fraction of each domain's SCP set that was observed; voted_domain is
        #still the {domain: set-size} dict at this point.
        self.bacterial_fraction = self.domain_counts["Bacteria"] / self.voted_domain["Bacteria"]
        #Fixed: was written to a misspelled attribute 'aechaeal_fraction',
        #leaving self.archaeal_fraction permanently None.
        self.archaeal_fraction = self.domain_counts["Archaea"] / self.voted_domain["Archaea"]

        #Ties go to Bacteria
        if self.bacterial_fraction >= self.archaeal_fraction:
            self.voted_domain = "Bacteria"
        else:
            self.voted_domain = "Archaea"

        pop_keys = list(self.best_hits.keys())
        for key in pop_keys:
            if self.voted_domain == "Bacteria":
                if self.best_hits[key] not in self.bacterial_SCPs:
                    self.best_hits.pop(key)
            if self.voted_domain == "Archaea":
                if self.best_hits[key] not in self.archaeal_SCPs:
                    self.best_hits.pop(key)

    def to_hmm_file(self, output):
        """Write the collected search rows to 'output' (optionally gzipped).
        No-op when output is None."""
        if output is not None:
            #PyHMMER data is a bit hard to parse. For each result:
            content = '\n'.join(self.printable_lines) + '\n'

            if self.do_compress:
                if not output.endswith(".gz"):
                    output += ".gz"

                content = content.encode()

                fh = gzip.open(output, "wb")
                fh.write(content)
                fh.close()
                content = None

            else:
                fh = open(output, "w")

                fh.write(content)

                fh.close()

                content = None

    #If we're doing this step at all, we've either loaded the seqs into mem by reading the prot file
    #or have them in mem thanks to pyrodigal.
    def run_for_fastaai(self, prots, hmm_output):
        """Best-effort driver: search, filter, and write results; on failure
        report and leave best_hits as None rather than raising."""
        try:
            self.convert_protein_seqs_in_mem(prots)
            self.execute_search()
            self.filter_to_best_hits()
            try:
                self.to_hmm_file(hmm_output)
            except:
                #Fixed: previously printed the undefined name 'output', whose
                #NameError fell through to the outer handler instead.
                print(hmm_output, "cannot be created. HMM search failed. This file will be skipped.")

        except:
            print(hmm_output, "failed to run through HMMER!")
            self.best_hits = None
class mining_straight_down:
    """Convert best-hit proteins into a FastAAI 'crystal' JSON document:
    per-accession unique tetramer integer keys for one genome."""
    def __init__(self, basename = None, protein_list = None, crystal_output = None, compress = False):
        self.basename = basename  #genome identifier recorded in the JSON
        self.proteins_to_format = protein_list  #list of (protein_name, accession, sequence) tuples
        self.output_file = crystal_output  #destination path for the JSON crystal
        self.formatted_data = None  #dict built by prepare_data(); serialized by to_json()
        self.do_compress = compress

    #Translate tetramers to unique int32 indices.
    def unique_kmer_simple_key(self, seq):
        """Return the sorted, unique int32 keys of every 4-mer in seq.

        Each key concatenates the four characters' ASCII codes as 2-digit
        groups (e.g. 'AAAA' -> 65656565). Robustness fix: sequences shorter
        than 4 characters return an empty array instead of crashing on a
        negative reshape dimension.
        """
        #num tetramers = len(seq) - 4 + 1, just make it -3.
        n_kmers = len(seq) - 3

        if n_kmers < 1:
            return np.array([], dtype = np.int32)

        #Converts the characters in a sequence into their ascii int value
        as_ints = np.array([ord(i) for i in seq], dtype = np.int32)

        #create seq like 0,1,2,3; 1,2,3,4; 2,3,4,5... for each tetramer that needs a value
        kmers = np.arange(4*n_kmers)
        kmers = kmers % 4 + kmers // 4

        #Select the characters (as ints) corresponding to each tetramer all at once and reshape into rows of 4,
        #each row corresp. to a successive tetramer
        kmers = as_ints[kmers].reshape((n_kmers, 4))

        #Given four 2-digit numbers, these multipliers work as offsets so that all digits are preserved in order when summed
        mult = np.array([1000000, 10000, 100, 1], dtype = np.int32)

        #the fixed values effectively offset the successive chars of the tetramer by 2 positions each time;
        #practically, this is concatenation of numbers
        #Matrix mult does this for all values at once.
        return np.unique(np.dot(kmers, mult))

    def prepare_data(self):
        """Build the JSON-ready dict {"filename": ..., "protein_data": {...}}."""
        self.formatted_data = {"filename": self.basename, "protein_data":{}}
        for prot_acc_seq in self.proteins_to_format:
            prot = prot_acc_seq[0]
            acc = prot_acc_seq[1]
            kmerized_seq = self.unique_kmer_simple_key(prot_acc_seq[2])
            kmerized_seq = kmerized_seq.tolist()  #np.int32 -> plain ints for JSON

            self.formatted_data["protein_data"][acc] = {"protein_name":prot, "kmers":kmerized_seq}

    def to_json(self):
        """Serialize formatted_data to self.output_file, gzipping (and
        appending '.gz' to the path) when compression is requested."""
        if self.do_compress:
            if not self.output_file.endswith(".gz"):
                self.output_file += ".gz"

            self.formatted_data = json.dumps(self.formatted_data, indent = 4) #Convert to JSON
            self.formatted_data = self.formatted_data.encode('utf-8') #Encode to binary
            with gzip.open(self.output_file, 'wb') as fh:
                fh.write(self.formatted_data)

        else:
            with open(self.output_file, "w") as fh:
                json.dump(self.formatted_data, fh, indent = 4)
class input_file:
|
623
|
+
    def __init__(self, genome = None, protein = None, hmm = None, #data inputs
        output_protein = None, output_hmm = None, output_crystal = None, #data outputs
        output_log = None, verbose = False, compress_outputs = False):

        #Orchestrates one genome's preprocessing: genome -> proteins -> HMM -> crystal.
        self.verbose = verbose  #print progress to stdout
        self.do_compress = compress_outputs  #gzip all written outputs

        #Inputs; whichever is furthest along the pipeline sets initial_status
        self.genome_input = genome
        self.protein_input = protein
        self.hmm_input = hmm

        #Output paths (None = skip writing that artifact)
        self.protein_output = output_protein
        self.hmm_output = output_hmm
        self.crystal_output = output_crystal

        self.log_contents = []
        self.log_file = output_log

        #Pipeline progress markers: "genome" -> "protein" -> "hmm"
        self.initial_status = None
        self.current_status = "genome"

        self.basename = None  #SQL-safe name derived from the input file by get_file_basename()

        #In-memory data loaded/produced by the pipeline steps
        self.genome = None
        self.proteins = None
        self.hmm_besthits = None

        #All stages start at "now"; curtime() updates them as stages complete
        #and timediffs() turns the deltas into per-stage runtimes.
        current_datetime = datetime.datetime.now()
        self.timestamps = {"start":current_datetime,
        "protein_pred":current_datetime,
        "hmm_search":current_datetime,
        "crystal":current_datetime}

        self.runtimes = None

        self.hmm_file = None  #path to the SCG HMM database located by find_hmm()
def curtime(self, step = None):
|
661
|
+
if step is not None:
|
662
|
+
self.timestamps[step] = datetime.datetime.now()
|
663
|
+
|
664
|
+
def timediffs(self):
|
665
|
+
self.runtimes = {}
|
666
|
+
protein_pred_time = self.timestamps["protein_pred"] - self.timestamps["start"]
|
667
|
+
protein_pred_time = round(protein_pred_time.total_seconds(), 2)
|
668
|
+
|
669
|
+
hmm_search_time = self.timestamps["hmm_search"] - self.timestamps["protein_pred"]
|
670
|
+
hmm_search_time = round(hmm_search_time.total_seconds(), 2)
|
671
|
+
|
672
|
+
crystal_time = self.timestamps["crystal"] - self.timestamps["hmm_search"]
|
673
|
+
crystal_time = round(crystal_time.total_seconds(), 2)
|
674
|
+
|
675
|
+
self.runtimes["protein_pred"] = protein_pred_time
|
676
|
+
self.runtimes["hmm_search"] = hmm_search_time
|
677
|
+
self.runtimes["crystal"] = crystal_time
|
678
|
+
|
679
|
+
def get_initial_status(self):
|
680
|
+
if self.genome_input is not None:
|
681
|
+
self.initial_status = "genome"
|
682
|
+
|
683
|
+
if self.protein_input is not None:
|
684
|
+
self.initial_status = "protein"
|
685
|
+
|
686
|
+
if self.hmm_input is not None and self.protein_input is not None:
|
687
|
+
self.initial_status = "hmm"
|
688
|
+
|
689
|
+
def get_file_basename(self):
|
690
|
+
if self.initial_status == "genome":
|
691
|
+
self.basename = self.file_basename(self.genome_input)
|
692
|
+
if self.initial_status == "protein":
|
693
|
+
self.basename = self.file_basename(self.protein_input)
|
694
|
+
if self.initial_status == "hmm":
|
695
|
+
self.basename = self.file_basename(self.protein_input)
|
696
|
+
|
697
|
+
#Not an input sanitizer - simply replaces characters that would throw SQLite for a loop.
|
698
|
+
def sql_safe(self, string):
|
699
|
+
#Sanitize for SQL
|
700
|
+
#These are chars safe for sql
|
701
|
+
sql_safe = set('_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
|
702
|
+
current_chars = set(string)
|
703
|
+
#self.sql_name = self.basename
|
704
|
+
#Identify SQL-unsafe characters as those outside the permissible set and replace all with underscores.
|
705
|
+
for char in current_chars - sql_safe:
|
706
|
+
string = string.replace(char, "_")
|
707
|
+
|
708
|
+
return string
|
709
|
+
|
710
|
+
#Gonna have to go put this everywhere...
#Consistent file basename behavior
def file_basename(self, file):
	"""Return the SQL-safe stem of *file*.

	Takes the final path component, strips every trailing extension
	(so "genome.fna.gz" -> "genome"), then routes the result through
	self.sql_safe so it can be used as an SQLite identifier.
	"""
	stem = os.path.basename(file)
	#Keep peeling extensions until splitext is a no-op.
	stripped = os.path.splitext(stem)[0]
	while stripped != stem:
		stem = stripped
		stripped = os.path.splitext(stem)[0]
	return self.sql_safe(stem)
def find_hmm(self):
	# Locate the combined SCG HMM model (Complete_SCG_DB.hmm) and record its
	# absolute path in self.hmm_file. First tries the directory containing
	# this script (the MiGA-bundled layout); on any failure, falls back to
	# the path of the 'fastAAI_HMM_models' package (pip/conda install layout).
	# NOTE(review): indentation was reconstructed from a diff view — the
	# existence check (open/close) appears to live inside the except branch,
	# although its comment reads as if it were meant for the try body;
	# confirm against the original file.
	self.hmm_file = None
	try:
		#Look in the same dir as the script; old method/MiGA friendly
		script_path = os.path.dirname(__file__)
		if len(script_path) == 0:
			script_path = "."
		hmm_complete_model = os.path.abspath(os.path.normpath(script_path +"/"+ "00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm"))
		self.hmm_file = str(hmm_complete_model)
	except:
		#Try to locate the data bundled as it would be with a pip/conda install.
		script_path = os.path.dirname(sys.modules['fastAAI_HMM_models'].__file__)
		if len(script_path) == 0:
			script_path = "."
		hmm_complete_model = os.path.abspath(os.path.normpath(script_path + '/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm'))
		self.hmm_file = str(hmm_complete_model)
		#Check that the file exists or fail to the except.
		fh = open(self.hmm_file)
		fh.close()
#Load existing files functions
def read_genomes(self):
	"""Parse the supplied genome FASTA (if any) into self.genome.

	self.genome receives the parsed contents of the fasta_file reader;
	no-op when no genome input was given.
	"""
	if self.genome_input is None:
		return
	reader = fasta_file(self.genome_input)
	self.genome = reader.contents
	#Drop the reader so only the parsed contents stay referenced.
	reader = None
def read_proteins(self):
	"""Parse the supplied protein FASTA (if any) into self.proteins.

	self.proteins receives the reader's contents (seqid -> sequence);
	no-op when no protein input was given.
	"""
	if self.protein_input is None:
		return
	#Simple dict of seqid:sequence
	reader = fasta_file(self.protein_input)
	self.proteins = reader.contents
	#Drop the reader so only the parsed contents stay referenced.
	reader = None
def read_hmms(self):
	"""Parse a FastAAI HMM tabular search result into best hits per protein.

	Reads self.hmm_input (skipping '#' comment lines and truncated rows),
	keeps the highest-scoring accession per protein and one protein per
	accession, and stores the result in self.hmm_besthits as
	{protein_id: accession} with '.' in accessions replaced by '_' so the
	names are SQL-safe. No-op when no HMM input was supplied.
	"""
	if self.hmm_input is None:
		return

	prots = []
	accs = []
	scores = []
	f = agnostic_reader(self.hmm_input)
	for line in f:
		#Header/comment lines in HMM tabular output start with '#'.
		if line.startswith("#"):
			continue
		segs = line.strip().split()
		#Truncated rows cannot supply column 8 (the score); skip them.
		if len(segs) < 9:
			continue
		prots.append(segs[0])
		accs.append(segs[3])
		scores.append(segs[8])
	f.close()

	if len(prots) < 1:
		#No usable hits: record empty results and stop here. The original
		#code only set the otherwise-unused attribute self.best_hits and then
		#fell through to the numpy manipulations below on an empty array;
		#self.hmm_besthits is the attribute the rest of the class reads
		#(see hmm_search_and_BH / filter_bh_prots), so set it explicitly.
		self.best_hits = {}
		self.hmm_besthits = {}
		return

	hmm_file = np.transpose(np.array([prots, accs, scores]))

	#Sort the hmm file based on the score column in descending order.
	hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]

	#Identify the first row where each gene name appears, after sorting by score;
	#in effect, return the highest scoring assignment per gene name.
	#Sort the indices so row order still follows descending score rather than
	#alphabetical order of gene names.
	hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]

	#Filter again for unique ACCESSION names: only one gene per accession is kept.
	#Index order no longer matters; scores are already resolved.
	hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]

	#SQLite-safe accession names: dots become underscores.
	sql_friendly_names = [i.replace(".", "_") for i in hmm_file[:,1]]
	self.hmm_besthits = dict(zip(hmm_file[:,0], sql_friendly_names))
#runner functions
def predict_proteins(self):
	"""Predict genes from self.genome with Pyrodigal.

	Writes the AA FASTA to self.protein_output (gzipped when
	self.do_compress) and stores the predicted genes in self.proteins.
	"""
	manager = pyrodigal_manager(sequences = self.genome,
								aa = self.protein_output,
								compress = self.do_compress)
	manager.run()
	self.proteins = manager.predicted_genes
	#Release the manager; only the predicted genes are kept.
	manager = None
def hmm_search_and_BH(self):
	"""Search self.proteins against the SCG HMM model and keep best hits.

	Loads the HMM model from self.hmm_file, writes the search output to
	self.hmm_output, and records the per-protein best hits in
	self.hmm_besthits.
	"""
	searcher = pyhmmer_manager(self.do_compress)
	searcher.load_hmm_from_file(self.hmm_file)
	searcher.run_for_fastaai(prots = self.proteins, hmm_output = self.hmm_output)
	self.hmm_besthits = searcher.best_hits
def filter_bh_prots(self):
	"""Keep only proteins that have an HMM best hit.

	Converts self.proteins from {protein_id: sequence} into a list of
	(protein_id, accession, sequence) tuples, dropping any protein with
	no entry in self.hmm_besthits.
	"""
	kept = [
		(prot_id, self.hmm_besthits[prot_id], self.proteins[prot_id])
		for prot_id in self.proteins
		if prot_id in self.hmm_besthits
	]
	self.proteins = kept
	kept = None
def crystalize(self):
	"""Serialize the filtered best-hit proteins to the JSON 'crystal' output.

	Builds a mining_straight_down writer over self.proteins and emits
	self.crystal_output (gzipped when self.do_compress).
	"""
	writer = mining_straight_down(basename = self.basename,
								  protein_list = self.proteins,
								  crystal_output = self.crystal_output,
								  compress = self.do_compress)
	writer.prepare_data()
	writer.to_json()
def run(self):
	# Drive the whole preprocessing pipeline for one input:
	# genome -> predicted proteins -> HMM best hits -> JSON crystal.
	# current_status tracks how far along the pipeline this input is;
	# initial_status records where it entered.
	self.get_initial_status()
	self.get_file_basename()

	self.current_status = self.initial_status

	#Genome entry: parse the FASTA and predict proteins from it.
	if self.current_status == "genome":
		self.read_genomes()
		self.predict_proteins()
		self.current_status = "protein"

	#Checks initial_status (not current_status) on purpose: proteins are
	#only loaded from disk when they were supplied directly; genome-derived
	#proteins are already in memory from predict_proteins above.
	if self.initial_status == "protein":
		self.read_proteins()

	if self.verbose:
		self.curtime("protein_pred")

	#Protein stage: run the HMM search to find best hits per protein.
	if self.current_status == "protein":
		self.find_hmm()
		self.hmm_search_and_BH()
		self.current_status = "hmm"

	#HMM entry: load both the proteins and the precomputed HMM results.
	if self.initial_status == "hmm":
		self.read_proteins()
		self.read_hmms()

	if self.verbose:
		self.curtime("hmm_search")

	#Final stage: keep best-hit proteins only and write the JSON crystal.
	if self.current_status == "hmm":
		self.filter_bh_prots()
		self.crystalize()

	if self.verbose:
		self.curtime("crystal")

	#Report per-stage runtimes (timediffs/curtime are defined earlier in
	#the class; timestamps are only collected in verbose mode).
	if self.verbose:
		self.timediffs()
		print(self.basename, "complete.")
		print("\tRuntimes: ", self.runtimes)
#Add options
def options():
	"""Build the command-line parser for the preprocessor and parse args.

	Inputs: --genome, --protein, --hmm (the --hmm option only works when
	paired with --protein). Outputs: --output_protein, --output_hmm and the
	required --output_crystal. Flags: --compress, --verbose.

	Returns:
		(parser, args): the ArgumentParser and the parsed known arguments;
		unrecognized arguments are ignored.
	"""
	parser = argparse.ArgumentParser(
		formatter_class = argparse.RawTextHelpFormatter,
		description = '''''')

	#Data inputs.
	parser.add_argument('--genome', dest = 'in_gen', default = None, help = 'Input genome in nt FASTA format.')
	parser.add_argument('--protein', dest = 'in_prot', default = None, help = 'Input proteome for a genome in AA FASTA format')
	parser.add_argument('--hmm', dest = 'in_hmm', default = None, help = 'Input FastAAI HMM search result for this proteome. Must be paired with --protein to work.')

	#Data outputs; only the crystal is mandatory.
	parser.add_argument('--output_protein', dest = 'out_prot', default = None, help = 'An output containing predicted proteins for this genome in AA FASTA format. If omitted, no proteins file will be produced.')
	parser.add_argument('--output_hmm', dest = 'out_hmm', default = None, help = 'An output containing the results of an HMM search of this proteome against FastAAIs SCPs. If omitted, no HMM file will be produced.')
	parser.add_argument('--output_crystal', dest = 'out_crystal', default = None, required = True, help = 'Required. A JSON-format output representing the fully preprocessed input.')

	#Behavior flags.
	parser.add_argument('--compress', dest = 'compress', action='store_true', help = 'GZIP protein and HMM outputs')
	parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print feedback to stdout')

	args, unknown_opts = parser.parse_known_args()

	return parser, args
def main():
	"""CLI entry point: parse arguments and preprocess a single input."""
	parser, opts = options()

	#With fewer than two real arguments nothing useful can run; show usage.
	#(Execution still continues, matching the original behavior.)
	if len(sys.argv) < 3:
		parser.print_help()

	preprocessor = input_file(genome = opts.in_gen,
							  protein = opts.in_prot,
							  hmm = opts.in_hmm,
							  output_protein = opts.out_prot,
							  output_hmm = opts.out_hmm,
							  output_crystal = opts.out_crystal,
							  compress_outputs = opts.compress,
							  verbose = opts.verbose)

	preprocessor.run()
#Script entry point when run directly (not imported).
if __name__ == "__main__":
	main()