miga-base 1.2.18.1 → 1.3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (32) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/cli/action/doctor/base.rb +2 -1
  3. data/lib/miga/cli/action/init.rb +1 -1
  4. data/lib/miga/dataset/result/add.rb +3 -2
  5. data/lib/miga/lair.rb +9 -3
  6. data/lib/miga/version.rb +2 -2
  7. data/scripts/essential_genes.bash +4 -8
  8. data/utils/FastAAI/LICENSE +8 -0
  9. data/utils/FastAAI/README.md +151 -40
  10. data/utils/FastAAI/__init__.py +1 -0
  11. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz +0 -0
  12. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz +0 -0
  13. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz +0 -0
  14. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz +0 -0
  15. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz +0 -0
  16. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz +0 -0
  17. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz +0 -0
  18. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz +0 -0
  19. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz +0 -0
  20. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz +0 -0
  21. data/utils/FastAAI/fastaai/__init__.py +1 -0
  22. data/utils/FastAAI/fastaai/fastaai +4805 -0
  23. data/utils/FastAAI/fastaai/fastaai.py +4805 -0
  24. data/utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py +297 -0
  25. data/utils/FastAAI/fastaai/fastaai_miga_preproc.py +931 -0
  26. data/utils/FastAAI/metadata/Accession_names_and_IDs.txt +122 -0
  27. data/utils/distance/commands.rb +51 -23
  28. metadata +23 -6
  29. data/utils/FastAAI/FastAAI +0 -3659
  30. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Archaea_SCG.hmm +0 -0
  31. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Bacteria_SCG.hmm +0 -0
  32. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -0
@@ -0,0 +1,931 @@
1
+ import sys
2
+ import os
3
+ import pyrodigal as pd
4
+ import pyhmmer
5
+
6
+ import gzip
7
+ from collections import namedtuple
8
+ import argparse
9
+ import datetime
10
+ import json
11
+
12
+ import numpy as np
13
+
14
class fasta_file:
    """In-memory FASTA reader.

    The file (plain text or gzipped) is parsed on construction and the
    records are exposed as ``self.contents``, a dict mapping each sequence ID
    (the first whitespace-delimited token of the header, without ``>``) to
    its sequence as a single string.
    """

    def __init__(self, file):
        self.file_path = os.path.abspath(file)

        self.contents = {}

        self.read_fasta()

    def read_fasta(self):
        """Parse ``self.file_path`` into ``self.contents``; returns None.

        Records with an empty sequence are dropped, as before. Sequence text
        appearing before the first header line is ignored (it previously
        raised UnboundLocalError).
        """
        contents = {}
        cur_prot = None
        # Collect pieces and join once per record instead of repeated +=.
        pieces = []

        def flush():
            if cur_prot is not None:
                seq = "".join(pieces)
                if len(seq) > 0:
                    contents[cur_prot] = seq

        fasta = agnostic_reader(self.file_path)
        try:
            for line in fasta:
                if line.startswith(">"):
                    flush()
                    pieces = []
                    cur_prot = line.strip().split()[0][1:]
                elif cur_prot is not None:
                    pieces.append(line.strip())
        finally:
            # Always release the handle, even if decoding/parsing fails.
            fasta.close()

        #Final iter
        flush()

        self.contents = contents

        return None
54
+
55
class agnostic_reader_iterator:
    """Line iterator over an ``agnostic_reader``.

    Reads one line at a time from the reader's handle and always yields
    ``str``: lines from gzipped handles (which are opened in binary mode)
    are decoded before being returned.
    """

    def __init__(self, reader):
        self.handle_ = reader.handle
        self.is_gz_ = reader.is_gz

    def __iter__(self):
        # Required by the iterator protocol so the object can itself be
        # passed to for-loops, list(), etc.
        return self

    def __next__(self):
        line = self.handle_.readline()
        if self.is_gz_:
            line = line.decode()

        #readline() returns an empty string/bytes only at EOF
        if line:
            return line
        raise StopIteration
71
+
72
+ #File reader that doesn't care if you give it a gzipped file or not.
73
class agnostic_reader:
    """File reader that accepts either a plain-text or a gzipped file.

    The gzip magic number is sniffed from the first two bytes of the file.
    Gzipped files are opened in binary mode through ``gzip.open``; anything
    else is opened as ordinary text. ``is_gz`` records which case applies.
    """

    GZIP_MAGIC = b'\x1f\x8b'

    def __init__(self, file):
        self.path = file

        # Peek at the header bytes to decide how to open the file.
        with open(file, 'rb') as probe:
            header = probe.read(2)

        self.is_gz = (header == self.GZIP_MAGIC)

        opener = gzip.open if self.is_gz else open
        self.handle = opener(self.path)

    def __iter__(self):
        return agnostic_reader_iterator(self)

    def close(self):
        """Close the underlying file handle."""
        self.handle.close()
93
+
94
class pyrodigal_manager:
    """Predict proteins from nucleotide sequences with pyrodigal.

    Workflow (see ``run``): encode the input sequences and train one gene
    finder per translation table, predict genes with every trained finder,
    keep the table with the best coding density, optionally write the
    translations to a FASTA file, and finally replace ``predicted_genes``
    with a ``{protein_name: translation}`` dict.

    Parameters
    ----------
    sequences : dict or None
        Sequence ID -> nucleotide string. Modified/consumed in place.
    trans_tables : iterable of int
        Translation tables to compare. Only used when enough sequence is
        available for training; otherwise meta mode is used.
    aa : str or None
        Output path for the protein FASTA; nothing is written when None.
    compress : bool
        Gzip the protein FASTA (``.gz`` is appended to ``aa`` if missing).
    """

    def __init__(self, sequences = None,
                 trans_tables = (11, 4), #translation tables to use - only relevant when training is possible (non-meta mode)
                 aa = None, compress = False):

        self.meta = False
        self.gene_predictors = {}

        self.sequences = sequences
        self.seqlens = {}

        self.training_seq = []
        self.running_sum = 0 #training sequence current length
        self.training_needs_join = False #needed if more than 1 seq added to training sequence
        self.training_done = False

        # Private copy: the default used to be a shared mutable list that
        # train_manager() emptied, leaving later instances with no tables.
        self.trans_tables = list(trans_tables)
        self.training_data = {}

        self.aa = aa

        self.predicted_genes = {t: {} for t in self.trans_tables}
        self.coding_densities = {t: {} for t in self.trans_tables}

        self.log = []
        self.do_compress = compress

    def sequence_handler(self):
        """Encode sequences to bytes, record their lengths and train."""
        self.training_seq = []
        self.training_needs_join = False
        if self.sequences is None:
            self.sequences = {}

        if not self.meta:
            for seqid in self.sequences:
                current_seqlen = len(self.sequences[seqid])
                self.seqlens[seqid] = current_seqlen #get length
                self.sequences[seqid] = self.sequences[seqid].encode() #to binary

                # Once training has happened the training buffer is released,
                # so only keep accumulating while training is still pending.
                if not self.training_done:
                    self.training_seq.append(self.sequences[seqid]) #add to training set
                    self.running_sum += current_seqlen #running total of 32 million or less

                    if self.training_needs_join:
                        self.running_sum += 12 #length of the TTAATTAATTAA spacer

                    self.training_needs_join = True

                    if self.running_sum > 32000000:
                        self.train_manager()

            if not self.training_done:
                self.train_manager()

    def convert_seq(self, this_seqid):
        """Join and encode one stored sequence in place; return its length."""
        self.sequences[this_seqid] = ''.join(self.sequences[this_seqid])
        seqlen = len(self.sequences[this_seqid])
        self.sequences[this_seqid] = self.sequences[this_seqid].encode()
        return seqlen

    def train_manager(self):
        """Train one gene finder per translation table (runs at most once).

        With fewer than 20000 characters of sequence, training is skipped and
        a single meta-mode finder is registered under table key -1.
        """
        if self.training_done: #Make sure I don't happen twice
            return
        self.training_done = True

        if self.running_sum < 20000:
            self.log.append("Can't train on 20 thousand or fewer characters. Switching to meta mode.")
            self.gene_predictor = pd.OrfFinder(meta=True)
            self.is_meta = True
            self.meta = True
            # Register the meta finder where predict() looks for finders;
            # previously no finder was registered and nothing was predicted.
            self.gene_predictors = {-1: self.gene_predictor}
            self.predicted_genes = {-1: {}}
            self.coding_densities = {-1: {}}
            self.training_seq = None
            return

        #Collect sequences into a prodigal-formatted string. Only the already
        #encoded sequences are used; self.sequences may still hold str values
        #when training is triggered part-way through sequence_handler().
        training_seq = b'TTAATTAATTAA'.join(self.training_seq)

        #Truncate to 32 million bp if needed
        if self.running_sum > 32000000:
            self.log.append("Warning: Sequence is long (max 32000000 for training).")
            self.log.append("Training on the first 32000000 bases.")
            training_seq = training_seq[0:32000000]

        #Count (uppercase) G + C and divide by the total.
        gc_count = training_seq.count(b"G") + training_seq.count(b"C")
        gc = round((gc_count / len(training_seq)) * 100, 2)

        self.log.append(str(len(training_seq)) + " bp training seq created, " + str(gc) + " pct GC")

        #Intialize orffinder
        self.gene_predictor = pd.OrfFinder(meta=False)

        #Create training data for each translation table. The table list is
        #iterated, not consumed, so it stays available afterwards.
        for next_table in self.trans_tables:
            finder = pd.OrfFinder(meta = False)
            finder.train(training_seq, translation_table = next_table)
            self.gene_predictors[next_table] = finder

            #Making another OrfFinder instance with this will allow quick swapping while comparing tables.
            self.training_data[next_table] = finder.training_info

        #Clean up afterwards
        self.training_seq = None

    def predict(self):
        """Run every registered gene finder on every sequence, then pick a table."""
        #Eliminate sequence entries to prevent memory bloat.
        remaining_sequence_ids = tuple(self.sequences.keys())
        for seqid in remaining_sequence_ids:
            sequence = self.sequences.pop(seqid)

            for tt in self.gene_predictors:
                #Keep the pyrodigal gene objects so gene functions can be called later.
                self.predicted_genes[tt][seqid] = self.gene_predictors[tt].find_genes(sequence)

        self.compare_predicted_genes()

    def compare_predicted_genes(self):
        """Reduce ``predicted_genes`` from {table: {seqid: genes}} to {seqid: genes}.

        With several tables, the first table wins unless a later one has a
        coding density more than 10% higher than the current winner. Intended
        to be called exactly once, from ``predict``.
        """
        if not self.predicted_genes:
            return

        if len(self.predicted_genes) == 1:
            # Single table (or meta mode): nothing to compare, but the result
            # must still be unwrapped for write_aa_file/convert_to_internal_rep.
            self.predicted_genes = next(iter(self.predicted_genes.values()))
            return

        for tt, genes_by_seq in self.predicted_genes.items():
            total_seqlen = 0
            total_coding_bases = 0
            for seqid, genes in genes_by_seq.items():
                total_seqlen += self.seqlens[seqid]
                for gene in genes:
                    total_coding_bases += (gene.end - gene.begin + 1) #Sequence is 1 longer because it's inclusive

            # No sequence at all means zero density rather than a division error.
            self.coding_densities[tt] = total_coding_bases / total_seqlen if total_seqlen else 0.0

        winning_table = None
        winning_density = 0
        for tt in self.predicted_genes:
            density = self.coding_densities[tt]
            # The first table is the fallback winner even if no genes were found.
            if winning_table is None or density > 1.1 * winning_density:
                winning_density = density
                winning_table = tt

        self.log.append("Winning translation table was: " + str(winning_table) + " with coding density " + str(round(winning_density, 4)))
        for t in self.predicted_genes:
            if t != winning_table:
                self.log.append("Losing translation table: " + str(t) + " had coding density " + str(round(self.coding_densities[t], 4)))

        self.predicted_genes = self.predicted_genes[winning_table] #keep the winning set.

    def format_seq(self, seq, num_chars = 60):
        """Wrap ``seq`` into newline-separated lines of ``num_chars`` characters."""
        formatted = '\n'.join(seq[i:i + num_chars] for i in range(0, len(seq), num_chars))
        return formatted.strip()

    def write_aa_file(self):
        """Write predicted proteins as FASTA to ``self.aa`` (no-op when None)."""
        if self.aa is None:
            return

        content = []
        for seqnum, seqid in enumerate(self.predicted_genes, start = 1):
            for gene_num, g in enumerate(self.predicted_genes[seqid], start = 1):
                protein_name = ">" + seqid + "_" + str(gene_num)
                # NOTE(review): _gene_data is a private pyrodigal method; confirm
                # it exists in the pinned pyrodigal version.
                annotation = g._gene_data(seqnum)
                header = " # ".join([protein_name, str(g.begin), str(g.end), str(g.strand), annotation])

                content.append(header)
                content.append(self.format_seq(g.translate()))

        content = "\n".join(content)
        content += "\n" #final newline

        if self.do_compress:
            if not self.aa.endswith(".gz"):
                self.aa += ".gz"

            with gzip.open(self.aa, "wb") as output_writer:
                output_writer.write(content.encode())
        else:
            with open(self.aa, "w") as output_writer:
                output_writer.write(content)

    def convert_to_internal_rep(self): #go from pyrodigal objects to protein name:translation dict
        """Replace ``predicted_genes`` with a {seqid_genenumber: translation} dict."""
        conversion = {}
        for seqid in self.predicted_genes:
            for gene_num, g in enumerate(self.predicted_genes[seqid], start = 1):
                conversion[seqid + "_" + str(gene_num)] = g.translate()

        self.predicted_genes = conversion

    def run(self):
        """Execute the full train / predict / write / convert pipeline."""
        self.sequence_handler()
        self.predict()
        self.write_aa_file()
        self.convert_to_internal_rep()
318
+
319
class pyhmmer_manager:
    """Search proteins against the single-copy-gene HMMs with PyHMMER.

    Typical use: ``load_hmm_from_file`` once, then ``run_for_fastaai`` with a
    ``{protein_name: sequence}`` dict. Results are left in ``best_hits``
    (``{protein_name: accession}`` with '.' replaced by '_'), or ``None``
    if the search failed.
    """

    def __init__(self, do_compress):
        self.hmm_model = []

        self.proteins_to_search = []
        self.protein_descriptions = None

        self.hmm_result_proteins = []
        self.hmm_result_accessions = []
        self.hmm_result_scores = []

        self.printable_lines = []

        self.bacterial_SCPs = None
        self.archaeal_SCPs = None
        self.assign_hmm_sets()
        self.domain_counts = {"Bacteria" : 0, "Archaea": 0}
        #Holds the SCP set sizes until assign_domain() replaces it with the winning domain name.
        self.voted_domain = {"Bacteria" : len(self.bacterial_SCPs), "Archaea" : len(self.archaeal_SCPs)}

        self.bacterial_fraction = None
        self.archaeal_fraction = None

        self.best_hits = None

        self.do_compress = do_compress

    #Load HMM
    def load_hmm_from_file(self, hmm_path):
        """Append every HMM found in ``hmm_path`` to ``self.hmm_model``."""
        with pyhmmer.plan7.HMMFile(hmm_path) as hmm_set:
            for hmm in hmm_set:
                self.hmm_model.append(hmm)

    #Set archaeal and bacterial HMM sets.
    def assign_hmm_sets(self):
        """Define the accession -> name tables of bacterial and archaeal SCPs."""
        self.bacterial_SCPs = {'PF00709_21': 'Adenylsucc_synt', 'PF00406_22': 'ADK', 'PF01808_18': 'AICARFT_IMPCHas', 'PF00231_19': 'ATP-synt',
                            'PF00119_20': 'ATP-synt_A', 'PF01264_21': 'Chorismate_synt', 'PF00889_19': 'EF_TS', 'PF01176_19': 'eIF-1a',
                            'PF02601_15': 'Exonuc_VII_L', 'PF01025_19': 'GrpE', 'PF01725_16': 'Ham1p_like', 'PF01715_17': 'IPPT',
                            'PF00213_18': 'OSCP', 'PF01195_19': 'Pept_tRNA_hydro', 'PF00162_19': 'PGK', 'PF02033_18': 'RBFA', 'PF02565_15': 'RecO_C',
                            'PF00825_18': 'Ribonuclease_P', 'PF00687_21': 'Ribosomal_L1', 'PF00572_18': 'Ribosomal_L13',
                            'PF00238_19': 'Ribosomal_L14', 'PF00252_18': 'Ribosomal_L16', 'PF01196_19': 'Ribosomal_L17',
                            'PF00861_22': 'Ribosomal_L18p', 'PF01245_20': 'Ribosomal_L19', 'PF00453_18': 'Ribosomal_L20',
                            'PF00829_21': 'Ribosomal_L21p', 'PF00237_19': 'Ribosomal_L22', 'PF00276_20': 'Ribosomal_L23',
                            'PF17136_4': 'ribosomal_L24', 'PF00189_20': 'Ribosomal_S3_C', 'PF00281_19': 'Ribosomal_L5', 'PF00181_23': 'Ribosomal_L2',
                            'PF01016_19': 'Ribosomal_L27', 'PF00828_19': 'Ribosomal_L27A', 'PF00830_19': 'Ribosomal_L28',
                            'PF00831_23': 'Ribosomal_L29', 'PF00297_22': 'Ribosomal_L3', 'PF01783_23': 'Ribosomal_L32p',
                            'PF01632_19': 'Ribosomal_L35p', 'PF00573_22': 'Ribosomal_L4', 'PF00347_23': 'Ribosomal_L6',
                            'PF03948_14': 'Ribosomal_L9_C', 'PF00338_22': 'Ribosomal_S10', 'PF00411_19': 'Ribosomal_S11',
                            'PF00416_22': 'Ribosomal_S13', 'PF00312_22': 'Ribosomal_S15', 'PF00886_19': 'Ribosomal_S16',
                            'PF00366_20': 'Ribosomal_S17', 'PF00203_21': 'Ribosomal_S19', 'PF00318_20': 'Ribosomal_S2',
                            'PF01649_18': 'Ribosomal_S20p', 'PF01250_17': 'Ribosomal_S6', 'PF00177_21': 'Ribosomal_S7',
                            'PF00410_19': 'Ribosomal_S8', 'PF00380_19': 'Ribosomal_S9', 'PF00164_25': 'Ribosom_S12_S23',
                            'PF01193_24': 'RNA_pol_L', 'PF01192_22': 'RNA_pol_Rpb6', 'PF01765_19': 'RRF', 'PF02410_15': 'RsfS',
                            'PF03652_15': 'RuvX', 'PF00584_20': 'SecE', 'PF03840_14': 'SecG', 'PF00344_20': 'SecY', 'PF01668_18': 'SmpB',
                            'PF00750_19': 'tRNA-synt_1d', 'PF01746_21': 'tRNA_m1G_MT', 'PF02367_17': 'TsaE', 'PF02130_17': 'UPF0054',
                            'PF02699_15': 'YajC'}

        self.archaeal_SCPs = {'PF00709_21': 'Adenylsucc_synt', 'PF05221_17': 'AdoHcyase', 'PF01951_16': 'Archease', 'PF01813_17': 'ATP-synt_D',
                            'PF01990_17': 'ATP-synt_F', 'PF01864_17': 'CarS-like', 'PF01982_16': 'CTP-dep_RFKase', 'PF01866_17': 'Diphthamide_syn',
                            'PF04104_14': 'DNA_primase_lrg', 'PF01984_20': 'dsDNA_bind', 'PF04010_13': 'DUF357', 'PF04019_12': 'DUF359',
                            'PF04919_12': 'DUF655', 'PF01912_18': 'eIF-6', 'PF05833_11': 'FbpA', 'PF01725_16': 'Ham1p_like',
                            'PF00368_18': 'HMG-CoA_red', 'PF00334_19': 'NDK', 'PF02006_16': 'PPS_PS', 'PF02996_17': 'Prefoldin',
                            'PF01981_16': 'PTH2', 'PF01948_18': 'PyrI', 'PF00687_21': 'Ribosomal_L1', 'PF00572_18': 'Ribosomal_L13',
                            'PF00238_19': 'Ribosomal_L14', 'PF00827_17': 'Ribosomal_L15e', 'PF00252_18': 'Ribosomal_L16',
                            'PF01157_18': 'Ribosomal_L21e', 'PF00237_19': 'Ribosomal_L22', 'PF00276_20': 'Ribosomal_L23',
                            'PF16906_5': 'Ribosomal_L26', 'PF00831_23': 'Ribosomal_L29', 'PF00297_22': 'Ribosomal_L3',
                            'PF01198_19': 'Ribosomal_L31e', 'PF01655_18': 'Ribosomal_L32e', 'PF01780_19': 'Ribosomal_L37ae',
                            'PF00832_20': 'Ribosomal_L39', 'PF00573_22': 'Ribosomal_L4', 'PF00935_19': 'Ribosomal_L44', 'PF17144_4': 'Ribosomal_L5e',
                            'PF00347_23': 'Ribosomal_L6', 'PF00411_19': 'Ribosomal_S11', 'PF00416_22': 'Ribosomal_S13',
                            'PF00312_22': 'Ribosomal_S15', 'PF00366_20': 'Ribosomal_S17', 'PF00833_18': 'Ribosomal_S17e',
                            'PF00203_21': 'Ribosomal_S19', 'PF01090_19': 'Ribosomal_S19e', 'PF00318_20': 'Ribosomal_S2',
                            'PF01282_19': 'Ribosomal_S24e', 'PF01667_17': 'Ribosomal_S27e', 'PF01200_18': 'Ribosomal_S28e',
                            'PF01015_18': 'Ribosomal_S3Ae', 'PF00177_21': 'Ribosomal_S7', 'PF00410_19': 'Ribosomal_S8',
                            'PF01201_22': 'Ribosomal_S8e', 'PF00380_19': 'Ribosomal_S9', 'PF00164_25': 'Ribosom_S12_S23',
                            'PF06026_14': 'Rib_5-P_isom_A', 'PF01351_18': 'RNase_HII', 'PF13656_6': 'RNA_pol_L_2',
                            'PF01194_17': 'RNA_pol_N', 'PF03874_16': 'RNA_pol_Rpb4', 'PF01192_22': 'RNA_pol_Rpb6',
                            'PF01139_17': 'RtcB', 'PF00344_20': 'SecY', 'PF06093_13': 'Spt4', 'PF00121_18': 'TIM', 'PF01994_16': 'Trm56',
                            'PF00749_21': 'tRNA-synt_1c', 'PF00750_19': 'tRNA-synt_1d', 'PF13393_6': 'tRNA-synt_His',
                            'PF01142_18': 'TruD', 'PF01992_16': 'vATP-synt_AC39', 'PF01991_18': 'vATP-synt_E', 'PF01496_19': 'V_ATPase_I'}

    #Convert passed sequences.
    def convert_protein_seqs_in_mem(self, contents):
        """Digitize ``{name: sequence}`` into ``self.proteins_to_search``."""
        #Clean up.
        self.proteins_to_search = []
        amino = pyhmmer.easel.Alphabet.amino()

        for protein in contents:
            #Skip a protein if it's 100k AA or longer.
            if len(contents[protein]) >= 100000:
                continue
            #Pyhmmer digitization of sequences for searching.
            easel_seq = pyhmmer.easel.TextSequence(name = protein.encode(), sequence = contents[protein])
            self.proteins_to_search.append(easel_seq.digitize(amino))

    def execute_search(self):
        """Run hmmsearch and record per-hit table lines, names, accessions, scores."""
        top_hits = list(pyhmmer.hmmsearch(self.hmm_model, self.proteins_to_search, cpus=1, bit_cutoffs="trusted"))

        self.printable_lines = []

        self.hmm_result_proteins = []
        self.hmm_result_accessions = []
        self.hmm_result_scores = []

        descriptions = self.protein_descriptions if self.protein_descriptions is not None else {}

        for model in top_hits:
            for hit in model:
                target_name = hit.name.decode()
                target_acc = "-" if hit.accession is None else hit.accession.decode()

                alignment = hit.best_domain.alignment
                query_name = alignment.hmm_name.decode()
                # An HMM without an ACC line has no accession; use the same
                # "-" placeholder as for targets instead of crashing on None.
                query_acc = "-" if alignment.hmm_accession is None else alignment.hmm_accession.decode()

                full_seq_evalue = "%.2g" % hit.evalue
                full_seq_score = round(hit.score, 1)
                full_seq_bias = round(hit.bias, 1)

                best_dom_evalue = "%.2g" % alignment.domain.i_evalue
                best_dom_score = round(alignment.domain.score, 1)
                best_dom_bias = round(alignment.domain.bias, 1)

                #Columns the original author could not recover from PyHMMER; written as 0.
                exp = 0
                reg = 0
                clu = 0
                ov = 0
                env = 0
                dom = len(hit.domains)
                rep = 0
                inc = 0

                description = descriptions.get(target_name, "")

                writeout = [target_name, target_acc, query_name, query_acc, full_seq_evalue,
                            full_seq_score, full_seq_bias, best_dom_evalue, best_dom_score, best_dom_bias,
                            exp, reg, clu, ov, env, dom, rep, inc, description]

                #Format and join.
                self.printable_lines.append('\t'.join(str(i) for i in writeout))

                self.hmm_result_proteins.append(target_name)
                self.hmm_result_accessions.append(query_acc)
                self.hmm_result_scores.append(best_dom_score)

    def filter_to_best_hits(self):
        """Reduce the raw hits to one protein per accession in ``self.best_hits``."""
        hmm_file = np.transpose(np.array([self.hmm_result_proteins, self.hmm_result_accessions, self.hmm_result_scores]))

        #Sort the hits based on the score column in descending order.
        hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]

        #Identify the first row where each gene name appears, after sorting by score;
        #in effect, return the highest scoring assignment per gene name
        #Sort the indices of the result to match the score-sorted table instead of alphabetical order of gene names
        hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]

        #Filter again for the unique ACCESSION names: only one gene per accession is kept.
        #Don't sort the indices, we don't care about the scores anymore.
        hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]

        sql_friendly_names = [i.replace(".", "_") for i in hmm_file[:,1]]

        self.best_hits = dict(zip(hmm_file[:,0], sql_friendly_names))

    #Count per-dom occurs.
    def assign_domain(self):
        """Vote Bacteria vs Archaea from ``best_hits`` and drop off-domain hits.

        Sets ``bacterial_fraction``, ``archaeal_fraction`` and replaces
        ``voted_domain`` with the winning domain name (ties go to Bacteria).
        """
        if self.best_hits is None:
            return

        #Start from a clean count so repeated calls give the same answer.
        self.domain_counts = {"Bacteria" : 0, "Archaea": 0}
        for prot in self.best_hits.values():
            if prot in self.bacterial_SCPs:
                self.domain_counts["Bacteria"] += 1
            if prot in self.archaeal_SCPs:
                self.domain_counts["Archaea"] += 1

        self.bacterial_fraction = self.domain_counts["Bacteria"] / len(self.bacterial_SCPs)
        self.archaeal_fraction = self.domain_counts["Archaea"] / len(self.archaeal_SCPs)
        #Misspelled attribute kept as an alias for any existing reader.
        self.aechaeal_fraction = self.archaeal_fraction

        if self.bacterial_fraction >= self.archaeal_fraction:
            self.voted_domain = "Bacteria"
            allowed = self.bacterial_SCPs
        else:
            self.voted_domain = "Archaea"
            allowed = self.archaeal_SCPs

        for key in list(self.best_hits.keys()):
            if self.best_hits[key] not in allowed:
                self.best_hits.pop(key)

    def to_hmm_file(self, output):
        """Write the tab-separated hit table to ``output`` (no-op when None)."""
        if output is None:
            return

        content = '\n'.join(self.printable_lines) + '\n'

        if self.do_compress:
            if not output.endswith(".gz"):
                output += ".gz"

            with gzip.open(output, "wb") as fh:
                fh.write(content.encode())
        else:
            with open(output, "w") as fh:
                fh.write(content)

    #If we're doing this step at all, we've either loaded the seqs into mem by reading the prot file
    #or have them in mem thanks to pyrodigal.
    def run_for_fastaai(self, prots, hmm_output):
        """Search ``prots`` and populate ``best_hits``; optionally write the table.

        A failed search sets ``best_hits`` to None. A failure to write
        ``hmm_output`` is reported but does not discard the search results.
        """
        try:
            self.convert_protein_seqs_in_mem(prots)
            self.execute_search()
            self.filter_to_best_hits()
        except Exception:
            print(hmm_output, "failed to run through HMMER!")
            self.best_hits = None
            return

        try:
            self.to_hmm_file(hmm_output)
        except Exception:
            print(hmm_output, "cannot be created. HMM search failed. This file will be skipped.")
564
+
565
class mining_straight_down:
    """Turn best-hit proteins into a JSON "crystal" of tetramer indices.

    Parameters
    ----------
    basename : str or None
        Name stored under the "filename" key of the output.
    protein_list : iterable of (protein_name, accession, sequence) or None
    crystal_output : str or None
        Path of the JSON file written by ``to_json``.
    compress : bool
        Gzip the JSON (``.gz`` is appended to the path if missing).
    """

    def __init__(self, basename = None, protein_list = None, crystal_output = None, compress = False):
        self.basename = basename
        self.proteins_to_format = protein_list
        self.output_file = crystal_output
        self.formatted_data = None
        self.do_compress = compress

    #Translate tetramers to unique int32 indices.
    def unique_kmer_simple_key(self, seq):
        """Return the sorted unique tetramer codes of ``seq`` as an int32 array.

        Each tetramer is encoded by concatenating the ASCII codes of its four
        characters as decimal digit pairs. Sequences shorter than four
        characters contain no tetramer and give an empty array.
        """
        #num tetramers = len(seq) - 4 + 1, just make it -3.
        n_kmers = len(seq) - 3
        if n_kmers < 1:
            # A negative count would make the reshape below raise.
            return np.array([], dtype = np.int32)

        #Converts the characters in a sequence into their ascii int value
        as_ints = np.array([ord(i) for i in seq], dtype = np.int32)

        #create seq like 0,1,2,3; 1,2,3,4; 2,3,4,5... for each tetramer that needs a value
        kmers = np.arange(4*n_kmers)
        kmers = kmers % 4 + kmers // 4

        #Select the characters (as ints) corresponding to each tetramer all at once and reshape into rows of 4,
        #each row corresp. to a successive tetramer
        kmers = as_ints[kmers].reshape((n_kmers, 4))

        #Given four 2-digit numbers, these multipliers work as offsets so that all digits are preserved in order when summed
        #NOTE(review): this relies on every character code being two digits (e.g. uppercase ASCII letters) - confirm inputs.
        mult = np.array([1000000, 10000, 100, 1], dtype = np.int32)

        #Matrix mult concatenates the four codes of every tetramer at once.
        return np.unique(np.dot(kmers, mult))

    def prepare_data(self):
        """Build ``formatted_data`` keyed by accession from ``proteins_to_format``."""
        self.formatted_data = {"filename": self.basename, "protein_data":{}}
        for prot, acc, sequence in self.proteins_to_format:
            kmerized_seq = self.unique_kmer_simple_key(sequence).tolist()

            self.formatted_data["protein_data"][acc] = {"protein_name":prot, "kmers":kmerized_seq}

    def to_json(self):
        """Write ``formatted_data`` as indented JSON to ``output_file``.

        In compressed mode ``formatted_data`` is replaced by the encoded JSON
        bytes that were written.
        """
        if self.do_compress:
            if not self.output_file.endswith(".gz"):
                self.output_file += ".gz"

            self.formatted_data = json.dumps(self.formatted_data, indent = 4) #Convert to JSON
            self.formatted_data = self.formatted_data.encode('utf-8') #Encode to binary
            with gzip.open(self.output_file, 'wb') as fh:
                fh.write(self.formatted_data)

        else:
            with open(self.output_file, "w") as fh:
                json.dump(self.formatted_data, fh, indent = 4)
621
+
622
+ class input_file:
623
+ def __init__(self, genome = None, protein = None, hmm = None, #data inputs
624
+ output_protein = None, output_hmm = None, output_crystal = None, #data outputs
625
+ output_log = None, verbose = False, compress_outputs = False):
626
+
627
+ self.verbose = verbose
628
+ self.do_compress = compress_outputs
629
+
630
+ self.genome_input = genome
631
+ self.protein_input = protein
632
+ self.hmm_input = hmm
633
+
634
+ self.protein_output = output_protein
635
+ self.hmm_output = output_hmm
636
+ self.crystal_output = output_crystal
637
+
638
+ self.log_contents = []
639
+ self.log_file = output_log
640
+
641
+ self.initial_status = None
642
+ self.current_status = "genome"
643
+
644
+ self.basename = None
645
+
646
+ self.genome = None
647
+ self.proteins = None
648
+ self.hmm_besthits = None
649
+
650
+ current_datetime = datetime.datetime.now()
651
+ self.timestamps = {"start":current_datetime,
652
+ "protein_pred":current_datetime,
653
+ "hmm_search":current_datetime,
654
+ "crystal":current_datetime}
655
+
656
+ self.runtimes = None
657
+
658
+ self.hmm_file = None
659
+
660
def curtime(self, step = None):
    """Stamp ``step`` with the current time; does nothing when step is None."""
    if step is None:
        return
    self.timestamps[step] = datetime.datetime.now()
663
+
664
def timediffs(self):
    """Fill ``self.runtimes`` with per-stage durations in seconds (2 decimals)."""
    stamps = self.timestamps
    # Each stage is timed from the end of the stage before it.
    stage_order = [("protein_pred", "start"),
                   ("hmm_search", "protein_pred"),
                   ("crystal", "hmm_search")]

    self.runtimes = {}
    for stage, previous in stage_order:
        elapsed = stamps[stage] - stamps[previous]
        self.runtimes[stage] = round(elapsed.total_seconds(), 2)
678
+
679
def get_initial_status(self):
    """Set ``initial_status`` to the most advanced stage the inputs allow.

    Proteins take precedence over a genome, and an HMM table only counts
    when proteins are supplied with it. Left untouched if nothing applies.
    """
    has_protein = self.protein_input is not None

    if has_protein and self.hmm_input is not None:
        self.initial_status = "hmm"
    elif has_protein:
        self.initial_status = "protein"
    elif self.genome_input is not None:
        self.initial_status = "genome"
688
+
689
def get_file_basename(self):
    """Derive ``basename`` from the input file matching ``initial_status``."""
    status = self.initial_status

    if status == "genome":
        source = self.genome_input
    elif status in ("protein", "hmm"):
        # An HMM start still names the run after its protein file.
        source = self.protein_input
    else:
        return

    self.basename = self.file_basename(source)
696
+
697
+ #Not an input sanitizer - simply replaces characters that would throw SQLite for a loop.
698
def sql_safe(self, string):
    """Return ``string`` with every character outside [_A-Za-z0-9] replaced by '_'.

    Not an input sanitizer - it only removes characters that would trip up
    SQLite identifiers.
    """
    allowed = set('_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
    return ''.join(char if char in allowed else "_" for char in string)
709
+
710
+ #Gonna have to go put this everywhere...
711
+ #Consistent file basename behavior
712
def file_basename(self, file):
    """Return the SQL-safe file name of ``file`` with all extensions removed."""
    #Drop the directory part, then peel extensions off one at a time.
    name = os.path.basename(file)
    stem, extension = os.path.splitext(name)
    while extension:
        name = stem
        stem, extension = os.path.splitext(name)

    return self.sql_safe(name)
722
+
723
def find_hmm(self):
    """Locate Complete_SCG_DB.hmm and store its absolute path in ``self.hmm_file``.

    Lookup order:
      1. next to this script (old method / MiGA friendly);
      2. inside the ``fastAAI_HMM_models`` module, if it has been imported
         (pip/conda style install).
    If neither file exists, the script-relative path is kept so that the
    eventual "file not found" error names the expected location.
    """
    relative_model = "00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm"

    #Look in the same dir as the script
    script_path = os.path.dirname(__file__)
    if len(script_path) == 0:
        script_path = "."
    local_model = str(os.path.abspath(os.path.normpath(script_path + "/" + relative_model)))

    self.hmm_file = local_model
    if os.path.isfile(local_model):
        return

    #Try to locate the data bundled as it would be with a pip/conda install.
    #Computing the local path cannot fail, so the fallback has to be driven
    #by an explicit existence check rather than by an exception.
    models_module = sys.modules.get('fastAAI_HMM_models')
    module_file = getattr(models_module, "__file__", None)
    if module_file is None:
        return

    package_path = os.path.dirname(module_file)
    if len(package_path) == 0:
        package_path = "."
    packaged_model = str(os.path.abspath(os.path.normpath(package_path + '/' + relative_model)))
    if os.path.isfile(packaged_model):
        self.hmm_file = packaged_model
742
+
743
+ #Load existing files functions
744
def read_genomes(self):
    """Load the genome FASTA into ``self.genome`` ({seqid: sequence}), if given."""
    if self.genome_input is None:
        return
    self.genome = fasta_file(self.genome_input).contents
749
+
750
def read_proteins(self):
    """Load the protein FASTA into ``self.proteins`` ({seqid: sequence}), if given."""
    if self.protein_input is None:
        return
    #Simple dict of seqid:sequence
    self.proteins = fasta_file(self.protein_input).contents
756
+
757
def read_hmms(self):
    """Load an existing HMM search table into ``self.hmm_besthits``.

    Reads columns 0 (protein), 3 (HMM accession) and 8 (best-domain score)
    of every non-comment line that has at least 9 whitespace-separated
    fields, then keeps the best-scoring accession per protein and one
    protein per accession. Does nothing when no HMM input was given.
    """
    if self.hmm_input is None:
        return

    prots = []
    accs = []
    scores = []
    f = agnostic_reader(self.hmm_input)
    try:
        for line in f:
            if line.startswith("#"):
                continue

            segs = line.strip().split()
            if len(segs) < 9:
                continue

            prots.append(segs[0])
            accs.append(segs[3])
            scores.append(segs[8])
    finally:
        #Release the handle even if a line fails to decode.
        f.close()

    if len(prots) < 1:
        #No usable rows: record an empty result on the attribute the rest
        #of the class reads. best_hits is also set because the previous
        #code assigned (only) that name here.
        self.hmm_besthits = {}
        self.best_hits = {}
        return

    hmm_file = np.transpose(np.array([prots, accs, scores]))

    #Sort the hmm file based on the score column in descending order.
    hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]

    #Identify the first row where each gene name appears, after sorting by score;
    #in effect, return the highest scoring assignment per gene name
    #Sort the indices of the result to match the score-sorted table instead of alphabetical order of gene names
    hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]

    #Filter the file again for the unique ACCESSION names: only one gene per accession is kept.
    #Don't sort the indices, we don't care about the scores anymore.
    hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]

    sql_friendly_names = [i.replace(".", "_") for i in hmm_file[:,1]]
    self.hmm_besthits = dict(zip(hmm_file[:,0], sql_friendly_names))
798
+
799
+ #runner functions
800
def predict_proteins(self):
    """Run protein prediction on ``self.genome``.

    Builds a ``pyrodigal_manager`` from the loaded genome, the protein
    output path and the compression flag, runs it, and stores its
    ``predicted_genes`` in ``self.proteins``.
    """
    predictor = pyrodigal_manager(
        sequences=self.genome,
        aa=self.protein_output,
        compress=self.do_compress,
    )
    predictor.run()
    self.proteins = predictor.predicted_genes
808
+
809
def hmm_search_and_BH(self):
    """Search ``self.proteins`` against the HMM file and keep the best hits.

    Creates a ``pyhmmer_manager`` with the compression flag, loads
    ``self.hmm_file`` into it, runs the FastAAI search (passing
    ``self.hmm_output`` as the output target) and copies the manager's
    ``best_hits`` into ``self.hmm_besthits``.
    """
    searcher = pyhmmer_manager(self.do_compress)
    searcher.load_hmm_from_file(self.hmm_file)
    searcher.run_for_fastaai(prots=self.proteins, hmm_output=self.hmm_output)
    self.hmm_besthits = searcher.best_hits
816
+
817
def filter_bh_prots(self):
    """Reduce ``self.proteins`` to the proteins that have a best hit.

    Replaces ``self.proteins`` with a list of
    ``(protein_name, accession, sequence)`` tuples, one per protein whose
    name is a key of ``self.hmm_besthits``, in the iteration order of the
    original ``self.proteins``.
    """
    sequences = self.proteins
    hits = self.hmm_besthits

    self.proteins = [
        (name, hits[name], sequences[name])
        for name in sequences
        if name in hits
    ]
829
+
830
def crystalize(self):
    """Write the crystal output for the filtered proteins.

    Builds a ``mining_straight_down`` object from the basename, the
    current ``self.proteins`` list, the crystal output path and the
    compression flag, then calls its ``prepare_data()`` and ``to_json()``.
    """
    crystal_writer = mining_straight_down(
        basename=self.basename,
        protein_list=self.proteins,
        crystal_output=self.crystal_output,
        compress=self.do_compress,
    )
    crystal_writer.prepare_data()
    crystal_writer.to_json()
834
+
835
def run(self):
    """Drive preprocessing from whichever input stage was supplied.

    ``self.initial_status`` (set by ``get_initial_status()``) selects the
    entry stage; ``self.current_status`` then advances
    "genome" -> "protein" -> "hmm" as each stage completes. When
    ``self.verbose`` is true a timing checkpoint is recorded after each
    stage and a summary is printed at the end.
    """
    def checkpoint(label):
        # Timing marks are only recorded in verbose mode.
        if self.verbose:
            self.curtime(label)

    self.get_initial_status()
    self.get_file_basename()
    self.current_status = self.initial_status

    # Stage 1: obtain proteins, by prediction or from the supplied file.
    if self.current_status == "genome":
        self.read_genomes()
        self.predict_proteins()
        self.current_status = "protein"
    if self.initial_status == "protein":
        self.read_proteins()
    checkpoint("protein_pred")

    # Stage 2: obtain best hits, by searching or from the supplied table.
    if self.current_status == "protein":
        self.find_hmm()
        self.hmm_search_and_BH()
        self.current_status = "hmm"
    if self.initial_status == "hmm":
        self.read_proteins()
        self.read_hmms()
    checkpoint("hmm_search")

    # Stage 3: keep the best-hit proteins and write the crystal.
    if self.current_status == "hmm":
        self.filter_bh_prots()
        self.crystalize()
    checkpoint("crystal")

    if self.verbose:
        self.timediffs()
        print(self.basename, "complete.")
        print("\tRuntimes: ", self.runtimes)
875
+
876
+ #Add options
877
def options():
    """Build the command-line parser and parse the process arguments.

    Inputs: --genome, --protein, --hmm.
    Outputs: --output_protein, --output_hmm, --output_crystal (required).
    Flags: --compress, --verbose.

    Unrecognized arguments are tolerated and discarded.

    Returns:
        A ``(parser, args)`` tuple: the ArgumentParser and the parsed
        namespace.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='',
    )

    # Data inputs.
    parser.add_argument(
        '--genome', dest='in_gen', default=None,
        help='Input genome in nt FASTA format.',
    )
    parser.add_argument(
        '--protein', dest='in_prot', default=None,
        help='Input proteome for a genome in AA FASTA format',
    )
    parser.add_argument(
        '--hmm', dest='in_hmm', default=None,
        help='Input FastAAI HMM search result for this proteome. Must be paired with --protein to work.',
    )

    # Data outputs.
    parser.add_argument(
        '--output_protein', dest='out_prot', default=None,
        help='An output containing predicted proteins for this genome in AA FASTA format. If omitted, no proteins file will be produced.',
    )
    parser.add_argument(
        '--output_hmm', dest='out_hmm', default=None,
        help='An output containing the results of an HMM search of this proteome against FastAAIs SCPs. If omitted, no HMM file will be produced.',
    )
    parser.add_argument(
        '--output_crystal', dest='out_crystal', default=None, required=True,
        help='Required. A JSON-format output representing the fully preprocessed input.',
    )

    # Behaviour switches.
    parser.add_argument(
        '--compress', dest='compress', action='store_true',
        help='GZIP protein and HMM outputs',
    )
    parser.add_argument(
        '--verbose', dest='verbose', action='store_true',
        help='Print feedback to stdout',
    )

    # parse_known_args returns (namespace, leftovers); the leftovers are
    # intentionally dropped.
    args = parser.parse_known_args()[0]

    return parser, args
901
+
902
def main():
    """Command-line entry point: parse the options and preprocess one input.

    When fewer than two command-line arguments are given, the help text is
    printed and the function returns without doing any work. Otherwise an
    ``input_file`` is built from the parsed options and run.
    """
    parser, opts = options()

    if len(sys.argv) < 3:
        # Too few arguments to describe both an input and the required
        # output. Show usage and stop here; previously execution fell
        # through and ran the pipeline with nothing to process.
        parser.print_help()
        return

    mn = input_file(
        genome=opts.in_gen,
        protein=opts.in_prot,
        hmm=opts.in_hmm,
        output_protein=opts.out_prot,
        output_hmm=opts.out_hmm,
        output_crystal=opts.out_crystal,
        compress_outputs=opts.compress,
        verbose=opts.verbose,
    )

    mn.run()


if __name__ == "__main__":
    main()