miga-base 1.2.18.2 → 1.3.0.0

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. checksums.yaml +4 -4
  2. data/lib/miga/cli/action/doctor/base.rb +2 -1
  3. data/lib/miga/cli/action/init.rb +1 -1
  4. data/lib/miga/dataset/result/add.rb +3 -2
  5. data/lib/miga/version.rb +2 -2
  6. data/scripts/essential_genes.bash +4 -8
  7. data/utils/FastAAI/LICENSE +8 -0
  8. data/utils/FastAAI/README.md +151 -40
  9. data/utils/FastAAI/__init__.py +1 -0
  10. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz +0 -0
  11. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz +0 -0
  12. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz +0 -0
  13. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz +0 -0
  14. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz +0 -0
  15. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz +0 -0
  16. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz +0 -0
  17. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz +0 -0
  18. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz +0 -0
  19. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz +0 -0
  20. data/utils/FastAAI/fastaai/__init__.py +1 -0
  21. data/utils/FastAAI/fastaai/fastaai +4805 -0
  22. data/utils/FastAAI/fastaai/fastaai.py +4805 -0
  23. data/utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py +297 -0
  24. data/utils/FastAAI/fastaai/fastaai_miga_preproc.py +931 -0
  25. data/utils/FastAAI/metadata/Accession_names_and_IDs.txt +122 -0
  26. data/utils/distance/commands.rb +51 -23
  27. metadata +23 -6
  28. data/utils/FastAAI/FastAAI +0 -3659
  29. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Archaea_SCG.hmm +0 -0
  30. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Bacteria_SCG.hmm +0 -0
  31. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -0
data/utils/FastAAI/fastaai/fastaai_miga_preproc.py
@@ -0,0 +1,931 @@
+ import sys
+ import os
+ import pyrodigal as pd
+ import pyhmmer
+
+ import gzip
+ from collections import namedtuple
+ import argparse
+ import datetime
+ import json
+
+ import numpy as np
+
+ class fasta_file:
+     def __init__(self, file):
+         self.file_path = os.path.abspath(file)
+
+         self.contents = {}
+
+         self.read_fasta()
+
+     def read_fasta(self):
+         cur_seq = ""
+         cur_prot = ""
+
+         contents = {}
+         deflines = {}
+
+         fasta = agnostic_reader(self.file_path)
+         for line in fasta:
+             if line.startswith(">"):
+                 if len(cur_seq) > 0:
+                     contents[cur_prot] = cur_seq
+                     deflines[cur_prot] = defline
+
+                 cur_seq = ""
+                 cur_prot = line.strip().split()[0][1:]
+                 defline = line.strip()[len(cur_prot)+1 :].strip()
+
+             else:
+                 cur_seq += line.strip()
+
+         fasta.close()
+
+         #Final iter
+         if len(cur_seq) > 0:
+             contents[cur_prot] = cur_seq
+             deflines[cur_prot] = defline
+
+         self.contents = contents
+
+         #return contents, deflines
+         return None
+
+ class agnostic_reader_iterator:
+     def __init__(self, reader):
+         self.handle_ = reader.handle
+         self.is_gz_ = reader.is_gz
+
+     def __next__(self):
+         if self.is_gz_:
+             line = self.handle_.readline().decode()
+         else:
+             line = self.handle_.readline()
+
+         #Ezpz EOF check
+         if line:
+             return line
+         else:
+             raise StopIteration
+
+ #File reader that doesn't care if you give it a gzipped file or not.
+ class agnostic_reader:
+     def __init__(self, file):
+         self.path = file
+
+         with open(file, 'rb') as test_gz:
+             #Gzip magic number
+             is_gz = (test_gz.read(2) == b'\x1f\x8b')
+
+         self.is_gz = is_gz
+
+         if is_gz:
+             self.handle = gzip.open(self.path)
+         else:
+             self.handle = open(self.path)
+
+     def __iter__(self):
+         return agnostic_reader_iterator(self)
+
+     def close(self):
+         self.handle.close()
+
+ class pyrodigal_manager:
+     def __init__(self, sequences = None,
+                  trans_tables = [11, 4], #translation tables to use - only relevant if training input is None and is_meta is False
+                  aa = None, compress = False):
+
+         self.meta = False
+         self.gene_predictors = {}
+
+         self.sequences = sequences
+         self.seqlens = {}
+
+         self.training_seq = []
+         self.running_sum = 0 #training sequence current length
+         self.training_needs_join = False #needed if more than 1 seq added to training sequence
+         self.training_done = False
+
+         self.trans_tables = trans_tables
+         self.training_data = {}
+
+         self.aa = aa
+
+         self.predicted_genes = {}
+         self.coding_densities = {}
+         if self.meta:
+             self.predicted_genes[-1] = {}
+             self.coding_densities[-1] = {}
+         else:
+             for t in self.trans_tables:
+                 self.predicted_genes[t] = {}
+                 self.coding_densities[t] = {}
+
+         self.log = []
+         self.do_compress = compress
+
+     def sequence_handler(self):
+         self.training_seq = []
+         self.training_needs_join = False
+         if not self.meta:
+             for seqid in self.sequences:
+                 current_seqlen = len(self.sequences[seqid])
+                 self.seqlens[seqid] = current_seqlen #get length
+                 self.sequences[seqid] = self.sequences[seqid].encode() #to binary
+
+                 self.training_seq.append(self.sequences[seqid]) #add to training set
+                 self.running_sum += current_seqlen #running total of 32 million or less
+
+                 if self.training_needs_join:
+                     self.running_sum += 12
+
+                 self.training_needs_join = True
+
+                 if self.running_sum > 32000000:
+                     self.train_manager()
+
+             if not self.training_done:
+                 self.train_manager()
+
+     def convert_seq(self, this_seqid):
+         self.sequences[this_seqid] = ''.join(self.sequences[this_seqid])
+         seqlen = len(self.sequences[this_seqid])
+         self.sequences[this_seqid] = self.sequences[this_seqid].encode()
+         return seqlen
+
+     def train_manager(self):
+         if not self.training_done: #Make sure I don't happen twice
+             self.training_done = True #Make sure I don't happen twice
+             if self.running_sum < 20000:
+                 self.log.append("Can't train on 20 thousand or fewer characters. Switching to meta mode.")
+                 self.gene_predictor = pd.OrfFinder(meta=True)
+                 self.is_meta = True
+             else:
+                 #Collect sequences into a prodigal-formatted string
+                 self.training_seq = b'TTAATTAATTAA'.join(self.sequences.values())
+
+                 #Truncate to 32 million bp if needed
+                 if self.running_sum > 32000000:
+                     self.log.append("Warning: Sequence is long (max 32000000 for training).")
+                     self.log.append("Training on the first 32000000 bases.")
+                     self.training_seq = self.training_seq[0:32000000]
+
+                 #G is 71, C is 67; we're counting G + C and dividing by the total.
+                 gc = round(((self.training_seq.count(67) + self.training_seq.count(71))/ len(self.training_seq)) * 100, 2)
+
+                 self.log.append(str(len(self.training_seq)) + " bp training seq created, " + str(gc) + " pct GC")
+
+                 #Initialize orffinder
+                 self.gene_predictor = pd.OrfFinder(meta=False)
+
+                 #Create training data on each sequence
+                 for i in range(0, len(self.trans_tables)):
+                     next_table = self.trans_tables.pop(0)
+
+                     self.gene_predictors[next_table] = pd.OrfFinder(meta = False)
+                     self.gene_predictors[next_table].train(self.training_seq, translation_table = next_table)
+
+                     #Making another OrfFinder instance with this will allow quick swapping while comparing tables.
+                     self.training_data[next_table] = self.gene_predictors[next_table].training_info
+
+                 #Clean up afterwards
+                 self.training_seq = None
+
+     def predict(self):
+         #Eliminate sequence entries to prevent memory bloat.
+         #Usually just grabs one sequence.
+         remaining_sequence_ids = tuple(self.sequences.keys())
+         for seqid in remaining_sequence_ids:
+             sequence = self.sequences.pop(seqid)
+
+             for tt in self.gene_predictors:
+                 #How do we get this working with the training data instances...
+                 next_genes = self.gene_predictors[tt].find_genes(sequence)
+
+                 #Keep internal copy if the goal is to reuse them in another program
+                 self.predicted_genes[tt][seqid] = next_genes #Easier to retain like this and call gene functions.
+
+         self.compare_predicted_genes()
+
+     def compare_predicted_genes(self):
+         if len(self.predicted_genes) == 1:
+             pass
+         else:
+             for tt in self.predicted_genes:
+                 total_seqlen = 0
+                 total_coding_bases = 0
+                 for seqid in self.predicted_genes[tt]:
+                     seqlen = self.seqlens[seqid]
+                     total_seqlen += seqlen
+                     for gene in self.predicted_genes[tt][seqid]:
+                         total_coding_bases += (gene.end - gene.begin + 1) #Sequence is 1 longer because it's inclusive
+
+                 self.coding_densities[tt] = total_coding_bases/total_seqlen
+
+             tables_to_remove = list(self.coding_densities.keys())
+             winning_table = None
+             winning_density = 0
+             for tt in self.coding_densities:
+                 if self.coding_densities[tt] > 1.1 * winning_density:
+                     winning_density = self.coding_densities[tt]
+                     winning_table = tt
+
+             tables_to_remove.pop(tables_to_remove.index(winning_table)) #keep the winning table by removing all others
+
+             self.log.append("Winning translation table was: " + str(winning_table) + " with coding density " + str(round(winning_density, 4)))
+             for t in tables_to_remove:
+                 self.log.append("Losing translation table: " + str(t) + " had coding density " + str(round(self.coding_densities[t], 4)))
+
+             self.predicted_genes = self.predicted_genes[winning_table] #keep the winning set.
+
+     def format_seq(self, seq, num_chars = 60):
+         #ceiling function without the math module
+         ceiling = int(round((len(seq)/num_chars)+0.5, 0))
+         formatted = '\n'.join([seq[(i*num_chars):(i+1)*num_chars] for i in range(0, ceiling)])
+         formatted = formatted.strip()
+
+         return formatted
+
+     def write_aa_file(self):
+         if self.aa is not None:
+             content = []
+
+             seqnum = 1
+             for seqid in self.predicted_genes:
+                 gene_num = 1
+                 for g in self.predicted_genes[seqid]:
+                     #print(g)
+                     protein_name = ">" + seqid + "_" + str(gene_num)
+                     #table = g.translation_table
+                     start = str(g.begin)
+                     end = str(g.end)
+                     strand = str(g.strand)
+                     annotation = g._gene_data(seqnum)
+                     translation = g.translate()
+                     writeable_trans = self.format_seq(translation)
+                     translation = None
+
+                     header = " # ".join([protein_name, start, end, strand, annotation])
+
+                     content.append(header)
+                     content.append(writeable_trans)
+
+                     gene_num += 1
+
+                 seqnum += 1
+
+             content = "\n".join(content)
+             content += "\n" #final newline
+
+             if self.do_compress:
+                 if not self.aa.endswith(".gz"):
+                     self.aa += ".gz"
+
+                 content = content.encode()
+
+                 output_writer = gzip.open(self.aa, "wb")
+             else:
+                 output_writer = open(self.aa, "w")
+
+             output_writer.write(content)
+
+             output_writer.close()
+
+             content = None
+
+     def convert_to_internal_rep(self): #go from pyrodigal objects to protein name:translation dict
+         conversion = {}
+         for seqid in self.predicted_genes:
+             gene_num = 1
+             for g in self.predicted_genes[seqid]:
+                 #print(g)
+                 protein_name = seqid + "_" + str(gene_num)
+                 translation = g.translate()
+                 conversion[protein_name] = translation
+                 gene_num += 1
+
+         self.predicted_genes = conversion
+         conversion = None
+
+     def run(self):
+         self.sequence_handler()
+         self.predict()
+         self.write_aa_file()
+         self.convert_to_internal_rep()
+
+ class pyhmmer_manager:
+     def __init__(self, do_compress):
+         self.hmm_model = []
+
+         self.proteins_to_search = []
+         self.protein_descriptions = None
+
+         self.hmm_result_proteins = []
+         self.hmm_result_accessions = []
+         self.hmm_result_scores = []
+
+         self.printable_lines = []
+
+         self.bacterial_SCPs = None
+         self.archaeal_SCPs = None
+         self.assign_hmm_sets()
+         self.domain_counts = {"Bacteria" : 0, "Archaea": 0}
+         self.voted_domain = {"Bacteria" : len(self.bacterial_SCPs), "Archaea" : len(self.archaeal_SCPs)}
+
+         self.bacterial_fraction = None
+         self.archaeal_fraction = None
+
+         self.best_hits = None
+
+         self.do_compress = do_compress
+
+     #Load HMM
+     def load_hmm_from_file(self, hmm_path):
+         hmm_set = pyhmmer.plan7.HMMFile(hmm_path)
+         for hmm in hmm_set:
+             self.hmm_model.append(hmm)
+
+     #Set archaeal and bacterial HMM sets.
+     def assign_hmm_sets(self):
+         self.bacterial_SCPs = {'PF00709_21': 'Adenylsucc_synt', 'PF00406_22': 'ADK', 'PF01808_18': 'AICARFT_IMPCHas', 'PF00231_19': 'ATP-synt',
+             'PF00119_20': 'ATP-synt_A', 'PF01264_21': 'Chorismate_synt', 'PF00889_19': 'EF_TS', 'PF01176_19': 'eIF-1a',
+             'PF02601_15': 'Exonuc_VII_L', 'PF01025_19': 'GrpE', 'PF01725_16': 'Ham1p_like', 'PF01715_17': 'IPPT',
+             'PF00213_18': 'OSCP', 'PF01195_19': 'Pept_tRNA_hydro', 'PF00162_19': 'PGK', 'PF02033_18': 'RBFA', 'PF02565_15': 'RecO_C',
+             'PF00825_18': 'Ribonuclease_P', 'PF00687_21': 'Ribosomal_L1', 'PF00572_18': 'Ribosomal_L13',
+             'PF00238_19': 'Ribosomal_L14', 'PF00252_18': 'Ribosomal_L16', 'PF01196_19': 'Ribosomal_L17',
+             'PF00861_22': 'Ribosomal_L18p', 'PF01245_20': 'Ribosomal_L19', 'PF00453_18': 'Ribosomal_L20',
+             'PF00829_21': 'Ribosomal_L21p', 'PF00237_19': 'Ribosomal_L22', 'PF00276_20': 'Ribosomal_L23',
+             'PF17136_4': 'ribosomal_L24', 'PF00189_20': 'Ribosomal_S3_C', 'PF00281_19': 'Ribosomal_L5', 'PF00181_23': 'Ribosomal_L2',
+             'PF01016_19': 'Ribosomal_L27', 'PF00828_19': 'Ribosomal_L27A', 'PF00830_19': 'Ribosomal_L28',
+             'PF00831_23': 'Ribosomal_L29', 'PF00297_22': 'Ribosomal_L3', 'PF01783_23': 'Ribosomal_L32p',
+             'PF01632_19': 'Ribosomal_L35p', 'PF00573_22': 'Ribosomal_L4', 'PF00347_23': 'Ribosomal_L6',
+             'PF03948_14': 'Ribosomal_L9_C', 'PF00338_22': 'Ribosomal_S10', 'PF00411_19': 'Ribosomal_S11',
+             'PF00416_22': 'Ribosomal_S13', 'PF00312_22': 'Ribosomal_S15', 'PF00886_19': 'Ribosomal_S16',
+             'PF00366_20': 'Ribosomal_S17', 'PF00203_21': 'Ribosomal_S19', 'PF00318_20': 'Ribosomal_S2',
+             'PF01649_18': 'Ribosomal_S20p', 'PF01250_17': 'Ribosomal_S6', 'PF00177_21': 'Ribosomal_S7',
+             'PF00410_19': 'Ribosomal_S8', 'PF00380_19': 'Ribosomal_S9', 'PF00164_25': 'Ribosom_S12_S23',
+             'PF01193_24': 'RNA_pol_L', 'PF01192_22': 'RNA_pol_Rpb6', 'PF01765_19': 'RRF', 'PF02410_15': 'RsfS',
+             'PF03652_15': 'RuvX', 'PF00584_20': 'SecE', 'PF03840_14': 'SecG', 'PF00344_20': 'SecY', 'PF01668_18': 'SmpB',
+             'PF00750_19': 'tRNA-synt_1d', 'PF01746_21': 'tRNA_m1G_MT', 'PF02367_17': 'TsaE', 'PF02130_17': 'UPF0054',
+             'PF02699_15': 'YajC'}
+
+         self.archaeal_SCPs = {'PF00709_21': 'Adenylsucc_synt', 'PF05221_17': 'AdoHcyase', 'PF01951_16': 'Archease', 'PF01813_17': 'ATP-synt_D',
+             'PF01990_17': 'ATP-synt_F', 'PF01864_17': 'CarS-like', 'PF01982_16': 'CTP-dep_RFKase', 'PF01866_17': 'Diphthamide_syn',
+             'PF04104_14': 'DNA_primase_lrg', 'PF01984_20': 'dsDNA_bind', 'PF04010_13': 'DUF357', 'PF04019_12': 'DUF359',
+             'PF04919_12': 'DUF655', 'PF01912_18': 'eIF-6', 'PF05833_11': 'FbpA', 'PF01725_16': 'Ham1p_like',
+             'PF00368_18': 'HMG-CoA_red', 'PF00334_19': 'NDK', 'PF02006_16': 'PPS_PS', 'PF02996_17': 'Prefoldin',
+             'PF01981_16': 'PTH2', 'PF01948_18': 'PyrI', 'PF00687_21': 'Ribosomal_L1', 'PF00572_18': 'Ribosomal_L13',
+             'PF00238_19': 'Ribosomal_L14', 'PF00827_17': 'Ribosomal_L15e', 'PF00252_18': 'Ribosomal_L16',
+             'PF01157_18': 'Ribosomal_L21e', 'PF00237_19': 'Ribosomal_L22', 'PF00276_20': 'Ribosomal_L23',
+             'PF16906_5': 'Ribosomal_L26', 'PF00831_23': 'Ribosomal_L29', 'PF00297_22': 'Ribosomal_L3',
+             'PF01198_19': 'Ribosomal_L31e', 'PF01655_18': 'Ribosomal_L32e', 'PF01780_19': 'Ribosomal_L37ae',
+             'PF00832_20': 'Ribosomal_L39', 'PF00573_22': 'Ribosomal_L4', 'PF00935_19': 'Ribosomal_L44', 'PF17144_4': 'Ribosomal_L5e',
+             'PF00347_23': 'Ribosomal_L6', 'PF00411_19': 'Ribosomal_S11', 'PF00416_22': 'Ribosomal_S13',
+             'PF00312_22': 'Ribosomal_S15', 'PF00366_20': 'Ribosomal_S17', 'PF00833_18': 'Ribosomal_S17e',
+             'PF00203_21': 'Ribosomal_S19', 'PF01090_19': 'Ribosomal_S19e', 'PF00318_20': 'Ribosomal_S2',
+             'PF01282_19': 'Ribosomal_S24e', 'PF01667_17': 'Ribosomal_S27e', 'PF01200_18': 'Ribosomal_S28e',
+             'PF01015_18': 'Ribosomal_S3Ae', 'PF00177_21': 'Ribosomal_S7', 'PF00410_19': 'Ribosomal_S8',
+             'PF01201_22': 'Ribosomal_S8e', 'PF00380_19': 'Ribosomal_S9', 'PF00164_25': 'Ribosom_S12_S23',
+             'PF06026_14': 'Rib_5-P_isom_A', 'PF01351_18': 'RNase_HII', 'PF13656_6': 'RNA_pol_L_2',
+             'PF01194_17': 'RNA_pol_N', 'PF03874_16': 'RNA_pol_Rpb4', 'PF01192_22': 'RNA_pol_Rpb6',
+             'PF01139_17': 'RtcB', 'PF00344_20': 'SecY', 'PF06093_13': 'Spt4', 'PF00121_18': 'TIM', 'PF01994_16': 'Trm56',
+             'PF00749_21': 'tRNA-synt_1c', 'PF00750_19': 'tRNA-synt_1d', 'PF13393_6': 'tRNA-synt_His',
+             'PF01142_18': 'TruD', 'PF01992_16': 'vATP-synt_AC39', 'PF01991_18': 'vATP-synt_E', 'PF01496_19': 'V_ATPase_I'}
+
+     #Convert passed sequences.
+     def convert_protein_seqs_in_mem(self, contents):
+         #Clean up.
+         self.proteins_to_search = []
+
+         for protein in contents:
+             #Skip a protein if it's longer than 100k AA.
+             if len(contents[protein]) >= 100000:
+                 continue
+             as_bytes = protein.encode()
+             #Pyhmmer digitization of sequences for searching.
+             easel_seq = pyhmmer.easel.TextSequence(name = as_bytes, sequence = contents[protein])
+             easel_seq = easel_seq.digitize(pyhmmer.easel.Alphabet.amino())
+             self.proteins_to_search.append(easel_seq)
+
+         easel_seq = None
+
+     def execute_search(self):
+         top_hits = list(pyhmmer.hmmsearch(self.hmm_model, self.proteins_to_search, cpus=1, bit_cutoffs="trusted"))
+
+         self.printable_lines = []
+
+         self.hmm_result_proteins = []
+         self.hmm_result_accessions = []
+         self.hmm_result_scores = []
+
+         for model in top_hits:
+             for hit in model:
+                 target_name = hit.name.decode()
+                 target_acc = hit.accession
+                 if target_acc is None:
+                     target_acc = "-"
+                 else:
+                     target_acc = target_acc.decode()
+
+                 query_name = hit.best_domain.alignment.hmm_name.decode()
+                 query_acc = hit.best_domain.alignment.hmm_accession.decode()
+
+                 full_seq_evalue = "%.2g" % hit.evalue
+                 full_seq_score = round(hit.score, 1)
+                 full_seq_bias = round(hit.bias, 1)
+
+                 best_dom_evalue = "%.2g" % hit.best_domain.alignment.domain.i_evalue
+                 best_dom_score = round(hit.best_domain.alignment.domain.score, 1)
+                 best_dom_bias = round(hit.best_domain.alignment.domain.bias, 1)
+
+                 #I don't know how to get most of these values.
+                 exp = 0
+                 reg = 0
+                 clu = 0
+                 ov = 0
+                 env = 0
+                 dom = len(hit.domains)
+                 rep = 0
+                 inc = 0
+
+                 try:
+                     description = self.protein_descriptions[target_name]
+                 except:
+                     description = ""
+
+                 writeout = [target_name, target_acc, query_name, query_acc, full_seq_evalue, \
+                     full_seq_score, full_seq_bias, best_dom_evalue, best_dom_score, best_dom_bias, \
+                     exp, reg, clu, ov, env, dom, rep, inc, description]
+
+                 #Format and join.
+                 writeout = [str(i) for i in writeout]
+                 writeout = '\t'.join(writeout)
+
+                 self.printable_lines.append(writeout)
+
+                 self.hmm_result_proteins.append(target_name)
+                 self.hmm_result_accessions.append(query_acc)
+                 self.hmm_result_scores.append(best_dom_score)
+
+     def filter_to_best_hits(self):
+         hmm_file = np.transpose(np.array([self.hmm_result_proteins, self.hmm_result_accessions, self.hmm_result_scores]))
+
+         #hmm_file = np.loadtxt(hmm_file_name, comments = '#', usecols = (0, 3, 8), dtype=(str))
+         #Sort the hmm file based on the score column in descending order.
+         hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]
+
+         #Identify the first row where each gene name appears, after sorting by score;
+         #in effect, return the highest scoring assignment per gene name
+         #Sort the indices of the result to match the score-sorted table instead of alphabetical order of gene names
+         hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]
+
+         #Filter the file again for the unique ACCESSION names, since we're only allowed one gene per accession, I guess?
+         #Don't sort the indices, we don't care about the scores anymore.
+         hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]
+
+         sql_friendly_names = [i.replace(".", "_") for i in hmm_file[:,1]]
+
+         self.best_hits = dict(zip(hmm_file[:,0], sql_friendly_names))
+
+         hmm_file = None
+
+     #Count per-domain occurrences.
+     def assign_domain(self):
+         for prot in self.best_hits.values():
+             if prot in self.bacterial_SCPs:
+                 self.domain_counts["Bacteria"] += 1
+             if prot in self.archaeal_SCPs:
+                 self.domain_counts["Archaea"] += 1
+
+         self.bacterial_fraction = self.domain_counts["Bacteria"] / self.voted_domain["Bacteria"]
+         self.archaeal_fraction = self.domain_counts["Archaea"] / self.voted_domain["Archaea"]
+
+         if self.bacterial_fraction >= self.archaeal_fraction:
+             self.voted_domain = "Bacteria"
+         else:
+             self.voted_domain = "Archaea"
+
+         pop_keys = list(self.best_hits.keys())
+         for key in pop_keys:
+             if self.voted_domain == "Bacteria":
+                 if self.best_hits[key] not in self.bacterial_SCPs:
+                     self.best_hits.pop(key)
+             if self.voted_domain == "Archaea":
+                 if self.best_hits[key] not in self.archaeal_SCPs:
+                     self.best_hits.pop(key)
+
+     def to_hmm_file(self, output):
+         if output is not None:
+             #PyHMMER data is a bit hard to parse. For each result:
+             content = '\n'.join(self.printable_lines) + '\n'
+
+             if self.do_compress:
+                 if not output.endswith(".gz"):
+                     output += ".gz"
+
+                 content = content.encode()
+
+                 fh = gzip.open(output, "wb")
+                 fh.write(content)
+                 fh.close()
+                 content = None
+
+             else:
+                 fh = open(output, "w")
+
+                 fh.write(content)
+
+                 fh.close()
+
+                 content = None
+
+     #If we're doing this step at all, we've either loaded the seqs into mem by reading the prot file
+     #or have them in mem thanks to pyrodigal.
+     def run_for_fastaai(self, prots, hmm_output):
+         #self.convert_protein_seqs_in_mem(prots)
+         #self.execute_search()
+         #self.filter_to_best_hits()
+
+         try:
+             self.convert_protein_seqs_in_mem(prots)
+             self.execute_search()
+             self.filter_to_best_hits()
+             try:
+                 self.to_hmm_file(hmm_output)
+             except:
+                 print(hmm_output, "cannot be created. HMM search failed. This file will be skipped.")
+
+         except:
+             print(hmm_output, "failed to run through HMMER!")
+             self.best_hits = None
+
+ class mining_straight_down:
+     def __init__(self, basename = None, protein_list = None, crystal_output = None, compress = False):
+         self.basename = basename
+         self.proteins_to_format = protein_list
+         self.output_file = crystal_output
+         self.formatted_data = None
+         self.do_compress = compress
+
+     #Translate tetramers to unique int32 indices.
+     def unique_kmer_simple_key(self, seq):
+         #num tetramers = len(seq) - 4 + 1, just make it -3.
+         n_kmers = len(seq) - 3
+
+         #Converts the characters in a sequence into their ascii int value
+         as_ints = np.array([ord(i) for i in seq], dtype = np.int32)
+
+         #create seq like 0,1,2,3; 1,2,3,4; 2,3,4,5... for each tetramer that needs a value
+         kmers = np.arange(4*n_kmers)
+         kmers = kmers % 4 + kmers // 4
+
+         #Select the characters (as ints) corresponding to each tetramer all at once and reshape into rows of 4,
+         #each row corresp. to a successive tetramer
+         kmers = as_ints[kmers].reshape((n_kmers, 4))
+
+         #Given four 2-digit numbers, these multipliers work as offsets so that all digits are preserved in order when summed
+         mult = np.array([1000000, 10000, 100, 1], dtype = np.int32)
+
+         #the fixed values effectively offset the successive chars of the tetramer by 2 positions each time;
+         #practically, this is concatenation of numbers
+         #Matrix mult does this for all values at once.
+         return np.unique(np.dot(kmers, mult))
+
+     def prepare_data(self):
+         self.formatted_data = {"filename": self.basename, "protein_data":{}}
+         for prot_acc_seq in self.proteins_to_format:
+             prot = prot_acc_seq[0]
+             acc = prot_acc_seq[1]
+             kmerized_seq = self.unique_kmer_simple_key(prot_acc_seq[2])
+             kmerized_seq = kmerized_seq.tolist()
+             #print(kmerized_seq)
+
+             self.formatted_data["protein_data"][acc] = {"protein_name":prot, "kmers":kmerized_seq}
+
+     def to_json(self):
+         if self.do_compress:
+             if not self.output_file.endswith(".gz"):
+                 self.output_file += ".gz"
+
+             self.formatted_data = json.dumps(self.formatted_data, indent = 4) #Convert to JSON
+             self.formatted_data = self.formatted_data.encode('utf-8') #Encode to binary
+             with gzip.open(self.output_file, 'wb') as fh:
+                 fh.write(self.formatted_data)
+
+         else:
+             with open(self.output_file, "w") as fh:
+                 json.dump(self.formatted_data, fh, indent = 4)
+
+ class input_file:
+     def __init__(self, genome = None, protein = None, hmm = None, #data inputs
+                  output_protein = None, output_hmm = None, output_crystal = None, #data outputs
+                  output_log = None, verbose = False, compress_outputs = False):
+
+         self.verbose = verbose
+         self.do_compress = compress_outputs
+
+         self.genome_input = genome
+         self.protein_input = protein
+         self.hmm_input = hmm
+
+         self.protein_output = output_protein
+         self.hmm_output = output_hmm
+         self.crystal_output = output_crystal
+
+         self.log_contents = []
+         self.log_file = output_log
+
+         self.initial_status = None
+         self.current_status = "genome"
+
+         self.basename = None
+
+         self.genome = None
+         self.proteins = None
+         self.hmm_besthits = None
+
+         current_datetime = datetime.datetime.now()
+         self.timestamps = {"start":current_datetime,
+                            "protein_pred":current_datetime,
+                            "hmm_search":current_datetime,
+                            "crystal":current_datetime}
+
+         self.runtimes = None
+
+         self.hmm_file = None
+
+     def curtime(self, step = None):
+         if step is not None:
+             self.timestamps[step] = datetime.datetime.now()
+
+     def timediffs(self):
+         self.runtimes = {}
+         protein_pred_time = self.timestamps["protein_pred"] - self.timestamps["start"]
+         protein_pred_time = round(protein_pred_time.total_seconds(), 2)
+
+         hmm_search_time = self.timestamps["hmm_search"] - self.timestamps["protein_pred"]
+         hmm_search_time = round(hmm_search_time.total_seconds(), 2)
+
+         crystal_time = self.timestamps["crystal"] - self.timestamps["hmm_search"]
+         crystal_time = round(crystal_time.total_seconds(), 2)
+
+         self.runtimes["protein_pred"] = protein_pred_time
+         self.runtimes["hmm_search"] = hmm_search_time
+         self.runtimes["crystal"] = crystal_time
+
+     def get_initial_status(self):
+         if self.genome_input is not None:
+             self.initial_status = "genome"
+
+         if self.protein_input is not None:
+             self.initial_status = "protein"
+
+         if self.hmm_input is not None and self.protein_input is not None:
+             self.initial_status = "hmm"
+
+     def get_file_basename(self):
+         if self.initial_status == "genome":
+             self.basename = self.file_basename(self.genome_input)
+         if self.initial_status == "protein":
+             self.basename = self.file_basename(self.protein_input)
+         if self.initial_status == "hmm":
+             self.basename = self.file_basename(self.protein_input)
+
+     #Not an input sanitizer - simply replaces characters that would throw SQLite for a loop.
+     def sql_safe(self, string):
+         #Sanitize for SQL
+         #These are chars safe for sql
+         sql_safe = set('_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
+         current_chars = set(string)
+         #self.sql_name = self.basename
+         #Identify SQL-unsafe characters as those outside the permissible set and replace all with underscores.
+         for char in current_chars - sql_safe:
+             string = string.replace(char, "_")
+
+         return string
+
+     #Gonna have to go put this everywhere...
+     #Consistent file basename behavior
+     def file_basename(self, file):
+         #Get the name after the final directory path
+         name = os.path.basename(file)
+         #Extract the portion of a filename prior to the first '.' separator.
+         while name != os.path.splitext(name)[0]:
+             name = os.path.splitext(name)[0]
+
+         name = self.sql_safe(name)
+
+         return name
+
+     def find_hmm(self):
+         self.hmm_file = None
+         try:
+             #Look in the same dir as the script; old method/MiGA friendly
+             script_path = os.path.dirname(__file__)
+             if len(script_path) == 0:
+                 script_path = "."
+             hmm_complete_model = os.path.abspath(os.path.normpath(script_path +"/"+ "00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm"))
+             self.hmm_file = str(hmm_complete_model)
+         except:
+             #Try to locate the data bundled as it would be with a pip/conda install.
+             script_path = os.path.dirname(sys.modules['fastAAI_HMM_models'].__file__)
+             if len(script_path) == 0:
+                 script_path = "."
+             hmm_complete_model = os.path.abspath(os.path.normpath(script_path + '/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm'))
+             self.hmm_file = str(hmm_complete_model)
+             #Check that the file exists or fail to the except.
+             fh = open(self.hmm_file)
+             fh.close()
+
+     #Load existing files functions
+     def read_genomes(self):
+         if self.genome_input is not None:
+             genome_seqs = fasta_file(self.genome_input)
+             self.genome = genome_seqs.contents
+             genome_seqs = None
+
+     def read_proteins(self):
+         if self.protein_input is not None:
+             #Simple dict of seqid:sequence
+             protein_seqs = fasta_file(self.protein_input)
+             self.proteins = protein_seqs.contents
+             protein_seqs = None
+
+     def read_hmms(self):
+         if self.hmm_input is not None:
+             prots = []
+             accs = []
+             scores = []
+             f = agnostic_reader(self.hmm_input)
+             for line in f:
+                 if line.startswith("#"):
+                     continue
+                 else:
+                     segs = line.strip().split()
+
+                     if len(segs) < 9:
+                         continue
+
+                     prots.append(segs[0])
+                     accs.append(segs[3])
+                     scores.append(segs[8])
+
+             f.close()
+
+             if len(prots) < 1:
+                 self.best_hits = {}
+
+             hmm_file = np.transpose(np.array([prots, accs, scores]))
+
+             #hmm_file = np.loadtxt(hmm_file_name, comments = '#', usecols = (0, 3, 8), dtype=(str))
+             #Sort the hmm file based on the score column in descending order.
+             hmm_file = hmm_file[hmm_file[:,2].astype(float).argsort()[::-1]]
+
+             #Identify the first row where each gene name appears, after sorting by score;
+             #in effect, return the highest scoring assignment per gene name
+             #Sort the indices of the result to match the score-sorted table instead of alphabetical order of gene names
+             hmm_file = hmm_file[np.sort(np.unique(hmm_file[:,0], return_index = True)[1])]
+
+             #Filter the file again for the unique ACCESSION names, since we're only allowed one gene per accession, I guess?
+             #Don't sort the indices, we don't care about the scores anymore.
+             hmm_file = hmm_file[np.unique(hmm_file[:,1], return_index = True)[1]]
+
+             sql_friendly_names = [i.replace(".", "_") for i in hmm_file[:,1]]
+             self.hmm_besthits = dict(zip(hmm_file[:,0], sql_friendly_names))
+
+     #runner functions
+     def predict_proteins(self):
+         mn = pyrodigal_manager(sequences = self.genome,
+                                aa = self.protein_output,
+                                compress = self.do_compress)
+         mn.run()
+         self.proteins = mn.predicted_genes
+
+         mn = None
+
+     def hmm_search_and_BH(self):
+         hmm_manager = pyhmmer_manager(self.do_compress)
+         hmm_manager.load_hmm_from_file(self.hmm_file)
+
+         hmm_manager.run_for_fastaai(prots = self.proteins, hmm_output = self.hmm_output)
+
+         self.hmm_besthits = hmm_manager.best_hits
+
+     def filter_bh_prots(self):
+         cleaned_prots = []
+         for protein in self.proteins:
+             if protein in self.hmm_besthits:
+                 accession = self.hmm_besthits[protein]
+
+                 next_item = (protein, accession, self.proteins[protein])
+
+                 cleaned_prots.append(next_item)
+
+         self.proteins = cleaned_prots
+         cleaned_prots = None
+
+     def crystalize(self):
+         mn = mining_straight_down(basename = self.basename, protein_list = self.proteins, crystal_output = self.crystal_output, compress = self.do_compress)
+         mn.prepare_data()
+         mn.to_json()
+
+     def run(self):
+         self.get_initial_status()
+         self.get_file_basename()
+
+         self.current_status = self.initial_status
+
+         if self.current_status == "genome":
+             self.read_genomes()
+             self.predict_proteins()
+             self.current_status = "protein"
+
+         if self.initial_status == "protein":
+             self.read_proteins()
+
+         if self.verbose:
+             self.curtime("protein_pred")
+
+         if self.current_status == "protein":
+             self.find_hmm()
+             self.hmm_search_and_BH()
+             self.current_status = "hmm"
+
+         if self.initial_status == "hmm":
+             self.read_proteins()
+             self.read_hmms()
+
+         if self.verbose:
+             self.curtime("hmm_search")
+
+         if self.current_status == "hmm":
+             self.filter_bh_prots()
+             self.crystalize()
+
+         if self.verbose:
+             self.curtime("crystal")
+
+         if self.verbose:
+             self.timediffs()
+             print(self.basename, "complete.")
+             print("\tRuntimes: ", self.runtimes)
+
+ #Add options
+ def options():
+     '''
+     genome = None, protein = None, hmm = None, #data inputs
+     output_protein = None, output_hmm = None, output_crystal = None, #data outputs
+     output_log = None, verbose = False, compress_outputs = False
+     '''
+
+     parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
+                                      description='''''')
+
+     parser.add_argument('--genome', dest = 'in_gen', default = None, help = 'Input genome in nt FASTA format.')
+     parser.add_argument('--protein', dest = 'in_prot', default = None, help = 'Input proteome for a genome in AA FASTA format')
+     parser.add_argument('--hmm', dest = 'in_hmm', default = None, help = 'Input FastAAI HMM search result for this proteome. Must be paired with --protein to work.')
+
+     parser.add_argument('--output_protein', dest = 'out_prot', default = None, help = 'An output containing predicted proteins for this genome in AA FASTA format. If omitted, no proteins file will be produced.')
+     parser.add_argument('--output_hmm', dest = 'out_hmm', default = None, help = 'An output containing the results of an HMM search of this proteome against FastAAIs SCPs. If omitted, no HMM file will be produced.')
+     parser.add_argument('--output_crystal', dest = 'out_crystal', default = None, required = True, help = 'Required. A JSON-format output representing the fully preprocessed input.')
+
+     parser.add_argument('--compress', dest = 'compress', action='store_true', help = 'GZIP protein and HMM outputs')
+     parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print feedback to stdout')
+
+     args, unknown_opts = parser.parse_known_args()
+
+     return parser, args
+
+ def main():
+     parser, opts = options()
+
+     if len(sys.argv) < 3:
+         parser.print_help()
+
+     ing = opts.in_gen
+     inp = opts.in_prot
+     inh = opts.in_hmm
+
+     outp = opts.out_prot
+     outh = opts.out_hmm
+     outc = opts.out_crystal
+
+     comp = opts.compress
+     verb = opts.verbose
+
+     mn = input_file(genome = ing,
+                     protein = inp,
+                     hmm = inh,
+                     output_protein = outp,
+                     output_hmm = outh,
+                     output_crystal = outc,
+                     compress_outputs = comp,
+                     verbose = verb)
+
+     mn.run()
+
+ if __name__ == "__main__":
+     main()
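
For orientation, a minimal sketch of how the new preprocessing module can be driven directly from Python, mirroring what main() does with the --genome and --output_* flags. The import path and every file name below are hypothetical; only the input_file constructor arguments and run() come from the code above.

    # sketch only: assumes this file is importable as fastaai_miga_preproc
    from fastaai_miga_preproc import input_file

    job = input_file(genome = "example_genome.fna.gz",        # nt FASTA, plain or gzipped
                     output_protein = "example_proteins.faa",  # predicted proteins (AA FASTA)
                     output_hmm = "example_scg_hits.txt",      # SCG HMM best-hit table
                     output_crystal = "example_crystal.json",  # required JSON "crystal"
                     compress_outputs = False,
                     verbose = True)
    job.run()

The equivalent command-line call would use the --genome, --output_protein, --output_hmm and --output_crystal options defined in options(), of which only --output_crystal is required.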
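The tetramer encoding in mining_straight_down.unique_kmer_simple_key packs the four ASCII codes of each amino-acid 4-mer into one integer by multiplying them by 10^6, 10^4, 10^2 and 1 and summing. A small worked example using the same logic as the method above (standalone copy; the input peptide "MKVLA" is made up):

    import numpy as np

    def unique_kmer_simple_key(seq):
        # same scheme as mining_straight_down.unique_kmer_simple_key
        n_kmers = len(seq) - 3
        as_ints = np.array([ord(i) for i in seq], dtype = np.int32)
        kmers = np.arange(4 * n_kmers)
        kmers = kmers % 4 + kmers // 4
        kmers = as_ints[kmers].reshape((n_kmers, 4))
        mult = np.array([1000000, 10000, 100, 1], dtype = np.int32)
        return np.unique(np.dot(kmers, mult))

    # "MKVLA" has two tetramers, MKVL and KVLA (M=77, K=75, V=86, L=76, A=65):
    #   MKVL -> 77*1000000 + 75*10000 + 86*100 + 76 = 77758676
    #   KVLA -> 75*1000000 + 86*10000 + 76*100 + 65 = 75867665
    print(unique_kmer_simple_key("MKVLA"))  # [75867665 77758676]

Because the ASCII codes of the amino-acid letters are all two-digit numbers, the packed integer is a reversible "concatenation" of the four characters, and np.unique returns the sorted set of distinct tetramers per protein.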
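The best-hit filtering used in pyhmmer_manager.filter_to_best_hits and input_file.read_hmms relies on a numpy idiom: after sorting rows by bit score in descending order, np.unique(..., return_index=True) yields the index of the first, i.e. highest-scoring, row for each protein, and a second pass does the same per SCG accession. A small sketch with made-up rows:

    import numpy as np

    # columns: protein, SCG accession, bit score (example values, not real output)
    rows = np.array([["prot_1", "PF00709_21", "52.1"],
                     ["prot_1", "PF00406_22", "70.3"],
                     ["prot_2", "PF00406_22", "61.0"]])

    # sort by score, descending
    rows = rows[rows[:, 2].astype(float).argsort()[::-1]]

    # keep the first (best-scoring) row per protein, preserving score order
    rows = rows[np.sort(np.unique(rows[:, 0], return_index = True)[1])]

    # then keep only one protein per accession
    rows = rows[np.unique(rows[:, 1], return_index = True)[1]]

    print(dict(zip(rows[:, 0], rows[:, 1])))
    # {'prot_1': 'PF00406_22'}  -- prot_2 is dropped because its best accession
    # is already claimed by a higher-scoring protein

The resulting protein-to-accession dictionary is what filter_bh_prots uses to keep only the single-copy marker proteins before the crystal JSON is written.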