miga-base 1.2.18.2 → 1.3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (31) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/cli/action/doctor/base.rb +2 -1
  3. data/lib/miga/cli/action/init.rb +1 -1
  4. data/lib/miga/dataset/result/add.rb +3 -2
  5. data/lib/miga/version.rb +2 -2
  6. data/scripts/essential_genes.bash +4 -8
  7. data/utils/FastAAI/LICENSE +8 -0
  8. data/utils/FastAAI/README.md +151 -40
  9. data/utils/FastAAI/__init__.py +1 -0
  10. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz +0 -0
  11. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz +0 -0
  12. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz +0 -0
  13. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz +0 -0
  14. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz +0 -0
  15. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz +0 -0
  16. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz +0 -0
  17. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz +0 -0
  18. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz +0 -0
  19. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz +0 -0
  20. data/utils/FastAAI/fastaai/__init__.py +1 -0
  21. data/utils/FastAAI/fastaai/fastaai +4805 -0
  22. data/utils/FastAAI/fastaai/fastaai.py +4805 -0
  23. data/utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py +297 -0
  24. data/utils/FastAAI/fastaai/fastaai_miga_preproc.py +931 -0
  25. data/utils/FastAAI/metadata/Accession_names_and_IDs.txt +122 -0
  26. data/utils/distance/commands.rb +51 -23
  27. metadata +23 -6
  28. data/utils/FastAAI/FastAAI +0 -3659
  29. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Archaea_SCG.hmm +0 -0
  30. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Bacteria_SCG.hmm +0 -0
  31. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -0
@@ -0,0 +1,297 @@
1
+ import sys
2
+ import os
3
+
4
+ import gzip
5
+ import argparse
6
+ import json
7
+ import gzip
8
+
9
+ import numpy as np
10
+
11
+ import multiprocessing
12
+ import sqlite3
13
+
14
def convert_array(bytestring):
    """Decode a raw byte blob into a 1-D NumPy array of 32-bit integers.

    The bytes are interpreted as a packed sequence of ``np.int32`` values,
    which is the dtype the rest of this file uses when it serializes arrays
    with ``ndarray.tobytes()``.
    """
    decoded = np.frombuffer(bytestring, dtype=np.int32)
    return decoded


# Make sqlite3 hand values from columns declared as "array" to convert_array.
# Per the sqlite3 docs this only takes effect on connections opened with
# detect_types=sqlite3.PARSE_DECLTYPES.
sqlite3.register_converter("array", convert_array)
18
+
19
class acc_indexer:
    """Bidirectional lookup between accession names and integer ids.

    Attributes:
        forward: dict mapping accession name -> integer id.
        reverse: dict mapping integer id -> accession name.

    Both maps are built once, at construction time, from a fixed accession
    list. An accession's id is simply its position in that list, so the
    order of the list must never change.
    """

    def __init__(self):
        self.forward = None
        self.reverse = None
        self.generate_accessions_index()

    def list_to_index_dict(self, list):
        """Return ``{item: position}`` for every item of the given sequence.

        The parameter keeps its original name (which shadows the builtin) so
        that existing keyword callers continue to work. If an item occurs
        more than once, its last position wins.
        """
        return {item: position for position, item in enumerate(list)}

    def rev_list_to_index_dict(self, list):
        """Return ``{position: item}`` for every item of the given sequence."""
        return dict(enumerate(list))

    def generate_accessions_index(self):
        """Populate ``self.forward`` and ``self.reverse`` from the fixed list."""
        # NOTE(review): these look like versioned Pfam accession ids - confirm.
        # The position of each entry is its integer id; do not reorder.
        acc_list = (
            'PF01780_19', 'PF03948_14', 'PF17144_4', 'PF00830_19', 'PF00347_23', 'PF16906_5', 'PF13393_6',
            'PF02565_15', 'PF01991_18', 'PF01984_20', 'PF00861_22', 'PF13656_6', 'PF00368_18', 'PF01142_18', 'PF00312_22', 'PF02367_17',
            'PF01951_16', 'PF00749_21', 'PF01655_18', 'PF00318_20', 'PF01813_17', 'PF01649_18', 'PF01025_19', 'PF00380_19', 'PF01282_19',
            'PF01864_17', 'PF01783_23', 'PF01808_18', 'PF01982_16', 'PF01715_17', 'PF00213_18', 'PF00119_20', 'PF00573_22', 'PF01981_16',
            'PF00281_19', 'PF00584_20', 'PF00825_18', 'PF00406_22', 'PF00177_21', 'PF01192_22', 'PF05833_11', 'PF02699_15', 'PF01016_19',
            'PF01765_19', 'PF00453_18', 'PF01193_24', 'PF05221_17', 'PF00231_19', 'PF00416_22', 'PF02033_18', 'PF01668_18', 'PF00886_19',
            'PF00252_18', 'PF00572_18', 'PF00366_20', 'PF04104_14', 'PF04919_12', 'PF01912_18', 'PF00276_20', 'PF00203_21', 'PF00889_19',
            'PF02996_17', 'PF00121_18', 'PF01990_17', 'PF00344_20', 'PF00297_22', 'PF01196_19', 'PF01194_17', 'PF01725_16', 'PF00750_19',
            'PF00338_22', 'PF00238_19', 'PF01200_18', 'PF00162_19', 'PF00181_23', 'PF01866_17', 'PF00709_21', 'PF02006_16', 'PF00164_25',
            'PF00237_19', 'PF01139_17', 'PF01351_18', 'PF04010_13', 'PF06093_13', 'PF00828_19', 'PF02410_15', 'PF01176_19', 'PF02130_17',
            'PF01948_18', 'PF01195_19', 'PF01746_21', 'PF01667_17', 'PF03874_16', 'PF01090_19', 'PF01198_19', 'PF01250_17', 'PF17136_4',
            'PF06026_14', 'PF03652_15', 'PF04019_12', 'PF01201_22', 'PF00832_20', 'PF01264_21', 'PF03840_14', 'PF00831_23', 'PF00189_20',
            'PF02601_15', 'PF01496_19', 'PF00411_19', 'PF00334_19', 'PF00687_21', 'PF01157_18', 'PF01245_20', 'PF01994_16', 'PF01632_19',
            'PF00827_17', 'PF01015_18', 'PF00829_21', 'PF00410_19', 'PF00833_18', 'PF00935_19', 'PF01992_16',
        )

        self.forward = self.list_to_index_dict(acc_list)
        self.reverse = self.rev_list_to_index_dict(acc_list)
59
+
60
class miga_db_builder:
    """Small write-side wrapper around a SQLite database file.

    The connection is opened lazily with ``activate()`` and released with
    ``deactivate()``; every insert method commits before returning.

    Attributes:
        path: filesystem path handed to ``sqlite3.connect``.
        conn: the open connection, or None while inactive.
        curs: a cursor on ``conn``, or None while inactive.
    """

    def __init__(self, path):
        self.path = path
        self.conn = None
        self.curs = None

    def activate(self):
        """Open the database at ``self.path`` and create a cursor on it."""
        self.conn = sqlite3.connect(self.path)
        self.curs = self.conn.cursor()

    def deactivate(self):
        """Close the cursor and connection, if any, and reset both to None.

        Safe to call before ``activate()`` or more than once, so it can be
        used unconditionally from cleanup code.
        """
        if self.curs is not None:
            self.curs.close()
        if self.conn is not None:
            self.conn.close()
        self.conn = None
        self.curs = None

    def initialize_metadata(self):
        """Create the two metadata tables and their index if they are missing."""
        self.curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
        self.curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")
        self.curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")

    def insert_genome_index(self, gi):
        """Insert ``(genome, gen_id, protein_count)`` rows and commit."""
        self.curs.executemany("INSERT INTO genome_index VALUES (?, ?, ?)", gi)
        self.conn.commit()

    def insert_gak(self, gak):
        """Insert ``(genome, accession, count)`` rows and commit."""
        self.curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?, ?, ?)", gak)
        self.conn.commit()

    def add_acc_genomes(self, acc, data):
        """Create ``<acc>_genomes`` if needed and insert ``(genome, kmers)`` rows.

        ``acc`` is interpolated into the SQL text as an identifier (table
        names cannot be bound as parameters), so it must be a trusted name.
        """
        create_sql = "CREATE TABLE IF NOT EXISTS {acc}_genomes (genome INTEGER PRIMARY KEY, kmers array)".format(acc=acc)
        insert_sql = "INSERT INTO {acc}_genomes VALUES (?, ?)".format(acc=acc)

        self.curs.execute(create_sql)
        self.curs.executemany(insert_sql, data)
        self.conn.commit()

    def add_acc_kmers(self, acc, data):
        """Create table ``<acc>`` if needed, insert ``(kmer, genomes)`` rows, index it.

        ``acc`` is interpolated into the SQL text as an identifier (table
        names cannot be bound as parameters), so it must be a trusted name.
        """
        create_sql = "CREATE TABLE IF NOT EXISTS {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=acc)
        insert_sql = "INSERT INTO {acc} VALUES (?, ?)".format(acc=acc)
        # IF NOT EXISTS keeps this consistent with the table creation above, so
        # adding further rows for an accession that was already written does
        # not fail on the pre-existing index.
        index_sql = "CREATE INDEX IF NOT EXISTS {acc}_index ON {acc} (kmer)".format(acc=acc)

        self.curs.execute(create_sql)
        self.curs.executemany(insert_sql, data)
        self.conn.commit()

        self.curs.execute(index_sql)
        self.conn.commit()
112
+
113
+ #Class for loading crystal files and prepping them for consumption.
114
class ravenous_crystal_lizard:
    """Read a list of crystal files and write their contents to a new database.

    A crystal, as consumed here, is a JSON document (optionally gzipped when
    its path ends in ".gz") with a "filename" entry and a "protein_data"
    mapping of accession name -> record holding a "kmers" list.

    Args:
        crystal_list: path to a text file with one crystal path per line.
        database: path of the database to build.
        overwrite: when True, an existing file at ``database`` is deleted by
            ``run()``; when False, ``run()`` refuses to touch it.
    """

    def __init__(self, crystal_list, database, overwrite = False):
        self.paths_file = crystal_list
        self.input_paths = None

        # accession name -> {genome index -> int32 kmer array}
        self.crystal_contents = None

        self.accession_index = acc_indexer()

        self.genome_index = None
        self.genome_prot_ct = None
        self.gak = None

        # Existence is sampled once, here, and acted on later by run().
        self.db_already_exists = os.path.exists(database)
        self.overwrite = overwrite
        self.og_db_path = database
        self.db = miga_db_builder(database)

    def consume_list(self):
        """Load crystal paths from ``self.paths_file`` into ``self.input_paths``.

        Surrounding whitespace is stripped from every line. Blank lines are
        skipped so that a trailing newline or spacer line in the list does
        not turn into an attempt to open an empty path.
        """
        with open(self.paths_file) as fh:
            stripped_lines = (line.strip() for line in fh)
            self.input_paths = [path for path in stripped_lines if path]

    @staticmethod
    def _load_crystal(path):
        """Parse one crystal file, transparently handling ".gz" compression."""
        if path.endswith(".gz"):
            with gzip.open(path, "rt", encoding="utf-8") as fh:
                return json.load(fh)
        with open(path, "r") as fh:
            return json.load(fh)

    def consume_crystal_data(self):
        """Read every crystal in ``self.input_paths`` and write the database.

        Genomes are numbered in the order their crystals are listed. All
        crystals are loaded into memory first; the database is then opened,
        populated one accession at a time, and always closed again, even if
        a write fails.

        Raises:
            KeyError: if a crystal lacks "filename"/"protein_data"/"kmers", or
                names an accession missing from the accession index.
        """
        self.crystal_contents = {}
        self.genome_index = []
        self.gak = []

        for current_index, crystal in enumerate(self.input_paths):
            contents = self._load_crystal(crystal)
            filename = contents["filename"]
            protein_data = contents["protein_data"]

            self.genome_index.append((filename, current_index, len(protein_data)))

            for acc, record in protein_data.items():
                # Look the accession up first so an unknown name fails before
                # anything is recorded for it.
                acc_id = self.accession_index.forward[acc]
                kmer_list = np.array(record["kmers"], dtype = np.int32)

                self.gak.append((current_index, acc_id, kmer_list.shape[0]))
                self.crystal_contents.setdefault(acc, {})[current_index] = kmer_list

        self.db.activate()
        try:
            self.db.initialize_metadata()
            self.db.insert_genome_index(self.genome_index)
            self.db.insert_gak(self.gak)

            for acc in self.crystal_contents:
                per_genome = self.crystal_contents[acc]

                # kmer -> genomes table first, then the genome -> kmers table.
                self.db.add_acc_kmers(acc, self.invert_to_kmer_first(per_genome))

                insertable_genomes = [
                    (genome, kmer_array.tobytes())
                    for genome, kmer_array in per_genome.items()
                ]
                self.db.add_acc_genomes(acc, insertable_genomes)

                # Drop the reference so the arrays can be freed once written.
                per_genome = None
                self.crystal_contents[acc] = None
        finally:
            self.db.deactivate()

    #Take a set of genome : kmer_lists and flip them to an equivalent set of kmer : genome_lists
    def invert_to_kmer_first(self, dataset):
        """Invert ``{genome index: kmer array}`` into per-kmer genome lists.

        Args:
            dataset: dict mapping an integer genome index to a 1-D NumPy
                array of kmers (int32 as built by ``consume_crystal_data``).

        Returns:
            A list of ``(kmer, genome_bytes)`` tuples ordered by ascending
            kmer, where ``kmer`` is a Python int and ``genome_bytes`` is the
            ``tobytes()`` form of an int32 array holding, in ascending order,
            the index of every genome containing that kmer. An empty dataset
            yields an empty list.
        """
        if not dataset:
            # np.concatenate rejects an empty sequence; nothing to invert.
            return []

        genomes = np.array(list(dataset.keys()), dtype = np.int32)
        counts = np.array([kmers.shape[0] for kmers in dataset.values()], dtype = np.int32)

        #A 1-d array of all of the kmers for all of the genomes containing this SCP
        all_kmers = np.concatenate(list(dataset.values()))
        #A 1-d array of the same length with the corresp. genome index for each kmer
        all_genomes = np.repeat(genomes, counts)

        #Sort based on kmer, then genome (lexsort treats its LAST key as primary)
        order = np.lexsort((all_genomes, all_kmers))
        all_kmers = all_kmers[order]
        all_genomes = all_genomes[order]

        #Unique kmers plus the offset at which each one's run of genomes begins
        discovered_kmers, run_starts = np.unique(all_kmers, return_index = True)

        #Cut the genome column at every run boundary: one sub-array per kmer
        genomes_per_kmer = np.split(all_genomes, run_starts[1:])

        return [
            (int(kmer), members.tobytes())
            for kmer, members in zip(discovered_kmers, genomes_per_kmer)
        ]

    def run(self):
        """Build the database unless it already exists and overwrite is off."""
        if self.db_already_exists:
            if not self.overwrite:
                print("")
                print("Target database file already exists! I'm quitting.")
                print("Supply a different path or use --overwrite")
                return
            os.remove(self.og_db_path)

        self.consume_list()
        self.consume_crystal_data()
252
+
253
+ #Add options
254
def options(argv = None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: optional list of argument strings to parse. When None (the
            default) argparse reads ``sys.argv[1:]``, exactly as before.

    Returns:
        A ``(parser, args)`` tuple: the ArgumentParser itself (so the caller
        can print help) and the parsed namespace with attributes
        ``crystals``, ``db`` and ``overwrite``.

    Unrecognized arguments are tolerated and discarded rather than treated
    as errors.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description=(
            "Dedicated MiGA db builder module.\n"
            "\n"
            "Takes a file containing a list of paths to crystals and builds a database from those.\n"
            "\n"
            "Notes:\n"
            "\n"
            "Assumes the supplied database path does not exist.\n"
            "Use --overwrite to delete an existing DB under the same path if you want to."
        ),
    )

    parser.add_argument('--crystal_list', dest = 'crystals', default = None, help = 'File containing a list of paths to FastAAI crystals')
    parser.add_argument('--database_path', dest = 'db', default = None, help = 'Path to a NEW database to be built.')

    parser.add_argument('--overwrite', dest = 'overwrite', action = 'store_true', help = 'Delete an existing database at --database_path and create a new one. Otw. quits to preserve existing db.')

    # parse_known_args (not parse_args) so stray extra arguments are ignored.
    args, _unknown_opts = parser.parse_known_args(argv)

    return parser, args
273
+
274
def main():
    """Command-line entry point: validate the arguments and build the database.

    Prints the help text when fewer than two command-line tokens were given.
    If either the crystal list or the database path is missing, prints a
    message and exits with a non-zero status so that calling scripts can
    detect the failure.
    """
    p, a = options()
    crystal_file = a.crystals
    db = a.db
    overwrite = a.overwrite

    if len(sys.argv) < 3:
        p.print_help()

    if crystal_file is None:
        print("I need a file containing a list of paths to crystals")
        sys.exit(1)

    if db is None:
        print("I need a path to an output database")
        sys.exit(1)

    mn = ravenous_crystal_lizard(crystal_list = crystal_file,
                                 database = db,
                                 overwrite = overwrite)
    mn.run()


# Only run the builder when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()