miga-base 1.2.18.2 → 1.3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/cli/action/doctor/base.rb +2 -1
  3. data/lib/miga/cli/action/init.rb +1 -1
  4. data/lib/miga/dataset/result/add.rb +3 -2
  5. data/lib/miga/version.rb +2 -2
  6. data/scripts/essential_genes.bash +4 -8
  7. data/utils/FastAAI/LICENSE +8 -0
  8. data/utils/FastAAI/README.md +151 -40
  9. data/utils/FastAAI/__init__.py +1 -0
  10. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz +0 -0
  11. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz +0 -0
  12. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz +0 -0
  13. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz +0 -0
  14. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz +0 -0
  15. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz +0 -0
  16. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz +0 -0
  17. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz +0 -0
  18. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz +0 -0
  19. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz +0 -0
  20. data/utils/FastAAI/fastaai/__init__.py +1 -0
  21. data/utils/FastAAI/fastaai/fastaai +4805 -0
  22. data/utils/FastAAI/fastaai/fastaai.py +4805 -0
  23. data/utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py +297 -0
  24. data/utils/FastAAI/fastaai/fastaai_miga_preproc.py +931 -0
  25. data/utils/FastAAI/metadata/Accession_names_and_IDs.txt +122 -0
  26. data/utils/distance/commands.rb +51 -23
  27. metadata +23 -6
  28. data/utils/FastAAI/FastAAI +0 -3659
  29. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Archaea_SCG.hmm +0 -0
  30. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Bacteria_SCG.hmm +0 -0
  31. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -0
@@ -0,0 +1,297 @@
1
+ import sys
2
+ import os
3
+
4
+ import gzip
5
+ import argparse
6
+ import json
7
+ import gzip
8
+
9
+ import numpy as np
10
+
11
+ import multiprocessing
12
+ import sqlite3
13
+
14
def convert_array(bytestring):
    """Decode a raw byte blob into a 1-d array of 32-bit integers.

    The bytes are interpreted in place (no copy) as consecutive int32 values.
    """
    decoded = np.frombuffer(bytestring, np.int32)
    return decoded


# Register the decoder for columns whose declared type is "array".
sqlite3.register_converter("array", convert_array)
18
+
19
class acc_indexer:
    """Two-way lookup between accession names and their integer positions.

    ``forward`` maps accession name -> index and ``reverse`` maps
    index -> accession name. Both are built at construction time from the
    fixed accession list held in ``generate_accessions_index``.
    """

    def __init__(self):
        self.forward = None
        self.reverse = None
        self.generate_accessions_index()

    # NOTE: the parameter name ``list`` shadows the builtin; it is kept so that
    # existing keyword-style callers keep working.
    def list_to_index_dict(self, list):
        """Return ``{item: position}`` for every item of the given sequence."""
        return {item: position for position, item in enumerate(list)}

    def rev_list_to_index_dict(self, list):
        """Return ``{position: item}`` for every item of the given sequence."""
        return dict(enumerate(list))

    def generate_accessions_index(self):
        """Populate ``self.forward`` and ``self.reverse`` from the fixed accession list.

        The position of an accession in this list is its integer id, so the
        order of the entries must not be changed.
        """
        acc_list = ['PF01780_19', 'PF03948_14', 'PF17144_4', 'PF00830_19', 'PF00347_23', 'PF16906_5', 'PF13393_6',
        'PF02565_15', 'PF01991_18', 'PF01984_20', 'PF00861_22', 'PF13656_6', 'PF00368_18', 'PF01142_18', 'PF00312_22', 'PF02367_17',
        'PF01951_16', 'PF00749_21', 'PF01655_18', 'PF00318_20', 'PF01813_17', 'PF01649_18', 'PF01025_19', 'PF00380_19', 'PF01282_19',
        'PF01864_17', 'PF01783_23', 'PF01808_18', 'PF01982_16', 'PF01715_17', 'PF00213_18', 'PF00119_20', 'PF00573_22', 'PF01981_16',
        'PF00281_19', 'PF00584_20', 'PF00825_18', 'PF00406_22', 'PF00177_21', 'PF01192_22', 'PF05833_11', 'PF02699_15', 'PF01016_19',
        'PF01765_19', 'PF00453_18', 'PF01193_24', 'PF05221_17', 'PF00231_19', 'PF00416_22', 'PF02033_18', 'PF01668_18', 'PF00886_19',
        'PF00252_18', 'PF00572_18', 'PF00366_20', 'PF04104_14', 'PF04919_12', 'PF01912_18', 'PF00276_20', 'PF00203_21', 'PF00889_19',
        'PF02996_17', 'PF00121_18', 'PF01990_17', 'PF00344_20', 'PF00297_22', 'PF01196_19', 'PF01194_17', 'PF01725_16', 'PF00750_19',
        'PF00338_22', 'PF00238_19', 'PF01200_18', 'PF00162_19', 'PF00181_23', 'PF01866_17', 'PF00709_21', 'PF02006_16', 'PF00164_25',
        'PF00237_19', 'PF01139_17', 'PF01351_18', 'PF04010_13', 'PF06093_13', 'PF00828_19', 'PF02410_15', 'PF01176_19', 'PF02130_17',
        'PF01948_18', 'PF01195_19', 'PF01746_21', 'PF01667_17', 'PF03874_16', 'PF01090_19', 'PF01198_19', 'PF01250_17', 'PF17136_4',
        'PF06026_14', 'PF03652_15', 'PF04019_12', 'PF01201_22', 'PF00832_20', 'PF01264_21', 'PF03840_14', 'PF00831_23', 'PF00189_20',
        'PF02601_15', 'PF01496_19', 'PF00411_19', 'PF00334_19', 'PF00687_21', 'PF01157_18', 'PF01245_20', 'PF01994_16', 'PF01632_19',
        'PF00827_17', 'PF01015_18', 'PF00829_21', 'PF00410_19', 'PF00833_18', 'PF00935_19', 'PF01992_16']

        self.forward = self.list_to_index_dict(acc_list)
        self.reverse = self.rev_list_to_index_dict(acc_list)
59
+
60
class miga_db_builder:
    """Writer for the SQLite database built from crystal files.

    The object holds a database path plus a lazily opened connection/cursor
    pair. Call ``activate`` before any write method and ``deactivate`` when
    finished. Every insert method commits before returning.
    """

    def __init__(self, path):
        self.path = path
        self.conn = None
        self.curs = None

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` as a double-quoted SQL identifier (embedded quotes doubled)."""
        return '"' + str(name).replace('"', '""') + '"'

    def activate(self):
        """Open the connection to ``self.path`` and create a cursor on it."""
        self.conn = sqlite3.connect(self.path)
        self.curs = self.conn.cursor()

    def deactivate(self):
        """Close the cursor and connection; safe to call when nothing is open."""
        if self.curs is not None:
            self.curs.close()
        if self.conn is not None:
            self.conn.close()
        self.conn = None
        self.curs = None

    def initialize_metadata(self):
        """Create the two metadata tables and the lookup index if they are missing."""
        self.curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
        self.curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")
        self.curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")

    def insert_genome_index(self, gi):
        """Insert ``(genome, gen_id, protein_count)`` rows into ``genome_index``."""
        self.curs.executemany("INSERT INTO genome_index VALUES (?, ?, ?)", gi)
        self.conn.commit()

    def insert_gak(self, gak):
        """Insert ``(genome, accession, count)`` rows into ``genome_acc_kmer_counts``."""
        self.curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?, ?, ?)", gak)
        self.conn.commit()

    def add_acc_genomes(self, acc, data):
        """Create ``<acc>_genomes`` if needed and insert ``(genome, kmers_blob)`` rows."""
        # Table names cannot be bound as parameters, so quote them explicitly.
        table = self._quote_identifier("{acc}_genomes".format(acc = acc))

        create_sql = "CREATE TABLE IF NOT EXISTS {table} (genome INTEGER PRIMARY KEY, kmers array)"
        self.curs.execute(create_sql.format(table = table))

        insert_sql = "INSERT INTO {table} VALUES (?, ?)"
        self.curs.executemany(insert_sql.format(table = table), data)

        self.conn.commit()

    def add_acc_kmers(self, acc, data):
        """Create ``<acc>`` if needed, insert ``(kmer, genomes_blob)`` rows and index them."""
        table = self._quote_identifier(acc)
        index = self._quote_identifier("{acc}_index".format(acc = acc))

        create_sql = "CREATE TABLE IF NOT EXISTS {table} (kmer INTEGER PRIMARY KEY, genomes array)"
        self.curs.execute(create_sql.format(table = table))

        insert_sql = "INSERT INTO {table} VALUES (?, ?)"
        self.curs.executemany(insert_sql.format(table = table), data)

        self.conn.commit()

        # IF NOT EXISTS mirrors the CREATE TABLE above so that a second call
        # for the same accession does not fail on the already-present index.
        index_sql = "CREATE INDEX IF NOT EXISTS {index} ON {table} (kmer)"
        self.curs.execute(index_sql.format(index = index, table = table))
        self.conn.commit()
112
+
113
#Class for loading crystal files and prepping them for consumption.
class ravenous_crystal_lizard:
    """Load a list of crystal (JSON) files and write their contents to a database.

    Parameters
    ----------
    crystal_list : path to a text file with one crystal path per line.
    database : path of the SQLite database to build.
    overwrite : when True, an existing file at ``database`` is deleted by ``run``.

    Whether the database file exists is checked once, at construction time.
    """

    def __init__(self, crystal_list, database, overwrite = False):
        self.paths_file = crystal_list
        self.input_paths = None

        self.crystal_contents = None

        self.accession_index = acc_indexer()

        self.genome_index = None
        self.genome_prot_ct = None
        self.gak = None

        self.db_already_exists = os.path.exists(database)
        self.overwrite = overwrite
        self.og_db_path = database
        self.db = miga_db_builder(database)

    def consume_list(self):
        """Read ``self.paths_file`` into ``self.input_paths``, one stripped path per entry."""
        with open(self.paths_file) as fh:
            stripped = (line.strip() for line in fh)
            # Blank lines (e.g. a trailing newline) are not paths; skip them
            # instead of trying to open "" later.
            self.input_paths = [path for path in stripped if path]

    @staticmethod
    def _load_crystal(path):
        """Parse one crystal file (gzip-compressed when it ends in ".gz") as UTF-8 JSON."""
        opener = gzip.open if path.endswith(".gz") else open
        with opener(path, "rt", encoding = "utf-8") as fh:
            return json.load(fh)

    def consume_crystal_data(self):
        """Load every crystal in ``self.input_paths`` and write all tables to the database.

        Each crystal is assigned an integer genome id equal to its position in
        the input list. An accession that is not present in the accession index
        raises ``KeyError``. The database connection is closed even when a
        write fails.
        """
        self.crystal_contents = {}
        self.genome_index = []
        self.gak = []

        for current_index, crystal in enumerate(self.input_paths):
            payload = self._load_crystal(crystal)
            filename = payload["filename"]
            protein_data = payload["protein_data"]

            self.genome_index.append((filename, current_index, len(protein_data),))

            for acc, record in protein_data.items():
                acc_id = self.accession_index.forward[acc]

                kmer_list = np.array(record["kmers"], dtype = np.int32)
                self.gak.append((current_index, acc_id, kmer_list.shape[0],))

                self.crystal_contents.setdefault(acc, {})[current_index] = kmer_list

        self.db.activate()
        try:
            self.db.initialize_metadata()
            self.db.insert_genome_index(self.genome_index)
            self.db.insert_gak(self.gak)

            for acc in self.crystal_contents:
                per_genome = self.crystal_contents[acc]

                # kmer -> genomes table first, then the genome -> kmers table.
                self.db.add_acc_kmers(acc, self.invert_to_kmer_first(per_genome))

                insertable_genomes = [(genome, kmer_array.tobytes(),) for genome, kmer_array in per_genome.items()]
                self.db.add_acc_genomes(acc, insertable_genomes)

                # Drop the reference so the arrays can be freed before the next accession.
                per_genome = None
                self.crystal_contents[acc] = None
        finally:
            self.db.deactivate()

    #Take a set of genome : kmer_lists and flip them to an equivalent set of kmer : genome_lists
    def invert_to_kmer_first(self, dataset):
        """Turn ``{genome: kmer_array}`` into a list of ``(kmer, genomes_bytes)`` rows.

        Rows are ordered by ascending kmer. ``genomes_bytes`` is the ``tobytes()``
        dump of the ascending genome ids that contain that kmer. An empty
        ``dataset`` yields an empty list.
        """
        if not dataset:
            return []

        genomes = np.array(list(dataset), dtype = np.int32)
        kmer_arrays = list(dataset.values())
        counts = np.array([kmers.shape[0] for kmers in kmer_arrays], dtype = np.int32)

        kmer_unlist = np.concatenate(kmer_arrays) #A 1-d array of all of the kmers for all of the genomes
        counted_gens = np.repeat(genomes, counts) #Same length as kmer_unlist: the genome index owning each kmer

        #Row 0 holds kmers, row 1 the matching genome index, aligned 1 to 1
        formatted_pairs = np.vstack([kmer_unlist, counted_gens])
        kmer_unlist = None
        counted_gens = None

        #Sort based on kmer, then genome (lexsort uses its LAST key as the primary one)
        sorted_indices = np.lexsort((formatted_pairs[1, :], formatted_pairs[0, :]))
        formatted_pairs = formatted_pairs[:, sorted_indices]

        #One pass yields both the ordered unique kmers and where each kmer's run starts
        discovered_kmers, run_starts = np.unique(formatted_pairs[0, :], return_index = True)

        #Cut the genome row at every run start to get the genomes associated with each kmer
        genome_groups = np.split(formatted_pairs[1, :], run_starts[1:])

        return [(int(kmer), group.tobytes(),) for kmer, group in zip(discovered_kmers, genome_groups)]

    def run(self):
        """Build the database, unless it already exists and ``overwrite`` is off."""
        if self.db_already_exists:
            if not self.overwrite:
                print("")
                print("Target database file already exists! I'm quitting.")
                print("Supply a different path or use --overwrite")
                return
            os.remove(self.og_db_path)

        self.consume_list()
        self.consume_crystal_data()
252
+
253
#Add options
def options():
    """Build the argument parser and parse the current command line.

    Returns a ``(parser, args)`` pair. Arguments the parser does not recognise
    are accepted and discarded rather than treated as errors.
    """
    # NOTE(review): the description's internal whitespace is reproduced as
    # displayed; confirm against the packaged file if exact help layout matters.
    help_text = '''Dedicated MiGA db builder module.

Takes a file containing a list of paths to crystals and builds a database from those.

Notes:

Assumes the supplied database path does not exist.
Use --overwrite to delete an existing DB under the same path if you want to.'''

    parser = argparse.ArgumentParser(description = help_text,
            formatter_class = argparse.RawTextHelpFormatter)

    parser.add_argument('--crystal_list', dest = 'crystals', default = None,
            help = 'File containing a list of paths to FastAAI crystals')
    parser.add_argument('--database_path', dest = 'db', default = None,
            help = 'Path to a NEW database to be built.')
    parser.add_argument('--overwrite', dest = 'overwrite', action = 'store_true',
            help = 'Delete an existing database at --database_path and create a new one. Otw. quits to preserve existing db.')

    # parse_known_args returns (namespace, leftovers); only the namespace is used.
    args = parser.parse_known_args()[0]

    return parser, args
273
+
274
def main():
    """Command-line entry point: validate the arguments and build the database.

    Prints the help text when fewer than two command-line tokens were given.
    Exits with status 1 when either the crystal list or the database path is
    missing, so that calling scripts can detect the failure.
    """
    p, a = options()
    crystal_file = a.crystals
    db = a.db
    overwrite = a.overwrite

    if len(sys.argv) < 3:
        p.print_help()

    if crystal_file is None:
        print("I need a file containing a list of paths to crystals")
        # Non-zero status: a bare sys.exit() would report success.
        sys.exit(1)

    if db is None:
        print("I need a path to an output database")
        sys.exit(1)

    mn = ravenous_crystal_lizard(crystal_list = crystal_file,
                                database = db,
                                overwrite = overwrite)
    mn.run()


if __name__ == "__main__":
    main()