miga-base 1.2.18.1 → 1.3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/doctor/base.rb +2 -1
- data/lib/miga/cli/action/init.rb +1 -1
- data/lib/miga/dataset/result/add.rb +3 -2
- data/lib/miga/lair.rb +9 -3
- data/lib/miga/version.rb +2 -2
- data/scripts/essential_genes.bash +4 -8
- data/utils/FastAAI/LICENSE +8 -0
- data/utils/FastAAI/README.md +151 -40
- data/utils/FastAAI/__init__.py +1 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz +0 -0
- data/utils/FastAAI/fastaai/__init__.py +1 -0
- data/utils/FastAAI/fastaai/fastaai +4805 -0
- data/utils/FastAAI/fastaai/fastaai.py +4805 -0
- data/utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py +297 -0
- data/utils/FastAAI/fastaai/fastaai_miga_preproc.py +931 -0
- data/utils/FastAAI/metadata/Accession_names_and_IDs.txt +122 -0
- data/utils/distance/commands.rb +51 -23
- metadata +23 -6
- data/utils/FastAAI/FastAAI +0 -3659
- /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Archaea_SCG.hmm +0 -0
- /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Bacteria_SCG.hmm +0 -0
- /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -0
@@ -0,0 +1,297 @@
|
|
1
|
+
import sys
|
2
|
+
import os
|
3
|
+
|
4
|
+
import gzip
|
5
|
+
import argparse
|
6
|
+
import json
|
7
|
+
import gzip
|
8
|
+
|
9
|
+
import numpy as np
|
10
|
+
|
11
|
+
import multiprocessing
|
12
|
+
import sqlite3
|
13
|
+
|
14
|
+
def convert_array(bytestring):
    """Decode a raw byte string into a 1-D numpy array of 32-bit integers.

    This is the inverse of calling ``.tobytes()`` on an ``np.int32`` array.
    The returned array is a view over the input buffer, not a copy.
    """
    decoded = np.frombuffer(bytestring, dtype=np.int32)
    return decoded
|
16
|
+
|
17
|
+
# Register convert_array as the decoder for columns declared with type "array".
# NOTE: sqlite3 only applies registered converters on connections opened with
# detect_types=sqlite3.PARSE_DECLTYPES (or PARSE_COLNAMES).
sqlite3.register_converter("array", convert_array)
|
18
|
+
|
19
|
+
class acc_indexer:
    """Two-way lookup between accession names and integer indices.

    Both attributes are populated by the constructor:
      forward -- dict mapping accession name -> integer index
      reverse -- dict mapping integer index -> accession name

    An accession's index is its position in the hard-coded list found in
    generate_accessions_index().
    """

    def __init__(self):
        self.forward = None
        self.reverse = None
        self.generate_accessions_index()

    def list_to_index_dict(self, list):
        """Return a dict mapping each item to its position in the sequence.

        If an item occurs more than once, its last position wins.
        The parameter name shadows the builtin; it is kept unchanged so that
        existing callers are unaffected.
        """
        return {item: position for position, item in enumerate(list)}

    def rev_list_to_index_dict(self, list):
        """Return a dict mapping each position in the sequence to its item."""
        return dict(enumerate(list))

    def generate_accessions_index(self):
        """Build self.forward and self.reverse from the fixed accession list."""
        # The order of this list defines the integer IDs, so it must not be
        # reordered.
        acc_list = ['PF01780_19', 'PF03948_14', 'PF17144_4', 'PF00830_19', 'PF00347_23', 'PF16906_5', 'PF13393_6',
        'PF02565_15', 'PF01991_18', 'PF01984_20', 'PF00861_22', 'PF13656_6', 'PF00368_18', 'PF01142_18', 'PF00312_22', 'PF02367_17',
        'PF01951_16', 'PF00749_21', 'PF01655_18', 'PF00318_20', 'PF01813_17', 'PF01649_18', 'PF01025_19', 'PF00380_19', 'PF01282_19',
        'PF01864_17', 'PF01783_23', 'PF01808_18', 'PF01982_16', 'PF01715_17', 'PF00213_18', 'PF00119_20', 'PF00573_22', 'PF01981_16',
        'PF00281_19', 'PF00584_20', 'PF00825_18', 'PF00406_22', 'PF00177_21', 'PF01192_22', 'PF05833_11', 'PF02699_15', 'PF01016_19',
        'PF01765_19', 'PF00453_18', 'PF01193_24', 'PF05221_17', 'PF00231_19', 'PF00416_22', 'PF02033_18', 'PF01668_18', 'PF00886_19',
        'PF00252_18', 'PF00572_18', 'PF00366_20', 'PF04104_14', 'PF04919_12', 'PF01912_18', 'PF00276_20', 'PF00203_21', 'PF00889_19',
        'PF02996_17', 'PF00121_18', 'PF01990_17', 'PF00344_20', 'PF00297_22', 'PF01196_19', 'PF01194_17', 'PF01725_16', 'PF00750_19',
        'PF00338_22', 'PF00238_19', 'PF01200_18', 'PF00162_19', 'PF00181_23', 'PF01866_17', 'PF00709_21', 'PF02006_16', 'PF00164_25',
        'PF00237_19', 'PF01139_17', 'PF01351_18', 'PF04010_13', 'PF06093_13', 'PF00828_19', 'PF02410_15', 'PF01176_19', 'PF02130_17',
        'PF01948_18', 'PF01195_19', 'PF01746_21', 'PF01667_17', 'PF03874_16', 'PF01090_19', 'PF01198_19', 'PF01250_17', 'PF17136_4',
        'PF06026_14', 'PF03652_15', 'PF04019_12', 'PF01201_22', 'PF00832_20', 'PF01264_21', 'PF03840_14', 'PF00831_23', 'PF00189_20',
        'PF02601_15', 'PF01496_19', 'PF00411_19', 'PF00334_19', 'PF00687_21', 'PF01157_18', 'PF01245_20', 'PF01994_16', 'PF01632_19',
        'PF00827_17', 'PF01015_18', 'PF00829_21', 'PF00410_19', 'PF00833_18', 'PF00935_19', 'PF01992_16']

        self.forward = self.list_to_index_dict(acc_list)
        self.reverse = self.rev_list_to_index_dict(acc_list)
|
59
|
+
|
60
|
+
class miga_db_builder:
    """Small helper that writes the tables of a SQLite database file.

    Call activate() before any other method and deactivate() when done.

    Table names in add_acc_genomes() and add_acc_kmers() are built from the
    accession name with string formatting, because SQL identifiers cannot be
    bound as parameters. Callers must therefore only pass trusted accession
    names that are valid SQL identifiers.
    """

    def __init__(self, path):
        self.path = path
        self.conn = None
        self.curs = None

    def activate(self):
        """Open the connection to self.path and create a cursor on it."""
        self.conn = sqlite3.connect(self.path)
        self.curs = self.conn.cursor()

    def deactivate(self):
        """Close the cursor and the connection, if any, and forget them.

        Safe to call when the builder was never activated or was already
        deactivated.
        """
        if self.curs is not None:
            self.curs.close()
        if self.conn is not None:
            self.conn.close()
        self.conn = None
        self.curs = None

    def initialize_metadata(self):
        """Create the two metadata tables and their index if they are missing."""
        self.curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
        self.curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")
        self.curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")

    def insert_genome_index(self, gi):
        """Insert (genome, gen_id, protein_count) rows into genome_index and commit."""
        self.curs.executemany("INSERT INTO genome_index VALUES (?, ?, ?)", gi)
        self.conn.commit()

    def insert_gak(self, gak):
        """Insert (genome, accession, count) rows into genome_acc_kmer_counts and commit."""
        self.curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?, ?, ?)", gak)
        self.conn.commit()

    def add_acc_genomes(self, acc, data):
        """Create the table <acc>_genomes if needed, insert rows, and commit.

        data is an iterable of (genome, kmers) rows, one per genome.
        """
        create_sql = "CREATE TABLE IF NOT EXISTS {acc}_genomes (genome INTEGER PRIMARY KEY, kmers array)".format(acc=acc)
        insert_sql = "INSERT INTO {acc}_genomes VALUES (?, ?)".format(acc=acc)

        self.curs.execute(create_sql)
        self.curs.executemany(insert_sql, data)
        self.conn.commit()

    def add_acc_kmers(self, acc, data):
        """Create the table <acc> if needed, insert rows, index it, and commit.

        data is an iterable of (kmer, genomes) rows, one per kmer.
        """
        create_sql = "CREATE TABLE IF NOT EXISTS {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=acc)
        insert_sql = "INSERT INTO {acc} VALUES (?, ?)".format(acc=acc)
        # IF NOT EXISTS matches the other DDL statements, so a later call for
        # the same accession does not fail after its rows were committed.
        index_sql = "CREATE INDEX IF NOT EXISTS {acc}_index ON {acc} (kmer)".format(acc=acc)

        self.curs.execute(create_sql)
        self.curs.executemany(insert_sql, data)
        self.conn.commit()

        self.curs.execute(index_sql)
        self.conn.commit()
|
112
|
+
|
113
|
+
#Class for loading crystal files and prepping them for consumption.
|
114
|
+
class ravenous_crystal_lizard:
    """Read a list of crystal files and write their contents to a database.

    Each crystal is a JSON document, optionally gzip-compressed, with a
    "filename" entry and a "protein_data" entry. "protein_data" maps an
    accession name to a record whose "kmers" entry is a list of integers.

    Parameters:
      crystal_list -- path to a text file with one crystal path per line
      database     -- path of the SQLite database to build
      overwrite    -- if True, run() deletes an existing file at database
    """

    def __init__(self, crystal_list, database, overwrite = False):
        self.paths_file = crystal_list
        self.input_paths = None

        self.crystal_contents = None

        self.accession_index = acc_indexer()

        self.genome_index = None
        self.genome_prot_ct = None
        self.gak = None

        # Recorded at construction time; run() acts on this snapshot.
        self.db_already_exists = os.path.exists(database)
        self.overwrite = overwrite
        self.og_db_path = database
        self.db = miga_db_builder(database)

    def consume_list(self):
        """Read self.paths_file into self.input_paths, one path per line.

        Surrounding whitespace is stripped and blank lines are skipped, so a
        trailing empty line is not treated as a path.
        """
        with open(self.paths_file) as fh:
            stripped = (line.strip() for line in fh)
            self.input_paths = [path for path in stripped if path]

    def _load_crystal(self, path):
        """Return the parsed JSON of one crystal; '.gz' paths are decompressed."""
        opener = gzip.open if path.endswith(".gz") else open
        # Both branches read UTF-8 text so they decode identically.
        with opener(path, "rt", encoding = "utf-8") as fh:
            return json.load(fh)

    def consume_crystal_data(self):
        """Load every crystal in self.input_paths and write the database.

        Genomes are numbered in input order starting at 0. An accession that
        is missing from the accession index raises KeyError.
        """
        self.crystal_contents = {}
        self.genome_index = []
        self.gak = []

        for current_index, crystal in enumerate(self.input_paths):
            loaded = self._load_crystal(crystal)
            filename = loaded["filename"]
            protein_data = loaded["protein_data"]

            self.genome_index.append((filename, current_index, len(protein_data),))

            for acc, record in protein_data.items():
                acc_id = self.accession_index.forward[acc]
                kmer_list = np.array(record["kmers"], dtype = np.int32)

                self.gak.append((current_index, acc_id, kmer_list.shape[0],))
                self.crystal_contents.setdefault(acc, {})[current_index] = kmer_list

        self.db.activate()
        try:
            self.db.initialize_metadata()
            self.db.insert_genome_index(self.genome_index)
            self.db.insert_gak(self.gak)

            for acc in self.crystal_contents:
                per_genome = self.crystal_contents[acc]

                self.db.add_acc_kmers(acc, self.invert_to_kmer_first(per_genome))

                insertable_genomes = [(genome, kmer_array.tobytes(),)
                                      for genome, kmer_array in per_genome.items()]
                self.db.add_acc_genomes(acc, insertable_genomes)

                # Drop the references so the arrays can be freed before the
                # next accession is processed.
                per_genome = None
                self.crystal_contents[acc] = None
        finally:
            # Close the connection even if one of the inserts failed.
            self.db.deactivate()

    #Take a set of genome : kmer_lists and flip them to an equivalent set of kmer : genome_lists
    def invert_to_kmer_first(self, dataset):
        """Turn {genome index: kmer array} into a list of (kmer, genomes) rows.

        Returns one tuple per distinct kmer, ordered by kmer. The first item
        is the kmer as a Python int; the second is the bytes of an int32
        array holding the sorted genome indices that contain that kmer.
        An empty dataset yields an empty list.
        """
        if not dataset:
            return []

        genomes = np.array(list(dataset), dtype = np.int32)
        counts = np.array([dataset[g].shape[0] for g in dataset], dtype = np.int32)

        # All kmers of all genomes in one flat array, and next to it the
        # genome index each of those kmers came from.
        kmers = np.concatenate(list(dataset.values())).astype(np.int32, copy = False)
        owners = np.repeat(genomes, counts)

        # Sort by kmer first, then by genome index.
        order = np.lexsort((owners, kmers))
        kmers = kmers[order]
        owners = owners[order]

        # first_seen marks where each distinct kmer starts in the sorted
        # array; splitting there groups the genome indices per kmer.
        discovered_kmers, first_seen = np.unique(kmers, return_index = True)
        grouped_genomes = np.split(owners, first_seen[1:])

        return [(int(kmer), group.tobytes(),)
                for kmer, group in zip(discovered_kmers, grouped_genomes)]

    def run(self):
        """Build the database, unless it already exists and overwrite is off."""
        if self.db_already_exists:
            if not self.overwrite:
                print("")
                print("Target database file already exists! I'm quitting.")
                print("Supply a different path or use --overwrite")
                return
            os.remove(self.og_db_path)

        self.consume_list()
        self.consume_crystal_data()
|
252
|
+
|
253
|
+
#Add options
|
254
|
+
def options(argv = None):
    """Build the command-line parser and parse the arguments.

    Parameters:
      argv -- optional list of argument strings to parse; when None, the
              arguments are taken from sys.argv[1:]

    Returns a (parser, args) tuple. Options the parser does not recognize
    are ignored rather than treated as errors.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
            description='''Dedicated MiGA db builder module.

Takes a file containing a list of paths to crystals and builds a database from those.

Notes:

Assumes the supplied database path does not exist.
Use --overwrite to delete an existing DB under the same path if you want to.''')

    parser.add_argument('--crystal_list', dest = 'crystals', default = None, help = 'File containing a list of paths to FastAAI crystals')
    parser.add_argument('--database_path', dest = 'db', default = None, help = 'Path to a NEW database to be built.')

    parser.add_argument('--overwrite', dest = 'overwrite', action = 'store_true', help = 'Delete an existing database at --database_path and create a new one. Otw. quits to preserve existing db.')

    # parse_known_args tolerates extra options; the leftovers are discarded.
    args, _ = parser.parse_known_args(argv)

    return parser, args
|
273
|
+
|
274
|
+
def main():
    """Command-line entry point: parse the options and build the database.

    Prints the help text when fewer than two command-line arguments were
    given. Exits with status 1 when the crystal list or the database path
    is missing, so that calling scripts can detect the failure.
    """
    p, a = options()

    if len(sys.argv) < 3:
        p.print_help()

    if a.crystals is None:
        print("I need a file containing a list of paths to crystals")
        sys.exit(1)

    if a.db is None:
        print("I need a path to an output database")
        sys.exit(1)

    mn = ravenous_crystal_lizard(crystal_list = a.crystals,
                                 database = a.db,
                                 overwrite = a.overwrite)
    mn.run()
|
295
|
+
|
296
|
+
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
|