miga-base 1.2.18.2 → 1.3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/doctor/base.rb +2 -1
- data/lib/miga/cli/action/init.rb +1 -1
- data/lib/miga/dataset/result/add.rb +3 -2
- data/lib/miga/version.rb +2 -2
- data/scripts/essential_genes.bash +4 -8
- data/utils/FastAAI/LICENSE +8 -0
- data/utils/FastAAI/README.md +151 -40
- data/utils/FastAAI/__init__.py +1 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz +0 -0
- data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz +0 -0
- data/utils/FastAAI/fastaai/__init__.py +1 -0
- data/utils/FastAAI/fastaai/fastaai +4805 -0
- data/utils/FastAAI/fastaai/fastaai.py +4805 -0
- data/utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py +297 -0
- data/utils/FastAAI/fastaai/fastaai_miga_preproc.py +931 -0
- data/utils/FastAAI/metadata/Accession_names_and_IDs.txt +122 -0
- data/utils/distance/commands.rb +51 -23
- metadata +23 -6
- data/utils/FastAAI/FastAAI +0 -3659
- /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Archaea_SCG.hmm +0 -0
- /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Bacteria_SCG.hmm +0 -0
- /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -0
@@ -0,0 +1,297 @@
|
|
1
|
+
import sys
|
2
|
+
import os
|
3
|
+
|
4
|
+
import gzip
|
5
|
+
import argparse
|
6
|
+
import json
|
7
|
+
import gzip
|
8
|
+
|
9
|
+
import numpy as np
|
10
|
+
|
11
|
+
import multiprocessing
|
12
|
+
import sqlite3
|
13
|
+
|
14
|
+
def convert_array(bytestring):
    """Reinterpret a raw byte buffer as a 1-D array of 32-bit integers.

    Args:
        bytestring: Bytes-like object whose length is a multiple of 4.

    Returns:
        A numpy ``int32`` array viewing the supplied buffer.
    """
    decoded = np.frombuffer(bytestring, dtype=np.int32)
    return decoded
|
16
|
+
|
17
|
+
# Register convert_array as the sqlite3 converter for columns whose declared
# type is "array".
# NOTE(review): sqlite3 only applies registered converters on connections
# opened with a detect_types flag; the sqlite3.connect() call in this file
# passes none, so this registration may have no effect here -- confirm intent.
sqlite3.register_converter("array", convert_array)
|
18
|
+
|
19
|
+
class acc_indexer:
    """Two-way lookup between accession names and their integer IDs.

    After construction, ``forward`` maps accession name -> position in the
    built-in accession list, and ``reverse`` maps position -> accession name.
    """

    def __init__(self):
        self.forward = None
        self.reverse = None
        self.generate_accessions_index()

    def list_to_index_dict(self, list):
        """Return a dict mapping each item of *list* to its position.

        The parameter name shadows the builtin; it is kept unchanged so that
        existing keyword callers keep working. If an item occurs more than
        once, its last position wins.
        """
        return {item: position for position, item in enumerate(list)}

    def rev_list_to_index_dict(self, list):
        """Return a dict mapping each position in *list* to the item there."""
        return dict(enumerate(list))

    def generate_accessions_index(self):
        """Populate ``forward`` and ``reverse`` from the fixed accession list.

        The order of this list defines the integer ID of every accession, so
        entries must not be reordered.
        """
        acc_list = [
            'PF01780_19', 'PF03948_14', 'PF17144_4', 'PF00830_19', 'PF00347_23', 'PF16906_5', 'PF13393_6',
            'PF02565_15', 'PF01991_18', 'PF01984_20', 'PF00861_22', 'PF13656_6', 'PF00368_18', 'PF01142_18', 'PF00312_22', 'PF02367_17',
            'PF01951_16', 'PF00749_21', 'PF01655_18', 'PF00318_20', 'PF01813_17', 'PF01649_18', 'PF01025_19', 'PF00380_19', 'PF01282_19',
            'PF01864_17', 'PF01783_23', 'PF01808_18', 'PF01982_16', 'PF01715_17', 'PF00213_18', 'PF00119_20', 'PF00573_22', 'PF01981_16',
            'PF00281_19', 'PF00584_20', 'PF00825_18', 'PF00406_22', 'PF00177_21', 'PF01192_22', 'PF05833_11', 'PF02699_15', 'PF01016_19',
            'PF01765_19', 'PF00453_18', 'PF01193_24', 'PF05221_17', 'PF00231_19', 'PF00416_22', 'PF02033_18', 'PF01668_18', 'PF00886_19',
            'PF00252_18', 'PF00572_18', 'PF00366_20', 'PF04104_14', 'PF04919_12', 'PF01912_18', 'PF00276_20', 'PF00203_21', 'PF00889_19',
            'PF02996_17', 'PF00121_18', 'PF01990_17', 'PF00344_20', 'PF00297_22', 'PF01196_19', 'PF01194_17', 'PF01725_16', 'PF00750_19',
            'PF00338_22', 'PF00238_19', 'PF01200_18', 'PF00162_19', 'PF00181_23', 'PF01866_17', 'PF00709_21', 'PF02006_16', 'PF00164_25',
            'PF00237_19', 'PF01139_17', 'PF01351_18', 'PF04010_13', 'PF06093_13', 'PF00828_19', 'PF02410_15', 'PF01176_19', 'PF02130_17',
            'PF01948_18', 'PF01195_19', 'PF01746_21', 'PF01667_17', 'PF03874_16', 'PF01090_19', 'PF01198_19', 'PF01250_17', 'PF17136_4',
            'PF06026_14', 'PF03652_15', 'PF04019_12', 'PF01201_22', 'PF00832_20', 'PF01264_21', 'PF03840_14', 'PF00831_23', 'PF00189_20',
            'PF02601_15', 'PF01496_19', 'PF00411_19', 'PF00334_19', 'PF00687_21', 'PF01157_18', 'PF01245_20', 'PF01994_16', 'PF01632_19',
            'PF00827_17', 'PF01015_18', 'PF00829_21', 'PF00410_19', 'PF00833_18', 'PF00935_19', 'PF01992_16',
        ]

        self.forward = self.list_to_index_dict(acc_list)
        self.reverse = self.rev_list_to_index_dict(acc_list)
|
59
|
+
|
60
|
+
class miga_db_builder:
    """Writer for the SQLite database file at ``path``.

    Call ``activate()`` before any other method and ``deactivate()`` when
    done. Every insert method commits before returning.
    """

    def __init__(self, path):
        self.path = path
        self.conn = None
        self.curs = None

    def activate(self):
        """Open a connection to ``self.path`` and create a cursor on it."""
        self.conn = sqlite3.connect(self.path)
        self.curs = self.conn.cursor()

    def deactivate(self):
        """Close the cursor and connection; safe to call when not active."""
        if self.curs is not None:
            self.curs.close()
        if self.conn is not None:
            self.conn.close()
        self.conn = None
        self.curs = None

    def initialize_metadata(self):
        """Create the two metadata tables and their index if missing."""
        self.curs.execute("CREATE TABLE IF NOT EXISTS genome_index (genome text, gen_id integer, protein_count integer)")
        self.curs.execute("CREATE TABLE IF NOT EXISTS genome_acc_kmer_counts (genome integer, accession integer, count integer)")
        self.curs.execute("CREATE INDEX IF NOT EXISTS kmer_acc ON genome_acc_kmer_counts (genome, accession);")

    def insert_genome_index(self, gi):
        """Insert 3-value rows from *gi* into ``genome_index`` and commit."""
        self.curs.executemany("INSERT INTO genome_index VALUES (?, ?, ?)", gi)
        self.conn.commit()

    def insert_gak(self, gak):
        """Insert 3-value rows from *gak* into ``genome_acc_kmer_counts`` and commit."""
        self.curs.executemany("INSERT INTO genome_acc_kmer_counts VALUES (?, ?, ?)", gak)
        self.conn.commit()

    def add_acc_genomes(self, acc, data):
        """Create ``{acc}_genomes`` if needed and insert (genome, kmers) rows.

        *acc* is interpolated into the SQL text as a table name (identifiers
        cannot be bound as parameters), so it must be a trusted identifier.
        """
        table = "{acc}_genomes".format(acc=acc)
        self.curs.execute(
            "CREATE TABLE IF NOT EXISTS {table} (genome INTEGER PRIMARY KEY, kmers array)".format(table=table)
        )
        self.curs.executemany("INSERT INTO {table} VALUES (?, ?)".format(table=table), data)
        self.conn.commit()

    def add_acc_kmers(self, acc, data):
        """Create table ``{acc}`` if needed and insert (kmer, genomes) rows.

        *acc* is interpolated into the SQL text as a table name (identifiers
        cannot be bound as parameters), so it must be a trusted identifier.
        """
        self.curs.execute(
            "CREATE TABLE IF NOT EXISTS {acc} (kmer INTEGER PRIMARY KEY, genomes array)".format(acc=acc)
        )
        self.curs.executemany("INSERT INTO {acc} VALUES (?, ?)".format(acc=acc), data)
        self.conn.commit()

        # IF NOT EXISTS matches the other DDL here; without it a second call
        # for the same accession fails with "index ... already exists".
        self.curs.execute("CREATE INDEX IF NOT EXISTS {acc}_index ON {acc} (kmer)".format(acc=acc))
        self.conn.commit()
|
112
|
+
|
113
|
+
#Class for loading crystal files and prepping them for consumption.
|
114
|
+
class ravenous_crystal_lizard:
    """Read a list of crystal files and write their contents to a database.

    A crystal is a JSON document (optionally gzip-compressed, detected by a
    ``.gz`` suffix) with a ``"filename"`` entry and a ``"protein_data"``
    mapping of accession -> record holding a ``"kmers"`` list.

    Args:
        crystal_list: Path to a text file with one crystal path per line.
        database: Path of the database file to build.
        overwrite: If True, delete an existing file at *database* first.
    """

    def __init__(self, crystal_list, database, overwrite = False):
        self.paths_file = crystal_list
        self.input_paths = None

        self.crystal_contents = None

        self.accession_index = acc_indexer()

        self.genome_index = None
        self.genome_prot_ct = None
        self.gak = None

        # Existence is sampled once, at construction time; run() acts on it.
        self.db_already_exists = os.path.exists(database)
        self.overwrite = overwrite
        self.og_db_path = database
        self.db = miga_db_builder(database)

    def consume_list(self):
        """Load crystal paths from ``self.paths_file`` into ``self.input_paths``.

        Surrounding whitespace is stripped and blank lines are skipped, so a
        trailing empty line does not become an unopenable empty path.
        """
        with open(self.paths_file) as fh:
            stripped = (line.strip() for line in fh)
            self.input_paths = [path for path in stripped if path]

    @staticmethod
    def _load_crystal(path):
        """Parse one crystal file (gzip-compressed if it ends in ``.gz``)."""
        # Both branches decode as UTF-8 so plain and gzipped crystals agree.
        if path.endswith(".gz"):
            with gzip.open(path, "rt", encoding="utf-8") as fh:
                return json.load(fh)
        with open(path, "r", encoding="utf-8") as fh:
            return json.load(fh)

    def consume_crystal_data(self):
        """Read every crystal in ``self.input_paths`` and write the database.

        Genomes are numbered by their position in ``self.input_paths``.

        Raises:
            KeyError: If a crystal contains an accession that is not in
                ``self.accession_index.forward``.
        """
        self.crystal_contents = {}
        self.genome_index = []
        self.gak = []
        forward = self.accession_index.forward

        for current_index, crystal in enumerate(self.input_paths):
            loaded = self._load_crystal(crystal)
            filename = loaded["filename"]
            protein_data = loaded["protein_data"]

            self.genome_index.append((filename, current_index, len(protein_data)))

            for acc, record in protein_data.items():
                # This lookup also restricts acc to known accessions before
                # it is later used as a SQL table name.
                try:
                    acc_id = forward[acc]
                except KeyError:
                    raise KeyError(
                        "Unknown accession {acc!r} in crystal {path!r}".format(acc=acc, path=crystal)
                    ) from None

                kmer_list = np.array(record["kmers"], dtype = np.int32)
                self.gak.append((current_index, acc_id, kmer_list.shape[0]))
                self.crystal_contents.setdefault(acc, {})[current_index] = kmer_list

        self.db.activate()
        try:
            self.db.initialize_metadata()
            self.db.insert_genome_index(self.genome_index)
            self.db.insert_gak(self.gak)

            for acc in self.crystal_contents:
                per_genome = self.crystal_contents[acc]

                self.db.add_acc_kmers(acc, self.invert_to_kmer_first(per_genome))

                insertable_genomes = [
                    (genome, kmer_array.tobytes())
                    for genome, kmer_array in per_genome.items()
                ]
                self.db.add_acc_genomes(acc, insertable_genomes)

                # Drop the arrays for this accession once they are written.
                self.crystal_contents[acc] = None
        finally:
            # Always release the connection, even if a write failed.
            self.db.deactivate()

    #Take a set of genome : kmer_lists and flip them to an equivalent set of kmer : genome_lists
    def invert_to_kmer_first(self, dataset):
        """Invert ``{genome_index: kmer_array}`` into per-kmer genome lists.

        Args:
            dataset: Mapping of integer genome index -> 1-D numpy array of
                kmers for one accession.

        Returns:
            A list of ``(kmer, genomes_bytes)`` tuples ordered by kmer, where
            ``genomes_bytes`` is the ``tobytes()`` form of an int32 array of
            the genome indices containing that kmer, in ascending order.
            An empty *dataset* yields an empty list.
        """
        if not dataset:
            return []

        genomes = np.array(list(dataset), dtype = np.int32)
        kmer_arrays = list(dataset.values())
        counts = np.array([arr.shape[0] for arr in kmer_arrays], dtype = np.int32)

        # One flat array of every kmer, and a parallel array naming the
        # genome each kmer came from.
        all_kmers = np.concatenate(kmer_arrays)
        owners = np.repeat(genomes, counts)

        # Sort by kmer first, then by genome within each kmer.
        order = np.lexsort((owners, all_kmers))
        all_kmers = all_kmers[order]
        owners = owners[order]

        # The first occurrence of each kmer marks where its genome run starts.
        discovered_kmers, run_starts = np.unique(all_kmers, return_index=True)
        genome_runs = np.split(owners, run_starts[1:])

        return [
            (int(kmer), run.tobytes())
            for kmer, run in zip(discovered_kmers, genome_runs)
        ]

    def run(self):
        """Build the database, refusing to clobber one unless overwrite is set."""
        if self.db_already_exists:
            if not self.overwrite:
                print("")
                print("Target database file already exists! I'm quitting.")
                print("Supply a different path or use --overwrite")
                return
            os.remove(self.og_db_path)

        self.consume_list()
        self.consume_crystal_data()
|
252
|
+
|
253
|
+
#Add options
|
254
|
+
def options():
    """Build the command-line parser and parse the current arguments.

    Unrecognized arguments are tolerated and discarded.

    Returns:
        A ``(parser, args)`` tuple: the configured ArgumentParser and the
        namespace holding ``crystals``, ``db`` and ``overwrite``.
    """
    description = "\n".join([
        "Dedicated MiGA db builder module.",
        "",
        "Takes a file containing a list of paths to crystals and builds a database from those.",
        "",
        "Notes:",
        "",
        "Assumes the supplied database path does not exist.",
        "Use --overwrite to delete an existing DB under the same path if you want to.",
    ])

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        '--crystal_list',
        dest='crystals',
        default=None,
        help='File containing a list of paths to FastAAI crystals',
    )
    parser.add_argument(
        '--database_path',
        dest='db',
        default=None,
        help='Path to a NEW database to be built.',
    )
    parser.add_argument(
        '--overwrite',
        dest='overwrite',
        action='store_true',
        help='Delete an existing database at --database_path and create a new one. Otw. quits to preserve existing db.',
    )

    parsed, _unrecognized = parser.parse_known_args()

    return parser, parsed
|
273
|
+
|
274
|
+
def main():
    """Command-line entry point: parse arguments and build the database.

    Prints the help text when fewer than two arguments were given, and exits
    with a non-zero status when either required path is missing, so calling
    scripts can detect the failure.
    """
    p, a = options()
    crystal_file = a.crystals
    db = a.db
    overwrite = a.overwrite

    if len(sys.argv) < 3:
        p.print_help()

    if crystal_file is None:
        print("I need a file containing a list of paths to crystals")
        # A bare sys.exit() would report success (status 0) for this error.
        sys.exit(1)

    if db is None:
        print("I need a path to an output database")
        sys.exit(1)

    mn = ravenous_crystal_lizard(crystal_list = crystal_file,
                                 database = db,
                                 overwrite = overwrite)
    mn.run()
|
295
|
+
|
296
|
+
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|