PyamilySeq 0.0.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py +600 -0
- PyamilySeq/Constants.py +1 -0
- PyamilySeq/PyamilySeq_Species.py +647 -0
- PyamilySeq/__init__.py +0 -0
- PyamilySeq/combine_FASTA_with_genome_IDs.py +49 -0
- {PyamilySeq-0.0.1.dist-info → PyamilySeq-0.2.0.dist-info}/METADATA +33 -3
- PyamilySeq-0.2.0.dist-info/RECORD +11 -0
- PyamilySeq-0.2.0.dist-info/top_level.txt +1 -0
- PyamilySeq-0.0.1.dist-info/RECORD +0 -6
- PyamilySeq-0.0.1.dist-info/top_level.txt +0 -1
- {PyamilySeq-0.0.1.dist-info → PyamilySeq-0.2.0.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.0.1.dist-info → PyamilySeq-0.2.0.dist-info}/WHEEL +0 -0
- {PyamilySeq-0.0.1.dist-info → PyamilySeq-0.2.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,647 @@
|
|
|
1
|
+
#from line_profiler_pycharm import profile
|
|
2
|
+
|
|
3
|
+
from collections import OrderedDict,defaultdict
import argparse
import copy
import csv
import math
import os
import sys

try:
    from .Constants import *
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
    from Constants import *
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def custom_sort_key(k, dict1, dict2):
    """Return the sort key for cluster *k*: (size in dict1, size in dict2).

    Used to rank clusters primarily by genome count and secondarily by
    sequence count when both dicts share the same keys.
    """
    primary = len(dict1[k])
    secondary = len(dict2[k])
    return (primary, secondary)
|
|
18
|
+
|
|
19
|
+
def sort_keys_by_values(dict1, dict2):
    """Return dict1's keys sorted largest-first by (len(dict1[k]), len(dict2[k]))."""
    ranking = lambda k: custom_sort_key(k, dict1, dict2)
    sorted_keys = sorted(dict1.keys(), key=ranking, reverse=True)
    return sorted_keys
|
|
22
|
+
|
|
23
|
+
def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
    """Write a Roary-style ``gene_presence_absence.csv`` next to the clusters file.

    Parameters:
        options: parsed CLI options; only ``options.clusters`` (path) is read.
        genome_dict: mapping genome-id -> sequence count; key order defines the
            genome column order in the output.
        pangenome_clusters_First_sorted: mapping cluster-id -> list of genome ids.
        pangenome_clusters_First_sequences_sorted: mapping cluster-id -> list of
            sequence ids of the form ``genome|gene``.

    Bug fixes versus the previous revision:
      * header no longer contains a doubled quote before "Accessory Fragment";
      * rows include the empty "Annotation" cell, so counts line up with the header;
      * an empty cell is emitted for every genome absent from a cluster, keeping
        every row the same width as the header;
      * multiple genes of one genome in a cluster are tab-separated (Roary style)
        instead of concatenated with no separator;
      * the output file is closed deterministically.
    """
    print("Outputting gene_presence_absence file")
    # Drop only the final extension; the old `.split('.')[0]` broke on paths
    # containing dots in directory names.
    in_name = os.path.splitext(options.clusters)[0]
    fixed_columns = ['Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates',
                     'No. sequences', 'Avg sequences per isolate', 'Genome Fragment',
                     'Order within Fragment', 'Accessory Fragment',
                     'Accessory Order with Fragment', 'QC', 'Min group size nuc',
                     'Max group size nuc', 'Avg group size nuc']
    with open(in_name + '_gene_presence_absence.csv', 'w', newline='') as gpa_outfile:
        writer = csv.writer(gpa_outfile, quoting=csv.QUOTE_ALL)
        writer.writerow(fixed_columns + list(genome_dict.keys()))
        for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
            isolates = len(pangenome_clusters_First_sorted[cluster])
            average_sequences_per_genome = len(sequences) / isolates
            # Gene, Non-unique Gene name, Annotation, No. isolates, No. sequences,
            # Avg sequences per isolate, then 8 unused Roary columns.
            row = ['group_' + str(cluster), '', '', str(isolates), str(len(sequences)),
                   str(average_sequences_per_genome)] + [''] * 8
            for genome in genome_dict.keys():
                matches = [value for value in sequences if value.split('|')[0] == genome]
                row.append('\t'.join(matches))  # empty string when genome absent
            writer.writerow(row)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def wrap_sequence(sequence, width=60):
    """Return *sequence* split into newline-joined chunks of at most *width* chars."""
    chunks = [sequence[pos:pos + width] for pos in range(0, len(sequence), width)]
    return "\n".join(chunks)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def read_fasta(fasta_file):
    """Read a FASTA file into a dict of header (without '>') -> full sequence.

    Blank lines are skipped; multi-line sequences are concatenated.
    A later duplicate header overwrites an earlier one, as before.
    """
    sequences = {}
    active_header = None
    with open(fasta_file, 'r') as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if not entry:
                continue  # Skip empty lines
            if entry.startswith('>'):
                active_header = entry[1:]  # Remove '>' character
                sequences[active_header] = ''
            else:
                sequences[active_header] += entry
    return sequences
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def reorder_dict_by_keys(original_dict, sorted_keys):
    """Return a new dict holding original_dict's entries in sorted_keys order."""
    reordered = {}
    for key in sorted_keys:
        reordered[key] = original_dict[key]
    return reordered
|
|
85
|
+
|
|
86
|
+
def get_cores(options, genome_dict):
    """Build the core-group boundary table and the empty per-group tally lists.

    For each percentage in ``options.core_groups`` (e.g. "99,95,...,0") the
    lower bound is ``floor(pct/100 * n_genomes)``; the upper bound is the total
    genome count for the first group, and one below the previous group's lower
    bound for every later group, so the ranges tile without overlapping.

    Returns:
        (cores, groups): ``cores`` maps tally names ('first_core_99', ...) to
        empty lists (extra 'extended/combined/second/only_second' tallies are
        created only when a second clustering round, -rc, was supplied);
        ``groups`` maps each percentage string to its (lower, upper) range.
    """
    groups = OrderedDict()
    cores = OrderedDict()
    total_genomes = len(genome_dict)
    upper_bound = total_genomes
    is_first_group = True
    for group in options.core_groups.split(','):
        lower_bound = math.floor(int(group) / 100 * total_genomes)
        if is_first_group:
            groups[group] = (lower_bound, upper_bound)
            is_first_group = False
        else:
            groups[group] = (lower_bound, upper_bound - 1)
        upper_bound = lower_bound
        cores['first_core_' + group] = []
        if options.reclustered != None:
            cores['extended_core_' + group] = []
            cores['combined_core_' + group] = []
            cores['second_core_' + group] = []
            cores['only_second_core_' + group] = []
    return cores, groups
|
|
112
|
+
|
|
113
|
+
#@profile
|
|
114
|
+
def calc_First_only_core(cluster, pep_num, groups, cores):
    """Record *cluster* in the 'first_core_<pct>' list whose range holds *pep_num*.

    *pep_num* is the number of distinct genomes in the family; *groups* maps
    percentage labels to inclusive (lower, upper) ranges. As before, the last
    matching range wins, and no match leaves ``res`` unbound (NameError).
    """
    boundary_pairs = list(groups.values())
    for position, (lower, upper) in enumerate(boundary_pairs):
        if lower <= pep_num <= upper:
            res = position
    family_group = list(groups)[res]
    cores['first_core_' + family_group].append(cluster)
|
|
120
|
+
|
|
121
|
+
#@profile
|
|
122
|
+
def calc_single_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count gene families extended with StORFs
    """Tally a family whose genome reach grew by *second_num* second-round genomes.

    The combined genome count (pep_num + second_num) selects the core group;
    the original first-round count is what gets appended. Last matching range
    wins; no match leaves ``res`` unbound, as in the original.
    """
    combined_total = pep_num + second_num
    boundary_pairs = list(groups.values())
    for position, (lower, upper) in enumerate(boundary_pairs):
        if lower <= combined_total <= upper:
            res = position
    family_group = list(groups)[res]
    cores['extended_core_' + family_group].append(pep_num)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
#@profile
|
|
131
|
+
def calc_multi_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
    """Tally a family where second-round sequences merged more than one first-round family.

    BUG FIX: previously did ``cores['combined_core_' + g] += 1`` — a TypeError,
    because get_cores() initialises every cores entry as a list. Now appends
    *pep_num*, matching calc_single_First_extended_Second_only_core and the
    ``len(value)``-based reporting in cluster().
    """
    groups_as_list = list(groups.values())
    for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num + second_num <= fir):
        res = idx
    family_group = list(groups)[res]
    cores['combined_core_' + family_group].append(pep_num)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
#@profile
|
|
140
|
+
def calc_Second_only_core(groups, cores, second_num):
    """Tally a second-round (StORF) family of *second_num* genomes into its core group.

    BUG FIX: previously did ``cores['second_core_' + g] += 1`` — a TypeError on
    the list created by get_cores(). Now appends *second_num* so the family is
    counted by the ``len(value)`` reporting in cluster().
    """
    groups_as_list = list(groups.values())
    for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= second_num <= fir):
        res = idx
    family_group = list(groups)[res]
    cores['second_core_' + family_group].append(second_num)
|
|
146
|
+
|
|
147
|
+
#@profile
|
|
148
|
+
def calc_only_Second_only_core(groups, cores, second_num): # only count the true storf onlies
    """Tally a family consisting purely of second-round (StORF) sequences.

    BUG FIX: previously did ``cores['only_second_core_' + g] += 1`` — a
    TypeError on the list created by get_cores(). Now appends *second_num*,
    keeping every cores entry a list for the ``len(value)`` reporting.
    """
    groups_as_list = list(groups.values())
    for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= second_num <= fir):
        res = idx
    family_group = list(groups)[res]
    cores['only_second_core_' + family_group].append(second_num)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
#@profile
|
|
160
|
+
def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered):
    """Merge second-round (StORF) clustering information into per-cluster stats.

    For each first-round cluster, looks up its representative in the combined
    second-round mapping and tallies how many extra first-round families and
    second-round genomes the re-clustering added.

    Returns a dict keyed like pangenome_clusters_First whose values are
    6-element lists: [number of merged first-round families, summed genome
    count across those families, the per-family counts, genomes newly added by
    second-round sequences, total second-round sequences, unique second-round
    genomes].

    NOTE(review): cluster ids are assumed to be 0..N-1 in reps' insertion
    order (``list_of_reps[int(cluster)]``) — TODO confirm for edge-list input.
    """
    num_clustered_First = defaultdict(list)
    pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
    list_of_reps = list(reps.keys())
    for cluster, pep_genomes in pangenome_clusters_First.items():
        rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
        Com_PEP_Genomes = 0
        Seconds = 0
        seen_Seconds = []
        added_Second_genomes = 0
        try: # get the cluster from the storf clusters which contains this rep
            clustered_combined = combined_pangenome_clusters_First_Second_clustered[rep] # Not true clusters - I put a PEP as key myself
            seen_clust_Genomes = []
            num_clustered_First[cluster].append(rep + '_' + str(len(pep_genomes)))
            for clust in clustered_combined:
                # Members without the sequence tag are first-round (PEP) reps;
                # members with it are second-round (StORF) sequences.
                if options.sequence_tag not in clust: # Not good enough at the moment
                    clust_Genome = clust.split('|')[0]
                    if clust_Genome not in seen_clust_Genomes:
                        seen_clust_Genomes.append(clust_Genome)
                        if clust_Genome not in pep_genomes:
                            Com_PEP_Genomes += 1
                    num_clustered_First[cluster].append(clust + '_' + str(reps[clust][1]))
                elif options.sequence_tag in clust:
                    Seconds += 1
                    clust_Genome = clust.split('|')[0]
                    if clust_Genome not in seen_Seconds:
                        seen_Seconds.append(clust_Genome)
                    if clust_Genome not in seen_clust_Genomes:
                        seen_clust_Genomes.append(clust_Genome)
                        if clust_Genome not in pep_genomes:
                            added_Second_genomes += 1
                else:
                    sys.exit("Error: looking for sequence_tag")

            # Each recorded member carries '<id>_<genome count>'; recover the
            # counts to total the merged families' genome reach.
            size_of_pep_clusters = []
            peps = num_clustered_First[cluster]
            for pep in peps:
                pep = pep.rsplit('_', 1)
                size_of_pep_clusters.append(int(pep[1]))
            pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_pep_clusters),
                                                size_of_pep_clusters, added_Second_genomes, Seconds, len(seen_Seconds)]

        except KeyError:
            ###Singleton - this rep was not merged with anything in round two.
            num_pep_genomes = [len(pep_genomes)]
            pangenome_clusters_Type[cluster] = [1, len(pep_genomes), num_pep_genomes, added_Second_genomes, Seconds,
                                                len(seen_Seconds)]

    return pangenome_clusters_Type
|
|
209
|
+
|
|
210
|
+
#@profile
|
|
211
|
+
def single_clustering_counting(options, pangenome_clusters_First, reps):
    """Build per-cluster stats when no second clustering round (-rc) was supplied.

    Mirrors combined_clustering_counting() but every second-round field is 0.
    Returns a dict keyed like pangenome_clusters_First with 6-element values:
    [1, genome count, [genome count], 0, 0, 0] for every cluster.

    NOTE(review): the try/except KeyError mirrors the combined variant, but no
    statement in the try block appears able to raise KeyError here — the
    Singleton branch looks unreachable; confirm before relying on it.
    """
    num_clustered_PEP = defaultdict(list)
    recorded_PEP = []  # reps seen so far; collected but not returned
    pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
    list_of_reps = list(reps.keys())
    for cluster, pep_genomes in pangenome_clusters_First.items():
        rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster

        try: # get the cluster from the storf clusters which contains this rep
            num_clustered_PEP[cluster].append(rep + '_' + str(len(pep_genomes)))
            size_of_pep_clusters = []
            peps = num_clustered_PEP[cluster]
            for pep in peps:
                # Each entry carries '<rep>_<genome count>'; recover the count.
                pep = pep.rsplit('_', 1)
                size_of_pep_clusters.append(int(pep[1]))
                recorded_PEP.append(pep[0])
            pangenome_clusters_Type[cluster] = [len(num_clustered_PEP[cluster]), sum(size_of_pep_clusters),
                                                size_of_pep_clusters, 0, 0, 0]

        except KeyError:
            ###Singleton
            num_pep_genomes = [len(pep_genomes)]
            pangenome_clusters_Type[cluster] = [1, len(pep_genomes), num_pep_genomes, 0, 0, 0]

    return pangenome_clusters_Type
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
#@profile
|
|
240
|
+
def combined_clustering_CDHIT(options, genome_dict):
    """Parse the second-round CD-HIT .clstr file (options.reclustered).

    Splits each combined cluster's members into first-round reps (no
    ``options.sequence_tag`` in the id) and second-round StORF sequences
    (tag present), and builds a mapping from the first first-round rep of
    each mixed cluster to the remaining merged members.

    Returns (first_rep -> merged members, cluster ids that contain at least
    one first-round member, cluster id -> second-round genome list,
    unique_genomes [always empty here]).

    NOTE(review): a cluster is only flushed into the merged mapping when the
    *next* '>Cluster' header is read — the final cluster in the file appears
    never to be flushed; confirm whether that is intended.
    """
    unique_genomes = []
    Second_in = open(options.reclustered, 'r')
    combined_pangenome_clusters_First = OrderedDict()
    combined_pangenome_clusters_First_sequences = OrderedDict()
    combined_pangenome_clusters_Second = OrderedDict()
    combined_pangenome_clusters_Second_sequences = OrderedDict()
    combined_pangenome_clusters_First_Second_clustered = OrderedDict()

    not_Second_only_cluster_ids = []
    already_seen_PEP = []
    Combined_clusters = OrderedDict()
    Combined_reps = OrderedDict()
    first = True
    for line in Second_in:
        if line.startswith('>'):
            # New cluster header: first finalise the previous cluster.
            if first == False:
                cluster_size = len(Combined_clusters[cluster_id])
                Combined_reps.update({rep: cluster_size})
                for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
                    if pep != []:
                        if pep in already_seen_PEP:
                            continue
                        else:
                            already_seen_PEP.append(pep)
                # Only clusters containing both first- and second-round members
                # are recorded in the merged mapping.
                if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
                    if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
                        all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
                        storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
                        VALUE = all_but_first + storfs_clustered
                    else:
                        VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
                    KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
                    combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
            # '>Cluster N' -> cluster id 'N'.
            cluster_id = line.strip('>')
            cluster_id = cluster_id.strip('\n')
            cluster_id = cluster_id.split(' ')[1]
            Combined_clusters.update({cluster_id: []})
            combined_pangenome_clusters_First.update({cluster_id: []})
            combined_pangenome_clusters_First_sequences.update({cluster_id: []})
            combined_pangenome_clusters_Second.update({cluster_id: []})
            combined_pangenome_clusters_Second_sequences.update({cluster_id: []})

            first = False
        else:
            # Member line: '<n>\t<len>aa, ><id>... <*| at pct>'.
            clustered = line.split('\t')[1]
            clustered = clustered.split('>')[1]
            clustered = clustered.split('...')[0]
            genome = clustered.split('|')[0]
            genome_dict[genome] += 1
            if '*' in line:
                rep = clustered  # '*' marks the cluster representative
                Combined_reps.update({rep: 0})
            if first == False:
                Combined_clusters[cluster_id].append(clustered)
                clustered_genome = clustered.split('|')[0]
                if options.sequence_tag in line:
                    if clustered_genome not in combined_pangenome_clusters_Second[cluster_id]:
                        combined_pangenome_clusters_Second[cluster_id].append(clustered_genome)
                    combined_pangenome_clusters_Second_sequences[cluster_id].append(clustered)
                else:
                    if cluster_id not in not_Second_only_cluster_ids:
                        not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
                    if clustered_genome not in combined_pangenome_clusters_First[cluster_id]:
                        combined_pangenome_clusters_First[cluster_id].append(clustered_genome)
                    combined_pangenome_clusters_First_sequences[cluster_id].append(clustered)

    return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
|
|
309
|
+
|
|
310
|
+
def combined_clustering_Edge_List(options, genome_dict):
    """Parse the second-round clustering as a rep/child edge list (TSV or CSV).

    Edge-list analogue of combined_clustering_CDHIT(): consecutive lines with
    the same rep form one cluster; members are split into first-round reps
    (no ``options.sequence_tag``) and second-round StORF sequences, and each
    mixed cluster is recorded under its first first-round member.

    Returns (first_rep -> merged members, cluster ids with at least one
    first-round member, cluster id -> second-round genome list,
    unique_genomes [always empty here]).

    NOTE(review): ``separator`` is only assigned for 'TSV'/'CSV' formats —
    any other format would raise NameError below; callers appear to guarantee
    the format, but confirm. As with the CD-HIT variant, the final cluster is
    only flushed when a new rep is seen, so the last cluster in the file
    appears never to enter the merged mapping — confirm intent.
    """
    if options.format == 'TSV':
        separator = '\t'
    elif options.format == 'CSV':
        separator = ','
    unique_genomes = []
    cluster_id = 0
    last_rep = ''
    Second_in = open(options.reclustered, 'r')
    combined_pangenome_clusters_First = OrderedDict()
    combined_pangenome_clusters_First_sequences = OrderedDict()
    combined_pangenome_clusters_Second = OrderedDict()
    combined_pangenome_clusters_Second_sequences = OrderedDict()
    combined_pangenome_clusters_First_Second_clustered = OrderedDict()

    not_Second_only_cluster_ids = []
    already_seen_PEP = []
    Combined_clusters = OrderedDict()
    Combined_reps = OrderedDict()
    first = True
    for line in Second_in:
        rep, child = line.strip().split(separator)
        child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence

        if first == True:
            # Initialise bookkeeping for cluster 0 on the very first edge.
            Combined_clusters.update({cluster_id: []})
            combined_pangenome_clusters_First.update({cluster_id: []})
            combined_pangenome_clusters_First_sequences.update({cluster_id: []})
            combined_pangenome_clusters_Second.update({cluster_id: []})
            combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
            Combined_reps.update({rep: 0})
            first = False

        if first == False:
            if rep != last_rep and last_rep != '':
                # Rep changed: finalise the cluster we just finished reading.
                cluster_size = len(Combined_clusters[cluster_id])
                Combined_reps.update({rep: cluster_size})
                for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
                    if pep != []:
                        if pep in already_seen_PEP:
                            continue
                        else:
                            already_seen_PEP.append(pep)
                if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
                    if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
                        all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
                        storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
                        VALUE = all_but_first + storfs_clustered
                    else:
                        VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
                    KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
                    combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})

                cluster_id += 1
                Combined_clusters.update({cluster_id: []})
                combined_pangenome_clusters_First.update({cluster_id: []})
                combined_pangenome_clusters_First_sequences.update({cluster_id: []})
                combined_pangenome_clusters_Second.update({cluster_id: []})
                combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
                Combined_reps.update({rep: 0})

            Combined_clusters[cluster_id].append(child)
            if options.sequence_tag in line:
                if child_genome not in combined_pangenome_clusters_Second[cluster_id]:
                    combined_pangenome_clusters_Second[cluster_id].append(child_genome)
                combined_pangenome_clusters_Second_sequences[cluster_id].append(child)
            else:
                if cluster_id not in not_Second_only_cluster_ids:
                    not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
                if child_genome not in combined_pangenome_clusters_First[cluster_id]:
                    combined_pangenome_clusters_First[cluster_id].append(child_genome)
                combined_pangenome_clusters_First_sequences[cluster_id].append(child)

        last_rep = rep

    return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def cluster_EdgeList(options):
    """Parse the first-round clustering as a rep/child edge list (TSV or CSV).

    Consecutive lines sharing a rep form one cluster (ids 0,1,2,...).

    Returns (genome_dict: genome -> sequence count,
             cluster id -> unique genome list,
             cluster id -> member sequence list,
             rep -> [sequence count, genome count]).

    NOTE(review): ``separator`` is only set for 'TSV'/'CSV' formats; other
    values would raise NameError at the first split. An empty input file
    would leave ``rep`` unbound at the final reps.update — confirm inputs are
    always non-empty.
    """
    if options.format == 'TSV':
        separator = '\t'
    elif options.format == 'CSV':
        separator = ','
    cluster_id = 0
    last_rep = ''
    first = True
    First_in = open(options.clusters, 'r')
    pangenome_clusters_First = OrderedDict()
    pangenome_clusters_First_sequences = OrderedDict()
    genome_dict = defaultdict(int)
    reps = OrderedDict()
    for line in First_in:
        rep, child = line.strip().split(separator)
        child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
        # Counting occurrences of genomes
        genome_dict[child_genome] += 1
        if first == True:
            pangenome_clusters_First[0] = []
            pangenome_clusters_First_sequences[0] = []
            first = False

        if rep != last_rep and last_rep != '':
            # Rep changed: start a new cluster and finalise the previous one.
            cluster_id +=1
            pangenome_clusters_First[cluster_id] = []
            pangenome_clusters_First_sequences[cluster_id] = []
            cluster_size = len(pangenome_clusters_First_sequences[cluster_id-1])
            reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[cluster_id-1])]})
            # NOTE(review): the two lines below repeat the initialisation above.
            pangenome_clusters_First[cluster_id] = []
            pangenome_clusters_First_sequences[cluster_id] = []
        if child_genome not in pangenome_clusters_First[cluster_id]:
            pangenome_clusters_First[cluster_id].append(child_genome)

        pangenome_clusters_First_sequences[cluster_id].append(child)
        last_rep = rep
    # Finalise the last cluster (no rep change follows it).
    cluster_size = len(pangenome_clusters_First_sequences[cluster_id])
    reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})

    return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def cluster_CDHIT(options):
    """Parse the first-round CD-HIT .clstr file (options.clusters).

    Returns (genome_dict: genome -> sequence count,
             cluster id -> unique genome list,
             cluster id -> member sequence list,
             rep -> [sequence count, genome count]).

    BUG FIX: reps for a cluster were only finalised when the *next*
    '>Cluster' header was read, so the final cluster's rep was left at
    [0, 0]; it is now finalised after the loop. The file is also closed
    deterministically via a context manager.
    """
    clusters = OrderedDict()
    pangenome_clusters_First = OrderedDict()
    pangenome_clusters_First_sequences = OrderedDict()
    first = True
    genome_dict = defaultdict(int)
    reps = OrderedDict()
    ## Load in all data for easier reuse later
    with open(options.clusters, 'r') as First_in:
        for line in First_in:
            if line.startswith('>'):
                if first == False:
                    # Finalise the cluster we just finished reading.
                    cluster_size = len(clusters[cluster_id])
                    reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
                # '>Cluster N' -> cluster id 'N'.
                cluster_id = line.strip('>').strip('\n').split(' ')[1]
                clusters.update({cluster_id: []})
                pangenome_clusters_First.update({cluster_id: []})
                pangenome_clusters_First_sequences.update({cluster_id: []})
                first = False
            else:
                # Member line: '<n>\t<len>aa, ><id>... <*| at pct>'.
                clustered = line.split('\t')[1].split('>')[1].split('...')[0]
                genome = clustered.split('|')[0]
                genome_dict[genome] += 1
                if '*' in line:
                    rep = clustered  # '*' marks the cluster representative
                    reps.update({rep: [0, 0]})
                if first == False:
                    clusters[cluster_id].append(clustered)
                    if genome not in pangenome_clusters_First[cluster_id]:
                        pangenome_clusters_First[cluster_id].append(genome)
                    pangenome_clusters_First_sequences[cluster_id].append(clustered)
    # Finalise the last cluster in the file (no further header triggers it).
    if first == False:
        cluster_size = len(clusters[cluster_id])
        reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
    return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
|
|
471
|
+
|
|
472
|
+
#@profile
|
|
473
|
+
def cluster(options):
    """Main analysis driver: parse clusters, classify families, write outputs.

    Steps: parse the first-round clustering (CD-HIT or edge list), build the
    core-group boundaries, optionally merge the second-round (-rc) clustering,
    classify every family into its core group, print the per-group tallies,
    and optionally write a gene_presence_absence.csv and per-family FASTAs.

    BUG FIXES:
      * a CSV edge list combined with -rc previously fell through both format
        branches, leaving the combined_* names undefined (NameError); CSV is
        now routed to combined_clustering_Edge_List alongside TSV;
      * the Second-only tallying now sits behind ``options.reclustered`` —
        combined_pangenome_clusters_Second / not_Second_only_cluster_ids only
        exist when -rc was supplied.
    """
    if options.format == 'CD-HIT':
        genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options)
    elif options.format in ['TSV', 'CSV']:
        genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options)

    ######################################
    cores, groups = get_cores(options, genome_dict)
    ###

    if options.reclustered != None:
        if options.format == 'CD-HIT':
            combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids, combined_pangenome_clusters_Second, \
                unique_genomes = combined_clustering_CDHIT(options, genome_dict)
        elif options.format in ['TSV', 'CSV']:  # BUG FIX: CSV edge lists were previously ignored here
            combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids, combined_pangenome_clusters_Second, \
                unique_genomes = combined_clustering_Edge_List(options, genome_dict)
        pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered)
    else:
        pangenome_clusters_Type = single_clustering_counting(options, pangenome_clusters_First, reps)

    counter = 0
    Number_Of_StORF_Extending_But_Same_Genomes = 0

    # Rank clusters by (genome count, sequence count), largest first, and keep
    # all three per-cluster maps in that shared order.
    sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
    pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_first_keys)
    pangenome_clusters_First_sequences_sorted = reorder_dict_by_keys(pangenome_clusters_First_sequences, sorted_first_keys)
    pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)

    print("Calculating Groups")
    for cluster_key, numbers in pangenome_clusters_Type_sorted.items():
        ############################### Calculate First only
        if numbers[0] == 1 and numbers[1] >= 2:
            calc_First_only_core(cluster_key, numbers[1], groups, cores)
            counter += 1
        elif numbers[0] > 1 and numbers[1] >= 2:
            calc_First_only_core(cluster_key, numbers[2][0], groups, cores)
            counter += 1

        if options.reclustered != None:
            ############################# Calculate First and Reclustered-Second
            if numbers[0] == 1 and numbers[3] >= 1:  # If Seconds did not combine First reps
                calc_single_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
            elif numbers[0] > 1 and numbers[3] >= 1:  # If unique Seconds combined multiple Firsts
                calc_multi_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
            elif numbers[4] >= 1:
                Number_Of_StORF_Extending_But_Same_Genomes += 1

    if options.reclustered != None:  # BUG FIX: these maps only exist when -rc was given
        combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
        combined_pangenome_clusters_Second_Type = defaultdict(list)
        for cluster_key, genomes in combined_pangenome_clusters_Second.items():
            if cluster_key in not_Second_only_cluster_ids:
                combined_pangenome_clusters_Second_Type[cluster_key] = [cluster_key, len(genomes)]
            else:
                combined_pangenome_clusters_ONLY_Second_Type[cluster_key] = [cluster_key, len(genomes)]
        for cluster_key, data in combined_pangenome_clusters_Second_Type.items():
            calc_Second_only_core(groups, cores, data[1])
        for cluster_key, data in combined_pangenome_clusters_ONLY_Second_Type.items():
            if data[1] >= 2:
                calc_only_Second_only_core(groups, cores, data[1])

    ###########################
    print("End")
    key_order = ['first_core_', 'extended_core_', 'combined_core_', 'second_core_', 'only_second_core_']
    print("Gene Family Groups:")
    for key_prefix in key_order:
        for key, value in cores.items():
            if key.startswith(key_prefix):
                print(f"{key}: {len(value)}")

    if options.gene_presence_absence_out != None:
        gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)

    if options.write_families != None and options.fasta != None:
        sequences = read_fasta(options.fasta)
        input_dir = os.path.dirname(os.path.abspath(options.clusters))
        output_dir = os.path.join(input_dir, 'Gene_Families_Output')
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        for key_prefix in key_order:
            for key, values in cores.items():
                # Only write the levels the user asked for (e.g. "-w 99,95").
                if any(part in options.write_families.split(',') for part in key.split('_')):
                    if key.startswith(key_prefix):
                        for value in values:
                            output_filename = f"{key}_{value}.fasta"
                            sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
                            # Write sequences to output file that are in the sequences dictionary
                            with open(os.path.join(output_dir, output_filename), 'w') as outfile:
                                for header in sequences_to_write:
                                    if header in sequences:
                                        outfile.write(f">{header}\n")
                                        wrapped_sequence = wrap_sequence(sequences[header])
                                        outfile.write(f"{wrapped_sequence}\n")
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def main():
    """Parse command-line arguments, normalise them, and run cluster()."""

    # PyamilySeq_Version comes from Constants (star import at the top of the file).
    parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': PyamilySeq Run Parameters.')
    parser._action_groups.pop()

    required = parser.add_argument_group('Required Arguments')
    required.add_argument('-c', action='store', dest='clusters', help='Clustering output file from CD-HIT, TSV or CSV Edge List',
                          required=True)
    required.add_argument('-f', action='store', dest='format', choices=['CD-HIT', 'CSV', 'TSV'],
                          help='Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))', required=True)

    output_args = parser.add_argument_group('Output Parameters')
    output_args.add_argument('-w', action="store", dest='write_families', default="99",
                             help='Default - No output: Output sequences of identified families (provide levels at which to output "-w 99 95"'
                                  ' - Must provide FASTA file with -fasta')
    output_args.add_argument('-fasta', action='store', dest='fasta',
                             help='FASTA file to use in conjunction with "-w"',
                             required=False)

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-rc', action='store', dest='reclustered', help='Clustering output file from secondary round of clustering',
                          required=False)
    optional.add_argument('-st', action='store', dest='sequence_tag', help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
                          required=False)
    optional.add_argument('-groups', action="store", dest='core_groups', default="99,95,90,80,15",
                          help='Default - (\'99,95,90,80,15\'): Gene family groups to use')
    optional.add_argument('-gpa', action='store', dest='gene_presence_absence_out', help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
                          required=False)

    misc = parser.add_argument_group('Misc')
    # NOTE(review): type=eval on user input is risky — a restricted parser
    # (e.g. mapping 'True'/'False' to bool) would be safer; confirm intent.
    misc.add_argument('-verbose', action='store', dest='verbose', default=False, type=eval, choices=[True, False],
                      help='Default - False: Print out runtime messages')
    misc.add_argument('-v', action='store_true', dest='version',
                      help='Default - False: Print out version number and exit')

    options = parser.parse_args()
    # NOTE(review): -c and -f are required=True, so argparse errors out before
    # parse_args() returns when they are missing — this branch (and the -v
    # version short-circuit inside it) looks unreachable; confirm.
    if options.clusters == None or options.format == None:
        if options.version:
            sys.exit(PyamilySeq_Version)
        else:
            exit('PyamilySeq: error: the following arguments are required: -c, -f')

    if options.sequence_tag == None:
        options.sequence_tag = 'StORF'

    # NOTE(review): argparse leaves options.fasta as None (never False) when
    # -fasta is absent, so this guard can never fire; cluster() simply skips
    # family output when fasta is None. The message also contains a typo
    # ("br") — left untouched here as it is a runtime string.
    if options.write_families != None and options.fasta == False:
        exit("-fasta must br provided if -w is used")

    # Resolve the provided paths to absolute, symlink-free forms.
    options.clusters = os.path.normpath(options.clusters)
    options.clusters = os.path.realpath(options.clusters)
    if options.reclustered:
        options.reclustered = os.path.normpath(options.reclustered)
        options.reclustered = os.path.realpath(options.reclustered)

    # Always append a catch-all 0% group so every family lands in some range.
    options.core_groups = options.core_groups + ',0'

    cluster(options)

    print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
          "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
|
|
644
|
+
# Script entry point: run the CLI, then confirm completion.
if __name__ == "__main__":
    main()
    print("Complete")
|
|
647
|
+
|
PyamilySeq/__init__.py
ADDED
|
File without changes
|