PyamilySeq 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py +600 -0
- PyamilySeq/Constants.py +1 -0
- PyamilySeq/PyamilySeq_Species.py +586 -0
- PyamilySeq/__init__.py +0 -0
- PyamilySeq/combine_FASTA_with_genome_IDs.py +49 -0
- {PyamilySeq-0.0.1.dist-info → PyamilySeq-0.0.2.dist-info}/METADATA +1 -2
- PyamilySeq-0.0.2.dist-info/RECORD +11 -0
- PyamilySeq-0.0.2.dist-info/top_level.txt +1 -0
- PyamilySeq-0.0.1.dist-info/RECORD +0 -6
- PyamilySeq-0.0.1.dist-info/top_level.txt +0 -1
- {PyamilySeq-0.0.1.dist-info → PyamilySeq-0.0.2.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.0.1.dist-info → PyamilySeq-0.0.2.dist-info}/WHEEL +0 -0
- {PyamilySeq-0.0.1.dist-info → PyamilySeq-0.0.2.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,600 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
import copy
|
|
3
|
+
import math
|
|
4
|
+
import sys
|
|
5
|
+
import numpy as np
|
|
6
|
+
from itertools import chain
|
|
7
|
+
|
|
8
|
+
def get_Genus(clustered):
    """Return the capitalised genus token from a '|'-delimited sequence ID.

    The text before the first '|' is taken as the organism name. IDs whose
    name begins with '_' (a known naming artefact in the input data) carry
    the genus in the second '_'-separated field; otherwise it is the first.
    """
    name_part = clustered.split('|')[0]
    # A leading '_' marks a malformed name: skip the empty leading field.
    field_index = 1 if '_' in name_part[0] else 0
    genus = name_part.split('_')[field_index]
    return str(genus).capitalize()
|
|
15
|
+
|
|
16
|
+
def get_Species(clustered):
    """Return the capitalised 'Genus_species' string from a '|'-delimited ID.

    The text before the first '|' is taken as the organism name. IDs whose
    name begins with '_' (a known naming artefact) have an empty leading
    '_'-field, so the genus and species are fields 1 and 2; otherwise they
    are fields 0 and 1.
    """
    clustered_species = clustered.split('|')[0]
    if '_' in clustered_species[0]:  # Remove name error (leading '_')
        # BUGFIX: taking [1] produced a single *string*, so the join below
        # interleaved '_' between every character ("Genus" -> "G_e_n_u_s").
        # Take the genus+species fields as a list, mirroring the else branch.
        clustered_species = clustered_species.split('_')[1:3]
    else:
        clustered_species = clustered_species.split('_')[:2]
    return str('_'.join(clustered_species)).capitalize()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
PEP_In = open('/home/nick/Documents/Single_Genome/All_Ensembl_PEP_CD_Clustered_90_60.clstr','r')
|
|
26
|
+
StORF_In = open('/home/nick/Documents/Single_Genome/All_Ensem_PEP_CD_Clustered_90_60_Unclustered_UR_StORFs_AA_CD.clstr','r') # Clusters for single Genera
|
|
27
|
+
|
|
28
|
+
clusters = collections.OrderedDict()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
pangenome_clusters_PEP_Genera = collections.OrderedDict()
|
|
32
|
+
pangenome_clusters_PEP_Species = collections.OrderedDict()
|
|
33
|
+
pangenome_clusters_PEP_Strains = collections.OrderedDict()
|
|
34
|
+
pangenome_clusters_PEP_SEQS = collections.OrderedDict()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
max_storf_only_genera = 0
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
count = 0
|
|
43
|
+
first = True
|
|
44
|
+
genome_dict = collections.defaultdict(int)
|
|
45
|
+
reps = collections.OrderedDict()
|
|
46
|
+
county = 0
|
|
47
|
+
#singleton_cluster = "Null"
|
|
48
|
+
clusters_With_Con_StORFs = []
|
|
49
|
+
## Load in all data for easier reuse later
|
|
50
|
+
for line in PEP_In:
|
|
51
|
+
if line.startswith('>'):
|
|
52
|
+
if first == False:
|
|
53
|
+
Ensem_Con = set(Ensem_genomes).intersection(Con_genomes)
|
|
54
|
+
cluster_size = len(clusters[cluster_id])
|
|
55
|
+
reps.update({rep: [cluster_size,len(pangenome_clusters_PEP_Genera[cluster_id])]}) # Add strains, species here if wanted
|
|
56
|
+
#if len(clusters[cluster_id]) == 1 and "Null" not in singleton_cluster: # Stop at clusters smaller than 10
|
|
57
|
+
# singleton_cluster = cluster_id
|
|
58
|
+
#if len(clusters[cluster_id]) < 10: # Stop at clusters smaller than 10
|
|
59
|
+
# pangenome_clusters_PEP_Species.popitem()
|
|
60
|
+
# pangenome_clusters_PEP_Genera.popitem()
|
|
61
|
+
# pangenome_clusters_PEP_SEQS.popitem()
|
|
62
|
+
# reps.popitem()
|
|
63
|
+
# if len(clusters[cluster_id]) == 1:
|
|
64
|
+
# break # REMEMBER
|
|
65
|
+
Ensem_genomes, Con_genomes = [], []
|
|
66
|
+
cluster_id = line.strip('>')
|
|
67
|
+
cluster_id = cluster_id.strip('\n')
|
|
68
|
+
cluster_id = cluster_id.split(' ')[1]
|
|
69
|
+
clusters.update({cluster_id: []})
|
|
70
|
+
pangenome_clusters_PEP_Genera.update({cluster_id: []})
|
|
71
|
+
pangenome_clusters_PEP_Species.update({cluster_id:[]})
|
|
72
|
+
pangenome_clusters_PEP_Strains.update({cluster_id: []})
|
|
73
|
+
# pangenome_clusters_PEP_SEQS.update({cluster_id:[]})
|
|
74
|
+
|
|
75
|
+
first = False
|
|
76
|
+
else:
|
|
77
|
+
clustered = line.split('\t')[1]
|
|
78
|
+
clustered = clustered.split('>')[1]
|
|
79
|
+
clustered = clustered.split('...')[0]
|
|
80
|
+
genome = clustered.split('|')[0]
|
|
81
|
+
genome_dict[genome] +=1
|
|
82
|
+
if '*' in line:
|
|
83
|
+
rep = clustered
|
|
84
|
+
reps.update({rep:[0,0]})
|
|
85
|
+
if first == False:
|
|
86
|
+
clusters[cluster_id].append(clustered)
|
|
87
|
+
clustered_genus = get_Genus(clustered)
|
|
88
|
+
clustered_species = get_Species(clustered)
|
|
89
|
+
clustered_strain = clustered.split('|')[0]
|
|
90
|
+
|
|
91
|
+
if clustered_genus not in pangenome_clusters_PEP_Genera[cluster_id]:
|
|
92
|
+
pangenome_clusters_PEP_Genera[cluster_id].append(clustered_genus)
|
|
93
|
+
#if clustered_species not in pangenome_clusters_PEP_Species[cluster_id]:
|
|
94
|
+
# pangenome_clusters_PEP_Species[cluster_id].append(clustered_species)
|
|
95
|
+
if genome not in pangenome_clusters_PEP_Strains[cluster_id]:
|
|
96
|
+
pangenome_clusters_PEP_Strains[cluster_id].append(genome)
|
|
97
|
+
# pangenome_clusters_PEP_SEQS[cluster_id].append(clustered)
|
|
98
|
+
print("PEP DONE")
|
|
99
|
+
######################################
|
|
100
|
+
Combined_pangenome_clusters_PEP_Genera = collections.OrderedDict()
|
|
101
|
+
Combined_pangenome_clusters_PEP_Species = collections.OrderedDict()
|
|
102
|
+
Combined_pangenome_clusters_PEP_Strains = collections.OrderedDict()
|
|
103
|
+
Combined_pangenome_clusters_PEP_SEQS = collections.OrderedDict()
|
|
104
|
+
|
|
105
|
+
Combined_pangenome_clusters_StORF_Genera = collections.OrderedDict()
|
|
106
|
+
Combined_pangenome_clusters_StORF_Species = collections.OrderedDict()
|
|
107
|
+
Combined_pangenome_clusters_StORF_Strains = collections.OrderedDict()
|
|
108
|
+
Combined_pangenome_clusters_StORF_SEQS = collections.OrderedDict()
|
|
109
|
+
|
|
110
|
+
Combined_pangenome_clusters_PEP_StORF_Clustered_Genera = collections.OrderedDict()
|
|
111
|
+
Combined_pangenome_clusters_PEP_StORF_Clustered = collections.OrderedDict()
|
|
112
|
+
|
|
113
|
+
not_StORF_Only_Cluster_IDs = []
|
|
114
|
+
|
|
115
|
+
Combined_clusters = collections.OrderedDict()
|
|
116
|
+
Combined_reps = collections.OrderedDict()
|
|
117
|
+
first = True
|
|
118
|
+
###############
|
|
119
|
+
## We load in the combined PEP and StORF_Reporter data separately
|
|
120
|
+
for line in StORF_In:
|
|
121
|
+
if line.startswith('>'):
|
|
122
|
+
if first == False:
|
|
123
|
+
cluster_size = len(Combined_clusters[cluster_id])
|
|
124
|
+
Combined_reps.update({rep: cluster_size})
|
|
125
|
+
# if len(Combined_pangenome_clusters_PEP_SEQS[cluster_id]) > 1:
|
|
126
|
+
# print("Here")
|
|
127
|
+
if len(Combined_pangenome_clusters_StORF_SEQS[cluster_id]) > 0 and len(Combined_pangenome_clusters_PEP_SEQS[cluster_id]) > 0:
|
|
128
|
+
if len(Combined_pangenome_clusters_PEP_SEQS[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
|
|
129
|
+
all_but_first = Combined_pangenome_clusters_PEP_SEQS[cluster_id][1:]
|
|
130
|
+
storfs_clustered = Combined_pangenome_clusters_StORF_SEQS[cluster_id]
|
|
131
|
+
VALUE = all_but_first+storfs_clustered
|
|
132
|
+
else:
|
|
133
|
+
VALUE = Combined_pangenome_clusters_StORF_SEQS[cluster_id]
|
|
134
|
+
KEY = Combined_pangenome_clusters_PEP_SEQS[cluster_id][0]
|
|
135
|
+
Combined_pangenome_clusters_PEP_StORF_Clustered.update({KEY:VALUE})
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
## Below needs to be rewritten. With >1 genus - be able to record multiple PEPs for each combined...
|
|
139
|
+
# if len(Combined_pangenome_clusters_StORF_Genera[cluster_id]) > 0 and len(Combined_pangenome_clusters_PEP_Genera[cluster_id]) > 0:
|
|
140
|
+
# KEY = Combined_pangenome_clusters_PEP_SEQS[cluster_id][0]
|
|
141
|
+
# VALUE = Combined_pangenome_clusters_StORF_SEQS[cluster_id]
|
|
142
|
+
# Combined_pangenome_clusters_PEP_StORF_Clustered_Genera.update({KEY:VALUE})
|
|
143
|
+
if len(Combined_clusters[cluster_id]) == 1: # Stop at clusters smaller than 10
|
|
144
|
+
print("First Singleton Cluster is: " +str(cluster_id))
|
|
145
|
+
break
|
|
146
|
+
cluster_id = line.strip('>')
|
|
147
|
+
cluster_id = cluster_id.strip('\n')
|
|
148
|
+
cluster_id = cluster_id.split(' ')[1]
|
|
149
|
+
Combined_clusters.update({cluster_id: []})
|
|
150
|
+
# Combined_pangenome_clusters_PEP_Genera.update({cluster_id:[]})
|
|
151
|
+
# Combined_pangenome_clusters_PEP_Species.update({cluster_id: []})
|
|
152
|
+
Combined_pangenome_clusters_PEP_Strains.update({cluster_id: []})
|
|
153
|
+
Combined_pangenome_clusters_PEP_SEQS.update({cluster_id: []})
|
|
154
|
+
#
|
|
155
|
+
Combined_pangenome_clusters_StORF_Genera.update({cluster_id: []})
|
|
156
|
+
# Combined_pangenome_clusters_StORF_Species.update({cluster_id: []})
|
|
157
|
+
Combined_pangenome_clusters_StORF_Strains.update({cluster_id: []})
|
|
158
|
+
Combined_pangenome_clusters_StORF_SEQS.update({cluster_id: []})
|
|
159
|
+
first = False
|
|
160
|
+
else:
|
|
161
|
+
clustered = line.split('\t')[1]
|
|
162
|
+
clustered = clustered.split('>')[1]
|
|
163
|
+
clustered = clustered.split('...')[0]
|
|
164
|
+
if '*' in line:
|
|
165
|
+
rep = clustered
|
|
166
|
+
Combined_reps.update({rep:0})
|
|
167
|
+
if first == False:
|
|
168
|
+
Combined_clusters[cluster_id].append(clustered)
|
|
169
|
+
clustered_genus = get_Genus(clustered)
|
|
170
|
+
clustered_species = get_Species(clustered)
|
|
171
|
+
clustered_strain = clustered.split('|')[0]
|
|
172
|
+
if '_' in clustered_strain[0]: # Remove name error
|
|
173
|
+
clustered_strain = clustered_strain.split('_')[1]
|
|
174
|
+
|
|
175
|
+
if "StORF_Type" in line:
|
|
176
|
+
# if cluster_id not in clusters_With_Con_StORFs: # For counting?
|
|
177
|
+
# clusters_With_Con_StORFs.append(cluster_id)
|
|
178
|
+
if clustered_genus not in Combined_pangenome_clusters_StORF_Genera[cluster_id]:
|
|
179
|
+
Combined_pangenome_clusters_StORF_Genera[cluster_id].append(clustered_genus)
|
|
180
|
+
# if clustered_species not in Combined_pangenome_clusters_StORF_Species[cluster_id]:
|
|
181
|
+
# Combined_pangenome_clusters_StORF_Species[cluster_id].append(clustered_species)
|
|
182
|
+
if clustered_strain not in Combined_pangenome_clusters_StORF_Strains[cluster_id]:
|
|
183
|
+
Combined_pangenome_clusters_StORF_Strains[cluster_id].append(clustered_strain)
|
|
184
|
+
Combined_pangenome_clusters_StORF_SEQS[cluster_id].append(clustered)
|
|
185
|
+
#
|
|
186
|
+
else:
|
|
187
|
+
# if clustered_genus not in Combined_pangenome_clusters_PEP_Genera[cluster_id]:
|
|
188
|
+
# Combined_pangenome_clusters_PEP_Genera[cluster_id].append(clustered_genus)
|
|
189
|
+
# if clustered_species not in Combined_pangenome_clusters_PEP_Species[cluster_id]:
|
|
190
|
+
# Combined_pangenome_clusters_PEP_Species[cluster_id].append(clustered_species)
|
|
191
|
+
if clustered_strain not in Combined_pangenome_clusters_PEP_Strains[cluster_id]:
|
|
192
|
+
Combined_pangenome_clusters_PEP_Strains[cluster_id].append(clustered_strain)
|
|
193
|
+
if cluster_id not in not_StORF_Only_Cluster_IDs:
|
|
194
|
+
not_StORF_Only_Cluster_IDs.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
|
|
195
|
+
Combined_pangenome_clusters_PEP_SEQS[cluster_id].append(clustered)
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
###HERE for tomorrow - copy the updated work from single to here and repeat for genus,species and strain
|
|
200
|
+
list_of_reps = list(reps.keys())
|
|
201
|
+
num_clustered_PEP_Genera = collections.defaultdict(list)
|
|
202
|
+
recorded_PEP = []
|
|
203
|
+
################################# Genera
|
|
204
|
+
pangenome_clusters_Type_Genera = copy.deepcopy(pangenome_clusters_PEP_Genera)
|
|
205
|
+
pangenome_clusters_Type_Strains = collections.defaultdict(list)
|
|
206
|
+
|
|
207
|
+
for cluster, pep_genomes in pangenome_clusters_PEP_Genera.items():
|
|
208
|
+
recorded_PEP.append(cluster)
|
|
209
|
+
rep = list_of_reps[int(cluster)]
|
|
210
|
+
Com_PEPs = 0
|
|
211
|
+
Com_PEP_Genomes = 0
|
|
212
|
+
StORFs = 0
|
|
213
|
+
Added_StORF_Genera = 0
|
|
214
|
+
seen_clust_Strains = []
|
|
215
|
+
|
|
216
|
+
PEP_Strains = pangenome_clusters_PEP_Strains[cluster]
|
|
217
|
+
for clustered_strain in PEP_Strains:
|
|
218
|
+
if '_' in clustered_strain[0]: # Remove name error
|
|
219
|
+
clustered_strain = clustered_strain[1:]
|
|
220
|
+
if clustered_strain not in seen_clust_Strains:
|
|
221
|
+
seen_clust_Strains.append(clustered_strain)
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
try:
|
|
225
|
+
clustered_combined = Combined_pangenome_clusters_PEP_StORF_Clustered[rep]
|
|
226
|
+
seen_clust_Genera = []
|
|
227
|
+
num_clustered_PEP_Genera[cluster].append(rep + '_' + str(len(pep_genomes)))
|
|
228
|
+
for clust in clustered_combined:
|
|
229
|
+
if 'StORF_Type' not in clust:
|
|
230
|
+
### Need to get the number of pep genomes for each pep clustered into this
|
|
231
|
+
Com_PEPs += 1
|
|
232
|
+
clustered_genus = get_Genus(clust)
|
|
233
|
+
#clust_Genome = clust.split('|')[0]
|
|
234
|
+
if clustered_genus not in seen_clust_Genera:
|
|
235
|
+
seen_clust_Genera.append(clustered_genus)
|
|
236
|
+
if clustered_genus not in pep_genomes:
|
|
237
|
+
Com_PEP_Genomes += 1
|
|
238
|
+
try:
|
|
239
|
+
num_clustered_PEP_Genera[cluster].append(clust + '_' + str(reps[clust][1]))
|
|
240
|
+
except TypeError:
|
|
241
|
+
sys.exit("Broken")
|
|
242
|
+
|
|
243
|
+
elif 'StORF_Type' in clust:
|
|
244
|
+
StORFs += 1
|
|
245
|
+
clustered_genus = get_Genus(clust)
|
|
246
|
+
#clust_Genome = clust.split('|')[0]
|
|
247
|
+
if clustered_genus not in seen_clust_Genera:
|
|
248
|
+
seen_clust_Genera.append(clustered_genus)
|
|
249
|
+
if clustered_genus not in pep_genomes:
|
|
250
|
+
Added_StORF_Genera += 1
|
|
251
|
+
else:
|
|
252
|
+
print("WHAT")
|
|
253
|
+
|
|
254
|
+
size_of_pep_clusters = []
|
|
255
|
+
peps = num_clustered_PEP_Genera[cluster]
|
|
256
|
+
for pep in peps:
|
|
257
|
+
pep = pep.rsplit('_', 1)
|
|
258
|
+
size_of_pep_clusters.append(int(pep[1]))
|
|
259
|
+
pangenome_clusters_Type_Genera[cluster] = [len(num_clustered_PEP_Genera[cluster]), sum(size_of_pep_clusters),
|
|
260
|
+
size_of_pep_clusters, Added_StORF_Genera, StORFs]
|
|
261
|
+
pangenome_clusters_Type_Strains[cluster] = seen_clust_Strains
|
|
262
|
+
except KeyError:
|
|
263
|
+
###Singleton
|
|
264
|
+
num_pep_genomes = [len(pep_genomes)]
|
|
265
|
+
pangenome_clusters_Type_Genera[cluster] = [1, len(pep_genomes), num_pep_genomes, Added_StORF_Genera, StORFs]
|
|
266
|
+
pangenome_clusters_Type_Strains[cluster] = seen_clust_Strains
|
|
267
|
+
|
|
268
|
+
print("S")
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
#######################################
|
|
272
|
+
|
|
273
|
+
Without_StORF = open('./Ensem_Clusters_Without_StORFs_To_Be_Nogged_min2','w')
|
|
274
|
+
With_StORF = open('./Ensem_Clusters_With_StORFs_To_Be_Nogged','w')
|
|
275
|
+
#With_Extending_StORF = open('./Ensem_Clusters_With_Extending_Con-StORFs_To_Be_Nogged','w')
|
|
276
|
+
|
|
277
|
+
for key, value in pangenome_clusters_Type_Genera.items():
|
|
278
|
+
pep_strains = pangenome_clusters_Type_Strains[key]
|
|
279
|
+
if value[4] == 0 and len(pep_strains) >=2:
|
|
280
|
+
Without_StORF.write(str(key)+',')
|
|
281
|
+
# elif value[3] != 0:
|
|
282
|
+
# With_Extending_StORF.write(str(key)+',')
|
|
283
|
+
# With_StORF.write(str(key) + ',')
|
|
284
|
+
elif value[4] >=1:
|
|
285
|
+
With_StORF.write(str(key) + ',')
|
|
286
|
+
|
|
287
|
+
With_StORF.close()
|
|
288
|
+
Without_StORF.close()
|
|
289
|
+
#With_Extending_StORF.close()
|
|
290
|
+
|
|
291
|
+
############## Typing for the StORF_Reporter-Data
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
multi_PEP_Combined_By_StORFs = collections.OrderedDict()
|
|
295
|
+
|
|
296
|
+
StORF_Seqs_Extended = []
|
|
297
|
+
StORF_Genomes_Extended = []
|
|
298
|
+
|
|
299
|
+
####################################
|
|
300
|
+
#cores = collections.OrderedDict({'pep_genera_single':[],'pep_genera_multi':[],'extended_genera':[],'comb_extended_genera_single':[],'comb_extended_genera_multi':[],'extended_genera_single':[],'extended_genera_multi':0,'storf_genera_single':0,'storf_genera_multi':0,
|
|
301
|
+
# 'only_storf_genera_single':0,'only_storf_genera_multi':0})
|
|
302
|
+
|
|
303
|
+
cores = collections.OrderedDict({'pep_genera':[],'extended_genera_single_pep':[],'many_extended_genera_pep':[],'extended_genera':[],'comb_extended_genera':[],'storf_genera':[],'only_storf_genera':[],'only_storf_genera_recording':[]})
|
|
304
|
+
|
|
305
|
+
extended = collections.OrderedDict()
|
|
306
|
+
############################
|
|
307
|
+
|
|
308
|
+
clsuters_to_be_validated = collections.defaultdict(list)
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
############################
|
|
312
|
+
def calc_pep_only(pep_num):
    """Record the genus count of a cluster that is counted as PEP-only.

    Appends *pep_num* (number of genera contributed by PEP sequences) to
    the module-level ``cores['pep_genera']`` tally list.
    """
    cores['pep_genera'].append(pep_num)
    # Earlier single/multi split kept for reference:
    # if pep_num == 1:# and StORF_num == 0:
    #     cores['pep_genera_single'] += 1
    # elif pep_num > 1:# and StORF_num == 0:
    #     cores['pep_genera_multi'] += 1
|
|
318
|
+
##########################
|
|
319
|
+
def calc_pep_extended_StORF(cluster, pep_num, storf_num):
    """Tally a cluster whose PEP genera were extended by StORF genera.

    Updates the module-level ``cores`` counters and records the cluster id
    in ``clsuters_to_be_validated`` for later inspection. The three checks
    are cumulative, not exclusive: a cluster with storf_num >= 10 is also
    counted in 'extended_genera'.
    """
    if pep_num != 0 and storf_num >= 1:
        # Any StORF-added genus on top of at least one PEP genus.
        cores['extended_genera'].append(pep_num+storf_num)
        clsuters_to_be_validated['extended_genera'].append(cluster)
    if pep_num != 0 and storf_num >= 10:
        # Heavily extended clusters are recorded with their cluster id.
        cores['many_extended_genera_pep'].append([cluster,pep_num+storf_num])

    if pep_num == 1 and storf_num >= 1:
        # Single-PEP-genus clusters extended by StORFs.
        cores['extended_genera_single_pep'].append([cluster,pep_num + storf_num])
    # cores['extended_genera_single'] +=1
    # if pep_num != 0 and storf_num > 1:
    #     cores['extended_genera_multi'] +=1
|
|
331
|
+
##########################
|
|
332
|
+
def calc_multi_pep_extended_StORF(cluster, number_of_pep_clustered, pep_num, storf_num):
    """Tally a cluster where StORFs combined multiple PEP families.

    NOTE(review): *number_of_pep_clustered* is currently unused — presumably
    intended for a per-PEP breakdown; confirm against the caller.
    """
    if pep_num !=0 and storf_num >= 1:
        cores['comb_extended_genera'].append(pep_num+storf_num)
        clsuters_to_be_validated['comb_extended_genera'].append(cluster)
|
|
336
|
+
|
|
337
|
+
#########################
|
|
338
|
+
def calc_StORF_only_when_with_pep(cluster, storf_num):
    """Tally the StORF genus count for a cluster that also contains PEPs.

    Appends *storf_num* to ``cores['storf_genera']`` and records the
    cluster id in ``clsuters_to_be_validated`` for later validation.
    """
    cores['storf_genera'].append(storf_num)
    clsuters_to_be_validated['storf_genera'].append(cluster)
    # Earlier single/multi split kept for reference:
    # if storf_num == 1:# and StORF_num == 0:
    #     cores['storf_genera_single'] += 1
    # elif storf_num > 1:# and StORF_num == 0:
    #     cores['storf_genera_multi'] += 1
|
|
345
|
+
######################## What is the difference with these?
|
|
346
|
+
def calc_only_StORF(cluster, storf_num, max_storf_only_genera):  # only count the true storf onlies
    """Tally a cluster made up exclusively of StORF sequences.

    Appends *storf_num* (genus count) to ``cores['only_storf_genera']``,
    records the cluster id for validation, keeps a list of large (>=6
    genera) StORF-only clusters, and returns the running maximum genus
    count seen so far (callers thread the maximum through this return
    value rather than using the module-level global).
    """
    cores['only_storf_genera'].append(storf_num)
    clsuters_to_be_validated['only_storf_genera'].append(cluster)
    if storf_num>=6:
        # Track notably large StORF-only clusters with their cluster id.
        cores['only_storf_genera_recording'].append([cluster, storf_num])
    if storf_num > max_storf_only_genera:
        max_storf_only_genera = storf_num
    # Earlier single/multi split kept for reference:
    # if storf_num == 1:# and StORF_num == 0:
    #     cores['only_storf_genera_single'] += 1
    # elif storf_num > 1:# and StORF_num == 0:
    #     cores['only_storf_genera_multi'] += 1
    return max_storf_only_genera
|
|
358
|
+
#########################
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
###########################
|
|
363
|
+
print("Running")
|
|
364
|
+
check_all_calced = 0
|
|
365
|
+
for cluster, numbers in pangenome_clusters_Type_Genera.items():
|
|
366
|
+
pep_strains = pangenome_clusters_Type_Strains[cluster]
|
|
367
|
+
if numbers[3] >=1:
|
|
368
|
+
StORF_Genomes_Extended.append(numbers[3])
|
|
369
|
+
if numbers[4] >=1:
|
|
370
|
+
StORF_Seqs_Extended.append(numbers[4])
|
|
371
|
+
############################### Calc PEP only
|
|
372
|
+
if numbers[0] == 1 and len(pep_strains) >= 2: # If StORFs did not combine PEP reps
|
|
373
|
+
calc_pep_only(numbers[1])#,numbers[3])
|
|
374
|
+
check_all_calced +=1
|
|
375
|
+
elif numbers[0] >1: # IF StORFs combined multiple PEP
|
|
376
|
+
calc_pep_only(numbers[2][0])
|
|
377
|
+
check_all_calced += 1
|
|
378
|
+
# for num in numbers[2]:
|
|
379
|
+
# calc_pep_only(num) # ,numbers[3])
|
|
380
|
+
|
|
381
|
+
############################# Calc PEP and StORF_Reporter
|
|
382
|
+
if numbers[0] == 1 and numbers[3] >1: # If StORFs did not combine PEP reps
|
|
383
|
+
calc_pep_extended_StORF(cluster,numbers[1],numbers[3])
|
|
384
|
+
extended.update({cluster:numbers})
|
|
385
|
+
check_all_calced += 1
|
|
386
|
+
elif numbers[0] >1 and numbers[3] >1: # IF StORFs combined multiple PEP - Genera added
|
|
387
|
+
#grouped_pep = sum(numbers[2])
|
|
388
|
+
#for num in numbers[2]:
|
|
389
|
+
calc_multi_pep_extended_StORF(cluster,numbers[2],numbers[1],numbers[3]) # same here
|
|
390
|
+
print("combined: " + str(cluster))
|
|
391
|
+
|
|
392
|
+
extended.update({cluster: numbers})
|
|
393
|
+
check_all_calced += 1
|
|
394
|
+
elif numbers[0] >1 and numbers[4] >1: # IF StORFs combined multiple PEP
|
|
395
|
+
multi_PEP_Combined_By_StORFs.update({cluster: numbers})
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
import os
|
|
399
|
+
###########################
|
|
400
|
+
############################### Calc StORF_Reporter only
|
|
401
|
+
Combined_pangenome_clusters_ONLY_StORF_Type = collections.defaultdict(list)
|
|
402
|
+
Combined_pangenome_clusters_StORF_Type = collections.defaultdict(list)
|
|
403
|
+
|
|
404
|
+
biggest_genera = ""
|
|
405
|
+
big_genera = 0
|
|
406
|
+
biggest_strains = ""
|
|
407
|
+
big_strains = 0
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
#Without_StORF = open('./Ensem_Clusters_Without_Con-StORFs_To_Be_Nogged_min2','w')
|
|
411
|
+
#With_StORF = open('./Ensem_Clusters_With_Con-StORFs_To_Be_Nogged','w')
|
|
412
|
+
#With_Extending_StORF = open('./Ensem_Clusters_With_Extending_Con-StORFs_To_Be_Nogged','w')
|
|
413
|
+
StORF_Only = open("./StORF_Only_Clusters_To_Be_Nogged_min2",'w')
|
|
414
|
+
|
|
415
|
+
for cluster, genera in Combined_pangenome_clusters_StORF_Genera.items():
|
|
416
|
+
storf_strains = Combined_pangenome_clusters_StORF_Strains[cluster]
|
|
417
|
+
pep_strains = Combined_pangenome_clusters_PEP_Strains[cluster]
|
|
418
|
+
if cluster in not_StORF_Only_Cluster_IDs:
|
|
419
|
+
Combined_pangenome_clusters_StORF_Type[cluster] = [cluster,len(genera)]
|
|
420
|
+
#if len(genera) >= 1:
|
|
421
|
+
calc_StORF_only_when_with_pep(cluster,len(genera)) # ,numbers[3])
|
|
422
|
+
else:
|
|
423
|
+
if len(storf_strains) >= 2:
|
|
424
|
+
StORF_Only.write(str(cluster) + ',')
|
|
425
|
+
Combined_pangenome_clusters_ONLY_StORF_Type[cluster] = [cluster,len(genera)]
|
|
426
|
+
max_storf_only_genera = calc_only_StORF(cluster,len(genera),max_storf_only_genera)
|
|
427
|
+
if len(genera) > big_genera:
|
|
428
|
+
big_genera = len(genera)
|
|
429
|
+
biggest_genera = cluster
|
|
430
|
+
if len(storf_strains) >= big_strains:
|
|
431
|
+
big_strains = len(storf_strains)
|
|
432
|
+
biggest_strains = cluster
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
print("Biggest: " +biggest_genera)
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
############################### Calc StORF_Reporter only
|
|
446
|
+
# for cluster, data in Combined_pangenome_clusters_StORF_Type.items():
|
|
447
|
+
# if data[1] >=1:
|
|
448
|
+
# calc_StORF_only_when_with_pep(data[1]) # ,numbers[3])
|
|
449
|
+
#
|
|
450
|
+
#
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
#################
|
|
454
|
+
print(cores)
|
|
455
|
+
#print(extended)
|
|
456
|
+
|
|
457
|
+
from collections import Counter
|
|
458
|
+
|
|
459
|
+
#print(Counter(cores['pep_genera']))
|
|
460
|
+
#print(Counter(cores['extended_genera']))
|
|
461
|
+
#print(Counter(cores['comb_extended_genera']))
|
|
462
|
+
#print(Counter(cores['storf_genera']))
|
|
463
|
+
print(Counter(cores['only_storf_genera']))
|
|
464
|
+
print(cores['only_storf_genera_recording'])
|
|
465
|
+
|
|
466
|
+
print("END")
|
|
467
|
+
|
|
468
|
+
### Empty file ready for interesting StORFs
|
|
469
|
+
# interesting_out = "./StORF_Only_Clusters_To_Be_Swissed.fa"
|
|
470
|
+
# with open(interesting_out, 'r+') as f:
|
|
471
|
+
# f.truncate(4)
|
|
472
|
+
# for cluster, data in Combined_pangenome_clusters_ONLY_StORF_Type.items():
|
|
473
|
+
# #if number >1:
|
|
474
|
+
# if data[1] >=1:
|
|
475
|
+
# calc_only_StORF(data[1]) # ,numbers[3])
|
|
476
|
+
# # if data[1] >= 2:
|
|
477
|
+
# # print("Interesting:" + str(cluster))
|
|
478
|
+
# # os.system(
|
|
479
|
+
# # "python3 Extract_FASTA_From_Cluster.py -f ./All_Ensem_Filtered_PEP_Clustered_With_Unclustered_UR-StORFS_s.fa_CD_c90_s60.fa "
|
|
480
|
+
# # "-c ./All_Ensem_Filtered_PEP_Clustered_With_Unclustered_UR-StORFS_s.fa_CD_c90_s60.clstr -id " + str(
|
|
481
|
+
# # data[0]) + " -o "+ interesting_out)
|
|
482
|
+
# #
|
|
483
|
+
#
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
#
|
|
489
|
+
# ################################## Species
|
|
490
|
+
# pangenome_clusters_Type_Species = copy.deepcopy(pangenome_clusters_PEP_Species)
|
|
491
|
+
# for cluster, genomes in pangenome_clusters_PEP_Species.items():
|
|
492
|
+
# print(str(len(genomes)) + '\t' + str(len(pangenome_clusters_StORF_Species[cluster])))
|
|
493
|
+
# Con_StORFs = pangenome_clusters_StORF_Species[cluster]
|
|
494
|
+
# unique_con = 0
|
|
495
|
+
# all_con = 0
|
|
496
|
+
# for con in Con_StORFs:
|
|
497
|
+
# all_con +=1
|
|
498
|
+
# if con not in genomes:
|
|
499
|
+
# unique_con +=1
|
|
500
|
+
# pangenome_clusters_Type_Species[cluster] = [len(genomes),all_con,unique_con]
|
|
501
|
+
# ################################# Strains
|
|
502
|
+
# pangenome_clusters_Type_Strains = copy.deepcopy(pangenome_clusters_PEP_Strains)
|
|
503
|
+
# for cluster, genomes in pangenome_clusters_PEP_Strains.items():
|
|
504
|
+
# print(str(len(genomes))+'\t'+str(len(pangenome_clusters_StORF_Strains[cluster])))
|
|
505
|
+
# Con_StORFs = pangenome_clusters_StORF_Strains[cluster]
|
|
506
|
+
# unique_con = 0
|
|
507
|
+
# all_con = 0
|
|
508
|
+
# for con in Con_StORFs:
|
|
509
|
+
# all_con +=1
|
|
510
|
+
# if con not in genomes:
|
|
511
|
+
# unique_con +=1
|
|
512
|
+
# pangenome_clusters_Type_Strains[cluster] = [len(genomes),all_con,unique_con]
|
|
513
|
+
# ###################################
|
|
514
|
+
# Chris_Out = open('./Chris_Clusters.txt','w')
|
|
515
|
+
#
|
|
516
|
+
# clusters_For_Chris = collections.OrderedDict()
|
|
517
|
+
# clusters_For_Chris_PEP_0 = collections.OrderedDict()
|
|
518
|
+
#
|
|
519
|
+
# Chris_Out.write("Cluster\tSize\tEnsem_Genera_Num\tCon-StORF_Genera_Num\tCon-StORF_Only_Genera_Num\tEnsem_Species_Num\tCon-StORF_Species_Num\tCon-StORF_Only_Species_Num\tEnsem_Strain_Num\tCon-StORF_Strain_Num\tCon-StORF_Only_Strain_Num\n")
|
|
520
|
+
# #This for-loop will go through ALL Clusters allowing for the extraction of ALL different groupings
|
|
521
|
+
# for cluster, data in clusters.items():
|
|
522
|
+
# genera_numbers = pangenome_clusters_Type_Genera[cluster]
|
|
523
|
+
# species_numbers = pangenome_clusters_Type_Species[cluster]
|
|
524
|
+
# strain_numbers = pangenome_clusters_Type_Strains[cluster]
|
|
525
|
+
#
|
|
526
|
+
# Chris_Out.write(str(cluster)+'\t'+str(len(data))+'\t'+str(genera_numbers[0])+'\t'+str(genera_numbers[1])+'\t'+str(genera_numbers[2])+'\t'+str(species_numbers[0])+'\t'
|
|
527
|
+
# +str(species_numbers[1])+'\t'+str(species_numbers[2])+'\t'+str(strain_numbers[0])+'\t'+str(strain_numbers[1])+'\t'+str(species_numbers[2])+'\n')
|
|
528
|
+
|
|
529
|
+
# if cluster in clusters_With_Con_StORFs:
|
|
530
|
+
# print("Current")
|
|
531
|
+
# size_Of_Cluster = len(clusters[cluster])
|
|
532
|
+
# ensem_Num = 0
|
|
533
|
+
# con_StORF_Num = 0
|
|
534
|
+
# for i in clusters[cluster]:
|
|
535
|
+
# print(i)
|
|
536
|
+
# if 'Con-Stop' in i:
|
|
537
|
+
# con_StORF_Num +=1
|
|
538
|
+
# else:
|
|
539
|
+
# ensem_Num +=1
|
|
540
|
+
# clusters_For_Chris.update({cluster:[size_Of_Cluster,pep_Num,con_StORF_Num,numbers[0],numbers[1]]})
|
|
541
|
+
# ############# Add - Num of
|
|
542
|
+
# Chris_Out.write(str(cluster)+'\t'+str(size_Of_Cluster)+'\t'+str(pep_Num)+'\t'+str(ensem_Genera)+ str(con_StORF_Num)+'\t'+str(numbers[0])+'\t'+str(numbers[1])+'\n')
|
|
543
|
+
# if pep_Num == 0:
|
|
544
|
+
# clusters_For_Chris_PEP_0.update({cluster:[size_Of_Cluster,pep_Num,con_StORF_Num,numbers[0],numbers[1]]})
|
|
545
|
+
|
|
546
|
+
print("Da Da!!!!")
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
#
|
|
550
|
+
#
|
|
551
|
+
#
|
|
552
|
+
#
|
|
553
|
+
# ###################################
|
|
554
|
+
#
|
|
555
|
+
# core_99 = 9.9/10 * len(genome_dict)
|
|
556
|
+
# core_95 = 9.5/10 * len(genome_dict)
|
|
557
|
+
# core_90 = 9/10 * len(genome_dict)
|
|
558
|
+
# core_15 = 1.5/10 * len(genome_dict)
|
|
559
|
+
#
|
|
560
|
+
# pep_core_99 = 0
|
|
561
|
+
# pep_core_95 = 0
|
|
562
|
+
# pep_core_90 = 0
|
|
563
|
+
# pep_core_15 = 0
|
|
564
|
+
#
|
|
565
|
+
#
|
|
566
|
+
# extended_99 = 0
|
|
567
|
+
# extended_95 = 0
|
|
568
|
+
# extended_90 = 0
|
|
569
|
+
# extended_15 = 0
|
|
570
|
+
# ############### Needs to be redone with new 'numbers'
|
|
571
|
+
# for cluster, numbers in pangenome_clusters_Type_Genera.items():
|
|
572
|
+
# if numbers[0] >= math.floor(core_99) and numbers[1] == 0:
|
|
573
|
+
# pep_core_99 +=1
|
|
574
|
+
# elif numbers[0] >= math.floor(core_95) and numbers[0] < math.floor(core_99) and numbers[1] == 0:
|
|
575
|
+
# pep_core_95 +=1
|
|
576
|
+
# elif numbers[0] >= math.floor(core_90) and numbers[0] < math.floor(core_95) and numbers[1] == 0:
|
|
577
|
+
# pep_core_90 +=1
|
|
578
|
+
# if numbers[0] >= math.floor(core_15) and numbers[0] < math.floor(core_95) and numbers[1] == 0: # this catch captures some from pep_core_90
|
|
579
|
+
# pep_core_15 +=1
|
|
580
|
+
# ############ With Con-StORFs
|
|
581
|
+
# if numbers[0] < math.floor(core_99) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_99):
|
|
582
|
+
# extended_99 +=1
|
|
583
|
+
# elif numbers[0] < math.floor(core_95) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_95) and numbers[0]+numbers[1] < math.floor(core_99):
|
|
584
|
+
# extended_95 +=1
|
|
585
|
+
# elif numbers[0] < math.floor(core_90) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_90) and numbers[0]+numbers[1] < math.floor(core_95):
|
|
586
|
+
# extended_90 +=1
|
|
587
|
+
# if numbers[0] < math.floor(core_15) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_15) and numbers[0]+numbers[1] < math.floor(core_95):
|
|
588
|
+
# extended_15 +=1
|
|
589
|
+
#
|
|
590
|
+
# print("Out")
|
|
591
|
+
# print(pep_core_99)
|
|
592
|
+
# print(pep_core_95)
|
|
593
|
+
# print(pep_core_90)
|
|
594
|
+
# print(pep_core_15)
|
|
595
|
+
#
|
|
596
|
+
# print(extended_99)
|
|
597
|
+
# print(extended_95)
|
|
598
|
+
# print(extended_90)
|
|
599
|
+
# print(extended_15)
|
|
600
|
+
#
|
PyamilySeq/Constants.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
PyamilySeq_Version = 'v0.0.1'
|
|
@@ -0,0 +1,586 @@
|
|
|
1
|
+
#from line_profiler_pycharm import profile
|
|
2
|
+
|
|
3
|
+
from collections import OrderedDict,defaultdict
|
|
4
|
+
import copy
|
|
5
|
+
import math
|
|
6
|
+
import sys
|
|
7
|
+
import argparse
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
from .Constants import *
|
|
12
|
+
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
13
|
+
from Constants import *
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def custom_sort_key(k, dict1, dict2):
    """Sort key for *k*: (len of dict1[k], len of dict2[k]) as a tuple."""
    primary = len(dict1[k])
    secondary = len(dict2[k])
    return (primary, secondary)
|
|
18
|
+
|
|
19
|
+
def sort_keys_by_values(dict1, dict2):
    """Return dict1's keys ordered largest-first by (len(dict1[k]), len(dict2[k]))."""
    return sorted(dict1.keys(),
                  key=lambda k: (len(dict1[k]), len(dict2[k])),
                  reverse=True)
|
|
22
|
+
|
|
23
|
+
def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
    """Write a Roary-style gene_presence_absence.csv next to the input file.

    One row per cluster: 14 fixed columns (Gene .. Avg group size nuc) followed
    by one column per genome listing that genome's member sequences (tab
    separated within the cell, empty cell when the genome is absent).

    Fixes over the original: the file handle is now closed (with-block), the
    header no longer contains a doubled quote, the Annotation column is
    emitted so the fixed columns align with the header, and every genome gets
    a cell (absent genomes previously got no cell at all, shifting columns).
    """
    print("Outputting gene_presence_absence file")
    in_name = options.clusters.split('.')[0]
    fixed_headers = ['Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates',
                     'No. sequences', 'Avg sequences per isolate', 'Genome Fragment',
                     'Order within Fragment', 'Accessory Fragment',
                     'Accessory Order with Fragment', 'QC', 'Min group size nuc',
                     'Max group size nuc', 'Avg group size nuc']
    with open(in_name + '_gene_presence_absence.csv', 'w') as gpa_outfile:
        header = fixed_headers + list(genome_dict.keys())
        gpa_outfile.write(','.join('"%s"' % h for h in header) + '\n')
        for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
            genomes = pangenome_clusters_First_sorted[cluster]
            average_sequences_per_genome = len(sequences) / len(genomes)
            # 14 fixed columns; only Gene, No. isolates, No. sequences and the
            # average are populated (as in the original output).
            row = ['group_' + str(cluster), '', '', str(len(genomes)),
                   str(len(sequences)), str(average_sequences_per_genome),
                   '', '', '', '', '', '', '', '']
            for genome in genome_dict.keys():
                members = [value for value in sequences if value.split('|')[0] == genome]
                row.append('\t'.join(members))  # empty string when genome absent
            gpa_outfile.write(','.join('"%s"' % cell for cell in row) + '\n')
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def reorder_dict_by_keys(original_dict, sorted_keys):
    """Return a new dict with original_dict's entries in sorted_keys order."""
    reordered = {}
    for key in sorted_keys:
        reordered[key] = original_dict[key]
    return reordered
|
|
64
|
+
|
|
65
|
+
def get_cores(options,genome_dict):
    """Build core-group band definitions and zeroed counters.

    Each percentage in options.core_groups (comma-separated, e.g. '99,80,15,0')
    becomes a band: groups[pct] = (lower, upper), an inclusive genome-count
    range where lower = floor(pct% of total genomes) and upper is one below
    the previous (larger) band's lower bound (the first band tops out at the
    total genome count). cores holds a zeroed 'first_core_<pct>' counter per
    band, plus extended/combined/second/only_second counters when a second
    clustering round was supplied (options.reclustered).

    Returns (cores, groups), both OrderedDicts in input order.
    """
    ##Calculate core groups
    groups = OrderedDict()
    cores = OrderedDict()
    prev_top = len(genome_dict)  # upper bound of the first (largest) band
    first = True
    for group in options.core_groups.split(','):
        calculated_floor = math.floor(int(group) / 100 * len(genome_dict))
        if first == False:
            # Later bands end one genome below the previous band's floor,
            # so the (lower, upper) ranges never overlap.
            groups[group] = (calculated_floor,prev_top -1)
        else:
            groups[group] = (calculated_floor, prev_top)
        first = False
        prev_top = calculated_floor
        first_core_group = 'first_core_' + group
        cores[first_core_group] = 0
        if options.reclustered != None:
            # Extra counter families are only meaningful with a second round.
            extended_core_group = 'extended_core_' + group
            cores[extended_core_group] = 0
            combined_core_group = 'combined_core_' + group
            cores[combined_core_group] = 0
            second_core_group = 'second_core_' + group
            cores[second_core_group] = 0
            only_second_core_group = 'only_second_core_' + group
            cores[only_second_core_group] = 0
    return cores, groups
|
|
91
|
+
|
|
92
|
+
#@profile
|
|
93
|
+
def calc_First_only_core(pep_num, groups, cores):
    """Bin a first-round family spanning pep_num genomes into its core band.

    groups maps band name -> inclusive (lower, upper) genome-count range;
    cores maps counter name -> running count (mutated in place).

    Fix: the original left `res` unbound (NameError) when no band matched;
    a non-matching count is now simply left uncounted.
    """
    res = None
    # Keep the LAST matching band index, exactly as the original
    # generator-loop assignment did.
    for idx, (sec, fir) in enumerate(groups.values()):
        if sec <= pep_num <= fir:
            res = idx
    if res is None:
        return  # no band matched - nothing to count
    family_group = list(groups)[res]
    cores['first_core_' + family_group] += 1
|
|
99
|
+
|
|
100
|
+
#@profile
|
|
101
|
+
def calc_single_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count gene families extended with StORFs
    """Bin a family whose genome span grew from pep_num to pep_num+second_num
    thanks to second-round sequences; increments the 'extended_core_' counter.

    Fix: the original left `res` unbound (NameError) when no band matched;
    a non-matching total is now simply left uncounted.
    """
    total = pep_num + second_num
    res = None
    # Keep the LAST matching band index, as the original generator-loop did.
    for idx, (sec, fir) in enumerate(groups.values()):
        if sec <= total <= fir:
            res = idx
    if res is None:
        return  # no band matched - nothing to count
    family_group = list(groups)[res]
    cores['extended_core_' + family_group] += 1
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
#@profile
|
|
110
|
+
def calc_multi_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
    """Bin a family where second-round sequences merged more than one
    first-round family; increments the 'combined_core_' counter for the band
    containing pep_num+second_num genomes.

    Fix: the original left `res` unbound (NameError) when no band matched;
    a non-matching total is now simply left uncounted.
    """
    total = pep_num + second_num
    res = None
    # Keep the LAST matching band index, as the original generator-loop did.
    for idx, (sec, fir) in enumerate(groups.values()):
        if sec <= total <= fir:
            res = idx
    if res is None:
        return  # no band matched - nothing to count
    family_group = list(groups)[res]
    cores['combined_core_' + family_group] += 1
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
#@profile
|
|
119
|
+
def calc_Second_only_core(groups, cores, second_num):
    """Bin a second-round cluster spanning second_num genomes into its band;
    increments the 'second_core_' counter.

    Fix: the original left `res` unbound (NameError) when no band matched;
    a non-matching count is now simply left uncounted.
    """
    res = None
    # Keep the LAST matching band index, as the original generator-loop did.
    for idx, (sec, fir) in enumerate(groups.values()):
        if sec <= second_num <= fir:
            res = idx
    if res is None:
        return  # no band matched - nothing to count
    family_group = list(groups)[res]
    cores['second_core_' + family_group] += 1
|
|
125
|
+
|
|
126
|
+
#@profile
|
|
127
|
+
def calc_only_Second_only_core(groups, cores, second_num): # only count the true storf onlies
    """Bin a cluster made up ONLY of second-round sequences (no first-round
    member at all); increments the 'only_second_core_' counter.

    Fix: the original left `res` unbound (NameError) when no band matched;
    a non-matching count is now simply left uncounted.
    """
    res = None
    # Keep the LAST matching band index, as the original generator-loop did.
    for idx, (sec, fir) in enumerate(groups.values()):
        if sec <= second_num <= fir:
            res = idx
    if res is None:
        return  # no band matched - nothing to count
    family_group = list(groups)[res]
    cores['only_second_core_' + family_group] += 1
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
#@profile
|
|
139
|
+
#@profile
def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered):
    """Summarise each first-round cluster after the second clustering round.

    For every first-round cluster, looks up its representative in the merged
    second-round mapping and counts: additional first-round reps merged in,
    second-round (sequence_tag) members, and genomes gained only via the
    second round. Produces pangenome_clusters_Type[cluster] =
    [n_merged_reps, summed_sizes, size_list, added_second_genomes,
    n_second_seqs, n_unique_second_genomes]. Clusters whose rep is absent
    from the merged mapping fall to the KeyError branch as singletons.
    """
    num_clustered_First = defaultdict(list)
    pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
    # reps preserves insertion order, so position int(cluster) is that
    # cluster's representative - assumes cluster ids are 0..N-1 in order.
    list_of_reps = list(reps.keys())
    for cluster, pep_genomes in pangenome_clusters_First.items():
        rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
        Com_PEP_Genomes = 0
        Seconds = 0
        seen_Seconds = []
        added_Second_genomes = 0
        try: # get the cluster from the storf clusters which contains this rep
            clustered_combined = combined_pangenome_clusters_First_Second_clustered[rep] # Not true clusters - I put a PEP as key myself
            seen_clust_Genomes = []
            # Seed with this cluster's own rep, tagged with its genome count.
            num_clustered_First[cluster].append(rep + '_' + str(len(pep_genomes)))
            for clust in clustered_combined:
                if options.sequence_tag not in clust: # Not good enough at the moment
                    # Another first-round rep merged into this cluster.
                    clust_Genome = clust.split('|')[0]
                    if clust_Genome not in seen_clust_Genomes:
                        seen_clust_Genomes.append(clust_Genome)
                        if clust_Genome not in pep_genomes:
                            Com_PEP_Genomes += 1
                    num_clustered_First[cluster].append(clust + '_' + str(reps[clust][1]))
                elif options.sequence_tag in clust:
                    # Second-round (e.g. StORF) member.
                    Seconds += 1
                    clust_Genome = clust.split('|')[0]
                    if clust_Genome not in seen_Seconds:
                        seen_Seconds.append(clust_Genome)
                    if clust_Genome not in seen_clust_Genomes:
                        seen_clust_Genomes.append(clust_Genome)
                        if clust_Genome not in pep_genomes:
                            # Genome only reached through the second round.
                            added_Second_genomes += 1
                else:
                    sys.exit("Error: looking for sequence_tag")

            # Recover each merged rep's genome count from the '_<n>' suffix.
            size_of_pep_clusters = []
            peps = num_clustered_First[cluster]
            for pep in peps:
                pep = pep.rsplit('_', 1)
                size_of_pep_clusters.append(int(pep[1]))
            pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_pep_clusters),
                                                size_of_pep_clusters, added_Second_genomes, Seconds, len(seen_Seconds)]

        except KeyError:
            ###Singleton
            num_pep_genomes = [len(pep_genomes)]
            pangenome_clusters_Type[cluster] = [1, len(pep_genomes), num_pep_genomes, added_Second_genomes, Seconds,
                                                len(seen_Seconds)]

    return pangenome_clusters_Type
|
|
188
|
+
|
|
189
|
+
#@profile
|
|
190
|
+
#@profile
def single_clustering_counting(options, pangenome_clusters_First, reps):
    """Summarise clusters when only ONE clustering round was supplied.

    Produces pangenome_clusters_Type[cluster] = [1, n_genomes, [n_genomes],
    0, 0, 0] - the trailing zeros keep the list shape compatible with the
    combined (two-round) counting path.
    """
    num_clustered_PEP = defaultdict(list)
    recorded_PEP = []  # collected but never read back - kept for parity with original
    pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
    # reps preserves insertion order, so position int(cluster) is that
    # cluster's representative - assumes cluster ids are 0..N-1 in order.
    list_of_reps = list(reps.keys())
    for cluster, pep_genomes in pangenome_clusters_First.items():
        rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster

        try: # get the cluster from the storf clusters which contains this rep
            # NOTE(review): num_clustered_PEP is a defaultdict, so this body
            # appears unable to raise KeyError - the except branch looks
            # unreachable; confirm before removing.
            num_clustered_PEP[cluster].append(rep + '_' + str(len(pep_genomes)))
            size_of_pep_clusters = []
            peps = num_clustered_PEP[cluster]
            for pep in peps:
                pep = pep.rsplit('_', 1)
                size_of_pep_clusters.append(int(pep[1]))
                recorded_PEP.append(pep[0])
            pangenome_clusters_Type[cluster] = [len(num_clustered_PEP[cluster]), sum(size_of_pep_clusters),
                                                size_of_pep_clusters, 0, 0, 0]

        except KeyError:
            ###Singleton
            num_pep_genomes = [len(pep_genomes)]
            pangenome_clusters_Type[cluster] = [1, len(pep_genomes), num_pep_genomes, 0, 0, 0]

    return pangenome_clusters_Type
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
#@profile
|
|
219
|
+
#@profile
def combined_clustering_CDHIT(options, genome_dict):
    """Parse the SECOND-round CD-HIT .clstr file (options.reclustered).

    Separates each combined cluster's members into first-round sequences and
    second-round ones (identified by options.sequence_tag in the line), and
    when a cluster contains both, records it keyed by its first first-round
    sequence. genome_dict per-genome counts are incremented in place.

    Returns (first_second_clustered, not_second_only_cluster_ids,
    second_genomes_per_cluster, unique_genomes).
    NOTE(review): the last cluster in the file is never flushed into
    first_second_clustered (flushing only happens on seeing the next '>'
    header), and unique_genomes is always returned empty - confirm intent.
    """
    unique_genomes = []
    Second_in = open(options.reclustered, 'r')
    combined_pangenome_clusters_First = OrderedDict()
    combined_pangenome_clusters_First_sequences = OrderedDict()
    combined_pangenome_clusters_Second = OrderedDict()
    combined_pangenome_clusters_Second_sequences = OrderedDict()
    combined_pangenome_clusters_First_Second_clustered = OrderedDict()

    not_Second_only_cluster_ids = []
    already_seen_PEP = []
    Combined_clusters = OrderedDict()
    Combined_reps = OrderedDict()
    first = True
    for line in Second_in:
        if line.startswith('>'):
            # New '>Cluster N' header: flush the cluster parsed so far.
            if first == False:
                cluster_size = len(Combined_clusters[cluster_id])
                Combined_reps.update({rep: cluster_size})
                for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
                    if pep != []:
                        if pep in already_seen_PEP:
                            continue
                        else:
                            already_seen_PEP.append(pep)
                if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
                    if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
                        all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
                        storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
                        VALUE = all_but_first + storfs_clustered
                    else:
                        VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
                    KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
                    combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
            # '>Cluster N' -> cluster id 'N'.
            cluster_id = line.strip('>')
            cluster_id = cluster_id.strip('\n')
            cluster_id = cluster_id.split(' ')[1]
            Combined_clusters.update({cluster_id: []})
            combined_pangenome_clusters_First.update({cluster_id: []})
            combined_pangenome_clusters_First_sequences.update({cluster_id: []})
            combined_pangenome_clusters_Second.update({cluster_id: []})
            combined_pangenome_clusters_Second_sequences.update({cluster_id: []})

            first = False
        else:
            # Member line: '<n>\t<len>aa, ><seq_id>... <*|at %>'.
            clustered = line.split('\t')[1]
            clustered = clustered.split('>')[1]
            clustered = clustered.split('...')[0]
            genome = clustered.split('|')[0]
            genome_dict[genome] += 1
            if '*' in line:
                # '*' marks the cluster representative.
                rep = clustered
                Combined_reps.update({rep: 0})
            if first == False:
                Combined_clusters[cluster_id].append(clustered)
                clustered_genome = clustered.split('|')[0]
                if options.sequence_tag in line:
                    if clustered_genome not in combined_pangenome_clusters_Second[cluster_id]:
                        combined_pangenome_clusters_Second[cluster_id].append(clustered_genome)
                    combined_pangenome_clusters_Second_sequences[cluster_id].append(clustered)
                else:
                    if cluster_id not in not_Second_only_cluster_ids:
                        not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
                    if clustered_genome not in combined_pangenome_clusters_First[cluster_id]:
                        combined_pangenome_clusters_First[cluster_id].append(clustered_genome)
                    combined_pangenome_clusters_First_sequences[cluster_id].append(clustered)

    return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
|
|
288
|
+
|
|
289
|
+
def combined_clustering_Edge_List(options, genome_dict):
    """Parse the SECOND-round edge list (options.reclustered, 'rep<sep>child'
    per line, sorted by rep) - edge-list twin of combined_clustering_CDHIT.

    A new cluster starts whenever the rep column changes. Members carrying
    options.sequence_tag are second-round sequences; the rest are first-round.
    Returns (first_second_clustered, not_second_only_cluster_ids,
    second_genomes_per_cluster, unique_genomes).
    NOTE(review): as in the CD-HIT variant, the final cluster is never
    flushed into first_second_clustered, genome_dict is not updated here,
    and unique_genomes is always returned empty - confirm intent.
    """
    if options.format == 'TSV':
        separator = '\t'
    elif options.format == 'CSV':
        separator = ','
    unique_genomes = []
    cluster_id = 0
    last_rep = ''
    Second_in = open(options.reclustered, 'r')
    combined_pangenome_clusters_First = OrderedDict()
    combined_pangenome_clusters_First_sequences = OrderedDict()
    combined_pangenome_clusters_Second = OrderedDict()
    combined_pangenome_clusters_Second_sequences = OrderedDict()
    combined_pangenome_clusters_First_Second_clustered = OrderedDict()

    not_Second_only_cluster_ids = []
    already_seen_PEP = []
    Combined_clusters = OrderedDict()
    Combined_reps = OrderedDict()
    first = True
    for line in Second_in:
        rep, child = line.strip().split(separator)
        child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence

        if first == True:
            # Initialise bookkeeping for cluster 0 on the very first edge.
            Combined_clusters.update({cluster_id: []})
            combined_pangenome_clusters_First.update({cluster_id: []})
            combined_pangenome_clusters_First_sequences.update({cluster_id: []})
            combined_pangenome_clusters_Second.update({cluster_id: []})
            combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
            Combined_reps.update({rep: 0})
            first = False

        if first == False:
            if rep != last_rep and last_rep != '':
                # Rep changed: flush the finished cluster, then open a new one.
                cluster_size = len(Combined_clusters[cluster_id])
                Combined_reps.update({rep: cluster_size})
                for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
                    if pep != []:
                        if pep in already_seen_PEP:
                            continue
                        else:
                            already_seen_PEP.append(pep)
                if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
                    if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
                        all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
                        storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
                        VALUE = all_but_first + storfs_clustered
                    else:
                        VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
                    KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
                    combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})

                cluster_id += 1
                Combined_clusters.update({cluster_id: []})
                combined_pangenome_clusters_First.update({cluster_id: []})
                combined_pangenome_clusters_First_sequences.update({cluster_id: []})
                combined_pangenome_clusters_Second.update({cluster_id: []})
                combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
                Combined_reps.update({rep: 0})

            Combined_clusters[cluster_id].append(child)
            if options.sequence_tag in line:
                if child_genome not in combined_pangenome_clusters_Second[cluster_id]:
                    combined_pangenome_clusters_Second[cluster_id].append(child_genome)
                combined_pangenome_clusters_Second_sequences[cluster_id].append(child)
            else:
                if cluster_id not in not_Second_only_cluster_ids:
                    not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
                if child_genome not in combined_pangenome_clusters_First[cluster_id]:
                    combined_pangenome_clusters_First[cluster_id].append(child_genome)
                combined_pangenome_clusters_First_sequences[cluster_id].append(child)

        last_rep = rep

    return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def cluster_EdgeList(options):
    """Parse the FIRST-round clustering from an edge list (options.clusters).

    Each line is 'rep<sep>child' (tab for TSV, comma for CSV) and lines are
    assumed grouped by rep; a rep change starts a new integer cluster id.
    Returns (genome_dict: genome -> sequence count,
             pangenome_clusters_First: cluster id -> unique genome list,
             pangenome_clusters_First_sequences: cluster id -> sequence list,
             reps: rep sequence -> [n_sequences, n_genomes]).
    """
    if options.format == 'TSV':
        separator = '\t'
    elif options.format == 'CSV':
        separator = ','
    cluster_id = 0
    last_rep = ''
    first = True
    First_in = open(options.clusters, 'r')
    pangenome_clusters_First = OrderedDict()
    pangenome_clusters_First_sequences = OrderedDict()
    genome_dict = defaultdict(int)
    reps = OrderedDict()
    for line in First_in:
        rep, child = line.strip().split(separator)
        child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
        # Counting occurrences of genomes
        genome_dict[child_genome] += 1
        if first == True:
            pangenome_clusters_First[0] = []
            pangenome_clusters_First_sequences[0] = []
            first = False

        if rep != last_rep and last_rep != '':
            # Rep changed: record the finished cluster's sizes under its rep,
            # then open a new cluster.
            cluster_id +=1
            pangenome_clusters_First[cluster_id] = []
            pangenome_clusters_First_sequences[cluster_id] = []
            cluster_size = len(pangenome_clusters_First_sequences[cluster_id-1])
            reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[cluster_id-1])]})
            # NOTE(review): the two lines below re-initialise the same two
            # entries created just above - harmless but redundant.
            pangenome_clusters_First[cluster_id] = []
            pangenome_clusters_First_sequences[cluster_id] = []
        if child_genome not in pangenome_clusters_First[cluster_id]:
            pangenome_clusters_First[cluster_id].append(child_genome)

        pangenome_clusters_First_sequences[cluster_id].append(child)
        last_rep = rep
    # Flush the final cluster (it has no following rep change to trigger it).
    cluster_size = len(pangenome_clusters_First_sequences[cluster_id])
    reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})

    return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def cluster_CDHIT(options):
    """Parse the FIRST-round CD-HIT .clstr file (options.clusters).

    '>Cluster N' headers delimit clusters; member lines look like
    '<i>\\t<len>aa, ><seq_id>... <*|at %>' where '*' marks the representative.
    Returns (genome_dict: genome -> sequence count,
             pangenome_clusters_First: cluster id -> unique genome list,
             pangenome_clusters_First_sequences: cluster id -> sequence list,
             reps: rep sequence -> [n_sequences, n_genomes]).

    Fixes over the original: the file handle is closed (with-block) and the
    LAST cluster's rep sizes are flushed after the loop (previously the final
    rep was left at the [0, 0] placeholder because sizes were only recorded
    when the next '>' header was seen).
    """
    clusters = OrderedDict()
    pangenome_clusters_First = OrderedDict()
    pangenome_clusters_First_sequences = OrderedDict()
    first = True
    genome_dict = defaultdict(int)
    reps = OrderedDict()
    ## Load in all data for easier reuse later
    with open(options.clusters, 'r') as First_in:
        for line in First_in:
            if line.startswith('>'):
                if first == False:
                    # Flush the previous cluster's sizes under its rep.
                    cluster_size = len(clusters[cluster_id])
                    reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
                # '>Cluster N' -> cluster id 'N'.
                cluster_id = line.strip('>')
                cluster_id = cluster_id.strip('\n')
                cluster_id = cluster_id.split(' ')[1]
                clusters.update({cluster_id: []})
                pangenome_clusters_First.update({cluster_id: []})
                pangenome_clusters_First_sequences.update({cluster_id: []})

                first = False
            else:
                clustered = line.split('\t')[1]
                clustered = clustered.split('>')[1]
                clustered = clustered.split('...')[0]
                genome = clustered.split('|')[0]
                genome_dict[genome] += 1
                if '*' in line:
                    # '*' marks the cluster representative; sizes filled later.
                    rep = clustered
                    reps.update({rep: [0, 0]})
                if first == False:
                    clusters[cluster_id].append(clustered)
                    clustered_genome = clustered.split('|')[0]
                    if clustered_genome not in pangenome_clusters_First[cluster_id]:
                        pangenome_clusters_First[cluster_id].append(clustered_genome)
                    pangenome_clusters_First_sequences[cluster_id].append(clustered)
    if first == False:
        # BUGFIX: flush the final cluster - there is no trailing header to do it.
        cluster_size = len(clusters[cluster_id])
        reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
    return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
|
|
450
|
+
|
|
451
|
+
#@profile
|
|
452
|
+
#@profile
def cluster(options):
    """Top-level driver: parse the clustering output(s), bin gene families
    into core groups, print the counts, and optionally write the
    gene_presence_absence.csv.

    options must provide: clusters, format, reclustered, sequence_tag,
    core_groups and gene_presence_absence_out.
    """
    # Parse the first (primary) clustering round.
    if options.format == 'CD-HIT':
        genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options)
    elif options.format in ['TSV', 'CSV']:
        genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options)

    ######################################
    cores, groups = get_cores(options, genome_dict)
    ###

    if options.reclustered != None:
        if options.format == 'CD-HIT':
            combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids, combined_pangenome_clusters_Second, \
                unique_genomes = combined_clustering_CDHIT(options, genome_dict)
        # BUGFIX: previously only 'TSV' reached the edge-list path, so a
        # 'CSV' run with -rc crashed below with a NameError.
        elif options.format in ['TSV', 'CSV']:
            combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids, combined_pangenome_clusters_Second, \
                unique_genomes = combined_clustering_Edge_List(options, genome_dict)
        pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered)
    else:
        pangenome_clusters_Type = single_clustering_counting(options, pangenome_clusters_First, reps)

    counter = 0
    Number_Of_StORF_Extending_But_Same_Genomes = 0

    # Order all views of the clusters consistently, largest families first.
    sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
    pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_first_keys)
    pangenome_clusters_First_sequences_sorted = reorder_dict_by_keys(pangenome_clusters_First_sequences, sorted_first_keys)
    pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)

    print("Calculating Groups")
    # Loop variable renamed from 'cluster' - it shadowed this function's name.
    for cluster_key, numbers in pangenome_clusters_Type_sorted.items():
        ############################### Calculate First only
        if numbers[0] == 1 and numbers[1] >= 2:
            calc_First_only_core(numbers[1], groups, cores)
            counter += 1
        elif numbers[0] > 1 and numbers[1] >= 2:
            calc_First_only_core(numbers[2][0], groups, cores)
            counter += 1

        if options.reclustered != None:
            ############################# Calculate First and Reclustered-Second
            if numbers[0] == 1 and numbers[3] >= 1:  # If Seconds did not combine First reps
                calc_single_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
            elif numbers[0] > 1 and numbers[3] >= 1:  # If unique Seconds combined multiple Firsts
                calc_multi_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
            elif numbers[4] >= 1:
                Number_Of_StORF_Extending_But_Same_Genomes += 1

    if options.reclustered != None:
        # Classify second-round clusters once, after the main loop: those that
        # share a cluster with a first-round sequence versus second-round-only.
        combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
        combined_pangenome_clusters_Second_Type = defaultdict(list)
        for second_cluster, genomes in combined_pangenome_clusters_Second.items():
            if second_cluster in not_Second_only_cluster_ids:
                combined_pangenome_clusters_Second_Type[second_cluster] = [second_cluster, len(genomes)]
            else:
                combined_pangenome_clusters_ONLY_Second_Type[second_cluster] = [second_cluster, len(genomes)]
        for second_cluster, data in combined_pangenome_clusters_Second_Type.items():
            calc_Second_only_core(groups, cores, data[1])
        for second_cluster, data in combined_pangenome_clusters_ONLY_Second_Type.items():
            if data[1] >= 2:
                calc_only_Second_only_core(groups, cores, data[1])
    ###########################
    print("End")
    key_order = ['first_core_', 'extended_core_', 'combined_core_', 'second_core_', 'only_second_core_']
    print("Gene Family Groups:")
    for key_prefix in key_order:
        for key, value in cores.items():
            if key.startswith(key_prefix):
                print(f"{key}: {value}")

    if options.gene_presence_absence_out != None:
        gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
def main():
    """Parse command-line arguments, validate them, and run the analysis."""
    parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': PyamilySeq Run Parameters.')
    parser._action_groups.pop()  # hide argparse's default groups so only ours print

    required = parser.add_argument_group('Required Arguments')
    # NOTE: -c/-f are declared without required=True so that '-v' can be used
    # on its own; their presence is enforced manually below. (Previously
    # required=True made the '-v'-only branch unreachable dead code.)
    required.add_argument('-c', action='store', dest='clusters',
                          help='Clustering output file from CD-HIT, TSV or CSV Edge List')
    required.add_argument('-f', action='store', dest='format', choices=['CD-HIT', 'CSV', 'TSV'],
                          help='Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))')

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-rc', action='store', dest='reclustered',
                          help='Clustering output file from secondary round of clustering',
                          required=False)
    optional.add_argument('-st', action='store', dest='sequence_tag',
                          help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
                          required=False)
    # BUGFIX: help text previously claimed a default of '99,95,90,80,15'
    # while the actual default is '99,80,15'.
    optional.add_argument('-groups', action="store", dest='core_groups', default="99,80,15",
                          help='Default - (\'99,80,15\'): Gene family groups to use')
    optional.add_argument('-gpa', action='store', dest='gene_presence_absence_out',
                          help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
                          required=False)

    misc = parser.add_argument_group('Misc')
    # SECURITY NOTE: type=eval evaluates the raw command-line string; kept for
    # compatibility but choices=[True, False] limits accepted results.
    misc.add_argument('-verbose', action='store', dest='verbose', default=False, type=eval, choices=[True, False],
                      help='Default - False: Print out runtime messages')
    misc.add_argument('-v', action='store_true', dest='version',
                      help='Default - False: Print out version number and exit')

    options = parser.parse_args()
    if options.clusters == None or options.format == None:
        if options.version:
            sys.exit(PyamilySeq_Version)  # '-v' alone: report version and stop
        else:
            sys.exit('PyamilySeq: error: the following arguments are required: -c, -f')

    if options.sequence_tag == None:
        options.sequence_tag = 'StORF'

    # Resolve the provided paths to absolute, normalised forms.
    options.clusters = os.path.normpath(options.clusters)
    options.clusters = os.path.realpath(options.clusters)
    if options.reclustered:
        options.reclustered = os.path.normpath(options.reclustered)
        options.reclustered = os.path.realpath(options.reclustered)

    # A final '0' band is appended so every family falls into some group.
    options.core_groups = options.core_groups + ',0'

    cluster(options)

    print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
          "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
# Script entry point: run the full analysis, then report completion.
if __name__ == "__main__":
    main()
    print("Complete")
|
|
586
|
+
|
PyamilySeq/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
|
|
2
|
+
import argparse
|
|
3
|
+
import gzip
|
|
4
|
+
import glob
|
|
5
|
+
|
|
6
|
+
def combine_files(files, split, glob_location, combined_out):
    """Concatenate the FASTA files matched by `files` under `glob_location`
    into `combined_out`, prefixing every header with '<genome_name>|'.

    The genome name is the file's base name up to the first occurrence of
    `split`. Each file is first tried as gzip; on gzip.BadGzipFile it is
    re-read as plain text. '#' comment lines are dropped.

    NOTE(review): gzip.BadGzipFile requires Python >= 3.8 while the package
    metadata claims >= 3.6 - confirm the supported Python floor.
    NOTE(review): the two branches treat headers differently - the gzip
    branch truncates the header at the first space (dropping the
    description; a header WITHOUT a space keeps its newline, so the added
    '\\n' emits a blank line) while the plain-text branch keeps the full
    header. Confirm whether this asymmetry is intended.
    """
    count = 0  # files processed; counted but not reported

    for file in glob.glob(glob_location + '/' + files):
        count += 1
        try:
            # Attempt gzip first; bad magic raises on the first read below.
            with gzip.open(file, 'rb') as genome:

                for line in genome:
                    if line.startswith(b'#'):
                        continue
                    elif line.startswith(b'>'):
                        genome_name = bytes(file.split(split)[0].split('/')[-1], 'utf-8')
                        line = line.split(b' ')[0]
                        line = line.replace(b'>', b'>' + genome_name + b'|')
                        combined_out.write(line.decode('utf-8')+'\n')
                    else:
                        combined_out.write(line.decode('utf-8'))
        except gzip.BadGzipFile:
            # Not gzip-compressed: fall back to plain-text reading.
            with open(file, 'r') as genome:

                for line in genome:
                    if line.startswith('#'):
                        continue
                    elif line.startswith('>'):
                        genome_name = file.split(split)[0].split('/')[-1]
                        line = line.replace('>', '>' + genome_name + '|')
                        combined_out.write(line)
                    else:
                        combined_out.write(line)
|
|
36
|
+
|
|
37
|
+
def main():
    """CLI entry point: glob the input FASTA files and write the combined output."""
    arg_parser = argparse.ArgumentParser(description="Combine gzipped fasta files.")
    positional_args = (
        ("files", "File pattern to match within the specified directory."),
        ("split", "String used to split the file path and extract the genome name."),
        ("glob_location", "Directory location where the files are located."),
        ("combined_out", "Output file where the combined data will be written."),
    )
    for arg_name, arg_help in positional_args:
        arg_parser.add_argument(arg_name, help=arg_help)
    args = arg_parser.parse_args()

    with open(args.combined_out, 'w') as combined_out:
        combine_files(args.files, args.split, args.glob_location, combined_out)
|
|
47
|
+
|
|
48
|
+
# Script entry point.
if __name__ == "__main__":
    main()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: PyamilySeq
|
|
3
|
-
Version: 0.0.1
|
|
3
|
+
Version: 0.0.2
|
|
4
4
|
Summary: PyamilySeq - A tool to look for sequence-based gene families identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
|
|
5
5
|
Home-page: https://github.com/NickJD/PyamilySeq
|
|
6
6
|
Author: Nicholas Dimonaco
|
|
@@ -12,7 +12,6 @@ Classifier: Operating System :: OS Independent
|
|
|
12
12
|
Requires-Python: >=3.6
|
|
13
13
|
Description-Content-Type: text/markdown
|
|
14
14
|
License-File: LICENSE
|
|
15
|
-
Requires-Dist: numpy
|
|
16
15
|
|
|
17
16
|
# PyamilySeq
|
|
18
17
|
PyamilySeq (Family Seek) is a Python tool for clustering gene sequences into families based on sequence similarity identified by tools such as CD-HIT, DIAMOND or MMseqs2.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py,sha256=UzQ5iOKCNfurxmj1pnkowF11YfWBO5vnBCKxQK6goB8,26538
|
|
2
|
+
PyamilySeq/Constants.py,sha256=hrbTdmPUFEzLfGZOPoQPV0NsAG-VnfIX51291vqb1C8,30
|
|
3
|
+
PyamilySeq/PyamilySeq_Species.py,sha256=34NHcViENyAdvGRltNUbfWjEcNCYnsmbuhDdl8__mH0,28209
|
|
4
|
+
PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
PyamilySeq/combine_FASTA_with_genome_IDs.py,sha256=aMUVSk6jKnKX0g04RMM360QueZS83lRLqLLysBtQbLo,2009
|
|
6
|
+
PyamilySeq-0.0.2.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
7
|
+
PyamilySeq-0.0.2.dist-info/METADATA,sha256=v6hOL3kekqt8H5YhjpS6uQOF1QSFcBh4Zy-jNW3xDTk,2550
|
|
8
|
+
PyamilySeq-0.0.2.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
|
|
9
|
+
PyamilySeq-0.0.2.dist-info/entry_points.txt,sha256=zGtA2Ycf0LG3PR7zuuT0wjaAKLFxtyGgBc0O_W7E250,66
|
|
10
|
+
PyamilySeq-0.0.2.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
|
|
11
|
+
PyamilySeq-0.0.2.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
PyamilySeq
|
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
PyamilySeq-0.0.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
2
|
-
PyamilySeq-0.0.1.dist-info/METADATA,sha256=gYAN6guZiV3POfjJJTn20Usj3PJZ-UTsdV5gruMo86g,2571
|
|
3
|
-
PyamilySeq-0.0.1.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
|
|
4
|
-
PyamilySeq-0.0.1.dist-info/entry_points.txt,sha256=zGtA2Ycf0LG3PR7zuuT0wjaAKLFxtyGgBc0O_W7E250,66
|
|
5
|
-
PyamilySeq-0.0.1.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
6
|
-
PyamilySeq-0.0.1.dist-info/RECORD,,
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
|
|
File without changes
|
|
File without changes
|
|
File without changes
|