PyamilySeq 0.0.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py +600 -0
- PyamilySeq/Constants.py +1 -0
- PyamilySeq/PyamilySeq_Species.py +647 -0
- PyamilySeq/__init__.py +0 -0
- PyamilySeq/combine_FASTA_with_genome_IDs.py +49 -0
- {PyamilySeq-0.0.1.dist-info → PyamilySeq-0.2.0.dist-info}/METADATA +33 -3
- PyamilySeq-0.2.0.dist-info/RECORD +11 -0
- PyamilySeq-0.2.0.dist-info/top_level.txt +1 -0
- PyamilySeq-0.0.1.dist-info/RECORD +0 -6
- PyamilySeq-0.0.1.dist-info/top_level.txt +0 -1
- {PyamilySeq-0.0.1.dist-info → PyamilySeq-0.2.0.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.0.1.dist-info → PyamilySeq-0.2.0.dist-info}/WHEEL +0 -0
- {PyamilySeq-0.0.1.dist-info → PyamilySeq-0.2.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,600 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
import copy
|
|
3
|
+
import math
|
|
4
|
+
import sys
|
|
5
|
+
import numpy as np
|
|
6
|
+
from itertools import chain
|
|
7
|
+
|
|
8
|
+
def get_Genus(clustered):
    """Return the capitalised genus token of a clustered sequence ID.

    The genome field is the text before the first '|' in *clustered*.
    The genus is its first '_'-separated token; when the field starts
    with a stray leading underscore ("name error"), the token after that
    underscore is used instead.

    An empty genome field yields an empty string rather than raising
    IndexError.
    """
    genome_field = clustered.split('|')[0]
    tokens = genome_field.split('_')
    # A leading '_' makes tokens[0] empty, so the genus is the next token.
    # startswith() is also safe on an empty field, unlike indexing [0].
    genus = tokens[1] if genome_field.startswith('_') else tokens[0]
    return str(genus).capitalize()
|
|
15
|
+
|
|
16
|
+
def get_Species(clustered):
    """Return the capitalised 'genus_species' label of a clustered sequence ID.

    The genome field is the text before the first '|' in *clustered*.
    Its first two '_'-separated tokens are joined with '_' and passed
    through str.capitalize(), which lower-cases everything after the
    first character.  A stray leading underscore ("name error") is
    skipped before the two tokens are taken.

    The previous implementation took a single string in the
    leading-underscore case and handed it to '_'.join(), which joined the
    individual characters (e.g. "E_s_c_h_e_r...").  Both branches now
    operate on a list of tokens.
    """
    genome_field = clustered.split('|')[0]
    tokens = genome_field.split('_')
    if genome_field.startswith('_'):  # Remove name error
        tokens = tokens[1:]
    return '_'.join(tokens[:2]).capitalize()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
PEP_In = open('/home/nick/Documents/Single_Genome/All_Ensembl_PEP_CD_Clustered_90_60.clstr','r')
|
|
26
|
+
StORF_In = open('/home/nick/Documents/Single_Genome/All_Ensem_PEP_CD_Clustered_90_60_Unclustered_UR_StORFs_AA_CD.clstr','r') # Clusters for single Genera
|
|
27
|
+
|
|
28
|
+
clusters = collections.OrderedDict()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
pangenome_clusters_PEP_Genera = collections.OrderedDict()
|
|
32
|
+
pangenome_clusters_PEP_Species = collections.OrderedDict()
|
|
33
|
+
pangenome_clusters_PEP_Strains = collections.OrderedDict()
|
|
34
|
+
pangenome_clusters_PEP_SEQS = collections.OrderedDict()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
max_storf_only_genera = 0
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
count = 0
|
|
43
|
+
first = True
|
|
44
|
+
genome_dict = collections.defaultdict(int)
|
|
45
|
+
reps = collections.OrderedDict()
|
|
46
|
+
county = 0
|
|
47
|
+
#singleton_cluster = "Null"
|
|
48
|
+
clusters_With_Con_StORFs = []
|
|
49
|
+
## Load in all data for easier reuse later
|
|
50
|
+
for line in PEP_In:
|
|
51
|
+
if line.startswith('>'):
|
|
52
|
+
if first == False:
|
|
53
|
+
Ensem_Con = set(Ensem_genomes).intersection(Con_genomes)
|
|
54
|
+
cluster_size = len(clusters[cluster_id])
|
|
55
|
+
reps.update({rep: [cluster_size,len(pangenome_clusters_PEP_Genera[cluster_id])]}) # Add strains, species here if wanted
|
|
56
|
+
#if len(clusters[cluster_id]) == 1 and "Null" not in singleton_cluster: # Stop at clusters smaller than 10
|
|
57
|
+
# singleton_cluster = cluster_id
|
|
58
|
+
#if len(clusters[cluster_id]) < 10: # Stop at clusters smaller than 10
|
|
59
|
+
# pangenome_clusters_PEP_Species.popitem()
|
|
60
|
+
# pangenome_clusters_PEP_Genera.popitem()
|
|
61
|
+
# pangenome_clusters_PEP_SEQS.popitem()
|
|
62
|
+
# reps.popitem()
|
|
63
|
+
# if len(clusters[cluster_id]) == 1:
|
|
64
|
+
# break # REMEMBER
|
|
65
|
+
Ensem_genomes, Con_genomes = [], []
|
|
66
|
+
cluster_id = line.strip('>')
|
|
67
|
+
cluster_id = cluster_id.strip('\n')
|
|
68
|
+
cluster_id = cluster_id.split(' ')[1]
|
|
69
|
+
clusters.update({cluster_id: []})
|
|
70
|
+
pangenome_clusters_PEP_Genera.update({cluster_id: []})
|
|
71
|
+
pangenome_clusters_PEP_Species.update({cluster_id:[]})
|
|
72
|
+
pangenome_clusters_PEP_Strains.update({cluster_id: []})
|
|
73
|
+
# pangenome_clusters_PEP_SEQS.update({cluster_id:[]})
|
|
74
|
+
|
|
75
|
+
first = False
|
|
76
|
+
else:
|
|
77
|
+
clustered = line.split('\t')[1]
|
|
78
|
+
clustered = clustered.split('>')[1]
|
|
79
|
+
clustered = clustered.split('...')[0]
|
|
80
|
+
genome = clustered.split('|')[0]
|
|
81
|
+
genome_dict[genome] +=1
|
|
82
|
+
if '*' in line:
|
|
83
|
+
rep = clustered
|
|
84
|
+
reps.update({rep:[0,0]})
|
|
85
|
+
if first == False:
|
|
86
|
+
clusters[cluster_id].append(clustered)
|
|
87
|
+
clustered_genus = get_Genus(clustered)
|
|
88
|
+
clustered_species = get_Species(clustered)
|
|
89
|
+
clustered_strain = clustered.split('|')[0]
|
|
90
|
+
|
|
91
|
+
if clustered_genus not in pangenome_clusters_PEP_Genera[cluster_id]:
|
|
92
|
+
pangenome_clusters_PEP_Genera[cluster_id].append(clustered_genus)
|
|
93
|
+
#if clustered_species not in pangenome_clusters_PEP_Species[cluster_id]:
|
|
94
|
+
# pangenome_clusters_PEP_Species[cluster_id].append(clustered_species)
|
|
95
|
+
if genome not in pangenome_clusters_PEP_Strains[cluster_id]:
|
|
96
|
+
pangenome_clusters_PEP_Strains[cluster_id].append(genome)
|
|
97
|
+
# pangenome_clusters_PEP_SEQS[cluster_id].append(clustered)
|
|
98
|
+
print("PEP DONE")
|
|
99
|
+
######################################
|
|
100
|
+
Combined_pangenome_clusters_PEP_Genera = collections.OrderedDict()
|
|
101
|
+
Combined_pangenome_clusters_PEP_Species = collections.OrderedDict()
|
|
102
|
+
Combined_pangenome_clusters_PEP_Strains = collections.OrderedDict()
|
|
103
|
+
Combined_pangenome_clusters_PEP_SEQS = collections.OrderedDict()
|
|
104
|
+
|
|
105
|
+
Combined_pangenome_clusters_StORF_Genera = collections.OrderedDict()
|
|
106
|
+
Combined_pangenome_clusters_StORF_Species = collections.OrderedDict()
|
|
107
|
+
Combined_pangenome_clusters_StORF_Strains = collections.OrderedDict()
|
|
108
|
+
Combined_pangenome_clusters_StORF_SEQS = collections.OrderedDict()
|
|
109
|
+
|
|
110
|
+
Combined_pangenome_clusters_PEP_StORF_Clustered_Genera = collections.OrderedDict()
|
|
111
|
+
Combined_pangenome_clusters_PEP_StORF_Clustered = collections.OrderedDict()
|
|
112
|
+
|
|
113
|
+
not_StORF_Only_Cluster_IDs = []
|
|
114
|
+
|
|
115
|
+
Combined_clusters = collections.OrderedDict()
|
|
116
|
+
Combined_reps = collections.OrderedDict()
|
|
117
|
+
first = True
|
|
118
|
+
###############
|
|
119
|
+
## We load in the combined PEP and StORF_Reporter data separately
|
|
120
|
+
for line in StORF_In:
|
|
121
|
+
if line.startswith('>'):
|
|
122
|
+
if first == False:
|
|
123
|
+
cluster_size = len(Combined_clusters[cluster_id])
|
|
124
|
+
Combined_reps.update({rep: cluster_size})
|
|
125
|
+
# if len(Combined_pangenome_clusters_PEP_SEQS[cluster_id]) > 1:
|
|
126
|
+
# print("Here")
|
|
127
|
+
if len(Combined_pangenome_clusters_StORF_SEQS[cluster_id]) > 0 and len(Combined_pangenome_clusters_PEP_SEQS[cluster_id]) > 0:
|
|
128
|
+
if len(Combined_pangenome_clusters_PEP_SEQS[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
|
|
129
|
+
all_but_first = Combined_pangenome_clusters_PEP_SEQS[cluster_id][1:]
|
|
130
|
+
storfs_clustered = Combined_pangenome_clusters_StORF_SEQS[cluster_id]
|
|
131
|
+
VALUE = all_but_first+storfs_clustered
|
|
132
|
+
else:
|
|
133
|
+
VALUE = Combined_pangenome_clusters_StORF_SEQS[cluster_id]
|
|
134
|
+
KEY = Combined_pangenome_clusters_PEP_SEQS[cluster_id][0]
|
|
135
|
+
Combined_pangenome_clusters_PEP_StORF_Clustered.update({KEY:VALUE})
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
## Below needs to be rewritten. With >1 genus - be able to record multiple PEPs for each combined...
|
|
139
|
+
# if len(Combined_pangenome_clusters_StORF_Genera[cluster_id]) > 0 and len(Combined_pangenome_clusters_PEP_Genera[cluster_id]) > 0:
|
|
140
|
+
# KEY = Combined_pangenome_clusters_PEP_SEQS[cluster_id][0]
|
|
141
|
+
# VALUE = Combined_pangenome_clusters_StORF_SEQS[cluster_id]
|
|
142
|
+
# Combined_pangenome_clusters_PEP_StORF_Clustered_Genera.update({KEY:VALUE})
|
|
143
|
+
if len(Combined_clusters[cluster_id]) == 1: # Stop at clusters smaller than 10
|
|
144
|
+
print("First Singleton Cluster is: " +str(cluster_id))
|
|
145
|
+
break
|
|
146
|
+
cluster_id = line.strip('>')
|
|
147
|
+
cluster_id = cluster_id.strip('\n')
|
|
148
|
+
cluster_id = cluster_id.split(' ')[1]
|
|
149
|
+
Combined_clusters.update({cluster_id: []})
|
|
150
|
+
# Combined_pangenome_clusters_PEP_Genera.update({cluster_id:[]})
|
|
151
|
+
# Combined_pangenome_clusters_PEP_Species.update({cluster_id: []})
|
|
152
|
+
Combined_pangenome_clusters_PEP_Strains.update({cluster_id: []})
|
|
153
|
+
Combined_pangenome_clusters_PEP_SEQS.update({cluster_id: []})
|
|
154
|
+
#
|
|
155
|
+
Combined_pangenome_clusters_StORF_Genera.update({cluster_id: []})
|
|
156
|
+
# Combined_pangenome_clusters_StORF_Species.update({cluster_id: []})
|
|
157
|
+
Combined_pangenome_clusters_StORF_Strains.update({cluster_id: []})
|
|
158
|
+
Combined_pangenome_clusters_StORF_SEQS.update({cluster_id: []})
|
|
159
|
+
first = False
|
|
160
|
+
else:
|
|
161
|
+
clustered = line.split('\t')[1]
|
|
162
|
+
clustered = clustered.split('>')[1]
|
|
163
|
+
clustered = clustered.split('...')[0]
|
|
164
|
+
if '*' in line:
|
|
165
|
+
rep = clustered
|
|
166
|
+
Combined_reps.update({rep:0})
|
|
167
|
+
if first == False:
|
|
168
|
+
Combined_clusters[cluster_id].append(clustered)
|
|
169
|
+
clustered_genus = get_Genus(clustered)
|
|
170
|
+
clustered_species = get_Species(clustered)
|
|
171
|
+
clustered_strain = clustered.split('|')[0]
|
|
172
|
+
if '_' in clustered_strain[0]: # Remove name error
|
|
173
|
+
clustered_strain = clustered_strain.split('_')[1]
|
|
174
|
+
|
|
175
|
+
if "StORF_Type" in line:
|
|
176
|
+
# if cluster_id not in clusters_With_Con_StORFs: # For counting?
|
|
177
|
+
# clusters_With_Con_StORFs.append(cluster_id)
|
|
178
|
+
if clustered_genus not in Combined_pangenome_clusters_StORF_Genera[cluster_id]:
|
|
179
|
+
Combined_pangenome_clusters_StORF_Genera[cluster_id].append(clustered_genus)
|
|
180
|
+
# if clustered_species not in Combined_pangenome_clusters_StORF_Species[cluster_id]:
|
|
181
|
+
# Combined_pangenome_clusters_StORF_Species[cluster_id].append(clustered_species)
|
|
182
|
+
if clustered_strain not in Combined_pangenome_clusters_StORF_Strains[cluster_id]:
|
|
183
|
+
Combined_pangenome_clusters_StORF_Strains[cluster_id].append(clustered_strain)
|
|
184
|
+
Combined_pangenome_clusters_StORF_SEQS[cluster_id].append(clustered)
|
|
185
|
+
#
|
|
186
|
+
else:
|
|
187
|
+
# if clustered_genus not in Combined_pangenome_clusters_PEP_Genera[cluster_id]:
|
|
188
|
+
# Combined_pangenome_clusters_PEP_Genera[cluster_id].append(clustered_genus)
|
|
189
|
+
# if clustered_species not in Combined_pangenome_clusters_PEP_Species[cluster_id]:
|
|
190
|
+
# Combined_pangenome_clusters_PEP_Species[cluster_id].append(clustered_species)
|
|
191
|
+
if clustered_strain not in Combined_pangenome_clusters_PEP_Strains[cluster_id]:
|
|
192
|
+
Combined_pangenome_clusters_PEP_Strains[cluster_id].append(clustered_strain)
|
|
193
|
+
if cluster_id not in not_StORF_Only_Cluster_IDs:
|
|
194
|
+
not_StORF_Only_Cluster_IDs.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
|
|
195
|
+
Combined_pangenome_clusters_PEP_SEQS[cluster_id].append(clustered)
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
###HERE for tomorrow - copy the updated work from single to here and repeat for genus,species and strain
|
|
200
|
+
list_of_reps = list(reps.keys())
|
|
201
|
+
num_clustered_PEP_Genera = collections.defaultdict(list)
|
|
202
|
+
recorded_PEP = []
|
|
203
|
+
################################# Genera
|
|
204
|
+
pangenome_clusters_Type_Genera = copy.deepcopy(pangenome_clusters_PEP_Genera)
|
|
205
|
+
pangenome_clusters_Type_Strains = collections.defaultdict(list)
|
|
206
|
+
|
|
207
|
+
for cluster, pep_genomes in pangenome_clusters_PEP_Genera.items():
|
|
208
|
+
recorded_PEP.append(cluster)
|
|
209
|
+
rep = list_of_reps[int(cluster)]
|
|
210
|
+
Com_PEPs = 0
|
|
211
|
+
Com_PEP_Genomes = 0
|
|
212
|
+
StORFs = 0
|
|
213
|
+
Added_StORF_Genera = 0
|
|
214
|
+
seen_clust_Strains = []
|
|
215
|
+
|
|
216
|
+
PEP_Strains = pangenome_clusters_PEP_Strains[cluster]
|
|
217
|
+
for clustered_strain in PEP_Strains:
|
|
218
|
+
if '_' in clustered_strain[0]: # Remove name error
|
|
219
|
+
clustered_strain = clustered_strain[1:]
|
|
220
|
+
if clustered_strain not in seen_clust_Strains:
|
|
221
|
+
seen_clust_Strains.append(clustered_strain)
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
try:
|
|
225
|
+
clustered_combined = Combined_pangenome_clusters_PEP_StORF_Clustered[rep]
|
|
226
|
+
seen_clust_Genera = []
|
|
227
|
+
num_clustered_PEP_Genera[cluster].append(rep + '_' + str(len(pep_genomes)))
|
|
228
|
+
for clust in clustered_combined:
|
|
229
|
+
if 'StORF_Type' not in clust:
|
|
230
|
+
### Need to get the number of pep genomes for each pep clustered into this
|
|
231
|
+
Com_PEPs += 1
|
|
232
|
+
clustered_genus = get_Genus(clust)
|
|
233
|
+
#clust_Genome = clust.split('|')[0]
|
|
234
|
+
if clustered_genus not in seen_clust_Genera:
|
|
235
|
+
seen_clust_Genera.append(clustered_genus)
|
|
236
|
+
if clustered_genus not in pep_genomes:
|
|
237
|
+
Com_PEP_Genomes += 1
|
|
238
|
+
try:
|
|
239
|
+
num_clustered_PEP_Genera[cluster].append(clust + '_' + str(reps[clust][1]))
|
|
240
|
+
except TypeError:
|
|
241
|
+
sys.exit("Broken")
|
|
242
|
+
|
|
243
|
+
elif 'StORF_Type' in clust:
|
|
244
|
+
StORFs += 1
|
|
245
|
+
clustered_genus = get_Genus(clust)
|
|
246
|
+
#clust_Genome = clust.split('|')[0]
|
|
247
|
+
if clustered_genus not in seen_clust_Genera:
|
|
248
|
+
seen_clust_Genera.append(clustered_genus)
|
|
249
|
+
if clustered_genus not in pep_genomes:
|
|
250
|
+
Added_StORF_Genera += 1
|
|
251
|
+
else:
|
|
252
|
+
print("WHAT")
|
|
253
|
+
|
|
254
|
+
size_of_pep_clusters = []
|
|
255
|
+
peps = num_clustered_PEP_Genera[cluster]
|
|
256
|
+
for pep in peps:
|
|
257
|
+
pep = pep.rsplit('_', 1)
|
|
258
|
+
size_of_pep_clusters.append(int(pep[1]))
|
|
259
|
+
pangenome_clusters_Type_Genera[cluster] = [len(num_clustered_PEP_Genera[cluster]), sum(size_of_pep_clusters),
|
|
260
|
+
size_of_pep_clusters, Added_StORF_Genera, StORFs]
|
|
261
|
+
pangenome_clusters_Type_Strains[cluster] = seen_clust_Strains
|
|
262
|
+
except KeyError:
|
|
263
|
+
###Singleton
|
|
264
|
+
num_pep_genomes = [len(pep_genomes)]
|
|
265
|
+
pangenome_clusters_Type_Genera[cluster] = [1, len(pep_genomes), num_pep_genomes, Added_StORF_Genera, StORFs]
|
|
266
|
+
pangenome_clusters_Type_Strains[cluster] = seen_clust_Strains
|
|
267
|
+
|
|
268
|
+
print("S")
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
#######################################
|
|
272
|
+
|
|
273
|
+
# Write two comma-separated lists of cluster IDs:
#   * clusters with no StORF sequences (value[4] == 0) that contain at least
#     two PEP strains, and
#   * clusters with at least one StORF sequence (value[4] >= 1).
# The files are managed by a `with` block so they are closed even if the loop
# raises; both names stay bound (to closed files) afterwards, as before.
with open('./Ensem_Clusters_Without_StORFs_To_Be_Nogged_min2', 'w') as Without_StORF, \
        open('./Ensem_Clusters_With_StORFs_To_Be_Nogged', 'w') as With_StORF:
    for key, value in pangenome_clusters_Type_Genera.items():
        pep_strains = pangenome_clusters_Type_Strains[key]
        storf_count = value[4]  # fifth entry of the per-cluster record is the StORF count
        if storf_count == 0 and len(pep_strains) >= 2:
            Without_StORF.write(str(key) + ',')
        elif storf_count >= 1:
            With_StORF.write(str(key) + ',')
|
|
289
|
+
#With_Extending_StORF.close()
|
|
290
|
+
|
|
291
|
+
############## Typing for the StORF_Reporter-Data
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
multi_PEP_Combined_By_StORFs = collections.OrderedDict()
|
|
295
|
+
|
|
296
|
+
StORF_Seqs_Extended = []
|
|
297
|
+
StORF_Genomes_Extended = []
|
|
298
|
+
|
|
299
|
+
####################################
|
|
300
|
+
#cores = collections.OrderedDict({'pep_genera_single':[],'pep_genera_multi':[],'extended_genera':[],'comb_extended_genera_single':[],'comb_extended_genera_multi':[],'extended_genera_single':[],'extended_genera_multi':0,'storf_genera_single':0,'storf_genera_multi':0,
|
|
301
|
+
# 'only_storf_genera_single':0,'only_storf_genera_multi':0})
|
|
302
|
+
|
|
303
|
+
cores = collections.OrderedDict({'pep_genera':[],'extended_genera_single_pep':[],'many_extended_genera_pep':[],'extended_genera':[],'comb_extended_genera':[],'storf_genera':[],'only_storf_genera':[],'only_storf_genera_recording':[]})
|
|
304
|
+
|
|
305
|
+
extended = collections.OrderedDict()
|
|
306
|
+
############################
|
|
307
|
+
|
|
308
|
+
clsuters_to_be_validated = collections.defaultdict(list)
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
############################
|
|
312
|
+
def calc_pep_only(pep_num):
    """Append *pep_num* to the module-level ``cores['pep_genera']`` list."""
    pep_counts = cores['pep_genera']
    pep_counts.append(pep_num)
|
|
314
|
+
# if pep_num == 1:# and StORF_num == 0:
|
|
315
|
+
# cores['pep_genera_single'] += 1
|
|
316
|
+
# elif pep_num > 1:# and StORF_num == 0:
|
|
317
|
+
# cores['pep_genera_multi'] += 1
|
|
318
|
+
##########################
|
|
319
|
+
def calc_pep_extended_StORF(cluster,pep_num,storf_num):
    """Record a PEP cluster that has StORF counts added to it.

    Nothing is recorded unless *pep_num* is non-zero and *storf_num* is at
    least 1.  Otherwise the combined count ``pep_num + storf_num`` is
    appended to ``cores['extended_genera']`` and *cluster* to
    ``clsuters_to_be_validated['extended_genera']``.  In addition,
    ``[cluster, combined]`` is appended to
    ``cores['many_extended_genera_pep']`` when *storf_num* >= 10 and to
    ``cores['extended_genera_single_pep']`` when *pep_num* == 1.
    """
    # Every recording condition requires pep_num != 0 and storf_num >= 1,
    # so anything else can leave immediately.
    if pep_num == 0 or storf_num < 1:
        return

    combined = pep_num + storf_num
    cores['extended_genera'].append(combined)
    clsuters_to_be_validated['extended_genera'].append(cluster)

    if storf_num >= 10:
        cores['many_extended_genera_pep'].append([cluster, combined])

    if pep_num == 1:
        cores['extended_genera_single_pep'].append([cluster, combined])
|
|
328
|
+
# cores['extended_genera_single'] +=1
|
|
329
|
+
# if pep_num != 0 and storf_num > 1:
|
|
330
|
+
# cores['extended_genera_multi'] +=1
|
|
331
|
+
##########################
|
|
332
|
+
def calc_multi_pep_extended_StORF(cluster,number_of_pep_clustered,pep_num,storf_num):
    """Record a combined (multi-PEP) cluster that has StORF counts added.

    When *pep_num* is non-zero and *storf_num* is at least 1, appends
    ``pep_num + storf_num`` to ``cores['comb_extended_genera']`` and
    *cluster* to ``clsuters_to_be_validated['comb_extended_genera']``.
    *number_of_pep_clustered* is accepted but not used.
    """
    if pep_num == 0 or storf_num < 1:
        return
    cores['comb_extended_genera'].append(pep_num + storf_num)
    clsuters_to_be_validated['comb_extended_genera'].append(cluster)
|
|
336
|
+
|
|
337
|
+
#########################
|
|
338
|
+
def calc_StORF_only_when_with_pep(cluster,storf_num):
    """Record a StORF count under the 'storf_genera' key.

    Appends *storf_num* to ``cores['storf_genera']`` and then *cluster* to
    ``clsuters_to_be_validated['storf_genera']``.
    """
    for registry, entry in ((cores, storf_num), (clsuters_to_be_validated, cluster)):
        registry['storf_genera'].append(entry)
|
|
341
|
+
# if storf_num == 1:# and StORF_num == 0:
|
|
342
|
+
# cores['storf_genera_single'] += 1
|
|
343
|
+
# elif storf_num > 1:# and StORF_num == 0:
|
|
344
|
+
# cores['storf_genera_multi'] += 1
|
|
345
|
+
######################## What is the difference with these?
|
|
346
|
+
def calc_only_StORF(cluster,storf_num,max_storf_only_genera): # only count the true storf onlies
    """Record a StORF-only cluster and return the updated running maximum.

    Appends *storf_num* to ``cores['only_storf_genera']`` and *cluster* to
    ``clsuters_to_be_validated['only_storf_genera']``.  When *storf_num* is
    at least 6, ``[cluster, storf_num]`` is also appended to
    ``cores['only_storf_genera_recording']``.

    Returns the larger of *max_storf_only_genera* and *storf_num*.
    """
    cores['only_storf_genera'].append(storf_num)
    clsuters_to_be_validated['only_storf_genera'].append(cluster)
    if storf_num >= 6:
        cores['only_storf_genera_recording'].append([cluster, storf_num])
    # NOTE(review): indentation was lost in the source rendering; the maximum
    # is assumed to be updated on every call, not only when storf_num >= 6 —
    # confirm against the original file.
    return max(max_storf_only_genera, storf_num)
|
|
358
|
+
#########################
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
###########################
|
|
363
|
+
print("Running")
|
|
364
|
+
check_all_calced = 0
|
|
365
|
+
for cluster, numbers in pangenome_clusters_Type_Genera.items():
|
|
366
|
+
pep_strains = pangenome_clusters_Type_Strains[cluster]
|
|
367
|
+
if numbers[3] >=1:
|
|
368
|
+
StORF_Genomes_Extended.append(numbers[3])
|
|
369
|
+
if numbers[4] >=1:
|
|
370
|
+
StORF_Seqs_Extended.append(numbers[4])
|
|
371
|
+
############################### Calc PEP only
|
|
372
|
+
if numbers[0] == 1 and len(pep_strains) >= 2: # If StORFs did not combine PEP reps
|
|
373
|
+
calc_pep_only(numbers[1])#,numbers[3])
|
|
374
|
+
check_all_calced +=1
|
|
375
|
+
elif numbers[0] >1: # IF StORFs combined multiple PEP
|
|
376
|
+
calc_pep_only(numbers[2][0])
|
|
377
|
+
check_all_calced += 1
|
|
378
|
+
# for num in numbers[2]:
|
|
379
|
+
# calc_pep_only(num) # ,numbers[3])
|
|
380
|
+
|
|
381
|
+
############################# Calc PEP and StORF_Reporter
|
|
382
|
+
if numbers[0] == 1 and numbers[3] >1: # If StORFs did not combine PEP reps
|
|
383
|
+
calc_pep_extended_StORF(cluster,numbers[1],numbers[3])
|
|
384
|
+
extended.update({cluster:numbers})
|
|
385
|
+
check_all_calced += 1
|
|
386
|
+
elif numbers[0] >1 and numbers[3] >1: # IF StORFs combined multiple PEP - Genera added
|
|
387
|
+
#grouped_pep = sum(numbers[2])
|
|
388
|
+
#for num in numbers[2]:
|
|
389
|
+
calc_multi_pep_extended_StORF(cluster,numbers[2],numbers[1],numbers[3]) # same here
|
|
390
|
+
print("combined: " + str(cluster))
|
|
391
|
+
|
|
392
|
+
extended.update({cluster: numbers})
|
|
393
|
+
check_all_calced += 1
|
|
394
|
+
elif numbers[0] >1 and numbers[4] >1: # IF StORFs combined multiple PEP
|
|
395
|
+
multi_PEP_Combined_By_StORFs.update({cluster: numbers})
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
import os
|
|
399
|
+
###########################
|
|
400
|
+
############################### Calc StORF_Reporter only
|
|
401
|
+
Combined_pangenome_clusters_ONLY_StORF_Type = collections.defaultdict(list)
|
|
402
|
+
Combined_pangenome_clusters_StORF_Type = collections.defaultdict(list)
|
|
403
|
+
|
|
404
|
+
biggest_genera = ""
|
|
405
|
+
big_genera = 0
|
|
406
|
+
biggest_strains = ""
|
|
407
|
+
big_strains = 0
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
#Without_StORF = open('./Ensem_Clusters_Without_Con-StORFs_To_Be_Nogged_min2','w')
|
|
411
|
+
#With_StORF = open('./Ensem_Clusters_With_Con-StORFs_To_Be_Nogged','w')
|
|
412
|
+
#With_Extending_StORF = open('./Ensem_Clusters_With_Extending_Con-StORFs_To_Be_Nogged','w')
|
|
413
|
+
StORF_Only = open("./StORF_Only_Clusters_To_Be_Nogged_min2",'w')
|
|
414
|
+
|
|
415
|
+
for cluster, genera in Combined_pangenome_clusters_StORF_Genera.items():
|
|
416
|
+
storf_strains = Combined_pangenome_clusters_StORF_Strains[cluster]
|
|
417
|
+
pep_strains = Combined_pangenome_clusters_PEP_Strains[cluster]
|
|
418
|
+
if cluster in not_StORF_Only_Cluster_IDs:
|
|
419
|
+
Combined_pangenome_clusters_StORF_Type[cluster] = [cluster,len(genera)]
|
|
420
|
+
#if len(genera) >= 1:
|
|
421
|
+
calc_StORF_only_when_with_pep(cluster,len(genera)) # ,numbers[3])
|
|
422
|
+
else:
|
|
423
|
+
if len(storf_strains) >= 2:
|
|
424
|
+
StORF_Only.write(str(cluster) + ',')
|
|
425
|
+
Combined_pangenome_clusters_ONLY_StORF_Type[cluster] = [cluster,len(genera)]
|
|
426
|
+
max_storf_only_genera = calc_only_StORF(cluster,len(genera),max_storf_only_genera)
|
|
427
|
+
if len(genera) > big_genera:
|
|
428
|
+
big_genera = len(genera)
|
|
429
|
+
biggest_genera = cluster
|
|
430
|
+
if len(storf_strains) >= big_strains:
|
|
431
|
+
big_strains = len(storf_strains)
|
|
432
|
+
biggest_strains = cluster
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
print("Biggest: " +biggest_genera)
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
############################### Calc StORF_Reporter only
|
|
446
|
+
# for cluster, data in Combined_pangenome_clusters_StORF_Type.items():
|
|
447
|
+
# if data[1] >=1:
|
|
448
|
+
# calc_StORF_only_when_with_pep(data[1]) # ,numbers[3])
|
|
449
|
+
#
|
|
450
|
+
#
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
#################
|
|
454
|
+
print(cores)
|
|
455
|
+
#print(extended)
|
|
456
|
+
|
|
457
|
+
from collections import Counter
|
|
458
|
+
|
|
459
|
+
#print(Counter(cores['pep_genera']))
|
|
460
|
+
#print(Counter(cores['extended_genera']))
|
|
461
|
+
#print(Counter(cores['comb_extended_genera']))
|
|
462
|
+
#print(Counter(cores['storf_genera']))
|
|
463
|
+
print(Counter(cores['only_storf_genera']))
|
|
464
|
+
print(cores['only_storf_genera_recording'])
|
|
465
|
+
|
|
466
|
+
print("END")
|
|
467
|
+
|
|
468
|
+
### Empty file ready for interesting storfs
|
|
469
|
+
# interesting_out = "./StORF_Only_Clusters_To_Be_Swissed.fa"
|
|
470
|
+
# with open(interesting_out, 'r+') as f:
|
|
471
|
+
# f.truncate(4)
|
|
472
|
+
# for cluster, data in Combined_pangenome_clusters_ONLY_StORF_Type.items():
|
|
473
|
+
# #if number >1:
|
|
474
|
+
# if data[1] >=1:
|
|
475
|
+
# calc_only_StORF(data[1]) # ,numbers[3])
|
|
476
|
+
# # if data[1] >= 2:
|
|
477
|
+
# # print("Interesting:" + str(cluster))
|
|
478
|
+
# # os.system(
|
|
479
|
+
# # "python3 Extract_FASTA_From_Cluster.py -f ./All_Ensem_Filtered_PEP_Clustered_With_Unclustered_UR-StORFS_s.fa_CD_c90_s60.fa "
|
|
480
|
+
# # "-c ./All_Ensem_Filtered_PEP_Clustered_With_Unclustered_UR-StORFS_s.fa_CD_c90_s60.clstr -id " + str(
|
|
481
|
+
# # data[0]) + " -o "+ interesting_out)
|
|
482
|
+
# #
|
|
483
|
+
#
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
#
|
|
489
|
+
# ################################## Species
|
|
490
|
+
# pangenome_clusters_Type_Species = copy.deepcopy(pangenome_clusters_PEP_Species)
|
|
491
|
+
# for cluster, genomes in pangenome_clusters_PEP_Species.items():
|
|
492
|
+
# print(str(len(genomes)) + '\t' + str(len(pangenome_clusters_StORF_Species[cluster])))
|
|
493
|
+
# Con_StORFs = pangenome_clusters_StORF_Species[cluster]
|
|
494
|
+
# unique_con = 0
|
|
495
|
+
# all_con = 0
|
|
496
|
+
# for con in Con_StORFs:
|
|
497
|
+
# all_con +=1
|
|
498
|
+
# if con not in genomes:
|
|
499
|
+
# unique_con +=1
|
|
500
|
+
# pangenome_clusters_Type_Species[cluster] = [len(genomes),all_con,unique_con]
|
|
501
|
+
# ################################# Strains
|
|
502
|
+
# pangenome_clusters_Type_Strains = copy.deepcopy(pangenome_clusters_PEP_Strains)
|
|
503
|
+
# for cluster, genomes in pangenome_clusters_PEP_Strains.items():
|
|
504
|
+
# print(str(len(genomes))+'\t'+str(len(pangenome_clusters_StORF_Strains[cluster])))
|
|
505
|
+
# Con_StORFs = pangenome_clusters_StORF_Strains[cluster]
|
|
506
|
+
# unique_con = 0
|
|
507
|
+
# all_con = 0
|
|
508
|
+
# for con in Con_StORFs:
|
|
509
|
+
# all_con +=1
|
|
510
|
+
# if con not in genomes:
|
|
511
|
+
# unique_con +=1
|
|
512
|
+
# pangenome_clusters_Type_Strains[cluster] = [len(genomes),all_con,unique_con]
|
|
513
|
+
# ###################################
|
|
514
|
+
# Chris_Out = open('./Chris_Clusters.txt','w')
|
|
515
|
+
#
|
|
516
|
+
# clusters_For_Chris = collections.OrderedDict()
|
|
517
|
+
# clusters_For_Chris_PEP_0 = collections.OrderedDict()
|
|
518
|
+
#
|
|
519
|
+
# Chris_Out.write("Cluster\tSize\tEnsem_Genera_Num\tCon-StORF_Genera_Num\tCon-StORF_Only_Genera_Num\tEnsem_Species_Num\tCon-StORF_Species_Num\tCon-StORF_Only_Species_Num\tEnsem_Strain_Num\tCon-StORF_Strain_Num\tCon-StORF_Only_Strain_Num\n")
|
|
520
|
+
# #This for-loop will go through ALL Clusters allowing for the extraction of ALL different groupings
|
|
521
|
+
# for cluster, data in clusters.items():
|
|
522
|
+
# genera_numbers = pangenome_clusters_Type_Genera[cluster]
|
|
523
|
+
# species_numbers = pangenome_clusters_Type_Species[cluster]
|
|
524
|
+
# strain_numbers = pangenome_clusters_Type_Strains[cluster]
|
|
525
|
+
#
|
|
526
|
+
# Chris_Out.write(str(cluster)+'\t'+str(len(data))+'\t'+str(genera_numbers[0])+'\t'+str(genera_numbers[1])+'\t'+str(genera_numbers[2])+'\t'+str(species_numbers[0])+'\t'
|
|
527
|
+
# +str(species_numbers[1])+'\t'+str(species_numbers[2])+'\t'+str(strain_numbers[0])+'\t'+str(strain_numbers[1])+'\t'+str(species_numbers[2])+'\n')
|
|
528
|
+
|
|
529
|
+
# if cluster in clusters_With_Con_StORFs:
|
|
530
|
+
# print("Current")
|
|
531
|
+
# size_Of_Cluster = len(clusters[cluster])
|
|
532
|
+
# ensem_Num = 0
|
|
533
|
+
# con_StORF_Num = 0
|
|
534
|
+
# for i in clusters[cluster]:
|
|
535
|
+
# print(i)
|
|
536
|
+
# if 'Con-Stop' in i:
|
|
537
|
+
# con_StORF_Num +=1
|
|
538
|
+
# else:
|
|
539
|
+
# ensem_Num +=1
|
|
540
|
+
# clusters_For_Chris.update({cluster:[size_Of_Cluster,pep_Num,con_StORF_Num,numbers[0],numbers[1]]})
|
|
541
|
+
# ############# Add - Num of
|
|
542
|
+
# Chris_Out.write(str(cluster)+'\t'+str(size_Of_Cluster)+'\t'+str(pep_Num)+'\t'+str(ensem_Genera)+ str(con_StORF_Num)+'\t'+str(numbers[0])+'\t'+str(numbers[1])+'\n')
|
|
543
|
+
# if pep_Num == 0:
|
|
544
|
+
# clusters_For_Chris_PEP_0.update({cluster:[size_Of_Cluster,pep_Num,con_StORF_Num,numbers[0],numbers[1]]})
|
|
545
|
+
|
|
546
|
+
print("Da Da!!!!")
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
#
|
|
550
|
+
#
|
|
551
|
+
#
|
|
552
|
+
#
|
|
553
|
+
# ###################################
|
|
554
|
+
#
|
|
555
|
+
# core_99 = 9.9/10 * len(genome_dict)
|
|
556
|
+
# core_95 = 9.5/10 * len(genome_dict)
|
|
557
|
+
# core_90 = 9/10 * len(genome_dict)
|
|
558
|
+
# core_15 = 1.5/10 * len(genome_dict)
|
|
559
|
+
#
|
|
560
|
+
# pep_core_99 = 0
|
|
561
|
+
# pep_core_95 = 0
|
|
562
|
+
# pep_core_90 = 0
|
|
563
|
+
# pep_core_15 = 0
|
|
564
|
+
#
|
|
565
|
+
#
|
|
566
|
+
# extended_99 = 0
|
|
567
|
+
# extended_95 = 0
|
|
568
|
+
# extended_90 = 0
|
|
569
|
+
# extended_15 = 0
|
|
570
|
+
# ############### Needs to be redone with new 'numbers'
|
|
571
|
+
# for cluster, numbers in pangenome_clusters_Type_Genera.items():
|
|
572
|
+
# if numbers[0] >= math.floor(core_99) and numbers[1] == 0:
|
|
573
|
+
# pep_core_99 +=1
|
|
574
|
+
# elif numbers[0] >= math.floor(core_95) and numbers[0] < math.floor(core_99) and numbers[1] == 0:
|
|
575
|
+
# pep_core_95 +=1
|
|
576
|
+
# elif numbers[0] >= math.floor(core_90) and numbers[0] < math.floor(core_95) and numbers[1] == 0:
|
|
577
|
+
# pep_core_90 +=1
|
|
578
|
+
# if numbers[0] >= math.floor(core_15) and numbers[0] < math.floor(core_95) and numbers[1] == 0: # this catch captures some from pep_core_90
|
|
579
|
+
# pep_core_15 +=1
|
|
580
|
+
# ############ With Con-StORFs
|
|
581
|
+
# if numbers[0] < math.floor(core_99) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_99):
|
|
582
|
+
# extended_99 +=1
|
|
583
|
+
# elif numbers[0] < math.floor(core_95) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_95) and numbers[0]+numbers[1] < math.floor(core_99):
|
|
584
|
+
# extended_95 +=1
|
|
585
|
+
# elif numbers[0] < math.floor(core_90) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_90) and numbers[0]+numbers[1] < math.floor(core_95):
|
|
586
|
+
# extended_90 +=1
|
|
587
|
+
# if numbers[0] < math.floor(core_15) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_15) and numbers[0]+numbers[1] < math.floor(core_95):
|
|
588
|
+
# extended_15 +=1
|
|
589
|
+
#
|
|
590
|
+
# print("Out")
|
|
591
|
+
# print(pep_core_99)
|
|
592
|
+
# print(pep_core_95)
|
|
593
|
+
# print(pep_core_90)
|
|
594
|
+
# print(pep_core_15)
|
|
595
|
+
#
|
|
596
|
+
# print(extended_99)
|
|
597
|
+
# print(extended_95)
|
|
598
|
+
# print(extended_90)
|
|
599
|
+
# print(extended_15)
|
|
600
|
+
#
|
PyamilySeq/Constants.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Version string of the PyamilySeq package.
PyamilySeq_Version = 'v0.2.0'
|