PyamilySeq 0.5.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,600 +0,0 @@
1
- import collections
2
- import copy
3
- import math
4
- import sys
5
- import numpy as np
6
- from itertools import chain
7
-
8
- def get_Genus(clustered):
9
- clustered_genus = clustered.split('|')[0]
10
- if '_' in clustered_genus[0]: # Remove name error
11
- clustered_genus = clustered_genus.split('_')[1]
12
- else:
13
- clustered_genus = clustered_genus.split('_')[0]
14
- return str(clustered_genus).capitalize()
15
-
16
- def get_Species(clustered):
17
- clustered_species = clustered.split('|')[0]
18
- if '_' in clustered_species[0]: # Remove name error
19
- clustered_species = clustered_species.split('_')[1]
20
- else:
21
- clustered_species = clustered_species.split('_')[:2]
22
- return str('_'.join(clustered_species)).capitalize()
23
-
24
-
25
- PEP_In = open('/home/nick/Documents/Single_Genome/All_Ensembl_PEP_CD_Clustered_90_60.clstr','r')
26
- StORF_In = open('/home/nick/Documents/Single_Genome/All_Ensem_PEP_CD_Clustered_90_60_Unclustered_UR_StORFs_AA_CD.clstr','r') # Clusters for single Genera
27
-
28
- clusters = collections.OrderedDict()
29
-
30
-
31
- pangenome_clusters_PEP_Genera = collections.OrderedDict()
32
- pangenome_clusters_PEP_Species = collections.OrderedDict()
33
- pangenome_clusters_PEP_Strains = collections.OrderedDict()
34
- pangenome_clusters_PEP_SEQS = collections.OrderedDict()
35
-
36
-
37
- max_storf_only_genera = 0
38
-
39
-
40
-
41
-
42
- count = 0
43
- first = True
44
- genome_dict = collections.defaultdict(int)
45
- reps = collections.OrderedDict()
46
- county = 0
47
- #singleton_cluster = "Null"
48
- clusters_With_Con_StORFs = []
49
- ## Load in all data for easier reuse later
50
- for line in PEP_In:
51
- if line.startswith('>'):
52
- if first == False:
53
- Ensem_Con = set(Ensem_genomes).intersection(Con_genomes)
54
- cluster_size = len(clusters[cluster_id])
55
- reps.update({rep: [cluster_size,len(pangenome_clusters_PEP_Genera[cluster_id])]}) # Add strains, species here if wanted
56
- #if len(clusters[cluster_id]) == 1 and "Null" not in singleton_cluster: # Stop at clusters smaller than 10
57
- # singleton_cluster = cluster_id
58
- #if len(clusters[cluster_id]) < 10: # Stop at clusters smaller than 10
59
- # pangenome_clusters_PEP_Species.popitem()
60
- # pangenome_clusters_PEP_Genera.popitem()
61
- # pangenome_clusters_PEP_SEQS.popitem()
62
- # reps.popitem()
63
- # if len(clusters[cluster_id]) == 1:
64
- # break # REMEMBER
65
- Ensem_genomes, Con_genomes = [], []
66
- cluster_id = line.strip('>')
67
- cluster_id = cluster_id.strip('\n')
68
- cluster_id = cluster_id.split(' ')[1]
69
- clusters.update({cluster_id: []})
70
- pangenome_clusters_PEP_Genera.update({cluster_id: []})
71
- pangenome_clusters_PEP_Species.update({cluster_id:[]})
72
- pangenome_clusters_PEP_Strains.update({cluster_id: []})
73
- # pangenome_clusters_PEP_SEQS.update({cluster_id:[]})
74
-
75
- first = False
76
- else:
77
- clustered = line.split('\t')[1]
78
- clustered = clustered.split('>')[1]
79
- clustered = clustered.split('...')[0]
80
- genome = clustered.split('|')[0]
81
- genome_dict[genome] +=1
82
- if '*' in line:
83
- rep = clustered
84
- reps.update({rep:[0,0]})
85
- if first == False:
86
- clusters[cluster_id].append(clustered)
87
- clustered_genus = get_Genus(clustered)
88
- clustered_species = get_Species(clustered)
89
- clustered_strain = clustered.split('|')[0]
90
-
91
- if clustered_genus not in pangenome_clusters_PEP_Genera[cluster_id]:
92
- pangenome_clusters_PEP_Genera[cluster_id].append(clustered_genus)
93
- #if clustered_species not in pangenome_clusters_PEP_Species[cluster_id]:
94
- # pangenome_clusters_PEP_Species[cluster_id].append(clustered_species)
95
- if genome not in pangenome_clusters_PEP_Strains[cluster_id]:
96
- pangenome_clusters_PEP_Strains[cluster_id].append(genome)
97
- # pangenome_clusters_PEP_SEQS[cluster_id].append(clustered)
98
- print("PEP DONE")
99
- ######################################
100
- Combined_pangenome_clusters_PEP_Genera = collections.OrderedDict()
101
- Combined_pangenome_clusters_PEP_Species = collections.OrderedDict()
102
- Combined_pangenome_clusters_PEP_Strains = collections.OrderedDict()
103
- Combined_pangenome_clusters_PEP_SEQS = collections.OrderedDict()
104
-
105
- Combined_pangenome_clusters_StORF_Genera = collections.OrderedDict()
106
- Combined_pangenome_clusters_StORF_Species = collections.OrderedDict()
107
- Combined_pangenome_clusters_StORF_Strains = collections.OrderedDict()
108
- Combined_pangenome_clusters_StORF_SEQS = collections.OrderedDict()
109
-
110
- Combined_pangenome_clusters_PEP_StORF_Clustered_Genera = collections.OrderedDict()
111
- Combined_pangenome_clusters_PEP_StORF_Clustered = collections.OrderedDict()
112
-
113
- not_StORF_Only_Cluster_IDs = []
114
-
115
- Combined_clusters = collections.OrderedDict()
116
- Combined_reps = collections.OrderedDict()
117
- first = True
118
- ###############
119
- ## We load in the combined PEP and StORF_Reporter data separately
120
- for line in StORF_In:
121
- if line.startswith('>'):
122
- if first == False:
123
- cluster_size = len(Combined_clusters[cluster_id])
124
- Combined_reps.update({rep: cluster_size})
125
- # if len(Combined_pangenome_clusters_PEP_SEQS[cluster_id]) > 1:
126
- # print("Here")
127
- if len(Combined_pangenome_clusters_StORF_SEQS[cluster_id]) > 0 and len(Combined_pangenome_clusters_PEP_SEQS[cluster_id]) > 0:
128
- if len(Combined_pangenome_clusters_PEP_SEQS[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
129
- all_but_first = Combined_pangenome_clusters_PEP_SEQS[cluster_id][1:]
130
- storfs_clustered = Combined_pangenome_clusters_StORF_SEQS[cluster_id]
131
- VALUE = all_but_first+storfs_clustered
132
- else:
133
- VALUE = Combined_pangenome_clusters_StORF_SEQS[cluster_id]
134
- KEY = Combined_pangenome_clusters_PEP_SEQS[cluster_id][0]
135
- Combined_pangenome_clusters_PEP_StORF_Clustered.update({KEY:VALUE})
136
-
137
-
138
- ## Below needs to be rewritten. With >1 genus - be able to record multiple PEPs for each combined...
139
- # if len(Combined_pangenome_clusters_StORF_Genera[cluster_id]) > 0 and len(Combined_pangenome_clusters_PEP_Genera[cluster_id]) > 0:
140
- # KEY = Combined_pangenome_clusters_PEP_SEQS[cluster_id][0]
141
- # VALUE = Combined_pangenome_clusters_StORF_SEQS[cluster_id]
142
- # Combined_pangenome_clusters_PEP_StORF_Clustered_Genera.update({KEY:VALUE})
143
- if len(Combined_clusters[cluster_id]) == 1: # Stop at clusters smaller than 10
144
- print("First Singleton Cluster is: " +str(cluster_id))
145
- break
146
- cluster_id = line.strip('>')
147
- cluster_id = cluster_id.strip('\n')
148
- cluster_id = cluster_id.split(' ')[1]
149
- Combined_clusters.update({cluster_id: []})
150
- # Combined_pangenome_clusters_PEP_Genera.update({cluster_id:[]})
151
- # Combined_pangenome_clusters_PEP_Species.update({cluster_id: []})
152
- Combined_pangenome_clusters_PEP_Strains.update({cluster_id: []})
153
- Combined_pangenome_clusters_PEP_SEQS.update({cluster_id: []})
154
- #
155
- Combined_pangenome_clusters_StORF_Genera.update({cluster_id: []})
156
- # Combined_pangenome_clusters_StORF_Species.update({cluster_id: []})
157
- Combined_pangenome_clusters_StORF_Strains.update({cluster_id: []})
158
- Combined_pangenome_clusters_StORF_SEQS.update({cluster_id: []})
159
- first = False
160
- else:
161
- clustered = line.split('\t')[1]
162
- clustered = clustered.split('>')[1]
163
- clustered = clustered.split('...')[0]
164
- if '*' in line:
165
- rep = clustered
166
- Combined_reps.update({rep:0})
167
- if first == False:
168
- Combined_clusters[cluster_id].append(clustered)
169
- clustered_genus = get_Genus(clustered)
170
- clustered_species = get_Species(clustered)
171
- clustered_strain = clustered.split('|')[0]
172
- if '_' in clustered_strain[0]: # Remove name error
173
- clustered_strain = clustered_strain.split('_')[1]
174
-
175
- if "StORF_Type" in line:
176
- # if cluster_id not in clusters_With_Con_StORFs: # For counting?
177
- # clusters_With_Con_StORFs.append(cluster_id)
178
- if clustered_genus not in Combined_pangenome_clusters_StORF_Genera[cluster_id]:
179
- Combined_pangenome_clusters_StORF_Genera[cluster_id].append(clustered_genus)
180
- # if clustered_species not in Combined_pangenome_clusters_StORF_Species[cluster_id]:
181
- # Combined_pangenome_clusters_StORF_Species[cluster_id].append(clustered_species)
182
- if clustered_strain not in Combined_pangenome_clusters_StORF_Strains[cluster_id]:
183
- Combined_pangenome_clusters_StORF_Strains[cluster_id].append(clustered_strain)
184
- Combined_pangenome_clusters_StORF_SEQS[cluster_id].append(clustered)
185
- #
186
- else:
187
- # if clustered_genus not in Combined_pangenome_clusters_PEP_Genera[cluster_id]:
188
- # Combined_pangenome_clusters_PEP_Genera[cluster_id].append(clustered_genus)
189
- # if clustered_species not in Combined_pangenome_clusters_PEP_Species[cluster_id]:
190
- # Combined_pangenome_clusters_PEP_Species[cluster_id].append(clustered_species)
191
- if clustered_strain not in Combined_pangenome_clusters_PEP_Strains[cluster_id]:
192
- Combined_pangenome_clusters_PEP_Strains[cluster_id].append(clustered_strain)
193
- if cluster_id not in not_StORF_Only_Cluster_IDs:
194
- not_StORF_Only_Cluster_IDs.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
195
- Combined_pangenome_clusters_PEP_SEQS[cluster_id].append(clustered)
196
-
197
-
198
-
199
- ###HERE for tomorrow - copy the updated work from single to here and repeat for genus,species and strain
200
- list_of_reps = list(reps.keys())
201
- num_clustered_PEP_Genera = collections.defaultdict(list)
202
- recorded_PEP = []
203
- ################################# Genera
204
- pangenome_clusters_Type_Genera = copy.deepcopy(pangenome_clusters_PEP_Genera)
205
- pangenome_clusters_Type_Strains = collections.defaultdict(list)
206
-
207
- for cluster, pep_genomes in pangenome_clusters_PEP_Genera.items():
208
- recorded_PEP.append(cluster)
209
- rep = list_of_reps[int(cluster)]
210
- Com_PEPs = 0
211
- Com_PEP_Genomes = 0
212
- StORFs = 0
213
- Added_StORF_Genera = 0
214
- seen_clust_Strains = []
215
-
216
- PEP_Strains = pangenome_clusters_PEP_Strains[cluster]
217
- for clustered_strain in PEP_Strains:
218
- if '_' in clustered_strain[0]: # Remove name error
219
- clustered_strain = clustered_strain[1:]
220
- if clustered_strain not in seen_clust_Strains:
221
- seen_clust_Strains.append(clustered_strain)
222
-
223
-
224
- try:
225
- clustered_combined = Combined_pangenome_clusters_PEP_StORF_Clustered[rep]
226
- seen_clust_Genera = []
227
- num_clustered_PEP_Genera[cluster].append(rep + '_' + str(len(pep_genomes)))
228
- for clust in clustered_combined:
229
- if 'StORF_Type' not in clust:
230
- ### Need to get the number of pep genomes for each pep clustered into this
231
- Com_PEPs += 1
232
- clustered_genus = get_Genus(clust)
233
- #clust_Genome = clust.split('|')[0]
234
- if clustered_genus not in seen_clust_Genera:
235
- seen_clust_Genera.append(clustered_genus)
236
- if clustered_genus not in pep_genomes:
237
- Com_PEP_Genomes += 1
238
- try:
239
- num_clustered_PEP_Genera[cluster].append(clust + '_' + str(reps[clust][1]))
240
- except TypeError:
241
- sys.exit("Broken")
242
-
243
- elif 'StORF_Type' in clust:
244
- StORFs += 1
245
- clustered_genus = get_Genus(clust)
246
- #clust_Genome = clust.split('|')[0]
247
- if clustered_genus not in seen_clust_Genera:
248
- seen_clust_Genera.append(clustered_genus)
249
- if clustered_genus not in pep_genomes:
250
- Added_StORF_Genera += 1
251
- else:
252
- print("WHAT")
253
-
254
- size_of_pep_clusters = []
255
- peps = num_clustered_PEP_Genera[cluster]
256
- for pep in peps:
257
- pep = pep.rsplit('_', 1)
258
- size_of_pep_clusters.append(int(pep[1]))
259
- pangenome_clusters_Type_Genera[cluster] = [len(num_clustered_PEP_Genera[cluster]), sum(size_of_pep_clusters),
260
- size_of_pep_clusters, Added_StORF_Genera, StORFs]
261
- pangenome_clusters_Type_Strains[cluster] = seen_clust_Strains
262
- except KeyError:
263
- ###Singleton
264
- num_pep_genomes = [len(pep_genomes)]
265
- pangenome_clusters_Type_Genera[cluster] = [1, len(pep_genomes), num_pep_genomes, Added_StORF_Genera, StORFs]
266
- pangenome_clusters_Type_Strains[cluster] = seen_clust_Strains
267
-
268
- print("S")
269
-
270
-
271
- #######################################
272
-
273
- Without_StORF = open('./Ensem_Clusters_Without_StORFs_To_Be_Nogged_min2','w')
274
- With_StORF = open('./Ensem_Clusters_With_StORFs_To_Be_Nogged','w')
275
- #With_Extending_StORF = open('./Ensem_Clusters_With_Extending_Con-StORFs_To_Be_Nogged','w')
276
-
277
- for key, value in pangenome_clusters_Type_Genera.items():
278
- pep_strains = pangenome_clusters_Type_Strains[key]
279
- if value[4] == 0 and len(pep_strains) >=2:
280
- Without_StORF.write(str(key)+',')
281
- # elif value[3] != 0:
282
- # With_Extending_StORF.write(str(key)+',')
283
- # With_StORF.write(str(key) + ',')
284
- elif value[4] >=1:
285
- With_StORF.write(str(key) + ',')
286
-
287
- With_StORF.close()
288
- Without_StORF.close()
289
- #With_Extending_StORF.close()
290
-
291
- ############## Typing for the StORF_Reporter-Data
292
-
293
-
294
- multi_PEP_Combined_By_StORFs = collections.OrderedDict()
295
-
296
- StORF_Seqs_Extended = []
297
- StORF_Genomes_Extended = []
298
-
299
- ####################################
300
- #cores = collections.OrderedDict({'pep_genera_single':[],'pep_genera_multi':[],'extended_genera':[],'comb_extended_genera_single':[],'comb_extended_genera_multi':[],'extended_genera_single':[],'extended_genera_multi':0,'storf_genera_single':0,'storf_genera_multi':0,
301
- # 'only_storf_genera_single':0,'only_storf_genera_multi':0})
302
-
303
- cores = collections.OrderedDict({'pep_genera':[],'extended_genera_single_pep':[],'many_extended_genera_pep':[],'extended_genera':[],'comb_extended_genera':[],'storf_genera':[],'only_storf_genera':[],'only_storf_genera_recording':[]})
304
-
305
- extended = collections.OrderedDict()
306
- ############################
307
-
308
- clsuters_to_be_validated = collections.defaultdict(list)
309
-
310
-
311
- ############################
312
- def calc_pep_only(pep_num):
313
- cores['pep_genera'].append(pep_num)
314
- # if pep_num == 1:# and StORF_num == 0:
315
- # cores['pep_genera_single'] += 1
316
- # elif pep_num > 1:# and StORF_num == 0:
317
- # cores['pep_genera_multi'] += 1
318
- ##########################
319
- def calc_pep_extended_StORF(cluster,pep_num,storf_num):
320
- if pep_num != 0 and storf_num >= 1:
321
- cores['extended_genera'].append(pep_num+storf_num)
322
- clsuters_to_be_validated['extended_genera'].append(cluster)
323
- if pep_num != 0 and storf_num >= 10:
324
- cores['many_extended_genera_pep'].append([cluster,pep_num+storf_num])
325
-
326
- if pep_num == 1 and storf_num >= 1:
327
- cores['extended_genera_single_pep'].append([cluster,pep_num + storf_num])
328
- # cores['extended_genera_single'] +=1
329
- # if pep_num != 0 and storf_num > 1:
330
- # cores['extended_genera_multi'] +=1
331
- ##########################
332
- def calc_multi_pep_extended_StORF(cluster,number_of_pep_clustered,pep_num,storf_num):
333
- if pep_num !=0 and storf_num >= 1:
334
- cores['comb_extended_genera'].append(pep_num+storf_num)
335
- clsuters_to_be_validated['comb_extended_genera'].append(cluster)
336
-
337
- #########################
338
- def calc_StORF_only_when_with_pep(cluster,storf_num):
339
- cores['storf_genera'].append(storf_num)
340
- clsuters_to_be_validated['storf_genera'].append(cluster)
341
- # if storf_num == 1:# and StORF_num == 0:
342
- # cores['storf_genera_single'] += 1
343
- # elif storf_num > 1:# and StORF_num == 0:
344
- # cores['storf_genera_multi'] += 1
345
- ######################## What is the difference with these?
346
- def calc_only_StORF(cluster,storf_num,max_storf_only_genera): # only count the true storf onlies
347
- cores['only_storf_genera'].append(storf_num)
348
- clsuters_to_be_validated['only_storf_genera'].append(cluster)
349
- if storf_num>=6:
350
- cores['only_storf_genera_recording'].append([cluster, storf_num])
351
- if storf_num > max_storf_only_genera:
352
- max_storf_only_genera = storf_num
353
- # if storf_num == 1:# and StORF_num == 0:
354
- # cores['only_storf_genera_single'] += 1
355
- # elif storf_num > 1:# and StORF_num == 0:
356
- # cores['only_storf_genera_multi'] += 1
357
- return max_storf_only_genera
358
- #########################
359
-
360
-
361
-
362
- ###########################
363
- print("Running")
364
- check_all_calced = 0
365
- for cluster, numbers in pangenome_clusters_Type_Genera.items():
366
- pep_strains = pangenome_clusters_Type_Strains[cluster]
367
- if numbers[3] >=1:
368
- StORF_Genomes_Extended.append(numbers[3])
369
- if numbers[4] >=1:
370
- StORF_Seqs_Extended.append(numbers[4])
371
- ############################### Calc PEP only
372
- if numbers[0] == 1 and len(pep_strains) >= 2: # If StORFs did not combine PEP reps
373
- calc_pep_only(numbers[1])#,numbers[3])
374
- check_all_calced +=1
375
- elif numbers[0] >1: # IF StORFs combined multiple PEP
376
- calc_pep_only(numbers[2][0])
377
- check_all_calced += 1
378
- # for num in numbers[2]:
379
- # calc_pep_only(num) # ,numbers[3])
380
-
381
- ############################# Calc PEP and StORF_Reporter
382
- if numbers[0] == 1 and numbers[3] >1: # If StORFs did not combine PEP reps
383
- calc_pep_extended_StORF(cluster,numbers[1],numbers[3])
384
- extended.update({cluster:numbers})
385
- check_all_calced += 1
386
- elif numbers[0] >1 and numbers[3] >1: # IF StORFs combined multiple PEP - Genera added
387
- #grouped_pep = sum(numbers[2])
388
- #for num in numbers[2]:
389
- calc_multi_pep_extended_StORF(cluster,numbers[2],numbers[1],numbers[3]) # same here
390
- print("combined: " + str(cluster))
391
-
392
- extended.update({cluster: numbers})
393
- check_all_calced += 1
394
- elif numbers[0] >1 and numbers[4] >1: # IF StORFs combined multiple PEP
395
- multi_PEP_Combined_By_StORFs.update({cluster: numbers})
396
-
397
-
398
- import os
399
- ###########################
400
- ############################### Calc StORF_Reporter only
401
- Combined_pangenome_clusters_ONLY_StORF_Type = collections.defaultdict(list)
402
- Combined_pangenome_clusters_StORF_Type = collections.defaultdict(list)
403
-
404
- biggest_genera = ""
405
- big_genera = 0
406
- biggest_strains = ""
407
- big_strains = 0
408
-
409
-
410
- #Without_StORF = open('./Ensem_Clusters_Without_Con-StORFs_To_Be_Nogged_min2','w')
411
- #With_StORF = open('./Ensem_Clusters_With_Con-StORFs_To_Be_Nogged','w')
412
- #With_Extending_StORF = open('./Ensem_Clusters_With_Extending_Con-StORFs_To_Be_Nogged','w')
413
- StORF_Only = open("./StORF_Only_Clusters_To_Be_Nogged_min2",'w')
414
-
415
- for cluster, genera in Combined_pangenome_clusters_StORF_Genera.items():
416
- storf_strains = Combined_pangenome_clusters_StORF_Strains[cluster]
417
- pep_strains = Combined_pangenome_clusters_PEP_Strains[cluster]
418
- if cluster in not_StORF_Only_Cluster_IDs:
419
- Combined_pangenome_clusters_StORF_Type[cluster] = [cluster,len(genera)]
420
- #if len(genera) >= 1:
421
- calc_StORF_only_when_with_pep(cluster,len(genera)) # ,numbers[3])
422
- else:
423
- if len(storf_strains) >= 2:
424
- StORF_Only.write(str(cluster) + ',')
425
- Combined_pangenome_clusters_ONLY_StORF_Type[cluster] = [cluster,len(genera)]
426
- max_storf_only_genera = calc_only_StORF(cluster,len(genera),max_storf_only_genera)
427
- if len(genera) > big_genera:
428
- big_genera = len(genera)
429
- biggest_genera = cluster
430
- if len(storf_strains) >= big_strains:
431
- big_strains = len(storf_strains)
432
- biggest_strains = cluster
433
-
434
-
435
-
436
-
437
-
438
-
439
-
440
-
441
- print("Biggest: " +biggest_genera)
442
-
443
-
444
-
445
- ############################### Calc StORF_Reporter only
446
- # for cluster, data in Combined_pangenome_clusters_StORF_Type.items():
447
- # if data[1] >=1:
448
- # calc_StORF_only_when_with_pep(data[1]) # ,numbers[3])
449
- #
450
- #
451
-
452
-
453
- #################
454
- print(cores)
455
- #print(extended)
456
-
457
- from collections import Counter
458
-
459
- #print(Counter(cores['pep_genera']))
460
- #print(Counter(cores['extended_genera']))
461
- #print(Counter(cores['comb_extended_genera']))
462
- #print(Counter(cores['storf_genera']))
463
- print(Counter(cores['only_storf_genera']))
464
- print(cores['only_storf_genera_recording'])
465
-
466
- print("END")
467
-
468
- ### Emoty file ready for interesting storfs
469
- # interesting_out = "./StORF_Only_Clusters_To_Be_Swissed.fa"
470
- # with open(interesting_out, 'r+') as f:
471
- # f.truncate(4)
472
- # for cluster, data in Combined_pangenome_clusters_ONLY_StORF_Type.items():
473
- # #if number >1:
474
- # if data[1] >=1:
475
- # calc_only_StORF(data[1]) # ,numbers[3])
476
- # # if data[1] >= 2:
477
- # # print("Interesting:" + str(cluster))
478
- # # os.system(
479
- # # "python3 Extract_FASTA_From_Cluster.py -f ./All_Ensem_Filtered_PEP_Clustered_With_Unclustered_UR-StORFS_s.fa_CD_c90_s60.fa "
480
- # # "-c ./All_Ensem_Filtered_PEP_Clustered_With_Unclustered_UR-StORFS_s.fa_CD_c90_s60.clstr -id " + str(
481
- # # data[0]) + " -o "+ interesting_out)
482
- # #
483
- #
484
-
485
-
486
-
487
-
488
- #
489
- # ################################## Species
490
- # pangenome_clusters_Type_Species = copy.deepcopy(pangenome_clusters_PEP_Species)
491
- # for cluster, genomes in pangenome_clusters_PEP_Species.items():
492
- # print(str(len(genomes)) + '\t' + str(len(pangenome_clusters_StORF_Species[cluster])))
493
- # Con_StORFs = pangenome_clusters_StORF_Species[cluster]
494
- # unique_con = 0
495
- # all_con = 0
496
- # for con in Con_StORFs:
497
- # all_con +=1
498
- # if con not in genomes:
499
- # unique_con +=1
500
- # pangenome_clusters_Type_Species[cluster] = [len(genomes),all_con,unique_con]
501
- # ################################# Strains
502
- # pangenome_clusters_Type_Strains = copy.deepcopy(pangenome_clusters_PEP_Strains)
503
- # for cluster, genomes in pangenome_clusters_PEP_Strains.items():
504
- # print(str(len(genomes))+'\t'+str(len(pangenome_clusters_StORF_Strains[cluster])))
505
- # Con_StORFs = pangenome_clusters_StORF_Strains[cluster]
506
- # unique_con = 0
507
- # all_con = 0
508
- # for con in Con_StORFs:
509
- # all_con +=1
510
- # if con not in genomes:
511
- # unique_con +=1
512
- # pangenome_clusters_Type_Strains[cluster] = [len(genomes),all_con,unique_con]
513
- # ###################################
514
- # Chris_Out = open('./Chris_Clusters.txt','w')
515
- #
516
- # clusters_For_Chris = collections.OrderedDict()
517
- # clusters_For_Chris_PEP_0 = collections.OrderedDict()
518
- #
519
- # Chris_Out.write("Cluster\tSize\tEnsem_Genera_Num\tCon-StORF_Genera_Num\tCon-StORF_Only_Genera_Num\tEnsem_Species_Num\tCon-StORF_Species_Num\tCon-StORF_Only_Species_Num\tEnsem_Strain_Num\tCon-StORF_Strain_Num\tCon-StORF_Only_Strain_Num\n")
520
- # #This for-loop will go through ALL Clusters allowing for the extraction of ALL different groupings
521
- # for cluster, data in clusters.items():
522
- # genera_numbers = pangenome_clusters_Type_Genera[cluster]
523
- # species_numbers = pangenome_clusters_Type_Species[cluster]
524
- # strain_numbers = pangenome_clusters_Type_Strains[cluster]
525
- #
526
- # Chris_Out.write(str(cluster)+'\t'+str(len(data))+'\t'+str(genera_numbers[0])+'\t'+str(genera_numbers[1])+'\t'+str(genera_numbers[2])+'\t'+str(species_numbers[0])+'\t'
527
- # +str(species_numbers[1])+'\t'+str(species_numbers[2])+'\t'+str(strain_numbers[0])+'\t'+str(strain_numbers[1])+'\t'+str(species_numbers[2])+'\n')
528
-
529
- # if cluster in clusters_With_Con_StORFs:
530
- # print("Current")
531
- # size_Of_Cluster = len(clusters[cluster])
532
- # ensem_Num = 0
533
- # con_StORF_Num = 0
534
- # for i in clusters[cluster]:
535
- # print(i)
536
- # if 'Con-Stop' in i:
537
- # con_StORF_Num +=1
538
- # else:
539
- # ensem_Num +=1
540
- # clusters_For_Chris.update({cluster:[size_Of_Cluster,pep_Num,con_StORF_Num,numbers[0],numbers[1]]})
541
- # ############# Add - Num of
542
- # Chris_Out.write(str(cluster)+'\t'+str(size_Of_Cluster)+'\t'+str(pep_Num)+'\t'+str(ensem_Genera)+ str(con_StORF_Num)+'\t'+str(numbers[0])+'\t'+str(numbers[1])+'\n')
543
- # if pep_Num == 0:
544
- # clusters_For_Chris_PEP_0.update({cluster:[size_Of_Cluster,pep_Num,con_StORF_Num,numbers[0],numbers[1]]})
545
-
546
- print("Da Da!!!!")
547
-
548
-
549
- #
550
- #
551
- #
552
- #
553
- # ###################################
554
- #
555
- # core_99 = 9.9/10 * len(genome_dict)
556
- # core_95 = 9.5/10 * len(genome_dict)
557
- # core_90 = 9/10 * len(genome_dict)
558
- # core_15 = 1.5/10 * len(genome_dict)
559
- #
560
- # pep_core_99 = 0
561
- # pep_core_95 = 0
562
- # pep_core_90 = 0
563
- # pep_core_15 = 0
564
- #
565
- #
566
- # extended_99 = 0
567
- # extended_95 = 0
568
- # extended_90 = 0
569
- # extended_15 = 0
570
- # ############### Needs to be redone with new 'numbers'
571
- # for cluster, numbers in pangenome_clusters_Type_Genera.items():
572
- # if numbers[0] >= math.floor(core_99) and numbers[1] == 0:
573
- # pep_core_99 +=1
574
- # elif numbers[0] >= math.floor(core_95) and numbers[0] < math.floor(core_99) and numbers[1] == 0:
575
- # pep_core_95 +=1
576
- # elif numbers[0] >= math.floor(core_90) and numbers[0] < math.floor(core_95) and numbers[1] == 0:
577
- # pep_core_90 +=1
578
- # if numbers[0] >= math.floor(core_15) and numbers[0] < math.floor(core_95) and numbers[1] == 0: # this catch captures some from pep_core_90
579
- # pep_core_15 +=1
580
- # ############ With Con-StORFs
581
- # if numbers[0] < math.floor(core_99) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_99):
582
- # extended_99 +=1
583
- # elif numbers[0] < math.floor(core_95) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_95) and numbers[0]+numbers[1] < math.floor(core_99):
584
- # extended_95 +=1
585
- # elif numbers[0] < math.floor(core_90) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_90) and numbers[0]+numbers[1] < math.floor(core_95):
586
- # extended_90 +=1
587
- # if numbers[0] < math.floor(core_15) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_15) and numbers[0]+numbers[1] < math.floor(core_95):
588
- # extended_15 +=1
589
- #
590
- # print("Out")
591
- # print(pep_core_99)
592
- # print(pep_core_95)
593
- # print(pep_core_90)
594
- # print(pep_core_15)
595
- #
596
- # print(extended_99)
597
- # print(extended_95)
598
- # print(extended_90)
599
- # print(extended_15)
600
- #