PyamilySeq 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,600 @@
1
+ import collections
2
+ import copy
3
+ import math
4
+ import sys
5
+ import numpy as np
6
+ from itertools import chain
7
+
8
+ def get_Genus(clustered):
9
+ clustered_genus = clustered.split('|')[0]
10
+ if '_' in clustered_genus[0]: # Remove name error
11
+ clustered_genus = clustered_genus.split('_')[1]
12
+ else:
13
+ clustered_genus = clustered_genus.split('_')[0]
14
+ return str(clustered_genus).capitalize()
15
+
16
+ def get_Species(clustered):
17
+ clustered_species = clustered.split('|')[0]
18
+ if '_' in clustered_species[0]: # Remove name error
19
+ clustered_species = clustered_species.split('_')[1]
20
+ else:
21
+ clustered_species = clustered_species.split('_')[:2]
22
+ return str('_'.join(clustered_species)).capitalize()
23
+
24
+
25
+ PEP_In = open('/home/nick/Documents/Single_Genome/All_Ensembl_PEP_CD_Clustered_90_60.clstr','r')
26
+ StORF_In = open('/home/nick/Documents/Single_Genome/All_Ensem_PEP_CD_Clustered_90_60_Unclustered_UR_StORFs_AA_CD.clstr','r') # Clusters for single Genera
27
+
28
+ clusters = collections.OrderedDict()
29
+
30
+
31
+ pangenome_clusters_PEP_Genera = collections.OrderedDict()
32
+ pangenome_clusters_PEP_Species = collections.OrderedDict()
33
+ pangenome_clusters_PEP_Strains = collections.OrderedDict()
34
+ pangenome_clusters_PEP_SEQS = collections.OrderedDict()
35
+
36
+
37
+ max_storf_only_genera = 0
38
+
39
+
40
+
41
+
42
+ count = 0
43
+ first = True
44
+ genome_dict = collections.defaultdict(int)
45
+ reps = collections.OrderedDict()
46
+ county = 0
47
+ #singleton_cluster = "Null"
48
+ clusters_With_Con_StORFs = []
49
+ ## Load in all data for easier reuse later
50
+ for line in PEP_In:
51
+ if line.startswith('>'):
52
+ if first == False:
53
+ Ensem_Con = set(Ensem_genomes).intersection(Con_genomes)
54
+ cluster_size = len(clusters[cluster_id])
55
+ reps.update({rep: [cluster_size,len(pangenome_clusters_PEP_Genera[cluster_id])]}) # Add strains, species here if wanted
56
+ #if len(clusters[cluster_id]) == 1 and "Null" not in singleton_cluster: # Stop at clusters smaller than 10
57
+ # singleton_cluster = cluster_id
58
+ #if len(clusters[cluster_id]) < 10: # Stop at clusters smaller than 10
59
+ # pangenome_clusters_PEP_Species.popitem()
60
+ # pangenome_clusters_PEP_Genera.popitem()
61
+ # pangenome_clusters_PEP_SEQS.popitem()
62
+ # reps.popitem()
63
+ # if len(clusters[cluster_id]) == 1:
64
+ # break # REMEMBER
65
+ Ensem_genomes, Con_genomes = [], []
66
+ cluster_id = line.strip('>')
67
+ cluster_id = cluster_id.strip('\n')
68
+ cluster_id = cluster_id.split(' ')[1]
69
+ clusters.update({cluster_id: []})
70
+ pangenome_clusters_PEP_Genera.update({cluster_id: []})
71
+ pangenome_clusters_PEP_Species.update({cluster_id:[]})
72
+ pangenome_clusters_PEP_Strains.update({cluster_id: []})
73
+ # pangenome_clusters_PEP_SEQS.update({cluster_id:[]})
74
+
75
+ first = False
76
+ else:
77
+ clustered = line.split('\t')[1]
78
+ clustered = clustered.split('>')[1]
79
+ clustered = clustered.split('...')[0]
80
+ genome = clustered.split('|')[0]
81
+ genome_dict[genome] +=1
82
+ if '*' in line:
83
+ rep = clustered
84
+ reps.update({rep:[0,0]})
85
+ if first == False:
86
+ clusters[cluster_id].append(clustered)
87
+ clustered_genus = get_Genus(clustered)
88
+ clustered_species = get_Species(clustered)
89
+ clustered_strain = clustered.split('|')[0]
90
+
91
+ if clustered_genus not in pangenome_clusters_PEP_Genera[cluster_id]:
92
+ pangenome_clusters_PEP_Genera[cluster_id].append(clustered_genus)
93
+ #if clustered_species not in pangenome_clusters_PEP_Species[cluster_id]:
94
+ # pangenome_clusters_PEP_Species[cluster_id].append(clustered_species)
95
+ if genome not in pangenome_clusters_PEP_Strains[cluster_id]:
96
+ pangenome_clusters_PEP_Strains[cluster_id].append(genome)
97
+ # pangenome_clusters_PEP_SEQS[cluster_id].append(clustered)
98
+ print("PEP DONE")
99
+ ######################################
100
+ Combined_pangenome_clusters_PEP_Genera = collections.OrderedDict()
101
+ Combined_pangenome_clusters_PEP_Species = collections.OrderedDict()
102
+ Combined_pangenome_clusters_PEP_Strains = collections.OrderedDict()
103
+ Combined_pangenome_clusters_PEP_SEQS = collections.OrderedDict()
104
+
105
+ Combined_pangenome_clusters_StORF_Genera = collections.OrderedDict()
106
+ Combined_pangenome_clusters_StORF_Species = collections.OrderedDict()
107
+ Combined_pangenome_clusters_StORF_Strains = collections.OrderedDict()
108
+ Combined_pangenome_clusters_StORF_SEQS = collections.OrderedDict()
109
+
110
+ Combined_pangenome_clusters_PEP_StORF_Clustered_Genera = collections.OrderedDict()
111
+ Combined_pangenome_clusters_PEP_StORF_Clustered = collections.OrderedDict()
112
+
113
+ not_StORF_Only_Cluster_IDs = []
114
+
115
+ Combined_clusters = collections.OrderedDict()
116
+ Combined_reps = collections.OrderedDict()
117
+ first = True
118
+ ###############
119
+ ## We load in the combined PEP and StORF_Reporter data separately
120
+ for line in StORF_In:
121
+ if line.startswith('>'):
122
+ if first == False:
123
+ cluster_size = len(Combined_clusters[cluster_id])
124
+ Combined_reps.update({rep: cluster_size})
125
+ # if len(Combined_pangenome_clusters_PEP_SEQS[cluster_id]) > 1:
126
+ # print("Here")
127
+ if len(Combined_pangenome_clusters_StORF_SEQS[cluster_id]) > 0 and len(Combined_pangenome_clusters_PEP_SEQS[cluster_id]) > 0:
128
+ if len(Combined_pangenome_clusters_PEP_SEQS[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
129
+ all_but_first = Combined_pangenome_clusters_PEP_SEQS[cluster_id][1:]
130
+ storfs_clustered = Combined_pangenome_clusters_StORF_SEQS[cluster_id]
131
+ VALUE = all_but_first+storfs_clustered
132
+ else:
133
+ VALUE = Combined_pangenome_clusters_StORF_SEQS[cluster_id]
134
+ KEY = Combined_pangenome_clusters_PEP_SEQS[cluster_id][0]
135
+ Combined_pangenome_clusters_PEP_StORF_Clustered.update({KEY:VALUE})
136
+
137
+
138
+ ## Below needs to be rewritten. With >1 genus - be able to record multiple PEPs for each combined...
139
+ # if len(Combined_pangenome_clusters_StORF_Genera[cluster_id]) > 0 and len(Combined_pangenome_clusters_PEP_Genera[cluster_id]) > 0:
140
+ # KEY = Combined_pangenome_clusters_PEP_SEQS[cluster_id][0]
141
+ # VALUE = Combined_pangenome_clusters_StORF_SEQS[cluster_id]
142
+ # Combined_pangenome_clusters_PEP_StORF_Clustered_Genera.update({KEY:VALUE})
143
+ if len(Combined_clusters[cluster_id]) == 1: # Stop at clusters smaller than 10
144
+ print("First Singleton Cluster is: " +str(cluster_id))
145
+ break
146
+ cluster_id = line.strip('>')
147
+ cluster_id = cluster_id.strip('\n')
148
+ cluster_id = cluster_id.split(' ')[1]
149
+ Combined_clusters.update({cluster_id: []})
150
+ # Combined_pangenome_clusters_PEP_Genera.update({cluster_id:[]})
151
+ # Combined_pangenome_clusters_PEP_Species.update({cluster_id: []})
152
+ Combined_pangenome_clusters_PEP_Strains.update({cluster_id: []})
153
+ Combined_pangenome_clusters_PEP_SEQS.update({cluster_id: []})
154
+ #
155
+ Combined_pangenome_clusters_StORF_Genera.update({cluster_id: []})
156
+ # Combined_pangenome_clusters_StORF_Species.update({cluster_id: []})
157
+ Combined_pangenome_clusters_StORF_Strains.update({cluster_id: []})
158
+ Combined_pangenome_clusters_StORF_SEQS.update({cluster_id: []})
159
+ first = False
160
+ else:
161
+ clustered = line.split('\t')[1]
162
+ clustered = clustered.split('>')[1]
163
+ clustered = clustered.split('...')[0]
164
+ if '*' in line:
165
+ rep = clustered
166
+ Combined_reps.update({rep:0})
167
+ if first == False:
168
+ Combined_clusters[cluster_id].append(clustered)
169
+ clustered_genus = get_Genus(clustered)
170
+ clustered_species = get_Species(clustered)
171
+ clustered_strain = clustered.split('|')[0]
172
+ if '_' in clustered_strain[0]: # Remove name error
173
+ clustered_strain = clustered_strain.split('_')[1]
174
+
175
+ if "StORF_Type" in line:
176
+ # if cluster_id not in clusters_With_Con_StORFs: # For counting?
177
+ # clusters_With_Con_StORFs.append(cluster_id)
178
+ if clustered_genus not in Combined_pangenome_clusters_StORF_Genera[cluster_id]:
179
+ Combined_pangenome_clusters_StORF_Genera[cluster_id].append(clustered_genus)
180
+ # if clustered_species not in Combined_pangenome_clusters_StORF_Species[cluster_id]:
181
+ # Combined_pangenome_clusters_StORF_Species[cluster_id].append(clustered_species)
182
+ if clustered_strain not in Combined_pangenome_clusters_StORF_Strains[cluster_id]:
183
+ Combined_pangenome_clusters_StORF_Strains[cluster_id].append(clustered_strain)
184
+ Combined_pangenome_clusters_StORF_SEQS[cluster_id].append(clustered)
185
+ #
186
+ else:
187
+ # if clustered_genus not in Combined_pangenome_clusters_PEP_Genera[cluster_id]:
188
+ # Combined_pangenome_clusters_PEP_Genera[cluster_id].append(clustered_genus)
189
+ # if clustered_species not in Combined_pangenome_clusters_PEP_Species[cluster_id]:
190
+ # Combined_pangenome_clusters_PEP_Species[cluster_id].append(clustered_species)
191
+ if clustered_strain not in Combined_pangenome_clusters_PEP_Strains[cluster_id]:
192
+ Combined_pangenome_clusters_PEP_Strains[cluster_id].append(clustered_strain)
193
+ if cluster_id not in not_StORF_Only_Cluster_IDs:
194
+ not_StORF_Only_Cluster_IDs.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
195
+ Combined_pangenome_clusters_PEP_SEQS[cluster_id].append(clustered)
196
+
197
+
198
+
199
+ ###HERE for tomorrow - copy the updated work from single to here and repeat for genus,species and strain
200
+ list_of_reps = list(reps.keys())
201
+ num_clustered_PEP_Genera = collections.defaultdict(list)
202
+ recorded_PEP = []
203
+ ################################# Genera
204
+ pangenome_clusters_Type_Genera = copy.deepcopy(pangenome_clusters_PEP_Genera)
205
+ pangenome_clusters_Type_Strains = collections.defaultdict(list)
206
+
207
+ for cluster, pep_genomes in pangenome_clusters_PEP_Genera.items():
208
+ recorded_PEP.append(cluster)
209
+ rep = list_of_reps[int(cluster)]
210
+ Com_PEPs = 0
211
+ Com_PEP_Genomes = 0
212
+ StORFs = 0
213
+ Added_StORF_Genera = 0
214
+ seen_clust_Strains = []
215
+
216
+ PEP_Strains = pangenome_clusters_PEP_Strains[cluster]
217
+ for clustered_strain in PEP_Strains:
218
+ if '_' in clustered_strain[0]: # Remove name error
219
+ clustered_strain = clustered_strain[1:]
220
+ if clustered_strain not in seen_clust_Strains:
221
+ seen_clust_Strains.append(clustered_strain)
222
+
223
+
224
+ try:
225
+ clustered_combined = Combined_pangenome_clusters_PEP_StORF_Clustered[rep]
226
+ seen_clust_Genera = []
227
+ num_clustered_PEP_Genera[cluster].append(rep + '_' + str(len(pep_genomes)))
228
+ for clust in clustered_combined:
229
+ if 'StORF_Type' not in clust:
230
+ ### Need to get the number of pep genomes for each pep clustered into this
231
+ Com_PEPs += 1
232
+ clustered_genus = get_Genus(clust)
233
+ #clust_Genome = clust.split('|')[0]
234
+ if clustered_genus not in seen_clust_Genera:
235
+ seen_clust_Genera.append(clustered_genus)
236
+ if clustered_genus not in pep_genomes:
237
+ Com_PEP_Genomes += 1
238
+ try:
239
+ num_clustered_PEP_Genera[cluster].append(clust + '_' + str(reps[clust][1]))
240
+ except TypeError:
241
+ sys.exit("Broken")
242
+
243
+ elif 'StORF_Type' in clust:
244
+ StORFs += 1
245
+ clustered_genus = get_Genus(clust)
246
+ #clust_Genome = clust.split('|')[0]
247
+ if clustered_genus not in seen_clust_Genera:
248
+ seen_clust_Genera.append(clustered_genus)
249
+ if clustered_genus not in pep_genomes:
250
+ Added_StORF_Genera += 1
251
+ else:
252
+ print("WHAT")
253
+
254
+ size_of_pep_clusters = []
255
+ peps = num_clustered_PEP_Genera[cluster]
256
+ for pep in peps:
257
+ pep = pep.rsplit('_', 1)
258
+ size_of_pep_clusters.append(int(pep[1]))
259
+ pangenome_clusters_Type_Genera[cluster] = [len(num_clustered_PEP_Genera[cluster]), sum(size_of_pep_clusters),
260
+ size_of_pep_clusters, Added_StORF_Genera, StORFs]
261
+ pangenome_clusters_Type_Strains[cluster] = seen_clust_Strains
262
+ except KeyError:
263
+ ###Singleton
264
+ num_pep_genomes = [len(pep_genomes)]
265
+ pangenome_clusters_Type_Genera[cluster] = [1, len(pep_genomes), num_pep_genomes, Added_StORF_Genera, StORFs]
266
+ pangenome_clusters_Type_Strains[cluster] = seen_clust_Strains
267
+
268
+ print("S")
269
+
270
+
271
+ #######################################
272
+
273
+ Without_StORF = open('./Ensem_Clusters_Without_StORFs_To_Be_Nogged_min2','w')
274
+ With_StORF = open('./Ensem_Clusters_With_StORFs_To_Be_Nogged','w')
275
+ #With_Extending_StORF = open('./Ensem_Clusters_With_Extending_Con-StORFs_To_Be_Nogged','w')
276
+
277
+ for key, value in pangenome_clusters_Type_Genera.items():
278
+ pep_strains = pangenome_clusters_Type_Strains[key]
279
+ if value[4] == 0 and len(pep_strains) >=2:
280
+ Without_StORF.write(str(key)+',')
281
+ # elif value[3] != 0:
282
+ # With_Extending_StORF.write(str(key)+',')
283
+ # With_StORF.write(str(key) + ',')
284
+ elif value[4] >=1:
285
+ With_StORF.write(str(key) + ',')
286
+
287
+ With_StORF.close()
288
+ Without_StORF.close()
289
+ #With_Extending_StORF.close()
290
+
291
+ ############## Typing for the StORF_Reporter-Data
292
+
293
+
294
+ multi_PEP_Combined_By_StORFs = collections.OrderedDict()
295
+
296
+ StORF_Seqs_Extended = []
297
+ StORF_Genomes_Extended = []
298
+
299
+ ####################################
300
+ #cores = collections.OrderedDict({'pep_genera_single':[],'pep_genera_multi':[],'extended_genera':[],'comb_extended_genera_single':[],'comb_extended_genera_multi':[],'extended_genera_single':[],'extended_genera_multi':0,'storf_genera_single':0,'storf_genera_multi':0,
301
+ # 'only_storf_genera_single':0,'only_storf_genera_multi':0})
302
+
303
+ cores = collections.OrderedDict({'pep_genera':[],'extended_genera_single_pep':[],'many_extended_genera_pep':[],'extended_genera':[],'comb_extended_genera':[],'storf_genera':[],'only_storf_genera':[],'only_storf_genera_recording':[]})
304
+
305
+ extended = collections.OrderedDict()
306
+ ############################
307
+
308
+ clsuters_to_be_validated = collections.defaultdict(list)
309
+
310
+
311
+ ############################
312
+ def calc_pep_only(pep_num):
313
+ cores['pep_genera'].append(pep_num)
314
+ # if pep_num == 1:# and StORF_num == 0:
315
+ # cores['pep_genera_single'] += 1
316
+ # elif pep_num > 1:# and StORF_num == 0:
317
+ # cores['pep_genera_multi'] += 1
318
+ ##########################
319
+ def calc_pep_extended_StORF(cluster,pep_num,storf_num):
320
+ if pep_num != 0 and storf_num >= 1:
321
+ cores['extended_genera'].append(pep_num+storf_num)
322
+ clsuters_to_be_validated['extended_genera'].append(cluster)
323
+ if pep_num != 0 and storf_num >= 10:
324
+ cores['many_extended_genera_pep'].append([cluster,pep_num+storf_num])
325
+
326
+ if pep_num == 1 and storf_num >= 1:
327
+ cores['extended_genera_single_pep'].append([cluster,pep_num + storf_num])
328
+ # cores['extended_genera_single'] +=1
329
+ # if pep_num != 0 and storf_num > 1:
330
+ # cores['extended_genera_multi'] +=1
331
+ ##########################
332
+ def calc_multi_pep_extended_StORF(cluster,number_of_pep_clustered,pep_num,storf_num):
333
+ if pep_num !=0 and storf_num >= 1:
334
+ cores['comb_extended_genera'].append(pep_num+storf_num)
335
+ clsuters_to_be_validated['comb_extended_genera'].append(cluster)
336
+
337
+ #########################
338
+ def calc_StORF_only_when_with_pep(cluster,storf_num):
339
+ cores['storf_genera'].append(storf_num)
340
+ clsuters_to_be_validated['storf_genera'].append(cluster)
341
+ # if storf_num == 1:# and StORF_num == 0:
342
+ # cores['storf_genera_single'] += 1
343
+ # elif storf_num > 1:# and StORF_num == 0:
344
+ # cores['storf_genera_multi'] += 1
345
+ ######################## What is the difference with these?
346
+ def calc_only_StORF(cluster,storf_num,max_storf_only_genera): # only count the true storf onlies
347
+ cores['only_storf_genera'].append(storf_num)
348
+ clsuters_to_be_validated['only_storf_genera'].append(cluster)
349
+ if storf_num>=6:
350
+ cores['only_storf_genera_recording'].append([cluster, storf_num])
351
+ if storf_num > max_storf_only_genera:
352
+ max_storf_only_genera = storf_num
353
+ # if storf_num == 1:# and StORF_num == 0:
354
+ # cores['only_storf_genera_single'] += 1
355
+ # elif storf_num > 1:# and StORF_num == 0:
356
+ # cores['only_storf_genera_multi'] += 1
357
+ return max_storf_only_genera
358
+ #########################
359
+
360
+
361
+
362
+ ###########################
363
+ print("Running")
364
+ check_all_calced = 0
365
+ for cluster, numbers in pangenome_clusters_Type_Genera.items():
366
+ pep_strains = pangenome_clusters_Type_Strains[cluster]
367
+ if numbers[3] >=1:
368
+ StORF_Genomes_Extended.append(numbers[3])
369
+ if numbers[4] >=1:
370
+ StORF_Seqs_Extended.append(numbers[4])
371
+ ############################### Calc PEP only
372
+ if numbers[0] == 1 and len(pep_strains) >= 2: # If StORFs did not combine PEP reps
373
+ calc_pep_only(numbers[1])#,numbers[3])
374
+ check_all_calced +=1
375
+ elif numbers[0] >1: # IF StORFs combined multiple PEP
376
+ calc_pep_only(numbers[2][0])
377
+ check_all_calced += 1
378
+ # for num in numbers[2]:
379
+ # calc_pep_only(num) # ,numbers[3])
380
+
381
+ ############################# Calc PEP and StORF_Reporter
382
+ if numbers[0] == 1 and numbers[3] >1: # If StORFs did not combine PEP reps
383
+ calc_pep_extended_StORF(cluster,numbers[1],numbers[3])
384
+ extended.update({cluster:numbers})
385
+ check_all_calced += 1
386
+ elif numbers[0] >1 and numbers[3] >1: # IF StORFs combined multiple PEP - Genera added
387
+ #grouped_pep = sum(numbers[2])
388
+ #for num in numbers[2]:
389
+ calc_multi_pep_extended_StORF(cluster,numbers[2],numbers[1],numbers[3]) # same here
390
+ print("combined: " + str(cluster))
391
+
392
+ extended.update({cluster: numbers})
393
+ check_all_calced += 1
394
+ elif numbers[0] >1 and numbers[4] >1: # IF StORFs combined multiple PEP
395
+ multi_PEP_Combined_By_StORFs.update({cluster: numbers})
396
+
397
+
398
+ import os
399
+ ###########################
400
+ ############################### Calc StORF_Reporter only
401
+ Combined_pangenome_clusters_ONLY_StORF_Type = collections.defaultdict(list)
402
+ Combined_pangenome_clusters_StORF_Type = collections.defaultdict(list)
403
+
404
+ biggest_genera = ""
405
+ big_genera = 0
406
+ biggest_strains = ""
407
+ big_strains = 0
408
+
409
+
410
+ #Without_StORF = open('./Ensem_Clusters_Without_Con-StORFs_To_Be_Nogged_min2','w')
411
+ #With_StORF = open('./Ensem_Clusters_With_Con-StORFs_To_Be_Nogged','w')
412
+ #With_Extending_StORF = open('./Ensem_Clusters_With_Extending_Con-StORFs_To_Be_Nogged','w')
413
+ StORF_Only = open("./StORF_Only_Clusters_To_Be_Nogged_min2",'w')
414
+
415
+ for cluster, genera in Combined_pangenome_clusters_StORF_Genera.items():
416
+ storf_strains = Combined_pangenome_clusters_StORF_Strains[cluster]
417
+ pep_strains = Combined_pangenome_clusters_PEP_Strains[cluster]
418
+ if cluster in not_StORF_Only_Cluster_IDs:
419
+ Combined_pangenome_clusters_StORF_Type[cluster] = [cluster,len(genera)]
420
+ #if len(genera) >= 1:
421
+ calc_StORF_only_when_with_pep(cluster,len(genera)) # ,numbers[3])
422
+ else:
423
+ if len(storf_strains) >= 2:
424
+ StORF_Only.write(str(cluster) + ',')
425
+ Combined_pangenome_clusters_ONLY_StORF_Type[cluster] = [cluster,len(genera)]
426
+ max_storf_only_genera = calc_only_StORF(cluster,len(genera),max_storf_only_genera)
427
+ if len(genera) > big_genera:
428
+ big_genera = len(genera)
429
+ biggest_genera = cluster
430
+ if len(storf_strains) >= big_strains:
431
+ big_strains = len(storf_strains)
432
+ biggest_strains = cluster
433
+
434
+
435
+
436
+
437
+
438
+
439
+
440
+
441
+ print("Biggest: " +biggest_genera)
442
+
443
+
444
+
445
+ ############################### Calc StORF_Reporter only
446
+ # for cluster, data in Combined_pangenome_clusters_StORF_Type.items():
447
+ # if data[1] >=1:
448
+ # calc_StORF_only_when_with_pep(data[1]) # ,numbers[3])
449
+ #
450
+ #
451
+
452
+
453
+ #################
454
+ print(cores)
455
+ #print(extended)
456
+
457
+ from collections import Counter
458
+
459
+ #print(Counter(cores['pep_genera']))
460
+ #print(Counter(cores['extended_genera']))
461
+ #print(Counter(cores['comb_extended_genera']))
462
+ #print(Counter(cores['storf_genera']))
463
+ print(Counter(cores['only_storf_genera']))
464
+ print(cores['only_storf_genera_recording'])
465
+
466
+ print("END")
467
+
468
+ ### Empty file ready for interesting StORFs
469
+ # interesting_out = "./StORF_Only_Clusters_To_Be_Swissed.fa"
470
+ # with open(interesting_out, 'r+') as f:
471
+ # f.truncate(4)
472
+ # for cluster, data in Combined_pangenome_clusters_ONLY_StORF_Type.items():
473
+ # #if number >1:
474
+ # if data[1] >=1:
475
+ # calc_only_StORF(data[1]) # ,numbers[3])
476
+ # # if data[1] >= 2:
477
+ # # print("Interesting:" + str(cluster))
478
+ # # os.system(
479
+ # # "python3 Extract_FASTA_From_Cluster.py -f ./All_Ensem_Filtered_PEP_Clustered_With_Unclustered_UR-StORFS_s.fa_CD_c90_s60.fa "
480
+ # # "-c ./All_Ensem_Filtered_PEP_Clustered_With_Unclustered_UR-StORFS_s.fa_CD_c90_s60.clstr -id " + str(
481
+ # # data[0]) + " -o "+ interesting_out)
482
+ # #
483
+ #
484
+
485
+
486
+
487
+
488
+ #
489
+ # ################################## Species
490
+ # pangenome_clusters_Type_Species = copy.deepcopy(pangenome_clusters_PEP_Species)
491
+ # for cluster, genomes in pangenome_clusters_PEP_Species.items():
492
+ # print(str(len(genomes)) + '\t' + str(len(pangenome_clusters_StORF_Species[cluster])))
493
+ # Con_StORFs = pangenome_clusters_StORF_Species[cluster]
494
+ # unique_con = 0
495
+ # all_con = 0
496
+ # for con in Con_StORFs:
497
+ # all_con +=1
498
+ # if con not in genomes:
499
+ # unique_con +=1
500
+ # pangenome_clusters_Type_Species[cluster] = [len(genomes),all_con,unique_con]
501
+ # ################################# Strains
502
+ # pangenome_clusters_Type_Strains = copy.deepcopy(pangenome_clusters_PEP_Strains)
503
+ # for cluster, genomes in pangenome_clusters_PEP_Strains.items():
504
+ # print(str(len(genomes))+'\t'+str(len(pangenome_clusters_StORF_Strains[cluster])))
505
+ # Con_StORFs = pangenome_clusters_StORF_Strains[cluster]
506
+ # unique_con = 0
507
+ # all_con = 0
508
+ # for con in Con_StORFs:
509
+ # all_con +=1
510
+ # if con not in genomes:
511
+ # unique_con +=1
512
+ # pangenome_clusters_Type_Strains[cluster] = [len(genomes),all_con,unique_con]
513
+ # ###################################
514
+ # Chris_Out = open('./Chris_Clusters.txt','w')
515
+ #
516
+ # clusters_For_Chris = collections.OrderedDict()
517
+ # clusters_For_Chris_PEP_0 = collections.OrderedDict()
518
+ #
519
+ # Chris_Out.write("Cluster\tSize\tEnsem_Genera_Num\tCon-StORF_Genera_Num\tCon-StORF_Only_Genera_Num\tEnsem_Species_Num\tCon-StORF_Species_Num\tCon-StORF_Only_Species_Num\tEnsem_Strain_Num\tCon-StORF_Strain_Num\tCon-StORF_Only_Strain_Num\n")
520
+ # #This for-loop will go through ALL Clusters allowing for the extraction of ALL different groupings
521
+ # for cluster, data in clusters.items():
522
+ # genera_numbers = pangenome_clusters_Type_Genera[cluster]
523
+ # species_numbers = pangenome_clusters_Type_Species[cluster]
524
+ # strain_numbers = pangenome_clusters_Type_Strains[cluster]
525
+ #
526
+ # Chris_Out.write(str(cluster)+'\t'+str(len(data))+'\t'+str(genera_numbers[0])+'\t'+str(genera_numbers[1])+'\t'+str(genera_numbers[2])+'\t'+str(species_numbers[0])+'\t'
527
+ # +str(species_numbers[1])+'\t'+str(species_numbers[2])+'\t'+str(strain_numbers[0])+'\t'+str(strain_numbers[1])+'\t'+str(species_numbers[2])+'\n')
528
+
529
+ # if cluster in clusters_With_Con_StORFs:
530
+ # print("Current")
531
+ # size_Of_Cluster = len(clusters[cluster])
532
+ # ensem_Num = 0
533
+ # con_StORF_Num = 0
534
+ # for i in clusters[cluster]:
535
+ # print(i)
536
+ # if 'Con-Stop' in i:
537
+ # con_StORF_Num +=1
538
+ # else:
539
+ # ensem_Num +=1
540
+ # clusters_For_Chris.update({cluster:[size_Of_Cluster,pep_Num,con_StORF_Num,numbers[0],numbers[1]]})
541
+ # ############# Add - Num of
542
+ # Chris_Out.write(str(cluster)+'\t'+str(size_Of_Cluster)+'\t'+str(pep_Num)+'\t'+str(ensem_Genera)+ str(con_StORF_Num)+'\t'+str(numbers[0])+'\t'+str(numbers[1])+'\n')
543
+ # if pep_Num == 0:
544
+ # clusters_For_Chris_PEP_0.update({cluster:[size_Of_Cluster,pep_Num,con_StORF_Num,numbers[0],numbers[1]]})
545
+
546
+ print("Da Da!!!!")
547
+
548
+
549
+ #
550
+ #
551
+ #
552
+ #
553
+ # ###################################
554
+ #
555
+ # core_99 = 9.9/10 * len(genome_dict)
556
+ # core_95 = 9.5/10 * len(genome_dict)
557
+ # core_90 = 9/10 * len(genome_dict)
558
+ # core_15 = 1.5/10 * len(genome_dict)
559
+ #
560
+ # pep_core_99 = 0
561
+ # pep_core_95 = 0
562
+ # pep_core_90 = 0
563
+ # pep_core_15 = 0
564
+ #
565
+ #
566
+ # extended_99 = 0
567
+ # extended_95 = 0
568
+ # extended_90 = 0
569
+ # extended_15 = 0
570
+ # ############### Needs to be redone with new 'numbers'
571
+ # for cluster, numbers in pangenome_clusters_Type_Genera.items():
572
+ # if numbers[0] >= math.floor(core_99) and numbers[1] == 0:
573
+ # pep_core_99 +=1
574
+ # elif numbers[0] >= math.floor(core_95) and numbers[0] < math.floor(core_99) and numbers[1] == 0:
575
+ # pep_core_95 +=1
576
+ # elif numbers[0] >= math.floor(core_90) and numbers[0] < math.floor(core_95) and numbers[1] == 0:
577
+ # pep_core_90 +=1
578
+ # if numbers[0] >= math.floor(core_15) and numbers[0] < math.floor(core_95) and numbers[1] == 0: # this catch captures some from pep_core_90
579
+ # pep_core_15 +=1
580
+ # ############ With Con-StORFs
581
+ # if numbers[0] < math.floor(core_99) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_99):
582
+ # extended_99 +=1
583
+ # elif numbers[0] < math.floor(core_95) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_95) and numbers[0]+numbers[1] < math.floor(core_99):
584
+ # extended_95 +=1
585
+ # elif numbers[0] < math.floor(core_90) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_90) and numbers[0]+numbers[1] < math.floor(core_95):
586
+ # extended_90 +=1
587
+ # if numbers[0] < math.floor(core_15) and numbers[0] != 0 and numbers[0]+numbers[1] >= math.floor(core_15) and numbers[0]+numbers[1] < math.floor(core_95):
588
+ # extended_15 +=1
589
+ #
590
+ # print("Out")
591
+ # print(pep_core_99)
592
+ # print(pep_core_95)
593
+ # print(pep_core_90)
594
+ # print(pep_core_15)
595
+ #
596
+ # print(extended_99)
597
+ # print(extended_95)
598
+ # print(extended_90)
599
+ # print(extended_15)
600
+ #
@@ -0,0 +1 @@
1
+ PyamilySeq_Version = 'v0.0.1'
@@ -0,0 +1,586 @@
1
+ #from line_profiler_pycharm import profile
2
+
3
+ from collections import OrderedDict,defaultdict
4
+ import copy
5
+ import math
6
+ import sys
7
+ import argparse
8
+ import os
9
+
10
+ try:
11
+ from .Constants import *
12
+ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
13
+ from Constants import *
14
+
15
+
16
+ def custom_sort_key(k, dict1, dict2):
17
+ return (len(dict1[k]), len(dict2[k]))
18
+
19
+ def sort_keys_by_values(dict1, dict2):
20
+ sorted_keys = sorted(dict1.keys(), key=lambda k: custom_sort_key(k, dict1, dict2), reverse=True)
21
+ return sorted_keys
22
+
23
+ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
24
+ print("Outputting gene_presence_absence file")
25
+ in_name = options.clusters.split('.')[0]
26
+ gpa_outfile = open(in_name+'_gene_presence_absence.csv','w')
27
+ gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","'
28
+ '"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
29
+ gpa_outfile.write('","'.join(genome_dict.keys()))
30
+ gpa_outfile.write('"\n')
31
+ for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
32
+ average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
33
+ gpa_outfile.write('"group_'+str(cluster)+'","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
34
+ '","","","","","","","","",""')
35
+
36
+ full_out = ''
37
+ for genome in genome_dict.keys():
38
+ tmp_list = []
39
+ for value in sequences:
40
+ if value.split('|')[0] == genome:
41
+ tmp_list.append(value)
42
+ if tmp_list:
43
+ full_out += ',"'+''.join(tmp_list)+'"'
44
+ gpa_outfile.write(full_out)
45
+ gpa_outfile.write('\n')
46
+
47
+ ### Below is some unfinished code
48
+ # edge_list_outfile = open(in_name+'_edge_list.csv','w')
49
+ # for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
50
+ # output = []
51
+ # for entry in sequences:
52
+ # # Split each entry at '|'
53
+ # genome, gene = entry.split('|')
54
+ # # Format the result as "gene genome"
55
+ # output.append(f"{gene}\t{genome}")
56
+ # for line in output:
57
+ # edge_list_outfile.write(line + '\n')
58
+
59
+
60
+
61
+
62
+ def reorder_dict_by_keys(original_dict, sorted_keys):
63
+ return {k: original_dict[k] for k in sorted_keys}
64
+
65
+ def get_cores(options,genome_dict):
66
+ ##Calculate core groups
67
+ groups = OrderedDict()
68
+ cores = OrderedDict()
69
+ prev_top = len(genome_dict)
70
+ first = True
71
+ for group in options.core_groups.split(','):
72
+ calculated_floor = math.floor(int(group) / 100 * len(genome_dict))
73
+ if first == False:
74
+ groups[group] = (calculated_floor,prev_top -1)
75
+ else:
76
+ groups[group] = (calculated_floor, prev_top)
77
+ first = False
78
+ prev_top = calculated_floor
79
+ first_core_group = 'first_core_' + group
80
+ cores[first_core_group] = 0
81
+ if options.reclustered != None:
82
+ extended_core_group = 'extended_core_' + group
83
+ cores[extended_core_group] = 0
84
+ combined_core_group = 'combined_core_' + group
85
+ cores[combined_core_group] = 0
86
+ second_core_group = 'second_core_' + group
87
+ cores[second_core_group] = 0
88
+ only_second_core_group = 'only_second_core_' + group
89
+ cores[only_second_core_group] = 0
90
+ return cores, groups
91
+
92
+ #@profile
93
+ def calc_First_only_core(pep_num, groups, cores):
94
+ groups_as_list = list(groups.values())
95
+ for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num <= fir):
96
+ res = idx
97
+ family_group = list(groups)[res]
98
+ cores['first_core_'+family_group] +=1
99
+
100
+ #@profile
101
+ def calc_single_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count gene families extended with StORFs
102
+ groups_as_list = list(groups.values())
103
+ for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num+second_num <= fir):
104
+ res = idx
105
+ family_group = list(groups)[res]
106
+ cores['extended_core_' + family_group] += 1
107
+
108
+
109
+ #@profile
110
+ def calc_multi_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
111
+ groups_as_list = list(groups.values())
112
+ for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num+second_num <= fir):
113
+ res = idx
114
+ family_group = list(groups)[res]
115
+ cores['combined_core_' + family_group] += 1
116
+
117
+
118
+ #@profile
119
+ def calc_Second_only_core(groups, cores, second_num):
120
+ groups_as_list = list(groups.values())
121
+ for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= second_num <= fir):
122
+ res = idx
123
+ family_group = list(groups)[res]
124
+ cores['second_core_' + family_group] += 1
125
+
126
+ #@profile
127
+ def calc_only_Second_only_core(groups, cores, second_num): # only count the true storf onlies
128
+ groups_as_list = list(groups.values())
129
+ for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= second_num <= fir):
130
+ res = idx
131
+ family_group = list(groups)[res]
132
+ cores['only_second_core_' + family_group] += 1
133
+
134
+
135
+
136
+
137
+
138
+ #@profile
139
+ def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered):
140
+ num_clustered_First = defaultdict(list)
141
+ pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
142
+ list_of_reps = list(reps.keys())
143
+ for cluster, pep_genomes in pangenome_clusters_First.items():
144
+ rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
145
+ Com_PEP_Genomes = 0
146
+ Seconds = 0
147
+ seen_Seconds = []
148
+ added_Second_genomes = 0
149
+ try: # get the cluster from the storf clusters which contains this rep
150
+ clustered_combined = combined_pangenome_clusters_First_Second_clustered[rep] # Not true clusters - I put a PEP as key myself
151
+ seen_clust_Genomes = []
152
+ num_clustered_First[cluster].append(rep + '_' + str(len(pep_genomes)))
153
+ for clust in clustered_combined:
154
+ if options.sequence_tag not in clust: # Not good enough at the moment
155
+ clust_Genome = clust.split('|')[0]
156
+ if clust_Genome not in seen_clust_Genomes:
157
+ seen_clust_Genomes.append(clust_Genome)
158
+ if clust_Genome not in pep_genomes:
159
+ Com_PEP_Genomes += 1
160
+ num_clustered_First[cluster].append(clust + '_' + str(reps[clust][1]))
161
+ elif options.sequence_tag in clust:
162
+ Seconds += 1
163
+ clust_Genome = clust.split('|')[0]
164
+ if clust_Genome not in seen_Seconds:
165
+ seen_Seconds.append(clust_Genome)
166
+ if clust_Genome not in seen_clust_Genomes:
167
+ seen_clust_Genomes.append(clust_Genome)
168
+ if clust_Genome not in pep_genomes:
169
+ added_Second_genomes += 1
170
+ else:
171
+ sys.exit("Error: looking for sequence_tag")
172
+
173
+ size_of_pep_clusters = []
174
+ peps = num_clustered_First[cluster]
175
+ for pep in peps:
176
+ pep = pep.rsplit('_', 1)
177
+ size_of_pep_clusters.append(int(pep[1]))
178
+ pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_pep_clusters),
179
+ size_of_pep_clusters, added_Second_genomes, Seconds, len(seen_Seconds)]
180
+
181
+ except KeyError:
182
+ ###Singleton
183
+ num_pep_genomes = [len(pep_genomes)]
184
+ pangenome_clusters_Type[cluster] = [1, len(pep_genomes), num_pep_genomes, added_Second_genomes, Seconds,
185
+ len(seen_Seconds)]
186
+
187
+ return pangenome_clusters_Type
188
+
189
+ #@profile
190
+ def single_clustering_counting(options, pangenome_clusters_First, reps):
191
+ num_clustered_PEP = defaultdict(list)
192
+ recorded_PEP = []
193
+ pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
194
+ list_of_reps = list(reps.keys())
195
+ for cluster, pep_genomes in pangenome_clusters_First.items():
196
+ rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
197
+
198
+ try: # get the cluster from the storf clusters which contains this rep
199
+ num_clustered_PEP[cluster].append(rep + '_' + str(len(pep_genomes)))
200
+ size_of_pep_clusters = []
201
+ peps = num_clustered_PEP[cluster]
202
+ for pep in peps:
203
+ pep = pep.rsplit('_', 1)
204
+ size_of_pep_clusters.append(int(pep[1]))
205
+ recorded_PEP.append(pep[0])
206
+ pangenome_clusters_Type[cluster] = [len(num_clustered_PEP[cluster]), sum(size_of_pep_clusters),
207
+ size_of_pep_clusters, 0, 0, 0]
208
+
209
+ except KeyError:
210
+ ###Singleton
211
+ num_pep_genomes = [len(pep_genomes)]
212
+ pangenome_clusters_Type[cluster] = [1, len(pep_genomes), num_pep_genomes, 0, 0, 0]
213
+
214
+ return pangenome_clusters_Type
215
+
216
+
217
+
218
+ #@profile
219
+ def combined_clustering_CDHIT(options, genome_dict):
220
+ unique_genomes = []
221
+ Second_in = open(options.reclustered, 'r')
222
+ combined_pangenome_clusters_First = OrderedDict()
223
+ combined_pangenome_clusters_First_sequences = OrderedDict()
224
+ combined_pangenome_clusters_Second = OrderedDict()
225
+ combined_pangenome_clusters_Second_sequences = OrderedDict()
226
+ combined_pangenome_clusters_First_Second_clustered = OrderedDict()
227
+
228
+ not_Second_only_cluster_ids = []
229
+ already_seen_PEP = []
230
+ Combined_clusters = OrderedDict()
231
+ Combined_reps = OrderedDict()
232
+ first = True
233
+ for line in Second_in:
234
+ if line.startswith('>'):
235
+ if first == False:
236
+ cluster_size = len(Combined_clusters[cluster_id])
237
+ Combined_reps.update({rep: cluster_size})
238
+ for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
239
+ if pep != []:
240
+ if pep in already_seen_PEP:
241
+ continue
242
+ else:
243
+ already_seen_PEP.append(pep)
244
+ if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
245
+ if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
246
+ all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
247
+ storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
248
+ VALUE = all_but_first + storfs_clustered
249
+ else:
250
+ VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
251
+ KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
252
+ combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
253
+ cluster_id = line.strip('>')
254
+ cluster_id = cluster_id.strip('\n')
255
+ cluster_id = cluster_id.split(' ')[1]
256
+ Combined_clusters.update({cluster_id: []})
257
+ combined_pangenome_clusters_First.update({cluster_id: []})
258
+ combined_pangenome_clusters_First_sequences.update({cluster_id: []})
259
+ combined_pangenome_clusters_Second.update({cluster_id: []})
260
+ combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
261
+
262
+ first = False
263
+ else:
264
+ clustered = line.split('\t')[1]
265
+ clustered = clustered.split('>')[1]
266
+ clustered = clustered.split('...')[0]
267
+ genome = clustered.split('|')[0]
268
+ genome_dict[genome] += 1
269
+ if '*' in line:
270
+ rep = clustered
271
+ Combined_reps.update({rep: 0})
272
+ if first == False:
273
+ Combined_clusters[cluster_id].append(clustered)
274
+ clustered_genome = clustered.split('|')[0]
275
+ if options.sequence_tag in line:
276
+ if clustered_genome not in combined_pangenome_clusters_Second[cluster_id]:
277
+ combined_pangenome_clusters_Second[cluster_id].append(clustered_genome)
278
+ combined_pangenome_clusters_Second_sequences[cluster_id].append(clustered)
279
+ else:
280
+ if cluster_id not in not_Second_only_cluster_ids:
281
+ not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
282
+ if clustered_genome not in combined_pangenome_clusters_First[cluster_id]:
283
+ combined_pangenome_clusters_First[cluster_id].append(clustered_genome)
284
+ combined_pangenome_clusters_First_sequences[cluster_id].append(clustered)
285
+
286
+
287
+ return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
288
+
289
+ def combined_clustering_Edge_List(options, genome_dict):
290
+ if options.format == 'TSV':
291
+ separator = '\t'
292
+ elif options.format == 'CSV':
293
+ separator = ','
294
+ unique_genomes = []
295
+ cluster_id = 0
296
+ last_rep = ''
297
+ Second_in = open(options.reclustered, 'r')
298
+ combined_pangenome_clusters_First = OrderedDict()
299
+ combined_pangenome_clusters_First_sequences = OrderedDict()
300
+ combined_pangenome_clusters_Second = OrderedDict()
301
+ combined_pangenome_clusters_Second_sequences = OrderedDict()
302
+ combined_pangenome_clusters_First_Second_clustered = OrderedDict()
303
+
304
+ not_Second_only_cluster_ids = []
305
+ already_seen_PEP = []
306
+ Combined_clusters = OrderedDict()
307
+ Combined_reps = OrderedDict()
308
+ first = True
309
+ for line in Second_in:
310
+ rep, child = line.strip().split(separator)
311
+ child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
312
+
313
+ if first == True:
314
+ Combined_clusters.update({cluster_id: []})
315
+ combined_pangenome_clusters_First.update({cluster_id: []})
316
+ combined_pangenome_clusters_First_sequences.update({cluster_id: []})
317
+ combined_pangenome_clusters_Second.update({cluster_id: []})
318
+ combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
319
+ Combined_reps.update({rep: 0})
320
+ first = False
321
+
322
+ if first == False:
323
+ if rep != last_rep and last_rep != '':
324
+ cluster_size = len(Combined_clusters[cluster_id])
325
+ Combined_reps.update({rep: cluster_size})
326
+ for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
327
+ if pep != []:
328
+ if pep in already_seen_PEP:
329
+ continue
330
+ else:
331
+ already_seen_PEP.append(pep)
332
+ if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
333
+ if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
334
+ all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
335
+ storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
336
+ VALUE = all_but_first + storfs_clustered
337
+ else:
338
+ VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
339
+ KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
340
+ combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
341
+
342
+ cluster_id += 1
343
+ Combined_clusters.update({cluster_id: []})
344
+ combined_pangenome_clusters_First.update({cluster_id: []})
345
+ combined_pangenome_clusters_First_sequences.update({cluster_id: []})
346
+ combined_pangenome_clusters_Second.update({cluster_id: []})
347
+ combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
348
+ Combined_reps.update({rep: 0})
349
+
350
+
351
+ Combined_clusters[cluster_id].append(child)
352
+ if options.sequence_tag in line:
353
+ if child_genome not in combined_pangenome_clusters_Second[cluster_id]:
354
+ combined_pangenome_clusters_Second[cluster_id].append(child_genome)
355
+ combined_pangenome_clusters_Second_sequences[cluster_id].append(child)
356
+ else:
357
+ if cluster_id not in not_Second_only_cluster_ids:
358
+ not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
359
+ if child_genome not in combined_pangenome_clusters_First[cluster_id]:
360
+ combined_pangenome_clusters_First[cluster_id].append(child_genome)
361
+ combined_pangenome_clusters_First_sequences[cluster_id].append(child)
362
+
363
+ last_rep = rep
364
+
365
+ return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
366
+
367
+
368
+ def cluster_EdgeList(options):
369
+ if options.format == 'TSV':
370
+ separator = '\t'
371
+ elif options.format == 'CSV':
372
+ separator = ','
373
+ cluster_id = 0
374
+ last_rep = ''
375
+ first = True
376
+ First_in = open(options.clusters, 'r')
377
+ pangenome_clusters_First = OrderedDict()
378
+ pangenome_clusters_First_sequences = OrderedDict()
379
+ genome_dict = defaultdict(int)
380
+ reps = OrderedDict()
381
+ for line in First_in:
382
+ rep, child = line.strip().split(separator)
383
+ child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
384
+ # Counting occurrences of genomes
385
+ genome_dict[child_genome] += 1
386
+ if first == True:
387
+ pangenome_clusters_First[0] = []
388
+ pangenome_clusters_First_sequences[0] = []
389
+ first = False
390
+
391
+ if rep != last_rep and last_rep != '':
392
+ cluster_id +=1
393
+ pangenome_clusters_First[cluster_id] = []
394
+ pangenome_clusters_First_sequences[cluster_id] = []
395
+ cluster_size = len(pangenome_clusters_First_sequences[cluster_id-1])
396
+ reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[cluster_id-1])]})
397
+ pangenome_clusters_First[cluster_id] = []
398
+ pangenome_clusters_First_sequences[cluster_id] = []
399
+ if child_genome not in pangenome_clusters_First[cluster_id]:
400
+ pangenome_clusters_First[cluster_id].append(child_genome)
401
+
402
+ pangenome_clusters_First_sequences[cluster_id].append(child)
403
+ last_rep = rep
404
+ cluster_size = len(pangenome_clusters_First_sequences[cluster_id])
405
+ reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
406
+
407
+
408
+ return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
409
+
410
+
411
+
412
+ def cluster_CDHIT(options):
413
+ First_in = open(options.clusters, 'r')
414
+ clusters = OrderedDict()
415
+ pangenome_clusters_First = OrderedDict()
416
+ pangenome_clusters_First_sequences = OrderedDict()
417
+ first = True
418
+ genome_dict = defaultdict(int)
419
+ reps = OrderedDict()
420
+ ## Load in all data for easier reuse later
421
+ for line in First_in:
422
+ if line.startswith('>'):
423
+ if first == False:
424
+ cluster_size = len(clusters[cluster_id])
425
+ reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
426
+ cluster_id = line.strip('>')
427
+ cluster_id = cluster_id.strip('\n')
428
+ cluster_id = cluster_id.split(' ')[1]
429
+ clusters.update({cluster_id: []})
430
+ pangenome_clusters_First.update({cluster_id: []})
431
+ pangenome_clusters_First_sequences.update({cluster_id: []})
432
+
433
+ first = False
434
+ else:
435
+ clustered = line.split('\t')[1]
436
+ clustered = clustered.split('>')[1]
437
+ clustered = clustered.split('...')[0]
438
+ genome = clustered.split('|')[0]
439
+ genome_dict[genome] += 1
440
+ if '*' in line:
441
+ rep = clustered
442
+ reps.update({rep: [0, 0]})
443
+ if first == False:
444
+ clusters[cluster_id].append(clustered)
445
+ clustered_genome = clustered.split('|')[0]
446
+ if clustered_genome not in pangenome_clusters_First[cluster_id]:
447
+ pangenome_clusters_First[cluster_id].append(clustered_genome)
448
+ pangenome_clusters_First_sequences[cluster_id].append(clustered)
449
+ return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
450
+
451
+ #@profile
452
+ def cluster(options):
453
+
454
+ if options.format == 'CD-HIT':
455
+ genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options)
456
+ elif options.format in ['TSV','CSV']:
457
+ genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options)
458
+
459
+ ######################################
460
+ cores, groups = get_cores(options, genome_dict)
461
+ ###
462
+
463
+ if options.reclustered != None:
464
+ if options.format == 'CD-HIT':
465
+ combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,\
466
+ unique_genomes = combined_clustering_CDHIT(options, genome_dict)
467
+ if options.format == 'TSV':
468
+ combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,\
469
+ unique_genomes = combined_clustering_Edge_List(options, genome_dict)
470
+ pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered)
471
+ else:
472
+ pangenome_clusters_Type = single_clustering_counting(options, pangenome_clusters_First, reps)
473
+
474
+
475
+ counter = 0
476
+ Number_Of_StORF_Extending_But_Same_Genomes = 0
477
+
478
+ sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
479
+ pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_first_keys)
480
+ pangenome_clusters_First_sequences_sorted = reorder_dict_by_keys(pangenome_clusters_First_sequences, sorted_first_keys)
481
+ pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)
482
+
483
+ print("Calculating Groups")
484
+ for cluster, numbers in pangenome_clusters_Type_sorted.items():
485
+ ############################### Calculate First only
486
+ if numbers[0] == 1 and numbers[1] >=2:
487
+ calc_First_only_core(numbers[1],groups,cores)
488
+ counter +=1
489
+ elif numbers[0] >1 and numbers[1] >=2:
490
+ calc_First_only_core(numbers[2][0],groups,cores)
491
+ counter += 1
492
+
493
+ if options.reclustered != None:
494
+ ############################# Calculate First and Reclustered-Second
495
+ if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
496
+ calc_single_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
497
+ elif numbers[0] > 1 and numbers[3] >= 1: # If unique Secondss combined multiple Firsts
498
+ calc_multi_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
499
+ elif numbers[4] >= 1:
500
+ Number_Of_StORF_Extending_But_Same_Genomes += 1
501
+ combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
502
+ combined_pangenome_clusters_Second_Type = defaultdict(list)
503
+ for cluster, genomes in combined_pangenome_clusters_Second.items():
504
+ if cluster in not_Second_only_cluster_ids:
505
+ combined_pangenome_clusters_Second_Type[cluster] = [cluster, len(genomes)]
506
+ else:
507
+ combined_pangenome_clusters_ONLY_Second_Type[cluster] = [cluster, len(genomes)]
508
+ for cluster, data in combined_pangenome_clusters_Second_Type.items():
509
+ calc_Second_only_core(groups, cores, data[1])
510
+ for cluster, data in combined_pangenome_clusters_ONLY_Second_Type.items():
511
+ if data[1] >= 2:
512
+ calc_only_Second_only_core(groups, cores, data[1])
513
+ ###########################
514
+ print("End")
515
+ key_order = ['first_core_', 'extended_core_', 'combined_core_', 'second_core_','only_second_core_']
516
+ print("Gene Family Groups:")
517
+ for key_prefix in key_order:
518
+ for key, value in cores.items():
519
+ if key.startswith(key_prefix):
520
+ print(f"{key}: {value}")
521
+
522
+ if options.gene_presence_absence_out != None:
523
+ gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
524
+
525
+
526
+ def main():
527
+
528
+ parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': PyamilySeq Run Parameters.')
529
+ parser._action_groups.pop()
530
+
531
+ required = parser.add_argument_group('Required Arguments')
532
+ required.add_argument('-c', action='store', dest='clusters', help='Clustering output file from CD-HIT, TSV or CSV Edge List',
533
+ required=True)
534
+ required.add_argument('-f', action='store', dest='format', choices=['CD-HIT', 'CSV', 'TSV'],
535
+ help='Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))', required=True)
536
+
537
+
538
+ optional = parser.add_argument_group('Optional Arguments')
539
+ optional.add_argument('-rc', action='store', dest='reclustered', help='Clustering output file from secondary round of clustering',
540
+ required=False)
541
+ optional.add_argument('-st', action='store', dest='sequence_tag', help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
542
+ required=False)
543
+ optional.add_argument('-groups', action="store", dest='core_groups', default="99,80,15",
544
+ help='Default - (\'99,95,90,80,15\'): Gene family groups to use')
545
+ optional.add_argument('-gpa', action='store', dest='gene_presence_absence_out', help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
546
+ required=False)
547
+
548
+ misc = parser.add_argument_group('Misc')
549
+ misc.add_argument('-verbose', action='store', dest='verbose', default=False, type=eval, choices=[True, False],
550
+ help='Default - False: Print out runtime messages')
551
+ misc.add_argument('-v', action='store_true', dest='version',
552
+ help='Default - False: Print out version number and exit')
553
+
554
+
555
+ options = parser.parse_args()
556
+ if options.clusters == None or options.format == None:
557
+ if options.version:
558
+ sys.exit(PyamilySeq_Version)
559
+ else:
560
+ exit('PyamilySeq: error: the following arguments are required: -c, -f')
561
+
562
+ if options.sequence_tag == None:
563
+ options.sequence_tag = 'StORF'
564
+
565
+ options.clusters = os.path.normpath(options.clusters)
566
+ options.clusters = os.path.realpath(options.clusters)
567
+ if options.reclustered:
568
+ options.reclustered = os.path.normpath(options.reclustered)
569
+ options.reclustered = os.path.realpath(options.reclustered)
570
+
571
+
572
+ options.core_groups = options.core_groups + ',0'
573
+
574
+ cluster(options)
575
+
576
+ print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
577
+ "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
578
+
579
+
580
+
581
+
582
+
583
+ if __name__ == "__main__":
584
+ main()
585
+ print("Complete")
586
+
PyamilySeq/__init__.py ADDED
File without changes
@@ -0,0 +1,49 @@
1
+
2
+ import argparse
3
+ import gzip
4
+ import glob
5
+
6
def combine_files(files, split, glob_location, combined_out):
    """Concatenate FASTA files (gzipped or plain) into one stream, prefixing
    every record header with '<genome_name>|'.

    Parameters:
        files: glob pattern (relative to glob_location) selecting the inputs.
        split: substring used to cut the genome name out of each file name.
        glob_location: directory containing the input files.
        combined_out: open text-mode file object receiving the combined FASTA.

    Header lines are truncated at the first space before the genome prefix is
    added; '#' comment lines are skipped.  Fixes over the previous revision:
    gzipped headers with no description no longer gain a spurious blank line
    (split() kept the trailing newline and a second one was appended), and
    plain-text files now get the same first-token header truncation as
    gzipped ones.
    """
    count = 0  # number of files processed (kept for parity with the original)

    for file in glob.glob(glob_location + '/' + files):  # NOTE: '/' join assumes POSIX-style paths
        count += 1
        # Genome name = file name up to `split`, minus any directory part.
        genome_name = file.split(split)[0].split('/')[-1]
        try:
            with gzip.open(file, 'rb') as genome:
                for line in genome:
                    if line.startswith(b'#'):
                        continue
                    elif line.startswith(b'>'):
                        header = line.rstrip(b'\n').split(b' ')[0]
                        header = header.replace(b'>', b'>' + bytes(genome_name, 'utf-8') + b'|')
                        combined_out.write(header.decode('utf-8') + '\n')
                    else:
                        combined_out.write(line.decode('utf-8'))
        except gzip.BadGzipFile:
            # Not actually gzipped - reread as plain text with identical handling.
            with open(file, 'r') as genome:
                for line in genome:
                    if line.startswith('#'):
                        continue
                    elif line.startswith('>'):
                        header = line.rstrip('\n').split(' ')[0]
                        header = header.replace('>', '>' + genome_name + '|')
                        combined_out.write(header + '\n')
                    else:
                        combined_out.write(line)
36
+
37
def main():
    """CLI entry point: parse the four positional arguments and write the
    combined FASTA file."""
    arg_specs = [
        ("files", "File pattern to match within the specified directory."),
        ("split", "String used to split the file path and extract the genome name."),
        ("glob_location", "Directory location where the files are located."),
        ("combined_out", "Output file where the combined data will be written."),
    ]
    parser = argparse.ArgumentParser(description="Combine gzipped fasta files.")
    for arg_name, arg_help in arg_specs:
        parser.add_argument(arg_name, help=arg_help)
    args = parser.parse_args()

    with open(args.combined_out, 'w') as combined_out:
        combine_files(args.files, args.split, args.glob_location, combined_out)
47
+
48
# Run the combiner CLI when this module is executed directly.
if __name__ == "__main__":
    main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PyamilySeq
3
- Version: 0.0.1
3
+ Version: 0.0.2
4
4
  Summary: PyamilySeq - A a tool to look for sequence-based gene families identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
5
5
  Home-page: https://github.com/NickJD/PyamilySeq
6
6
  Author: Nicholas Dimonaco
@@ -12,7 +12,6 @@ Classifier: Operating System :: OS Independent
12
12
  Requires-Python: >=3.6
13
13
  Description-Content-Type: text/markdown
14
14
  License-File: LICENSE
15
- Requires-Dist: numpy
16
15
 
17
16
  # PyamilySeq
18
17
  PyamilySeq (Family Seek) is a Python tool for clustering gene sequences into families based on sequence similarity identified by tools such as CD-HIT, DIAMOND or MMseqs2.
@@ -0,0 +1,11 @@
1
+ PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py,sha256=UzQ5iOKCNfurxmj1pnkowF11YfWBO5vnBCKxQK6goB8,26538
2
+ PyamilySeq/Constants.py,sha256=hrbTdmPUFEzLfGZOPoQPV0NsAG-VnfIX51291vqb1C8,30
3
+ PyamilySeq/PyamilySeq_Species.py,sha256=34NHcViENyAdvGRltNUbfWjEcNCYnsmbuhDdl8__mH0,28209
4
+ PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ PyamilySeq/combine_FASTA_with_genome_IDs.py,sha256=aMUVSk6jKnKX0g04RMM360QueZS83lRLqLLysBtQbLo,2009
6
+ PyamilySeq-0.0.2.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
7
+ PyamilySeq-0.0.2.dist-info/METADATA,sha256=v6hOL3kekqt8H5YhjpS6uQOF1QSFcBh4Zy-jNW3xDTk,2550
8
+ PyamilySeq-0.0.2.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
9
+ PyamilySeq-0.0.2.dist-info/entry_points.txt,sha256=zGtA2Ycf0LG3PR7zuuT0wjaAKLFxtyGgBc0O_W7E250,66
10
+ PyamilySeq-0.0.2.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
11
+ PyamilySeq-0.0.2.dist-info/RECORD,,
@@ -0,0 +1 @@
1
+ PyamilySeq
@@ -1,6 +0,0 @@
1
- PyamilySeq-0.0.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
2
- PyamilySeq-0.0.1.dist-info/METADATA,sha256=gYAN6guZiV3POfjJJTn20Usj3PJZ-UTsdV5gruMo86g,2571
3
- PyamilySeq-0.0.1.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
4
- PyamilySeq-0.0.1.dist-info/entry_points.txt,sha256=zGtA2Ycf0LG3PR7zuuT0wjaAKLFxtyGgBc0O_W7E250,66
5
- PyamilySeq-0.0.1.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
6
- PyamilySeq-0.0.1.dist-info/RECORD,,
@@ -1 +0,0 @@
1
-