PyamilySeq 1.0.0-py3-none-any.whl → 1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,452 @@
+
+ import sys
+ import copy
+ from collections import OrderedDict
+ from collections import defaultdict
+ from collections import Counter
+
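+ # Parses a CD-HIT/CD-HIT-EST .clstr file (options.clusters). Header lines look like '>Cluster 0';
+ # member lines look like e.g. '0\t250aa, >GenomeA|gene_0001... *', where '*' marks the cluster
+ # representative (the example ID assumes 'splitter', here '|', separates the genome/taxon prefix
+ # from the gene name). Returns per-taxon sequence counts plus, per cluster, the member taxa,
+ # member sequences, genome lists keyed by representative and the representative sizes.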
+ def cluster_CDHIT(options, splitter):
+     First_in = open(options.clusters, 'r')
+     clusters = OrderedDict()
+     pangenome_clusters_First = OrderedDict()
+     pangenome_clusters_First_genomes = OrderedDict()
+     pangenome_clusters_First_sequences = OrderedDict()
+     first = True
+     taxa_dict = defaultdict(int)
+     reps = OrderedDict()
+     tmp_genomes = None
+     ## Load in all data for easier reuse later
+     for line in First_in:
+         if line.startswith('>'):
+             if tmp_genomes != None:
+                 pangenome_clusters_First_genomes[rep] = tmp_genomes
+             tmp_genomes = []
+             if first == False:
+                 cluster_size = len(clusters[cluster_id])
+                 reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
+             cluster_id = line.strip('>')
+             cluster_id = cluster_id.strip('\n')
+             cluster_id = cluster_id.split(' ')[1]
+             clusters.update({cluster_id: []})
+             pangenome_clusters_First.update({cluster_id: []})
+             pangenome_clusters_First_sequences.update({cluster_id: []})
+             first = False
+         else:
+             clustered = line.split('\t')[1]
+             clustered = clustered.split('>')[1]
+             clustered = clustered.split('...')[0]
+             taxa = clustered.split(splitter)[0]
+             taxa_dict[taxa] += 1
+             if '*' in line:
+                 rep = clustered
+                 reps.update({rep: [0, 0]})
+             if first == False:
+                 clusters[cluster_id].append(clustered)
+                 clustered_taxa = clustered.split(splitter)[0]
+                 if clustered_taxa not in pangenome_clusters_First[cluster_id]:
+                     pangenome_clusters_First[cluster_id].append(clustered_taxa)
+                     tmp_genomes.append(clustered_taxa)
+                 pangenome_clusters_First_sequences[cluster_id].append(clustered)
+
+     pangenome_clusters_First_genomes[rep] = tmp_genomes
+
+     return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
+
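+ # Parses a two-column edge list (representative<TAB>member, one pair per line), e.g. filtered
+ # tabular BLAST hits, and builds clusters as connected components of the resulting undirected
+ # graph using an iterative depth-first search.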
+ def cluster_BLAST(options, splitter):
+     separator = '\t'
+     First_in = open(options.clusters, 'r')
+     pangenome_clusters_First = OrderedDict()
+     pangenome_clusters_First_genomes = defaultdict(list)
+     pangenome_clusters_First_sequences = defaultdict(list)
+     taxa_dict = defaultdict(int)
+     reps = OrderedDict()
+     edges = defaultdict(list)
+     for line in First_in:
+         elements = line.strip().split(separator)
+         rep, child = elements[0], elements[1]
+         child_taxa = child.split(splitter)[0]  # Extracting the genome identifier from the child sequence
+         # Counting occurrences of genomes
+         taxa_dict[child_taxa] += 1
+         edges[rep].append(child)
+         edges[child].append(rep)
+
+     visited = set()
+     cluster_id = 0
+
+     def dfs(node, cluster_id):
+         stack = [node]
+         tmp_genomes = []
+         while stack:
+             current = stack.pop()
+             if current not in visited:
+                 visited.add(current)
+                 clustered_taxa = current.split(splitter)[0]
+                 pangenome_clusters_First_sequences[cluster_id].append(current)
+                 if clustered_taxa not in pangenome_clusters_First[cluster_id]:
+                     pangenome_clusters_First[cluster_id].append(clustered_taxa)
+                     tmp_genomes.append(clustered_taxa)
+                 for neighbor in edges[current]:
+                     if neighbor not in visited:
+                         stack.append(neighbor)
+
+         pangenome_clusters_First_genomes[node] = tmp_genomes
+
+     for node in edges:
+         if node not in visited:
+             pangenome_clusters_First[cluster_id] = []
+             pangenome_clusters_First_sequences[cluster_id] = []
+             pangenome_clusters_First_genomes[node] = []
+             dfs(node, cluster_id)
+             cluster_id += 1
+
+     for rep in pangenome_clusters_First:
+         cluster_size = len(pangenome_clusters_First_sequences[rep])
+         reps[rep] = [cluster_size, len(pangenome_clusters_First[rep])]
+
+     return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
+
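+ # Parses MMseqs2-style cluster TSV output (e.g. from 'mmseqs createtsv'): one
+ # 'representative<TAB>member' pair per line, with all members of a cluster listed consecutively
+ # under the same representative. A new cluster starts whenever the representative column changes.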
+ def cluster_MMseqs(options, splitter):
+     separator = '\t'
+     cluster_id = 0
+     last_rep = ''
+     first = True
+     First_in = open(options.clusters, 'r')
+     pangenome_clusters_First = OrderedDict()
+     pangenome_clusters_First_genomes = OrderedDict()
+     pangenome_clusters_First_sequences = OrderedDict()
+     taxa_dict = defaultdict(int)
+     reps = OrderedDict()
+     tmp_genomes = None
+     for line in First_in:
+
+         elements = line.strip().split(separator)
+         rep, child = elements[0], elements[1]
+         child_taxa = child.split(splitter)[0]  # Extracting the genome identifier from the child sequence
+         # Counting occurrences of genomes
+         taxa_dict[child_taxa] += 1
+         if first == True:
+             pangenome_clusters_First['0'] = []
+             pangenome_clusters_First_sequences['0'] = []
+             first = False
+             tmp_genomes = []
+
+         if rep != last_rep and last_rep != '':
+             pangenome_clusters_First_genomes[last_rep] = tmp_genomes
+             tmp_genomes = []
+             cluster_id += 1
+             pangenome_clusters_First[str(cluster_id)] = []
+             pangenome_clusters_First_sequences[str(cluster_id)] = []
+             cluster_size = len(pangenome_clusters_First_sequences[str(cluster_id-1)])
+             reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[str(cluster_id-1)])]})
+         if child_taxa not in pangenome_clusters_First[str(cluster_id)]:
+             pangenome_clusters_First[str(cluster_id)].append(child_taxa)
+             tmp_genomes.append(child_taxa)
+
+         pangenome_clusters_First_sequences[str(cluster_id)].append(child)
+         last_rep = rep
+     cluster_size = len(pangenome_clusters_First_sequences[str(cluster_id)])
+     reps.update({rep: [cluster_size, len(pangenome_clusters_First[str(cluster_id)])]})
+
+     #!!# May not be needed below
+     pangenome_clusters_First_genomes[rep] = tmp_genomes
+
+     return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
+
+
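+ # Combines the first-round ('First') clusters with the re-clustering output in which additional
+ # ('Second') sequences (those whose IDs contain options.sequence_tag, e.g. StORF_Reporter StORFs)
+ # were clustered against the First representatives. For each First cluster it counts how many
+ # First clusters were merged together, how many unique genomes they cover, and how many new
+ # genomes and sequences the Second sequences add.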
+ #@profile
+ def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, splitter):
+     num_clustered_First = defaultdict(list)
+     pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
+     list_of_reps = list(reps.keys())
+     for cluster, First_genomes in pangenome_clusters_First.items():
+         rep = list_of_reps[int(cluster)]  # get the rep of the current pep cluster
+         Com_PEP_Genomes = 0
+         Seconds = 0
+         seen_Seconds = []
+         added_Second_genomes = 0
+         temp_pep_genomes = copy.deepcopy(First_genomes)
+         try:  # get the cluster from the Second clusters which contains this rep
+             clustered_combined = combined_pangenome_clusters_First_Second_clustered[rep]
+             # We have to do this to correctly account for Seconds grouping multiple original First clusters
+             for clust in clustered_combined:
+                 ### Get the original clustered genomes first:
+                 if options.sequence_tag not in clust:
+                     original_clustered_genomes = pangenome_clusters_First_genomes[clust]
+                     for genome in original_clustered_genomes:
+                         if genome not in temp_pep_genomes:
+                             temp_pep_genomes.append(genome)
+
+             seen_clust_Genomes = []
+             num_clustered_First[cluster].append(rep + '_' + str(len(First_genomes)))
+             for clust in clustered_combined:
+                 if options.sequence_tag not in clust:  # Not good enough at the moment
+                     clust_Genome = clust.split(splitter)[0]
+                     if clust_Genome not in seen_clust_Genomes:
+                         seen_clust_Genomes.append(clust_Genome)
+                     if clust_Genome not in First_genomes:
+                         Com_PEP_Genomes += 1
+                     num_clustered_First[cluster].append(clust + '_' + str(reps[clust][1]))
+                 elif options.sequence_tag in clust:
+                     Seconds += 1
+                     clust_Genome = clust.split(splitter)[0]
+                     if clust_Genome not in seen_Seconds:
+                         seen_Seconds.append(clust_Genome)
+                     if clust_Genome not in seen_clust_Genomes:
+                         seen_clust_Genomes.append(clust_Genome)
+                     if clust_Genome not in temp_pep_genomes:
+                         added_Second_genomes += 1
+                         temp_pep_genomes.append(clust_Genome)
+                 else:
+                     sys.exit("Error: looking for sequence_tag")
+
+             size_of_pep_clusters = []
+             genomes = num_clustered_First[cluster]
+
+             if len(genomes) > 1:  #!!# So that we don't double count - This still needs to account for whether the same genome/genus is present however. Probably need to unique ti
+                 collecting_genomes = []
+                 for genome in genomes:
+                     genome = genome.rsplit('_', 1)
+                     collecting_genomes.append(pangenome_clusters_First[str(list_of_reps.index(genome[0]))])
+                     size_of_pep_clusters.append([str(list_of_reps.index(genome[0])) + ':' + genome[1]])
+                 flattened_list = [item for sublist in collecting_genomes for item in sublist]
+                 element_counts = Counter(flattened_list)
+                 unique_elements = [element for element, count in element_counts.items() if count == 1]
+                 sum_size_of_pep_clusters = len(unique_elements)
+             else:
+                 genome = genomes[0].rsplit('_', 1)
+                 size_of_pep_clusters.append([str(list_of_reps.index(genome[0])) + ':' + genome[1]])
+                 sum_size_of_pep_clusters = int(genome[1])
+
+             pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum_size_of_pep_clusters,
+                                                 size_of_pep_clusters, added_Second_genomes, Seconds, len(seen_Seconds)]
+
+         except KeyError:
+             ### Singleton
+             num_First_genomes = [[str(cluster) + ':' + str(len(First_genomes))]]
+             pangenome_clusters_Type[cluster] = [1, len(First_genomes), num_First_genomes, added_Second_genomes, Seconds,
+                                                 len(seen_Seconds)]
+     # pangenome_clusters_Type = [Number of First clustered genomes or genera, Size of the cluster, Ditto, Added Seconds, Number of Seconds, Unique Seconds]
+     return pangenome_clusters_Type
+
+
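+ # Counting for a run with a single clustering round (no re-clustering supplied): each cluster keeps
+ # its own genome count and the Second-related fields of pangenome_clusters_Type are always zero.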
+ #@profile
+ def single_clustering_counting(pangenome_clusters_First, reps):
+     num_clustered_First = defaultdict(list)
+     recorded_First = []
+     pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
+     list_of_reps = list(reps.keys())
+     for cluster, First_taxa in pangenome_clusters_First.items():
+         rep = list_of_reps[int(cluster)]  # get the rep of the current cluster
+
+         try:  # get the cluster from the storf clusters which contains this rep
+             num_clustered_First[str(cluster)].append(str(rep) + '_' + str(len(First_taxa)))
+             size_of_First_clusters = []
+             Firsts = num_clustered_First[str(cluster)]
+             for First in Firsts:
+                 First = First.rsplit('_', 1)
+                 size_of_First_clusters.append(int(First[1]))
+                 recorded_First.append(First[0])
+             num_First_genomes = [[str(cluster) + ':' + str(len(First_taxa))]]
+             pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_First_clusters),
+                                                 num_First_genomes, 0, 0, 0]
+
+         except KeyError:
+             ### Singleton
+             num_First_genomes = [[str(cluster) + ':' + str(len(First_taxa))]]
+             pangenome_clusters_Type[cluster] = [1, len(First_taxa), num_First_genomes, 0, 0, 0]
+
+     # pangenome_clusters_Type = [Number of First clustered genomes or genera, Size of the cluster, Ditto, 0, 0, 0]
+     return pangenome_clusters_Type
+
+
+
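+ # Parses the second-round CD-HIT clustering (options.reclustered) in which First representatives
+ # were re-clustered together with the Second sequences. For each combined cluster containing at
+ # least one First representative and at least one Second sequence, it maps the first First
+ # representative (KEY) to the remaining First representatives plus the Second sequences clustered
+ # with it (VALUE).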
+ #@profile
+ def combined_clustering_CDHIT(options, taxa_dict, splitter):
+     Second_in = open(options.reclustered, 'r')
+     combined_pangenome_clusters_First = OrderedDict()
+     combined_pangenome_clusters_First_sequences = OrderedDict()
+     combined_pangenome_clusters_Second = OrderedDict()
+     combined_pangenome_clusters_Second_sequences = OrderedDict()
+     combined_pangenome_clusters_First_Second_clustered = OrderedDict()
+
+     not_Second_only_cluster_ids = []
+     already_seen_PEP = []
+     Combined_clusters = OrderedDict()
+     Combined_reps = OrderedDict()
+     first = True
+     for line in Second_in:
+         if line.startswith('>'):
+             if first == False:
+                 cluster_size = len(Combined_clusters[cluster_id])
+                 Combined_reps.update({rep: cluster_size})
+                 for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
+                     if pep != []:
+                         if pep in already_seen_PEP:
+                             continue
+                         else:
+                             already_seen_PEP.append(pep)
+                 if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
+                     if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1:  # If we have clustered >1 First family, we need to record 1 as key and all others are val
+                         all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
+                         storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
+                         VALUE = all_but_first + storfs_clustered
+                     else:
+                         VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
+                     KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
+                     combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
+
+             cluster_id = line.strip('>')
+             cluster_id = cluster_id.strip('\n')
+             cluster_id = cluster_id.split(' ')[1]
+             Combined_clusters.update({cluster_id: []})
+             combined_pangenome_clusters_First.update({cluster_id: []})
+             combined_pangenome_clusters_First_sequences.update({cluster_id: []})
+             combined_pangenome_clusters_Second.update({cluster_id: []})
+             combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
+
+             first = False
+         else:
+             clustered = line.split('\t')[1]
+             clustered = clustered.split('>')[1]
+             clustered = clustered.split('...')[0]
+             genome = clustered.split(splitter)[0]
+             taxa_dict[genome] += 1
+             if '*' in line:
+                 rep = clustered
+                 Combined_reps.update({rep: 0})
+             if first == False:
+                 Combined_clusters[cluster_id].append(clustered)
+                 clustered_taxa = clustered.split(splitter)[0]
+                 if options.sequence_tag in line:
+                     if clustered_taxa not in combined_pangenome_clusters_Second[cluster_id]:
+                         combined_pangenome_clusters_Second[cluster_id].append(clustered_taxa)
+                     combined_pangenome_clusters_Second_sequences[cluster_id].append(clustered)
+                 else:
+                     if cluster_id not in not_Second_only_cluster_ids:
+                         not_Second_only_cluster_ids.append(cluster_id)
+                     if clustered_taxa not in combined_pangenome_clusters_First[cluster_id]:
+                         combined_pangenome_clusters_First[cluster_id].append(clustered_taxa)
+                     combined_pangenome_clusters_First_sequences[cluster_id].append(clustered)
+
+
+     return combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences
+
+
+
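+ # Commented-out alternative edge-list implementation of cluster_BLAST: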
+ # def cluster_BLAST(options, splitter):
+ #     separator = '\t'
+ #     First_in = open(options.clusters, 'r')
+ #     pangenome_clusters_First = OrderedDict()
+ #     pangenome_clusters_First_genomes = defaultdict(list)
+ #     pangenome_clusters_First_sequences = defaultdict(list)
+ #     taxa_dict = defaultdict(int)
+ #     reps = OrderedDict()
+ #
+ #     for line in First_in:
+ #         elements = line.strip().split(separator)
+ #         rep, child = elements[0], elements[1]
+ #         child_taxa = child.split(splitter)[0]  # Extracting the genome identifier from the child sequence
+ #         # Counting occurrences of genomes
+ #         taxa_dict[child_taxa] += 1
+ #
+ #         if rep not in pangenome_clusters_First:
+ #             pangenome_clusters_First[rep] = []
+ #             pangenome_clusters_First_sequences[rep] = []
+ #
+ #         if child_taxa not in pangenome_clusters_First[rep]:
+ #             pangenome_clusters_First[rep].append(child_taxa)
+ #             pangenome_clusters_First_genomes[rep].append(child_taxa)
+ #
+ #         pangenome_clusters_First_sequences[rep].append(child)
+ #
+ #     for rep in pangenome_clusters_First:
+ #         cluster_size = len(pangenome_clusters_First_sequences[rep])
+ #         reps[rep] = [cluster_size, len(pangenome_clusters_First[rep])]
+ #
+ #     return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
+
+
+
+
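+ # Edge-list equivalent of combined_clustering_CDHIT: reads the re-clustering results as
+ # 'representative<sep>member' pairs (TSV or CSV, per options.cluster_format), grouped by
+ # representative, and builds the same KEY -> VALUE mapping of First representatives to the
+ # sequences clustered with them.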
+ def combined_clustering_Edge_List(options, splitter):
+     if options.cluster_format == 'TSV':
+         separator = '\t'
+     elif options.cluster_format == 'CSV':
+         separator = ','
+
+     cluster_id = 0
+     last_rep = ''
+     Second_in = open(options.reclustered, 'r')
+     combined_pangenome_clusters_First = OrderedDict()
+     combined_pangenome_clusters_First_sequences = OrderedDict()
+     combined_pangenome_clusters_Second = OrderedDict()
+     combined_pangenome_clusters_Second_sequences = OrderedDict()
+     combined_pangenome_clusters_First_Second_clustered = OrderedDict()
+
+     not_Second_only_cluster_ids = []
+     already_seen_PEP = []
+     Combined_clusters = OrderedDict()
+     Combined_reps = OrderedDict()
+     first = True
+     for line in Second_in:
+         elements = line.strip().split(separator)
+         rep, child = elements[0], elements[1]
+         child_taxa = child.split(splitter)[0]  # Extracting the genome identifier from the child sequence
+
+         if first == True:
+             Combined_clusters.update({str(cluster_id): []})
+             combined_pangenome_clusters_First.update({str(cluster_id): []})
+             combined_pangenome_clusters_First_sequences.update({str(cluster_id): []})
+             combined_pangenome_clusters_Second.update({str(cluster_id): []})
+             combined_pangenome_clusters_Second_sequences.update({str(cluster_id): []})
+             Combined_reps.update({rep: 0})
+             first = False
+
+         if first == False:
+             if rep != last_rep and last_rep != '':
+                 cluster_size = len(Combined_clusters[str(cluster_id)])
+                 Combined_reps.update({rep: cluster_size})
+                 for pep in combined_pangenome_clusters_First_sequences[str(cluster_id)]:
+                     if pep != []:
+                         if pep in already_seen_PEP:
+                             continue
+                         else:
+                             already_seen_PEP.append(pep)
+                 if len(combined_pangenome_clusters_Second_sequences[str(cluster_id)]) > 0 and len(combined_pangenome_clusters_First_sequences[str(cluster_id)]) > 0:
+                     if len(combined_pangenome_clusters_First_sequences[str(cluster_id)]) > 1:  # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
+                         all_but_first = combined_pangenome_clusters_First_sequences[str(cluster_id)][1:]
+                         storfs_clustered = combined_pangenome_clusters_Second_sequences[str(cluster_id)]
+                         VALUE = all_but_first + storfs_clustered
+                     else:
+                         VALUE = combined_pangenome_clusters_Second_sequences[str(cluster_id)]
+                     KEY = combined_pangenome_clusters_First_sequences[str(cluster_id)][0]
+                     combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
+
+                 cluster_id += 1
+                 Combined_clusters.update({str(cluster_id): []})
+                 combined_pangenome_clusters_First.update({str(cluster_id): []})
+                 combined_pangenome_clusters_First_sequences.update({str(cluster_id): []})
+                 combined_pangenome_clusters_Second.update({str(cluster_id): []})
+                 combined_pangenome_clusters_Second_sequences.update({str(cluster_id): []})
+                 Combined_reps.update({rep: 0})
+
+             Combined_clusters[str(cluster_id)].append(child)
+             if options.sequence_tag in line:
+                 if child_taxa not in combined_pangenome_clusters_Second[str(cluster_id)]:
+                     combined_pangenome_clusters_Second[str(cluster_id)].append(child_taxa)
+                 combined_pangenome_clusters_Second_sequences[str(cluster_id)].append(child)
+             else:
+                 if str(cluster_id) not in not_Second_only_cluster_ids:
+                     not_Second_only_cluster_ids.append(str(cluster_id))  # Tell us which StORF_Reporter clusters are unmatched to a PEP
+                 if child_taxa not in combined_pangenome_clusters_First[str(cluster_id)]:
+                     combined_pangenome_clusters_First[str(cluster_id)].append(child_taxa)
+                 combined_pangenome_clusters_First_sequences[str(cluster_id)].append(child)
+
+         last_rep = rep
+
+     return combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences
@@ -0,0 +1,2 @@
+ PyamilySeq_Version = 'v1.1.0'
+