PyamilySeq 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Constants.py +1 -1
- PyamilySeq/PyamilySeq.py +81 -39
- PyamilySeq/PyamilySeq_Genus.py +85 -102
- PyamilySeq/PyamilySeq_Species.py +101 -94
- PyamilySeq/Seq_Combiner.py +26 -7
- PyamilySeq/clusterings.py +111 -73
- PyamilySeq/utils.py +117 -7
- PyamilySeq-0.7.1.dist-info/METADATA +250 -0
- PyamilySeq-0.7.1.dist-info/RECORD +14 -0
- {PyamilySeq-0.6.0.dist-info → PyamilySeq-0.7.1.dist-info}/WHEEL +1 -1
- PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py +0 -600
- PyamilySeq-0.6.0.dist-info/METADATA +0 -147
- PyamilySeq-0.6.0.dist-info/RECORD +0 -15
- {PyamilySeq-0.6.0.dist-info → PyamilySeq-0.7.1.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.6.0.dist-info → PyamilySeq-0.7.1.dist-info}/entry_points.txt +0 -0
- {PyamilySeq-0.6.0.dist-info → PyamilySeq-0.7.1.dist-info}/top_level.txt +0 -0
PyamilySeq/clusterings.py
CHANGED
@@ -1,25 +1,26 @@
-
-import shutil
-import os
-import glob
+
 import sys
 import copy
 from collections import OrderedDict
 from collections import defaultdict
+from collections import Counter
 
 def cluster_CDHIT(options, splitter):
     First_in = open(options.clusters, 'r')
     clusters = OrderedDict()
     pangenome_clusters_First = OrderedDict()
+    pangenome_clusters_First_genomes = OrderedDict()
     pangenome_clusters_First_sequences = OrderedDict()
     first = True
     taxa_dict = defaultdict(int)
     reps = OrderedDict()
+    tmp_genomes = None
     ## Load in all data for easier reuse later
     for line in First_in:
-        if '>Cluster 7575' in line:
-            print()
         if line.startswith('>'):
+            if tmp_genomes != None:
+                pangenome_clusters_First_genomes[rep] = tmp_genomes
+            tmp_genomes = []
             if first == False:
                 cluster_size = len(clusters[cluster_id])
                 reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
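Note: cluster_CDHIT walks a CD-HIT .clstr file, where each cluster opens with a `>Cluster N` header, each member line names a sequence ID, and `*` flags the representative. The snippet below is a minimal, self-contained sketch (not PyamilySeq's exact code) of the bookkeeping 0.7.1 adds: mapping each representative to the genomes its cluster covers, a genome being the ID prefix before the splitter. The cluster contents and genome names are invented for illustration.

from collections import OrderedDict

clstr = (">Cluster 0\n"
         "0\t100aa, >genomeA|gene_1... *\n"
         "1\t98aa, >genomeB|gene_7... at 97.0%\n"
         ">Cluster 1\n"
         "0\t55aa, >genomeC|gene_3... *\n")
splitter = '|'

rep_to_genomes = OrderedDict()  # mirrors pangenome_clusters_First_genomes
rep, tmp_genomes = None, None
for line in clstr.splitlines():
    if line.startswith('>'):
        if tmp_genomes is not None:
            rep_to_genomes[rep] = tmp_genomes  # flush the previous cluster
        tmp_genomes = []
    else:
        member = line.split('>')[1].split('...')[0]
        if line.rstrip().endswith('*'):  # '*' marks the representative
            rep = member
        taxon = member.split(splitter)[0]
        if taxon not in tmp_genomes:
            tmp_genomes.append(taxon)
rep_to_genomes[rep] = tmp_genomes  # flush the final cluster
# -> {'genomeA|gene_1': ['genomeA', 'genomeB'], 'genomeC|gene_3': ['genomeC']}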
@@ -29,7 +30,6 @@ def cluster_CDHIT(options, splitter):
             clusters.update({cluster_id: []})
             pangenome_clusters_First.update({cluster_id: []})
             pangenome_clusters_First_sequences.update({cluster_id: []})
-
             first = False
         else:
             clustered = line.split('\t')[1]
@@ -45,32 +45,46 @@ def cluster_CDHIT(options, splitter):
                 clustered_taxa = clustered.split(splitter)[0]
                 if clustered_taxa not in pangenome_clusters_First[cluster_id]:
                     pangenome_clusters_First[cluster_id].append(clustered_taxa)
+                    tmp_genomes.append(clustered_taxa)
                 pangenome_clusters_First_sequences[cluster_id].append(clustered)
-
+
+    pangenome_clusters_First_genomes[rep] = tmp_genomes
+
+    return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
 
 
 
 #@profile
-def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, splitter):
+def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, splitter):
     num_clustered_First = defaultdict(list)
     pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
     list_of_reps = list(reps.keys())
-    for cluster,
+    for cluster, First_genomes in pangenome_clusters_First.items():
         rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
         Com_PEP_Genomes = 0
         Seconds = 0
         seen_Seconds = []
         added_Second_genomes = 0
-
-
+        temp_pep_genomes = copy.deepcopy(First_genomes)
+        try: # get the cluster from the Second clusters which contains this rep
+            clustered_combined = combined_pangenome_clusters_First_Second_clustered[rep]
+            #We have to do this to correctly account for Seconds grouping multiple original First clusters
+            for clust in clustered_combined:
+                ### Get the original clustered genomes first:
+                if options.sequence_tag not in clust:
+                    original_clustered_genomes = pangenome_clusters_First_genomes[clust]
+                    for genome in original_clustered_genomes:
+                        if genome not in temp_pep_genomes:
+                            temp_pep_genomes.append(genome)
+
             seen_clust_Genomes = []
-            num_clustered_First[cluster].append(rep + '_' + str(len(
+            num_clustered_First[cluster].append(rep + '_' + str(len(First_genomes)))
             for clust in clustered_combined:
                 if options.sequence_tag not in clust: # Not good enough at the moment
                     clust_Genome = clust.split(splitter)[0]
                     if clust_Genome not in seen_clust_Genomes:
                         seen_clust_Genomes.append(clust_Genome)
-                        if clust_Genome not in
+                        if clust_Genome not in First_genomes:
                             Com_PEP_Genomes += 1
                     num_clustered_First[cluster].append(clust + '_' + str(reps[clust][1]))
                 elif options.sequence_tag in clust:
@@ -80,23 +94,38 @@ def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, splitter):
                     seen_Seconds.append(clust_Genome)
                     if clust_Genome not in seen_clust_Genomes:
                         seen_clust_Genomes.append(clust_Genome)
-                        if clust_Genome not in
+                        if clust_Genome not in temp_pep_genomes:
                             added_Second_genomes += 1
+                            temp_pep_genomes.append(clust_Genome)
                 else:
                     sys.exit("Error: looking for sequence_tag")
 
             size_of_pep_clusters = []
-
-
-
-
-
+            genomes = num_clustered_First[cluster]
+
+
+            if len(genomes) > 1: #!!# So that we don't double count - This still needs to account for whether the same genome/genus is present however. Probably need to unique ti
+                collecting_genomes = []
+                for genome in genomes:
+                    genome = genome.rsplit('_', 1)
+                    collecting_genomes.append(pangenome_clusters_First[str(list_of_reps.index(genome[0]))])
+                    size_of_pep_clusters.append([str(list_of_reps.index(genome[0])) + ':' + genome[1]])
+                flattened_list = [item for sublist in collecting_genomes for item in sublist]
+                element_counts = Counter(flattened_list)
+                unique_elements = [element for element, count in element_counts.items() if count == 1]
+                sum_size_of_pep_clusters = len(unique_elements)
+            else:
+                genome = genomes[0].rsplit('_', 1)
+                size_of_pep_clusters.append([str(list_of_reps.index(genome[0]))+':'+genome[1]])
+                sum_size_of_pep_clusters = int(genome[1])
+
+            pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum_size_of_pep_clusters,
                                                 size_of_pep_clusters, added_Second_genomes, Seconds, len(seen_Seconds)]
 
         except KeyError:
             ###Singleton
-
-            pangenome_clusters_Type[cluster] = [1, len(
+            num_First_genomes = [[str(cluster)+':'+str(len(First_genomes))]]
+            pangenome_clusters_Type[cluster] = [1, len(First_genomes), num_First_genomes, added_Second_genomes, Seconds,
                                                 len(seen_Seconds)]
     # pangenome_clusters_Type = [Number of First clustered genomes or genera, Size of the cluster, Ditto, Added Seconds,Number of Seconds,Unique Seconds ]
     return pangenome_clusters_Type
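A worked example of the new Counter-based tally above: when a Second cluster merges several First clusters, any genome contributed by more than one of them is kept out of sum_size_of_pep_clusters, so shared genomes are not double counted. The genome names are invented for illustration.

from collections import Counter

collecting_genomes = [['genomeA', 'genomeB'], ['genomeB', 'genomeC']]  # two merged First clusters
flattened_list = [item for sublist in collecting_genomes for item in sublist]
element_counts = Counter(flattened_list)  # genomeB appears twice
unique_elements = [element for element, count in element_counts.items() if count == 1]
print(len(unique_elements))  # 2 -> only genomeA and genomeC are counted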
@@ -112,20 +141,21 @@ def single_clustering_counting(pangenome_clusters_First, reps):
         rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
 
         try: # get the cluster from the storf clusters which contains this rep
-            num_clustered_First[cluster].append(rep + '_' + str(len(First_taxa)))
+            num_clustered_First[str(cluster)].append(rep + '_' + str(len(First_taxa)))
             size_of_First_clusters = []
-            Firsts = num_clustered_First[cluster]
+            Firsts = num_clustered_First[str(cluster)]
             for First in Firsts:
                 First = First.rsplit('_', 1)
                 size_of_First_clusters.append(int(First[1]))
                 recorded_First.append(First[0])
+            num_First_genomes = [[str(cluster) + ':' + str(len(First_taxa))]]
             pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_First_clusters),
-
+                                                num_First_genomes, 0, 0, 0]
 
         except KeyError:
             ###Singleton
-
-            pangenome_clusters_Type[cluster] = [1, len(First_taxa),
+            num_First_genomes = [[str(cluster)+':'+str(len(First_taxa))]]
+            pangenome_clusters_Type[cluster] = [1, len(First_taxa), num_First_genomes, 0, 0, 0]
 
     # pangenome_clusters_Type = [Number of First clustered genomes or genera, Size of the cluster, Ditto, 0,0,0 ]
     return pangenome_clusters_Type
@@ -158,7 +188,7 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
                 else:
                     already_seen_PEP.append(pep)
                 if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
-                    if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1
+                    if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 First family, we need to record 1 as key and all others are val
                         all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
                         storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
                         VALUE = all_but_first + storfs_clustered
@@ -194,13 +224,13 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
                     combined_pangenome_clusters_Second_sequences[cluster_id].append(clustered)
             else:
                 if cluster_id not in not_Second_only_cluster_ids:
-                    not_Second_only_cluster_ids.append(cluster_id)
+                    not_Second_only_cluster_ids.append(cluster_id)
                 if clustered_taxa not in combined_pangenome_clusters_First[cluster_id]:
                     combined_pangenome_clusters_First[cluster_id].append(clustered_taxa)
                     combined_pangenome_clusters_First_sequences[cluster_id].append(clustered)
 
 
-    return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second
+    return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences
 
 
 def cluster_EdgeList(options,splitter):
@@ -213,37 +243,45 @@ def cluster_EdgeList(options,splitter):
     first = True
     First_in = open(options.clusters, 'r')
     pangenome_clusters_First = OrderedDict()
+    pangenome_clusters_First_genomes = OrderedDict()
     pangenome_clusters_First_sequences = OrderedDict()
     taxa_dict = defaultdict(int)
     reps = OrderedDict()
+    tmp_genomes = None
     for line in First_in:
         rep, child = line.strip().split(separator)
         child_taxa = child.split(splitter)[0] # Extracting the genome identifier from the child sequence
         # Counting occurrences of genomes
         taxa_dict[child_taxa] += 1
         if first == True:
-            pangenome_clusters_First[0] = []
-            pangenome_clusters_First_sequences[0] = []
+            pangenome_clusters_First['0'] = []
+            pangenome_clusters_First_sequences['0'] = []
             first = False
+            tmp_genomes = []
 
         if rep != last_rep and last_rep != '':
+            pangenome_clusters_First_genomes[rep] = tmp_genomes
+            tmp_genomes = []
             cluster_id +=1
-            pangenome_clusters_First[cluster_id] = []
-            pangenome_clusters_First_sequences[cluster_id] = []
-            cluster_size = len(pangenome_clusters_First_sequences[cluster_id-1])
-            reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[cluster_id-1])]})
-            pangenome_clusters_First[cluster_id] = []
-            pangenome_clusters_First_sequences[cluster_id] = []
-        if child_taxa not in pangenome_clusters_First[cluster_id]:
-            pangenome_clusters_First[cluster_id].append(child_taxa)
-
-
+            pangenome_clusters_First[str(cluster_id)] = []
+            pangenome_clusters_First_sequences[str(cluster_id)] = []
+            cluster_size = len(pangenome_clusters_First_sequences[str(cluster_id-1)])
+            reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[str(cluster_id-1)])]})
+            pangenome_clusters_First[str(cluster_id)] = []
+            pangenome_clusters_First_sequences[str(cluster_id)] = []
+        if child_taxa not in pangenome_clusters_First[str(cluster_id)]:
+            pangenome_clusters_First[str(cluster_id)].append(child_taxa)
+            tmp_genomes.append(child_taxa)
+
+        pangenome_clusters_First_sequences[str(cluster_id)].append(child)
         last_rep = rep
-    cluster_size = len(pangenome_clusters_First_sequences[cluster_id])
-    reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
+    cluster_size = len(pangenome_clusters_First_sequences[str(cluster_id)])
+    reps.update({rep: [cluster_size, len(pangenome_clusters_First[str(cluster_id)])]})
 
+    #!!# May not be needed below
+    pangenome_clusters_First_genomes[rep] = tmp_genomes
 
-    return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
+    return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
 
 
 def combined_clustering_Edge_List(options, splitter):
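Note: cluster_EdgeList consumes a pre-sorted edge list of representative/child pairs (the separator variable is defined elsewhere in this module), so consecutive rows sharing a representative form one cluster. A minimal sketch of that grouping, assuming a tab separator, a '|' splitter, and invented sequence IDs:

from itertools import groupby

edges = ["genomeA|gene_1\tgenomeA|gene_1",
         "genomeA|gene_1\tgenomeB|gene_7",
         "genomeC|gene_3\tgenomeC|gene_3"]
pairs = [line.split('\t') for line in edges]
# Group consecutive rows by representative, keeping each child's genome prefix
clusters = {rep: [child.split('|')[0] for _, child in group]
            for rep, group in groupby(pairs, key=lambda pair: pair[0])}
print(clusters)  # {'genomeA|gene_1': ['genomeA', 'genomeB'], 'genomeC|gene_3': ['genomeC']}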
@@ -271,54 +309,54 @@ def combined_clustering_Edge_List(options, splitter):
         child_taxa = child.split(splitter)[0] # Extracting the genome identifier from the child sequence
 
         if first == True:
-            Combined_clusters.update({cluster_id: []})
-            combined_pangenome_clusters_First.update({cluster_id: []})
-            combined_pangenome_clusters_First_sequences.update({cluster_id: []})
-            combined_pangenome_clusters_Second.update({cluster_id: []})
-            combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
+            Combined_clusters.update({str(cluster_id): []})
+            combined_pangenome_clusters_First.update({str(cluster_id): []})
+            combined_pangenome_clusters_First_sequences.update({str(cluster_id): []})
+            combined_pangenome_clusters_Second.update({str(cluster_id): []})
+            combined_pangenome_clusters_Second_sequences.update({str(cluster_id): []})
             Combined_reps.update({rep: 0})
             first = False
 
         if first == False:
             if rep != last_rep and last_rep != '':
-                cluster_size = len(Combined_clusters[cluster_id])
+                cluster_size = len(Combined_clusters[str(cluster_id)])
                 Combined_reps.update({rep: cluster_size})
-                for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
+                for pep in combined_pangenome_clusters_First_sequences[str(cluster_id)]:
                     if pep != []:
                         if pep in already_seen_PEP:
                             continue
                         else:
                             already_seen_PEP.append(pep)
-                        if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
-                            if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
-                                all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
-                                storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
+                        if len(combined_pangenome_clusters_Second_sequences[str(cluster_id)]) > 0 and len(combined_pangenome_clusters_First_sequences[str(cluster_id)]) > 0:
+                            if len(combined_pangenome_clusters_First_sequences[str(cluster_id)]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
+                                all_but_first = combined_pangenome_clusters_First_sequences[str(cluster_id)][1:]
+                                storfs_clustered = combined_pangenome_clusters_Second_sequences[str(cluster_id)]
                                 VALUE = all_but_first + storfs_clustered
                             else:
-                                VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
-                            KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
+                                VALUE = combined_pangenome_clusters_Second_sequences[str(cluster_id)]
+                            KEY = combined_pangenome_clusters_First_sequences[str(cluster_id)][0]
                             combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
 
                 cluster_id += 1
-                Combined_clusters.update({cluster_id: []})
-                combined_pangenome_clusters_First.update({cluster_id: []})
-                combined_pangenome_clusters_First_sequences.update({cluster_id: []})
-                combined_pangenome_clusters_Second.update({cluster_id: []})
-                combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
+                Combined_clusters.update({str(cluster_id): []})
+                combined_pangenome_clusters_First.update({str(cluster_id): []})
+                combined_pangenome_clusters_First_sequences.update({str(cluster_id): []})
+                combined_pangenome_clusters_Second.update({str(cluster_id): []})
+                combined_pangenome_clusters_Second_sequences.update({str(cluster_id): []})
                 Combined_reps.update({rep: 0})
 
-        Combined_clusters[cluster_id].append(child)
+        Combined_clusters[str(cluster_id)].append(child)
         if options.sequence_tag in line:
-            if child_taxa not in combined_pangenome_clusters_Second[cluster_id]:
-                combined_pangenome_clusters_Second[cluster_id].append(child_taxa)
-                combined_pangenome_clusters_Second_sequences[cluster_id].append(child)
+            if child_taxa not in combined_pangenome_clusters_Second[str(cluster_id)]:
+                combined_pangenome_clusters_Second[str(cluster_id)].append(child_taxa)
+                combined_pangenome_clusters_Second_sequences[str(cluster_id)].append(child)
         else:
-            if cluster_id not in not_Second_only_cluster_ids:
-                not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
-            if child_taxa not in combined_pangenome_clusters_First[cluster_id]:
-                combined_pangenome_clusters_First[cluster_id].append(child_taxa)
-                combined_pangenome_clusters_First_sequences[cluster_id].append(child)
+            if str(cluster_id) not in not_Second_only_cluster_ids:
+                not_Second_only_cluster_ids.append(str(cluster_id)) # Tell us which StORF_Reporter clustered are unmatched to a PEP
+            if child_taxa not in combined_pangenome_clusters_First[str(cluster_id)]:
+                combined_pangenome_clusters_First[str(cluster_id)].append(child_taxa)
+                combined_pangenome_clusters_First_sequences[str(cluster_id)].append(child)
 
         last_rep = rep
 
-    return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second
+    return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences
PyamilySeq/utils.py
CHANGED
@@ -7,6 +7,29 @@ from tempfile import NamedTemporaryFile
 import sys
 
 
+################### We are currently fixed using Table 11
+gencode = {
+    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
+    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
+    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
+    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
+    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
+    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
+    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
+    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
+    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
+    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
+    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
+    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
+    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
+    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
+    'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*',
+    'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W'}
+
+def translate_frame(sequence):
+    translate = ''.join([gencode.get(sequence[3 * i:3 * i + 3], 'X') for i in range(len(sequence) // 3)])
+    return translate
+
 def is_tool_installed(tool_name):
     """Check if a tool is installed and available in PATH."""
     # Check if the tool is in the system PATH
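Quick usage check for the new translate_frame helper: it translates in-frame with the table above (translation table 11 shares its codon-to-amino-acid assignments with the standard code, differing only in permitted start codons), renders stops as '*', maps unrecognised codons to 'X', and silently drops a trailing partial codon. The input strings are illustrative.

print(translate_frame('ATGGCATAA'))  # -> 'MA*' (TAA rendered as the stop symbol '*')
print(translate_frame('ATGNNTGGA'))  # -> 'MXG' (ambiguous codon NNT falls back to 'X')
print(translate_frame('ATGGC'))      # -> 'M'   (the trailing 'GC' is ignored)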
@@ -75,7 +98,7 @@ def select_longest_gene(sequences):
 
 
 def run_mafft_on_sequences(options, sequences, output_file):
-    print("Conducting MAFFT alignment.")
+    #print("Conducting MAFFT alignment.")
     """Run mafft on the given sequences and write to output file."""
     # Create a temporary input file for mafft
     with NamedTemporaryFile('w', delete=False) as temp_input_file:
@@ -106,7 +129,7 @@ def run_mafft_on_sequences(options, sequences, output_file):
 
 
 
-def read_separate_files(input_dir, name_split, combined_out):
+def read_separate_files(input_dir, name_split, gene_ident, combined_out, translate):
     with open(combined_out, 'w') as combined_out_file:
         for gff_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
             genome_name = os.path.basename(gff_file).split(name_split)[0]
@@ -121,7 +144,7 @@ def read_separate_files(input_dir, name_split, combined_out):
             for line in lines:
                 line_data = line.split('\t')
                 if len(line_data) == 9:
-                    if line_data[2]
+                    if any(gene_type in line_data[2] for gene_type in gene_ident):
                         contig = line_data[0]
                         feature = line_data[2]
                         strand = line_data[6]
@@ -158,12 +181,13 @@ def read_separate_files(input_dir, name_split, combined_out):
                         corrected_stop = max(len(fasta_dict[contig][0]) - int(start - 1), 1)
                         full_sequence = fasta_dict[contig][1]
                         cds_sequence = full_sequence[corrected_start:corrected_stop]
-
+                        if translate == True:
+                            cds_sequence = translate_frame(cds_sequence)
                         wrapped_sequence = '\n'.join([cds_sequence[i:i + 60] for i in range(0, len(cds_sequence), 60)])
                         combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
 
 
-def read_combined_files(input_dir, name_split, combined_out):
+def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate):
     with open(combined_out, 'w') as combined_out_file:
         for gff_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
             genome_name = os.path.basename(gff_file).split(name_split)[0]
@@ -186,7 +210,7 @@ def read_combined_files(input_dir, name_split, combined_out):
                 else:
                     line_data = line.split('\t')
                     if len(line_data) == 9:
-                        if line_data[2]
+                        if any(gene_type in line_data[2] for gene_type in gene_ident):
                             contig = line_data[0]
                             feature = line_data[2]
                             strand = line_data[6]
@@ -215,5 +239,91 @@ def read_combined_files(input_dir, name_split, combined_out):
                             full_sequence = fasta_dict[contig][1]
                             cds_sequence = full_sequence[corrected_start:corrected_stop]
 
+                            if translate == True:
+                                cds_sequence = translate_frame(cds_sequence)
                             wrapped_sequence = '\n'.join([cds_sequence[i:i + 60] for i in range(0, len(cds_sequence), 60)])
-                            combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+                            combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+
+
+def read_fasta_files(input_dir, name_split, combined_out, translate):
+    with open(combined_out, 'w') as combined_out_file:
+        for fasta_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
+            genome_name = os.path.basename(fasta_file).split(name_split)[0]
+            fasta_dict = collections.defaultdict(str)
+            with open(fasta_file, 'r') as file:
+                lines = file.readlines()
+                for line in lines:
+                    if line.startswith('>'):
+                        current_seq = line[1:].split()[0]
+                        fasta_dict[current_seq] = ''
+                    else:
+                        fasta_dict[current_seq] +=line.strip()
+            for id, seq in fasta_dict.items():
+                if translate == True:
+                    seq = translate_frame(seq)
+                wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                combined_out_file.write(f">{genome_name}|{id}\n{wrapped_sequence}\n")
+
+
+def write_groups(options,output_dir, key_order, cores, sequences, pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
+    # Create output directory if it doesn't exist
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    for key_prefix in key_order:
+        for key, values in cores.items():
+            if any(part in options.write_groups.split(',') for part in key.split('_')):
+                if key.startswith(key_prefix):
+                    for value in values:
+                        output_filename = f"{key}_{value}.fasta"
+                        if 'First' in key_prefix:
+                            sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
+                        else: # combined_pangenome_clusters_Second_sequences is None if reclustered isn't being used
+                            sequences_to_write = combined_pangenome_clusters_Second_sequences[value]
+                        # Write sequences to output file that are in the sequences dictionary
+                        with open(os.path.join(output_dir, output_filename), 'w') as outfile:
+                            for header in sequences_to_write:
+                                if header in sequences:
+                                    outfile.write(f">{header}\n")
+                                    wrapped_sequence = wrap_sequence(sequences[header])
+                                    outfile.write(f"{wrapped_sequence}\n")
+                                else:
+                                    if options.verbose == True:
+                                        print("Sequence " + header + " Not found in original_fasta file.")
+
+def process_gene_families(options, directory, output_file):
+    """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
+    concatenated_sequences = {}
+    output_file = directory.replace('Gene_Families_Output',output_file)
+
+    # Iterate over each gene family file
+    for gene_file in os.listdir(directory):
+        if gene_file.endswith('.fasta'):
+            gene_path = os.path.join(directory, gene_file)
+
+            # Read sequences from the gene family file
+            sequences = read_fasta(gene_path)
+
+            # Select the longest sequence for each genome
+            longest_sequences = select_longest_gene(sequences)
+
+            # Run mafft on the longest sequences
+            aligned_file = f"{gene_file}_aligned.fasta"
+            run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
+
+            # Read aligned sequences and concatenate them
+            aligned_sequences = read_fasta(aligned_file)
+            for genome, aligned_seq in aligned_sequences.items():
+                genome_name = genome.split('|')[0]
+                if genome_name not in concatenated_sequences:
+                    concatenated_sequences[genome_name] = ""
+                concatenated_sequences[genome_name] += aligned_seq
+
+            # Clean up aligned file
+            os.remove(aligned_file)
+
+    # Write the concatenated sequences to the output file
+    with open(output_file, 'w') as out:
+        for genome, sequence in concatenated_sequences.items():
+            out.write(f">{genome}\n")
+            wrapped_sequence = wrap_sequence(sequence, 60)
+            out.write(f"{wrapped_sequence}\n")