PyamilySeq 0.5.2__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Constants.py +1 -1
- PyamilySeq/PyamilySeq.py +33 -17
- PyamilySeq/PyamilySeq_Genus.py +84 -484
- PyamilySeq/PyamilySeq_Species.py +63 -514
- PyamilySeq/clusterings.py +324 -0
- PyamilySeq/utils.py +84 -1
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.6.0.dist-info}/METADATA +13 -10
- PyamilySeq-0.6.0.dist-info/RECORD +15 -0
- PyamilySeq-0.5.2.dist-info/RECORD +0 -14
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.6.0.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.6.0.dist-info}/WHEEL +0 -0
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.6.0.dist-info}/entry_points.txt +0 -0
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.6.0.dist-info}/top_level.txt +0 -0
PyamilySeq/PyamilySeq_Species.py
CHANGED
|
@@ -1,67 +1,21 @@
|
|
|
1
1
|
#from line_profiler_pycharm import profile
|
|
2
2
|
|
|
3
|
-
from collections import OrderedDict,defaultdict
|
|
4
3
|
import copy
|
|
5
4
|
import math
|
|
6
5
|
import sys
|
|
7
|
-
from tempfile import NamedTemporaryFile
|
|
8
6
|
|
|
9
7
|
|
|
10
8
|
|
|
11
9
|
try:
|
|
12
10
|
from .Constants import *
|
|
11
|
+
from .clusterings import *
|
|
13
12
|
from .utils import *
|
|
14
13
|
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
15
14
|
from Constants import *
|
|
15
|
+
from clusterings import *
|
|
16
16
|
from utils import *
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
def custom_sort_key(k, dict1, dict2):
|
|
20
|
-
return (len(dict1[k]), len(dict2[k]))
|
|
21
|
-
|
|
22
|
-
def sort_keys_by_values(dict1, dict2):
|
|
23
|
-
sorted_keys = sorted(dict1.keys(), key=lambda k: custom_sort_key(k, dict1, dict2), reverse=True)
|
|
24
|
-
return sorted_keys
|
|
25
|
-
|
|
26
|
-
def select_longest_gene(sequences):
|
|
27
|
-
"""Select the longest sequence for each genome."""
|
|
28
|
-
longest_sequences = {}
|
|
29
|
-
for seq_id, sequence in sequences.items():
|
|
30
|
-
genome = seq_id.split('|')[0] # Assuming genome name can be derived from the sequence ID
|
|
31
|
-
if genome not in longest_sequences or len(sequence) > len(longest_sequences[genome][1]):
|
|
32
|
-
longest_sequences[genome] = (seq_id, sequence)
|
|
33
|
-
return longest_sequences
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def run_mafft_on_sequences(options, sequences, output_file):
|
|
37
|
-
"""Run mafft on the given sequences and write to output file."""
|
|
38
|
-
# Create a temporary input file for mafft
|
|
39
|
-
with NamedTemporaryFile('w', delete=False) as temp_input_file:
|
|
40
|
-
for header, sequence in sequences.items():
|
|
41
|
-
temp_input_file.write(f">{header}\n{sequence}\n")
|
|
42
|
-
temp_input_file_path = temp_input_file.name
|
|
43
|
-
|
|
44
|
-
# Run mafft
|
|
45
|
-
try:
|
|
46
|
-
with open(output_file, 'w') as output_f:
|
|
47
|
-
if options.verbose == 'True':
|
|
48
|
-
subprocess.run(
|
|
49
|
-
['mafft', '--auto', temp_input_file_path],
|
|
50
|
-
stdout=output_f,
|
|
51
|
-
stderr=sys.stderr,
|
|
52
|
-
check=True
|
|
53
|
-
)
|
|
54
|
-
else:
|
|
55
|
-
subprocess.run(
|
|
56
|
-
['mafft', '--auto', temp_input_file_path],
|
|
57
|
-
stdout=output_f,
|
|
58
|
-
stderr=subprocess.DEVNULL, # Suppress stderr
|
|
59
|
-
check=True
|
|
60
|
-
)
|
|
61
|
-
finally:
|
|
62
|
-
os.remove(temp_input_file_path) # Clean up the temporary file
|
|
63
|
-
|
|
64
|
-
|
|
65
19
|
def process_gene_families(options, directory, output_file):
|
|
66
20
|
"""Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
|
|
67
21
|
concatenated_sequences = {}
|
|
@@ -102,8 +56,10 @@ def process_gene_families(options, directory, output_file):
|
|
|
102
56
|
|
|
103
57
|
def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
|
|
104
58
|
print("Outputting gene_presence_absence file")
|
|
105
|
-
|
|
106
|
-
|
|
59
|
+
output_dir = os.path.abspath(options.output_dir)
|
|
60
|
+
in_name = options.clusters.split('.')[0].split('/')[-1]
|
|
61
|
+
gpa_outfile = os.path.join(output_dir, in_name)
|
|
62
|
+
gpa_outfile = open(gpa_outfile+'_gene_presence_absence.csv','w')
|
|
107
63
|
gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","'
|
|
108
64
|
'"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
|
|
109
65
|
gpa_outfile.write('","'.join(genome_dict.keys()))
|
|
@@ -113,14 +69,17 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
|
|
|
113
69
|
gpa_outfile.write('"group_'+str(cluster)+'","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
|
|
114
70
|
'","","","","","","","","",""')
|
|
115
71
|
|
|
116
|
-
|
|
72
|
+
|
|
117
73
|
for genome in genome_dict.keys():
|
|
74
|
+
full_out = ''
|
|
118
75
|
tmp_list = []
|
|
119
76
|
for value in sequences:
|
|
120
77
|
if value.split('|')[0] == genome:
|
|
121
78
|
tmp_list.append(value)
|
|
122
79
|
if tmp_list:
|
|
123
80
|
full_out += ',"'+''.join(tmp_list)+'"'
|
|
81
|
+
else:
|
|
82
|
+
full_out = ',""'
|
|
124
83
|
gpa_outfile.write(full_out)
|
|
125
84
|
gpa_outfile.write('\n')
|
|
126
85
|
|
|
@@ -137,31 +96,7 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
|
|
|
137
96
|
# edge_list_outfile.write(line + '\n')
|
|
138
97
|
|
|
139
98
|
|
|
140
|
-
def wrap_sequence(sequence, width=60):
|
|
141
|
-
wrapped_sequence = []
|
|
142
|
-
for i in range(0, len(sequence), width):
|
|
143
|
-
wrapped_sequence.append(sequence[i:i + width])
|
|
144
|
-
return "\n".join(wrapped_sequence)
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
def read_fasta(fasta_file):
|
|
148
|
-
sequences = {}
|
|
149
|
-
current_sequence = None
|
|
150
|
-
with open(fasta_file, 'r') as file:
|
|
151
|
-
for line in file:
|
|
152
|
-
line = line.strip()
|
|
153
|
-
if not line:
|
|
154
|
-
continue
|
|
155
|
-
if line.startswith('>'):
|
|
156
|
-
current_sequence = line[1:]
|
|
157
|
-
sequences[current_sequence] = ''
|
|
158
|
-
else:
|
|
159
|
-
sequences[current_sequence] += line
|
|
160
|
-
return sequences
|
|
161
|
-
|
|
162
99
|
|
|
163
|
-
def reorder_dict_by_keys(original_dict, sorted_keys):
|
|
164
|
-
return {k: original_dict[k] for k in sorted_keys}
|
|
165
100
|
|
|
166
101
|
def get_cores(options,genome_dict):
|
|
167
102
|
##Calculate core groups
|
|
@@ -172,409 +107,89 @@ def get_cores(options,genome_dict):
|
|
|
172
107
|
for group in options.core_groups.split(','):
|
|
173
108
|
calculated_floor = math.floor(int(group) / 100 * len(genome_dict))
|
|
174
109
|
if first == False:
|
|
175
|
-
# Ensure no overlap
|
|
176
|
-
# if calculated_floor <= prev_top:
|
|
177
|
-
# calculated_floor = prev_top - 1
|
|
178
|
-
|
|
179
110
|
groups[group] = (calculated_floor,prev_top)
|
|
180
111
|
else:
|
|
181
112
|
groups[group] = (calculated_floor, prev_top)
|
|
182
113
|
first = False
|
|
183
114
|
prev_top = calculated_floor
|
|
184
|
-
first_core_group = '
|
|
115
|
+
first_core_group = 'First_core_' + group
|
|
185
116
|
cores[first_core_group] = []
|
|
186
117
|
if options.reclustered != None:
|
|
187
118
|
extended_core_group = 'extended_core_' + group
|
|
188
119
|
cores[extended_core_group] = []
|
|
189
120
|
combined_core_group = 'combined_core_' + group
|
|
190
121
|
cores[combined_core_group] = []
|
|
191
|
-
second_core_group = '
|
|
122
|
+
second_core_group = 'Second_core_' + group
|
|
192
123
|
cores[second_core_group] = []
|
|
193
|
-
only_second_core_group = '
|
|
124
|
+
only_second_core_group = 'only_Second_core_' + group
|
|
194
125
|
cores[only_second_core_group] = []
|
|
195
126
|
return cores, groups
|
|
196
127
|
|
|
197
128
|
#@profile
|
|
198
|
-
def calc_First_only_core(cluster,
|
|
129
|
+
def calc_First_only_core(cluster, First_num, groups, cores):
|
|
199
130
|
groups_as_list = list(groups.values())
|
|
200
|
-
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <=
|
|
131
|
+
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num <= fir):
|
|
201
132
|
res = idx
|
|
202
133
|
family_group = list(groups)[res]
|
|
203
|
-
cores['
|
|
134
|
+
cores['First_core_'+family_group].append(cluster)
|
|
204
135
|
|
|
205
136
|
#@profile
|
|
206
|
-
def calc_single_First_extended_Second_only_core(
|
|
137
|
+
def calc_single_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count gene families extended with StORFs
|
|
207
138
|
groups_as_list = list(groups.values())
|
|
208
|
-
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <=
|
|
139
|
+
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num+Second_num <= fir):
|
|
209
140
|
res = idx
|
|
210
141
|
family_group = list(groups)[res]
|
|
211
|
-
cores['extended_core_' + family_group].append(
|
|
142
|
+
cores['extended_core_' + family_group].append(cluster)
|
|
212
143
|
|
|
213
144
|
|
|
214
145
|
#@profile
|
|
215
|
-
def calc_multi_First_extended_Second_only_core(
|
|
146
|
+
def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
|
|
216
147
|
groups_as_list = list(groups.values())
|
|
217
|
-
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <=
|
|
148
|
+
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num+Second_num <= fir):
|
|
218
149
|
res = idx
|
|
219
150
|
family_group = list(groups)[res]
|
|
220
|
-
cores['combined_core_' + family_group]
|
|
151
|
+
cores['combined_core_' + family_group].append(cluster)
|
|
221
152
|
|
|
222
153
|
|
|
223
154
|
#@profile
|
|
224
|
-
def calc_Second_only_core(groups, cores
|
|
155
|
+
def calc_Second_only_core(cluster, Second_num, groups, cores):
|
|
225
156
|
groups_as_list = list(groups.values())
|
|
226
|
-
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <=
|
|
157
|
+
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= Second_num <= fir):
|
|
227
158
|
res = idx
|
|
228
159
|
family_group = list(groups)[res]
|
|
229
|
-
cores['
|
|
160
|
+
cores['Second_core_' + family_group].append(cluster)
|
|
230
161
|
|
|
231
162
|
#@profile
|
|
232
|
-
def calc_only_Second_only_core(groups, cores
|
|
163
|
+
def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count the true storf onlies
|
|
233
164
|
groups_as_list = list(groups.values())
|
|
234
|
-
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <=
|
|
165
|
+
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= Second_num <= fir):
|
|
235
166
|
res = idx
|
|
236
167
|
family_group = list(groups)[res]
|
|
237
|
-
cores['
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
#@profile
|
|
244
|
-
def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered):
|
|
245
|
-
num_clustered_First = defaultdict(list)
|
|
246
|
-
pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
|
|
247
|
-
list_of_reps = list(reps.keys())
|
|
248
|
-
for cluster, pep_genomes in pangenome_clusters_First.items():
|
|
249
|
-
rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
|
|
250
|
-
Com_PEP_Genomes = 0
|
|
251
|
-
Seconds = 0
|
|
252
|
-
seen_Seconds = []
|
|
253
|
-
added_Second_genomes = 0
|
|
254
|
-
try: # get the cluster from the storf clusters which contains this rep
|
|
255
|
-
clustered_combined = combined_pangenome_clusters_First_Second_clustered[rep] # Not true clusters - I put a PEP as key myself
|
|
256
|
-
seen_clust_Genomes = []
|
|
257
|
-
num_clustered_First[cluster].append(rep + '_' + str(len(pep_genomes)))
|
|
258
|
-
for clust in clustered_combined:
|
|
259
|
-
if options.sequence_tag not in clust: # Not good enough at the moment
|
|
260
|
-
clust_Genome = clust.split('|')[0]
|
|
261
|
-
if clust_Genome not in seen_clust_Genomes:
|
|
262
|
-
seen_clust_Genomes.append(clust_Genome)
|
|
263
|
-
if clust_Genome not in pep_genomes:
|
|
264
|
-
Com_PEP_Genomes += 1
|
|
265
|
-
num_clustered_First[cluster].append(clust + '_' + str(reps[clust][1]))
|
|
266
|
-
elif options.sequence_tag in clust:
|
|
267
|
-
Seconds += 1
|
|
268
|
-
clust_Genome = clust.split('|')[0]
|
|
269
|
-
if clust_Genome not in seen_Seconds:
|
|
270
|
-
seen_Seconds.append(clust_Genome)
|
|
271
|
-
if clust_Genome not in seen_clust_Genomes:
|
|
272
|
-
seen_clust_Genomes.append(clust_Genome)
|
|
273
|
-
if clust_Genome not in pep_genomes:
|
|
274
|
-
added_Second_genomes += 1
|
|
275
|
-
else:
|
|
276
|
-
sys.exit("Error: looking for sequence_tag")
|
|
277
|
-
|
|
278
|
-
size_of_pep_clusters = []
|
|
279
|
-
peps = num_clustered_First[cluster]
|
|
280
|
-
for pep in peps:
|
|
281
|
-
pep = pep.rsplit('_', 1)
|
|
282
|
-
size_of_pep_clusters.append(int(pep[1]))
|
|
283
|
-
pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_pep_clusters),
|
|
284
|
-
size_of_pep_clusters, added_Second_genomes, Seconds, len(seen_Seconds)]
|
|
285
|
-
|
|
286
|
-
except KeyError:
|
|
287
|
-
###Singleton
|
|
288
|
-
num_pep_genomes = [len(pep_genomes)]
|
|
289
|
-
pangenome_clusters_Type[cluster] = [1, len(pep_genomes), num_pep_genomes, added_Second_genomes, Seconds,
|
|
290
|
-
len(seen_Seconds)]
|
|
291
|
-
|
|
292
|
-
return pangenome_clusters_Type
|
|
293
|
-
|
|
294
|
-
#@profile
|
|
295
|
-
def single_clustering_counting(options, pangenome_clusters_First, reps):
|
|
296
|
-
num_clustered_First = defaultdict(list)
|
|
297
|
-
recorded_First = []
|
|
298
|
-
pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
|
|
299
|
-
list_of_reps = list(reps.keys())
|
|
300
|
-
for cluster, First_genomes in pangenome_clusters_First.items():
|
|
301
|
-
rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
|
|
302
|
-
|
|
303
|
-
try: # get the cluster from the storf clusters which contains this rep
|
|
304
|
-
num_clustered_First[cluster].append(rep + '_' + str(len(First_genomes)))
|
|
305
|
-
size_of_First_clusters = []
|
|
306
|
-
Firsts = num_clustered_First[cluster]
|
|
307
|
-
for First in Firsts:
|
|
308
|
-
First = First.rsplit('_', 1)
|
|
309
|
-
size_of_First_clusters.append(int(First[1]))
|
|
310
|
-
recorded_First.append(First[0])
|
|
311
|
-
pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_First_clusters),
|
|
312
|
-
size_of_First_clusters, 0, 0, 0]
|
|
313
|
-
|
|
314
|
-
except KeyError:
|
|
315
|
-
###Singleton
|
|
316
|
-
num_pep_genomes = [len(First_genomes)]
|
|
317
|
-
pangenome_clusters_Type[cluster] = [1, len(First_genomes), num_pep_genomes, 0, 0, 0]
|
|
318
|
-
|
|
319
|
-
return pangenome_clusters_Type
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
#@profile
|
|
324
|
-
def combined_clustering_CDHIT(options, genome_dict):
|
|
325
|
-
unique_genomes = []
|
|
326
|
-
Second_in = open(options.reclustered, 'r')
|
|
327
|
-
combined_pangenome_clusters_First = OrderedDict()
|
|
328
|
-
combined_pangenome_clusters_First_sequences = OrderedDict()
|
|
329
|
-
combined_pangenome_clusters_Second = OrderedDict()
|
|
330
|
-
combined_pangenome_clusters_Second_sequences = OrderedDict()
|
|
331
|
-
combined_pangenome_clusters_First_Second_clustered = OrderedDict()
|
|
332
|
-
|
|
333
|
-
not_Second_only_cluster_ids = []
|
|
334
|
-
already_seen_PEP = []
|
|
335
|
-
Combined_clusters = OrderedDict()
|
|
336
|
-
Combined_reps = OrderedDict()
|
|
337
|
-
first = True
|
|
338
|
-
for line in Second_in:
|
|
339
|
-
if line.startswith('>'):
|
|
340
|
-
if first == False:
|
|
341
|
-
cluster_size = len(Combined_clusters[cluster_id])
|
|
342
|
-
Combined_reps.update({rep: cluster_size})
|
|
343
|
-
for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
|
|
344
|
-
if pep != []:
|
|
345
|
-
if pep in already_seen_PEP:
|
|
346
|
-
continue
|
|
347
|
-
else:
|
|
348
|
-
already_seen_PEP.append(pep)
|
|
349
|
-
if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
|
|
350
|
-
if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
|
|
351
|
-
all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
|
|
352
|
-
storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
|
|
353
|
-
VALUE = all_but_first + storfs_clustered
|
|
354
|
-
else:
|
|
355
|
-
VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
|
|
356
|
-
KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
|
|
357
|
-
combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
|
|
358
|
-
cluster_id = line.strip('>')
|
|
359
|
-
cluster_id = cluster_id.strip('\n')
|
|
360
|
-
cluster_id = cluster_id.split(' ')[1]
|
|
361
|
-
Combined_clusters.update({cluster_id: []})
|
|
362
|
-
combined_pangenome_clusters_First.update({cluster_id: []})
|
|
363
|
-
combined_pangenome_clusters_First_sequences.update({cluster_id: []})
|
|
364
|
-
combined_pangenome_clusters_Second.update({cluster_id: []})
|
|
365
|
-
combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
|
|
366
|
-
|
|
367
|
-
first = False
|
|
368
|
-
else:
|
|
369
|
-
clustered = line.split('\t')[1]
|
|
370
|
-
clustered = clustered.split('>')[1]
|
|
371
|
-
clustered = clustered.split('...')[0]
|
|
372
|
-
genome = clustered.split('|')[0]
|
|
373
|
-
genome_dict[genome] += 1
|
|
374
|
-
if '*' in line:
|
|
375
|
-
rep = clustered
|
|
376
|
-
Combined_reps.update({rep: 0})
|
|
377
|
-
if first == False:
|
|
378
|
-
Combined_clusters[cluster_id].append(clustered)
|
|
379
|
-
clustered_genome = clustered.split('|')[0]
|
|
380
|
-
if options.sequence_tag in line:
|
|
381
|
-
if clustered_genome not in combined_pangenome_clusters_Second[cluster_id]:
|
|
382
|
-
combined_pangenome_clusters_Second[cluster_id].append(clustered_genome)
|
|
383
|
-
combined_pangenome_clusters_Second_sequences[cluster_id].append(clustered)
|
|
384
|
-
else:
|
|
385
|
-
if cluster_id not in not_Second_only_cluster_ids:
|
|
386
|
-
not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
|
|
387
|
-
if clustered_genome not in combined_pangenome_clusters_First[cluster_id]:
|
|
388
|
-
combined_pangenome_clusters_First[cluster_id].append(clustered_genome)
|
|
389
|
-
combined_pangenome_clusters_First_sequences[cluster_id].append(clustered)
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
|
|
393
|
-
|
|
394
|
-
def combined_clustering_Edge_List(options, genome_dict):
|
|
395
|
-
if options.cluster_format == 'TSV':
|
|
396
|
-
separator = '\t'
|
|
397
|
-
elif options.cluster_format == 'CSV':
|
|
398
|
-
separator = ','
|
|
399
|
-
unique_genomes = []
|
|
400
|
-
cluster_id = 0
|
|
401
|
-
last_rep = ''
|
|
402
|
-
Second_in = open(options.reclustered, 'r')
|
|
403
|
-
combined_pangenome_clusters_First = OrderedDict()
|
|
404
|
-
combined_pangenome_clusters_First_sequences = OrderedDict()
|
|
405
|
-
combined_pangenome_clusters_Second = OrderedDict()
|
|
406
|
-
combined_pangenome_clusters_Second_sequences = OrderedDict()
|
|
407
|
-
combined_pangenome_clusters_First_Second_clustered = OrderedDict()
|
|
408
|
-
|
|
409
|
-
not_Second_only_cluster_ids = []
|
|
410
|
-
already_seen_PEP = []
|
|
411
|
-
Combined_clusters = OrderedDict()
|
|
412
|
-
Combined_reps = OrderedDict()
|
|
413
|
-
first = True
|
|
414
|
-
for line in Second_in:
|
|
415
|
-
rep, child = line.strip().split(separator)
|
|
416
|
-
child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
|
|
417
|
-
|
|
418
|
-
if first == True:
|
|
419
|
-
Combined_clusters.update({cluster_id: []})
|
|
420
|
-
combined_pangenome_clusters_First.update({cluster_id: []})
|
|
421
|
-
combined_pangenome_clusters_First_sequences.update({cluster_id: []})
|
|
422
|
-
combined_pangenome_clusters_Second.update({cluster_id: []})
|
|
423
|
-
combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
|
|
424
|
-
Combined_reps.update({rep: 0})
|
|
425
|
-
first = False
|
|
426
|
-
|
|
427
|
-
if first == False:
|
|
428
|
-
if rep != last_rep and last_rep != '':
|
|
429
|
-
cluster_size = len(Combined_clusters[cluster_id])
|
|
430
|
-
Combined_reps.update({rep: cluster_size})
|
|
431
|
-
for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
|
|
432
|
-
if pep != []:
|
|
433
|
-
if pep in already_seen_PEP:
|
|
434
|
-
continue
|
|
435
|
-
else:
|
|
436
|
-
already_seen_PEP.append(pep)
|
|
437
|
-
if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
|
|
438
|
-
if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
|
|
439
|
-
all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
|
|
440
|
-
storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
|
|
441
|
-
VALUE = all_but_first + storfs_clustered
|
|
442
|
-
else:
|
|
443
|
-
VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
|
|
444
|
-
KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
|
|
445
|
-
combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
|
|
446
|
-
|
|
447
|
-
cluster_id += 1
|
|
448
|
-
Combined_clusters.update({cluster_id: []})
|
|
449
|
-
combined_pangenome_clusters_First.update({cluster_id: []})
|
|
450
|
-
combined_pangenome_clusters_First_sequences.update({cluster_id: []})
|
|
451
|
-
combined_pangenome_clusters_Second.update({cluster_id: []})
|
|
452
|
-
combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
|
|
453
|
-
Combined_reps.update({rep: 0})
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
Combined_clusters[cluster_id].append(child)
|
|
457
|
-
if options.sequence_tag in line:
|
|
458
|
-
if child_genome not in combined_pangenome_clusters_Second[cluster_id]:
|
|
459
|
-
combined_pangenome_clusters_Second[cluster_id].append(child_genome)
|
|
460
|
-
combined_pangenome_clusters_Second_sequences[cluster_id].append(child)
|
|
461
|
-
else:
|
|
462
|
-
if cluster_id not in not_Second_only_cluster_ids:
|
|
463
|
-
not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
|
|
464
|
-
if child_genome not in combined_pangenome_clusters_First[cluster_id]:
|
|
465
|
-
combined_pangenome_clusters_First[cluster_id].append(child_genome)
|
|
466
|
-
combined_pangenome_clusters_First_sequences[cluster_id].append(child)
|
|
467
|
-
|
|
468
|
-
last_rep = rep
|
|
469
|
-
|
|
470
|
-
return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
def cluster_EdgeList(options):
|
|
474
|
-
if options.cluster_format == 'TSV':
|
|
475
|
-
separator = '\t'
|
|
476
|
-
elif options.cluster_format == 'CSV':
|
|
477
|
-
separator = ','
|
|
478
|
-
cluster_id = 0
|
|
479
|
-
last_rep = ''
|
|
480
|
-
first = True
|
|
481
|
-
First_in = open(options.clusters, 'r')
|
|
482
|
-
pangenome_clusters_First = OrderedDict()
|
|
483
|
-
pangenome_clusters_First_sequences = OrderedDict()
|
|
484
|
-
genome_dict = defaultdict(int)
|
|
485
|
-
reps = OrderedDict()
|
|
486
|
-
for line in First_in:
|
|
487
|
-
rep, child = line.strip().split(separator)
|
|
488
|
-
child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
|
|
489
|
-
# Counting occurrences of genomes
|
|
490
|
-
genome_dict[child_genome] += 1
|
|
491
|
-
if first == True:
|
|
492
|
-
pangenome_clusters_First[0] = []
|
|
493
|
-
pangenome_clusters_First_sequences[0] = []
|
|
494
|
-
first = False
|
|
495
|
-
|
|
496
|
-
if rep != last_rep and last_rep != '':
|
|
497
|
-
cluster_id +=1
|
|
498
|
-
pangenome_clusters_First[cluster_id] = []
|
|
499
|
-
pangenome_clusters_First_sequences[cluster_id] = []
|
|
500
|
-
cluster_size = len(pangenome_clusters_First_sequences[cluster_id-1])
|
|
501
|
-
reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[cluster_id-1])]})
|
|
502
|
-
pangenome_clusters_First[cluster_id] = []
|
|
503
|
-
pangenome_clusters_First_sequences[cluster_id] = []
|
|
504
|
-
if child_genome not in pangenome_clusters_First[cluster_id]:
|
|
505
|
-
pangenome_clusters_First[cluster_id].append(child_genome)
|
|
506
|
-
|
|
507
|
-
pangenome_clusters_First_sequences[cluster_id].append(child)
|
|
508
|
-
last_rep = rep
|
|
509
|
-
cluster_size = len(pangenome_clusters_First_sequences[cluster_id])
|
|
510
|
-
reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
|
|
168
|
+
cores['only_Second_core_' + family_group].append(cluster)
|
|
514
169
|
|
|
515
170
|
|
|
516
171
|
|
|
517
|
-
def cluster_CDHIT(options):
|
|
518
|
-
First_in = open(options.clusters, 'r')
|
|
519
|
-
clusters = OrderedDict()
|
|
520
|
-
pangenome_clusters_First = OrderedDict()
|
|
521
|
-
pangenome_clusters_First_sequences = OrderedDict()
|
|
522
|
-
first = True
|
|
523
|
-
genome_dict = defaultdict(int)
|
|
524
|
-
reps = OrderedDict()
|
|
525
|
-
## Load in all data for easier reuse later
|
|
526
|
-
for line in First_in:
|
|
527
|
-
if line.startswith('>'):
|
|
528
|
-
if first == False:
|
|
529
|
-
cluster_size = len(clusters[cluster_id])
|
|
530
|
-
reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
|
|
531
|
-
cluster_id = line.strip('>')
|
|
532
|
-
cluster_id = cluster_id.strip('\n')
|
|
533
|
-
cluster_id = cluster_id.split(' ')[1]
|
|
534
|
-
clusters.update({cluster_id: []})
|
|
535
|
-
pangenome_clusters_First.update({cluster_id: []})
|
|
536
|
-
pangenome_clusters_First_sequences.update({cluster_id: []})
|
|
537
|
-
|
|
538
|
-
first = False
|
|
539
|
-
else:
|
|
540
|
-
clustered = line.split('\t')[1]
|
|
541
|
-
clustered = clustered.split('>')[1]
|
|
542
|
-
clustered = clustered.split('...')[0]
|
|
543
|
-
genome = clustered.split('|')[0]
|
|
544
|
-
genome_dict[genome] += 1
|
|
545
|
-
if '*' in line:
|
|
546
|
-
rep = clustered
|
|
547
|
-
reps.update({rep: [0, 0]})
|
|
548
|
-
if first == False:
|
|
549
|
-
clusters[cluster_id].append(clustered)
|
|
550
|
-
clustered_genome = clustered.split('|')[0]
|
|
551
|
-
if clustered_genome not in pangenome_clusters_First[cluster_id]:
|
|
552
|
-
pangenome_clusters_First[cluster_id].append(clustered_genome)
|
|
553
|
-
pangenome_clusters_First_sequences[cluster_id].append(clustered)
|
|
554
|
-
return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
|
|
555
|
-
|
|
556
172
|
#@profile
|
|
557
173
|
def cluster(options):
|
|
558
174
|
|
|
559
175
|
if options.cluster_format == 'CD-HIT':
|
|
560
|
-
genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options)
|
|
176
|
+
genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '|')
|
|
561
177
|
elif options.cluster_format in ['TSV','CSV']:
|
|
562
|
-
genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options)
|
|
178
|
+
genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '|')
|
|
563
179
|
|
|
564
|
-
|
|
180
|
+
###
|
|
565
181
|
cores, groups = get_cores(options, genome_dict)
|
|
566
182
|
###
|
|
567
183
|
|
|
568
184
|
if options.reclustered != None:
|
|
569
185
|
if options.cluster_format == 'CD-HIT':
|
|
570
|
-
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered)
|
|
186
|
+
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_CDHIT(options, genome_dict, '|')
|
|
187
|
+
if options.cluster_format == ['TSV','CSV']:
|
|
188
|
+
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_Edge_List(options, '|')
|
|
189
|
+
|
|
190
|
+
pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, '|')
|
|
576
191
|
else:
|
|
577
|
-
pangenome_clusters_Type = single_clustering_counting(
|
|
192
|
+
pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)
|
|
578
193
|
|
|
579
194
|
|
|
580
195
|
|
|
@@ -588,21 +203,17 @@ def cluster(options):
|
|
|
588
203
|
print("Calculating Groups")
|
|
589
204
|
for cluster, numbers in pangenome_clusters_Type_sorted.items():
|
|
590
205
|
############################### Calculate First only
|
|
591
|
-
#if numbers[0] == 1 and numbers[1] >=2:
|
|
592
206
|
calc_First_only_core(cluster, numbers[1],groups,cores)
|
|
593
207
|
|
|
594
|
-
# elif numbers[0] >1 and numbers[1] >=2:
|
|
595
|
-
# calc_First_only_core(cluster, numbers[2][0],groups,cores)
|
|
596
|
-
|
|
597
|
-
|
|
598
208
|
if options.reclustered != None:
|
|
599
209
|
############################# Calculate First and Reclustered-Second
|
|
600
210
|
if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
|
|
601
|
-
calc_single_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
|
|
211
|
+
calc_single_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
|
|
602
212
|
elif numbers[0] > 1 and numbers[3] >= 1: # If unique Secondss combined multiple Firsts
|
|
603
|
-
calc_multi_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
|
|
213
|
+
calc_multi_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
|
|
604
214
|
elif numbers[4] >= 1:
|
|
605
215
|
Number_Of_Second_Extending_But_Same_Genomes += 1
|
|
216
|
+
|
|
606
217
|
combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
|
|
607
218
|
combined_pangenome_clusters_Second_Type = defaultdict(list)
|
|
608
219
|
for cluster, genomes in combined_pangenome_clusters_Second.items():
|
|
@@ -611,26 +222,34 @@ def cluster(options):
|
|
|
611
222
|
else:
|
|
612
223
|
combined_pangenome_clusters_ONLY_Second_Type[cluster] = [cluster, len(genomes)]
|
|
613
224
|
for cluster, data in combined_pangenome_clusters_Second_Type.items():
|
|
614
|
-
|
|
225
|
+
if data[1] >= 1:
|
|
226
|
+
calc_Second_only_core(cluster, data[1], groups, cores)
|
|
615
227
|
for cluster, data in combined_pangenome_clusters_ONLY_Second_Type.items():
|
|
616
|
-
if data[1] >=
|
|
617
|
-
calc_only_Second_only_core(
|
|
228
|
+
if data[1] >= 1:
|
|
229
|
+
calc_only_Second_only_core(cluster, data[1], groups, cores)
|
|
618
230
|
###########################
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
231
|
+
### Output
|
|
232
|
+
output_path = os.path.abspath(options.output_dir)
|
|
233
|
+
stats_out = os.path.join(output_path,'summary_statistics.txt')
|
|
234
|
+
key_order = ['First_core_', 'extended_core_', 'combined_core_', 'Second_core_','only_Second_core_']
|
|
235
|
+
with open(stats_out, 'w') as outfile:
|
|
236
|
+
print("Gene Groups:")
|
|
237
|
+
outfile.write("Gene Groups:\n")
|
|
238
|
+
for key_prefix in key_order:
|
|
239
|
+
for key, value in cores.items():
|
|
240
|
+
if key.startswith(key_prefix):
|
|
241
|
+
print(f"{key}: {len(value)}")
|
|
242
|
+
outfile.write(f"{key}: {len(value)}\n")
|
|
243
|
+
print("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
|
|
244
|
+
outfile.write("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
|
|
626
245
|
|
|
627
246
|
if options.gene_presence_absence_out != None:
|
|
628
247
|
gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
|
|
629
248
|
|
|
630
249
|
if options.write_families != None and options.fasta != None:
|
|
631
250
|
sequences = read_fasta(options.fasta)
|
|
632
|
-
|
|
633
|
-
output_dir = os.path.join(
|
|
251
|
+
output_dir = os.path.dirname(os.path.abspath(options.output_dir))
|
|
252
|
+
output_dir = os.path.join(output_dir, 'Gene_Families_Output')
|
|
634
253
|
|
|
635
254
|
# Create output directory if it doesn't exist
|
|
636
255
|
if not os.path.exists(output_dir):
|
|
@@ -651,79 +270,9 @@ def cluster(options):
|
|
|
651
270
|
outfile.write(f"{wrapped_sequence}\n")
|
|
652
271
|
|
|
653
272
|
if options.con_core != None and options.fasta != None and options.write_families != None:
|
|
654
|
-
process_gene_families(options, os.path.join(
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
# groups_dir = os.path.join(input_dir, 'Gene_Families_Output')
|
|
658
|
-
# """Run mafft on all .fasta files in the given directory."""
|
|
659
|
-
# for filename in os.listdir(groups_dir):
|
|
660
|
-
# if filename.endswith('.fasta'):
|
|
661
|
-
# input_path = os.path.join(groups_dir, filename)
|
|
662
|
-
# output_filename = filename.replace('.fasta', '_mafft.aln')
|
|
663
|
-
# output_path = os.path.join(groups_dir, output_filename)
|
|
664
|
-
#
|
|
665
|
-
# # Call mafft command
|
|
666
|
-
# try:
|
|
667
|
-
# with open(output_path, 'w') as output_file:
|
|
668
|
-
# subprocess.run(
|
|
669
|
-
# ['mafft', '--auto', input_path],
|
|
670
|
-
# stdout=output_file,
|
|
671
|
-
# stderr=subprocess.DEVNULL, # Suppress stderr
|
|
672
|
-
# check=True
|
|
673
|
-
# )
|
|
674
|
-
# print(f"Processed {input_path} -> {output_path}")
|
|
675
|
-
# except subprocess.CalledProcessError as e:
|
|
676
|
-
# print(f"Failed to process {input_path}: {e}")
|
|
677
|
-
|
|
678
|
-
##This could be run once and not above AND here..
|
|
679
|
-
# output_dir = os.path.dirname(os.path.abspath(options.clusters))
|
|
680
|
-
# sequences = read_fasta(options.fasta)
|
|
681
|
-
# concatenated_sequences = {genome: '' for genome in genome_dict.keys()}
|
|
682
|
-
#
|
|
683
|
-
#
|
|
684
|
-
# for key_prefix in key_order:
|
|
685
|
-
# for key, values in cores.items():
|
|
686
|
-
# if any(part in options.con_core.split(',') for part in key.split('_')):
|
|
687
|
-
# if key.startswith(key_prefix):
|
|
688
|
-
# for value in values:
|
|
689
|
-
# length_capture = {genome: [] for genome in genome_dict.keys()}
|
|
690
|
-
# sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
|
|
691
|
-
# for header in sequences_to_write:
|
|
692
|
-
# if header in sequences:
|
|
693
|
-
# length_capture[header.split('|')[0]].append([header,len(sequences[header])])
|
|
694
|
-
# if all(bool(values) for values in length_capture.values()): # If a GF is not present in 'ALL' genomes, do not add to concat
|
|
695
|
-
# for genome, lengths in length_capture.items():
|
|
696
|
-
# max_value = float('-inf')
|
|
697
|
-
# max_item = None
|
|
698
|
-
# for length in lengths:
|
|
699
|
-
# current_value = length[1]
|
|
700
|
-
# if current_value > max_value:
|
|
701
|
-
# max_value = current_value
|
|
702
|
-
# max_item = length[0]
|
|
703
|
-
# concatenated_sequences[genome.split('|')[0]] += sequences[max_item]
|
|
704
|
-
#
|
|
705
|
-
#
|
|
706
|
-
# with open(os.path.join(output_dir, 'core_concat.fasta'), 'w') as outfile:
|
|
707
|
-
# for genome, sequence in concatenated_sequences.items():
|
|
708
|
-
# outfile.write(f">{genome}\n")
|
|
709
|
-
# wrapped_sequence = wrap_sequence(sequence)
|
|
710
|
-
# outfile.write(f"{wrapped_sequence}\n")
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
# for core_gene_family in core_gene_families:
|
|
714
|
-
# found_sequences = {genome: False for genome in genomes}
|
|
715
|
-
#
|
|
716
|
-
# for fasta_file in fasta_files:f
|
|
717
|
-
# sequences = read_fasta(fasta_file)
|
|
718
|
-
# for header, sequence in sequences.items():
|
|
719
|
-
# genome = header.split('|')[0]
|
|
720
|
-
# if genome in genomes and core_gene_family in header:
|
|
721
|
-
# concatenated_sequences[genome] += sequence
|
|
722
|
-
# found_sequences[genome] = True
|
|
723
|
-
#
|
|
724
|
-
# for genome in genomes:
|
|
725
|
-
# if not found_sequences[genome]:
|
|
726
|
-
# concatenated_sequences[genome] += '-' * len(next(iter(sequences.values())))
|
|
273
|
+
process_gene_families(options, os.path.join(output_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
|
|
274
|
+
|
|
275
|
+
|
|
727
276
|
|
|
728
277
|
|
|
729
278
|
|