PyamilySeq 0.5.2__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Constants.py +1 -1
- PyamilySeq/PyamilySeq.py +100 -53
- PyamilySeq/PyamilySeq_Genus.py +139 -556
- PyamilySeq/PyamilySeq_Species.py +140 -584
- PyamilySeq/Seq_Combiner.py +26 -7
- PyamilySeq/clusterings.py +362 -0
- PyamilySeq/utils.py +199 -6
- PyamilySeq-0.7.0.dist-info/METADATA +251 -0
- PyamilySeq-0.7.0.dist-info/RECORD +14 -0
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.7.0.dist-info}/WHEEL +1 -1
- PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py +0 -600
- PyamilySeq-0.5.2.dist-info/METADATA +0 -144
- PyamilySeq-0.5.2.dist-info/RECORD +0 -14
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.7.0.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.7.0.dist-info}/entry_points.txt +0 -0
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.7.0.dist-info}/top_level.txt +0 -0
PyamilySeq/PyamilySeq_Species.py
CHANGED
|
@@ -1,109 +1,23 @@
|
|
|
1
1
|
#from line_profiler_pycharm import profile
|
|
2
2
|
|
|
3
|
-
from collections import OrderedDict,defaultdict
|
|
4
|
-
import copy
|
|
5
3
|
import math
|
|
6
|
-
import sys
|
|
7
|
-
from tempfile import NamedTemporaryFile
|
|
8
|
-
|
|
9
|
-
|
|
10
4
|
|
|
11
5
|
try:
|
|
12
6
|
from .Constants import *
|
|
7
|
+
from .clusterings import *
|
|
13
8
|
from .utils import *
|
|
14
9
|
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
15
10
|
from Constants import *
|
|
11
|
+
from clusterings import *
|
|
16
12
|
from utils import *
|
|
17
13
|
|
|
18
14
|
|
|
19
|
-
def custom_sort_key(k, dict1, dict2):
|
|
20
|
-
return (len(dict1[k]), len(dict2[k]))
|
|
21
|
-
|
|
22
|
-
def sort_keys_by_values(dict1, dict2):
|
|
23
|
-
sorted_keys = sorted(dict1.keys(), key=lambda k: custom_sort_key(k, dict1, dict2), reverse=True)
|
|
24
|
-
return sorted_keys
|
|
25
|
-
|
|
26
|
-
def select_longest_gene(sequences):
|
|
27
|
-
"""Select the longest sequence for each genome."""
|
|
28
|
-
longest_sequences = {}
|
|
29
|
-
for seq_id, sequence in sequences.items():
|
|
30
|
-
genome = seq_id.split('|')[0] # Assuming genome name can be derived from the sequence ID
|
|
31
|
-
if genome not in longest_sequences or len(sequence) > len(longest_sequences[genome][1]):
|
|
32
|
-
longest_sequences[genome] = (seq_id, sequence)
|
|
33
|
-
return longest_sequences
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def run_mafft_on_sequences(options, sequences, output_file):
|
|
37
|
-
"""Run mafft on the given sequences and write to output file."""
|
|
38
|
-
# Create a temporary input file for mafft
|
|
39
|
-
with NamedTemporaryFile('w', delete=False) as temp_input_file:
|
|
40
|
-
for header, sequence in sequences.items():
|
|
41
|
-
temp_input_file.write(f">{header}\n{sequence}\n")
|
|
42
|
-
temp_input_file_path = temp_input_file.name
|
|
43
|
-
|
|
44
|
-
# Run mafft
|
|
45
|
-
try:
|
|
46
|
-
with open(output_file, 'w') as output_f:
|
|
47
|
-
if options.verbose == 'True':
|
|
48
|
-
subprocess.run(
|
|
49
|
-
['mafft', '--auto', temp_input_file_path],
|
|
50
|
-
stdout=output_f,
|
|
51
|
-
stderr=sys.stderr,
|
|
52
|
-
check=True
|
|
53
|
-
)
|
|
54
|
-
else:
|
|
55
|
-
subprocess.run(
|
|
56
|
-
['mafft', '--auto', temp_input_file_path],
|
|
57
|
-
stdout=output_f,
|
|
58
|
-
stderr=subprocess.DEVNULL, # Suppress stderr
|
|
59
|
-
check=True
|
|
60
|
-
)
|
|
61
|
-
finally:
|
|
62
|
-
os.remove(temp_input_file_path) # Clean up the temporary file
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def process_gene_families(options, directory, output_file):
|
|
66
|
-
"""Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
|
|
67
|
-
concatenated_sequences = {}
|
|
68
|
-
output_file = directory.replace('Gene_Families_Output',output_file)
|
|
69
|
-
|
|
70
|
-
# Iterate over each gene family file
|
|
71
|
-
for gene_file in os.listdir(directory):
|
|
72
|
-
if gene_file.endswith('.fasta'):
|
|
73
|
-
gene_path = os.path.join(directory, gene_file)
|
|
74
|
-
|
|
75
|
-
# Read sequences from the gene family file
|
|
76
|
-
sequences = read_fasta(gene_path)
|
|
77
|
-
|
|
78
|
-
# Select the longest sequence for each genome
|
|
79
|
-
longest_sequences = select_longest_gene(sequences)
|
|
80
|
-
|
|
81
|
-
# Run mafft on the longest sequences
|
|
82
|
-
aligned_file = f"{gene_file}_aligned.fasta"
|
|
83
|
-
run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
|
|
84
|
-
|
|
85
|
-
# Read aligned sequences and concatenate them
|
|
86
|
-
aligned_sequences = read_fasta(aligned_file)
|
|
87
|
-
for genome, aligned_seq in aligned_sequences.items():
|
|
88
|
-
genome_name = genome.split('|')[0]
|
|
89
|
-
if genome_name not in concatenated_sequences:
|
|
90
|
-
concatenated_sequences[genome_name] = ""
|
|
91
|
-
concatenated_sequences[genome_name] += aligned_seq
|
|
92
|
-
|
|
93
|
-
# Clean up aligned file
|
|
94
|
-
os.remove(aligned_file)
|
|
95
|
-
|
|
96
|
-
# Write the concatenated sequences to the output file
|
|
97
|
-
with open(output_file, 'w') as out:
|
|
98
|
-
for genome, sequence in concatenated_sequences.items():
|
|
99
|
-
out.write(f">{genome}\n")
|
|
100
|
-
wrapped_sequence = wrap_sequence(sequence, 60)
|
|
101
|
-
out.write(f"{wrapped_sequence}\n")
|
|
102
|
-
|
|
103
15
|
def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
|
|
104
16
|
print("Outputting gene_presence_absence file")
|
|
105
|
-
|
|
106
|
-
|
|
17
|
+
output_dir = os.path.abspath(options.output_dir)
|
|
18
|
+
in_name = options.clusters.split('.')[0].split('/')[-1]
|
|
19
|
+
gpa_outfile = os.path.join(output_dir, in_name)
|
|
20
|
+
gpa_outfile = open(gpa_outfile+'_gene_presence_absence.csv','w')
|
|
107
21
|
gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","'
|
|
108
22
|
'"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
|
|
109
23
|
gpa_outfile.write('","'.join(genome_dict.keys()))
|
|
@@ -113,14 +27,17 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
|
|
|
113
27
|
gpa_outfile.write('"group_'+str(cluster)+'","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
|
|
114
28
|
'","","","","","","","","",""')
|
|
115
29
|
|
|
116
|
-
|
|
30
|
+
|
|
117
31
|
for genome in genome_dict.keys():
|
|
32
|
+
full_out = ''
|
|
118
33
|
tmp_list = []
|
|
119
34
|
for value in sequences:
|
|
120
35
|
if value.split('|')[0] == genome:
|
|
121
36
|
tmp_list.append(value)
|
|
122
37
|
if tmp_list:
|
|
123
38
|
full_out += ',"'+''.join(tmp_list)+'"'
|
|
39
|
+
else:
|
|
40
|
+
full_out = ',""'
|
|
124
41
|
gpa_outfile.write(full_out)
|
|
125
42
|
gpa_outfile.write('\n')
|
|
126
43
|
|
|
@@ -137,31 +54,7 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
|
|
|
137
54
|
# edge_list_outfile.write(line + '\n')
|
|
138
55
|
|
|
139
56
|
|
|
140
|
-
def wrap_sequence(sequence, width=60):
|
|
141
|
-
wrapped_sequence = []
|
|
142
|
-
for i in range(0, len(sequence), width):
|
|
143
|
-
wrapped_sequence.append(sequence[i:i + width])
|
|
144
|
-
return "\n".join(wrapped_sequence)
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
def read_fasta(fasta_file):
|
|
148
|
-
sequences = {}
|
|
149
|
-
current_sequence = None
|
|
150
|
-
with open(fasta_file, 'r') as file:
|
|
151
|
-
for line in file:
|
|
152
|
-
line = line.strip()
|
|
153
|
-
if not line:
|
|
154
|
-
continue
|
|
155
|
-
if line.startswith('>'):
|
|
156
|
-
current_sequence = line[1:]
|
|
157
|
-
sequences[current_sequence] = ''
|
|
158
|
-
else:
|
|
159
|
-
sequences[current_sequence] += line
|
|
160
|
-
return sequences
|
|
161
|
-
|
|
162
57
|
|
|
163
|
-
def reorder_dict_by_keys(original_dict, sorted_keys):
|
|
164
|
-
return {k: original_dict[k] for k in sorted_keys}
|
|
165
58
|
|
|
166
59
|
def get_cores(options,genome_dict):
|
|
167
60
|
##Calculate core groups
|
|
@@ -172,409 +65,93 @@ def get_cores(options,genome_dict):
|
|
|
172
65
|
for group in options.core_groups.split(','):
|
|
173
66
|
calculated_floor = math.floor(int(group) / 100 * len(genome_dict))
|
|
174
67
|
if first == False:
|
|
175
|
-
# Ensure no overlap
|
|
176
|
-
# if calculated_floor <= prev_top:
|
|
177
|
-
# calculated_floor = prev_top - 1
|
|
178
|
-
|
|
179
68
|
groups[group] = (calculated_floor,prev_top)
|
|
180
69
|
else:
|
|
181
70
|
groups[group] = (calculated_floor, prev_top)
|
|
182
71
|
first = False
|
|
183
72
|
prev_top = calculated_floor
|
|
184
|
-
first_core_group = '
|
|
73
|
+
first_core_group = 'First_core_' + group
|
|
185
74
|
cores[first_core_group] = []
|
|
186
75
|
if options.reclustered != None:
|
|
187
76
|
extended_core_group = 'extended_core_' + group
|
|
188
77
|
cores[extended_core_group] = []
|
|
189
78
|
combined_core_group = 'combined_core_' + group
|
|
190
79
|
cores[combined_core_group] = []
|
|
191
|
-
second_core_group = '
|
|
80
|
+
second_core_group = 'Second_core_' + group
|
|
192
81
|
cores[second_core_group] = []
|
|
193
|
-
only_second_core_group = '
|
|
82
|
+
only_second_core_group = 'only_Second_core_' + group
|
|
194
83
|
cores[only_second_core_group] = []
|
|
195
84
|
return cores, groups
|
|
196
85
|
|
|
197
86
|
#@profile
|
|
198
|
-
def calc_First_only_core(cluster,
|
|
87
|
+
def calc_First_only_core(cluster, First_num, groups, cores):
|
|
199
88
|
groups_as_list = list(groups.values())
|
|
200
|
-
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <=
|
|
89
|
+
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= int(First_num) <= fir):
|
|
201
90
|
res = idx
|
|
202
91
|
family_group = list(groups)[res]
|
|
203
|
-
cores['
|
|
92
|
+
cores['First_core_'+family_group].append(cluster)
|
|
204
93
|
|
|
205
94
|
#@profile
|
|
206
|
-
def calc_single_First_extended_Second_only_core(
|
|
95
|
+
def calc_single_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count gene families extended with StORFs
|
|
207
96
|
groups_as_list = list(groups.values())
|
|
208
|
-
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <=
|
|
97
|
+
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num+Second_num <= fir):
|
|
209
98
|
res = idx
|
|
99
|
+
|
|
210
100
|
family_group = list(groups)[res]
|
|
211
|
-
cores['extended_core_' + family_group].append(
|
|
101
|
+
cores['extended_core_' + family_group].append(cluster)
|
|
212
102
|
|
|
213
103
|
|
|
214
104
|
#@profile
|
|
215
|
-
def calc_multi_First_extended_Second_only_core(
|
|
105
|
+
def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
|
|
216
106
|
groups_as_list = list(groups.values())
|
|
217
|
-
|
|
218
|
-
|
|
107
|
+
# Looping through the list to find the matching condition
|
|
108
|
+
for idx, (sec, fir) in enumerate(groups_as_list):
|
|
109
|
+
if sec <= First_num + Second_num <= fir:
|
|
110
|
+
res = idx
|
|
111
|
+
break
|
|
219
112
|
family_group = list(groups)[res]
|
|
220
|
-
cores['combined_core_' + family_group]
|
|
113
|
+
cores['combined_core_' + family_group].append(cluster)
|
|
221
114
|
|
|
222
115
|
|
|
223
116
|
#@profile
|
|
224
|
-
def calc_Second_only_core(groups, cores
|
|
117
|
+
def calc_Second_only_core(cluster, Second_num, groups, cores):
|
|
225
118
|
groups_as_list = list(groups.values())
|
|
226
|
-
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <=
|
|
119
|
+
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= Second_num <= fir):
|
|
227
120
|
res = idx
|
|
228
121
|
family_group = list(groups)[res]
|
|
229
|
-
cores['
|
|
122
|
+
cores['Second_core_' + family_group].append(cluster)
|
|
230
123
|
|
|
231
124
|
#@profile
|
|
232
|
-
def calc_only_Second_only_core(groups, cores
|
|
125
|
+
def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count the true storf onlies
|
|
233
126
|
groups_as_list = list(groups.values())
|
|
234
|
-
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <=
|
|
127
|
+
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= Second_num <= fir):
|
|
235
128
|
res = idx
|
|
236
129
|
family_group = list(groups)[res]
|
|
237
|
-
cores['
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
#@profile
|
|
244
|
-
def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered):
|
|
245
|
-
num_clustered_First = defaultdict(list)
|
|
246
|
-
pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
|
|
247
|
-
list_of_reps = list(reps.keys())
|
|
248
|
-
for cluster, pep_genomes in pangenome_clusters_First.items():
|
|
249
|
-
rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
|
|
250
|
-
Com_PEP_Genomes = 0
|
|
251
|
-
Seconds = 0
|
|
252
|
-
seen_Seconds = []
|
|
253
|
-
added_Second_genomes = 0
|
|
254
|
-
try: # get the cluster from the storf clusters which contains this rep
|
|
255
|
-
clustered_combined = combined_pangenome_clusters_First_Second_clustered[rep] # Not true clusters - I put a PEP as key myself
|
|
256
|
-
seen_clust_Genomes = []
|
|
257
|
-
num_clustered_First[cluster].append(rep + '_' + str(len(pep_genomes)))
|
|
258
|
-
for clust in clustered_combined:
|
|
259
|
-
if options.sequence_tag not in clust: # Not good enough at the moment
|
|
260
|
-
clust_Genome = clust.split('|')[0]
|
|
261
|
-
if clust_Genome not in seen_clust_Genomes:
|
|
262
|
-
seen_clust_Genomes.append(clust_Genome)
|
|
263
|
-
if clust_Genome not in pep_genomes:
|
|
264
|
-
Com_PEP_Genomes += 1
|
|
265
|
-
num_clustered_First[cluster].append(clust + '_' + str(reps[clust][1]))
|
|
266
|
-
elif options.sequence_tag in clust:
|
|
267
|
-
Seconds += 1
|
|
268
|
-
clust_Genome = clust.split('|')[0]
|
|
269
|
-
if clust_Genome not in seen_Seconds:
|
|
270
|
-
seen_Seconds.append(clust_Genome)
|
|
271
|
-
if clust_Genome not in seen_clust_Genomes:
|
|
272
|
-
seen_clust_Genomes.append(clust_Genome)
|
|
273
|
-
if clust_Genome not in pep_genomes:
|
|
274
|
-
added_Second_genomes += 1
|
|
275
|
-
else:
|
|
276
|
-
sys.exit("Error: looking for sequence_tag")
|
|
277
|
-
|
|
278
|
-
size_of_pep_clusters = []
|
|
279
|
-
peps = num_clustered_First[cluster]
|
|
280
|
-
for pep in peps:
|
|
281
|
-
pep = pep.rsplit('_', 1)
|
|
282
|
-
size_of_pep_clusters.append(int(pep[1]))
|
|
283
|
-
pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_pep_clusters),
|
|
284
|
-
size_of_pep_clusters, added_Second_genomes, Seconds, len(seen_Seconds)]
|
|
285
|
-
|
|
286
|
-
except KeyError:
|
|
287
|
-
###Singleton
|
|
288
|
-
num_pep_genomes = [len(pep_genomes)]
|
|
289
|
-
pangenome_clusters_Type[cluster] = [1, len(pep_genomes), num_pep_genomes, added_Second_genomes, Seconds,
|
|
290
|
-
len(seen_Seconds)]
|
|
291
|
-
|
|
292
|
-
return pangenome_clusters_Type
|
|
293
|
-
|
|
294
|
-
#@profile
|
|
295
|
-
def single_clustering_counting(options, pangenome_clusters_First, reps):
|
|
296
|
-
num_clustered_First = defaultdict(list)
|
|
297
|
-
recorded_First = []
|
|
298
|
-
pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
|
|
299
|
-
list_of_reps = list(reps.keys())
|
|
300
|
-
for cluster, First_genomes in pangenome_clusters_First.items():
|
|
301
|
-
rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
|
|
302
|
-
|
|
303
|
-
try: # get the cluster from the storf clusters which contains this rep
|
|
304
|
-
num_clustered_First[cluster].append(rep + '_' + str(len(First_genomes)))
|
|
305
|
-
size_of_First_clusters = []
|
|
306
|
-
Firsts = num_clustered_First[cluster]
|
|
307
|
-
for First in Firsts:
|
|
308
|
-
First = First.rsplit('_', 1)
|
|
309
|
-
size_of_First_clusters.append(int(First[1]))
|
|
310
|
-
recorded_First.append(First[0])
|
|
311
|
-
pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_First_clusters),
|
|
312
|
-
size_of_First_clusters, 0, 0, 0]
|
|
313
|
-
|
|
314
|
-
except KeyError:
|
|
315
|
-
###Singleton
|
|
316
|
-
num_pep_genomes = [len(First_genomes)]
|
|
317
|
-
pangenome_clusters_Type[cluster] = [1, len(First_genomes), num_pep_genomes, 0, 0, 0]
|
|
318
|
-
|
|
319
|
-
return pangenome_clusters_Type
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
#@profile
|
|
324
|
-
def combined_clustering_CDHIT(options, genome_dict):
|
|
325
|
-
unique_genomes = []
|
|
326
|
-
Second_in = open(options.reclustered, 'r')
|
|
327
|
-
combined_pangenome_clusters_First = OrderedDict()
|
|
328
|
-
combined_pangenome_clusters_First_sequences = OrderedDict()
|
|
329
|
-
combined_pangenome_clusters_Second = OrderedDict()
|
|
330
|
-
combined_pangenome_clusters_Second_sequences = OrderedDict()
|
|
331
|
-
combined_pangenome_clusters_First_Second_clustered = OrderedDict()
|
|
332
|
-
|
|
333
|
-
not_Second_only_cluster_ids = []
|
|
334
|
-
already_seen_PEP = []
|
|
335
|
-
Combined_clusters = OrderedDict()
|
|
336
|
-
Combined_reps = OrderedDict()
|
|
337
|
-
first = True
|
|
338
|
-
for line in Second_in:
|
|
339
|
-
if line.startswith('>'):
|
|
340
|
-
if first == False:
|
|
341
|
-
cluster_size = len(Combined_clusters[cluster_id])
|
|
342
|
-
Combined_reps.update({rep: cluster_size})
|
|
343
|
-
for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
|
|
344
|
-
if pep != []:
|
|
345
|
-
if pep in already_seen_PEP:
|
|
346
|
-
continue
|
|
347
|
-
else:
|
|
348
|
-
already_seen_PEP.append(pep)
|
|
349
|
-
if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
|
|
350
|
-
if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
|
|
351
|
-
all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
|
|
352
|
-
storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
|
|
353
|
-
VALUE = all_but_first + storfs_clustered
|
|
354
|
-
else:
|
|
355
|
-
VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
|
|
356
|
-
KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
|
|
357
|
-
combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
|
|
358
|
-
cluster_id = line.strip('>')
|
|
359
|
-
cluster_id = cluster_id.strip('\n')
|
|
360
|
-
cluster_id = cluster_id.split(' ')[1]
|
|
361
|
-
Combined_clusters.update({cluster_id: []})
|
|
362
|
-
combined_pangenome_clusters_First.update({cluster_id: []})
|
|
363
|
-
combined_pangenome_clusters_First_sequences.update({cluster_id: []})
|
|
364
|
-
combined_pangenome_clusters_Second.update({cluster_id: []})
|
|
365
|
-
combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
|
|
366
|
-
|
|
367
|
-
first = False
|
|
368
|
-
else:
|
|
369
|
-
clustered = line.split('\t')[1]
|
|
370
|
-
clustered = clustered.split('>')[1]
|
|
371
|
-
clustered = clustered.split('...')[0]
|
|
372
|
-
genome = clustered.split('|')[0]
|
|
373
|
-
genome_dict[genome] += 1
|
|
374
|
-
if '*' in line:
|
|
375
|
-
rep = clustered
|
|
376
|
-
Combined_reps.update({rep: 0})
|
|
377
|
-
if first == False:
|
|
378
|
-
Combined_clusters[cluster_id].append(clustered)
|
|
379
|
-
clustered_genome = clustered.split('|')[0]
|
|
380
|
-
if options.sequence_tag in line:
|
|
381
|
-
if clustered_genome not in combined_pangenome_clusters_Second[cluster_id]:
|
|
382
|
-
combined_pangenome_clusters_Second[cluster_id].append(clustered_genome)
|
|
383
|
-
combined_pangenome_clusters_Second_sequences[cluster_id].append(clustered)
|
|
384
|
-
else:
|
|
385
|
-
if cluster_id not in not_Second_only_cluster_ids:
|
|
386
|
-
not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
|
|
387
|
-
if clustered_genome not in combined_pangenome_clusters_First[cluster_id]:
|
|
388
|
-
combined_pangenome_clusters_First[cluster_id].append(clustered_genome)
|
|
389
|
-
combined_pangenome_clusters_First_sequences[cluster_id].append(clustered)
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
|
|
393
|
-
|
|
394
|
-
def combined_clustering_Edge_List(options, genome_dict):
|
|
395
|
-
if options.cluster_format == 'TSV':
|
|
396
|
-
separator = '\t'
|
|
397
|
-
elif options.cluster_format == 'CSV':
|
|
398
|
-
separator = ','
|
|
399
|
-
unique_genomes = []
|
|
400
|
-
cluster_id = 0
|
|
401
|
-
last_rep = ''
|
|
402
|
-
Second_in = open(options.reclustered, 'r')
|
|
403
|
-
combined_pangenome_clusters_First = OrderedDict()
|
|
404
|
-
combined_pangenome_clusters_First_sequences = OrderedDict()
|
|
405
|
-
combined_pangenome_clusters_Second = OrderedDict()
|
|
406
|
-
combined_pangenome_clusters_Second_sequences = OrderedDict()
|
|
407
|
-
combined_pangenome_clusters_First_Second_clustered = OrderedDict()
|
|
408
|
-
|
|
409
|
-
not_Second_only_cluster_ids = []
|
|
410
|
-
already_seen_PEP = []
|
|
411
|
-
Combined_clusters = OrderedDict()
|
|
412
|
-
Combined_reps = OrderedDict()
|
|
413
|
-
first = True
|
|
414
|
-
for line in Second_in:
|
|
415
|
-
rep, child = line.strip().split(separator)
|
|
416
|
-
child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
|
|
417
|
-
|
|
418
|
-
if first == True:
|
|
419
|
-
Combined_clusters.update({cluster_id: []})
|
|
420
|
-
combined_pangenome_clusters_First.update({cluster_id: []})
|
|
421
|
-
combined_pangenome_clusters_First_sequences.update({cluster_id: []})
|
|
422
|
-
combined_pangenome_clusters_Second.update({cluster_id: []})
|
|
423
|
-
combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
|
|
424
|
-
Combined_reps.update({rep: 0})
|
|
425
|
-
first = False
|
|
426
|
-
|
|
427
|
-
if first == False:
|
|
428
|
-
if rep != last_rep and last_rep != '':
|
|
429
|
-
cluster_size = len(Combined_clusters[cluster_id])
|
|
430
|
-
Combined_reps.update({rep: cluster_size})
|
|
431
|
-
for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
|
|
432
|
-
if pep != []:
|
|
433
|
-
if pep in already_seen_PEP:
|
|
434
|
-
continue
|
|
435
|
-
else:
|
|
436
|
-
already_seen_PEP.append(pep)
|
|
437
|
-
if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
|
|
438
|
-
if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
|
|
439
|
-
all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
|
|
440
|
-
storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
|
|
441
|
-
VALUE = all_but_first + storfs_clustered
|
|
442
|
-
else:
|
|
443
|
-
VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
|
|
444
|
-
KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
|
|
445
|
-
combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
|
|
446
|
-
|
|
447
|
-
cluster_id += 1
|
|
448
|
-
Combined_clusters.update({cluster_id: []})
|
|
449
|
-
combined_pangenome_clusters_First.update({cluster_id: []})
|
|
450
|
-
combined_pangenome_clusters_First_sequences.update({cluster_id: []})
|
|
451
|
-
combined_pangenome_clusters_Second.update({cluster_id: []})
|
|
452
|
-
combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
|
|
453
|
-
Combined_reps.update({rep: 0})
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
Combined_clusters[cluster_id].append(child)
|
|
457
|
-
if options.sequence_tag in line:
|
|
458
|
-
if child_genome not in combined_pangenome_clusters_Second[cluster_id]:
|
|
459
|
-
combined_pangenome_clusters_Second[cluster_id].append(child_genome)
|
|
460
|
-
combined_pangenome_clusters_Second_sequences[cluster_id].append(child)
|
|
461
|
-
else:
|
|
462
|
-
if cluster_id not in not_Second_only_cluster_ids:
|
|
463
|
-
not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
|
|
464
|
-
if child_genome not in combined_pangenome_clusters_First[cluster_id]:
|
|
465
|
-
combined_pangenome_clusters_First[cluster_id].append(child_genome)
|
|
466
|
-
combined_pangenome_clusters_First_sequences[cluster_id].append(child)
|
|
130
|
+
cores['only_Second_core_' + family_group].append(cluster)
|
|
467
131
|
|
|
468
|
-
last_rep = rep
|
|
469
132
|
|
|
470
|
-
return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
def cluster_EdgeList(options):
|
|
474
|
-
if options.cluster_format == 'TSV':
|
|
475
|
-
separator = '\t'
|
|
476
|
-
elif options.cluster_format == 'CSV':
|
|
477
|
-
separator = ','
|
|
478
|
-
cluster_id = 0
|
|
479
|
-
last_rep = ''
|
|
480
|
-
first = True
|
|
481
|
-
First_in = open(options.clusters, 'r')
|
|
482
|
-
pangenome_clusters_First = OrderedDict()
|
|
483
|
-
pangenome_clusters_First_sequences = OrderedDict()
|
|
484
|
-
genome_dict = defaultdict(int)
|
|
485
|
-
reps = OrderedDict()
|
|
486
|
-
for line in First_in:
|
|
487
|
-
rep, child = line.strip().split(separator)
|
|
488
|
-
child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
|
|
489
|
-
# Counting occurrences of genomes
|
|
490
|
-
genome_dict[child_genome] += 1
|
|
491
|
-
if first == True:
|
|
492
|
-
pangenome_clusters_First[0] = []
|
|
493
|
-
pangenome_clusters_First_sequences[0] = []
|
|
494
|
-
first = False
|
|
495
|
-
|
|
496
|
-
if rep != last_rep and last_rep != '':
|
|
497
|
-
cluster_id +=1
|
|
498
|
-
pangenome_clusters_First[cluster_id] = []
|
|
499
|
-
pangenome_clusters_First_sequences[cluster_id] = []
|
|
500
|
-
cluster_size = len(pangenome_clusters_First_sequences[cluster_id-1])
|
|
501
|
-
reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[cluster_id-1])]})
|
|
502
|
-
pangenome_clusters_First[cluster_id] = []
|
|
503
|
-
pangenome_clusters_First_sequences[cluster_id] = []
|
|
504
|
-
if child_genome not in pangenome_clusters_First[cluster_id]:
|
|
505
|
-
pangenome_clusters_First[cluster_id].append(child_genome)
|
|
506
|
-
|
|
507
|
-
pangenome_clusters_First_sequences[cluster_id].append(child)
|
|
508
|
-
last_rep = rep
|
|
509
|
-
cluster_size = len(pangenome_clusters_First_sequences[cluster_id])
|
|
510
|
-
reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
def cluster_CDHIT(options):
|
|
518
|
-
First_in = open(options.clusters, 'r')
|
|
519
|
-
clusters = OrderedDict()
|
|
520
|
-
pangenome_clusters_First = OrderedDict()
|
|
521
|
-
pangenome_clusters_First_sequences = OrderedDict()
|
|
522
|
-
first = True
|
|
523
|
-
genome_dict = defaultdict(int)
|
|
524
|
-
reps = OrderedDict()
|
|
525
|
-
## Load in all data for easier reuse later
|
|
526
|
-
for line in First_in:
|
|
527
|
-
if line.startswith('>'):
|
|
528
|
-
if first == False:
|
|
529
|
-
cluster_size = len(clusters[cluster_id])
|
|
530
|
-
reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
|
|
531
|
-
cluster_id = line.strip('>')
|
|
532
|
-
cluster_id = cluster_id.strip('\n')
|
|
533
|
-
cluster_id = cluster_id.split(' ')[1]
|
|
534
|
-
clusters.update({cluster_id: []})
|
|
535
|
-
pangenome_clusters_First.update({cluster_id: []})
|
|
536
|
-
pangenome_clusters_First_sequences.update({cluster_id: []})
|
|
537
|
-
|
|
538
|
-
first = False
|
|
539
|
-
else:
|
|
540
|
-
clustered = line.split('\t')[1]
|
|
541
|
-
clustered = clustered.split('>')[1]
|
|
542
|
-
clustered = clustered.split('...')[0]
|
|
543
|
-
genome = clustered.split('|')[0]
|
|
544
|
-
genome_dict[genome] += 1
|
|
545
|
-
if '*' in line:
|
|
546
|
-
rep = clustered
|
|
547
|
-
reps.update({rep: [0, 0]})
|
|
548
|
-
if first == False:
|
|
549
|
-
clusters[cluster_id].append(clustered)
|
|
550
|
-
clustered_genome = clustered.split('|')[0]
|
|
551
|
-
if clustered_genome not in pangenome_clusters_First[cluster_id]:
|
|
552
|
-
pangenome_clusters_First[cluster_id].append(clustered_genome)
|
|
553
|
-
pangenome_clusters_First_sequences[cluster_id].append(clustered)
|
|
554
|
-
return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
|
|
555
133
|
|
|
556
134
|
#@profile
|
|
557
135
|
def cluster(options):
|
|
558
136
|
|
|
559
137
|
if options.cluster_format == 'CD-HIT':
|
|
560
|
-
genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options)
|
|
561
|
-
elif options.cluster_format
|
|
562
|
-
genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options)
|
|
138
|
+
genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '|')
|
|
139
|
+
elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
|
|
140
|
+
genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '|')
|
|
563
141
|
|
|
564
|
-
|
|
142
|
+
###
|
|
565
143
|
cores, groups = get_cores(options, genome_dict)
|
|
566
144
|
###
|
|
567
145
|
|
|
568
146
|
if options.reclustered != None:
|
|
569
147
|
if options.cluster_format == 'CD-HIT':
|
|
570
|
-
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second
|
|
574
|
-
|
|
575
|
-
pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered)
|
|
148
|
+
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genome_dict, '|')
|
|
149
|
+
elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
|
|
150
|
+
#Fix
|
|
151
|
+
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,combined_pangenome_clusters_Second_sequences = combined_clustering_Edge_List(options, '|')
|
|
152
|
+
pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, '|')
|
|
576
153
|
else:
|
|
577
|
-
pangenome_clusters_Type = single_clustering_counting(
|
|
154
|
+
pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)
|
|
578
155
|
|
|
579
156
|
|
|
580
157
|
|
|
@@ -586,23 +163,30 @@ def cluster(options):
|
|
|
586
163
|
pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)
|
|
587
164
|
|
|
588
165
|
print("Calculating Groups")
|
|
166
|
+
seen_groupings = []
|
|
589
167
|
for cluster, numbers in pangenome_clusters_Type_sorted.items():
|
|
590
168
|
############################### Calculate First only
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
169
|
+
cluster = str(cluster)
|
|
170
|
+
if '78' in cluster:
|
|
171
|
+
pass
|
|
172
|
+
for grouping in numbers[2]: #!!# Could do with a more elegant solution
|
|
173
|
+
current_cluster = grouping[0].split(':')[0]
|
|
174
|
+
if current_cluster not in seen_groupings:
|
|
175
|
+
seen_groupings.append(current_cluster)
|
|
176
|
+
current_cluster_size = grouping[0].split(':')[1]
|
|
177
|
+
calc_First_only_core(current_cluster, current_cluster_size,groups,cores)
|
|
178
|
+
############################# Calculate First and Reclustered-Second
|
|
179
|
+
if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
|
|
180
|
+
calc_single_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
|
|
181
|
+
elif numbers[0] > 1 and numbers[3] >= 1: # If unique Seconds combined multiple Firsts
|
|
182
|
+
calc_multi_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
|
|
183
|
+
elif numbers[4] >= 1:
|
|
184
|
+
Number_Of_Second_Extending_But_Same_Genomes += 1
|
|
185
|
+
else:
|
|
186
|
+
if options.verbose == True:
|
|
187
|
+
print("First cluster " + current_cluster + " already processed - This is likely because it was clustered with another First representative.")
|
|
597
188
|
|
|
598
189
|
if options.reclustered != None:
|
|
599
|
-
############################# Calculate First and Reclustered-Second
|
|
600
|
-
if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
|
|
601
|
-
calc_single_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
|
|
602
|
-
elif numbers[0] > 1 and numbers[3] >= 1: # If unique Secondss combined multiple Firsts
|
|
603
|
-
calc_multi_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
|
|
604
|
-
elif numbers[4] >= 1:
|
|
605
|
-
Number_Of_Second_Extending_But_Same_Genomes += 1
|
|
606
190
|
combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
|
|
607
191
|
combined_pangenome_clusters_Second_Type = defaultdict(list)
|
|
608
192
|
for cluster, genomes in combined_pangenome_clusters_Second.items():
|
|
@@ -611,120 +195,92 @@ def cluster(options):
|
|
|
611
195
|
else:
|
|
612
196
|
combined_pangenome_clusters_ONLY_Second_Type[cluster] = [cluster, len(genomes)]
|
|
613
197
|
for cluster, data in combined_pangenome_clusters_Second_Type.items():
|
|
614
|
-
|
|
198
|
+
if data[1] >= 1:
|
|
199
|
+
calc_Second_only_core(cluster, data[1], groups, cores)
|
|
615
200
|
for cluster, data in combined_pangenome_clusters_ONLY_Second_Type.items():
|
|
616
|
-
if data[1] >=
|
|
617
|
-
calc_only_Second_only_core(
|
|
201
|
+
if data[1] >= 1:
|
|
202
|
+
calc_only_Second_only_core(cluster, data[1], groups, cores)
|
|
618
203
|
###########################
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
204
|
+
### Output
|
|
205
|
+
output_path = os.path.abspath(options.output_dir)
|
|
206
|
+
if not os.path.exists(output_path):
|
|
207
|
+
os.makedirs(output_path)
|
|
208
|
+
stats_out = os.path.join(output_path,'summary_statistics.txt')
|
|
209
|
+
key_order = ['First_core_', 'extended_core_', 'combined_core_', 'Second_core_','only_Second_core_']
|
|
210
|
+
with open(stats_out, 'w') as outfile:
|
|
211
|
+
print("Gene Groups:")
|
|
212
|
+
outfile.write("Gene Groups:\n")
|
|
213
|
+
for key_prefix in key_order:
|
|
214
|
+
for key, value in cores.items():
|
|
215
|
+
if key.startswith(key_prefix):
|
|
216
|
+
print(f"{key}: {len(value)}")
|
|
217
|
+
outfile.write(f"{key}: {len(value)}\n")
|
|
218
|
+
print("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
|
|
219
|
+
outfile.write("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
|
|
220
|
+
if options.reclustered!= None:
|
|
221
|
+
print("Total Number of Second Gene Groups (Including Singletons): " + str(
|
|
222
|
+
len(combined_pangenome_clusters_Second_sequences)))
|
|
223
|
+
print("Total Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
|
|
224
|
+
Number_Of_Second_Extending_But_Same_Genomes))
|
|
225
|
+
outfile.write("\nTotal Number of Second Gene Groups (Including Singletons): " + str(
|
|
226
|
+
len(combined_pangenome_clusters_Second_sequences)))
|
|
227
|
+
outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
|
|
228
|
+
Number_Of_Second_Extending_But_Same_Genomes))
|
|
229
|
+
#Report number of first and second clusters and do the same for genus
|
|
627
230
|
if options.gene_presence_absence_out != None:
|
|
628
231
|
gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
|
|
629
232
|
|
|
630
|
-
if options.write_families != None and options.fasta != None:
|
|
631
|
-
sequences = read_fasta(options.fasta)
|
|
632
|
-
input_dir = os.path.dirname(os.path.abspath(options.clusters))
|
|
633
|
-
output_dir = os.path.join(input_dir, 'Gene_Families_Output')
|
|
634
233
|
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
os.makedirs(output_dir)
|
|
638
|
-
for key_prefix in key_order:
|
|
639
|
-
for key, values in cores.items():
|
|
640
|
-
if any(part in options.write_families.split(',') for part in key.split('_')):
|
|
641
|
-
if key.startswith(key_prefix):
|
|
642
|
-
for value in values:
|
|
643
|
-
output_filename = f"{key}_{value}.fasta"
|
|
644
|
-
sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
|
|
645
|
-
# Write sequences to output file that are in the sequences dictionary
|
|
646
|
-
with open(os.path.join(output_dir, output_filename), 'w') as outfile:
|
|
647
|
-
for header in sequences_to_write:
|
|
648
|
-
if header in sequences:
|
|
649
|
-
outfile.write(f">{header}\n")
|
|
650
|
-
wrapped_sequence = wrap_sequence(sequences[header])
|
|
651
|
-
outfile.write(f"{wrapped_sequence}\n")
|
|
652
|
-
|
|
653
|
-
if options.con_core != None and options.fasta != None and options.write_families != None:
|
|
654
|
-
process_gene_families(options, os.path.join(input_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
# groups_dir = os.path.join(input_dir, 'Gene_Families_Output')
|
|
658
|
-
# """Run mafft on all .fasta files in the given directory."""
|
|
659
|
-
# for filename in os.listdir(groups_dir):
|
|
660
|
-
# if filename.endswith('.fasta'):
|
|
661
|
-
# input_path = os.path.join(groups_dir, filename)
|
|
662
|
-
# output_filename = filename.replace('.fasta', '_mafft.aln')
|
|
663
|
-
# output_path = os.path.join(groups_dir, output_filename)
|
|
664
|
-
#
|
|
665
|
-
# # Call mafft command
|
|
666
|
-
# try:
|
|
667
|
-
# with open(output_path, 'w') as output_file:
|
|
668
|
-
# subprocess.run(
|
|
669
|
-
# ['mafft', '--auto', input_path],
|
|
670
|
-
# stdout=output_file,
|
|
671
|
-
# stderr=subprocess.DEVNULL, # Suppress stderr
|
|
672
|
-
# check=True
|
|
673
|
-
# )
|
|
674
|
-
# print(f"Processed {input_path} -> {output_path}")
|
|
675
|
-
# except subprocess.CalledProcessError as e:
|
|
676
|
-
# print(f"Failed to process {input_path}: {e}")
|
|
677
|
-
|
|
678
|
-
##This could be run once and not above AND here..
|
|
679
|
-
# output_dir = os.path.dirname(os.path.abspath(options.clusters))
|
|
680
|
-
# sequences = read_fasta(options.fasta)
|
|
681
|
-
# concatenated_sequences = {genome: '' for genome in genome_dict.keys()}
|
|
682
|
-
#
|
|
683
|
-
#
|
|
684
|
-
# for key_prefix in key_order:
|
|
685
|
-
# for key, values in cores.items():
|
|
686
|
-
# if any(part in options.con_core.split(',') for part in key.split('_')):
|
|
687
|
-
# if key.startswith(key_prefix):
|
|
688
|
-
# for value in values:
|
|
689
|
-
# length_capture = {genome: [] for genome in genome_dict.keys()}
|
|
690
|
-
# sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
|
|
691
|
-
# for header in sequences_to_write:
|
|
692
|
-
# if header in sequences:
|
|
693
|
-
# length_capture[header.split('|')[0]].append([header,len(sequences[header])])
|
|
694
|
-
# if all(bool(values) for values in length_capture.values()): # If a GF is not present in 'ALL' genomes, do not add to concat
|
|
695
|
-
# for genome, lengths in length_capture.items():
|
|
696
|
-
# max_value = float('-inf')
|
|
697
|
-
# max_item = None
|
|
698
|
-
# for length in lengths:
|
|
699
|
-
# current_value = length[1]
|
|
700
|
-
# if current_value > max_value:
|
|
701
|
-
# max_value = current_value
|
|
702
|
-
# max_item = length[0]
|
|
703
|
-
# concatenated_sequences[genome.split('|')[0]] += sequences[max_item]
|
|
704
|
-
#
|
|
705
|
-
#
|
|
706
|
-
# with open(os.path.join(output_dir, 'core_concat.fasta'), 'w') as outfile:
|
|
707
|
-
# for genome, sequence in concatenated_sequences.items():
|
|
708
|
-
# outfile.write(f">{genome}\n")
|
|
709
|
-
# wrapped_sequence = wrap_sequence(sequence)
|
|
710
|
-
# outfile.write(f"{wrapped_sequence}\n")
|
|
234
|
+
###Need to fix this below. If full/partial the ifs need to be different. If full we first need to output the gfs then align. If -write-groups is not provided then it needs
|
|
235
|
+
# to be done for alignment full anyway...
|
|
711
236
|
|
|
237
|
+
if options.run_mode == 'Full':
|
|
238
|
+
if options.reclustered == None:
|
|
239
|
+
combined_pangenome_clusters_Second_sequences = None
|
|
240
|
+
if options.write_groups != None:
|
|
241
|
+
print("Outputting gene group FASTA files")
|
|
242
|
+
sequences = read_fasta(options.fasta)
|
|
243
|
+
#output_dir = os.path.dirname(os.path.abspath(options.output_dir))
|
|
244
|
+
output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
|
|
245
|
+
write_groups(options,output_dir, key_order, cores, sequences,
|
|
246
|
+
pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
|
|
712
247
|
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
248
|
+
if options.align_core != None:
|
|
249
|
+
print("Processing gene group alignment")
|
|
250
|
+
process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')
|
|
251
|
+
|
|
252
|
+
elif options.run_mode == 'Partial':
|
|
253
|
+
if options.reclustered == None:
|
|
254
|
+
combined_pangenome_clusters_Second_sequences = None
|
|
255
|
+
if options.write_groups != None and options.fasta != None:
|
|
256
|
+
print("Outputting gene group FASTA files")
|
|
257
|
+
sequences = read_fasta(options.fasta)
|
|
258
|
+
#output_dir = os.path.dirname(os.path.abspath(options.output_dir))
|
|
259
|
+
output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
|
|
260
|
+
write_groups(options,output_dir, key_order, cores, sequences,
|
|
261
|
+
pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
|
|
727
262
|
|
|
263
|
+
if options.align_core != None:
|
|
264
|
+
print("Processing gene group alignment")
|
|
265
|
+
process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')
|
|
728
266
|
|
|
729
267
|
|
|
730
268
|
|
|
269
|
+
#
|
|
270
|
+
# if options.align_core != None:
|
|
271
|
+
# #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
|
|
272
|
+
# output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
|
|
273
|
+
# if not os.path.exists(output_dir):
|
|
274
|
+
# os.makedirs(output_dir)
|
|
275
|
+
# process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')
|
|
276
|
+
|
|
277
|
+
#
|
|
278
|
+
# elif options.run_mode == 'Partial':
|
|
279
|
+
# if options.align_core != None and options.fasta != None and options.write_groups != None:
|
|
280
|
+
# process_gene_families(options, os.path.join(output_dir, 'Gene_Families_Output'), 'concatenated_genes_aligned.fasta')
|
|
281
|
+
#
|
|
282
|
+
#
|
|
283
|
+
#
|
|
284
|
+
#
|
|
285
|
+
#
|
|
286
|
+
|