PyamilySeq 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Constants.py +1 -1
- PyamilySeq/PyamilySeq.py +43 -19
- PyamilySeq/PyamilySeq_Genus.py +84 -484
- PyamilySeq/PyamilySeq_Species.py +63 -514
- PyamilySeq/clusterings.py +324 -0
- PyamilySeq/utils.py +84 -1
- {PyamilySeq-0.5.1.dist-info → PyamilySeq-0.6.0.dist-info}/METADATA +52 -68
- PyamilySeq-0.6.0.dist-info/RECORD +15 -0
- PyamilySeq-0.5.1.dist-info/RECORD +0 -14
- {PyamilySeq-0.5.1.dist-info → PyamilySeq-0.6.0.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.5.1.dist-info → PyamilySeq-0.6.0.dist-info}/WHEEL +0 -0
- {PyamilySeq-0.5.1.dist-info → PyamilySeq-0.6.0.dist-info}/entry_points.txt +0 -0
- {PyamilySeq-0.5.1.dist-info → PyamilySeq-0.6.0.dist-info}/top_level.txt +0 -0
PyamilySeq/PyamilySeq_Genus.py
CHANGED
|
@@ -1,67 +1,21 @@
|
|
|
1
1
|
#from line_profiler_pycharm import profile
|
|
2
2
|
|
|
3
|
-
from collections import OrderedDict,defaultdict
|
|
4
3
|
import copy
|
|
5
|
-
import math
|
|
6
4
|
import sys
|
|
7
|
-
|
|
8
|
-
|
|
5
|
+
import math
|
|
6
|
+
from collections import Counter
|
|
9
7
|
|
|
10
8
|
|
|
11
9
|
try:
|
|
12
10
|
from .Constants import *
|
|
11
|
+
from .clusterings import *
|
|
13
12
|
from .utils import *
|
|
14
13
|
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
15
14
|
from Constants import *
|
|
15
|
+
from clusterings import *
|
|
16
16
|
from utils import *
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
def custom_sort_key(k, dict1, dict2):
|
|
20
|
-
return (len(dict1[k]), len(dict2[k]))
|
|
21
|
-
|
|
22
|
-
def sort_keys_by_values(dict1, dict2):
|
|
23
|
-
sorted_keys = sorted(dict1.keys(), key=lambda k: custom_sort_key(k, dict1, dict2), reverse=True)
|
|
24
|
-
return sorted_keys
|
|
25
|
-
|
|
26
|
-
def select_longest_gene(sequences):
|
|
27
|
-
"""Select the longest sequence for each genome."""
|
|
28
|
-
longest_sequences = {}
|
|
29
|
-
for seq_id, sequence in sequences.items():
|
|
30
|
-
genome = seq_id.split('|')[0] # Assuming genome name can be derived from the sequence ID
|
|
31
|
-
if genome not in longest_sequences or len(sequence) > len(longest_sequences[genome][1]):
|
|
32
|
-
longest_sequences[genome] = (seq_id, sequence)
|
|
33
|
-
return longest_sequences
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def run_mafft_on_sequences(options, sequences, output_file):
|
|
37
|
-
print("Conducting MAFFT alignment.")
|
|
38
|
-
"""Run mafft on the given sequences and write to output file."""
|
|
39
|
-
# Create a temporary input file for mafft
|
|
40
|
-
with NamedTemporaryFile('w', delete=False) as temp_input_file:
|
|
41
|
-
for header, sequence in sequences.items():
|
|
42
|
-
temp_input_file.write(f">{header}\n{sequence}\n")
|
|
43
|
-
temp_input_file_path = temp_input_file.name
|
|
44
|
-
|
|
45
|
-
# Run mafft
|
|
46
|
-
try:
|
|
47
|
-
with open(output_file, 'w') as output_f:
|
|
48
|
-
if options.verbose == True:
|
|
49
|
-
subprocess.run(
|
|
50
|
-
['mafft', '--auto', temp_input_file_path],
|
|
51
|
-
stdout=output_f,
|
|
52
|
-
stderr=sys.stderr,
|
|
53
|
-
check=True
|
|
54
|
-
)
|
|
55
|
-
else:
|
|
56
|
-
subprocess.run(
|
|
57
|
-
['mafft', '--auto', temp_input_file_path],
|
|
58
|
-
stdout=output_f,
|
|
59
|
-
stderr=subprocess.DEVNULL, # Suppress stderr
|
|
60
|
-
check=True
|
|
61
|
-
)
|
|
62
|
-
finally:
|
|
63
|
-
os.remove(temp_input_file_path) # Clean up the temporary file
|
|
64
|
-
|
|
65
19
|
|
|
66
20
|
def process_gene_families(options, directory, output_file):
|
|
67
21
|
"""Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
|
|
@@ -101,27 +55,32 @@ def process_gene_families(options, directory, output_file):
|
|
|
101
55
|
wrapped_sequence = wrap_sequence(sequence, 60)
|
|
102
56
|
out.write(f"{wrapped_sequence}\n")
|
|
103
57
|
|
|
104
|
-
def gene_presence_absence_output(options,
|
|
58
|
+
def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
|
|
105
59
|
print("Outputting gene_presence_absence file")
|
|
106
|
-
|
|
107
|
-
|
|
60
|
+
output_dir = os.path.abspath(options.output_dir)
|
|
61
|
+
in_name = options.clusters.split('.')[0].split('/')[-1]
|
|
62
|
+
gpa_outfile = os.path.join(output_dir, in_name)
|
|
63
|
+
gpa_outfile = open(gpa_outfile+'_gene_presence_absence.csv','w')
|
|
108
64
|
gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","'
|
|
109
65
|
'"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
|
|
110
|
-
gpa_outfile.write('","'.join(
|
|
66
|
+
gpa_outfile.write('","'.join(genus_dict.keys()))
|
|
111
67
|
gpa_outfile.write('"\n')
|
|
112
68
|
for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
|
|
113
69
|
average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
|
|
114
70
|
gpa_outfile.write('"group_'+str(cluster)+'","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
|
|
115
71
|
'","","","","","","","","",""')
|
|
116
72
|
|
|
117
|
-
|
|
118
|
-
for
|
|
73
|
+
|
|
74
|
+
for genus in genus_dict.keys():
|
|
75
|
+
full_out = ''
|
|
119
76
|
tmp_list = []
|
|
120
77
|
for value in sequences:
|
|
121
|
-
if value.split('
|
|
78
|
+
if value.split('_')[0] == genus:
|
|
122
79
|
tmp_list.append(value)
|
|
123
80
|
if tmp_list:
|
|
124
81
|
full_out += ',"'+''.join(tmp_list)+'"'
|
|
82
|
+
else:
|
|
83
|
+
full_out = ',""'
|
|
125
84
|
gpa_outfile.write(full_out)
|
|
126
85
|
gpa_outfile.write('\n')
|
|
127
86
|
|
|
@@ -138,446 +97,86 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
|
|
|
138
97
|
# edge_list_outfile.write(line + '\n')
|
|
139
98
|
|
|
140
99
|
|
|
141
|
-
def wrap_sequence(sequence, width=60):
|
|
142
|
-
wrapped_sequence = []
|
|
143
|
-
for i in range(0, len(sequence), width):
|
|
144
|
-
wrapped_sequence.append(sequence[i:i + width])
|
|
145
|
-
return "\n".join(wrapped_sequence)
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
def read_fasta(fasta_file):
|
|
149
|
-
sequences = {}
|
|
150
|
-
current_sequence = None
|
|
151
|
-
with open(fasta_file, 'r') as file:
|
|
152
|
-
for line in file:
|
|
153
|
-
line = line.strip()
|
|
154
|
-
if not line:
|
|
155
|
-
continue # Skip empty lines
|
|
156
|
-
if line.startswith('>'):
|
|
157
|
-
current_sequence = line[1:] # Remove '>' character
|
|
158
|
-
sequences[current_sequence] = ''
|
|
159
|
-
else:
|
|
160
|
-
sequences[current_sequence] += line
|
|
161
|
-
return sequences
|
|
162
|
-
|
|
163
100
|
|
|
164
|
-
def reorder_dict_by_keys(original_dict, sorted_keys):
|
|
165
|
-
return {k: original_dict[k] for k in sorted_keys}
|
|
166
101
|
|
|
167
|
-
def get_cores(options,
|
|
102
|
+
def get_cores(options,genus_dict):
|
|
168
103
|
##Calculate core groups
|
|
169
104
|
groups = OrderedDict()
|
|
170
105
|
cores = OrderedDict()
|
|
171
|
-
prev_top = len(genome_dict)
|
|
172
|
-
first = True
|
|
173
106
|
for group in options.core_groups.split(','):
|
|
174
|
-
|
|
175
|
-
if first == False:
|
|
176
|
-
# Ensure no overlap
|
|
177
|
-
# if calculated_floor <= prev_top:
|
|
178
|
-
# calculated_floor = prev_top - 1
|
|
179
|
-
|
|
180
|
-
groups[group] = (calculated_floor,prev_top)
|
|
181
|
-
else:
|
|
182
|
-
groups[group] = (calculated_floor, prev_top)
|
|
183
|
-
first = False
|
|
184
|
-
prev_top = calculated_floor
|
|
185
|
-
first_core_group = 'first_core_' + group
|
|
107
|
+
first_core_group = 'First_genera_' + group
|
|
186
108
|
cores[first_core_group] = []
|
|
187
109
|
if options.reclustered != None:
|
|
188
|
-
extended_core_group = '
|
|
110
|
+
extended_core_group = 'extended_genera_' + group
|
|
189
111
|
cores[extended_core_group] = []
|
|
190
|
-
combined_core_group = '
|
|
112
|
+
combined_core_group = 'combined_genera_' + group
|
|
191
113
|
cores[combined_core_group] = []
|
|
192
|
-
second_core_group = '
|
|
114
|
+
second_core_group = 'Second_genera_' + group
|
|
193
115
|
cores[second_core_group] = []
|
|
194
|
-
only_second_core_group = '
|
|
116
|
+
only_second_core_group = 'only_Second_genera_' + group
|
|
195
117
|
cores[only_second_core_group] = []
|
|
196
118
|
return cores, groups
|
|
197
119
|
|
|
198
|
-
#@profile
|
|
199
|
-
def calc_First_only_core(cluster, pep_num, groups, cores):
|
|
200
|
-
groups_as_list = list(groups.values())
|
|
201
|
-
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num <= fir):
|
|
202
|
-
res = idx
|
|
203
|
-
family_group = list(groups)[res]
|
|
204
|
-
cores['first_core_'+family_group].append(cluster)
|
|
205
120
|
|
|
206
121
|
#@profile
|
|
207
|
-
def
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
cores['extended_core_' + family_group].append(pep_num)
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
#@profile
|
|
216
|
-
def calc_multi_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
|
|
217
|
-
groups_as_list = list(groups.values())
|
|
218
|
-
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num+second_num <= fir):
|
|
219
|
-
res = idx
|
|
220
|
-
family_group = list(groups)[res]
|
|
221
|
-
cores['combined_core_' + family_group] += 1
|
|
222
|
-
|
|
223
|
-
|
|
122
|
+
def calc_First_only_core(cluster, First_number, cores):
|
|
123
|
+
try:
|
|
124
|
+
cores['First_genera_'+str(First_number)].append(cluster)
|
|
125
|
+
except KeyError:
|
|
126
|
+
cores['First_genera_>'].append(cluster)
|
|
224
127
|
#@profile
|
|
225
|
-
def
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
128
|
+
def calc_single_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count gene families extended with StORFs
|
|
129
|
+
group = First_num + Second_num
|
|
130
|
+
try:
|
|
131
|
+
cores['extended_genera_' + group].append(cluster)
|
|
132
|
+
except KeyError:
|
|
133
|
+
cores['extended_genera_>'].append(cluster)
|
|
232
134
|
#@profile
|
|
233
|
-
def
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
135
|
+
def calc_multi_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
|
|
136
|
+
group = First_num + Second_num
|
|
137
|
+
try:
|
|
138
|
+
cores['combined_genera_' + group].append(cluster)
|
|
139
|
+
except KeyError:
|
|
140
|
+
cores['combined_genera_>' + group].append(cluster)
|
|
244
141
|
#@profile
|
|
245
|
-
def
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
|
|
251
|
-
Com_PEP_Genomes = 0
|
|
252
|
-
Seconds = 0
|
|
253
|
-
seen_Seconds = []
|
|
254
|
-
added_Second_genomes = 0
|
|
255
|
-
try: # get the cluster from the storf clusters which contains this rep
|
|
256
|
-
clustered_combined = combined_pangenome_clusters_First_Second_clustered[rep] # Not true clusters - I put a PEP as key myself
|
|
257
|
-
seen_clust_Genomes = []
|
|
258
|
-
num_clustered_First[cluster].append(rep + '_' + str(len(pep_genomes)))
|
|
259
|
-
for clust in clustered_combined:
|
|
260
|
-
if options.sequence_tag not in clust: # Not good enough at the moment
|
|
261
|
-
clust_Genome = clust.split('|')[0]
|
|
262
|
-
if clust_Genome not in seen_clust_Genomes:
|
|
263
|
-
seen_clust_Genomes.append(clust_Genome)
|
|
264
|
-
if clust_Genome not in pep_genomes:
|
|
265
|
-
Com_PEP_Genomes += 1
|
|
266
|
-
num_clustered_First[cluster].append(clust + '_' + str(reps[clust][1]))
|
|
267
|
-
elif options.sequence_tag in clust:
|
|
268
|
-
Seconds += 1
|
|
269
|
-
clust_Genome = clust.split('|')[0]
|
|
270
|
-
if clust_Genome not in seen_Seconds:
|
|
271
|
-
seen_Seconds.append(clust_Genome)
|
|
272
|
-
if clust_Genome not in seen_clust_Genomes:
|
|
273
|
-
seen_clust_Genomes.append(clust_Genome)
|
|
274
|
-
if clust_Genome not in pep_genomes:
|
|
275
|
-
added_Second_genomes += 1
|
|
276
|
-
else:
|
|
277
|
-
sys.exit("Error: looking for sequence_tag")
|
|
278
|
-
|
|
279
|
-
size_of_pep_clusters = []
|
|
280
|
-
peps = num_clustered_First[cluster]
|
|
281
|
-
for pep in peps:
|
|
282
|
-
pep = pep.rsplit('_', 1)
|
|
283
|
-
size_of_pep_clusters.append(int(pep[1]))
|
|
284
|
-
pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_pep_clusters),
|
|
285
|
-
size_of_pep_clusters, added_Second_genomes, Seconds, len(seen_Seconds)]
|
|
286
|
-
|
|
287
|
-
except KeyError:
|
|
288
|
-
###Singleton
|
|
289
|
-
num_pep_genomes = [len(pep_genomes)]
|
|
290
|
-
pangenome_clusters_Type[cluster] = [1, len(pep_genomes), num_pep_genomes, added_Second_genomes, Seconds,
|
|
291
|
-
len(seen_Seconds)]
|
|
292
|
-
|
|
293
|
-
return pangenome_clusters_Type
|
|
294
|
-
|
|
142
|
+
def calc_Second_only_core(cluster, cores, Second_num):
|
|
143
|
+
try:
|
|
144
|
+
cores['Second_genera_' + str(Second_num)].append(cluster)
|
|
145
|
+
except KeyError:
|
|
146
|
+
cores['Second_genera_>'].append(cluster)
|
|
295
147
|
#@profile
|
|
296
|
-
def
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
for cluster, First_genomes in pangenome_clusters_First.items():
|
|
302
|
-
rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
|
|
303
|
-
|
|
304
|
-
try: # get the cluster from the storf clusters which contains this rep
|
|
305
|
-
num_clustered_First[cluster].append(rep + '_' + str(len(First_genomes)))
|
|
306
|
-
size_of_First_clusters = []
|
|
307
|
-
Firsts = num_clustered_First[cluster]
|
|
308
|
-
for First in Firsts:
|
|
309
|
-
First = First.rsplit('_', 1)
|
|
310
|
-
size_of_First_clusters.append(int(First[1]))
|
|
311
|
-
recorded_First.append(First[0])
|
|
312
|
-
pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_First_clusters),
|
|
313
|
-
size_of_First_clusters, 0, 0, 0]
|
|
314
|
-
|
|
315
|
-
except KeyError:
|
|
316
|
-
###Singleton
|
|
317
|
-
num_pep_genomes = [len(First_genomes)]
|
|
318
|
-
pangenome_clusters_Type[cluster] = [1, len(First_genomes), num_pep_genomes, 0, 0, 0]
|
|
319
|
-
|
|
320
|
-
return pangenome_clusters_Type
|
|
321
|
-
|
|
148
|
+
def calc_only_Second_only_core(cluster, cores, Second_num): # only count the true storf onlies
|
|
149
|
+
try:
|
|
150
|
+
cores['only_Second_genera_' + str(Second_num)].append(cluster)
|
|
151
|
+
except:
|
|
152
|
+
cores['only_Second_genera_>'].append(cluster)
|
|
322
153
|
|
|
323
154
|
|
|
324
|
-
#@profile
|
|
325
|
-
def combined_clustering_CDHIT(options, genome_dict):
|
|
326
|
-
unique_genomes = []
|
|
327
|
-
Second_in = open(options.reclustered, 'r')
|
|
328
|
-
combined_pangenome_clusters_First = OrderedDict()
|
|
329
|
-
combined_pangenome_clusters_First_sequences = OrderedDict()
|
|
330
|
-
combined_pangenome_clusters_Second = OrderedDict()
|
|
331
|
-
combined_pangenome_clusters_Second_sequences = OrderedDict()
|
|
332
|
-
combined_pangenome_clusters_First_Second_clustered = OrderedDict()
|
|
333
|
-
|
|
334
|
-
not_Second_only_cluster_ids = []
|
|
335
|
-
already_seen_PEP = []
|
|
336
|
-
Combined_clusters = OrderedDict()
|
|
337
|
-
Combined_reps = OrderedDict()
|
|
338
|
-
first = True
|
|
339
|
-
for line in Second_in:
|
|
340
|
-
if line.startswith('>'):
|
|
341
|
-
if first == False:
|
|
342
|
-
cluster_size = len(Combined_clusters[cluster_id])
|
|
343
|
-
Combined_reps.update({rep: cluster_size})
|
|
344
|
-
for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
|
|
345
|
-
if pep != []:
|
|
346
|
-
if pep in already_seen_PEP:
|
|
347
|
-
continue
|
|
348
|
-
else:
|
|
349
|
-
already_seen_PEP.append(pep)
|
|
350
|
-
if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
|
|
351
|
-
if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
|
|
352
|
-
all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
|
|
353
|
-
storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
|
|
354
|
-
VALUE = all_but_first + storfs_clustered
|
|
355
|
-
else:
|
|
356
|
-
VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
|
|
357
|
-
KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
|
|
358
|
-
combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
|
|
359
|
-
cluster_id = line.strip('>')
|
|
360
|
-
cluster_id = cluster_id.strip('\n')
|
|
361
|
-
cluster_id = cluster_id.split(' ')[1]
|
|
362
|
-
Combined_clusters.update({cluster_id: []})
|
|
363
|
-
combined_pangenome_clusters_First.update({cluster_id: []})
|
|
364
|
-
combined_pangenome_clusters_First_sequences.update({cluster_id: []})
|
|
365
|
-
combined_pangenome_clusters_Second.update({cluster_id: []})
|
|
366
|
-
combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
|
|
367
|
-
|
|
368
|
-
first = False
|
|
369
|
-
else:
|
|
370
|
-
clustered = line.split('\t')[1]
|
|
371
|
-
clustered = clustered.split('>')[1]
|
|
372
|
-
clustered = clustered.split('...')[0]
|
|
373
|
-
genome = clustered.split('|')[0]
|
|
374
|
-
genome_dict[genome] += 1
|
|
375
|
-
if '*' in line:
|
|
376
|
-
rep = clustered
|
|
377
|
-
Combined_reps.update({rep: 0})
|
|
378
|
-
if first == False:
|
|
379
|
-
Combined_clusters[cluster_id].append(clustered)
|
|
380
|
-
clustered_genome = clustered.split('|')[0]
|
|
381
|
-
if options.sequence_tag in line:
|
|
382
|
-
if clustered_genome not in combined_pangenome_clusters_Second[cluster_id]:
|
|
383
|
-
combined_pangenome_clusters_Second[cluster_id].append(clustered_genome)
|
|
384
|
-
combined_pangenome_clusters_Second_sequences[cluster_id].append(clustered)
|
|
385
|
-
else:
|
|
386
|
-
if cluster_id not in not_Second_only_cluster_ids:
|
|
387
|
-
not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
|
|
388
|
-
if clustered_genome not in combined_pangenome_clusters_First[cluster_id]:
|
|
389
|
-
combined_pangenome_clusters_First[cluster_id].append(clustered_genome)
|
|
390
|
-
combined_pangenome_clusters_First_sequences[cluster_id].append(clustered)
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
|
|
394
|
-
|
|
395
|
-
def combined_clustering_Edge_List(options, genome_dict):
|
|
396
|
-
if options.cluster_format == 'TSV':
|
|
397
|
-
separator = '\t'
|
|
398
|
-
elif options.cluster_format == 'CSV':
|
|
399
|
-
separator = ','
|
|
400
|
-
unique_genomes = []
|
|
401
|
-
cluster_id = 0
|
|
402
|
-
last_rep = ''
|
|
403
|
-
Second_in = open(options.reclustered, 'r')
|
|
404
|
-
combined_pangenome_clusters_First = OrderedDict()
|
|
405
|
-
combined_pangenome_clusters_First_sequences = OrderedDict()
|
|
406
|
-
combined_pangenome_clusters_Second = OrderedDict()
|
|
407
|
-
combined_pangenome_clusters_Second_sequences = OrderedDict()
|
|
408
|
-
combined_pangenome_clusters_First_Second_clustered = OrderedDict()
|
|
409
|
-
|
|
410
|
-
not_Second_only_cluster_ids = []
|
|
411
|
-
already_seen_PEP = []
|
|
412
|
-
Combined_clusters = OrderedDict()
|
|
413
|
-
Combined_reps = OrderedDict()
|
|
414
|
-
first = True
|
|
415
|
-
for line in Second_in:
|
|
416
|
-
rep, child = line.strip().split(separator)
|
|
417
|
-
child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
|
|
418
|
-
|
|
419
|
-
if first == True:
|
|
420
|
-
Combined_clusters.update({cluster_id: []})
|
|
421
|
-
combined_pangenome_clusters_First.update({cluster_id: []})
|
|
422
|
-
combined_pangenome_clusters_First_sequences.update({cluster_id: []})
|
|
423
|
-
combined_pangenome_clusters_Second.update({cluster_id: []})
|
|
424
|
-
combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
|
|
425
|
-
Combined_reps.update({rep: 0})
|
|
426
|
-
first = False
|
|
427
|
-
|
|
428
|
-
if first == False:
|
|
429
|
-
if rep != last_rep and last_rep != '':
|
|
430
|
-
cluster_size = len(Combined_clusters[cluster_id])
|
|
431
|
-
Combined_reps.update({rep: cluster_size})
|
|
432
|
-
for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
|
|
433
|
-
if pep != []:
|
|
434
|
-
if pep in already_seen_PEP:
|
|
435
|
-
continue
|
|
436
|
-
else:
|
|
437
|
-
already_seen_PEP.append(pep)
|
|
438
|
-
if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
|
|
439
|
-
if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
|
|
440
|
-
all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
|
|
441
|
-
storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
|
|
442
|
-
VALUE = all_but_first + storfs_clustered
|
|
443
|
-
else:
|
|
444
|
-
VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
|
|
445
|
-
KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
|
|
446
|
-
combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
|
|
447
|
-
|
|
448
|
-
cluster_id += 1
|
|
449
|
-
Combined_clusters.update({cluster_id: []})
|
|
450
|
-
combined_pangenome_clusters_First.update({cluster_id: []})
|
|
451
|
-
combined_pangenome_clusters_First_sequences.update({cluster_id: []})
|
|
452
|
-
combined_pangenome_clusters_Second.update({cluster_id: []})
|
|
453
|
-
combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
|
|
454
|
-
Combined_reps.update({rep: 0})
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
Combined_clusters[cluster_id].append(child)
|
|
458
|
-
if options.sequence_tag in line:
|
|
459
|
-
if child_genome not in combined_pangenome_clusters_Second[cluster_id]:
|
|
460
|
-
combined_pangenome_clusters_Second[cluster_id].append(child_genome)
|
|
461
|
-
combined_pangenome_clusters_Second_sequences[cluster_id].append(child)
|
|
462
|
-
else:
|
|
463
|
-
if cluster_id not in not_Second_only_cluster_ids:
|
|
464
|
-
not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
|
|
465
|
-
if child_genome not in combined_pangenome_clusters_First[cluster_id]:
|
|
466
|
-
combined_pangenome_clusters_First[cluster_id].append(child_genome)
|
|
467
|
-
combined_pangenome_clusters_First_sequences[cluster_id].append(child)
|
|
468
|
-
|
|
469
|
-
last_rep = rep
|
|
470
|
-
|
|
471
|
-
return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
def cluster_EdgeList(options):
|
|
475
|
-
if options.cluster_format == 'TSV':
|
|
476
|
-
separator = '\t'
|
|
477
|
-
elif options.cluster_format == 'CSV':
|
|
478
|
-
separator = ','
|
|
479
|
-
cluster_id = 0
|
|
480
|
-
last_rep = ''
|
|
481
|
-
first = True
|
|
482
|
-
First_in = open(options.clusters, 'r')
|
|
483
|
-
pangenome_clusters_First = OrderedDict()
|
|
484
|
-
pangenome_clusters_First_sequences = OrderedDict()
|
|
485
|
-
genome_dict = defaultdict(int)
|
|
486
|
-
reps = OrderedDict()
|
|
487
|
-
for line in First_in:
|
|
488
|
-
rep, child = line.strip().split(separator)
|
|
489
|
-
child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
|
|
490
|
-
# Counting occurrences of genomes
|
|
491
|
-
genome_dict[child_genome] += 1
|
|
492
|
-
if first == True:
|
|
493
|
-
pangenome_clusters_First[0] = []
|
|
494
|
-
pangenome_clusters_First_sequences[0] = []
|
|
495
|
-
first = False
|
|
496
|
-
|
|
497
|
-
if rep != last_rep and last_rep != '':
|
|
498
|
-
cluster_id +=1
|
|
499
|
-
pangenome_clusters_First[cluster_id] = []
|
|
500
|
-
pangenome_clusters_First_sequences[cluster_id] = []
|
|
501
|
-
cluster_size = len(pangenome_clusters_First_sequences[cluster_id-1])
|
|
502
|
-
reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[cluster_id-1])]})
|
|
503
|
-
pangenome_clusters_First[cluster_id] = []
|
|
504
|
-
pangenome_clusters_First_sequences[cluster_id] = []
|
|
505
|
-
if child_genome not in pangenome_clusters_First[cluster_id]:
|
|
506
|
-
pangenome_clusters_First[cluster_id].append(child_genome)
|
|
507
|
-
|
|
508
|
-
pangenome_clusters_First_sequences[cluster_id].append(child)
|
|
509
|
-
last_rep = rep
|
|
510
|
-
cluster_size = len(pangenome_clusters_First_sequences[cluster_id])
|
|
511
|
-
reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
def cluster_CDHIT(options):
|
|
519
|
-
First_in = open(options.clusters, 'r')
|
|
520
|
-
clusters = OrderedDict()
|
|
521
|
-
pangenome_clusters_First = OrderedDict()
|
|
522
|
-
pangenome_clusters_First_sequences = OrderedDict()
|
|
523
|
-
first = True
|
|
524
|
-
genome_dict = defaultdict(int)
|
|
525
|
-
reps = OrderedDict()
|
|
526
|
-
## Load in all data for easier reuse later
|
|
527
|
-
for line in First_in:
|
|
528
|
-
if line.startswith('>'):
|
|
529
|
-
if first == False:
|
|
530
|
-
cluster_size = len(clusters[cluster_id])
|
|
531
|
-
reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
|
|
532
|
-
cluster_id = line.strip('>')
|
|
533
|
-
cluster_id = cluster_id.strip('\n')
|
|
534
|
-
cluster_id = cluster_id.split(' ')[1]
|
|
535
|
-
clusters.update({cluster_id: []})
|
|
536
|
-
pangenome_clusters_First.update({cluster_id: []})
|
|
537
|
-
pangenome_clusters_First_sequences.update({cluster_id: []})
|
|
538
|
-
|
|
539
|
-
first = False
|
|
540
|
-
else:
|
|
541
|
-
clustered = line.split('\t')[1]
|
|
542
|
-
clustered = clustered.split('>')[1]
|
|
543
|
-
clustered = clustered.split('...')[0]
|
|
544
|
-
genome = clustered.split('|')[0]
|
|
545
|
-
genome_dict[genome] += 1
|
|
546
|
-
if '*' in line:
|
|
547
|
-
rep = clustered
|
|
548
|
-
reps.update({rep: [0, 0]})
|
|
549
|
-
if first == False:
|
|
550
|
-
clusters[cluster_id].append(clustered)
|
|
551
|
-
clustered_genome = clustered.split('|')[0]
|
|
552
|
-
if clustered_genome not in pangenome_clusters_First[cluster_id]:
|
|
553
|
-
pangenome_clusters_First[cluster_id].append(clustered_genome)
|
|
554
|
-
pangenome_clusters_First_sequences[cluster_id].append(clustered)
|
|
555
|
-
return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
|
|
556
155
|
|
|
557
156
|
#@profile
|
|
558
157
|
def cluster(options):
|
|
559
158
|
|
|
560
159
|
if options.cluster_format == 'CD-HIT':
|
|
561
|
-
|
|
160
|
+
genus_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '_')
|
|
562
161
|
elif options.cluster_format in ['TSV','CSV']:
|
|
563
|
-
|
|
162
|
+
genus_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '_')
|
|
163
|
+
|
|
564
164
|
|
|
565
|
-
######################################
|
|
566
|
-
cores, groups = get_cores(options, genome_dict)
|
|
567
|
-
###
|
|
568
165
|
|
|
569
166
|
if options.reclustered != None:
|
|
167
|
+
|
|
570
168
|
if options.cluster_format == 'CD-HIT':
|
|
571
|
-
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
unique_genomes = combined_clustering_Edge_List(options, genome_dict)
|
|
576
|
-
pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered)
|
|
169
|
+
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_CDHIT(options, genus_dict, '_')
|
|
170
|
+
if options.cluster_format == ['TSV','CSV']:
|
|
171
|
+
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_Edge_List(options, '_')
|
|
172
|
+
pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, '_')
|
|
577
173
|
else:
|
|
578
|
-
pangenome_clusters_Type = single_clustering_counting(options, pangenome_clusters_First, reps)
|
|
579
174
|
|
|
175
|
+
pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)
|
|
580
176
|
|
|
177
|
+
###
|
|
178
|
+
cores, groups = get_cores(options, genus_dict)
|
|
179
|
+
###
|
|
581
180
|
|
|
582
181
|
Number_Of_StORF_Extending_But_Same_Genomes = 0
|
|
583
182
|
|
|
@@ -589,21 +188,17 @@ def cluster(options):
|
|
|
589
188
|
print("Calculating Groups")
|
|
590
189
|
for cluster, numbers in pangenome_clusters_Type_sorted.items():
|
|
591
190
|
############################### Calculate First only
|
|
592
|
-
|
|
593
|
-
calc_First_only_core(cluster, numbers[1],groups,cores)
|
|
594
|
-
|
|
595
|
-
# elif numbers[0] >1 and numbers[1] >=2:
|
|
596
|
-
# calc_First_only_core(cluster, numbers[2][0],groups,cores)
|
|
597
|
-
|
|
191
|
+
calc_First_only_core(cluster, numbers[1], cores)
|
|
598
192
|
|
|
599
193
|
if options.reclustered != None:
|
|
600
194
|
############################# Calculate First and Reclustered-Second
|
|
601
195
|
if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
|
|
602
|
-
calc_single_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
|
|
196
|
+
calc_single_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
|
|
603
197
|
elif numbers[0] > 1 and numbers[3] >= 1: # If unique Secondss combined multiple Firsts
|
|
604
|
-
calc_multi_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
|
|
198
|
+
calc_multi_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
|
|
605
199
|
elif numbers[4] >= 1:
|
|
606
200
|
Number_Of_StORF_Extending_But_Same_Genomes += 1
|
|
201
|
+
|
|
607
202
|
combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
|
|
608
203
|
combined_pangenome_clusters_Second_Type = defaultdict(list)
|
|
609
204
|
for cluster, genomes in combined_pangenome_clusters_Second.items():
|
|
@@ -612,26 +207,31 @@ def cluster(options):
|
|
|
612
207
|
else:
|
|
613
208
|
combined_pangenome_clusters_ONLY_Second_Type[cluster] = [cluster, len(genomes)]
|
|
614
209
|
for cluster, data in combined_pangenome_clusters_Second_Type.items():
|
|
615
|
-
|
|
210
|
+
if data[1] >=1:
|
|
211
|
+
calc_Second_only_core(cluster, cores, data[1])
|
|
616
212
|
for cluster, data in combined_pangenome_clusters_ONLY_Second_Type.items():
|
|
617
|
-
if data[1] >=
|
|
618
|
-
calc_only_Second_only_core(
|
|
213
|
+
if data[1] >= 1 :
|
|
214
|
+
calc_only_Second_only_core(cluster, cores, data[1])
|
|
619
215
|
###########################
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
216
|
+
### Output
|
|
217
|
+
key_order = list(cores.keys())
|
|
218
|
+
output_path = os.path.abspath(options.output_dir)
|
|
219
|
+
stats_out = os.path.join(output_path,'summary_statistics.txt')
|
|
220
|
+
with open(stats_out,'w') as outfile:
|
|
221
|
+
print("Genus Groups:")
|
|
222
|
+
outfile.write("Genus Groups:\n")
|
|
223
|
+
for key in key_order:
|
|
224
|
+
print(key+':\t'+str(len(cores[key])))
|
|
225
|
+
outfile.write(key + ':\t' + str(len(cores[key]))+'\n')
|
|
226
|
+
print("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
|
|
227
|
+
outfile.write("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
|
|
627
228
|
|
|
628
229
|
if options.gene_presence_absence_out != None:
|
|
629
|
-
gene_presence_absence_output(options,
|
|
230
|
+
gene_presence_absence_output(options,genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
|
|
630
231
|
|
|
631
232
|
if options.write_families != None and options.fasta != None:
|
|
632
233
|
sequences = read_fasta(options.fasta)
|
|
633
|
-
|
|
634
|
-
output_dir = os.path.join(input_dir, 'Gene_Families_Output')
|
|
234
|
+
output_dir = os.path.join(output_path, 'Gene_Families_Output')
|
|
635
235
|
|
|
636
236
|
# Create output directory if it doesn't exist
|
|
637
237
|
if not os.path.exists(output_dir):
|
|
@@ -652,7 +252,7 @@ def cluster(options):
|
|
|
652
252
|
outfile.write(f"{wrapped_sequence}\n")
|
|
653
253
|
|
|
654
254
|
if options.con_core != None and options.fasta != None and options.write_families != None:
|
|
655
|
-
process_gene_families(options, os.path.join(
|
|
255
|
+
process_gene_families(options, os.path.join(output_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
|
|
656
256
|
|
|
657
257
|
|
|
658
258
|
|