PyamilySeq 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Constants.py +1 -1
- PyamilySeq/PyamilySeq.py +69 -38
- PyamilySeq/PyamilySeq_Genus.py +85 -102
- PyamilySeq/PyamilySeq_Species.py +101 -94
- PyamilySeq/Seq_Combiner.py +26 -7
- PyamilySeq/clusterings.py +111 -73
- PyamilySeq/utils.py +117 -7
- PyamilySeq-0.7.0.dist-info/METADATA +251 -0
- PyamilySeq-0.7.0.dist-info/RECORD +14 -0
- {PyamilySeq-0.6.0.dist-info → PyamilySeq-0.7.0.dist-info}/WHEEL +1 -1
- PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py +0 -600
- PyamilySeq-0.6.0.dist-info/METADATA +0 -147
- PyamilySeq-0.6.0.dist-info/RECORD +0 -15
- {PyamilySeq-0.6.0.dist-info → PyamilySeq-0.7.0.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.6.0.dist-info → PyamilySeq-0.7.0.dist-info}/entry_points.txt +0 -0
- {PyamilySeq-0.6.0.dist-info → PyamilySeq-0.7.0.dist-info}/top_level.txt +0 -0
PyamilySeq/Constants.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
PyamilySeq_Version = 'v0.
|
|
1
|
+
PyamilySeq_Version = 'v0.7.0'
|
|
2
2
|
|
PyamilySeq/PyamilySeq.py
CHANGED
|
@@ -20,9 +20,9 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
def run_cd_hit(input_file, clustering_output,
|
|
23
|
+
def run_cd_hit(options, input_file, clustering_output, clustering_mode):
|
|
24
24
|
cdhit_command = [
|
|
25
|
-
|
|
25
|
+
clustering_mode,
|
|
26
26
|
'-i', input_file,
|
|
27
27
|
'-o', clustering_output,
|
|
28
28
|
'-c', str(options.pident),
|
|
@@ -33,14 +33,14 @@ def run_cd_hit(input_file, clustering_output, options):
|
|
|
33
33
|
'-sc', "1",
|
|
34
34
|
'-sf', "1"
|
|
35
35
|
]
|
|
36
|
-
if options.verbose
|
|
36
|
+
if options.verbose != None:
|
|
37
37
|
subprocess.run(cdhit_command)
|
|
38
38
|
else:
|
|
39
39
|
subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
|
40
40
|
|
|
41
41
|
|
|
42
42
|
def main():
|
|
43
|
-
parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ':
|
|
43
|
+
parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': A tool that groups genes into unique clusters.')
|
|
44
44
|
### Required Arguments
|
|
45
45
|
required = parser.add_argument_group('Required Arguments')
|
|
46
46
|
required.add_argument('-run_mode', action='store', dest='run_mode', choices=['Full','Partial'],
|
|
@@ -49,8 +49,8 @@ def main():
|
|
|
49
49
|
required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species', 'Genus'],
|
|
50
50
|
help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode? ',
|
|
51
51
|
required=True)
|
|
52
|
-
required.add_argument("-
|
|
53
|
-
help="Clustering
|
|
52
|
+
required.add_argument("-clustering_format", action="store", dest="clustering_format", choices=['CD-HIT','TSV','CSV'],
|
|
53
|
+
help="Clustering format to use: CD-HIT or TSV (MMseqs2, BLAST, DIAMOND) / CSV edge-list file (Node1\tNode2).",
|
|
54
54
|
required=True)
|
|
55
55
|
required.add_argument("-output_dir", action="store", dest="output_dir",
|
|
56
56
|
help="Directory for all output files.",
|
|
@@ -67,6 +67,12 @@ def main():
|
|
|
67
67
|
full_mode_args.add_argument("-name_split", action="store", dest="name_split",
|
|
68
68
|
help="substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').",
|
|
69
69
|
required=False)
|
|
70
|
+
full_mode_args.add_argument('-sequence_type', action='store', dest='sequence_type', default='DNA',choices=['AA', 'DNA'],
|
|
71
|
+
help='Default - DNA: Should clustering be performed in "DNA" or "AA" mode?',
|
|
72
|
+
required=False)
|
|
73
|
+
full_mode_args.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
|
|
74
|
+
help='Identifier used for extraction of sequences such as "misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo"',
|
|
75
|
+
required=False)
|
|
70
76
|
full_mode_args.add_argument("-pid", action="store", dest="pident", type=float, default=0.95,
|
|
71
77
|
help="Default 0.95: Pident threshold for clustering.",
|
|
72
78
|
required=False)
|
|
@@ -99,30 +105,32 @@ def main():
|
|
|
99
105
|
grouping_args.add_argument('-core_groups', action="store", dest='core_groups', default="99,95,15",
|
|
100
106
|
help='Default - (\'99,95,15\'): Gene family groups to use for "Species" mode',
|
|
101
107
|
required=False)
|
|
108
|
+
|
|
102
109
|
grouping_args.add_argument('-genus_groups', action="store", dest='genus_groups', default="1,2,3,4,5,6",
|
|
103
110
|
help='Default - (\'1,2,3,4,5,6\'): Gene family groups to use for "Genus" mode',
|
|
104
111
|
required=False)
|
|
105
112
|
|
|
106
113
|
###Output Arguments
|
|
107
114
|
output_args = parser.add_argument_group('Output Parameters')
|
|
108
|
-
output_args.add_argument('-w', action="store", dest='
|
|
109
|
-
help='Default - No output: Output sequences of identified
|
|
110
|
-
' - Must provide FASTA file with -
|
|
115
|
+
output_args.add_argument('-w', action="store", dest='write_groups', default=None,
|
|
116
|
+
help='Default - No output: Output sequences of identified groups (provide levels at which to output - Species "-w 99,95" Genus "-w 2,3"'
|
|
117
|
+
' - Must provide FASTA file with -original_fasta if in Partial run mode.',
|
|
111
118
|
required=False)
|
|
112
|
-
output_args.add_argument('-
|
|
113
|
-
help='Default - No output: Output aligned and concatinated sequences of identified
|
|
114
|
-
' - Must provide FASTA file with -
|
|
119
|
+
output_args.add_argument('-a', action="store_true", dest='align_core', default=None,
|
|
120
|
+
help='Default - No output: SLOW! (Only works for Species mode) Output aligned and concatinated sequences of identified groups -'
|
|
121
|
+
'provide group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in Partial'
|
|
122
|
+
'run mode.',
|
|
115
123
|
required=False)
|
|
116
124
|
output_args.add_argument('-original_fasta', action='store', dest='original_fasta',
|
|
117
125
|
help='FASTA file to use in conjunction with "-w" or "-con" when running in Partial Mode.',
|
|
118
126
|
required=False)
|
|
119
|
-
output_args.add_argument('-gpa', action='
|
|
120
|
-
|
|
127
|
+
output_args.add_argument('-gpa', action='store_true', dest='gene_presence_absence_out', default=None,
|
|
128
|
+
help='Default - False: If selected, a Roary/Panaroo formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
|
|
129
|
+
required=False)
|
|
121
130
|
|
|
122
131
|
### Misc Arguments
|
|
123
132
|
misc = parser.add_argument_group('Misc')
|
|
124
|
-
misc.add_argument('-verbose', action='
|
|
125
|
-
help='Default - False: Print out runtime messages',
|
|
133
|
+
misc.add_argument('-verbose', action='store_true', dest='verbose', default=None, help='Default - False: Print out runtime messages',
|
|
126
134
|
required = False)
|
|
127
135
|
misc.add_argument('-v', action='store_true', dest='version',
|
|
128
136
|
help='Default - False: Print out version number and exit',
|
|
@@ -130,20 +138,24 @@ def main():
|
|
|
130
138
|
|
|
131
139
|
options = parser.parse_args()
|
|
132
140
|
|
|
133
|
-
### Checking all required parameters are provided by user
|
|
141
|
+
### Checking all required parameters are provided by user #!!# Doesn't seem to work
|
|
134
142
|
if options.run_mode == 'Full':
|
|
143
|
+
|
|
135
144
|
if options.reclustered != None:
|
|
136
145
|
sys.exit("Currently reclustering only works on Partial Mode.")
|
|
137
|
-
required_full_mode = [options.input_type, options.input_dir, options.name_split, options.
|
|
146
|
+
required_full_mode = [options.input_type, options.input_dir, options.name_split, options.clustering_format,
|
|
138
147
|
options.pident, options.len_diff]
|
|
139
148
|
if all(required_full_mode):
|
|
140
149
|
# Proceed with the Full mode
|
|
141
150
|
pass
|
|
142
151
|
else:
|
|
143
152
|
missing_options = [opt for opt in
|
|
144
|
-
['input_type', 'input_dir', 'name_split', '
|
|
153
|
+
['input_type', 'input_dir', 'name_split', 'clustering_format', 'pident', 'len_diff'] if
|
|
145
154
|
not options.__dict__[opt]]
|
|
146
155
|
print(f"Missing required options for Full mode: {', '.join(missing_options)}")
|
|
156
|
+
if options.align_core != None:
|
|
157
|
+
if options.write_groups == None:
|
|
158
|
+
sys.exit('Must provide "-w" to output gene groups before alignment "-a" can be done.')
|
|
147
159
|
elif options.run_mode == 'Partial':
|
|
148
160
|
required_partial_mode = [options.cluster_file, ]
|
|
149
161
|
if all(required_partial_mode):
|
|
@@ -154,33 +166,37 @@ def main():
|
|
|
154
166
|
['cluster_file',] if
|
|
155
167
|
not options.__dict__[opt]]
|
|
156
168
|
print(f"Missing required options for Partial mode: {', '.join(missing_options)}")
|
|
169
|
+
if options.align_core != None:
|
|
170
|
+
if options.write_groups == None or options.original_fasta == None:
|
|
171
|
+
sys.exit('Must provide "-w" and "-original_fasta" to output gene groups before alignment "-a" can be done.')
|
|
157
172
|
|
|
158
|
-
if options.
|
|
173
|
+
if options.clustering_format == 'CD-HIT':
|
|
159
174
|
clust_affix = '.clstr'
|
|
160
|
-
elif options.
|
|
175
|
+
elif options.clustering_format == 'TSV':
|
|
161
176
|
clust_affix = '.tsv'
|
|
162
|
-
elif options.
|
|
177
|
+
elif options.clustering_format == 'CSV':
|
|
163
178
|
clust_affix = '.csv'
|
|
164
179
|
|
|
165
180
|
|
|
166
181
|
|
|
182
|
+
|
|
167
183
|
###External tool checks:
|
|
168
184
|
##MAFFT
|
|
169
|
-
if options.
|
|
185
|
+
if options.align_core == True:
|
|
170
186
|
if is_tool_installed('mafft'):
|
|
171
|
-
if options.verbose
|
|
187
|
+
if options.verbose != None:
|
|
172
188
|
print("mafft is installed. Proceeding with alignment.")
|
|
173
189
|
else:
|
|
174
190
|
exit("mafft is not installed. Please install mafft to proceed.")
|
|
175
191
|
##CD-HIT
|
|
176
|
-
if options.
|
|
192
|
+
if options.clustering_format == 'CD-HIT' and options.run_mode == 'Full':
|
|
177
193
|
if is_tool_installed('cd-hit'):
|
|
178
|
-
if options.verbose
|
|
194
|
+
if options.verbose != None:
|
|
179
195
|
print("cd-hit is installed. Proceeding with clustering.")
|
|
180
196
|
else:
|
|
181
197
|
exit("cd-hit is not installed. Please install cd-hit to proceed.")
|
|
182
198
|
|
|
183
|
-
if options.
|
|
199
|
+
if options.write_groups != None and options.original_fasta == False:
|
|
184
200
|
exit("-fasta must br provided if -w is used")
|
|
185
201
|
|
|
186
202
|
|
|
@@ -197,35 +213,48 @@ def main():
|
|
|
197
213
|
|
|
198
214
|
output_path = os.path.abspath(options.output_dir)
|
|
199
215
|
combined_out_file = os.path.join(output_path, "combined_sequences.fasta")
|
|
200
|
-
clustering_output = os.path.join(output_path, 'clustering_' + options.
|
|
216
|
+
clustering_output = os.path.join(output_path, 'clustering_' + options.clustering_format)
|
|
201
217
|
|
|
202
218
|
if options.group_type == 'Species':
|
|
203
219
|
options.core_groups = options.core_groups + ',0'
|
|
204
220
|
groups_to_use = options.core_groups
|
|
205
|
-
|
|
221
|
+
elif options.group_type == 'Genus':
|
|
206
222
|
options.genus_groups = options.genus_groups + ',>'
|
|
207
223
|
groups_to_use = options.genus_groups
|
|
224
|
+
if options.align_core != None:
|
|
225
|
+
sys.exit("-a align_core not a valid option in Genus mode.")
|
|
208
226
|
|
|
209
227
|
|
|
210
228
|
if options.run_mode == 'Full':
|
|
229
|
+
if not os.path.exists(output_path):
|
|
230
|
+
os.makedirs(output_path)
|
|
231
|
+
if options.sequence_type == 'AA':
|
|
232
|
+
clustering_mode = 'cd-hit'
|
|
233
|
+
translate = True
|
|
234
|
+
elif options.sequence_type == 'DNA':
|
|
235
|
+
clustering_mode = 'cd-hit-est'
|
|
236
|
+
translate = False
|
|
211
237
|
if options.input_type == 'separate':
|
|
212
|
-
read_separate_files(options.input_dir, options.name_split, combined_out_file)
|
|
238
|
+
read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
|
|
213
239
|
else:
|
|
214
|
-
read_combined_files(options.input_dir, options.name_split, combined_out_file)
|
|
240
|
+
read_combined_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
|
|
215
241
|
|
|
216
|
-
|
|
242
|
+
if options.clustering_format == 'CD-HIT':
|
|
243
|
+
run_cd_hit(options, combined_out_file, clustering_output, clustering_mode)
|
|
217
244
|
|
|
218
245
|
class clustering_options:
|
|
219
246
|
def __init__(self):
|
|
220
|
-
self.
|
|
247
|
+
self.run_mode = options.run_mode
|
|
248
|
+
self.cluster_format = options.clustering_format
|
|
249
|
+
self.sequence_type = options.sequence_type
|
|
221
250
|
self.reclustered = options.reclustered
|
|
222
251
|
self.sequence_tag = options.sequence_tag
|
|
223
252
|
self.core_groups = groups_to_use
|
|
224
253
|
self.clusters = clustering_output + clust_affix
|
|
225
254
|
self.output_dir = options.output_dir
|
|
226
255
|
self.gene_presence_absence_out = options.gene_presence_absence_out
|
|
227
|
-
self.
|
|
228
|
-
self.
|
|
256
|
+
self.write_groups = options.write_groups
|
|
257
|
+
self.align_core = options.align_core
|
|
229
258
|
self.fasta = combined_out_file
|
|
230
259
|
self.verbose = options.verbose
|
|
231
260
|
|
|
@@ -234,15 +263,16 @@ def main():
|
|
|
234
263
|
elif options.run_mode == 'Partial':
|
|
235
264
|
class clustering_options:
|
|
236
265
|
def __init__(self):
|
|
237
|
-
self.
|
|
266
|
+
self.run_mode = options.run_mode
|
|
267
|
+
self.cluster_format = options.clustering_format
|
|
238
268
|
self.reclustered = options.reclustered
|
|
239
269
|
self.sequence_tag = options.sequence_tag
|
|
240
270
|
self.core_groups = groups_to_use
|
|
241
271
|
self.clusters = options.cluster_file
|
|
242
272
|
self.output_dir = options.output_dir
|
|
243
273
|
self.gene_presence_absence_out = options.gene_presence_absence_out
|
|
244
|
-
self.
|
|
245
|
-
self.
|
|
274
|
+
self.write_groups = options.write_groups
|
|
275
|
+
self.align_core = options.align_core
|
|
246
276
|
self.fasta = options.original_fasta
|
|
247
277
|
self.verbose = options.verbose
|
|
248
278
|
|
|
@@ -258,4 +288,5 @@ def main():
|
|
|
258
288
|
"Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
|
|
259
289
|
|
|
260
290
|
if __name__ == "__main__":
|
|
291
|
+
print("Running PyamilySeq "+PyamilySeq_Version)
|
|
261
292
|
main()
|
PyamilySeq/PyamilySeq_Genus.py
CHANGED
|
@@ -1,10 +1,5 @@
|
|
|
1
1
|
#from line_profiler_pycharm import profile
|
|
2
2
|
|
|
3
|
-
import copy
|
|
4
|
-
import sys
|
|
5
|
-
import math
|
|
6
|
-
from collections import Counter
|
|
7
|
-
|
|
8
3
|
|
|
9
4
|
try:
|
|
10
5
|
from .Constants import *
|
|
@@ -16,45 +11,6 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
|
16
11
|
from utils import *
|
|
17
12
|
|
|
18
13
|
|
|
19
|
-
|
|
20
|
-
def process_gene_families(options, directory, output_file):
|
|
21
|
-
"""Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
|
|
22
|
-
concatenated_sequences = {}
|
|
23
|
-
output_file = directory.replace('Gene_Families_Output',output_file)
|
|
24
|
-
|
|
25
|
-
# Iterate over each gene family file
|
|
26
|
-
for gene_file in os.listdir(directory):
|
|
27
|
-
if gene_file.endswith('.fasta'):
|
|
28
|
-
gene_path = os.path.join(directory, gene_file)
|
|
29
|
-
|
|
30
|
-
# Read sequences from the gene family file
|
|
31
|
-
sequences = read_fasta(gene_path)
|
|
32
|
-
|
|
33
|
-
# Select the longest sequence for each genome
|
|
34
|
-
longest_sequences = select_longest_gene(sequences)
|
|
35
|
-
|
|
36
|
-
# Run mafft on the longest sequences
|
|
37
|
-
aligned_file = f"{gene_file}_aligned.fasta"
|
|
38
|
-
run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
|
|
39
|
-
|
|
40
|
-
# Read aligned sequences and concatenate them
|
|
41
|
-
aligned_sequences = read_fasta(aligned_file)
|
|
42
|
-
for genome, aligned_seq in aligned_sequences.items():
|
|
43
|
-
genome_name = genome.split('|')[0]
|
|
44
|
-
if genome_name not in concatenated_sequences:
|
|
45
|
-
concatenated_sequences[genome_name] = ""
|
|
46
|
-
concatenated_sequences[genome_name] += aligned_seq
|
|
47
|
-
|
|
48
|
-
# Clean up aligned file
|
|
49
|
-
os.remove(aligned_file)
|
|
50
|
-
|
|
51
|
-
# Write the concatenated sequences to the output file
|
|
52
|
-
with open(output_file, 'w') as out:
|
|
53
|
-
for genome, sequence in concatenated_sequences.items():
|
|
54
|
-
out.write(f">{genome}\n")
|
|
55
|
-
wrapped_sequence = wrap_sequence(sequence, 60)
|
|
56
|
-
out.write(f"{wrapped_sequence}\n")
|
|
57
|
-
|
|
58
14
|
def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
|
|
59
15
|
print("Outputting gene_presence_absence file")
|
|
60
16
|
output_dir = os.path.abspath(options.output_dir)
|
|
@@ -99,7 +55,7 @@ def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_s
|
|
|
99
55
|
|
|
100
56
|
|
|
101
57
|
|
|
102
|
-
def get_cores(options
|
|
58
|
+
def get_cores(options):
|
|
103
59
|
##Calculate core groups
|
|
104
60
|
groups = OrderedDict()
|
|
105
61
|
cores = OrderedDict()
|
|
@@ -117,27 +73,26 @@ def get_cores(options,genus_dict):
|
|
|
117
73
|
cores[only_second_core_group] = []
|
|
118
74
|
return cores, groups
|
|
119
75
|
|
|
120
|
-
|
|
121
76
|
#@profile
|
|
122
|
-
def calc_First_only_core(cluster,
|
|
77
|
+
def calc_First_only_core(cluster, First_num, cores):
|
|
123
78
|
try:
|
|
124
|
-
cores['First_genera_'+str(
|
|
79
|
+
cores['First_genera_' + str(First_num)].append(cluster)
|
|
125
80
|
except KeyError:
|
|
126
81
|
cores['First_genera_>'].append(cluster)
|
|
127
82
|
#@profile
|
|
128
83
|
def calc_single_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count gene families extended with StORFs
|
|
129
84
|
group = First_num + Second_num
|
|
130
85
|
try:
|
|
131
|
-
cores['extended_genera_' + group].append(cluster)
|
|
86
|
+
cores['extended_genera_' + str(group)].append(cluster)
|
|
132
87
|
except KeyError:
|
|
133
88
|
cores['extended_genera_>'].append(cluster)
|
|
134
89
|
#@profile
|
|
135
90
|
def calc_multi_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
|
|
136
91
|
group = First_num + Second_num
|
|
137
92
|
try:
|
|
138
|
-
cores['combined_genera_' + group].append(cluster)
|
|
93
|
+
cores['combined_genera_' + str(group)].append(cluster)
|
|
139
94
|
except KeyError:
|
|
140
|
-
cores['combined_genera_>'
|
|
95
|
+
cores['combined_genera_>'].append(cluster)
|
|
141
96
|
#@profile
|
|
142
97
|
def calc_Second_only_core(cluster, cores, Second_num):
|
|
143
98
|
try:
|
|
@@ -157,28 +112,26 @@ def calc_only_Second_only_core(cluster, cores, Second_num): # only count the tru
|
|
|
157
112
|
def cluster(options):
|
|
158
113
|
|
|
159
114
|
if options.cluster_format == 'CD-HIT':
|
|
160
|
-
genus_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '_')
|
|
161
|
-
elif options.cluster_format
|
|
162
|
-
genus_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '_')
|
|
163
|
-
|
|
115
|
+
genus_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '_')
|
|
116
|
+
elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
|
|
117
|
+
genus_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '_')
|
|
164
118
|
|
|
119
|
+
###
|
|
120
|
+
cores, groups = get_cores(options)
|
|
121
|
+
###
|
|
165
122
|
|
|
166
123
|
if options.reclustered != None:
|
|
167
|
-
|
|
168
124
|
if options.cluster_format == 'CD-HIT':
|
|
169
|
-
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_CDHIT(options, genus_dict, '_')
|
|
170
|
-
|
|
171
|
-
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_Edge_List(options, '_')
|
|
172
|
-
pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, '_')
|
|
125
|
+
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genus_dict, '_')
|
|
126
|
+
elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
|
|
127
|
+
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_Edge_List(options, '_')
|
|
128
|
+
pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, '_')
|
|
173
129
|
else:
|
|
174
|
-
|
|
175
130
|
pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)
|
|
176
131
|
|
|
177
|
-
###
|
|
178
|
-
cores, groups = get_cores(options, genus_dict)
|
|
179
|
-
###
|
|
180
132
|
|
|
181
|
-
|
|
133
|
+
|
|
134
|
+
Number_Of_Second_Extending_But_Same_Genomes = 0
|
|
182
135
|
|
|
183
136
|
sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
|
|
184
137
|
pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_first_keys)
|
|
@@ -186,19 +139,28 @@ def cluster(options):
|
|
|
186
139
|
pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)
|
|
187
140
|
|
|
188
141
|
print("Calculating Groups")
|
|
142
|
+
seen_groupings = []
|
|
189
143
|
for cluster, numbers in pangenome_clusters_Type_sorted.items():
|
|
190
144
|
############################### Calculate First only
|
|
191
|
-
|
|
145
|
+
cluster = str(cluster)
|
|
146
|
+
for grouping in numbers[2]: #!!# Could do with a more elegant solution
|
|
147
|
+
current_cluster = grouping[0].split(':')[0]
|
|
148
|
+
if current_cluster not in seen_groupings:
|
|
149
|
+
seen_groupings.append(current_cluster)
|
|
150
|
+
current_cluster_size = grouping[0].split(':')[1]
|
|
151
|
+
calc_First_only_core(current_cluster, current_cluster_size, cores)
|
|
152
|
+
############################# Calculate First and Reclustered-Second
|
|
153
|
+
if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
|
|
154
|
+
calc_single_First_extended_Second_only_core(cluster, numbers[1], cores, numbers[3])
|
|
155
|
+
elif numbers[0] > 1 and numbers[3] >= 1: # If unique Seconds combined multiple Firsts
|
|
156
|
+
calc_multi_First_extended_Second_only_core(cluster, numbers[1], cores, numbers[3])
|
|
157
|
+
elif numbers[4] >= 1:
|
|
158
|
+
Number_Of_Second_Extending_But_Same_Genomes += 1
|
|
159
|
+
else:
|
|
160
|
+
if options.verbose == True:
|
|
161
|
+
print("First cluster " + current_cluster + " already processed - This is likely because it was clustered with another First representative.")
|
|
192
162
|
|
|
193
163
|
if options.reclustered != None:
|
|
194
|
-
############################# Calculate First and Reclustered-Second
|
|
195
|
-
if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
|
|
196
|
-
calc_single_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
|
|
197
|
-
elif numbers[0] > 1 and numbers[3] >= 1: # If unique Secondss combined multiple Firsts
|
|
198
|
-
calc_multi_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
|
|
199
|
-
elif numbers[4] >= 1:
|
|
200
|
-
Number_Of_StORF_Extending_But_Same_Genomes += 1
|
|
201
|
-
|
|
202
164
|
combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
|
|
203
165
|
combined_pangenome_clusters_Second_Type = defaultdict(list)
|
|
204
166
|
for cluster, genomes in combined_pangenome_clusters_Second.items():
|
|
@@ -207,52 +169,73 @@ def cluster(options):
|
|
|
207
169
|
else:
|
|
208
170
|
combined_pangenome_clusters_ONLY_Second_Type[cluster] = [cluster, len(genomes)]
|
|
209
171
|
for cluster, data in combined_pangenome_clusters_Second_Type.items():
|
|
210
|
-
if data[1] >=1:
|
|
172
|
+
if data[1] >= 1:
|
|
211
173
|
calc_Second_only_core(cluster, cores, data[1])
|
|
212
174
|
for cluster, data in combined_pangenome_clusters_ONLY_Second_Type.items():
|
|
213
|
-
if data[1] >= 1
|
|
175
|
+
if data[1] >= 1:
|
|
214
176
|
calc_only_Second_only_core(cluster, cores, data[1])
|
|
215
177
|
###########################
|
|
216
178
|
### Output
|
|
217
|
-
key_order = list(cores.keys())
|
|
218
179
|
output_path = os.path.abspath(options.output_dir)
|
|
180
|
+
if not os.path.exists(output_path):
|
|
181
|
+
os.makedirs(output_path)
|
|
219
182
|
stats_out = os.path.join(output_path,'summary_statistics.txt')
|
|
183
|
+
key_order = list(cores.keys())
|
|
220
184
|
with open(stats_out,'w') as outfile:
|
|
221
185
|
print("Genus Groups:")
|
|
222
186
|
outfile.write("Genus Groups:\n")
|
|
223
187
|
for key in key_order:
|
|
224
188
|
print(key+':\t'+str(len(cores[key])))
|
|
225
189
|
outfile.write(key + ':\t' + str(len(cores[key]))+'\n')
|
|
226
|
-
print("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
|
|
190
|
+
print("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
|
|
227
191
|
outfile.write("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
|
|
192
|
+
if options.reclustered!= None:
|
|
193
|
+
print("Total Number of Second Gene Groups (Including Singletons): " + str(
|
|
194
|
+
len(combined_pangenome_clusters_Second_sequences)))
|
|
195
|
+
print("Total Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
|
|
196
|
+
Number_Of_Second_Extending_But_Same_Genomes))
|
|
197
|
+
outfile.write("\nTotal Number of Second Gene Groups (Including Singletons): " + str(
|
|
198
|
+
len(combined_pangenome_clusters_Second_sequences)))
|
|
199
|
+
outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
|
|
200
|
+
Number_Of_Second_Extending_But_Same_Genomes))
|
|
228
201
|
|
|
229
202
|
if options.gene_presence_absence_out != None:
|
|
230
203
|
gene_presence_absence_output(options,genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
|
|
231
204
|
|
|
232
|
-
if options.
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
os.
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
205
|
+
if options.run_mode == 'Full':
|
|
206
|
+
if options.reclustered == None:
|
|
207
|
+
combined_pangenome_clusters_Second_sequences = None
|
|
208
|
+
if options.write_groups != None:
|
|
209
|
+
print("Outputting gene group FASTA files")
|
|
210
|
+
sequences = read_fasta(options.fasta)
|
|
211
|
+
#output_dir = os.path.dirname(os.path.abspath(options.output_dir))
|
|
212
|
+
output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
|
|
213
|
+
write_groups(options,output_dir, key_order, cores, sequences,
|
|
214
|
+
pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
|
|
215
|
+
|
|
216
|
+
elif options.run_mode == 'Partial':
|
|
217
|
+
if options.reclustered == None:
|
|
218
|
+
combined_pangenome_clusters_Second_sequences = None
|
|
219
|
+
if options.write_groups != None and options.fasta != None:
|
|
220
|
+
print("Outputting gene group FASTA files")
|
|
221
|
+
sequences = read_fasta(options.fasta)
|
|
222
|
+
#output_dir = os.path.dirname(os.path.abspath(options.output_dir))
|
|
223
|
+
output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
|
|
224
|
+
write_groups(options,output_dir, key_order, cores, sequences,
|
|
225
|
+
pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# if options.write_groups != None and options.fasta != None:
|
|
229
|
+
# sequences = read_fasta(options.fasta)
|
|
230
|
+
# output_dir = os.path.join(output_path, 'Gene_Families_Output')
|
|
231
|
+
#
|
|
232
|
+
# write_groups(options,output_dir, key_order, cores, sequences,
|
|
233
|
+
# pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
#!!# - Currently only align in Species Mode
|
|
237
|
+
#if options.align_core != None and options.fasta != None and options.write_groups != None:
|
|
238
|
+
# process_gene_families(options, os.path.join(output_path, 'Gene_Families_Output'), 'concatenated_genes_aligned.fasta')
|
|
256
239
|
|
|
257
240
|
|
|
258
241
|
|