PyamilySeq 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Constants.py +1 -1
- PyamilySeq/PyamilySeq.py +81 -39
- PyamilySeq/PyamilySeq_Genus.py +85 -102
- PyamilySeq/PyamilySeq_Species.py +101 -94
- PyamilySeq/Seq_Combiner.py +26 -7
- PyamilySeq/clusterings.py +111 -73
- PyamilySeq/utils.py +117 -7
- PyamilySeq-0.7.1.dist-info/METADATA +250 -0
- PyamilySeq-0.7.1.dist-info/RECORD +14 -0
- {PyamilySeq-0.6.0.dist-info → PyamilySeq-0.7.1.dist-info}/WHEEL +1 -1
- PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py +0 -600
- PyamilySeq-0.6.0.dist-info/METADATA +0 -147
- PyamilySeq-0.6.0.dist-info/RECORD +0 -15
- {PyamilySeq-0.6.0.dist-info → PyamilySeq-0.7.1.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.6.0.dist-info → PyamilySeq-0.7.1.dist-info}/entry_points.txt +0 -0
- {PyamilySeq-0.6.0.dist-info → PyamilySeq-0.7.1.dist-info}/top_level.txt +0 -0
PyamilySeq/Constants.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
PyamilySeq_Version = 'v0.
|
|
1
|
+
PyamilySeq_Version = 'v0.7.1'
|
|
2
2
|
|
PyamilySeq/PyamilySeq.py
CHANGED
|
@@ -20,9 +20,9 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
def run_cd_hit(input_file, clustering_output,
|
|
23
|
+
def run_cd_hit(options, input_file, clustering_output, clustering_mode):
|
|
24
24
|
cdhit_command = [
|
|
25
|
-
|
|
25
|
+
clustering_mode,
|
|
26
26
|
'-i', input_file,
|
|
27
27
|
'-o', clustering_output,
|
|
28
28
|
'-c', str(options.pident),
|
|
@@ -33,14 +33,15 @@ def run_cd_hit(input_file, clustering_output, options):
|
|
|
33
33
|
'-sc', "1",
|
|
34
34
|
'-sf', "1"
|
|
35
35
|
]
|
|
36
|
-
if options.verbose
|
|
36
|
+
if options.verbose != None:
|
|
37
37
|
subprocess.run(cdhit_command)
|
|
38
38
|
else:
|
|
39
39
|
subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
|
40
40
|
|
|
41
41
|
|
|
42
42
|
def main():
|
|
43
|
-
parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ':
|
|
43
|
+
parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': A tool that groups genes into unique clusters.')
|
|
44
|
+
vparser = argparse.ArgumentParser()
|
|
44
45
|
### Required Arguments
|
|
45
46
|
required = parser.add_argument_group('Required Arguments')
|
|
46
47
|
required.add_argument('-run_mode', action='store', dest='run_mode', choices=['Full','Partial'],
|
|
@@ -49,8 +50,8 @@ def main():
|
|
|
49
50
|
required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species', 'Genus'],
|
|
50
51
|
help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode? ',
|
|
51
52
|
required=True)
|
|
52
|
-
required.add_argument("-
|
|
53
|
-
help="Clustering
|
|
53
|
+
required.add_argument("-clustering_format", action="store", dest="clustering_format", choices=['CD-HIT','TSV','CSV'],
|
|
54
|
+
help="Clustering format to use: CD-HIT or TSV (MMseqs2, BLAST, DIAMOND) / CSV edge-list file (Node1\tNode2).",
|
|
54
55
|
required=True)
|
|
55
56
|
required.add_argument("-output_dir", action="store", dest="output_dir",
|
|
56
57
|
help="Directory for all output files.",
|
|
@@ -67,6 +68,12 @@ def main():
|
|
|
67
68
|
full_mode_args.add_argument("-name_split", action="store", dest="name_split",
|
|
68
69
|
help="substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').",
|
|
69
70
|
required=False)
|
|
71
|
+
full_mode_args.add_argument('-sequence_type', action='store', dest='sequence_type', default='DNA',choices=['AA', 'DNA'],
|
|
72
|
+
help='Default - DNA: Should clustering be performed in "DNA" or "AA" mode?',
|
|
73
|
+
required=False)
|
|
74
|
+
full_mode_args.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
|
|
75
|
+
help='Identifier used for extraction of sequences such as "misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo"',
|
|
76
|
+
required=False)
|
|
70
77
|
full_mode_args.add_argument("-pid", action="store", dest="pident", type=float, default=0.95,
|
|
71
78
|
help="Default 0.95: Pident threshold for clustering.",
|
|
72
79
|
required=False)
|
|
@@ -99,51 +106,67 @@ def main():
|
|
|
99
106
|
grouping_args.add_argument('-core_groups', action="store", dest='core_groups', default="99,95,15",
|
|
100
107
|
help='Default - (\'99,95,15\'): Gene family groups to use for "Species" mode',
|
|
101
108
|
required=False)
|
|
109
|
+
|
|
102
110
|
grouping_args.add_argument('-genus_groups', action="store", dest='genus_groups', default="1,2,3,4,5,6",
|
|
103
111
|
help='Default - (\'1,2,3,4,5,6\'): Gene family groups to use for "Genus" mode',
|
|
104
112
|
required=False)
|
|
105
113
|
|
|
106
114
|
###Output Arguments
|
|
107
115
|
output_args = parser.add_argument_group('Output Parameters')
|
|
108
|
-
output_args.add_argument('-w', action="store", dest='
|
|
109
|
-
help='Default - No output: Output sequences of identified
|
|
110
|
-
' - Must provide FASTA file with -
|
|
116
|
+
output_args.add_argument('-w', action="store", dest='write_groups', default=None,
|
|
117
|
+
help='Default - No output: Output sequences of identified groups (provide levels at which to output - Species "-w 99,95" Genus "-w 2,3"'
|
|
118
|
+
' - Must provide FASTA file with -original_fasta if in Partial run mode.',
|
|
111
119
|
required=False)
|
|
112
|
-
output_args.add_argument('-
|
|
113
|
-
help='Default - No output: Output aligned and concatinated sequences of identified
|
|
114
|
-
' - Must provide FASTA file with -
|
|
120
|
+
output_args.add_argument('-a', action="store_true", dest='align_core', default=None,
|
|
121
|
+
help='Default - No output: SLOW! (Only works for Species mode) Output aligned and concatinated sequences of identified groups -'
|
|
122
|
+
'provide group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in Partial'
|
|
123
|
+
'run mode.',
|
|
115
124
|
required=False)
|
|
116
125
|
output_args.add_argument('-original_fasta', action='store', dest='original_fasta',
|
|
117
126
|
help='FASTA file to use in conjunction with "-w" or "-con" when running in Partial Mode.',
|
|
118
127
|
required=False)
|
|
119
|
-
output_args.add_argument('-gpa', action='
|
|
120
|
-
|
|
128
|
+
output_args.add_argument('-gpa', action='store_true', dest='gene_presence_absence_out', default=None,
|
|
129
|
+
help='Default - False: If selected, a Roary/Panaroo formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
|
|
130
|
+
required=False)
|
|
121
131
|
|
|
122
132
|
### Misc Arguments
|
|
123
133
|
misc = parser.add_argument_group('Misc')
|
|
124
|
-
misc.add_argument('-verbose', action='
|
|
125
|
-
help='Default - False: Print out runtime messages',
|
|
134
|
+
misc.add_argument('-verbose', action='store_true', dest='verbose', default=None, help='Default - False: Print out runtime messages',
|
|
126
135
|
required = False)
|
|
127
|
-
|
|
136
|
+
|
|
137
|
+
### Version Arguments
|
|
138
|
+
version = vparser.add_argument_group('Version')
|
|
139
|
+
version.add_argument('-v', action='store_true', dest='version',
|
|
128
140
|
help='Default - False: Print out version number and exit',
|
|
129
141
|
required=False)
|
|
130
142
|
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
args, unknown = vparser.parse_known_args()
|
|
146
|
+
|
|
147
|
+
if args.version == True:
|
|
148
|
+
sys.exit("PyamilySeq version: "+PyamilySeq_Version)
|
|
149
|
+
|
|
131
150
|
options = parser.parse_args()
|
|
132
151
|
|
|
133
|
-
### Checking all required parameters are provided by user
|
|
152
|
+
### Checking all required parameters are provided by user #!!# Doesn't seem to work
|
|
134
153
|
if options.run_mode == 'Full':
|
|
154
|
+
|
|
135
155
|
if options.reclustered != None:
|
|
136
156
|
sys.exit("Currently reclustering only works on Partial Mode.")
|
|
137
|
-
required_full_mode = [options.input_type, options.input_dir, options.name_split, options.
|
|
157
|
+
required_full_mode = [options.input_type, options.input_dir, options.name_split, options.clustering_format,
|
|
138
158
|
options.pident, options.len_diff]
|
|
139
159
|
if all(required_full_mode):
|
|
140
160
|
# Proceed with the Full mode
|
|
141
161
|
pass
|
|
142
162
|
else:
|
|
143
163
|
missing_options = [opt for opt in
|
|
144
|
-
['input_type', 'input_dir', 'name_split', '
|
|
164
|
+
['input_type', 'input_dir', 'name_split', 'clustering_format', 'pident', 'len_diff'] if
|
|
145
165
|
not options.__dict__[opt]]
|
|
146
166
|
print(f"Missing required options for Full mode: {', '.join(missing_options)}")
|
|
167
|
+
if options.align_core != None:
|
|
168
|
+
if options.write_groups == None:
|
|
169
|
+
sys.exit('Must provide "-w" to output gene groups before alignment "-a" can be done.')
|
|
147
170
|
elif options.run_mode == 'Partial':
|
|
148
171
|
required_partial_mode = [options.cluster_file, ]
|
|
149
172
|
if all(required_partial_mode):
|
|
@@ -154,33 +177,37 @@ def main():
|
|
|
154
177
|
['cluster_file',] if
|
|
155
178
|
not options.__dict__[opt]]
|
|
156
179
|
print(f"Missing required options for Partial mode: {', '.join(missing_options)}")
|
|
180
|
+
if options.align_core != None:
|
|
181
|
+
if options.write_groups == None or options.original_fasta == None:
|
|
182
|
+
sys.exit('Must provide "-w" and "-original_fasta" to output gene groups before alignment "-a" can be done.')
|
|
157
183
|
|
|
158
|
-
if options.
|
|
184
|
+
if options.clustering_format == 'CD-HIT':
|
|
159
185
|
clust_affix = '.clstr'
|
|
160
|
-
elif options.
|
|
186
|
+
elif options.clustering_format == 'TSV':
|
|
161
187
|
clust_affix = '.tsv'
|
|
162
|
-
elif options.
|
|
188
|
+
elif options.clustering_format == 'CSV':
|
|
163
189
|
clust_affix = '.csv'
|
|
164
190
|
|
|
165
191
|
|
|
166
192
|
|
|
193
|
+
|
|
167
194
|
###External tool checks:
|
|
168
195
|
##MAFFT
|
|
169
|
-
if options.
|
|
196
|
+
if options.align_core == True:
|
|
170
197
|
if is_tool_installed('mafft'):
|
|
171
|
-
if options.verbose
|
|
198
|
+
if options.verbose != None:
|
|
172
199
|
print("mafft is installed. Proceeding with alignment.")
|
|
173
200
|
else:
|
|
174
201
|
exit("mafft is not installed. Please install mafft to proceed.")
|
|
175
202
|
##CD-HIT
|
|
176
|
-
if options.
|
|
203
|
+
if options.clustering_format == 'CD-HIT' and options.run_mode == 'Full':
|
|
177
204
|
if is_tool_installed('cd-hit'):
|
|
178
|
-
if options.verbose
|
|
205
|
+
if options.verbose != None:
|
|
179
206
|
print("cd-hit is installed. Proceeding with clustering.")
|
|
180
207
|
else:
|
|
181
208
|
exit("cd-hit is not installed. Please install cd-hit to proceed.")
|
|
182
209
|
|
|
183
|
-
if options.
|
|
210
|
+
if options.write_groups != None and options.original_fasta == False:
|
|
184
211
|
exit("-fasta must br provided if -w is used")
|
|
185
212
|
|
|
186
213
|
|
|
@@ -197,35 +224,48 @@ def main():
|
|
|
197
224
|
|
|
198
225
|
output_path = os.path.abspath(options.output_dir)
|
|
199
226
|
combined_out_file = os.path.join(output_path, "combined_sequences.fasta")
|
|
200
|
-
clustering_output = os.path.join(output_path, 'clustering_' + options.
|
|
227
|
+
clustering_output = os.path.join(output_path, 'clustering_' + options.clustering_format)
|
|
201
228
|
|
|
202
229
|
if options.group_type == 'Species':
|
|
203
230
|
options.core_groups = options.core_groups + ',0'
|
|
204
231
|
groups_to_use = options.core_groups
|
|
205
|
-
|
|
232
|
+
elif options.group_type == 'Genus':
|
|
206
233
|
options.genus_groups = options.genus_groups + ',>'
|
|
207
234
|
groups_to_use = options.genus_groups
|
|
235
|
+
if options.align_core != None:
|
|
236
|
+
sys.exit("-a align_core not a valid option in Genus mode.")
|
|
208
237
|
|
|
209
238
|
|
|
210
239
|
if options.run_mode == 'Full':
|
|
240
|
+
if not os.path.exists(output_path):
|
|
241
|
+
os.makedirs(output_path)
|
|
242
|
+
if options.sequence_type == 'AA':
|
|
243
|
+
clustering_mode = 'cd-hit'
|
|
244
|
+
translate = True
|
|
245
|
+
elif options.sequence_type == 'DNA':
|
|
246
|
+
clustering_mode = 'cd-hit-est'
|
|
247
|
+
translate = False
|
|
211
248
|
if options.input_type == 'separate':
|
|
212
|
-
read_separate_files(options.input_dir, options.name_split, combined_out_file)
|
|
249
|
+
read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
|
|
213
250
|
else:
|
|
214
|
-
read_combined_files(options.input_dir, options.name_split, combined_out_file)
|
|
251
|
+
read_combined_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
|
|
215
252
|
|
|
216
|
-
|
|
253
|
+
if options.clustering_format == 'CD-HIT':
|
|
254
|
+
run_cd_hit(options, combined_out_file, clustering_output, clustering_mode)
|
|
217
255
|
|
|
218
256
|
class clustering_options:
|
|
219
257
|
def __init__(self):
|
|
220
|
-
self.
|
|
258
|
+
self.run_mode = options.run_mode
|
|
259
|
+
self.cluster_format = options.clustering_format
|
|
260
|
+
self.sequence_type = options.sequence_type
|
|
221
261
|
self.reclustered = options.reclustered
|
|
222
262
|
self.sequence_tag = options.sequence_tag
|
|
223
263
|
self.core_groups = groups_to_use
|
|
224
264
|
self.clusters = clustering_output + clust_affix
|
|
225
265
|
self.output_dir = options.output_dir
|
|
226
266
|
self.gene_presence_absence_out = options.gene_presence_absence_out
|
|
227
|
-
self.
|
|
228
|
-
self.
|
|
267
|
+
self.write_groups = options.write_groups
|
|
268
|
+
self.align_core = options.align_core
|
|
229
269
|
self.fasta = combined_out_file
|
|
230
270
|
self.verbose = options.verbose
|
|
231
271
|
|
|
@@ -234,15 +274,16 @@ def main():
|
|
|
234
274
|
elif options.run_mode == 'Partial':
|
|
235
275
|
class clustering_options:
|
|
236
276
|
def __init__(self):
|
|
237
|
-
self.
|
|
277
|
+
self.run_mode = options.run_mode
|
|
278
|
+
self.cluster_format = options.clustering_format
|
|
238
279
|
self.reclustered = options.reclustered
|
|
239
280
|
self.sequence_tag = options.sequence_tag
|
|
240
281
|
self.core_groups = groups_to_use
|
|
241
282
|
self.clusters = options.cluster_file
|
|
242
283
|
self.output_dir = options.output_dir
|
|
243
284
|
self.gene_presence_absence_out = options.gene_presence_absence_out
|
|
244
|
-
self.
|
|
245
|
-
self.
|
|
285
|
+
self.write_groups = options.write_groups
|
|
286
|
+
self.align_core = options.align_core
|
|
246
287
|
self.fasta = options.original_fasta
|
|
247
288
|
self.verbose = options.verbose
|
|
248
289
|
|
|
@@ -258,4 +299,5 @@ def main():
|
|
|
258
299
|
"Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
|
|
259
300
|
|
|
260
301
|
if __name__ == "__main__":
|
|
302
|
+
#print("Running PyamilySeq "+PyamilySeq_Version)
|
|
261
303
|
main()
|
PyamilySeq/PyamilySeq_Genus.py
CHANGED
|
@@ -1,10 +1,5 @@
|
|
|
1
1
|
#from line_profiler_pycharm import profile
|
|
2
2
|
|
|
3
|
-
import copy
|
|
4
|
-
import sys
|
|
5
|
-
import math
|
|
6
|
-
from collections import Counter
|
|
7
|
-
|
|
8
3
|
|
|
9
4
|
try:
|
|
10
5
|
from .Constants import *
|
|
@@ -16,45 +11,6 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
|
16
11
|
from utils import *
|
|
17
12
|
|
|
18
13
|
|
|
19
|
-
|
|
20
|
-
def process_gene_families(options, directory, output_file):
|
|
21
|
-
"""Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
|
|
22
|
-
concatenated_sequences = {}
|
|
23
|
-
output_file = directory.replace('Gene_Families_Output',output_file)
|
|
24
|
-
|
|
25
|
-
# Iterate over each gene family file
|
|
26
|
-
for gene_file in os.listdir(directory):
|
|
27
|
-
if gene_file.endswith('.fasta'):
|
|
28
|
-
gene_path = os.path.join(directory, gene_file)
|
|
29
|
-
|
|
30
|
-
# Read sequences from the gene family file
|
|
31
|
-
sequences = read_fasta(gene_path)
|
|
32
|
-
|
|
33
|
-
# Select the longest sequence for each genome
|
|
34
|
-
longest_sequences = select_longest_gene(sequences)
|
|
35
|
-
|
|
36
|
-
# Run mafft on the longest sequences
|
|
37
|
-
aligned_file = f"{gene_file}_aligned.fasta"
|
|
38
|
-
run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
|
|
39
|
-
|
|
40
|
-
# Read aligned sequences and concatenate them
|
|
41
|
-
aligned_sequences = read_fasta(aligned_file)
|
|
42
|
-
for genome, aligned_seq in aligned_sequences.items():
|
|
43
|
-
genome_name = genome.split('|')[0]
|
|
44
|
-
if genome_name not in concatenated_sequences:
|
|
45
|
-
concatenated_sequences[genome_name] = ""
|
|
46
|
-
concatenated_sequences[genome_name] += aligned_seq
|
|
47
|
-
|
|
48
|
-
# Clean up aligned file
|
|
49
|
-
os.remove(aligned_file)
|
|
50
|
-
|
|
51
|
-
# Write the concatenated sequences to the output file
|
|
52
|
-
with open(output_file, 'w') as out:
|
|
53
|
-
for genome, sequence in concatenated_sequences.items():
|
|
54
|
-
out.write(f">{genome}\n")
|
|
55
|
-
wrapped_sequence = wrap_sequence(sequence, 60)
|
|
56
|
-
out.write(f"{wrapped_sequence}\n")
|
|
57
|
-
|
|
58
14
|
def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
|
|
59
15
|
print("Outputting gene_presence_absence file")
|
|
60
16
|
output_dir = os.path.abspath(options.output_dir)
|
|
@@ -99,7 +55,7 @@ def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_s
|
|
|
99
55
|
|
|
100
56
|
|
|
101
57
|
|
|
102
|
-
def get_cores(options
|
|
58
|
+
def get_cores(options):
|
|
103
59
|
##Calculate core groups
|
|
104
60
|
groups = OrderedDict()
|
|
105
61
|
cores = OrderedDict()
|
|
@@ -117,27 +73,26 @@ def get_cores(options,genus_dict):
|
|
|
117
73
|
cores[only_second_core_group] = []
|
|
118
74
|
return cores, groups
|
|
119
75
|
|
|
120
|
-
|
|
121
76
|
#@profile
|
|
122
|
-
def calc_First_only_core(cluster,
|
|
77
|
+
def calc_First_only_core(cluster, First_num, cores):
|
|
123
78
|
try:
|
|
124
|
-
cores['First_genera_'+str(
|
|
79
|
+
cores['First_genera_' + str(First_num)].append(cluster)
|
|
125
80
|
except KeyError:
|
|
126
81
|
cores['First_genera_>'].append(cluster)
|
|
127
82
|
#@profile
|
|
128
83
|
def calc_single_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count gene families extended with StORFs
|
|
129
84
|
group = First_num + Second_num
|
|
130
85
|
try:
|
|
131
|
-
cores['extended_genera_' + group].append(cluster)
|
|
86
|
+
cores['extended_genera_' + str(group)].append(cluster)
|
|
132
87
|
except KeyError:
|
|
133
88
|
cores['extended_genera_>'].append(cluster)
|
|
134
89
|
#@profile
|
|
135
90
|
def calc_multi_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
|
|
136
91
|
group = First_num + Second_num
|
|
137
92
|
try:
|
|
138
|
-
cores['combined_genera_' + group].append(cluster)
|
|
93
|
+
cores['combined_genera_' + str(group)].append(cluster)
|
|
139
94
|
except KeyError:
|
|
140
|
-
cores['combined_genera_>'
|
|
95
|
+
cores['combined_genera_>'].append(cluster)
|
|
141
96
|
#@profile
|
|
142
97
|
def calc_Second_only_core(cluster, cores, Second_num):
|
|
143
98
|
try:
|
|
@@ -157,28 +112,26 @@ def calc_only_Second_only_core(cluster, cores, Second_num): # only count the tru
|
|
|
157
112
|
def cluster(options):
|
|
158
113
|
|
|
159
114
|
if options.cluster_format == 'CD-HIT':
|
|
160
|
-
genus_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '_')
|
|
161
|
-
elif options.cluster_format
|
|
162
|
-
genus_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '_')
|
|
163
|
-
|
|
115
|
+
genus_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '_')
|
|
116
|
+
elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
|
|
117
|
+
genus_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '_')
|
|
164
118
|
|
|
119
|
+
###
|
|
120
|
+
cores, groups = get_cores(options)
|
|
121
|
+
###
|
|
165
122
|
|
|
166
123
|
if options.reclustered != None:
|
|
167
|
-
|
|
168
124
|
if options.cluster_format == 'CD-HIT':
|
|
169
|
-
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_CDHIT(options, genus_dict, '_')
|
|
170
|
-
|
|
171
|
-
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_Edge_List(options, '_')
|
|
172
|
-
pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, '_')
|
|
125
|
+
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genus_dict, '_')
|
|
126
|
+
elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
|
|
127
|
+
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_Edge_List(options, '_')
|
|
128
|
+
pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, '_')
|
|
173
129
|
else:
|
|
174
|
-
|
|
175
130
|
pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)
|
|
176
131
|
|
|
177
|
-
###
|
|
178
|
-
cores, groups = get_cores(options, genus_dict)
|
|
179
|
-
###
|
|
180
132
|
|
|
181
|
-
|
|
133
|
+
|
|
134
|
+
Number_Of_Second_Extending_But_Same_Genomes = 0
|
|
182
135
|
|
|
183
136
|
sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
|
|
184
137
|
pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_first_keys)
|
|
@@ -186,19 +139,28 @@ def cluster(options):
|
|
|
186
139
|
pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)
|
|
187
140
|
|
|
188
141
|
print("Calculating Groups")
|
|
142
|
+
seen_groupings = []
|
|
189
143
|
for cluster, numbers in pangenome_clusters_Type_sorted.items():
|
|
190
144
|
############################### Calculate First only
|
|
191
|
-
|
|
145
|
+
cluster = str(cluster)
|
|
146
|
+
for grouping in numbers[2]: #!!# Could do with a more elegant solution
|
|
147
|
+
current_cluster = grouping[0].split(':')[0]
|
|
148
|
+
if current_cluster not in seen_groupings:
|
|
149
|
+
seen_groupings.append(current_cluster)
|
|
150
|
+
current_cluster_size = grouping[0].split(':')[1]
|
|
151
|
+
calc_First_only_core(current_cluster, current_cluster_size, cores)
|
|
152
|
+
############################# Calculate First and Reclustered-Second
|
|
153
|
+
if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
|
|
154
|
+
calc_single_First_extended_Second_only_core(cluster, numbers[1], cores, numbers[3])
|
|
155
|
+
elif numbers[0] > 1 and numbers[3] >= 1: # If unique Seconds combined multiple Firsts
|
|
156
|
+
calc_multi_First_extended_Second_only_core(cluster, numbers[1], cores, numbers[3])
|
|
157
|
+
elif numbers[4] >= 1:
|
|
158
|
+
Number_Of_Second_Extending_But_Same_Genomes += 1
|
|
159
|
+
else:
|
|
160
|
+
if options.verbose == True:
|
|
161
|
+
print("First cluster " + current_cluster + " already processed - This is likely because it was clustered with another First representative.")
|
|
192
162
|
|
|
193
163
|
if options.reclustered != None:
|
|
194
|
-
############################# Calculate First and Reclustered-Second
|
|
195
|
-
if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
|
|
196
|
-
calc_single_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
|
|
197
|
-
elif numbers[0] > 1 and numbers[3] >= 1: # If unique Secondss combined multiple Firsts
|
|
198
|
-
calc_multi_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
|
|
199
|
-
elif numbers[4] >= 1:
|
|
200
|
-
Number_Of_StORF_Extending_But_Same_Genomes += 1
|
|
201
|
-
|
|
202
164
|
combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
|
|
203
165
|
combined_pangenome_clusters_Second_Type = defaultdict(list)
|
|
204
166
|
for cluster, genomes in combined_pangenome_clusters_Second.items():
|
|
@@ -207,52 +169,73 @@ def cluster(options):
|
|
|
207
169
|
else:
|
|
208
170
|
combined_pangenome_clusters_ONLY_Second_Type[cluster] = [cluster, len(genomes)]
|
|
209
171
|
for cluster, data in combined_pangenome_clusters_Second_Type.items():
|
|
210
|
-
if data[1] >=1:
|
|
172
|
+
if data[1] >= 1:
|
|
211
173
|
calc_Second_only_core(cluster, cores, data[1])
|
|
212
174
|
for cluster, data in combined_pangenome_clusters_ONLY_Second_Type.items():
|
|
213
|
-
if data[1] >= 1
|
|
175
|
+
if data[1] >= 1:
|
|
214
176
|
calc_only_Second_only_core(cluster, cores, data[1])
|
|
215
177
|
###########################
|
|
216
178
|
### Output
|
|
217
|
-
key_order = list(cores.keys())
|
|
218
179
|
output_path = os.path.abspath(options.output_dir)
|
|
180
|
+
if not os.path.exists(output_path):
|
|
181
|
+
os.makedirs(output_path)
|
|
219
182
|
stats_out = os.path.join(output_path,'summary_statistics.txt')
|
|
183
|
+
key_order = list(cores.keys())
|
|
220
184
|
with open(stats_out,'w') as outfile:
|
|
221
185
|
print("Genus Groups:")
|
|
222
186
|
outfile.write("Genus Groups:\n")
|
|
223
187
|
for key in key_order:
|
|
224
188
|
print(key+':\t'+str(len(cores[key])))
|
|
225
189
|
outfile.write(key + ':\t' + str(len(cores[key]))+'\n')
|
|
226
|
-
print("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
|
|
190
|
+
print("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
|
|
227
191
|
outfile.write("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
|
|
192
|
+
if options.reclustered!= None:
|
|
193
|
+
print("Total Number of Second Gene Groups (Including Singletons): " + str(
|
|
194
|
+
len(combined_pangenome_clusters_Second_sequences)))
|
|
195
|
+
print("Total Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
|
|
196
|
+
Number_Of_Second_Extending_But_Same_Genomes))
|
|
197
|
+
outfile.write("\nTotal Number of Second Gene Groups (Including Singletons): " + str(
|
|
198
|
+
len(combined_pangenome_clusters_Second_sequences)))
|
|
199
|
+
outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
|
|
200
|
+
Number_Of_Second_Extending_But_Same_Genomes))
|
|
228
201
|
|
|
229
202
|
if options.gene_presence_absence_out != None:
|
|
230
203
|
gene_presence_absence_output(options,genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
|
|
231
204
|
|
|
232
|
-
if options.
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
os.
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
205
|
+
if options.run_mode == 'Full':
|
|
206
|
+
if options.reclustered == None:
|
|
207
|
+
combined_pangenome_clusters_Second_sequences = None
|
|
208
|
+
if options.write_groups != None:
|
|
209
|
+
print("Outputting gene group FASTA files")
|
|
210
|
+
sequences = read_fasta(options.fasta)
|
|
211
|
+
#output_dir = os.path.dirname(os.path.abspath(options.output_dir))
|
|
212
|
+
output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
|
|
213
|
+
write_groups(options,output_dir, key_order, cores, sequences,
|
|
214
|
+
pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
|
|
215
|
+
|
|
216
|
+
elif options.run_mode == 'Partial':
|
|
217
|
+
if options.reclustered == None:
|
|
218
|
+
combined_pangenome_clusters_Second_sequences = None
|
|
219
|
+
if options.write_groups != None and options.fasta != None:
|
|
220
|
+
print("Outputting gene group FASTA files")
|
|
221
|
+
sequences = read_fasta(options.fasta)
|
|
222
|
+
#output_dir = os.path.dirname(os.path.abspath(options.output_dir))
|
|
223
|
+
output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
|
|
224
|
+
write_groups(options,output_dir, key_order, cores, sequences,
|
|
225
|
+
pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# if options.write_groups != None and options.fasta != None:
|
|
229
|
+
# sequences = read_fasta(options.fasta)
|
|
230
|
+
# output_dir = os.path.join(output_path, 'Gene_Families_Output')
|
|
231
|
+
#
|
|
232
|
+
# write_groups(options,output_dir, key_order, cores, sequences,
|
|
233
|
+
# pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
#!!# - Currently only align in Species Mode
|
|
237
|
+
#if options.align_core != None and options.fasta != None and options.write_groups != None:
|
|
238
|
+
# process_gene_families(options, os.path.join(output_path, 'Gene_Families_Output'), 'concatenated_genes_aligned.fasta')
|
|
256
239
|
|
|
257
240
|
|
|
258
241
|
|