PyamilySeq 0.5.2__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Constants.py +1 -1
- PyamilySeq/PyamilySeq.py +100 -53
- PyamilySeq/PyamilySeq_Genus.py +139 -556
- PyamilySeq/PyamilySeq_Species.py +140 -584
- PyamilySeq/Seq_Combiner.py +26 -7
- PyamilySeq/clusterings.py +362 -0
- PyamilySeq/utils.py +199 -6
- PyamilySeq-0.7.0.dist-info/METADATA +251 -0
- PyamilySeq-0.7.0.dist-info/RECORD +14 -0
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.7.0.dist-info}/WHEEL +1 -1
- PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py +0 -600
- PyamilySeq-0.5.2.dist-info/METADATA +0 -144
- PyamilySeq-0.5.2.dist-info/RECORD +0 -14
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.7.0.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.7.0.dist-info}/entry_points.txt +0 -0
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.7.0.dist-info}/top_level.txt +0 -0
PyamilySeq/Constants.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
PyamilySeq_Version = 'v0.
|
|
1
|
+
PyamilySeq_Version = 'v0.7.0'
|
|
2
2
|
|
PyamilySeq/PyamilySeq.py
CHANGED
|
@@ -7,20 +7,22 @@ import subprocess
|
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
try:
|
|
10
|
-
from .PyamilySeq_Species import cluster
|
|
10
|
+
from .PyamilySeq_Species import cluster as species_cluster
|
|
11
|
+
from .PyamilySeq_Genus import cluster as genus_cluster
|
|
11
12
|
from .Constants import *
|
|
12
13
|
from .utils import *
|
|
13
14
|
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
14
|
-
from PyamilySeq_Species import cluster
|
|
15
|
+
from PyamilySeq_Species import cluster as species_cluster
|
|
16
|
+
from PyamilySeq_Genus import cluster as genus_cluster
|
|
15
17
|
from Constants import *
|
|
16
18
|
from utils import *
|
|
17
19
|
|
|
18
20
|
|
|
19
21
|
|
|
20
22
|
|
|
21
|
-
def run_cd_hit(input_file, clustering_output,
|
|
23
|
+
def run_cd_hit(options, input_file, clustering_output, clustering_mode):
|
|
22
24
|
cdhit_command = [
|
|
23
|
-
|
|
25
|
+
clustering_mode,
|
|
24
26
|
'-i', input_file,
|
|
25
27
|
'-o', clustering_output,
|
|
26
28
|
'-c', str(options.pident),
|
|
@@ -31,24 +33,24 @@ def run_cd_hit(input_file, clustering_output, options):
|
|
|
31
33
|
'-sc', "1",
|
|
32
34
|
'-sf', "1"
|
|
33
35
|
]
|
|
34
|
-
if options.verbose
|
|
36
|
+
if options.verbose != None:
|
|
35
37
|
subprocess.run(cdhit_command)
|
|
36
38
|
else:
|
|
37
39
|
subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
|
38
40
|
|
|
39
41
|
|
|
40
42
|
def main():
|
|
41
|
-
parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ':
|
|
43
|
+
parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': A tool that groups genes into unique clusters.')
|
|
42
44
|
### Required Arguments
|
|
43
45
|
required = parser.add_argument_group('Required Arguments')
|
|
44
46
|
required.add_argument('-run_mode', action='store', dest='run_mode', choices=['Full','Partial'],
|
|
45
47
|
help='Run Mode: Should PyamilySeq be run in "Full" or "Partial" mode?',
|
|
46
48
|
required=True)
|
|
47
|
-
required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species'],
|
|
48
|
-
help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode?
|
|
49
|
+
required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species', 'Genus'],
|
|
50
|
+
help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode? ',
|
|
49
51
|
required=True)
|
|
50
|
-
required.add_argument("-
|
|
51
|
-
help="Clustering
|
|
52
|
+
required.add_argument("-clustering_format", action="store", dest="clustering_format", choices=['CD-HIT','TSV','CSV'],
|
|
53
|
+
help="Clustering format to use: CD-HIT or TSV (MMseqs2, BLAST, DIAMOND) / CSV edge-list file (Node1\tNode2).",
|
|
52
54
|
required=True)
|
|
53
55
|
required.add_argument("-output_dir", action="store", dest="output_dir",
|
|
54
56
|
help="Directory for all output files.",
|
|
@@ -65,6 +67,12 @@ def main():
|
|
|
65
67
|
full_mode_args.add_argument("-name_split", action="store", dest="name_split",
|
|
66
68
|
help="substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').",
|
|
67
69
|
required=False)
|
|
70
|
+
full_mode_args.add_argument('-sequence_type', action='store', dest='sequence_type', default='DNA',choices=['AA', 'DNA'],
|
|
71
|
+
help='Default - DNA: Should clustering be performed in "DNA" or "AA" mode?',
|
|
72
|
+
required=False)
|
|
73
|
+
full_mode_args.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
|
|
74
|
+
help='Identifier used for extraction of sequences such as "misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo"',
|
|
75
|
+
required=False)
|
|
68
76
|
full_mode_args.add_argument("-pid", action="store", dest="pident", type=float, default=0.95,
|
|
69
77
|
help="Default 0.95: Pident threshold for clustering.",
|
|
70
78
|
required=False)
|
|
@@ -88,35 +96,41 @@ def main():
|
|
|
88
96
|
|
|
89
97
|
###Grouping Arguments
|
|
90
98
|
grouping_args = parser.add_argument_group('Grouping Arguments - Use to fine-tune grouping of genes after clustering')
|
|
91
|
-
grouping_args.add_argument('-reclustered', action='store', dest='reclustered',
|
|
99
|
+
grouping_args.add_argument('-reclustered', action='store', dest='reclustered',
|
|
100
|
+
help='Currently only works on Partial Mode: Clustering output file from secondary round of clustering.',
|
|
92
101
|
required=False)
|
|
93
102
|
grouping_args.add_argument('-seq_tag', action='store', dest='sequence_tag', default='StORF',
|
|
94
103
|
help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
|
|
95
104
|
required=False)
|
|
96
|
-
grouping_args.add_argument('-
|
|
97
|
-
help='Default - (\'99,95,15\'): Gene family groups to use',
|
|
105
|
+
grouping_args.add_argument('-core_groups', action="store", dest='core_groups', default="99,95,15",
|
|
106
|
+
help='Default - (\'99,95,15\'): Gene family groups to use for "Species" mode',
|
|
107
|
+
required=False)
|
|
108
|
+
|
|
109
|
+
grouping_args.add_argument('-genus_groups', action="store", dest='genus_groups', default="1,2,3,4,5,6",
|
|
110
|
+
help='Default - (\'1,2,3,4,5,6\'): Gene family groups to use for "Genus" mode',
|
|
98
111
|
required=False)
|
|
99
112
|
|
|
100
113
|
###Output Arguments
|
|
101
114
|
output_args = parser.add_argument_group('Output Parameters')
|
|
102
|
-
output_args.add_argument('-w', action="store", dest='
|
|
103
|
-
help='Default - No output: Output sequences of identified
|
|
104
|
-
' - Must provide FASTA file with -
|
|
115
|
+
output_args.add_argument('-w', action="store", dest='write_groups', default=None,
|
|
116
|
+
help='Default - No output: Output sequences of identified groups (provide levels at which to output - Species "-w 99,95" Genus "-w 2,3"'
|
|
117
|
+
' - Must provide FASTA file with -original_fasta if in Partial run mode.',
|
|
105
118
|
required=False)
|
|
106
|
-
output_args.add_argument('-
|
|
107
|
-
help='Default - No output: Output aligned and concatinated sequences of identified
|
|
108
|
-
' - Must provide FASTA file with -
|
|
119
|
+
output_args.add_argument('-a', action="store_true", dest='align_core', default=None,
|
|
120
|
+
help='Default - No output: SLOW! (Only works for Species mode) Output aligned and concatinated sequences of identified groups -'
|
|
121
|
+
'provide group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in Partial'
|
|
122
|
+
'run mode.',
|
|
109
123
|
required=False)
|
|
110
124
|
output_args.add_argument('-original_fasta', action='store', dest='original_fasta',
|
|
111
125
|
help='FASTA file to use in conjunction with "-w" or "-con" when running in Partial Mode.',
|
|
112
126
|
required=False)
|
|
113
|
-
output_args.add_argument('-gpa', action='
|
|
114
|
-
|
|
127
|
+
output_args.add_argument('-gpa', action='store_true', dest='gene_presence_absence_out', default=None,
|
|
128
|
+
help='Default - False: If selected, a Roary/Panaroo formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
|
|
129
|
+
required=False)
|
|
115
130
|
|
|
116
131
|
### Misc Arguments
|
|
117
132
|
misc = parser.add_argument_group('Misc')
|
|
118
|
-
misc.add_argument('-verbose', action='
|
|
119
|
-
help='Default - False: Print out runtime messages',
|
|
133
|
+
misc.add_argument('-verbose', action='store_true', dest='verbose', default=None, help='Default - False: Print out runtime messages',
|
|
120
134
|
required = False)
|
|
121
135
|
misc.add_argument('-v', action='store_true', dest='version',
|
|
122
136
|
help='Default - False: Print out version number and exit',
|
|
@@ -124,18 +138,24 @@ def main():
|
|
|
124
138
|
|
|
125
139
|
options = parser.parse_args()
|
|
126
140
|
|
|
127
|
-
### Checking all required parameters are provided by user
|
|
141
|
+
### Checking all required parameters are provided by user #!!# Doesn't seem to work
|
|
128
142
|
if options.run_mode == 'Full':
|
|
129
|
-
|
|
143
|
+
|
|
144
|
+
if options.reclustered != None:
|
|
145
|
+
sys.exit("Currently reclustering only works on Partial Mode.")
|
|
146
|
+
required_full_mode = [options.input_type, options.input_dir, options.name_split, options.clustering_format,
|
|
130
147
|
options.pident, options.len_diff]
|
|
131
148
|
if all(required_full_mode):
|
|
132
149
|
# Proceed with the Full mode
|
|
133
150
|
pass
|
|
134
151
|
else:
|
|
135
152
|
missing_options = [opt for opt in
|
|
136
|
-
['input_type', 'input_dir', 'name_split', '
|
|
153
|
+
['input_type', 'input_dir', 'name_split', 'clustering_format', 'pident', 'len_diff'] if
|
|
137
154
|
not options.__dict__[opt]]
|
|
138
155
|
print(f"Missing required options for Full mode: {', '.join(missing_options)}")
|
|
156
|
+
if options.align_core != None:
|
|
157
|
+
if options.write_groups == None:
|
|
158
|
+
sys.exit('Must provide "-w" to output gene groups before alignment "-a" can be done.')
|
|
139
159
|
elif options.run_mode == 'Partial':
|
|
140
160
|
required_partial_mode = [options.cluster_file, ]
|
|
141
161
|
if all(required_partial_mode):
|
|
@@ -146,36 +166,40 @@ def main():
|
|
|
146
166
|
['cluster_file',] if
|
|
147
167
|
not options.__dict__[opt]]
|
|
148
168
|
print(f"Missing required options for Partial mode: {', '.join(missing_options)}")
|
|
169
|
+
if options.align_core != None:
|
|
170
|
+
if options.write_groups == None or options.original_fasta == None:
|
|
171
|
+
sys.exit('Must provide "-w" and "-original_fasta" to output gene groups before alignment "-a" can be done.')
|
|
149
172
|
|
|
150
|
-
if options.
|
|
173
|
+
if options.clustering_format == 'CD-HIT':
|
|
151
174
|
clust_affix = '.clstr'
|
|
152
|
-
elif options.
|
|
175
|
+
elif options.clustering_format == 'TSV':
|
|
153
176
|
clust_affix = '.tsv'
|
|
154
|
-
elif options.
|
|
177
|
+
elif options.clustering_format == 'CSV':
|
|
155
178
|
clust_affix = '.csv'
|
|
156
179
|
|
|
157
180
|
|
|
158
181
|
|
|
182
|
+
|
|
159
183
|
###External tool checks:
|
|
160
184
|
##MAFFT
|
|
161
|
-
if options.
|
|
185
|
+
if options.align_core == True:
|
|
162
186
|
if is_tool_installed('mafft'):
|
|
163
|
-
if options.verbose
|
|
187
|
+
if options.verbose != None:
|
|
164
188
|
print("mafft is installed. Proceeding with alignment.")
|
|
165
189
|
else:
|
|
166
190
|
exit("mafft is not installed. Please install mafft to proceed.")
|
|
167
191
|
##CD-HIT
|
|
168
|
-
if options.
|
|
192
|
+
if options.clustering_format == 'CD-HIT' and options.run_mode == 'Full':
|
|
169
193
|
if is_tool_installed('cd-hit'):
|
|
170
|
-
if options.verbose
|
|
194
|
+
if options.verbose != None:
|
|
171
195
|
print("cd-hit is installed. Proceeding with clustering.")
|
|
172
196
|
else:
|
|
173
197
|
exit("cd-hit is not installed. Please install cd-hit to proceed.")
|
|
174
198
|
|
|
175
|
-
if options.
|
|
199
|
+
if options.write_groups != None and options.original_fasta == False:
|
|
176
200
|
exit("-fasta must br provided if -w is used")
|
|
177
201
|
|
|
178
|
-
|
|
202
|
+
|
|
179
203
|
|
|
180
204
|
|
|
181
205
|
if options.cluster_file:
|
|
@@ -189,29 +213,48 @@ def main():
|
|
|
189
213
|
|
|
190
214
|
output_path = os.path.abspath(options.output_dir)
|
|
191
215
|
combined_out_file = os.path.join(output_path, "combined_sequences.fasta")
|
|
192
|
-
clustering_output = os.path.join(output_path, 'clustering_' + options.
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
if options.run_mode == 'Full':
|
|
216
|
+
clustering_output = os.path.join(output_path, 'clustering_' + options.clustering_format)
|
|
196
217
|
|
|
218
|
+
if options.group_type == 'Species':
|
|
219
|
+
options.core_groups = options.core_groups + ',0'
|
|
220
|
+
groups_to_use = options.core_groups
|
|
221
|
+
elif options.group_type == 'Genus':
|
|
222
|
+
options.genus_groups = options.genus_groups + ',>'
|
|
223
|
+
groups_to_use = options.genus_groups
|
|
224
|
+
if options.align_core != None:
|
|
225
|
+
sys.exit("-a align_core not a valid option in Genus mode.")
|
|
197
226
|
|
|
198
227
|
|
|
228
|
+
if options.run_mode == 'Full':
|
|
229
|
+
if not os.path.exists(output_path):
|
|
230
|
+
os.makedirs(output_path)
|
|
231
|
+
if options.sequence_type == 'AA':
|
|
232
|
+
clustering_mode = 'cd-hit'
|
|
233
|
+
translate = True
|
|
234
|
+
elif options.sequence_type == 'DNA':
|
|
235
|
+
clustering_mode = 'cd-hit-est'
|
|
236
|
+
translate = False
|
|
199
237
|
if options.input_type == 'separate':
|
|
200
|
-
read_separate_files(options.input_dir, options.name_split, combined_out_file)
|
|
238
|
+
read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
|
|
201
239
|
else:
|
|
202
|
-
read_combined_files(options.input_dir, options.name_split, combined_out_file)
|
|
240
|
+
read_combined_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
|
|
241
|
+
|
|
242
|
+
if options.clustering_format == 'CD-HIT':
|
|
243
|
+
run_cd_hit(options, combined_out_file, clustering_output, clustering_mode)
|
|
203
244
|
|
|
204
|
-
run_cd_hit(combined_out_file, clustering_output, options)
|
|
205
245
|
class clustering_options:
|
|
206
246
|
def __init__(self):
|
|
207
|
-
self.
|
|
247
|
+
self.run_mode = options.run_mode
|
|
248
|
+
self.cluster_format = options.clustering_format
|
|
249
|
+
self.sequence_type = options.sequence_type
|
|
208
250
|
self.reclustered = options.reclustered
|
|
209
251
|
self.sequence_tag = options.sequence_tag
|
|
210
|
-
self.core_groups =
|
|
252
|
+
self.core_groups = groups_to_use
|
|
211
253
|
self.clusters = clustering_output + clust_affix
|
|
254
|
+
self.output_dir = options.output_dir
|
|
212
255
|
self.gene_presence_absence_out = options.gene_presence_absence_out
|
|
213
|
-
self.
|
|
214
|
-
self.
|
|
256
|
+
self.write_groups = options.write_groups
|
|
257
|
+
self.align_core = options.align_core
|
|
215
258
|
self.fasta = combined_out_file
|
|
216
259
|
self.verbose = options.verbose
|
|
217
260
|
|
|
@@ -220,26 +263,30 @@ def main():
|
|
|
220
263
|
elif options.run_mode == 'Partial':
|
|
221
264
|
class clustering_options:
|
|
222
265
|
def __init__(self):
|
|
223
|
-
self.
|
|
266
|
+
self.run_mode = options.run_mode
|
|
267
|
+
self.cluster_format = options.clustering_format
|
|
224
268
|
self.reclustered = options.reclustered
|
|
225
269
|
self.sequence_tag = options.sequence_tag
|
|
226
|
-
self.core_groups =
|
|
270
|
+
self.core_groups = groups_to_use
|
|
227
271
|
self.clusters = options.cluster_file
|
|
272
|
+
self.output_dir = options.output_dir
|
|
228
273
|
self.gene_presence_absence_out = options.gene_presence_absence_out
|
|
229
|
-
self.
|
|
230
|
-
self.
|
|
274
|
+
self.write_groups = options.write_groups
|
|
275
|
+
self.align_core = options.align_core
|
|
231
276
|
self.fasta = options.original_fasta
|
|
232
277
|
self.verbose = options.verbose
|
|
233
278
|
|
|
234
279
|
clustering_options = clustering_options()
|
|
235
280
|
|
|
236
281
|
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
282
|
+
if options.group_type == 'Species':
|
|
283
|
+
species_cluster(clustering_options)
|
|
284
|
+
elif options.group_type == 'Genus':
|
|
285
|
+
genus_cluster((clustering_options))
|
|
240
286
|
|
|
241
287
|
print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
|
|
242
288
|
"Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
|
|
243
289
|
|
|
244
290
|
if __name__ == "__main__":
|
|
291
|
+
print("Running PyamilySeq "+PyamilySeq_Version)
|
|
245
292
|
main()
|