PyamilySeq 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Constants.py +1 -1
- PyamilySeq/PyamilySeq.py +43 -19
- PyamilySeq/PyamilySeq_Genus.py +84 -484
- PyamilySeq/PyamilySeq_Species.py +63 -514
- PyamilySeq/clusterings.py +324 -0
- PyamilySeq/utils.py +84 -1
- {PyamilySeq-0.5.1.dist-info → PyamilySeq-0.6.0.dist-info}/METADATA +52 -68
- PyamilySeq-0.6.0.dist-info/RECORD +15 -0
- PyamilySeq-0.5.1.dist-info/RECORD +0 -14
- {PyamilySeq-0.5.1.dist-info → PyamilySeq-0.6.0.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.5.1.dist-info → PyamilySeq-0.6.0.dist-info}/WHEEL +0 -0
- {PyamilySeq-0.5.1.dist-info → PyamilySeq-0.6.0.dist-info}/entry_points.txt +0 -0
- {PyamilySeq-0.5.1.dist-info → PyamilySeq-0.6.0.dist-info}/top_level.txt +0 -0
PyamilySeq/Constants.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
PyamilySeq_Version = 'v0.5.1'
|
|
1
|
+
PyamilySeq_Version = 'v0.6.0'
|
|
2
2
|
|
PyamilySeq/PyamilySeq.py
CHANGED
|
@@ -7,11 +7,13 @@ import subprocess
|
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
try:
|
|
10
|
-
from .PyamilySeq_Species import cluster
|
|
10
|
+
from .PyamilySeq_Species import cluster as species_cluster
|
|
11
|
+
from .PyamilySeq_Genus import cluster as genus_cluster
|
|
11
12
|
from .Constants import *
|
|
12
13
|
from .utils import *
|
|
13
14
|
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
14
|
-
from PyamilySeq_Species import cluster
|
|
15
|
+
from PyamilySeq_Species import cluster as species_cluster
|
|
16
|
+
from PyamilySeq_Genus import cluster as genus_cluster
|
|
15
17
|
from Constants import *
|
|
16
18
|
from utils import *
|
|
17
19
|
|
|
@@ -25,7 +27,8 @@ def run_cd_hit(input_file, clustering_output, options):
|
|
|
25
27
|
'-o', clustering_output,
|
|
26
28
|
'-c', str(options.pident),
|
|
27
29
|
'-s', str(options.len_diff),
|
|
28
|
-
'-T',
|
|
30
|
+
'-T', str(options.clustering_threads),
|
|
31
|
+
'-M', str(options.clustering_memory),
|
|
29
32
|
'-d', "0",
|
|
30
33
|
'-sc', "1",
|
|
31
34
|
'-sf', "1"
|
|
@@ -43,8 +46,8 @@ def main():
|
|
|
43
46
|
required.add_argument('-run_mode', action='store', dest='run_mode', choices=['Full','Partial'],
|
|
44
47
|
help='Run Mode: Should PyamilySeq be run in "Full" or "Partial" mode?',
|
|
45
48
|
required=True)
|
|
46
|
-
required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species'],
|
|
47
|
-
help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode?
|
|
49
|
+
required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species', 'Genus'],
|
|
50
|
+
help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode? ',
|
|
48
51
|
required=True)
|
|
49
52
|
required.add_argument("-clust_tool", action="store", dest="clust_tool", choices=['CD-HIT'],
|
|
50
53
|
help="Clustering tool to use: CD-HIT, DIAMOND, BLAST or MMseqs2.",
|
|
@@ -70,7 +73,14 @@ def main():
|
|
|
70
73
|
full_mode_args.add_argument("-len_diff", action="store", dest="len_diff", type=float, default=0.80,
|
|
71
74
|
help="Default 0.80: Minimum length difference between clustered sequences - (-s) threshold for CD-HIT clustering.",
|
|
72
75
|
required=False)
|
|
73
|
-
|
|
76
|
+
###Clustering Arguments
|
|
77
|
+
clustering_args = parser.add_argument_group('Clustering Runtime Arguments - Optional when "-run_mode Full" is used')
|
|
78
|
+
clustering_args.add_argument("-mem", action="store", dest="clustering_memory", type=int, default=4000,
|
|
79
|
+
help="Default 4000: Memory to be allocated for clustering (in MBs).",
|
|
80
|
+
required=False)
|
|
81
|
+
clustering_args.add_argument("-t", action="store", dest="clustering_threads", type=int, default=4,
|
|
82
|
+
help="Default 4: Threads to be allocated for clustering.",
|
|
83
|
+
required=False)
|
|
74
84
|
|
|
75
85
|
###Partial-Mode Arguments
|
|
76
86
|
partial_mode_args = parser.add_argument_group('Partial-Mode Arguments - Required when "-run_mode Partial" is used')
|
|
@@ -80,13 +90,17 @@ def main():
|
|
|
80
90
|
|
|
81
91
|
###Grouping Arguments
|
|
82
92
|
grouping_args = parser.add_argument_group('Grouping Arguments - Use to fine-tune grouping of genes after clustering')
|
|
83
|
-
grouping_args.add_argument('-reclustered', action='store', dest='reclustered',
|
|
93
|
+
grouping_args.add_argument('-reclustered', action='store', dest='reclustered',
|
|
94
|
+
help='Currently only works on Partial Mode: Clustering output file from secondary round of clustering.',
|
|
84
95
|
required=False)
|
|
85
96
|
grouping_args.add_argument('-seq_tag', action='store', dest='sequence_tag', default='StORF',
|
|
86
97
|
help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
|
|
87
98
|
required=False)
|
|
88
|
-
grouping_args.add_argument('-
|
|
89
|
-
help='Default - (\'99,95,15\'): Gene family groups to use',
|
|
99
|
+
grouping_args.add_argument('-core_groups', action="store", dest='core_groups', default="99,95,15",
|
|
100
|
+
help='Default - (\'99,95,15\'): Gene family groups to use for "Species" mode',
|
|
101
|
+
required=False)
|
|
102
|
+
grouping_args.add_argument('-genus_groups', action="store", dest='genus_groups', default="1,2,3,4,5,6",
|
|
103
|
+
help='Default - (\'1,2,3,4,5,6\'): Gene family groups to use for "Genus" mode',
|
|
90
104
|
required=False)
|
|
91
105
|
|
|
92
106
|
###Output Arguments
|
|
@@ -118,6 +132,8 @@ def main():
|
|
|
118
132
|
|
|
119
133
|
### Checking all required parameters are provided by user
|
|
120
134
|
if options.run_mode == 'Full':
|
|
135
|
+
if options.reclustered != None:
|
|
136
|
+
sys.exit("Currently reclustering only works on Partial Mode.")
|
|
121
137
|
required_full_mode = [options.input_type, options.input_dir, options.name_split, options.clust_tool,
|
|
122
138
|
options.pident, options.len_diff]
|
|
123
139
|
if all(required_full_mode):
|
|
@@ -157,7 +173,7 @@ def main():
|
|
|
157
173
|
else:
|
|
158
174
|
exit("mafft is not installed. Please install mafft to proceed.")
|
|
159
175
|
##CD-HIT
|
|
160
|
-
if options.clust_tool == 'CD-HIT':
|
|
176
|
+
if options.clust_tool == 'CD-HIT' and options.run_mode == 'Full':
|
|
161
177
|
if is_tool_installed('cd-hit'):
|
|
162
178
|
if options.verbose == True:
|
|
163
179
|
print("cd-hit is installed. Proceeding with clustering.")
|
|
@@ -167,7 +183,7 @@ def main():
|
|
|
167
183
|
if options.write_families != None and options.original_fasta == False:
|
|
168
184
|
exit("-fasta must br provided if -w is used")
|
|
169
185
|
|
|
170
|
-
|
|
186
|
+
|
|
171
187
|
|
|
172
188
|
|
|
173
189
|
if options.cluster_file:
|
|
@@ -183,24 +199,30 @@ def main():
|
|
|
183
199
|
combined_out_file = os.path.join(output_path, "combined_sequences.fasta")
|
|
184
200
|
clustering_output = os.path.join(output_path, 'clustering_' + options.clust_tool)
|
|
185
201
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
202
|
+
if options.group_type == 'Species':
|
|
203
|
+
options.core_groups = options.core_groups + ',0'
|
|
204
|
+
groups_to_use = options.core_groups
|
|
205
|
+
else:
|
|
206
|
+
options.genus_groups = options.genus_groups + ',>'
|
|
207
|
+
groups_to_use = options.genus_groups
|
|
189
208
|
|
|
190
209
|
|
|
210
|
+
if options.run_mode == 'Full':
|
|
191
211
|
if options.input_type == 'separate':
|
|
192
212
|
read_separate_files(options.input_dir, options.name_split, combined_out_file)
|
|
193
213
|
else:
|
|
194
214
|
read_combined_files(options.input_dir, options.name_split, combined_out_file)
|
|
195
215
|
|
|
196
216
|
run_cd_hit(combined_out_file, clustering_output, options)
|
|
217
|
+
|
|
197
218
|
class clustering_options:
|
|
198
219
|
def __init__(self):
|
|
199
220
|
self.cluster_format = options.clust_tool
|
|
200
221
|
self.reclustered = options.reclustered
|
|
201
222
|
self.sequence_tag = options.sequence_tag
|
|
202
|
-
self.core_groups =
|
|
223
|
+
self.core_groups = groups_to_use
|
|
203
224
|
self.clusters = clustering_output + clust_affix
|
|
225
|
+
self.output_dir = options.output_dir
|
|
204
226
|
self.gene_presence_absence_out = options.gene_presence_absence_out
|
|
205
227
|
self.write_families = options.write_families
|
|
206
228
|
self.con_core = options.con_core
|
|
@@ -215,8 +237,9 @@ def main():
|
|
|
215
237
|
self.cluster_format = options.clust_tool
|
|
216
238
|
self.reclustered = options.reclustered
|
|
217
239
|
self.sequence_tag = options.sequence_tag
|
|
218
|
-
self.core_groups =
|
|
240
|
+
self.core_groups = groups_to_use
|
|
219
241
|
self.clusters = options.cluster_file
|
|
242
|
+
self.output_dir = options.output_dir
|
|
220
243
|
self.gene_presence_absence_out = options.gene_presence_absence_out
|
|
221
244
|
self.write_families = options.write_families
|
|
222
245
|
self.con_core = options.con_core
|
|
@@ -226,9 +249,10 @@ def main():
|
|
|
226
249
|
clustering_options = clustering_options()
|
|
227
250
|
|
|
228
251
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
252
|
+
if options.group_type == 'Species':
|
|
253
|
+
species_cluster(clustering_options)
|
|
254
|
+
elif options.group_type == 'Genus':
|
|
255
|
+
genus_cluster((clustering_options))
|
|
232
256
|
|
|
233
257
|
print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
|
|
234
258
|
"Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
|