PyamilySeq 0.5.2__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Constants.py +1 -1
- PyamilySeq/PyamilySeq.py +33 -17
- PyamilySeq/PyamilySeq_Genus.py +84 -484
- PyamilySeq/PyamilySeq_Species.py +63 -514
- PyamilySeq/clusterings.py +324 -0
- PyamilySeq/utils.py +84 -1
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.6.0.dist-info}/METADATA +13 -10
- PyamilySeq-0.6.0.dist-info/RECORD +15 -0
- PyamilySeq-0.5.2.dist-info/RECORD +0 -14
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.6.0.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.6.0.dist-info}/WHEEL +0 -0
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.6.0.dist-info}/entry_points.txt +0 -0
- {PyamilySeq-0.5.2.dist-info → PyamilySeq-0.6.0.dist-info}/top_level.txt +0 -0
PyamilySeq/Constants.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
PyamilySeq_Version = 'v0.5.2'
|
|
1
|
+
PyamilySeq_Version = 'v0.6.0'
|
|
2
2
|
|
PyamilySeq/PyamilySeq.py
CHANGED
|
@@ -7,11 +7,13 @@ import subprocess
|
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
try:
|
|
10
|
-
from .PyamilySeq_Species import cluster
|
|
10
|
+
from .PyamilySeq_Species import cluster as species_cluster
|
|
11
|
+
from .PyamilySeq_Genus import cluster as genus_cluster
|
|
11
12
|
from .Constants import *
|
|
12
13
|
from .utils import *
|
|
13
14
|
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
14
|
-
from PyamilySeq_Species import cluster
|
|
15
|
+
from PyamilySeq_Species import cluster as species_cluster
|
|
16
|
+
from PyamilySeq_Genus import cluster as genus_cluster
|
|
15
17
|
from Constants import *
|
|
16
18
|
from utils import *
|
|
17
19
|
|
|
@@ -44,8 +46,8 @@ def main():
|
|
|
44
46
|
required.add_argument('-run_mode', action='store', dest='run_mode', choices=['Full','Partial'],
|
|
45
47
|
help='Run Mode: Should PyamilySeq be run in "Full" or "Partial" mode?',
|
|
46
48
|
required=True)
|
|
47
|
-
required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species'],
|
|
48
|
-
help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode?
|
|
49
|
+
required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species', 'Genus'],
|
|
50
|
+
help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode? ',
|
|
49
51
|
required=True)
|
|
50
52
|
required.add_argument("-clust_tool", action="store", dest="clust_tool", choices=['CD-HIT'],
|
|
51
53
|
help="Clustering tool to use: CD-HIT, DIAMOND, BLAST or MMseqs2.",
|
|
@@ -88,13 +90,17 @@ def main():
|
|
|
88
90
|
|
|
89
91
|
###Grouping Arguments
|
|
90
92
|
grouping_args = parser.add_argument_group('Grouping Arguments - Use to fine-tune grouping of genes after clustering')
|
|
91
|
-
grouping_args.add_argument('-reclustered', action='store', dest='reclustered',
|
|
93
|
+
grouping_args.add_argument('-reclustered', action='store', dest='reclustered',
|
|
94
|
+
help='Currently only works on Partial Mode: Clustering output file from secondary round of clustering.',
|
|
92
95
|
required=False)
|
|
93
96
|
grouping_args.add_argument('-seq_tag', action='store', dest='sequence_tag', default='StORF',
|
|
94
97
|
help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
|
|
95
98
|
required=False)
|
|
96
|
-
grouping_args.add_argument('-
|
|
97
|
-
help='Default - (\'99,95,15\'): Gene family groups to use',
|
|
99
|
+
grouping_args.add_argument('-core_groups', action="store", dest='core_groups', default="99,95,15",
|
|
100
|
+
help='Default - (\'99,95,15\'): Gene family groups to use for "Species" mode',
|
|
101
|
+
required=False)
|
|
102
|
+
grouping_args.add_argument('-genus_groups', action="store", dest='genus_groups', default="1,2,3,4,5,6",
|
|
103
|
+
help='Default - (\'1,2,3,4,5,6\'): Gene family groups to use for "Genus" mode',
|
|
98
104
|
required=False)
|
|
99
105
|
|
|
100
106
|
###Output Arguments
|
|
@@ -126,6 +132,8 @@ def main():
|
|
|
126
132
|
|
|
127
133
|
### Checking all required parameters are provided by user
|
|
128
134
|
if options.run_mode == 'Full':
|
|
135
|
+
if options.reclustered != None:
|
|
136
|
+
sys.exit("Currently reclustering only works on Partial Mode.")
|
|
129
137
|
required_full_mode = [options.input_type, options.input_dir, options.name_split, options.clust_tool,
|
|
130
138
|
options.pident, options.len_diff]
|
|
131
139
|
if all(required_full_mode):
|
|
@@ -165,7 +173,7 @@ def main():
|
|
|
165
173
|
else:
|
|
166
174
|
exit("mafft is not installed. Please install mafft to proceed.")
|
|
167
175
|
##CD-HIT
|
|
168
|
-
if options.clust_tool == 'CD-HIT':
|
|
176
|
+
if options.clust_tool == 'CD-HIT' and options.run_mode == 'Full':
|
|
169
177
|
if is_tool_installed('cd-hit'):
|
|
170
178
|
if options.verbose == True:
|
|
171
179
|
print("cd-hit is installed. Proceeding with clustering.")
|
|
@@ -175,7 +183,7 @@ def main():
|
|
|
175
183
|
if options.write_families != None and options.original_fasta == False:
|
|
176
184
|
exit("-fasta must br provided if -w is used")
|
|
177
185
|
|
|
178
|
-
|
|
186
|
+
|
|
179
187
|
|
|
180
188
|
|
|
181
189
|
if options.cluster_file:
|
|
@@ -191,24 +199,30 @@ def main():
|
|
|
191
199
|
combined_out_file = os.path.join(output_path, "combined_sequences.fasta")
|
|
192
200
|
clustering_output = os.path.join(output_path, 'clustering_' + options.clust_tool)
|
|
193
201
|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
202
|
+
if options.group_type == 'Species':
|
|
203
|
+
options.core_groups = options.core_groups + ',0'
|
|
204
|
+
groups_to_use = options.core_groups
|
|
205
|
+
else:
|
|
206
|
+
options.genus_groups = options.genus_groups + ',>'
|
|
207
|
+
groups_to_use = options.genus_groups
|
|
197
208
|
|
|
198
209
|
|
|
210
|
+
if options.run_mode == 'Full':
|
|
199
211
|
if options.input_type == 'separate':
|
|
200
212
|
read_separate_files(options.input_dir, options.name_split, combined_out_file)
|
|
201
213
|
else:
|
|
202
214
|
read_combined_files(options.input_dir, options.name_split, combined_out_file)
|
|
203
215
|
|
|
204
216
|
run_cd_hit(combined_out_file, clustering_output, options)
|
|
217
|
+
|
|
205
218
|
class clustering_options:
|
|
206
219
|
def __init__(self):
|
|
207
220
|
self.cluster_format = options.clust_tool
|
|
208
221
|
self.reclustered = options.reclustered
|
|
209
222
|
self.sequence_tag = options.sequence_tag
|
|
210
|
-
self.core_groups =
|
|
223
|
+
self.core_groups = groups_to_use
|
|
211
224
|
self.clusters = clustering_output + clust_affix
|
|
225
|
+
self.output_dir = options.output_dir
|
|
212
226
|
self.gene_presence_absence_out = options.gene_presence_absence_out
|
|
213
227
|
self.write_families = options.write_families
|
|
214
228
|
self.con_core = options.con_core
|
|
@@ -223,8 +237,9 @@ def main():
|
|
|
223
237
|
self.cluster_format = options.clust_tool
|
|
224
238
|
self.reclustered = options.reclustered
|
|
225
239
|
self.sequence_tag = options.sequence_tag
|
|
226
|
-
self.core_groups =
|
|
240
|
+
self.core_groups = groups_to_use
|
|
227
241
|
self.clusters = options.cluster_file
|
|
242
|
+
self.output_dir = options.output_dir
|
|
228
243
|
self.gene_presence_absence_out = options.gene_presence_absence_out
|
|
229
244
|
self.write_families = options.write_families
|
|
230
245
|
self.con_core = options.con_core
|
|
@@ -234,9 +249,10 @@ def main():
|
|
|
234
249
|
clustering_options = clustering_options()
|
|
235
250
|
|
|
236
251
|
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
252
|
+
if options.group_type == 'Species':
|
|
253
|
+
species_cluster(clustering_options)
|
|
254
|
+
elif options.group_type == 'Genus':
|
|
255
|
+
genus_cluster((clustering_options))
|
|
240
256
|
|
|
241
257
|
print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
|
|
242
258
|
"Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
|