PyamilySeq 0.5.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq/Constants.py CHANGED
@@ -1,2 +1,2 @@
1
- PyamilySeq_Version = 'v0.5.2'
1
+ PyamilySeq_Version = 'v0.7.0'
2
2
 
PyamilySeq/PyamilySeq.py CHANGED
@@ -7,20 +7,22 @@ import subprocess
7
7
 
8
8
 
9
9
  try:
10
- from .PyamilySeq_Species import cluster
10
+ from .PyamilySeq_Species import cluster as species_cluster
11
+ from .PyamilySeq_Genus import cluster as genus_cluster
11
12
  from .Constants import *
12
13
  from .utils import *
13
14
  except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
14
- from PyamilySeq_Species import cluster
15
+ from PyamilySeq_Species import cluster as species_cluster
16
+ from PyamilySeq_Genus import cluster as genus_cluster
15
17
  from Constants import *
16
18
  from utils import *
17
19
 
18
20
 
19
21
 
20
22
 
21
- def run_cd_hit(input_file, clustering_output, options):
23
+ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
22
24
  cdhit_command = [
23
- 'cd-hit-est',
25
+ clustering_mode,
24
26
  '-i', input_file,
25
27
  '-o', clustering_output,
26
28
  '-c', str(options.pident),
@@ -31,24 +33,24 @@ def run_cd_hit(input_file, clustering_output, options):
31
33
  '-sc', "1",
32
34
  '-sf', "1"
33
35
  ]
34
- if options.verbose == True:
36
+ if options.verbose != None:
35
37
  subprocess.run(cdhit_command)
36
38
  else:
37
39
  subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
38
40
 
39
41
 
40
42
  def main():
41
- parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': PyamilySeq Run Parameters.')
43
+ parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': A tool that groups genes into unique clusters.')
42
44
  ### Required Arguments
43
45
  required = parser.add_argument_group('Required Arguments')
44
46
  required.add_argument('-run_mode', action='store', dest='run_mode', choices=['Full','Partial'],
45
47
  help='Run Mode: Should PyamilySeq be run in "Full" or "Partial" mode?',
46
48
  required=True)
47
- required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species'],
48
- help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode? - Genus mode not currently functioning',
49
+ required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species', 'Genus'],
50
+ help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode? ',
49
51
  required=True)
50
- required.add_argument("-clust_tool", action="store", dest="clust_tool", choices=['CD-HIT'],
51
- help="Clustering tool to use: CD-HIT, DIAMOND, BLAST or MMseqs2.",
52
+ required.add_argument("-clustering_format", action="store", dest="clustering_format", choices=['CD-HIT','TSV','CSV'],
53
+ help="Clustering format to use: CD-HIT or TSV (MMseqs2, BLAST, DIAMOND) / CSV edge-list file (Node1\tNode2).",
52
54
  required=True)
53
55
  required.add_argument("-output_dir", action="store", dest="output_dir",
54
56
  help="Directory for all output files.",
@@ -65,6 +67,12 @@ def main():
65
67
  full_mode_args.add_argument("-name_split", action="store", dest="name_split",
66
68
  help="substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').",
67
69
  required=False)
70
+ full_mode_args.add_argument('-sequence_type', action='store', dest='sequence_type', default='DNA',choices=['AA', 'DNA'],
71
+ help='Default - DNA: Should clustering be performed in "DNA" or "AA" mode?',
72
+ required=False)
73
+ full_mode_args.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
74
+ help='Identifier used for extraction of sequences such as "misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo"',
75
+ required=False)
68
76
  full_mode_args.add_argument("-pid", action="store", dest="pident", type=float, default=0.95,
69
77
  help="Default 0.95: Pident threshold for clustering.",
70
78
  required=False)
@@ -88,35 +96,41 @@ def main():
88
96
 
89
97
  ###Grouping Arguments
90
98
  grouping_args = parser.add_argument_group('Grouping Arguments - Use to fine-tune grouping of genes after clustering')
91
- grouping_args.add_argument('-reclustered', action='store', dest='reclustered', help='Clustering output file from secondary round of clustering',
99
+ grouping_args.add_argument('-reclustered', action='store', dest='reclustered',
100
+ help='Currently only works on Partial Mode: Clustering output file from secondary round of clustering.',
92
101
  required=False)
93
102
  grouping_args.add_argument('-seq_tag', action='store', dest='sequence_tag', default='StORF',
94
103
  help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
95
104
  required=False)
96
- grouping_args.add_argument('-groups', action="store", dest='core_groups', default="99,95,15",
97
- help='Default - (\'99,95,15\'): Gene family groups to use',
105
+ grouping_args.add_argument('-core_groups', action="store", dest='core_groups', default="99,95,15",
106
+ help='Default - (\'99,95,15\'): Gene family groups to use for "Species" mode',
107
+ required=False)
108
+
109
+ grouping_args.add_argument('-genus_groups', action="store", dest='genus_groups', default="1,2,3,4,5,6",
110
+ help='Default - (\'1,2,3,4,5,6\'): Gene family groups to use for "Genus" mode',
98
111
  required=False)
99
112
 
100
113
  ###Output Arguments
101
114
  output_args = parser.add_argument_group('Output Parameters')
102
- output_args.add_argument('-w', action="store", dest='write_families', default=None,
103
- help='Default - No output: Output sequences of identified families (provide levels at which to output "-w 99,95"'
104
- ' - Must provide FASTA file with -fasta',
115
+ output_args.add_argument('-w', action="store", dest='write_groups', default=None,
116
+ help='Default - No output: Output sequences of identified groups (provide levels at which to output - Species "-w 99,95" Genus "-w 2,3"'
117
+ ' - Must provide FASTA file with -original_fasta if in Partial run mode.',
105
118
  required=False)
106
- output_args.add_argument('-con', action="store", dest='con_core', default=None,
107
- help='Default - No output: Output aligned and concatinated sequences of identified families - used for MSA (provide levels at which to output "-w 99,95"'
108
- ' - Must provide FASTA file with -fasta',
119
+ output_args.add_argument('-a', action="store_true", dest='align_core', default=None,
120
+ help='Default - No output: SLOW! (Only works for Species mode) Output aligned and concatinated sequences of identified groups -'
121
+ 'provide group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in Partial'
122
+ 'run mode.',
109
123
  required=False)
110
124
  output_args.add_argument('-original_fasta', action='store', dest='original_fasta',
111
125
  help='FASTA file to use in conjunction with "-w" or "-con" when running in Partial Mode.',
112
126
  required=False)
113
- output_args.add_argument('-gpa', action='store', dest='gene_presence_absence_out', help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
114
- required=False)
127
+ output_args.add_argument('-gpa', action='store_true', dest='gene_presence_absence_out', default=None,
128
+ help='Default - False: If selected, a Roary/Panaroo formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
129
+ required=False)
115
130
 
116
131
  ### Misc Arguments
117
132
  misc = parser.add_argument_group('Misc')
118
- misc.add_argument('-verbose', action='store', dest='verbose', default=False, type=eval, choices=[True, False],
119
- help='Default - False: Print out runtime messages',
133
+ misc.add_argument('-verbose', action='store_true', dest='verbose', default=None, help='Default - False: Print out runtime messages',
120
134
  required = False)
121
135
  misc.add_argument('-v', action='store_true', dest='version',
122
136
  help='Default - False: Print out version number and exit',
@@ -124,18 +138,24 @@ def main():
124
138
 
125
139
  options = parser.parse_args()
126
140
 
127
- ### Checking all required parameters are provided by user
141
+ ### Checking all required parameters are provided by user #!!# Doesn't seem to work
128
142
  if options.run_mode == 'Full':
129
- required_full_mode = [options.input_type, options.input_dir, options.name_split, options.clust_tool,
143
+
144
+ if options.reclustered != None:
145
+ sys.exit("Currently reclustering only works on Partial Mode.")
146
+ required_full_mode = [options.input_type, options.input_dir, options.name_split, options.clustering_format,
130
147
  options.pident, options.len_diff]
131
148
  if all(required_full_mode):
132
149
  # Proceed with the Full mode
133
150
  pass
134
151
  else:
135
152
  missing_options = [opt for opt in
136
- ['input_type', 'input_dir', 'name_split', 'clust_tool', 'pident', 'len_diff'] if
153
+ ['input_type', 'input_dir', 'name_split', 'clustering_format', 'pident', 'len_diff'] if
137
154
  not options.__dict__[opt]]
138
155
  print(f"Missing required options for Full mode: {', '.join(missing_options)}")
156
+ if options.align_core != None:
157
+ if options.write_groups == None:
158
+ sys.exit('Must provide "-w" to output gene groups before alignment "-a" can be done.')
139
159
  elif options.run_mode == 'Partial':
140
160
  required_partial_mode = [options.cluster_file, ]
141
161
  if all(required_partial_mode):
@@ -146,36 +166,40 @@ def main():
146
166
  ['cluster_file',] if
147
167
  not options.__dict__[opt]]
148
168
  print(f"Missing required options for Partial mode: {', '.join(missing_options)}")
169
+ if options.align_core != None:
170
+ if options.write_groups == None or options.original_fasta == None:
171
+ sys.exit('Must provide "-w" and "-original_fasta" to output gene groups before alignment "-a" can be done.')
149
172
 
150
- if options.clust_tool == 'CD-HIT':
173
+ if options.clustering_format == 'CD-HIT':
151
174
  clust_affix = '.clstr'
152
- elif options.clust_tool == 'TSV':
175
+ elif options.clustering_format == 'TSV':
153
176
  clust_affix = '.tsv'
154
- elif options.clust_tool == 'CSV':
177
+ elif options.clustering_format == 'CSV':
155
178
  clust_affix = '.csv'
156
179
 
157
180
 
158
181
 
182
+
159
183
  ###External tool checks:
160
184
  ##MAFFT
161
- if options.con_core == True:
185
+ if options.align_core == True:
162
186
  if is_tool_installed('mafft'):
163
- if options.verbose == True:
187
+ if options.verbose != None:
164
188
  print("mafft is installed. Proceeding with alignment.")
165
189
  else:
166
190
  exit("mafft is not installed. Please install mafft to proceed.")
167
191
  ##CD-HIT
168
- if options.clust_tool == 'CD-HIT':
192
+ if options.clustering_format == 'CD-HIT' and options.run_mode == 'Full':
169
193
  if is_tool_installed('cd-hit'):
170
- if options.verbose == True:
194
+ if options.verbose != None:
171
195
  print("cd-hit is installed. Proceeding with clustering.")
172
196
  else:
173
197
  exit("cd-hit is not installed. Please install cd-hit to proceed.")
174
198
 
175
- if options.write_families != None and options.original_fasta == False:
199
+ if options.write_groups != None and options.original_fasta == False:
176
200
  exit("-fasta must br provided if -w is used")
177
201
 
178
- options.core_groups = options.core_groups + ',0'
202
+
179
203
 
180
204
 
181
205
  if options.cluster_file:
@@ -189,29 +213,48 @@ def main():
189
213
 
190
214
  output_path = os.path.abspath(options.output_dir)
191
215
  combined_out_file = os.path.join(output_path, "combined_sequences.fasta")
192
- clustering_output = os.path.join(output_path, 'clustering_' + options.clust_tool)
193
-
194
-
195
- if options.run_mode == 'Full':
216
+ clustering_output = os.path.join(output_path, 'clustering_' + options.clustering_format)
196
217
 
218
+ if options.group_type == 'Species':
219
+ options.core_groups = options.core_groups + ',0'
220
+ groups_to_use = options.core_groups
221
+ elif options.group_type == 'Genus':
222
+ options.genus_groups = options.genus_groups + ',>'
223
+ groups_to_use = options.genus_groups
224
+ if options.align_core != None:
225
+ sys.exit("-a align_core not a valid option in Genus mode.")
197
226
 
198
227
 
228
+ if options.run_mode == 'Full':
229
+ if not os.path.exists(output_path):
230
+ os.makedirs(output_path)
231
+ if options.sequence_type == 'AA':
232
+ clustering_mode = 'cd-hit'
233
+ translate = True
234
+ elif options.sequence_type == 'DNA':
235
+ clustering_mode = 'cd-hit-est'
236
+ translate = False
199
237
  if options.input_type == 'separate':
200
- read_separate_files(options.input_dir, options.name_split, combined_out_file)
238
+ read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
201
239
  else:
202
- read_combined_files(options.input_dir, options.name_split, combined_out_file)
240
+ read_combined_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
241
+
242
+ if options.clustering_format == 'CD-HIT':
243
+ run_cd_hit(options, combined_out_file, clustering_output, clustering_mode)
203
244
 
204
- run_cd_hit(combined_out_file, clustering_output, options)
205
245
  class clustering_options:
206
246
  def __init__(self):
207
- self.cluster_format = options.clust_tool
247
+ self.run_mode = options.run_mode
248
+ self.cluster_format = options.clustering_format
249
+ self.sequence_type = options.sequence_type
208
250
  self.reclustered = options.reclustered
209
251
  self.sequence_tag = options.sequence_tag
210
- self.core_groups = '99,95,15,0'
252
+ self.core_groups = groups_to_use
211
253
  self.clusters = clustering_output + clust_affix
254
+ self.output_dir = options.output_dir
212
255
  self.gene_presence_absence_out = options.gene_presence_absence_out
213
- self.write_families = options.write_families
214
- self.con_core = options.con_core
256
+ self.write_groups = options.write_groups
257
+ self.align_core = options.align_core
215
258
  self.fasta = combined_out_file
216
259
  self.verbose = options.verbose
217
260
 
@@ -220,26 +263,30 @@ def main():
220
263
  elif options.run_mode == 'Partial':
221
264
  class clustering_options:
222
265
  def __init__(self):
223
- self.cluster_format = options.clust_tool
266
+ self.run_mode = options.run_mode
267
+ self.cluster_format = options.clustering_format
224
268
  self.reclustered = options.reclustered
225
269
  self.sequence_tag = options.sequence_tag
226
- self.core_groups = '99,95,15,0'
270
+ self.core_groups = groups_to_use
227
271
  self.clusters = options.cluster_file
272
+ self.output_dir = options.output_dir
228
273
  self.gene_presence_absence_out = options.gene_presence_absence_out
229
- self.write_families = options.write_families
230
- self.con_core = options.con_core
274
+ self.write_groups = options.write_groups
275
+ self.align_core = options.align_core
231
276
  self.fasta = options.original_fasta
232
277
  self.verbose = options.verbose
233
278
 
234
279
  clustering_options = clustering_options()
235
280
 
236
281
 
237
-
238
-
239
- cluster(clustering_options)
282
+ if options.group_type == 'Species':
283
+ species_cluster(clustering_options)
284
+ elif options.group_type == 'Genus':
285
+ genus_cluster((clustering_options))
240
286
 
241
287
  print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
242
288
  "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
243
289
 
244
290
  if __name__ == "__main__":
291
+ print("Running PyamilySeq "+PyamilySeq_Version)
245
292
  main()