PyamilySeq 0.9.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq/PyamilySeq.py CHANGED
@@ -1,20 +1,15 @@
1
1
  import argparse
2
- import collections
3
- import os
4
- import glob
5
- import subprocess
6
-
7
2
 
8
3
 
9
4
  try:
10
5
  from .PyamilySeq_Species import cluster as species_cluster
11
6
  from .PyamilySeq_Genus import cluster as genus_cluster
12
- from .Constants import *
7
+ from .constants import *
13
8
  from .utils import *
14
9
  except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
15
10
  from PyamilySeq_Species import cluster as species_cluster
16
11
  from PyamilySeq_Genus import cluster as genus_cluster
17
- from Constants import *
12
+ from constants import *
18
13
  from utils import *
19
14
 
20
15
 
@@ -28,150 +23,139 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
28
23
  '-c', str(options.pident),
29
24
  '-s', str(options.len_diff),
30
25
  '-T', str(options.threads),
31
- '-M', str(options.clustering_memory),
26
+ '-M', str(options.mem),
32
27
  '-d', "0",
33
- '-g', "1",
28
+ '-g', str(options.fast_mode),
34
29
  '-sc', "1",
35
30
  '-sf', "1"
36
31
  ]
37
- if options.verbose != None:
32
+ if options.verbose == True:
38
33
  subprocess.run(cdhit_command)
39
34
  else:
40
35
  subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
41
36
 
42
37
 
43
38
  def main():
44
- parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': A tool that groups genes into unique clusters.')
45
- ### Required Arguments
46
- required = parser.add_argument_group('Required Parameters')
47
- required.add_argument('-run_mode', action='store', dest='run_mode', choices=['Full','Partial'],
48
- help='Run Mode: Should PyamilySeq be run in "Full" or "Partial" mode?',
49
- required=True)
50
- required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species', 'Genus'],
51
- help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode? ',
52
- required=True)
53
- required.add_argument("-clustering_format", action="store", dest="clustering_format", choices=['CD-HIT','TSV','CSV'],
54
- help="Clustering format to use: CD-HIT or TSV (MMseqs2, BLAST, DIAMOND) / CSV edge-list file (Node1\tNode2).",
55
- required=True)
56
- required.add_argument("-output_dir", action="store", dest="output_dir",
57
- help="Directory for all output files.",
58
- required=True)
59
- ### Full-Mode Arguments
60
- full_mode_args = parser.add_argument_group('Full-Mode Parameters - Required when "-run_mode Full" is used')
61
- full_mode_args.add_argument("-input_type", action="store", dest="input_type", choices=['separate', 'combined'],
62
- help="Type of input files: 'separate' for separate FASTA and GFF files,"
63
- " 'combined' for GFF files with embedded FASTA sequences.",
64
- required=False)
65
- full_mode_args.add_argument("-input_dir", action="store", dest="input_dir",
66
- help="Directory containing GFF/FASTA files.",
67
- required=False)
68
- full_mode_args.add_argument("-name_split", action="store", dest="name_split",
69
- help="substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').",
70
- required=False)
71
- full_mode_args.add_argument('-sequence_type', action='store', dest='sequence_type', default='DNA',choices=['AA', 'DNA'],
72
- help='Default - DNA: Should clustering be performed in "DNA" or "AA" mode?',
73
- required=False)
74
- full_mode_args.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
75
- help='Identifier used for extraction of sequences such as "misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo"',
76
- required=False)
77
- full_mode_args.add_argument("-pid", action="store", dest="pident", type=float, default=0.95,
78
- help="Default 0.95: Pident threshold for clustering.",
79
- required=False)
80
- full_mode_args.add_argument("-len_diff", action="store", dest="len_diff", type=float, default=0.80,
81
- help="Default 0.80: Minimum length difference between clustered sequences - (-s) threshold for CD-HIT clustering.",
82
- required=False)
83
- ###Clustering Arguments
84
- clustering_args = parser.add_argument_group('Clustering Runtime Arguments - Optional when "-run_mode Full" is used')
85
- clustering_args.add_argument("-mem", action="store", dest="clustering_memory", type=int, default=4000,
86
- help="Default 4000: Memory to be allocated for clustering (in MBs).",
87
- required=False)
88
- clustering_args.add_argument("-t", action="store", dest="threads", type=int, default=8,
89
- help="Default 8: Threads to be allocated for clustering and/or alignment.",
90
- required=False)
91
-
92
- ###Partial-Mode Arguments
93
- partial_mode_args = parser.add_argument_group("Partial-Mode Parameters - Required when '-run_mode Partial' is used")
94
- partial_mode_args.add_argument("-cluster_file", action="store", dest="cluster_file",
95
- help="Clustering output file containing CD-HIT, TSV or CSV Edge List",
96
- required=False)
97
-
98
- ###Grouping Arguments
99
- grouping_args = parser.add_argument_group('Grouping Parameters - Use to fine-tune grouping of genes after clustering')
100
- grouping_args.add_argument('-reclustered', action='store', dest='reclustered',
101
- help='Currently only works on Partial Mode: Clustering output file from secondary round of clustering.',
102
- required=False)
103
- grouping_args.add_argument('-seq_tag', action='store', dest='sequence_tag', default='StORF',
104
- help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
105
- required=False)
106
- grouping_args.add_argument('-core_groups', action="store", dest='core_groups', default="99,95,15",
107
- help='Default - (\'99,95,15\'): Gene family groups to use for "Species" mode',
108
- required=False)
109
-
110
- grouping_args.add_argument('-genus_groups', action="store", dest='genus_groups', default="1,2,3,4,5,6",
111
- help='Default - (\'1,2,3,4,5,6\'): Gene family groups to use for "Genus" mode',
112
- required=False)
113
-
114
- ###Output Arguments
115
- output_args = parser.add_argument_group('Output Parameters')
116
- output_args.add_argument('-w', action="store", dest='write_groups', default=None,
117
- help='Default - No output: Output sequences of identified groups (provide levels at which to output - Species "-w 99,95" Genus "-w 2,3"'
118
- ' - Must provide FASTA file with -original_fasta if in Partial run mode.',
119
- required=False)
120
- output_args.add_argument('-a', action="store_true", dest='align_core', default=None,
121
- help='Default - No output: SLOW! (Only works for Species mode) Output aligned and concatinated sequences of identified groups -'
122
- 'provide group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in Partial'
123
- 'run mode.',
124
- required=False)
125
- output_args.add_argument('-original_fasta', action='store', dest='original_fasta',
126
- help='FASTA file to use in conjunction with "-w" or "-con" when running in Partial Mode.',
127
- required=False)
128
- output_args.add_argument('-no_gpa', action='store_false', dest='gene_presence_absence_out',
129
- help='Do not create a Roary/Panaroo formatted gene_presence_absence.csv (created by default) - Required for Coinfinder and other downstream tools',
130
- required=False)
131
-
132
- ### Misc Arguments
133
- misc = parser.add_argument_group("Misc Parameters")
134
- misc.add_argument("-verbose", action="store_true", dest="verbose",
135
- help="Print verbose output.",
136
- required=False)
137
- misc.add_argument("-v", "--version", action="version",
138
- version=f"PyamilySeq version {PyamilySeq_Version} - Exiting",
139
- help="Print out version number and exit")
140
-
141
-
39
+ parser = argparse.ArgumentParser(description=f"PyamilySeq {PyamilySeq_Version}: A tool for gene clustering and analysis.")
40
+
41
+ # Add subparsers for Full and Partial modes
42
+ subparsers = parser.add_subparsers(dest="run_mode", required=True, help="Choose a mode: 'Full' or 'Partial'.")
43
+
44
+ # Full Mode Subparser
45
+ full_parser = subparsers.add_parser("Full",
46
+ help="Full mode: PyamilySeq to cluster with CD-HIT and process output.")
47
+ #full_parser.add_argument("-clustering_format", choices=['CD-HIT', 'MMseqs', 'BLAST'], required=True,
48
+ # help="Clustering format to use: CD-HIT, MMseqs2, or BLAST.")
49
+ full_parser.add_argument("-output_dir", required=True,
50
+ help="Directory for all output files.")
51
+ full_parser.add_argument("-input_type", choices=['separate', 'combined', 'fasta'], required=True,
52
+ help="Type of input files: 'separate' for matching FASTA and GFF files, 'combined' for GFF+FASTA, or 'fasta' for a prepared FASTA file.")
53
+ full_parser.add_argument("-input_dir", required=False,
54
+ help="Directory containing GFF/FASTA files - Use with -input_type separate/combined.")
55
+ full_parser.add_argument("-input_fasta", required=False,
56
+ help="Input FASTA file - Use with - input_type fasta.")
57
+ full_parser.add_argument("-name_split", required=False,
58
+ help="Substring to split filenames and extract genome names (e.g., '_combined.gff3') - Use with -input_type separate/combined.")
59
+ full_parser.add_argument("-sequence_type", choices=['AA', 'DNA'], default="AA", required=False,
60
+ help="Clustering mode: 'DNA' or 'AA'.")
61
+ full_parser.add_argument("-gene_ident", default="CDS", required=False,
62
+ help="Gene identifiers to extract sequences (e.g., 'CDS, tRNA').")
63
+ full_parser.add_argument("-c", type=float, dest="pident", default=0.90, required=False,
64
+ help="Sequence identity threshold for clustering (default: 0.90) - CD-HIT parameter '-c'.")
65
+ full_parser.add_argument("-s", type=float, dest="len_diff", default=0.80, required=False,
66
+ help="Length difference threshold for clustering (default: 0.80) - CD-HIT parameter '-s'.")
67
+ full_parser.add_argument("-fast_mode", action="store_true", required=False,
68
+ help="Enable fast mode for CD-HIT (not recommended) - CD-HIT parameter '-g'.")
69
+
70
+
71
+ # Partial Mode Subparser
72
+ partial_parser = subparsers.add_parser("Partial", help="Partial mode: PyamilySeq to process pre-clustered data.")
73
+ partial_parser.add_argument("-clustering_format", choices=['CD-HIT', 'MMseqs', 'BLAST'], required=True,
74
+ help="Clustering format used: CD-HIT, MMseqs2, or BLAST.")
75
+ partial_parser.add_argument("-cluster_file", required=True,
76
+ help="Cluster file containing pre-clustered groups from CD-HIT, MMseqs, BLAST etc.")
77
+ partial_parser.add_argument("-original_fasta", required=True,
78
+ help="FASTA file used in pre-clustering (Provide sequences in DNA form).")
79
+ partial_parser.add_argument("-output_dir", required=True,
80
+ help="Directory for all output files.")
81
+ partial_parser.add_argument("-reclustered", required=False,
82
+ help="Clustering output file from a second round of clustering.")
83
+ partial_parser.add_argument("-seq_tag", default="StORF", dest="sequence_tag", required=False,
84
+ help="Tag for distinguishing reclustered sequences.")
85
+
86
+ # Common Grouping Arguments
87
+ for subparser in [full_parser, partial_parser]:
88
+ subparser.add_argument("-group_mode", choices=['Species', 'Genus'], default="Species", required=False,
89
+ help="Grouping mode: 'Species' or 'Genus'.")
90
+ subparser.add_argument("-species_groups", default="99,95,15", required=False,
91
+ help="Gene groupings for 'Species' mode (default: '99,95,15').")
92
+ subparser.add_argument("-genus_groups", default="1,2,3,4,5,6,7,8,9,10", required=False,
93
+ help="Gene groupings for 'Genus' mode (default: '1-10').")
94
+ subparser.add_argument("-w", default=None, dest="write_groups", required=False,
95
+ help="Output gene groups as a single FASTA file (specify levels: e.g., '-w 99,95').")
96
+ subparser.add_argument("-wi", action="store_true", dest="write_individual_groups", required=False,
97
+ help="Output individual FASTA files for each group.")
98
+ subparser.add_argument("-a", action="store_true", dest="align_core", required=False,
99
+ help="Align and concatenate sequences for 'core' groups.")
100
+ subparser.add_argument("-align_aa", action="store_true", required=False,
101
+ help="Align sequences as amino acids.")
102
+ subparser.add_argument("-no_gpa", action="store_false", dest="gene_presence_absence_out", required=False,
103
+ help="Skip creation of gene_presence_absence.csv.")
104
+ subparser.add_argument("-M", type=int, default=4000, dest="mem", required=False,
105
+ help="Memory allocation for clustering (MB) - CD-HIT parameter '-M'.")
106
+ subparser.add_argument("-T", type=int, default=8, dest="threads", required=False,
107
+ help="Number of threads for clustering/alignment - CD-HIT parameter '-T' | MAFFT parameter '--thread'.")
108
+
109
+ # Miscellaneous Arguments
110
+ subparser.add_argument("-verbose", action="store_true", required=False,
111
+ help="Print verbose output.")
112
+ subparser.add_argument("-v", "--version", action="version",
113
+ version=f"PyamilySeq {PyamilySeq_Version}: Exiting.", help="Print version number and exit.")
114
+
115
+ # Parse Arguments
142
116
  options = parser.parse_args()
143
- print("Running PyamilySeq: " + PyamilySeq_Version)
117
+
118
+ # Example of conditional logic based on selected mode
119
+ print(f"Running PyamilySeq {PyamilySeq_Version} in {options.run_mode} mode:")
120
+ if options.run_mode == "Full" and options.verbose == True:
121
+ print("Processing Full mode with options:", vars(options))
122
+ elif options.run_mode == "Partial" and options.verbose == True:
123
+ print("Processing Partial mode with options:", vars(options))
144
124
 
145
125
  ### Checking all required parameters are provided by user #!!# Doesn't seem to work
146
126
  if options.run_mode == 'Full':
147
- if options.reclustered != None:
127
+ options.clustering_format = 'CD-HIT'
128
+ if getattr(options, 'reclustered', None) is not None:
148
129
  sys.exit("Currently reclustering only works on Partial Mode.")
149
- required_full_mode = [options.input_type, options.input_dir, options.name_split, options.clustering_format,
150
- options.pident, options.len_diff]
130
+ required_full_mode = [options.input_type, options.pident, options.len_diff]
131
+ if options.input_type != 'fasta':
132
+ required_full_mode.extend([options.input_dir, options.name_split])
151
133
  if all(required_full_mode):
152
134
  # Proceed with the Full mode
153
135
  pass
154
136
  else:
155
137
  missing_options = [opt for opt in
156
138
  ['input_type', 'input_dir', 'name_split', 'clustering_format', 'pident', 'len_diff'] if
157
- not options.__dict__[opt]]
158
- print(f"Missing required options for Full mode: {', '.join(missing_options)}")
159
- if options.align_core != None:
139
+ not options.__dict__.get(opt)]
140
+ sys.exit(f"Missing required options for Full mode: {', '.join(missing_options)}")
141
+ if options.align_core:
142
+ options.write_individual_groups = True
160
143
  if options.write_groups == None:
161
144
  sys.exit('Must provide "-w" to output gene groups before alignment "-a" can be done.')
162
145
  elif options.run_mode == 'Partial':
163
- required_partial_mode = [options.cluster_file, ]
146
+ required_partial_mode = [options.cluster_file, options.original_fasta]
164
147
  if all(required_partial_mode):
165
148
  # Proceed with the Partial mode
166
149
  pass
167
150
  else:
168
151
  missing_options = [opt for opt in
169
- ['cluster_file',] if
152
+ ['cluster_file','original_fasta'] if
170
153
  not options.__dict__[opt]]
171
- print(f"Missing required options for Partial mode: {', '.join(missing_options)}")
172
- if options.align_core != None:
154
+ sys.exit(f"Missing required options for Partial mode: {', '.join(missing_options)}")
155
+ if options.align_core:
156
+ options.write_individual_groups = True
173
157
  if options.write_groups == None or options.original_fasta == None:
174
- sys.exit('Must provide "-w" and "-original_fasta" to output gene groups before alignment "-a" can be done.')
158
+ sys.exit('Must provide "-w" to output gene groups before alignment "-a" can be done.')
175
159
 
176
160
  if options.clustering_format == 'CD-HIT':
177
161
  clust_affix = '.clstr'
@@ -180,48 +164,55 @@ def main():
180
164
  elif options.clustering_format == 'CSV':
181
165
  clust_affix = '.csv'
182
166
 
183
-
184
-
185
-
186
167
  ###External tool checks:
187
168
  ##MAFFT
188
169
  if options.align_core == True:
189
170
  if is_tool_installed('mafft'):
190
- if options.verbose != None:
171
+ if options.verbose == True:
191
172
  print("mafft is installed. Proceeding with alignment.")
192
173
  else:
193
174
  exit("mafft is not installed. Please install mafft to proceed.")
194
175
  ##CD-HIT
195
- if options.clustering_format == 'CD-HIT' and options.run_mode == 'Full':
176
+ if options.run_mode == 'Full':
196
177
  if is_tool_installed('cd-hit'):
197
- if options.verbose != None:
178
+ if options.verbose == True:
198
179
  print("cd-hit is installed. Proceeding with clustering.")
180
+ if options.sequence_type == 'DNA':
181
+ clustering_mode = 'cd-hit-est'
182
+ elif options.sequence_type == 'AA':
183
+ clustering_mode = 'cd-hit'
184
+ if options.fast_mode == True:
185
+ options.fast_mode = 0
186
+ if options.verbose == True:
187
+ print("Running CD-HIT in fast mode.")
188
+ else:
189
+ options.fast_mode = 1
190
+ if options.verbose == True:
191
+ print("Running CD-HIT in slow mode.")
199
192
  else:
200
193
  exit("cd-hit is not installed. Please install cd-hit to proceed.")
201
194
 
202
- if options.write_groups != None and options.original_fasta == False:
203
- exit("-fasta must br provided if -w is used")
204
-
205
195
 
196
+ # if options.write_groups != None and options.original_fasta == False:
197
+ # exit("-fasta must br provided if -w is used")
206
198
 
207
-
208
- if options.cluster_file:
199
+ if hasattr(options, 'cluster_file') and options.cluster_file:
209
200
  options.cluster_file = fix_path(options.cluster_file)
210
- if options.reclustered:
201
+ if hasattr(options, 'reclustered') and options.reclustered:
211
202
  options.reclustered = fix_path(options.reclustered)
212
- if options.input_dir:
203
+ if hasattr(options, 'input_dir') and options.input_dir:
213
204
  options.input_dir = fix_path(options.input_dir)
214
- if options.output_dir:
205
+ if hasattr(options, 'output_dir') and options.output_dir:
215
206
  options.output_dir = fix_path(options.output_dir)
216
207
 
217
208
  output_path = os.path.abspath(options.output_dir)
218
- combined_out_file = os.path.join(output_path, "combined_sequences.fasta")
209
+ combined_out_file = os.path.join(output_path, "combined_sequences_dna.fasta")
219
210
  clustering_output = os.path.join(output_path, 'clustering_' + options.clustering_format)
220
211
 
221
- if options.group_type == 'Species':
222
- options.core_groups = options.core_groups + ',0'
223
- groups_to_use = options.core_groups
224
- elif options.group_type == 'Genus':
212
+ if options.group_mode == 'Species':
213
+ options.species_groups = options.species_groups + ',0'
214
+ groups_to_use = options.species_groups
215
+ elif options.group_mode == 'Genus':
225
216
  options.genus_groups = options.genus_groups + ',>'
226
217
  groups_to_use = options.genus_groups
227
218
  if options.align_core != None:
@@ -229,36 +220,56 @@ def main():
229
220
 
230
221
 
231
222
  if options.run_mode == 'Full':
223
+ if options.clustering_format != 'CD-HIT':
224
+ sys.exit('Only CD-HIT clsutering works in Full Mode')
225
+
232
226
  if not os.path.exists(output_path):
233
227
  os.makedirs(output_path)
234
228
  if options.sequence_type == 'AA':
235
229
  clustering_mode = 'cd-hit'
230
+ file_to_cluster = combined_out_file.replace('_dna.fasta','_aa.fasta')
236
231
  translate = True
237
232
  elif options.sequence_type == 'DNA':
238
233
  clustering_mode = 'cd-hit-est'
239
234
  translate = False
235
+ file_to_cluster = combined_out_file
240
236
  if options.input_type == 'separate':
241
237
  read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
242
- else:
238
+ run_cd_hit(options, file_to_cluster, clustering_output, clustering_mode)
239
+ elif options.input_type == 'combined':
243
240
  read_combined_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
241
+ run_cd_hit(options, file_to_cluster, clustering_output, clustering_mode)
242
+ elif options.input_type == 'fasta':
243
+ combined_out_file = options.input_fasta
244
+ ### FIX: write code to detect whether the input is DNA or AA, and translate it if the sequence type is AA
245
+ # Detect if the input FASTA file contains DNA or AA sequences
246
+ is_dna = detect_sequence_type(options.input_fasta)
247
+ # If the sequence type is AA and the input is DNA, translate the DNA to AA
248
+ if options.sequence_type == 'AA' and is_dna:
249
+ translated_fasta = os.path.join(output_path, os.path.splitext(os.path.basename(options.input_fasta))[0] + '_aa.fasta')
250
+ translate_dna_to_aa(options.input_fasta, translated_fasta)
251
+ file_to_cluster = translated_fasta
252
+ else:
253
+ file_to_cluster = options.input_fasta
254
+ run_cd_hit(options, file_to_cluster, clustering_output, clustering_mode)
244
255
 
245
- if options.clustering_format == 'CD-HIT':
246
- run_cd_hit(options, combined_out_file, clustering_output, clustering_mode)
247
256
 
248
257
  class clustering_options:
249
258
  def __init__(self):
250
259
  self.run_mode = options.run_mode
251
260
  self.cluster_format = options.clustering_format
252
261
  self.sequence_type = options.sequence_type
253
- self.reclustered = options.reclustered
254
- self.sequence_tag = options.sequence_tag
255
- self.core_groups = groups_to_use
262
+ self.reclustered = None
263
+ self.sequence_tag = None
264
+ self.species_groups = groups_to_use
256
265
  self.clusters = clustering_output + clust_affix
257
266
  self.output_dir = options.output_dir
258
267
  self.gene_presence_absence_out = options.gene_presence_absence_out
259
268
  self.write_groups = options.write_groups
269
+ self.write_individual_groups = options.write_individual_groups
260
270
  self.threads = options.threads
261
271
  self.align_core = options.align_core
272
+ self.align_aa = options.align_aa
262
273
  self.fasta = combined_out_file
263
274
  self.verbose = options.verbose
264
275
 
@@ -269,26 +280,35 @@ def main():
269
280
  def __init__(self):
270
281
  self.run_mode = options.run_mode
271
282
  self.cluster_format = options.clustering_format
283
+ self.sequence_type = None
272
284
  self.reclustered = options.reclustered
273
285
  self.sequence_tag = options.sequence_tag
274
- self.core_groups = groups_to_use
286
+ self.species_groups = groups_to_use
275
287
  self.clusters = options.cluster_file
276
288
  self.output_dir = options.output_dir
277
289
  self.gene_presence_absence_out = options.gene_presence_absence_out
278
290
  self.write_groups = options.write_groups
291
+ self.write_individual_groups = options.write_individual_groups
279
292
  self.threads = options.threads
280
293
  self.align_core = options.align_core
294
+ self.align_aa = options.align_aa
281
295
  self.fasta = options.original_fasta
282
296
  self.verbose = options.verbose
283
297
 
284
298
  clustering_options = clustering_options()
285
299
 
286
300
 
287
- if options.group_type == 'Species':
301
+ if options.group_mode == 'Species':
288
302
  species_cluster(clustering_options)
289
- elif options.group_type == 'Genus':
303
+ elif options.group_mode == 'Genus':
290
304
  genus_cluster((clustering_options))
291
305
 
306
+
307
+ # Save arguments to a text file
308
+ with open(output_path+"/PyamilySeq_params.txt", "w") as outfile:
309
+ for arg, value in vars(options).items():
310
+ outfile.write(f"{arg}: {value}\n")
311
+
292
312
  print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
293
313
  "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
294
314
 
@@ -2,11 +2,11 @@
2
2
 
3
3
 
4
4
  try:
5
- from .Constants import *
5
+ from .constants import *
6
6
  from .clusterings import *
7
7
  from .utils import *
8
8
  except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
9
- from Constants import *
9
+ from constants import *
10
10
  from clusterings import *
11
11
  from utils import *
12
12
 
@@ -14,16 +14,16 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
14
14
  def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
15
15
  print("Outputting gene_presence_absence file")
16
16
  output_dir = os.path.abspath(options.output_dir)
17
- in_name = options.clusters.split('.')[0].split('/')[-1]
18
- gpa_outfile = os.path.join(output_dir, in_name)
19
- gpa_outfile = open(gpa_outfile+'_gene_presence_absence.csv','w')
17
+ #in_name = options.clusters.split('.')[0].split('/')[-1]
18
+ gpa_outfile = os.path.join(output_dir, 'gene_presence_absence.csv')
19
+ gpa_outfile = open(gpa_outfile, 'w')
20
20
  gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","'
21
21
  '"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
22
22
  gpa_outfile.write('","'.join(genus_dict.keys()))
23
23
  gpa_outfile.write('"\n')
24
24
  for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
25
25
  average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
26
- gpa_outfile.write('"group_'+str(cluster)+'","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
26
+ gpa_outfile.write('"group_'+str(cluster)+'","","",'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
27
27
  '","","","","","","","","",""')
28
28
 
29
29
 
@@ -183,7 +183,7 @@ def cluster(options):
183
183
  key_order = list(cores.keys())
184
184
  with open(stats_out,'w') as outfile:
185
185
  print("Genus Groups:")
186
- outfile.write("Genus Groups:\n")
186
+ outfile.write("Genus Groups\n")
187
187
  for key in key_order:
188
188
  print(key+':\t'+str(len(cores[key])))
189
189
  outfile.write(key + ':\t' + str(len(cores[key]))+'\n')
@@ -209,8 +209,8 @@ def cluster(options):
209
209
  print("Outputting gene group FASTA files")
210
210
  sequences = read_fasta(options.fasta)
211
211
  #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
212
- output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
213
- write_groups(options,output_dir, key_order, cores, sequences,
212
+ output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
213
+ write_groups_func(options,output_dir, key_order, cores, sequences,
214
214
  pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
215
215
 
216
216
  elif options.run_mode == 'Partial':
@@ -220,8 +220,8 @@ def cluster(options):
220
220
  print("Outputting gene group FASTA files")
221
221
  sequences = read_fasta(options.fasta)
222
222
  #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
223
- output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
224
- write_groups(options,output_dir, key_order, cores, sequences,
223
+ output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
224
+ write_groups_func(options,output_dir, key_order, cores, sequences,
225
225
  pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
226
226
 
227
227