PyamilySeq 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq/Constants.py CHANGED
@@ -1,2 +1,2 @@
1
- PyamilySeq_Version = 'v0.6.0'
1
+ PyamilySeq_Version = 'v0.7.1'
2
2
 
PyamilySeq/PyamilySeq.py CHANGED
@@ -20,9 +20,9 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
20
20
 
21
21
 
22
22
 
23
- def run_cd_hit(input_file, clustering_output, options):
23
+ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
24
24
  cdhit_command = [
25
- 'cd-hit-est',
25
+ clustering_mode,
26
26
  '-i', input_file,
27
27
  '-o', clustering_output,
28
28
  '-c', str(options.pident),
@@ -33,14 +33,15 @@ def run_cd_hit(input_file, clustering_output, options):
33
33
  '-sc', "1",
34
34
  '-sf', "1"
35
35
  ]
36
- if options.verbose == True:
36
+ if options.verbose != None:
37
37
  subprocess.run(cdhit_command)
38
38
  else:
39
39
  subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
40
40
 
41
41
 
42
42
  def main():
43
- parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': PyamilySeq Run Parameters.')
43
+ parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': A tool that groups genes into unique clusters.')
44
+ vparser = argparse.ArgumentParser()
44
45
  ### Required Arguments
45
46
  required = parser.add_argument_group('Required Arguments')
46
47
  required.add_argument('-run_mode', action='store', dest='run_mode', choices=['Full','Partial'],
@@ -49,8 +50,8 @@ def main():
49
50
  required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species', 'Genus'],
50
51
  help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode? ',
51
52
  required=True)
52
- required.add_argument("-clust_tool", action="store", dest="clust_tool", choices=['CD-HIT'],
53
- help="Clustering tool to use: CD-HIT, DIAMOND, BLAST or MMseqs2.",
53
+ required.add_argument("-clustering_format", action="store", dest="clustering_format", choices=['CD-HIT','TSV','CSV'],
54
+ help="Clustering format to use: CD-HIT or TSV (MMseqs2, BLAST, DIAMOND) / CSV edge-list file (Node1\tNode2).",
54
55
  required=True)
55
56
  required.add_argument("-output_dir", action="store", dest="output_dir",
56
57
  help="Directory for all output files.",
@@ -67,6 +68,12 @@ def main():
67
68
  full_mode_args.add_argument("-name_split", action="store", dest="name_split",
68
69
  help="substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').",
69
70
  required=False)
71
+ full_mode_args.add_argument('-sequence_type', action='store', dest='sequence_type', default='DNA',choices=['AA', 'DNA'],
72
+ help='Default - DNA: Should clustering be performed in "DNA" or "AA" mode?',
73
+ required=False)
74
+ full_mode_args.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
75
+ help='Identifier used for extraction of sequences such as "misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo"',
76
+ required=False)
70
77
  full_mode_args.add_argument("-pid", action="store", dest="pident", type=float, default=0.95,
71
78
  help="Default 0.95: Pident threshold for clustering.",
72
79
  required=False)
@@ -99,51 +106,67 @@ def main():
99
106
  grouping_args.add_argument('-core_groups', action="store", dest='core_groups', default="99,95,15",
100
107
  help='Default - (\'99,95,15\'): Gene family groups to use for "Species" mode',
101
108
  required=False)
109
+
102
110
  grouping_args.add_argument('-genus_groups', action="store", dest='genus_groups', default="1,2,3,4,5,6",
103
111
  help='Default - (\'1,2,3,4,5,6\'): Gene family groups to use for "Genus" mode',
104
112
  required=False)
105
113
 
106
114
  ###Output Arguments
107
115
  output_args = parser.add_argument_group('Output Parameters')
108
- output_args.add_argument('-w', action="store", dest='write_families', default=None,
109
- help='Default - No output: Output sequences of identified families (provide levels at which to output "-w 99,95"'
110
- ' - Must provide FASTA file with -fasta',
116
+ output_args.add_argument('-w', action="store", dest='write_groups', default=None,
117
+ help='Default - No output: Output sequences of identified groups (provide levels at which to output - Species "-w 99,95" Genus "-w 2,3"'
118
+ ' - Must provide FASTA file with -original_fasta if in Partial run mode.',
111
119
  required=False)
112
- output_args.add_argument('-con', action="store", dest='con_core', default=None,
113
- help='Default - No output: Output aligned and concatinated sequences of identified families - used for MSA (provide levels at which to output "-w 99,95"'
114
- ' - Must provide FASTA file with -fasta',
120
+ output_args.add_argument('-a', action="store_true", dest='align_core', default=None,
121
+ help='Default - No output: SLOW! (Only works for Species mode) Output aligned and concatinated sequences of identified groups -'
122
+ 'provide group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in Partial'
123
+ 'run mode.',
115
124
  required=False)
116
125
  output_args.add_argument('-original_fasta', action='store', dest='original_fasta',
117
126
  help='FASTA file to use in conjunction with "-w" or "-con" when running in Partial Mode.',
118
127
  required=False)
119
- output_args.add_argument('-gpa', action='store', dest='gene_presence_absence_out', help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
120
- required=False)
128
+ output_args.add_argument('-gpa', action='store_true', dest='gene_presence_absence_out', default=None,
129
+ help='Default - False: If selected, a Roary/Panaroo formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
130
+ required=False)
121
131
 
122
132
  ### Misc Arguments
123
133
  misc = parser.add_argument_group('Misc')
124
- misc.add_argument('-verbose', action='store', dest='verbose', default=False, type=eval, choices=[True, False],
125
- help='Default - False: Print out runtime messages',
134
+ misc.add_argument('-verbose', action='store_true', dest='verbose', default=None, help='Default - False: Print out runtime messages',
126
135
  required = False)
127
- misc.add_argument('-v', action='store_true', dest='version',
136
+
137
+ ### Version Arguments
138
+ version = vparser.add_argument_group('Version')
139
+ version.add_argument('-v', action='store_true', dest='version',
128
140
  help='Default - False: Print out version number and exit',
129
141
  required=False)
130
142
 
143
+
144
+
145
+ args, unknown = vparser.parse_known_args()
146
+
147
+ if args.version == True:
148
+ sys.exit("PyamilySeq version: "+PyamilySeq_Version)
149
+
131
150
  options = parser.parse_args()
132
151
 
133
- ### Checking all required parameters are provided by user
152
+ ### Checking all required parameters are provided by user #!!# Doesn't seem to work
134
153
  if options.run_mode == 'Full':
154
+
135
155
  if options.reclustered != None:
136
156
  sys.exit("Currently reclustering only works on Partial Mode.")
137
- required_full_mode = [options.input_type, options.input_dir, options.name_split, options.clust_tool,
157
+ required_full_mode = [options.input_type, options.input_dir, options.name_split, options.clustering_format,
138
158
  options.pident, options.len_diff]
139
159
  if all(required_full_mode):
140
160
  # Proceed with the Full mode
141
161
  pass
142
162
  else:
143
163
  missing_options = [opt for opt in
144
- ['input_type', 'input_dir', 'name_split', 'clust_tool', 'pident', 'len_diff'] if
164
+ ['input_type', 'input_dir', 'name_split', 'clustering_format', 'pident', 'len_diff'] if
145
165
  not options.__dict__[opt]]
146
166
  print(f"Missing required options for Full mode: {', '.join(missing_options)}")
167
+ if options.align_core != None:
168
+ if options.write_groups == None:
169
+ sys.exit('Must provide "-w" to output gene groups before alignment "-a" can be done.')
147
170
  elif options.run_mode == 'Partial':
148
171
  required_partial_mode = [options.cluster_file, ]
149
172
  if all(required_partial_mode):
@@ -154,33 +177,37 @@ def main():
154
177
  ['cluster_file',] if
155
178
  not options.__dict__[opt]]
156
179
  print(f"Missing required options for Partial mode: {', '.join(missing_options)}")
180
+ if options.align_core != None:
181
+ if options.write_groups == None or options.original_fasta == None:
182
+ sys.exit('Must provide "-w" and "-original_fasta" to output gene groups before alignment "-a" can be done.')
157
183
 
158
- if options.clust_tool == 'CD-HIT':
184
+ if options.clustering_format == 'CD-HIT':
159
185
  clust_affix = '.clstr'
160
- elif options.clust_tool == 'TSV':
186
+ elif options.clustering_format == 'TSV':
161
187
  clust_affix = '.tsv'
162
- elif options.clust_tool == 'CSV':
188
+ elif options.clustering_format == 'CSV':
163
189
  clust_affix = '.csv'
164
190
 
165
191
 
166
192
 
193
+
167
194
  ###External tool checks:
168
195
  ##MAFFT
169
- if options.con_core == True:
196
+ if options.align_core == True:
170
197
  if is_tool_installed('mafft'):
171
- if options.verbose == True:
198
+ if options.verbose != None:
172
199
  print("mafft is installed. Proceeding with alignment.")
173
200
  else:
174
201
  exit("mafft is not installed. Please install mafft to proceed.")
175
202
  ##CD-HIT
176
- if options.clust_tool == 'CD-HIT' and options.run_mode == 'Full':
203
+ if options.clustering_format == 'CD-HIT' and options.run_mode == 'Full':
177
204
  if is_tool_installed('cd-hit'):
178
- if options.verbose == True:
205
+ if options.verbose != None:
179
206
  print("cd-hit is installed. Proceeding with clustering.")
180
207
  else:
181
208
  exit("cd-hit is not installed. Please install cd-hit to proceed.")
182
209
 
183
- if options.write_families != None and options.original_fasta == False:
210
+ if options.write_groups != None and options.original_fasta == False:
184
211
  exit("-fasta must br provided if -w is used")
185
212
 
186
213
 
@@ -197,35 +224,48 @@ def main():
197
224
 
198
225
  output_path = os.path.abspath(options.output_dir)
199
226
  combined_out_file = os.path.join(output_path, "combined_sequences.fasta")
200
- clustering_output = os.path.join(output_path, 'clustering_' + options.clust_tool)
227
+ clustering_output = os.path.join(output_path, 'clustering_' + options.clustering_format)
201
228
 
202
229
  if options.group_type == 'Species':
203
230
  options.core_groups = options.core_groups + ',0'
204
231
  groups_to_use = options.core_groups
205
- else:
232
+ elif options.group_type == 'Genus':
206
233
  options.genus_groups = options.genus_groups + ',>'
207
234
  groups_to_use = options.genus_groups
235
+ if options.align_core != None:
236
+ sys.exit("-a align_core not a valid option in Genus mode.")
208
237
 
209
238
 
210
239
  if options.run_mode == 'Full':
240
+ if not os.path.exists(output_path):
241
+ os.makedirs(output_path)
242
+ if options.sequence_type == 'AA':
243
+ clustering_mode = 'cd-hit'
244
+ translate = True
245
+ elif options.sequence_type == 'DNA':
246
+ clustering_mode = 'cd-hit-est'
247
+ translate = False
211
248
  if options.input_type == 'separate':
212
- read_separate_files(options.input_dir, options.name_split, combined_out_file)
249
+ read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
213
250
  else:
214
- read_combined_files(options.input_dir, options.name_split, combined_out_file)
251
+ read_combined_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
215
252
 
216
- run_cd_hit(combined_out_file, clustering_output, options)
253
+ if options.clustering_format == 'CD-HIT':
254
+ run_cd_hit(options, combined_out_file, clustering_output, clustering_mode)
217
255
 
218
256
  class clustering_options:
219
257
  def __init__(self):
220
- self.cluster_format = options.clust_tool
258
+ self.run_mode = options.run_mode
259
+ self.cluster_format = options.clustering_format
260
+ self.sequence_type = options.sequence_type
221
261
  self.reclustered = options.reclustered
222
262
  self.sequence_tag = options.sequence_tag
223
263
  self.core_groups = groups_to_use
224
264
  self.clusters = clustering_output + clust_affix
225
265
  self.output_dir = options.output_dir
226
266
  self.gene_presence_absence_out = options.gene_presence_absence_out
227
- self.write_families = options.write_families
228
- self.con_core = options.con_core
267
+ self.write_groups = options.write_groups
268
+ self.align_core = options.align_core
229
269
  self.fasta = combined_out_file
230
270
  self.verbose = options.verbose
231
271
 
@@ -234,15 +274,16 @@ def main():
234
274
  elif options.run_mode == 'Partial':
235
275
  class clustering_options:
236
276
  def __init__(self):
237
- self.cluster_format = options.clust_tool
277
+ self.run_mode = options.run_mode
278
+ self.cluster_format = options.clustering_format
238
279
  self.reclustered = options.reclustered
239
280
  self.sequence_tag = options.sequence_tag
240
281
  self.core_groups = groups_to_use
241
282
  self.clusters = options.cluster_file
242
283
  self.output_dir = options.output_dir
243
284
  self.gene_presence_absence_out = options.gene_presence_absence_out
244
- self.write_families = options.write_families
245
- self.con_core = options.con_core
285
+ self.write_groups = options.write_groups
286
+ self.align_core = options.align_core
246
287
  self.fasta = options.original_fasta
247
288
  self.verbose = options.verbose
248
289
 
@@ -258,4 +299,5 @@ def main():
258
299
  "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
259
300
 
260
301
  if __name__ == "__main__":
302
+ #print("Running PyamilySeq "+PyamilySeq_Version)
261
303
  main()
@@ -1,10 +1,5 @@
1
1
  #from line_profiler_pycharm import profile
2
2
 
3
- import copy
4
- import sys
5
- import math
6
- from collections import Counter
7
-
8
3
 
9
4
  try:
10
5
  from .Constants import *
@@ -16,45 +11,6 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
16
11
  from utils import *
17
12
 
18
13
 
19
-
20
- def process_gene_families(options, directory, output_file):
21
- """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
22
- concatenated_sequences = {}
23
- output_file = directory.replace('Gene_Families_Output',output_file)
24
-
25
- # Iterate over each gene family file
26
- for gene_file in os.listdir(directory):
27
- if gene_file.endswith('.fasta'):
28
- gene_path = os.path.join(directory, gene_file)
29
-
30
- # Read sequences from the gene family file
31
- sequences = read_fasta(gene_path)
32
-
33
- # Select the longest sequence for each genome
34
- longest_sequences = select_longest_gene(sequences)
35
-
36
- # Run mafft on the longest sequences
37
- aligned_file = f"{gene_file}_aligned.fasta"
38
- run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
39
-
40
- # Read aligned sequences and concatenate them
41
- aligned_sequences = read_fasta(aligned_file)
42
- for genome, aligned_seq in aligned_sequences.items():
43
- genome_name = genome.split('|')[0]
44
- if genome_name not in concatenated_sequences:
45
- concatenated_sequences[genome_name] = ""
46
- concatenated_sequences[genome_name] += aligned_seq
47
-
48
- # Clean up aligned file
49
- os.remove(aligned_file)
50
-
51
- # Write the concatenated sequences to the output file
52
- with open(output_file, 'w') as out:
53
- for genome, sequence in concatenated_sequences.items():
54
- out.write(f">{genome}\n")
55
- wrapped_sequence = wrap_sequence(sequence, 60)
56
- out.write(f"{wrapped_sequence}\n")
57
-
58
14
  def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
59
15
  print("Outputting gene_presence_absence file")
60
16
  output_dir = os.path.abspath(options.output_dir)
@@ -99,7 +55,7 @@ def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_s
99
55
 
100
56
 
101
57
 
102
- def get_cores(options,genus_dict):
58
+ def get_cores(options):
103
59
  ##Calculate core groups
104
60
  groups = OrderedDict()
105
61
  cores = OrderedDict()
@@ -117,27 +73,26 @@ def get_cores(options,genus_dict):
117
73
  cores[only_second_core_group] = []
118
74
  return cores, groups
119
75
 
120
-
121
76
  #@profile
122
- def calc_First_only_core(cluster, First_number, cores):
77
+ def calc_First_only_core(cluster, First_num, cores):
123
78
  try:
124
- cores['First_genera_'+str(First_number)].append(cluster)
79
+ cores['First_genera_' + str(First_num)].append(cluster)
125
80
  except KeyError:
126
81
  cores['First_genera_>'].append(cluster)
127
82
  #@profile
128
83
  def calc_single_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count gene families extended with StORFs
129
84
  group = First_num + Second_num
130
85
  try:
131
- cores['extended_genera_' + group].append(cluster)
86
+ cores['extended_genera_' + str(group)].append(cluster)
132
87
  except KeyError:
133
88
  cores['extended_genera_>'].append(cluster)
134
89
  #@profile
135
90
  def calc_multi_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
136
91
  group = First_num + Second_num
137
92
  try:
138
- cores['combined_genera_' + group].append(cluster)
93
+ cores['combined_genera_' + str(group)].append(cluster)
139
94
  except KeyError:
140
- cores['combined_genera_>' + group].append(cluster)
95
+ cores['combined_genera_>'].append(cluster)
141
96
  #@profile
142
97
  def calc_Second_only_core(cluster, cores, Second_num):
143
98
  try:
@@ -157,28 +112,26 @@ def calc_only_Second_only_core(cluster, cores, Second_num): # only count the tru
157
112
  def cluster(options):
158
113
 
159
114
  if options.cluster_format == 'CD-HIT':
160
- genus_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '_')
161
- elif options.cluster_format in ['TSV','CSV']:
162
- genus_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '_')
163
-
115
+ genus_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '_')
116
+ elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
117
+ genus_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '_')
164
118
 
119
+ ###
120
+ cores, groups = get_cores(options)
121
+ ###
165
122
 
166
123
  if options.reclustered != None:
167
-
168
124
  if options.cluster_format == 'CD-HIT':
169
- combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_CDHIT(options, genus_dict, '_')
170
- if options.cluster_format == ['TSV','CSV']:
171
- combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_Edge_List(options, '_')
172
- pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, '_')
125
+ combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genus_dict, '_')
126
+ elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
127
+ combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_Edge_List(options, '_')
128
+ pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, '_')
173
129
  else:
174
-
175
130
  pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)
176
131
 
177
- ###
178
- cores, groups = get_cores(options, genus_dict)
179
- ###
180
132
 
181
- Number_Of_StORF_Extending_But_Same_Genomes = 0
133
+
134
+ Number_Of_Second_Extending_But_Same_Genomes = 0
182
135
 
183
136
  sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
184
137
  pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_first_keys)
@@ -186,19 +139,28 @@ def cluster(options):
186
139
  pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)
187
140
 
188
141
  print("Calculating Groups")
142
+ seen_groupings = []
189
143
  for cluster, numbers in pangenome_clusters_Type_sorted.items():
190
144
  ############################### Calculate First only
191
- calc_First_only_core(cluster, numbers[1], cores)
145
+ cluster = str(cluster)
146
+ for grouping in numbers[2]: #!!# Could do with a more elegant solution
147
+ current_cluster = grouping[0].split(':')[0]
148
+ if current_cluster not in seen_groupings:
149
+ seen_groupings.append(current_cluster)
150
+ current_cluster_size = grouping[0].split(':')[1]
151
+ calc_First_only_core(current_cluster, current_cluster_size, cores)
152
+ ############################# Calculate First and Reclustered-Second
153
+ if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
154
+ calc_single_First_extended_Second_only_core(cluster, numbers[1], cores, numbers[3])
155
+ elif numbers[0] > 1 and numbers[3] >= 1: # If unique Seconds combined multiple Firsts
156
+ calc_multi_First_extended_Second_only_core(cluster, numbers[1], cores, numbers[3])
157
+ elif numbers[4] >= 1:
158
+ Number_Of_Second_Extending_But_Same_Genomes += 1
159
+ else:
160
+ if options.verbose == True:
161
+ print("First cluster " + current_cluster + " already processed - This is likely because it was clustered with another First representative.")
192
162
 
193
163
  if options.reclustered != None:
194
- ############################# Calculate First and Reclustered-Second
195
- if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
196
- calc_single_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
197
- elif numbers[0] > 1 and numbers[3] >= 1: # If unique Secondss combined multiple Firsts
198
- calc_multi_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
199
- elif numbers[4] >= 1:
200
- Number_Of_StORF_Extending_But_Same_Genomes += 1
201
-
202
164
  combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
203
165
  combined_pangenome_clusters_Second_Type = defaultdict(list)
204
166
  for cluster, genomes in combined_pangenome_clusters_Second.items():
@@ -207,52 +169,73 @@ def cluster(options):
207
169
  else:
208
170
  combined_pangenome_clusters_ONLY_Second_Type[cluster] = [cluster, len(genomes)]
209
171
  for cluster, data in combined_pangenome_clusters_Second_Type.items():
210
- if data[1] >=1:
172
+ if data[1] >= 1:
211
173
  calc_Second_only_core(cluster, cores, data[1])
212
174
  for cluster, data in combined_pangenome_clusters_ONLY_Second_Type.items():
213
- if data[1] >= 1 :
175
+ if data[1] >= 1:
214
176
  calc_only_Second_only_core(cluster, cores, data[1])
215
177
  ###########################
216
178
  ### Output
217
- key_order = list(cores.keys())
218
179
  output_path = os.path.abspath(options.output_dir)
180
+ if not os.path.exists(output_path):
181
+ os.makedirs(output_path)
219
182
  stats_out = os.path.join(output_path,'summary_statistics.txt')
183
+ key_order = list(cores.keys())
220
184
  with open(stats_out,'w') as outfile:
221
185
  print("Genus Groups:")
222
186
  outfile.write("Genus Groups:\n")
223
187
  for key in key_order:
224
188
  print(key+':\t'+str(len(cores[key])))
225
189
  outfile.write(key + ':\t' + str(len(cores[key]))+'\n')
226
- print("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
190
+ print("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
227
191
  outfile.write("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
192
+ if options.reclustered!= None:
193
+ print("Total Number of Second Gene Groups (Including Singletons): " + str(
194
+ len(combined_pangenome_clusters_Second_sequences)))
195
+ print("Total Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
196
+ Number_Of_Second_Extending_But_Same_Genomes))
197
+ outfile.write("\nTotal Number of Second Gene Groups (Including Singletons): " + str(
198
+ len(combined_pangenome_clusters_Second_sequences)))
199
+ outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
200
+ Number_Of_Second_Extending_But_Same_Genomes))
228
201
 
229
202
  if options.gene_presence_absence_out != None:
230
203
  gene_presence_absence_output(options,genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
231
204
 
232
- if options.write_families != None and options.fasta != None:
233
- sequences = read_fasta(options.fasta)
234
- output_dir = os.path.join(output_path, 'Gene_Families_Output')
235
-
236
- # Create output directory if it doesn't exist
237
- if not os.path.exists(output_dir):
238
- os.makedirs(output_dir)
239
- for key_prefix in key_order:
240
- for key, values in cores.items():
241
- if any(part in options.write_families.split(',') for part in key.split('_')):
242
- if key.startswith(key_prefix):
243
- for value in values:
244
- output_filename = f"{key}_{value}.fasta"
245
- sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
246
- # Write sequences to output file that are in the sequences dictionary
247
- with open(os.path.join(output_dir, output_filename), 'w') as outfile:
248
- for header in sequences_to_write:
249
- if header in sequences:
250
- outfile.write(f">{header}\n")
251
- wrapped_sequence = wrap_sequence(sequences[header])
252
- outfile.write(f"{wrapped_sequence}\n")
253
-
254
- if options.con_core != None and options.fasta != None and options.write_families != None:
255
- process_gene_families(options, os.path.join(output_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
205
+ if options.run_mode == 'Full':
206
+ if options.reclustered == None:
207
+ combined_pangenome_clusters_Second_sequences = None
208
+ if options.write_groups != None:
209
+ print("Outputting gene group FASTA files")
210
+ sequences = read_fasta(options.fasta)
211
+ #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
212
+ output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
213
+ write_groups(options,output_dir, key_order, cores, sequences,
214
+ pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
215
+
216
+ elif options.run_mode == 'Partial':
217
+ if options.reclustered == None:
218
+ combined_pangenome_clusters_Second_sequences = None
219
+ if options.write_groups != None and options.fasta != None:
220
+ print("Outputting gene group FASTA files")
221
+ sequences = read_fasta(options.fasta)
222
+ #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
223
+ output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
224
+ write_groups(options,output_dir, key_order, cores, sequences,
225
+ pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
226
+
227
+
228
+ # if options.write_groups != None and options.fasta != None:
229
+ # sequences = read_fasta(options.fasta)
230
+ # output_dir = os.path.join(output_path, 'Gene_Families_Output')
231
+ #
232
+ # write_groups(options,output_dir, key_order, cores, sequences,
233
+ # pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
234
+
235
+
236
+ #!!# - Currently only align in Species Mode
237
+ #if options.align_core != None and options.fasta != None and options.write_groups != None:
238
+ # process_gene_families(options, os.path.join(output_path, 'Gene_Families_Output'), 'concatenated_genes_aligned.fasta')
256
239
 
257
240
 
258
241