PyamilySeq 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq/Constants.py CHANGED
@@ -1,2 +1,2 @@
1
- PyamilySeq_Version = 'v0.6.0'
1
+ PyamilySeq_Version = 'v0.7.0'
2
2
 
PyamilySeq/PyamilySeq.py CHANGED
@@ -20,9 +20,9 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
20
20
 
21
21
 
22
22
 
23
- def run_cd_hit(input_file, clustering_output, options):
23
+ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
24
24
  cdhit_command = [
25
- 'cd-hit-est',
25
+ clustering_mode,
26
26
  '-i', input_file,
27
27
  '-o', clustering_output,
28
28
  '-c', str(options.pident),
@@ -33,14 +33,14 @@ def run_cd_hit(input_file, clustering_output, options):
33
33
  '-sc', "1",
34
34
  '-sf', "1"
35
35
  ]
36
- if options.verbose == True:
36
+ if options.verbose != None:
37
37
  subprocess.run(cdhit_command)
38
38
  else:
39
39
  subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
40
40
 
41
41
 
42
42
  def main():
43
- parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': PyamilySeq Run Parameters.')
43
+ parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': A tool that groups genes into unique clusters.')
44
44
  ### Required Arguments
45
45
  required = parser.add_argument_group('Required Arguments')
46
46
  required.add_argument('-run_mode', action='store', dest='run_mode', choices=['Full','Partial'],
@@ -49,8 +49,8 @@ def main():
49
49
  required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species', 'Genus'],
50
50
  help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode? ',
51
51
  required=True)
52
- required.add_argument("-clust_tool", action="store", dest="clust_tool", choices=['CD-HIT'],
53
- help="Clustering tool to use: CD-HIT, DIAMOND, BLAST or MMseqs2.",
52
+ required.add_argument("-clustering_format", action="store", dest="clustering_format", choices=['CD-HIT','TSV','CSV'],
53
+ help="Clustering format to use: CD-HIT or TSV (MMseqs2, BLAST, DIAMOND) / CSV edge-list file (Node1\tNode2).",
54
54
  required=True)
55
55
  required.add_argument("-output_dir", action="store", dest="output_dir",
56
56
  help="Directory for all output files.",
@@ -67,6 +67,12 @@ def main():
67
67
  full_mode_args.add_argument("-name_split", action="store", dest="name_split",
68
68
  help="substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').",
69
69
  required=False)
70
+ full_mode_args.add_argument('-sequence_type', action='store', dest='sequence_type', default='DNA',choices=['AA', 'DNA'],
71
+ help='Default - DNA: Should clustering be performed in "DNA" or "AA" mode?',
72
+ required=False)
73
+ full_mode_args.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
74
+ help='Identifier used for extraction of sequences such as "misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo"',
75
+ required=False)
70
76
  full_mode_args.add_argument("-pid", action="store", dest="pident", type=float, default=0.95,
71
77
  help="Default 0.95: Pident threshold for clustering.",
72
78
  required=False)
@@ -99,30 +105,32 @@ def main():
99
105
  grouping_args.add_argument('-core_groups', action="store", dest='core_groups', default="99,95,15",
100
106
  help='Default - (\'99,95,15\'): Gene family groups to use for "Species" mode',
101
107
  required=False)
108
+
102
109
  grouping_args.add_argument('-genus_groups', action="store", dest='genus_groups', default="1,2,3,4,5,6",
103
110
  help='Default - (\'1,2,3,4,5,6\'): Gene family groups to use for "Genus" mode',
104
111
  required=False)
105
112
 
106
113
  ###Output Arguments
107
114
  output_args = parser.add_argument_group('Output Parameters')
108
- output_args.add_argument('-w', action="store", dest='write_families', default=None,
109
- help='Default - No output: Output sequences of identified families (provide levels at which to output "-w 99,95"'
110
- ' - Must provide FASTA file with -fasta',
115
+ output_args.add_argument('-w', action="store", dest='write_groups', default=None,
116
+ help='Default - No output: Output sequences of identified groups (provide levels at which to output - Species "-w 99,95" Genus "-w 2,3"'
117
+ ' - Must provide FASTA file with -original_fasta if in Partial run mode.',
111
118
  required=False)
112
- output_args.add_argument('-con', action="store", dest='con_core', default=None,
113
- help='Default - No output: Output aligned and concatinated sequences of identified families - used for MSA (provide levels at which to output "-w 99,95"'
114
- ' - Must provide FASTA file with -fasta',
119
+ output_args.add_argument('-a', action="store_true", dest='align_core', default=None,
120
+ help='Default - No output: SLOW! (Only works for Species mode) Output aligned and concatinated sequences of identified groups -'
121
+ 'provide group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in Partial'
122
+ 'run mode.',
115
123
  required=False)
116
124
  output_args.add_argument('-original_fasta', action='store', dest='original_fasta',
117
125
  help='FASTA file to use in conjunction with "-w" or "-con" when running in Partial Mode.',
118
126
  required=False)
119
- output_args.add_argument('-gpa', action='store', dest='gene_presence_absence_out', help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
120
- required=False)
127
+ output_args.add_argument('-gpa', action='store_true', dest='gene_presence_absence_out', default=None,
128
+ help='Default - False: If selected, a Roary/Panaroo formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
129
+ required=False)
121
130
 
122
131
  ### Misc Arguments
123
132
  misc = parser.add_argument_group('Misc')
124
- misc.add_argument('-verbose', action='store', dest='verbose', default=False, type=eval, choices=[True, False],
125
- help='Default - False: Print out runtime messages',
133
+ misc.add_argument('-verbose', action='store_true', dest='verbose', default=None, help='Default - False: Print out runtime messages',
126
134
  required = False)
127
135
  misc.add_argument('-v', action='store_true', dest='version',
128
136
  help='Default - False: Print out version number and exit',
@@ -130,20 +138,24 @@ def main():
130
138
 
131
139
  options = parser.parse_args()
132
140
 
133
- ### Checking all required parameters are provided by user
141
+ ### Checking all required parameters are provided by user #!!# Doesn't seem to work
134
142
  if options.run_mode == 'Full':
143
+
135
144
  if options.reclustered != None:
136
145
  sys.exit("Currently reclustering only works on Partial Mode.")
137
- required_full_mode = [options.input_type, options.input_dir, options.name_split, options.clust_tool,
146
+ required_full_mode = [options.input_type, options.input_dir, options.name_split, options.clustering_format,
138
147
  options.pident, options.len_diff]
139
148
  if all(required_full_mode):
140
149
  # Proceed with the Full mode
141
150
  pass
142
151
  else:
143
152
  missing_options = [opt for opt in
144
- ['input_type', 'input_dir', 'name_split', 'clust_tool', 'pident', 'len_diff'] if
153
+ ['input_type', 'input_dir', 'name_split', 'clustering_format', 'pident', 'len_diff'] if
145
154
  not options.__dict__[opt]]
146
155
  print(f"Missing required options for Full mode: {', '.join(missing_options)}")
156
+ if options.align_core != None:
157
+ if options.write_groups == None:
158
+ sys.exit('Must provide "-w" to output gene groups before alignment "-a" can be done.')
147
159
  elif options.run_mode == 'Partial':
148
160
  required_partial_mode = [options.cluster_file, ]
149
161
  if all(required_partial_mode):
@@ -154,33 +166,37 @@ def main():
154
166
  ['cluster_file',] if
155
167
  not options.__dict__[opt]]
156
168
  print(f"Missing required options for Partial mode: {', '.join(missing_options)}")
169
+ if options.align_core != None:
170
+ if options.write_groups == None or options.original_fasta == None:
171
+ sys.exit('Must provide "-w" and "-original_fasta" to output gene groups before alignment "-a" can be done.')
157
172
 
158
- if options.clust_tool == 'CD-HIT':
173
+ if options.clustering_format == 'CD-HIT':
159
174
  clust_affix = '.clstr'
160
- elif options.clust_tool == 'TSV':
175
+ elif options.clustering_format == 'TSV':
161
176
  clust_affix = '.tsv'
162
- elif options.clust_tool == 'CSV':
177
+ elif options.clustering_format == 'CSV':
163
178
  clust_affix = '.csv'
164
179
 
165
180
 
166
181
 
182
+
167
183
  ###External tool checks:
168
184
  ##MAFFT
169
- if options.con_core == True:
185
+ if options.align_core == True:
170
186
  if is_tool_installed('mafft'):
171
- if options.verbose == True:
187
+ if options.verbose != None:
172
188
  print("mafft is installed. Proceeding with alignment.")
173
189
  else:
174
190
  exit("mafft is not installed. Please install mafft to proceed.")
175
191
  ##CD-HIT
176
- if options.clust_tool == 'CD-HIT' and options.run_mode == 'Full':
192
+ if options.clustering_format == 'CD-HIT' and options.run_mode == 'Full':
177
193
  if is_tool_installed('cd-hit'):
178
- if options.verbose == True:
194
+ if options.verbose != None:
179
195
  print("cd-hit is installed. Proceeding with clustering.")
180
196
  else:
181
197
  exit("cd-hit is not installed. Please install cd-hit to proceed.")
182
198
 
183
- if options.write_families != None and options.original_fasta == False:
199
+ if options.write_groups != None and options.original_fasta == False:
184
200
  exit("-fasta must br provided if -w is used")
185
201
 
186
202
 
@@ -197,35 +213,48 @@ def main():
197
213
 
198
214
  output_path = os.path.abspath(options.output_dir)
199
215
  combined_out_file = os.path.join(output_path, "combined_sequences.fasta")
200
- clustering_output = os.path.join(output_path, 'clustering_' + options.clust_tool)
216
+ clustering_output = os.path.join(output_path, 'clustering_' + options.clustering_format)
201
217
 
202
218
  if options.group_type == 'Species':
203
219
  options.core_groups = options.core_groups + ',0'
204
220
  groups_to_use = options.core_groups
205
- else:
221
+ elif options.group_type == 'Genus':
206
222
  options.genus_groups = options.genus_groups + ',>'
207
223
  groups_to_use = options.genus_groups
224
+ if options.align_core != None:
225
+ sys.exit("-a align_core not a valid option in Genus mode.")
208
226
 
209
227
 
210
228
  if options.run_mode == 'Full':
229
+ if not os.path.exists(output_path):
230
+ os.makedirs(output_path)
231
+ if options.sequence_type == 'AA':
232
+ clustering_mode = 'cd-hit'
233
+ translate = True
234
+ elif options.sequence_type == 'DNA':
235
+ clustering_mode = 'cd-hit-est'
236
+ translate = False
211
237
  if options.input_type == 'separate':
212
- read_separate_files(options.input_dir, options.name_split, combined_out_file)
238
+ read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
213
239
  else:
214
- read_combined_files(options.input_dir, options.name_split, combined_out_file)
240
+ read_combined_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
215
241
 
216
- run_cd_hit(combined_out_file, clustering_output, options)
242
+ if options.clustering_format == 'CD-HIT':
243
+ run_cd_hit(options, combined_out_file, clustering_output, clustering_mode)
217
244
 
218
245
  class clustering_options:
219
246
  def __init__(self):
220
- self.cluster_format = options.clust_tool
247
+ self.run_mode = options.run_mode
248
+ self.cluster_format = options.clustering_format
249
+ self.sequence_type = options.sequence_type
221
250
  self.reclustered = options.reclustered
222
251
  self.sequence_tag = options.sequence_tag
223
252
  self.core_groups = groups_to_use
224
253
  self.clusters = clustering_output + clust_affix
225
254
  self.output_dir = options.output_dir
226
255
  self.gene_presence_absence_out = options.gene_presence_absence_out
227
- self.write_families = options.write_families
228
- self.con_core = options.con_core
256
+ self.write_groups = options.write_groups
257
+ self.align_core = options.align_core
229
258
  self.fasta = combined_out_file
230
259
  self.verbose = options.verbose
231
260
 
@@ -234,15 +263,16 @@ def main():
234
263
  elif options.run_mode == 'Partial':
235
264
  class clustering_options:
236
265
  def __init__(self):
237
- self.cluster_format = options.clust_tool
266
+ self.run_mode = options.run_mode
267
+ self.cluster_format = options.clustering_format
238
268
  self.reclustered = options.reclustered
239
269
  self.sequence_tag = options.sequence_tag
240
270
  self.core_groups = groups_to_use
241
271
  self.clusters = options.cluster_file
242
272
  self.output_dir = options.output_dir
243
273
  self.gene_presence_absence_out = options.gene_presence_absence_out
244
- self.write_families = options.write_families
245
- self.con_core = options.con_core
274
+ self.write_groups = options.write_groups
275
+ self.align_core = options.align_core
246
276
  self.fasta = options.original_fasta
247
277
  self.verbose = options.verbose
248
278
 
@@ -258,4 +288,5 @@ def main():
258
288
  "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
259
289
 
260
290
  if __name__ == "__main__":
291
+ print("Running PyamilySeq "+PyamilySeq_Version)
261
292
  main()
@@ -1,10 +1,5 @@
1
1
  #from line_profiler_pycharm import profile
2
2
 
3
- import copy
4
- import sys
5
- import math
6
- from collections import Counter
7
-
8
3
 
9
4
  try:
10
5
  from .Constants import *
@@ -16,45 +11,6 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
16
11
  from utils import *
17
12
 
18
13
 
19
-
20
- def process_gene_families(options, directory, output_file):
21
- """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
22
- concatenated_sequences = {}
23
- output_file = directory.replace('Gene_Families_Output',output_file)
24
-
25
- # Iterate over each gene family file
26
- for gene_file in os.listdir(directory):
27
- if gene_file.endswith('.fasta'):
28
- gene_path = os.path.join(directory, gene_file)
29
-
30
- # Read sequences from the gene family file
31
- sequences = read_fasta(gene_path)
32
-
33
- # Select the longest sequence for each genome
34
- longest_sequences = select_longest_gene(sequences)
35
-
36
- # Run mafft on the longest sequences
37
- aligned_file = f"{gene_file}_aligned.fasta"
38
- run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
39
-
40
- # Read aligned sequences and concatenate them
41
- aligned_sequences = read_fasta(aligned_file)
42
- for genome, aligned_seq in aligned_sequences.items():
43
- genome_name = genome.split('|')[0]
44
- if genome_name not in concatenated_sequences:
45
- concatenated_sequences[genome_name] = ""
46
- concatenated_sequences[genome_name] += aligned_seq
47
-
48
- # Clean up aligned file
49
- os.remove(aligned_file)
50
-
51
- # Write the concatenated sequences to the output file
52
- with open(output_file, 'w') as out:
53
- for genome, sequence in concatenated_sequences.items():
54
- out.write(f">{genome}\n")
55
- wrapped_sequence = wrap_sequence(sequence, 60)
56
- out.write(f"{wrapped_sequence}\n")
57
-
58
14
  def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
59
15
  print("Outputting gene_presence_absence file")
60
16
  output_dir = os.path.abspath(options.output_dir)
@@ -99,7 +55,7 @@ def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_s
99
55
 
100
56
 
101
57
 
102
- def get_cores(options,genus_dict):
58
+ def get_cores(options):
103
59
  ##Calculate core groups
104
60
  groups = OrderedDict()
105
61
  cores = OrderedDict()
@@ -117,27 +73,26 @@ def get_cores(options,genus_dict):
117
73
  cores[only_second_core_group] = []
118
74
  return cores, groups
119
75
 
120
-
121
76
  #@profile
122
- def calc_First_only_core(cluster, First_number, cores):
77
+ def calc_First_only_core(cluster, First_num, cores):
123
78
  try:
124
- cores['First_genera_'+str(First_number)].append(cluster)
79
+ cores['First_genera_' + str(First_num)].append(cluster)
125
80
  except KeyError:
126
81
  cores['First_genera_>'].append(cluster)
127
82
  #@profile
128
83
  def calc_single_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count gene families extended with StORFs
129
84
  group = First_num + Second_num
130
85
  try:
131
- cores['extended_genera_' + group].append(cluster)
86
+ cores['extended_genera_' + str(group)].append(cluster)
132
87
  except KeyError:
133
88
  cores['extended_genera_>'].append(cluster)
134
89
  #@profile
135
90
  def calc_multi_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
136
91
  group = First_num + Second_num
137
92
  try:
138
- cores['combined_genera_' + group].append(cluster)
93
+ cores['combined_genera_' + str(group)].append(cluster)
139
94
  except KeyError:
140
- cores['combined_genera_>' + group].append(cluster)
95
+ cores['combined_genera_>'].append(cluster)
141
96
  #@profile
142
97
  def calc_Second_only_core(cluster, cores, Second_num):
143
98
  try:
@@ -157,28 +112,26 @@ def calc_only_Second_only_core(cluster, cores, Second_num): # only count the tru
157
112
  def cluster(options):
158
113
 
159
114
  if options.cluster_format == 'CD-HIT':
160
- genus_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '_')
161
- elif options.cluster_format in ['TSV','CSV']:
162
- genus_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '_')
163
-
115
+ genus_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '_')
116
+ elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
117
+ genus_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '_')
164
118
 
119
+ ###
120
+ cores, groups = get_cores(options)
121
+ ###
165
122
 
166
123
  if options.reclustered != None:
167
-
168
124
  if options.cluster_format == 'CD-HIT':
169
- combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_CDHIT(options, genus_dict, '_')
170
- if options.cluster_format == ['TSV','CSV']:
171
- combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_Edge_List(options, '_')
172
- pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, '_')
125
+ combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genus_dict, '_')
126
+ elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
127
+ combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_Edge_List(options, '_')
128
+ pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, '_')
173
129
  else:
174
-
175
130
  pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)
176
131
 
177
- ###
178
- cores, groups = get_cores(options, genus_dict)
179
- ###
180
132
 
181
- Number_Of_StORF_Extending_But_Same_Genomes = 0
133
+
134
+ Number_Of_Second_Extending_But_Same_Genomes = 0
182
135
 
183
136
  sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
184
137
  pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_first_keys)
@@ -186,19 +139,28 @@ def cluster(options):
186
139
  pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)
187
140
 
188
141
  print("Calculating Groups")
142
+ seen_groupings = []
189
143
  for cluster, numbers in pangenome_clusters_Type_sorted.items():
190
144
  ############################### Calculate First only
191
- calc_First_only_core(cluster, numbers[1], cores)
145
+ cluster = str(cluster)
146
+ for grouping in numbers[2]: #!!# Could do with a more elegant solution
147
+ current_cluster = grouping[0].split(':')[0]
148
+ if current_cluster not in seen_groupings:
149
+ seen_groupings.append(current_cluster)
150
+ current_cluster_size = grouping[0].split(':')[1]
151
+ calc_First_only_core(current_cluster, current_cluster_size, cores)
152
+ ############################# Calculate First and Reclustered-Second
153
+ if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
154
+ calc_single_First_extended_Second_only_core(cluster, numbers[1], cores, numbers[3])
155
+ elif numbers[0] > 1 and numbers[3] >= 1: # If unique Seconds combined multiple Firsts
156
+ calc_multi_First_extended_Second_only_core(cluster, numbers[1], cores, numbers[3])
157
+ elif numbers[4] >= 1:
158
+ Number_Of_Second_Extending_But_Same_Genomes += 1
159
+ else:
160
+ if options.verbose == True:
161
+ print("First cluster " + current_cluster + " already processed - This is likely because it was clustered with another First representative.")
192
162
 
193
163
  if options.reclustered != None:
194
- ############################# Calculate First and Reclustered-Second
195
- if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
196
- calc_single_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
197
- elif numbers[0] > 1 and numbers[3] >= 1: # If unique Secondss combined multiple Firsts
198
- calc_multi_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
199
- elif numbers[4] >= 1:
200
- Number_Of_StORF_Extending_But_Same_Genomes += 1
201
-
202
164
  combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
203
165
  combined_pangenome_clusters_Second_Type = defaultdict(list)
204
166
  for cluster, genomes in combined_pangenome_clusters_Second.items():
@@ -207,52 +169,73 @@ def cluster(options):
207
169
  else:
208
170
  combined_pangenome_clusters_ONLY_Second_Type[cluster] = [cluster, len(genomes)]
209
171
  for cluster, data in combined_pangenome_clusters_Second_Type.items():
210
- if data[1] >=1:
172
+ if data[1] >= 1:
211
173
  calc_Second_only_core(cluster, cores, data[1])
212
174
  for cluster, data in combined_pangenome_clusters_ONLY_Second_Type.items():
213
- if data[1] >= 1 :
175
+ if data[1] >= 1:
214
176
  calc_only_Second_only_core(cluster, cores, data[1])
215
177
  ###########################
216
178
  ### Output
217
- key_order = list(cores.keys())
218
179
  output_path = os.path.abspath(options.output_dir)
180
+ if not os.path.exists(output_path):
181
+ os.makedirs(output_path)
219
182
  stats_out = os.path.join(output_path,'summary_statistics.txt')
183
+ key_order = list(cores.keys())
220
184
  with open(stats_out,'w') as outfile:
221
185
  print("Genus Groups:")
222
186
  outfile.write("Genus Groups:\n")
223
187
  for key in key_order:
224
188
  print(key+':\t'+str(len(cores[key])))
225
189
  outfile.write(key + ':\t' + str(len(cores[key]))+'\n')
226
- print("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
190
+ print("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
227
191
  outfile.write("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
192
+ if options.reclustered!= None:
193
+ print("Total Number of Second Gene Groups (Including Singletons): " + str(
194
+ len(combined_pangenome_clusters_Second_sequences)))
195
+ print("Total Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
196
+ Number_Of_Second_Extending_But_Same_Genomes))
197
+ outfile.write("\nTotal Number of Second Gene Groups (Including Singletons): " + str(
198
+ len(combined_pangenome_clusters_Second_sequences)))
199
+ outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
200
+ Number_Of_Second_Extending_But_Same_Genomes))
228
201
 
229
202
  if options.gene_presence_absence_out != None:
230
203
  gene_presence_absence_output(options,genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
231
204
 
232
- if options.write_families != None and options.fasta != None:
233
- sequences = read_fasta(options.fasta)
234
- output_dir = os.path.join(output_path, 'Gene_Families_Output')
235
-
236
- # Create output directory if it doesn't exist
237
- if not os.path.exists(output_dir):
238
- os.makedirs(output_dir)
239
- for key_prefix in key_order:
240
- for key, values in cores.items():
241
- if any(part in options.write_families.split(',') for part in key.split('_')):
242
- if key.startswith(key_prefix):
243
- for value in values:
244
- output_filename = f"{key}_{value}.fasta"
245
- sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
246
- # Write sequences to output file that are in the sequences dictionary
247
- with open(os.path.join(output_dir, output_filename), 'w') as outfile:
248
- for header in sequences_to_write:
249
- if header in sequences:
250
- outfile.write(f">{header}\n")
251
- wrapped_sequence = wrap_sequence(sequences[header])
252
- outfile.write(f"{wrapped_sequence}\n")
253
-
254
- if options.con_core != None and options.fasta != None and options.write_families != None:
255
- process_gene_families(options, os.path.join(output_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
205
+ if options.run_mode == 'Full':
206
+ if options.reclustered == None:
207
+ combined_pangenome_clusters_Second_sequences = None
208
+ if options.write_groups != None:
209
+ print("Outputting gene group FASTA files")
210
+ sequences = read_fasta(options.fasta)
211
+ #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
212
+ output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
213
+ write_groups(options,output_dir, key_order, cores, sequences,
214
+ pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
215
+
216
+ elif options.run_mode == 'Partial':
217
+ if options.reclustered == None:
218
+ combined_pangenome_clusters_Second_sequences = None
219
+ if options.write_groups != None and options.fasta != None:
220
+ print("Outputting gene group FASTA files")
221
+ sequences = read_fasta(options.fasta)
222
+ #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
223
+ output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
224
+ write_groups(options,output_dir, key_order, cores, sequences,
225
+ pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
226
+
227
+
228
+ # if options.write_groups != None and options.fasta != None:
229
+ # sequences = read_fasta(options.fasta)
230
+ # output_dir = os.path.join(output_path, 'Gene_Families_Output')
231
+ #
232
+ # write_groups(options,output_dir, key_order, cores, sequences,
233
+ # pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
234
+
235
+
236
+ #!!# - Currently only align in Species Mode
237
+ #if options.align_core != None and options.fasta != None and options.write_groups != None:
238
+ # process_gene_families(options, os.path.join(output_path, 'Gene_Families_Output'), 'concatenated_genes_aligned.fasta')
256
239
 
257
240
 
258
241