PyamilySeq 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,571 @@
1
+
2
+ import argparse
3
+ from collections import defaultdict, OrderedDict
4
+
5
+
6
+ try:
7
+ from .constants import *
8
+ from .utils import *
9
+ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
10
+ from constants import *
11
+ from utils import *
12
+
13
def run_cd_hit(options, input_file, clustering_output, clustering_mode):
    """Invoke a CD-HIT executable on a FASTA file.

    Args:
        options: Namespace providing ``pident``, ``len_diff``,
            ``clustering_threads``, ``clustering_memory``, ``fast_mode`` and
            ``verbose``.
        input_file: Path of the FASTA file handed to ``-i``.
        clustering_output: Output path handed to ``-o``.
        clustering_mode: Name of the executable to launch (first element of
            the command line).

    The exit status of the child process is not inspected.
    """
    flag_values = (
        ('-i', input_file),
        ('-o', clustering_output),
        ('-c', str(options.pident)),
        ('-s', str(options.len_diff)),
        ('-T', str(options.clustering_threads)),
        ('-M', str(options.clustering_memory)),
        ('-d', "0"),
        ('-g', str(options.fast_mode)),
        ('-sc', "1"),
        ('-sf', "1"),
    )

    command = [clustering_mode]
    for flag, value in flag_values:
        command.append(flag)
        command.append(value)

    # Child output is only shown when verbose output was requested.
    run_kwargs = {}
    if not (options.verbose == True):
        run_kwargs['stdout'] = subprocess.DEVNULL
        run_kwargs['stderr'] = subprocess.DEVNULL

    subprocess.run(command, **run_kwargs)
31
+
32
+ #'@profile
33
def calculate_new_rep_seq(cluster_data, length_weight=1.0, identity_weight=1.0):
    """Pick the entry of ``cluster_data`` with the lowest combined score.

    Args:
        cluster_data: Non-empty sequence of dicts with numeric ``'length'``
            and ``'percent_identity'`` values.
        length_weight: Multiplier for the length term of the score.
        identity_weight: Multiplier for the identity term of the score.

    Returns:
        The entry (the same dict object) with the smallest score; on ties the
        earliest entry wins.

    Raises:
        ZeroDivisionError: If ``cluster_data`` is empty or the longest entry
            has length 0.
    """
    entry_count = len(cluster_data)
    mean_length = sum(item['length'] for item in cluster_data) / entry_count
    mean_identity = sum(item['percent_identity'] for item in cluster_data) / entry_count

    longest = max(item['length'] for item in cluster_data)
    identity_scale = 100  # percent_identity is treated as a 0-100 value

    chosen = None
    chosen_score = None
    for candidate in cluster_data:
        length_term = abs(candidate['length'] - mean_length) / longest
        identity_term = abs(candidate['percent_identity'] - mean_identity) / identity_scale
        # NOTE(review): the identity contribution is (1 - deviation), so while
        # minimising, a larger deviation from the mean identity lowers the
        # score - confirm this is the intended direction.
        candidate_score = (length_weight * length_term) + (identity_weight * (1 - identity_term))
        if chosen is None or candidate_score < chosen_score:
            chosen = candidate
            chosen_score = candidate_score

    return chosen
52
+
53
+
54
+
55
+ #def length_within_threshold(rep_length, length, len_diff):
56
+ # return abs(rep_length - length) / rep_length <= len_diff
57
+
58
+
59
def check_if_all_identical(clustered_sequences):
    """Report whether every clustered entry shares one length and one identity.

    Args:
        clustered_sequences: Mapping of cluster id to a list of dicts holding
            ``'length'`` and ``'percent_identity'``.

    Returns:
        True only when exactly one distinct length and exactly one distinct
        percent identity occur across all clusters (so an empty mapping
        yields False).
    """
    distinct_lengths = set()
    for members in clustered_sequences.values():
        for member in members:
            distinct_lengths.add(member['length'])

    distinct_identities = set()
    for members in clustered_sequences.values():
        for member in members:
            distinct_identities.add(member['percent_identity'])

    if len(distinct_lengths) != 1:
        return False
    return len(distinct_identities) == 1
64
+
65
+
66
+
67
def read_fasta_groups(options, groups_to_use):
    """Load the sequences of the selected gene groups from the combined FASTA.

    The file read is
    ``<input_directory>/Gene_Groups_Output/combined_group_sequences_aa.fasta``
    when ``options.sequence_type == 'AA'`` and the ``_dna.fasta`` variant
    otherwise. Headers are expected to look like ``>Group_<n>|<genome>|...``.

    Args:
        options: Namespace providing ``sequence_type`` and ``input_directory``.
        groups_to_use: ``(mode, value)`` pair.
            ``('ids', '1,2,3')`` selects the listed group numbers.
            ``('groups', 95)`` selects the groups whose per-group file in the
            output directory (name containing ``core`` and shaped like
            ``<a>_<b>_<level>_<id>.fasta``) has a level between ``value`` and
            100 inclusive.
            Any other mode selects every group.

    Returns:
        ``(groups, genome_count)`` where ``groups`` maps ``'>Group_<n>'`` to a
        list of ``(header, sequence)`` tuples and ``genome_count`` maps each
        genome name to the number of headers seen for it (counted over ALL
        records in the file, not only the selected groups).
    """
    groups = defaultdict(list)
    genome_count = defaultdict(int)

    affix = '_aa.fasta' if options.sequence_type == 'AA' else '_dna.fasta'
    combined_groups_fasta = options.input_directory + '/Gene_Groups_Output/combined_group_sequences' + affix

    selection_mode = groups_to_use[0]
    if selection_mode == 'ids':
        # A set gives O(1) membership tests; empty tokens (e.g. from a
        # trailing comma) are ignored instead of crashing int().
        selected_group_ids = {int(token.strip()) for token in groups_to_use[1].split(',') if token.strip()}
    elif selection_mode == 'groups':
        selected_levels = set(range(int(groups_to_use[1]), 101))
        selected_group_ids = set()
        # Scan the output directory for per-group files at the wanted levels.
        for filename in os.listdir(os.path.dirname(combined_groups_fasta)):
            if 'core' in filename and filename.endswith('.fasta'):
                name_parts = filename.split('_')
                try:
                    level = int(name_parts[2])
                    if level in selected_levels:
                        selected_group_ids.add(int(name_parts[3].split('.')[0]))
                except (ValueError, IndexError):
                    # File name does not follow the expected layout; skip it.
                    continue
    else:
        # No recognised selector: keep every group.
        selected_group_ids = None

    current_group = None  # None while the current record is being skipped
    current_group_header = None
    current_sequence = []

    with open(combined_groups_fasta, 'r') as f:
        for line in f:
            if line.startswith('>'):
                # Flush the previous record if it belonged to a selected group.
                if current_group is not None:
                    groups[current_group].append((current_group_header, ''.join(current_sequence)))

                current_group_header = line.strip()
                header_fields = current_group_header.split('|')
                genome_count[header_fields[1]] += 1
                current_sequence = []

                group_number = int(header_fields[0].replace('>Group_', ''))  # Assuming format 'Group_n'
                if selected_group_ids is None or group_number in selected_group_ids:
                    current_group = header_fields[0]
                else:
                    current_group = None
            else:
                current_sequence.append(line.strip())

    # Flush the final record of the file.
    if current_group is not None:
        groups[current_group].append((current_group_header, ''.join(current_sequence)))

    return groups, genome_count
123
+
124
+
125
+
126
+
127
def write_fasta(sequences, output_file):
    """Write ``(header, sequence)`` pairs to ``output_file``.

    Each pair produces two lines: the header exactly as given, then the
    sequence. An existing file is overwritten.
    """
    with open(output_file, 'w') as handle:
        handle.writelines(f"{header}\n{seq}\n" for header, seq in sequences)
131
+
132
+
133
def read_cd_hit_output(clustering_output):
    """Parse a CD-HIT ``.clstr`` file into an ordered mapping of clusters.

    Args:
        clustering_output: Path of the cluster file to read.

    Returns:
        ``OrderedDict`` mapping the cluster id (the token after
        ``>Cluster``) to a list of dicts with keys ``'header'`` (prefixed
        with ``>``), ``'length'`` (int) and ``'percent_identity'``.

    Raises:
        ValueError: If a member line carries neither an ``at ...%`` identity
            nor the trailing ``*`` marker.
    """
    clusters = OrderedDict()
    active_cluster = None

    with open(clustering_output, 'r') as handle:
        for raw_line in handle:
            record = raw_line.strip()

            if record.startswith(">Cluster"):
                active_cluster = record.split(' ')[1]
                clusters[active_cluster] = []
                continue

            # Ignore blank lines and anything before the first cluster header.
            if not record or active_cluster is None:
                continue

            fields = record.split('\t')
            if len(fields) <= 1:
                continue

            member_info = fields[1]
            size_field = member_info.split(',')[0]
            # Keep only the digits of the size field (drops the unit suffix).
            member_length = int(''.join(filter(str.isdigit, size_field)))
            member_header = '>' + member_info.split('>')[1].split('...')[0]

            has_identity = 'at ' in member_info and '%' in member_info.split('at ')[-1]
            if has_identity:
                identity = extract_identity(record)
            elif record.endswith('*'):
                # Lines ending in '*' are given an identity of 100.0.
                identity = 100.0
            else:
                raise ValueError("Percent identity not found in the string.")

            clusters[active_cluster].append({
                'header': member_header,
                'length': member_length,
                'percent_identity': identity,
            })

    return clusters
167
+
168
+ #@profile
169
def _split_by_similarity(options, group_file_name, sequences, genomes, clustered_sequences):
    """Greedily split a group into subgroups holding at most one gene per genome.

    For every cluster a representative is recomputed from the members still
    unassigned; each genome then contributes its remaining sequence with the
    smallest distance to that representative (distance comes from the
    external ``levenshtein_distance_calc`` helper). Each pass writes one
    subgroup FASTA into ``options.sub_groups_output``.

    Returns:
        List with the size of every subgroup, in the order they were created.
    """
    subgroup_sizes = []
    # One counter for the whole group so subgroup file names never collide
    # between clusters.
    subgroup_id = 0

    # First occurrence wins, matching a front-to-back search of `sequences`.
    sequence_by_header = {}
    for header, seq in sequences:
        sequence_by_header.setdefault(header, seq)

    for cluster in clustered_sequences.values():
        cluster_headers = {entry['header'] for entry in cluster}
        remaining = [item for item in sequences if item[0] in cluster_headers]
        removed_headers = set()

        while remaining:
            remaining_headers = {header for header, _ in remaining}
            rep = calculate_new_rep_seq(
                [entry for entry in cluster if entry['header'] in remaining_headers]
            )
            rep_seq = sequence_by_header.get(rep['header'])

            # Distances are relative to the current representative, so the
            # cache only lives for one pass.
            distance_cache = {}
            subgroup = []

            for genome in genomes:
                best = None
                best_distance = None
                for header, seq in remaining:
                    if header.split('|')[1] != genome:
                        continue
                    if seq == rep_seq:
                        distance = 0
                    elif seq in distance_cache:
                        distance = distance_cache[seq]
                    else:
                        distance = levenshtein_distance_calc(rep_seq, seq)
                        distance_cache[seq] = distance
                    # Lower distance means more similar; ties keep the first hit.
                    if best is None or distance < best_distance:
                        best = (header, seq)
                        best_distance = distance

                if best is not None:
                    fields = best[0].split('|')
                    new_header = f">{group_file_name}_subgroup_{subgroup_id}|{fields[1]}|{fields[2]}"
                    subgroup.append((new_header, best[1]))
                    removed_headers.add(best[0])

            if not subgroup:
                # Nothing could be assigned; stop rather than loop forever.
                break

            subgroup_file = f"{options.sub_groups_output}/{group_file_name}_subgroup_{subgroup_id}.fasta"
            write_fasta(subgroup, subgroup_file)

            remaining = [item for item in remaining if item[0] not in removed_headers]
            subgroup_id += 1
            subgroup_sizes.append(len(subgroup))

    return subgroup_sizes


def _split_identical_evenly(options, group_file_name, sequences):
    """Distribute indistinguishable sequences round-robin per genome.

    The k-th sequence seen for a genome goes to subgroup ``k % 1000``. Each
    subgroup is written as a FASTA file into ``options.sub_groups_output``.

    Returns:
        List with the size of every subgroup, in creation order.
    """
    num_subgroups = 1000
    subgroups = defaultdict(list)
    genes_assigned = defaultdict(int)  # genes already placed, per genome

    for header, seq in sequences:
        fields = header.split('|')
        genome = fields[1]
        subgroup_id = genes_assigned[genome] % num_subgroups
        # Headers carry the leading '>' just like the similarity-based split.
        new_header = f">{group_file_name}_subgroup_{subgroup_id}|{genome}|{fields[2]}"
        subgroups[subgroup_id].append((new_header, seq))
        genes_assigned[genome] += 1

    subgroup_sizes = []
    for subgroup_id, members in subgroups.items():
        subgroup_file = f"{options.sub_groups_output}/{group_file_name}_subgroup_{subgroup_id}.fasta"
        write_fasta(members, subgroup_file)
        subgroup_sizes.append(len(members))

    return subgroup_sizes


def separate_groups(options, clustering_mode, groups_to_use):
    """Re-cluster multi-copy gene groups and split them into subgroups.

    For every selected group in which the percentage of genomes (relative to
    ``options.genome_num``) holding more than one gene reaches
    ``options.group_threshold``, the group is written to a temporary FASTA,
    clustered with CD-HIT and split into subgroup FASTA files.

    Args:
        options: Namespace providing ``verbose``, ``genome_num``,
            ``group_threshold``, ``gene_groups_output``, ``sub_groups_output``
            and ``delete_temp_files`` (plus whatever ``read_fasta_groups`` and
            ``run_cd_hit`` read).
        clustering_mode: Name of the CD-HIT executable passed to ``run_cd_hit``.
        groups_to_use: Group selector forwarded to ``read_fasta_groups``.

    Returns:
        ``defaultdict`` mapping ``'>Group_<n>'`` to
        ``{'count': <subgroups created>, 'sizes': [<size of each subgroup>]}``.
    """
    groups, _ = read_fasta_groups(options, groups_to_use)

    # Tracks the number of new subgroups and their sizes per original group.
    paralog_groups = defaultdict(lambda: {'count': 0, 'sizes': []})

    for group_header, sequences in groups.items():
        if options.verbose:
            print(f"\n###\nCurrent Group: {group_header.replace('>','')}\n")

        group_name = group_header.split('|')[0]  # e.g. '>Group_n'

        # Count genes per genome to find genomes with more than one copy.
        genome_to_gene_count = defaultdict(int)
        for header, _ in sequences:
            genome_to_gene_count[header.split('|')[1]] += 1

        num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)

        # Skip groups below the multi-copy threshold.
        if (num_genomes_with_multiple_genes / options.genome_num) * 100 < options.group_threshold:
            continue

        group_file_name = group_name.replace('>', '')

        temp_fasta = f"{options.gene_groups_output}/{group_file_name}.fasta"
        write_fasta(sequences, temp_fasta)

        # Run cd-hit on the individual group and read back its clusters.
        clustering_output = f"{options.gene_groups_output}/{group_file_name}_clustering"
        run_cd_hit(options, temp_fasta, clustering_output, clustering_mode)
        clustered_sequences = read_cd_hit_output(clustering_output + '.clstr')

        # NOTE(review): only a single cluster whose members all share one
        # length and identity is treated as "identical"; every other outcome
        # goes through the similarity-based split. Always assigning the flag
        # avoids reusing a value left over from a previous group.
        all_same = len(clustered_sequences) == 1 and check_if_all_identical(clustered_sequences)

        if all_same:
            subgroup_sizes = _split_identical_evenly(options, group_file_name, sequences)
        else:
            subgroup_sizes = _split_by_similarity(
                options, group_file_name, sequences, genome_to_gene_count, clustered_sequences
            )

        for size in subgroup_sizes:
            paralog_groups[group_name]['count'] += 1
            paralog_groups[group_name]['sizes'].append(size)

        # Clean up temporary files if the option is set.
        if options.delete_temp_files:
            for temp_path in (temp_fasta, clustering_output + '.clstr', clustering_output):
                if temp_path and os.path.exists(temp_path):
                    os.remove(temp_path)

    return paralog_groups
334
+
335
+
336
def _build_argument_parser():
    """Create the command-line parser for Group-Splitter."""
    parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Group-Splitter - A tool to split multi-copy gene groups identified by PyamilySeq.')

    ### Required Arguments
    required = parser.add_argument_group('Required Parameters')
    required.add_argument('-input_directory', action='store', dest='input_directory',
                          help='Provide the directory of a PyamilySeq run.',
                          required=True)
    # Not marked required: the option has a documented default of 'AA'.
    required.add_argument('-sequence_type', action='store', dest='sequence_type', default='AA', choices=['AA', 'DNA'],
                          help='Default - AA: Are groups "DNA" or "AA" sequences?',
                          required=False)
    required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
                          help='The total number of genomes must be provide',
                          required=True)

    ### Regrouping Arguments
    regrouping_params = parser.add_argument_group('Regrouping Parameters')
    regrouping_params.add_argument('-groups', action="store", dest='groups', type=int, default=None,
                                   help='Default - 99: groups to be split by pangenome grouping (see -group_threshold). '
                                        'Provide "-groups 99" to split specific groups.',
                                   required=False)
    regrouping_params.add_argument('-group_ids', action="store", dest='group_ids', default=None,
                                   help='Default - None: Provide "-group_ids 1,2,3,4" to split specific groups (see -group_threshold).',
                                   required=False)
    regrouping_params.add_argument('-group_threshold', action='store', dest='group_threshold', type=float, default=80,
                                   help='Default: 80: Minimum percentage of genomes with multi-copy in a gene group to be split.',
                                   required=False)

    ### Output Arguments
    output_args = parser.add_argument_group('Output Parameters')
    output_args.add_argument('-a', action="store", dest='align_core', default=None,
                             help='Default - No output: SLOW! (Only works for Species mode) Output aligned and concatinated sequences of identified groups -'
                                  'provide group levels at which to output "-a 99,95".',
                             required=False)

    ### CD-HIT Reclustering Arguments
    cdhit_params = parser.add_argument_group('CD-HIT Reclustering Parameters')
    cdhit_params.add_argument('-c', action='store', dest='pident', type=float, default=0.8,
                              help='Sequence identity threshold (default: 0.8) - Probably should be higher than what was used in initial clustering.')
    cdhit_params.add_argument('-s', action='store', dest='len_diff', type=float, default=0.20,
                              help="Length difference cutoff (default: 0.20) - Often the most impactful parameter to split 'multi-copy' gene groups.")
    cdhit_params.add_argument('-fastmode', action='store_true', dest='fast_mode',
                              help='Default False: Run CD-HIT with "-g 0" to speed up but reduce accuracy of clustering.',
                              required=False)
    cdhit_params.add_argument('-T', action='store', dest='clustering_threads', type=int, default=8,
                              help='Number of threads for clustering (default: 8)')
    cdhit_params.add_argument('-M', action='store', dest='clustering_memory', type=int, default=2000,
                              help='Memory limit in MB for clustering (default: 2000)')

    ### MAFFT Alignment Arguments
    alignment_args = parser.add_argument_group('Alignment Runtime Arguments - Optional when "-a" is provided.')
    alignment_args.add_argument("-t", action="store", dest="threads", type=int, default=8,
                                help="Default 8: Threads to be allocated for clustering and/or alignment.",
                                required=False)

    ### Misc Arguments
    misc = parser.add_argument_group("Misc Parameters")
    misc.add_argument('-no_delete_temp_files', action='store_false', dest='delete_temp_files',
                      help='Default: Delete all temporary files after processing.',
                      required=False)
    misc.add_argument("-verbose", action="store_true", dest="verbose",
                      help="Print verbose output.",
                      required=False)
    misc.add_argument("-v", "--version", action="version",
                      version=f"PyamilySeq: Group-Splitter version {PyamilySeq_Version} - Exiting",
                      help="Print out version number and exit")
    return parser


def _read_summary_statistics(summary_file):
    """Parse ``key: <integer>`` lines of a summary file into a dict.

    Lines without a colon, or whose value is not an integer (for example
    section headings), are skipped.
    """
    summary_info = {}
    with open(summary_file, 'r') as f:
        for line in f:
            key, separator, value = line.partition(':')
            if not separator:
                continue
            try:
                summary_info[key.strip()] = int(value.strip())
            except ValueError:
                continue
    return summary_info


def main():
    """Command-line entry point of Group-Splitter.

    Parses the arguments, checks the external tools, splits the selected
    multi-copy gene groups via ``separate_groups``, writes recalculated core
    statistics next to ``summary_statistics.txt`` and, when ``-a`` was given,
    runs the alignment step on the resulting groups.
    """
    options = _build_argument_parser().parse_args()
    print("Running PyamilySeq: Group-Splitter " + PyamilySeq_Version)

    ### External tool checks
    ## MAFFT - only needed when an alignment was requested. ``-a`` stores a
    ## string, so the request is detected by "is not None".
    if options.align_core is not None:
        if is_tool_installed('mafft'):
            if options.verbose:
                print("mafft is installed. Proceeding with alignment.")
        else:
            sys.exit("mafft is not installed. Please install mafft to proceed.")

    ## CD-HIT
    if not is_tool_installed('cd-hit'):
        sys.exit("cd-hit is not installed. Please install cd-hit to proceed.")
    if options.verbose:
        print("cd-hit is installed. Proceeding with clustering.")
    clustering_mode = 'cd-hit-est' if options.sequence_type == 'DNA' else 'cd-hit'
    # Translate the boolean switch into the value handed to CD-HIT's "-g".
    if options.fast_mode:
        options.fast_mode = 0
        if options.verbose:
            print("Running CD-HIT in fast mode.")
    else:
        options.fast_mode = 1
        if options.verbose:
            print("Running CD-HIT in slow mode.")

    ## Alignment
    if options.align_core is not None:
        if options.groups is None and options.group_ids is None:
            sys.exit('Must provide "-groups" or "-group_ids" when requesting alignment with "-a".')

    ## Output Directories
    gene_groups_output = os.path.join(options.input_directory, "Gene_Groups_Output")
    options.gene_groups_output = gene_groups_output
    sub_groups_output = os.path.join(options.input_directory, "Sub_Groups_Output")
    options.sub_groups_output = sub_groups_output
    os.makedirs(gene_groups_output, exist_ok=True)
    os.makedirs(sub_groups_output, exist_ok=True)

    ## Get Summary Stats
    summary_file = os.path.join(options.input_directory, 'summary_statistics.txt')

    # Save arguments to a text file
    params_out = os.path.join(options.input_directory, 'Group-Splitter_params.txt')
    with open(params_out, "w") as outfile:
        for arg, value in vars(options).items():
            outfile.write(f"{arg}: {value}\n")

    ## Group Selection - falls back to the 99% level when nothing is given
    if options.groups is not None and options.group_ids is not None:
        sys.exit('Must provide "-group_ids" or "-groups", not both.')
    elif options.group_ids is not None:
        groups_to_use = ('ids', options.group_ids)
    elif options.groups is not None:
        groups_to_use = ('groups', options.groups)
    else:
        groups_to_use = ('groups', 99)

    paralog_groups = separate_groups(options, clustering_mode, groups_to_use)

    # Print metrics about paralog groups
    print(f"Identified {len(paralog_groups)} paralog groups:")
    for group_id, data in paralog_groups.items():
        print(f"Group ID: {group_id}, Number of new groups: {data['count']}, Sizes: {data['sizes']}")

    # Read summary statistics
    summary_info = _read_summary_statistics(summary_file)
    genome_num = summary_info['Number of Genomes']
    core_levels = (99, 95, 15, 0)  # checked from the strictest level down
    new_cores = {level: summary_info[f'First_core_{level}'] for level in core_levels}
    total_gene_groups = summary_info['Total Number of First Gene Groups (Including Singletons)']

    # Recalculate each *_core_* value
    group_files = os.listdir(gene_groups_output)
    for group_id, data in paralog_groups.items():
        group_id = group_id.replace('>Group_', '')
        # Per-group files look like '<a>_core_<level>_<id>.fasta'; requiring
        # 'core' keeps a leftover temporary 'Group_<id>.fasta' from matching.
        original_file = next(
            (f for f in group_files if 'core' in f and f.endswith(f'_{group_id}.fasta')), None)
        original_level = None
        if original_file is not None:
            try:
                original_level = int(original_file.split('_')[2])
            except (IndexError, ValueError):
                original_level = None
        # The split group no longer counts at its original level.
        if original_level in new_cores:
            new_cores[original_level] -= 1

        # Each new subgroup is counted at the highest level its size reaches.
        for size in data['sizes']:
            for level in core_levels:
                if size >= math.floor(level * genome_num / 100):
                    new_cores[level] += 1
                    break

    # Write out the new summary statistics - currently only works for default cores
    stats_out = summary_file.replace('.txt', '_recalculated.txt')
    with open(stats_out, 'w') as outfile:
        print("Number of Genomes: " + str(options.genome_num))
        outfile.write("Number of Genomes: " + str(options.genome_num) + "\n")
        print("Reclaculated Gene Groups:")
        outfile.write("Recalculated Gene Groups\n")
        for level in core_levels:
            print(f"First_core_{level}: {new_cores[level]}")
            outfile.write(f"First_core_{level}: {new_cores[level]}\n")
        print("Total Number of First Gene Groups (Including Singletons): " + str(total_gene_groups))
        outfile.write("Total Number of First Gene Groups (Including Singletons): " + str(total_gene_groups))

    # Alignment
    if options.align_core is not None:
        print("\n\nProcessing gene group alignment")
        group_directory = options.gene_groups_output
        sub_group_directory = options.sub_groups_output
        genome_list = read_genomes_from_fasta(options.gene_groups_output + '/combined_group_sequences_dna.fasta')
        process_gene_groups(options, group_directory, sub_group_directory, paralog_groups, genome_list, 'concatenated_genes_post_splitting_aligned_dna.fasta')
562
+
563
+
564
+
565
+
566
+
567
+
568
+
569
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":

    main()