PyamilySeq 1.0.1__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,14 +22,17 @@ def main():
22
22
  ' "combined" for GFF files with embedded FASTA sequences and "fasta" for combining multiple '
23
23
  'FASTA files together.',
24
24
  required=True)
25
- required.add_argument("-name_split", action="store", dest="name_split",
26
- help="substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').",
27
- required=True)
25
+ required.add_argument("-name_split_gff", action="store", dest="name_split_gff",
26
+ help="Substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff'). - Not needed with -input_type fasta",
27
+ required=False)
28
+ required.add_argument("-name_split_fasta", action="store", dest="name_split_fasta",
29
+ help="Substring used to split filenames and extract genome names for fasta files if named differently to paired gff files (e.g., '_dna.fasta').",
30
+ required=False)
28
31
  required.add_argument("-output_dir", action="store", dest="output_dir",
29
32
  help="Directory for all output files.",
30
33
  required=True)
31
34
  required.add_argument("-output_name", action="store", dest="output_file",
32
- help="Output file name (without .fasta).",
35
+ help="Output file name.",
33
36
  required=True)
34
37
 
35
38
  optional = parser.add_argument_group('Optional Arguments')
@@ -48,19 +51,33 @@ def main():
48
51
  options = parser.parse_args()
49
52
 
50
53
 
54
+ if options.input_type == 'separate' and options.name_split_gff is None:
55
+ print("Please provide a substring to split the filename and extract the genome name.")
56
+ exit(1)
57
+ if options.input_type == 'combined' and options.name_split_gff is None:
58
+ print("Please provide a substring to split the filename and extract the genome name.")
59
+ exit(1)
60
+ if options.input_type == 'fasta' and options.name_split_fasta is None:
61
+ print("Please provide a substring to split the filename and extract the genome name.")
62
+ exit
51
63
 
52
64
  output_path = os.path.abspath(options.output_dir)
53
65
  if not os.path.exists(output_path):
54
66
  os.makedirs(output_path)
55
67
 
56
- combined_out_file = os.path.join(output_path, options.output_file + '.fasta')
68
+ #output_file = options.output_file + '.fasta'
69
+ if os.path.exists(os.path.join(output_path, options.output_file)):
70
+ print(f"Output file {options.output_file} already exists in the output directory. Please delete or rename the file and try again.")
71
+ exit(1)
72
+
73
+ combined_out_file = os.path.join(output_path, options.output_file )
57
74
 
58
75
  if options.input_type == 'separate':
59
- read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, options.translate)
76
+ read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, options.translate, True)
60
77
  elif options.input_type == 'combined':
61
- read_combined_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, options.translate)
78
+ read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, options.translate, True)
62
79
  elif options.input_type == 'fasta':
63
- read_fasta_files(options.input_dir, options.name_split, combined_out_file, options.translate)
80
+ read_fasta_files(options.input_dir, options.name_split_fasta, combined_out_file, options.translate)
64
81
 
65
82
  if __name__ == "__main__":
66
83
  main()
PyamilySeq/clusterings.py CHANGED
@@ -279,8 +279,6 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
279
279
  first = True
280
280
  for line in Second_in:
281
281
  if line.startswith('>'):
282
- if '>Cluster 1997' in line:
283
- print()
284
282
  if first == False:
285
283
  cluster_size = len(Combined_clusters[cluster_id])
286
284
  Combined_reps.update({rep: cluster_size})
PyamilySeq/constants.py CHANGED
@@ -1,2 +1,2 @@
1
- PyamilySeq_Version = 'v1.0.1'
1
+ PyamilySeq_Version = 'v1.1.1'
2
2
 
PyamilySeq/utils.py CHANGED
@@ -228,15 +228,39 @@ def run_mafft_on_sequences(options, sequences, output_file):
228
228
 
229
229
 
230
230
 
231
- def read_separate_files(input_dir, name_split, gene_ident, combined_out, translate):
232
- with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
233
- for gff_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
234
- genome_name = os.path.basename(gff_file).split(name_split)[0]
235
- corresponding_fasta_file = os.path.splitext(gff_file)[0] + '.fa'
236
- if not os.path.exists(corresponding_fasta_file):
237
- continue
231
+ def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident, combined_out, translate, run_as_combiner):
232
+ if run_as_combiner == True:
233
+ combined_out_file_aa = None
234
+ else:
235
+ combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
236
+
237
+ with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
238
+ paired_files_found = None
239
+ #with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
240
+ gff_files = glob.glob(os.path.join(input_dir, '*' + name_split_gff))
241
+ if not gff_files:
242
+ sys.exit("Error: No GFF files found.")
243
+ for gff_file in gff_files:
244
+ genome_name = os.path.basename(gff_file).split(name_split_gff)[0]
245
+ if name_split_fasta == None:
246
+ possible_extensions = ['.fa', '.fasta', '.fna']
247
+ corresponding_fasta_file = None
248
+ for ext in possible_extensions:
249
+ temp_file = os.path.splitext(gff_file)[0] + ext
250
+ if os.path.exists(temp_file):
251
+ corresponding_fasta_file = temp_file
252
+ break
253
+ if corresponding_fasta_file is None:
254
+ print("Corresponding FASTA file for GFF file '" + gff_file + "' not found. Skipping. - Try using the -name_split_fasta option.")
255
+ continue
256
+ else:
257
+ corresponding_fasta_file = os.path.join(input_dir, genome_name + name_split_fasta)
258
+ if not os.path.exists(corresponding_fasta_file):
259
+ print("Corresponding FASTA file for GFF file '" + gff_file + "' not found. Skipping. - Try using the -name_split_fasta option.")
260
+ continue
238
261
 
239
262
  gff_features = []
263
+ paired_files_found = True
240
264
  with open(gff_file, 'r') as file:
241
265
  seen_seq_ids = collections.defaultdict(int)
242
266
  lines = file.readlines()
@@ -244,6 +268,7 @@ def read_separate_files(input_dir, name_split, gene_ident, combined_out, transla
244
268
  line_data = line.split('\t')
245
269
  if len(line_data) == 9:
246
270
  if any(gene_type in line_data[2] for gene_type in gene_ident):
271
+ seq_id = line_data[8].split('ID=')[1].split(';')[0]
247
272
  contig = line_data[0]
248
273
  feature = line_data[2]
249
274
  strand = line_data[6]
@@ -253,7 +278,6 @@ def read_separate_files(input_dir, name_split, gene_ident, combined_out, transla
253
278
  seen_seq_ids[seq_id] + 1
254
279
  else:
255
280
  seen_seq_ids[seq_id] = 1
256
- seq_id = line_data[8].split('ID=')[1].split(';')[0]
257
281
  gff_features.append((contig, start, end, strand, feature, seq_id))
258
282
  fasta_dict = collections.defaultdict(str)
259
283
  with open(corresponding_fasta_file, 'r') as file:
@@ -281,21 +305,44 @@ def read_separate_files(input_dir, name_split, gene_ident, combined_out, transla
281
305
  full_sequence = fasta_dict[contig][1]
282
306
  seq = full_sequence[corrected_start:corrected_stop]
283
307
 
284
- if translate == True:
285
- seq_aa = translate_frame(seq)
286
- wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
287
- combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
288
- wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
289
- combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
308
+ if run_as_combiner == True:
309
+ if translate == True:
310
+ seq_aa = translate_frame(seq)
311
+ wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
312
+ combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
313
+ else:
314
+ wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
315
+ combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
316
+ else:
317
+ if translate == True:
318
+ seq_aa = translate_frame(seq)
319
+ wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
320
+ combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
321
+ wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
322
+ combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
290
323
 
291
- if translate == False:
324
+ if not paired_files_found:
325
+ sys.exit("Could not find matching GFF/FASTA files - Please check input directory and -name_split_gff and -name_split_fasta parameters.")
326
+ if translate == False or translate == None:
292
327
  #Clean up unused file
293
- os.remove(combined_out_file_aa.name)
328
+ try: # Catches is combined_out_file_aa is None
329
+ if combined_out_file.name != combined_out_file_aa.name:
330
+ os.remove(combined_out_file_aa.name)
331
+ except AttributeError:
332
+ pass
294
333
 
295
334
 
296
- def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate):
297
- with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
298
- for gff_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
335
+ def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate, run_as_combiner):
336
+ if run_as_combiner == True:
337
+ combined_out_file_aa = None
338
+ else:
339
+ combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
340
+ #with open(combined_out, 'w') as combined_out_file, open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w'):
341
+ with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
342
+ gff_files = glob.glob(os.path.join(input_dir, '*' + name_split))
343
+ if not gff_files:
344
+ sys.exit("Error: No GFF files found - check input directory and -name_split_gff parameter.")
345
+ for gff_file in gff_files:
299
346
  genome_name = os.path.basename(gff_file).split(name_split)[0]
300
347
  fasta_dict = collections.defaultdict(str)
301
348
  gff_features = []
@@ -331,7 +378,7 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla
331
378
 
332
379
  for contig, fasta in fasta_dict.items():
333
380
  reverse_sequence = reverse_complement(fasta[0])
334
- fasta_dict[contig][1]=reverse_sequence
381
+ fasta_dict[contig][1] = reverse_sequence
335
382
 
336
383
  if fasta_dict and gff_features:
337
384
  for contig, start, end, strand, feature, seq_id in gff_features:
@@ -345,23 +392,43 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla
345
392
  full_sequence = fasta_dict[contig][1]
346
393
  seq = full_sequence[corrected_start:corrected_stop]
347
394
 
348
- if translate == True:
349
- seq_aa = translate_frame(seq)
350
- wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
351
- combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
352
- wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
353
- combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
354
-
355
- if translate == False:
395
+ if run_as_combiner == True:
396
+ if translate == True:
397
+ seq_aa = translate_frame(seq)
398
+ wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
399
+ combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
400
+ else:
401
+ wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
402
+ combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
403
+ else:
404
+ if translate == True:
405
+ seq_aa = translate_frame(seq)
406
+ wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
407
+ combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
408
+ wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
409
+ combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
410
+
411
+ if translate == False or translate == None:
356
412
  #Clean up unused file
357
- os.remove(combined_out_file_aa.name)
413
+ try: # Catches is combined_out_file_aa is None
414
+ if combined_out_file.name != combined_out_file_aa.name:
415
+ os.remove(combined_out_file_aa.name)
416
+ except AttributeError:
417
+ pass
358
418
 
359
419
 
360
420
 
361
- def read_fasta_files(input_dir, name_split, combined_out, translate):
362
- with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
363
- for fasta_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
364
- genome_name = os.path.basename(fasta_file).split(name_split)[0]
421
+ def read_fasta_files(input_dir, name_split_fasta, combined_out, translate, run_as_combiner):
422
+ if run_as_combiner == True:
423
+ combined_out_file_aa = None
424
+ else:
425
+ combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
426
+ with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
427
+ fasta_files = glob.glob(os.path.join(input_dir, '*' + name_split_fasta))
428
+ if not fasta_files:
429
+ sys.exit("Error: No GFF files found.")
430
+ for fasta_file in fasta_files:
431
+ genome_name = os.path.basename(fasta_file).split(name_split_fasta)[0]
365
432
  fasta_dict = collections.defaultdict(str)
366
433
  with open(fasta_file, 'r') as file:
367
434
  lines = file.readlines()
@@ -372,16 +439,30 @@ def read_fasta_files(input_dir, name_split, combined_out, translate):
372
439
  else:
373
440
  fasta_dict[current_seq] +=line.strip()
374
441
  for seq_id, seq in fasta_dict.items():
375
- if translate == True:
376
- seq_aa = translate_frame(seq)
377
- wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
378
- combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
379
- wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
380
- combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
381
-
382
- if translate == False:
442
+ if run_as_combiner == True:
443
+ if translate == True:
444
+ seq_aa = translate_frame(seq)
445
+ wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
446
+ combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
447
+ else:
448
+ wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
449
+ combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
450
+ else:
451
+ if translate == True:
452
+ seq_aa = translate_frame(seq)
453
+ wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
454
+ combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
455
+ wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
456
+ combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
457
+
458
+ if translate == False or translate == None:
383
459
  #Clean up unused file
384
- os.remove(combined_out_file_aa)
460
+ try: # Catches is combined_out_file_aa is None
461
+ if combined_out_file.name != combined_out_file_aa.name:
462
+ os.remove(combined_out_file_aa.name)
463
+ except AttributeError:
464
+ pass
465
+
385
466
 
386
467
  def write_groups_func(options, output_dir, key_order, cores, sequences,
387
468
  pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
@@ -401,63 +482,65 @@ def write_groups_func(options, output_dir, key_order, cores, sequences,
401
482
  if not os.path.exists(output_dir):
402
483
  os.makedirs(output_dir)
403
484
 
404
- combined_fasta_filename = os.path.join(output_dir, "combined_group_sequences_dna.fasta")
405
-
406
- # Open combined FASTA file for writing all sequences
407
- with open(combined_fasta_filename, 'w') as combined_fasta, open(combined_fasta_filename.replace('_dna.fasta','_aa.fasta'), 'w') as combined_fasta_aa:
408
- for key_prefix in key_order:
409
- for key, values in cores.items():
410
- if any(part in options.write_groups.split(',') for part in key.split('_')):
411
- if key.startswith(key_prefix):
412
- for value in values:
413
- output_filename = f"{key}_{value}_dna.fasta"
414
- if 'First' in key_prefix:
415
- sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
416
- else:
417
- sequences_to_write = combined_pangenome_clusters_Second_sequences[value]
418
-
419
- # Write individual FASTA file
420
- with open(os.path.join(output_dir,output_filename), 'w') as outfile, open(os.path.join(output_dir, output_filename.replace('_dna.fasta','_aa.fasta')), 'w') as outfile_aa:
421
- for header in sequences_to_write:
422
- if header in sequences:
423
- sequence = sequences[header]
424
- wrapped_sequence = wrap_sequence(sequence)
425
- # Handle Amino Acid Sequences (AA)
426
- if options.sequence_type == 'AA':
427
- seq_aa = translate_frame(sequence)
428
- wrapped_sequence_aa = wrap_sequence(seq_aa)
429
- # Write individual group file for AA, if option is enabled
485
+ for group in options.write_groups.split(','):
486
+
487
+ combined_fasta_filename = os.path.join(output_dir, "combined_group_sequences_" + group + "_dna.fasta")
488
+
489
+ # Open combined FASTA file for writing all sequences
490
+ with open(combined_fasta_filename, 'w') as combined_fasta, open(combined_fasta_filename.replace('_dna.fasta','_aa.fasta'), 'w') as combined_fasta_aa:
491
+ for key_prefix in key_order:
492
+ for key, values in cores.items():
493
+ if any(part in group for part in key.split('_')):
494
+ if key.startswith(key_prefix):
495
+ for value in values:
496
+ output_filename = f"{key}_{value}_dna.fasta"
497
+ if 'First' in key_prefix:
498
+ sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
499
+ else:
500
+ sequences_to_write = combined_pangenome_clusters_Second_sequences[value]
501
+
502
+ # Write individual FASTA file
503
+ with open(os.path.join(output_dir,output_filename), 'w') as outfile, open(os.path.join(output_dir, output_filename.replace('_dna.fasta','_aa.fasta')), 'w') as outfile_aa:
504
+ for header in sequences_to_write:
505
+ if header in sequences:
506
+ sequence = sequences[header]
507
+ wrapped_sequence = wrap_sequence(sequence)
508
+ # Handle Amino Acid Sequences (AA)
509
+ if options.sequence_type == 'AA':
510
+ seq_aa = translate_frame(sequence)
511
+ wrapped_sequence_aa = wrap_sequence(seq_aa)
512
+ # Write individual group file for AA, if option is enabled
513
+ if options.write_individual_groups:
514
+ outfile_aa.write(f">{header}\n")
515
+ outfile_aa.write(f"{wrapped_sequence_aa}\n")
516
+ else:
517
+ os.remove(outfile_aa.name) # Delete individual file if option is disabled
518
+ # Always write to the combined AA file
519
+ combined_fasta_aa.write(f">Group_{value}|{header}\n")
520
+ combined_fasta_aa.write(f"{wrapped_sequence_aa}\n")
521
+ # Handle Nucleotide Sequences
522
+ else:
523
+ # If the option is disabled, delete individual AA file (if created)
524
+ try:
525
+ os.remove(outfile_aa.name) # Ensure outfile_aa is removed when sequence_type isn't 'AA'
526
+ except FileNotFoundError:
527
+ pass
528
+ # Write individual group file for nucleotide sequence, if option is enabled
430
529
  if options.write_individual_groups:
431
- outfile_aa.write(f">{header}\n")
432
- outfile_aa.write(f"{wrapped_sequence_aa}\n")
530
+ outfile.write(f">{header}\n")
531
+ outfile.write(f"{wrapped_sequence}\n")
433
532
  else:
434
- os.remove(outfile_aa.name) # Delete individual file if option is disabled
435
- # Always write to the combined AA file
436
- combined_fasta_aa.write(f">Group_{value}|{header}\n")
437
- combined_fasta_aa.write(f"{wrapped_sequence_aa}\n")
438
- # Handle Nucleotide Sequences
439
- else:
440
- # If the option is disabled, delete individual AA file (if created)
441
- try:
442
- os.remove(outfile_aa.name) # Ensure outfile_aa is removed when sequence_type isn't 'AA'
443
- except FileNotFoundError:
444
- pass
445
- # Write individual group file for nucleotide sequence, if option is enabled
446
- if options.write_individual_groups:
447
- outfile.write(f">{header}\n")
448
- outfile.write(f"{wrapped_sequence}\n")
533
+ os.remove(outfile.name) # Delete individual file if option is disabled
534
+ # Always write to the combined nucleotide file
535
+ combined_fasta.write(f">Group_{value}|{header}\n")
536
+ combined_fasta.write(f"{wrapped_sequence}\n")
537
+
449
538
  else:
450
- os.remove(outfile.name) # Delete individual file if option is disabled
451
- # Always write to the combined nucleotide file
452
- combined_fasta.write(f">Group_{value}|{header}\n")
453
- combined_fasta.write(f"{wrapped_sequence}\n")
454
-
455
- else:
456
- if options.verbose == True:
457
- print(f"Sequence {header} not found in original_fasta file.")
458
- if options.sequence_type != 'AA':
459
- #Clean up unused file
460
- os.remove(combined_fasta_aa.name)
539
+ if options.verbose == True:
540
+ print(f"Sequence {header} not found in original_fasta file.")
541
+ if options.sequence_type != 'AA':
542
+ #Clean up unused file
543
+ os.remove(combined_fasta_aa.name)
461
544
  print(f"Combined FASTA file saved to: {combined_fasta_filename}")
462
545
 
463
546
 
@@ -502,7 +585,8 @@ def write_groups_func(options, output_dir, key_order, cores, sequences,
502
585
  def perform_alignment(gene_path,group_directory, gene_file, options, concatenated_sequences, subgrouped):
503
586
  # Read sequences from the gene family file
504
587
  sequences = read_fasta(gene_path)
505
-
588
+ if len(sequences) == 1: # We can't align a single sequence
589
+ return concatenated_sequences
506
590
  # Select the longest sequence for each genome
507
591
  longest_sequences = select_longest_gene(sequences, subgrouped)
508
592
 
@@ -539,23 +623,22 @@ def process_gene_groups(options, group_directory, sub_group_directory, paralog_g
539
623
  else:
540
624
  affix = '_dna.fasta'
541
625
 
542
- # Iterate over each gene family file
543
- for gene_file in os.listdir(group_directory):
544
- if gene_file.endswith(affix) and not gene_file.startswith('combined_group_sequences'):
545
- #print(gene_file)
546
- current_group = int(gene_file.split('_')[3].split('.')[0])
547
- gene_path = os.path.join(group_directory, gene_file)
548
-
549
- # Check for matching group in paralog_groups
550
- if sub_group_directory and paralog_groups and '>Group_'+str(current_group) in paralog_groups:
551
- for subgroup, size in enumerate(paralog_groups['>Group_' + str(current_group)]['sizes']):
552
- if size >= threshold_size:
553
- gene_path = os.path.join(sub_group_directory,f"Group_{current_group}_subgroup_{subgroup}{affix}")
554
- concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, True)
555
-
556
- else:
557
- concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, False)
558
-
626
+ if options.align_core == True:
627
+ # Iterate over each gene family file
628
+ for gene_file in os.listdir(group_directory):
629
+ if gene_file.endswith(affix) and not gene_file.startswith('combined_group_sequences'):
630
+ current_group = int(gene_file.split('_')[3].split('.')[0])
631
+ gene_path = os.path.join(group_directory, gene_file)
632
+ # Could add more catches here to work with First and Secondary groups - This ensures only core '99/100' are aligned
633
+ if 'First_core_99' in gene_file or 'First_core_100' in gene_file:
634
+ # Check for matching group in paralog_groups
635
+ if sub_group_directory and paralog_groups and '>Group_'+str(current_group) in paralog_groups:
636
+ for subgroup, size in enumerate(paralog_groups['>Group_' + str(current_group)]['sizes']):
637
+ if size >= threshold_size:
638
+ gene_path = os.path.join(sub_group_directory,f"Group_{current_group}_subgroup_{subgroup}{affix}")
639
+ concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, True)
640
+ else:
641
+ concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, False)
559
642
 
560
643
  # Write the concatenated sequences to the output file
561
644
  with open(output_file, 'w') as out: