PyamilySeq 1.0.1-py3-none-any.whl → 1.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Cluster_Compare.py +108 -0
- PyamilySeq/Cluster_Summary.py +59 -64
- PyamilySeq/Group_Extractor.py +83 -0
- PyamilySeq/Group_Sizes.py +87 -0
- PyamilySeq/PyamilySeq.py +26 -18
- PyamilySeq/PyamilySeq_Genus.py +3 -3
- PyamilySeq/PyamilySeq_Species.py +10 -8
- PyamilySeq/Seq_Combiner.py +25 -8
- PyamilySeq/clusterings.py +0 -2
- PyamilySeq/constants.py +1 -1
- PyamilySeq/utils.py +197 -114
- {PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/METADATA +46 -85
- pyamilyseq-1.1.1.dist-info/RECORD +21 -0
- {PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/WHEEL +1 -1
- {PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/entry_points.txt +6 -0
- PyamilySeq-1.0.1.dist-info/RECORD +0 -18
- {PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/LICENSE +0 -0
- {PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/top_level.txt +0 -0
PyamilySeq/Seq_Combiner.py
CHANGED

@@ -22,14 +22,17 @@ def main():
                               ' "combined" for GFF files with embedded FASTA sequences and "fasta" for combining multiple '
                               'FASTA files together.',
                           required=True)
-    required.add_argument("-
-                          help="
-                          required=
+    required.add_argument("-name_split_gff", action="store", dest="name_split_gff",
+                          help="Substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff'). - Not needed with -input_type fasta",
+                          required=False)
+    required.add_argument("-name_split_fasta", action="store", dest="name_split_fasta",
+                          help="Substring used to split filenames and extract genome names for fasta files if named differently to paired gff files (e.g., '_dna.fasta').",
+                          required=False)
     required.add_argument("-output_dir", action="store", dest="output_dir",
                           help="Directory for all output files.",
                           required=True)
     required.add_argument("-output_name", action="store", dest="output_file",
-                          help="Output file name
+                          help="Output file name.",
                           required=True)

     optional = parser.add_argument_group('Optional Arguments')

@@ -48,19 +51,33 @@ def main():
     options = parser.parse_args()


+    if options.input_type == 'separate' and options.name_split_gff is None:
+        print("Please provide a substring to split the filename and extract the genome name.")
+        exit(1)
+    if options.input_type == 'combined' and options.name_split_gff is None:
+        print("Please provide a substring to split the filename and extract the genome name.")
+        exit(1)
+    if options.input_type == 'fasta' and options.name_split_fasta is None:
+        print("Please provide a substring to split the filename and extract the genome name.")
+        exit

     output_path = os.path.abspath(options.output_dir)
     if not os.path.exists(output_path):
         os.makedirs(output_path)

-
+    #output_file = options.output_file + '.fasta'
+    if os.path.exists(os.path.join(output_path, options.output_file)):
+        print(f"Output file {options.output_file} already exists in the output directory. Please delete or rename the file and try again.")
+        exit(1)
+
+    combined_out_file = os.path.join(output_path, options.output_file )

     if options.input_type == 'separate':
-        read_separate_files(options.input_dir, options.
+        read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, options.translate, True)
     elif options.input_type == 'combined':
-        read_combined_files(options.input_dir, options.
+        read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, options.translate, True)
     elif options.input_type == 'fasta':
-        read_fasta_files(options.input_dir, options.
+        read_fasta_files(options.input_dir, options.name_split_fasta, combined_out_file, options.translate)

 if __name__ == "__main__":
     main()
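For orientation, the sketch below shows how the two new name-split options interact with the input-type check introduced in this release. It is an illustrative reconstruction, not the released code: the parser setup is simplified, and the choices list and error wording are assumptions.

```python
import argparse
import sys

# Illustrative sketch only: mirrors the -name_split_gff / -name_split_fasta validation
# added in Seq_Combiner.py above; the choices list and messages are assumptions.
parser = argparse.ArgumentParser()
parser.add_argument("-input_type", dest="input_type", required=True,
                    choices=["separate", "combined", "fasta"])
parser.add_argument("-name_split_gff", dest="name_split_gff", required=False)
parser.add_argument("-name_split_fasta", dest="name_split_fasta", required=False)
options = parser.parse_args()

# GFF-based inputs need the GFF splitter; pure FASTA input needs the FASTA splitter.
if options.input_type in ("separate", "combined") and options.name_split_gff is None:
    sys.exit("Please provide -name_split_gff so genome names can be cut from GFF filenames.")
if options.input_type == "fasta" and options.name_split_fasta is None:
    sys.exit("Please provide -name_split_fasta so genome names can be cut from FASTA filenames.")
```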
PyamilySeq/clusterings.py
CHANGED

@@ -279,8 +279,6 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
     first = True
     for line in Second_in:
         if line.startswith('>'):
-            if '>Cluster 1997' in line:
-                print()
             if first == False:
                 cluster_size = len(Combined_clusters[cluster_id])
                 Combined_reps.update({rep: cluster_size})
PyamilySeq/constants.py
CHANGED

@@ -1,2 +1,2 @@
-PyamilySeq_Version = 'v1.0.1'
+PyamilySeq_Version = 'v1.1.1'
 
PyamilySeq/utils.py
CHANGED

@@ -228,15 +228,39 @@ def run_mafft_on_sequences(options, sequences, output_file):



-def read_separate_files(input_dir,
-
-
-
-
-
-
+def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident, combined_out, translate, run_as_combiner):
+    if run_as_combiner == True:
+        combined_out_file_aa = None
+    else:
+        combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
+
+    with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
+        paired_files_found = None
+        #with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
+        gff_files = glob.glob(os.path.join(input_dir, '*' + name_split_gff))
+        if not gff_files:
+            sys.exit("Error: No GFF files found.")
+        for gff_file in gff_files:
+            genome_name = os.path.basename(gff_file).split(name_split_gff)[0]
+            if name_split_fasta == None:
+                possible_extensions = ['.fa', '.fasta', '.fna']
+                corresponding_fasta_file = None
+                for ext in possible_extensions:
+                    temp_file = os.path.splitext(gff_file)[0] + ext
+                    if os.path.exists(temp_file):
+                        corresponding_fasta_file = temp_file
+                        break
+                if corresponding_fasta_file is None:
+                    print("Corresponding FASTA file for GFF file '" + gff_file + "' not found. Skipping. - Try using the -name_split_fasta option.")
+                    continue
+            else:
+                corresponding_fasta_file = os.path.join(input_dir, genome_name + name_split_fasta)
+                if not os.path.exists(corresponding_fasta_file):
+                    print("Corresponding FASTA file for GFF file '" + gff_file + "' not found. Skipping. - Try using the -name_split_fasta option.")
+                    continue

             gff_features = []
+            paired_files_found = True
             with open(gff_file, 'r') as file:
                 seen_seq_ids = collections.defaultdict(int)
                 lines = file.readlines()
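For readers skimming the hunk above: the reworked reader opens two output handles at once, and in combiner mode the amino-acid handle is pointed at os.devnull so later writes become no-ops. The snippet below isolates that pattern; the file names are hypothetical and not taken from the package.

```python
import os

# Minimal illustration of the dual-handle pattern above: the DNA output is always a real
# file, while the amino-acid output is either a real file or os.devnull, so downstream
# write() calls need no branching. Paths here are hypothetical.
def open_outputs(dna_path, aa_path=None):
    dna_out = open(dna_path, "w")
    aa_out = open(aa_path, "w") if aa_path else open(os.devnull, "w")
    return dna_out, aa_out

dna_out, aa_out = open_outputs("combined_dna.fasta")  # combiner mode: no AA file requested
aa_out.write(">seq1\nMKT\n")                          # silently discarded via os.devnull
dna_out.write(">seq1\nATGAAAACC\n")
dna_out.close()
aa_out.close()
```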
@@ -244,6 +268,7 @@ def read_separate_files(input_dir, name_split, gene_ident, combined_out, transla
                     line_data = line.split('\t')
                     if len(line_data) == 9:
                         if any(gene_type in line_data[2] for gene_type in gene_ident):
+                            seq_id = line_data[8].split('ID=')[1].split(';')[0]
                             contig = line_data[0]
                             feature = line_data[2]
                             strand = line_data[6]

@@ -253,7 +278,6 @@ def read_separate_files(input_dir, name_split, gene_ident, combined_out, transla
                                 seen_seq_ids[seq_id] + 1
                             else:
                                 seen_seq_ids[seq_id] = 1
-                            seq_id = line_data[8].split('ID=')[1].split(';')[0]
                             gff_features.append((contig, start, end, strand, feature, seq_id))
         fasta_dict = collections.defaultdict(str)
         with open(corresponding_fasta_file, 'r') as file:
@@ -281,21 +305,44 @@ def read_separate_files(input_dir, name_split, gene_ident, combined_out, transla
                 full_sequence = fasta_dict[contig][1]
                 seq = full_sequence[corrected_start:corrected_stop]

-                if
-
-
-
-
-
+                if run_as_combiner == True:
+                    if translate == True:
+                        seq_aa = translate_frame(seq)
+                        wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                        combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                    else:
+                        wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                        combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+                else:
+                    if translate == True:
+                        seq_aa = translate_frame(seq)
+                        wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                        combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                    wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                    combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")

-        if
+        if not paired_files_found:
+            sys.exit("Could not find matching GFF/FASTA files - Please check input directory and -name_split_gff and -name_split_fasta parameters.")
+        if translate == False or translate == None:
             #Clean up unused file
-
+            try: # Catches is combined_out_file_aa is None
+                if combined_out_file.name != combined_out_file_aa.name:
+                    os.remove(combined_out_file_aa.name)
+            except AttributeError:
+                pass


-def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate):
-
-
+def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate, run_as_combiner):
+    if run_as_combiner == True:
+        combined_out_file_aa = None
+    else:
+        combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
+    #with open(combined_out, 'w') as combined_out_file, open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w'):
+    with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
+        gff_files = glob.glob(os.path.join(input_dir, '*' + name_split))
+        if not gff_files:
+            sys.exit("Error: No GFF files found - check input directory and -name_split_gff parameter.")
+        for gff_file in gff_files:
             genome_name = os.path.basename(gff_file).split(name_split)[0]
             fasta_dict = collections.defaultdict(str)
             gff_features = []
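The write branches in these hunks all lean on the same 60-column FASTA wrapping expression; the snippet below isolates it for clarity. The header and sequence are made-up example data.

```python
# Isolated form of the wrapping expression used in the hunks above: break a sequence into
# 60-character lines before writing it under a ">genome|seq_id" header. Example data only.
def wrap_60(seq):
    return '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])

record = ">Genome_A|gene_0001\n" + wrap_60("ATGAAACCC" * 20) + "\n"
print(record)  # 180 bases emitted as three 60-character lines
```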
@@ -331,7 +378,7 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla

         for contig, fasta in fasta_dict.items():
             reverse_sequence = reverse_complement(fasta[0])
-            fasta_dict[contig][1]=reverse_sequence
+            fasta_dict[contig][1] = reverse_sequence

         if fasta_dict and gff_features:
             for contig, start, end, strand, feature, seq_id in gff_features:
@@ -345,23 +392,43 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla
                 full_sequence = fasta_dict[contig][1]
                 seq = full_sequence[corrected_start:corrected_stop]

-                if
-
-
-
-
-
-
-
+                if run_as_combiner == True:
+                    if translate == True:
+                        seq_aa = translate_frame(seq)
+                        wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                        combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                    else:
+                        wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                        combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+                else:
+                    if translate == True:
+                        seq_aa = translate_frame(seq)
+                        wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                        combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                    wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                    combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+
+        if translate == False or translate == None:
             #Clean up unused file
-
+            try: # Catches is combined_out_file_aa is None
+                if combined_out_file.name != combined_out_file_aa.name:
+                    os.remove(combined_out_file_aa.name)
+            except AttributeError:
+                pass



-def read_fasta_files(input_dir,
-
-
-
+def read_fasta_files(input_dir, name_split_fasta, combined_out, translate, run_as_combiner):
+    if run_as_combiner == True:
+        combined_out_file_aa = None
+    else:
+        combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
+    with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
+        fasta_files = glob.glob(os.path.join(input_dir, '*' + name_split_fasta))
+        if not fasta_files:
+            sys.exit("Error: No GFF files found.")
+        for fasta_file in fasta_files:
+            genome_name = os.path.basename(fasta_file).split(name_split_fasta)[0]
             fasta_dict = collections.defaultdict(str)
             with open(fasta_file, 'r') as file:
                 lines = file.readlines()
@@ -372,16 +439,30 @@ def read_fasta_files(input_dir, name_split, combined_out, translate):
                     else:
                         fasta_dict[current_seq] +=line.strip()
             for seq_id, seq in fasta_dict.items():
-                if
-
-
-
-
-
-
-
+                if run_as_combiner == True:
+                    if translate == True:
+                        seq_aa = translate_frame(seq)
+                        wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                        combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                    else:
+                        wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                        combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+                else:
+                    if translate == True:
+                        seq_aa = translate_frame(seq)
+                        wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                        combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                    wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                    combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+
+        if translate == False or translate == None:
             #Clean up unused file
-
+            try: # Catches is combined_out_file_aa is None
+                if combined_out_file.name != combined_out_file_aa.name:
+                    os.remove(combined_out_file_aa.name)
+            except AttributeError:
+                pass
+

 def write_groups_func(options, output_dir, key_order, cores, sequences,
                       pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
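Across all three readers above, file discovery and genome naming hinge on the name-split substrings. The short sketch below shows that mechanism in isolation; the directory, substring, and filenames are hypothetical.

```python
import glob
import os

# Isolated sketch of the discovery/naming mechanism used by the readers above:
# glob on '*' + name_split, then cut the genome name off the basename.
input_dir = "genomes"
name_split_fasta = "_dna.fasta"

for fasta_file in glob.glob(os.path.join(input_dir, '*' + name_split_fasta)):
    genome_name = os.path.basename(fasta_file).split(name_split_fasta)[0]
    print(genome_name)  # e.g. "genomes/Strain_1_dna.fasta" -> "Strain_1"
```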
@@ -401,63 +482,65 @@ def write_groups_func(options, output_dir, key_order, cores, sequences,
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)

-
-
-
-
-    for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    for group in options.write_groups.split(','):
+
+        combined_fasta_filename = os.path.join(output_dir, "combined_group_sequences_" + group + "_dna.fasta")
+
+        # Open combined FASTA file for writing all sequences
+        with open(combined_fasta_filename, 'w') as combined_fasta, open(combined_fasta_filename.replace('_dna.fasta','_aa.fasta'), 'w') as combined_fasta_aa:
+            for key_prefix in key_order:
+                for key, values in cores.items():
+                    if any(part in group for part in key.split('_')):
+                        if key.startswith(key_prefix):
+                            for value in values:
+                                output_filename = f"{key}_{value}_dna.fasta"
+                                if 'First' in key_prefix:
+                                    sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
+                                else:
+                                    sequences_to_write = combined_pangenome_clusters_Second_sequences[value]
+
+                                # Write individual FASTA file
+                                with open(os.path.join(output_dir,output_filename), 'w') as outfile, open(os.path.join(output_dir, output_filename.replace('_dna.fasta','_aa.fasta')), 'w') as outfile_aa:
+                                    for header in sequences_to_write:
+                                        if header in sequences:
+                                            sequence = sequences[header]
+                                            wrapped_sequence = wrap_sequence(sequence)
+                                            # Handle Amino Acid Sequences (AA)
+                                            if options.sequence_type == 'AA':
+                                                seq_aa = translate_frame(sequence)
+                                                wrapped_sequence_aa = wrap_sequence(seq_aa)
+                                                # Write individual group file for AA, if option is enabled
+                                                if options.write_individual_groups:
+                                                    outfile_aa.write(f">{header}\n")
+                                                    outfile_aa.write(f"{wrapped_sequence_aa}\n")
+                                                else:
+                                                    os.remove(outfile_aa.name) # Delete individual file if option is disabled
+                                                # Always write to the combined AA file
+                                                combined_fasta_aa.write(f">Group_{value}|{header}\n")
+                                                combined_fasta_aa.write(f"{wrapped_sequence_aa}\n")
+                                            # Handle Nucleotide Sequences
+                                            else:
+                                                # If the option is disabled, delete individual AA file (if created)
+                                                try:
+                                                    os.remove(outfile_aa.name) # Ensure outfile_aa is removed when sequence_type isn't 'AA'
+                                                except FileNotFoundError:
+                                                    pass
+                                                # Write individual group file for nucleotide sequence, if option is enabled
                                                 if options.write_individual_groups:
-
-
+                                                    outfile.write(f">{header}\n")
+                                                    outfile.write(f"{wrapped_sequence}\n")
                                                 else:
-                                                    os.remove(
-                                                    # Always write to the combined
-
-
-
-                                                else:
-                                                    # If the option is disabled, delete individual AA file (if created)
-                                                    try:
-                                                        os.remove(outfile_aa.name) # Ensure outfile_aa is removed when sequence_type isn't 'AA'
-                                                    except FileNotFoundError:
-                                                        pass
-                                                    # Write individual group file for nucleotide sequence, if option is enabled
-                                                    if options.write_individual_groups:
-                                                        outfile.write(f">{header}\n")
-                                                        outfile.write(f"{wrapped_sequence}\n")
+                                                    os.remove(outfile.name) # Delete individual file if option is disabled
+                                                # Always write to the combined nucleotide file
+                                                combined_fasta.write(f">Group_{value}|{header}\n")
+                                                combined_fasta.write(f"{wrapped_sequence}\n")
+
                                         else:
-
-
-
-
-
-                                            else:
-                                                if options.verbose == True:
-                                                    print(f"Sequence {header} not found in original_fasta file.")
-                                            if options.sequence_type != 'AA':
-                                                #Clean up unused file
-                                                os.remove(combined_fasta_aa.name)
+                                            if options.verbose == True:
+                                                print(f"Sequence {header} not found in original_fasta file.")
+            if options.sequence_type != 'AA':
+                #Clean up unused file
+                os.remove(combined_fasta_aa.name)
         print(f"Combined FASTA file saved to: {combined_fasta_filename}")

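Two small conventions carry a lot of weight in the rewritten write_groups_func above and in process_gene_groups below: a cluster key such as 'First_core_99' is selected for a requested group if any underscore-separated part of the key occurs in the group string, and each per-group file is named "{key}_{value}_dna.fasta", which is why the group number is later recovered from the fourth underscore field. The sketch below walks through both; the key, group, and filename values are assumptions drawn only from identifiers visible in this diff.

```python
# Sketch of the two naming conventions used in write_groups_func and process_gene_groups.
# All example values are assumptions based on identifiers visible in this diff.

def key_matches_group(key, group):
    # A key is written for a requested group if any of its '_'-separated parts
    # occurs in the group string (same test as in the hunk above).
    return any(part in group for part in key.split('_'))

print(key_matches_group("First_core_99", "core_99"))   # True  ("core" and "99" both occur)
print(key_matches_group("Second_soft_95", "core_99"))  # False (no part occurs)

# Per-group files are named f"{key}_{value}_dna.fasta", so for "First_core_99_142_dna.fasta"
# the group number sits in the fourth underscore-separated field:
gene_file = "First_core_99_142_dna.fasta"
current_group = int(gene_file.split('_')[3].split('.')[0])
print(current_group)  # 142
```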
@@ -502,7 +585,8 @@ def write_groups_func(options, output_dir, key_order, cores, sequences,
 def perform_alignment(gene_path,group_directory, gene_file, options, concatenated_sequences, subgrouped):
     # Read sequences from the gene family file
     sequences = read_fasta(gene_path)
-
+    if len(sequences) == 1: # We can't align a single sequence
+        return concatenated_sequences
     # Select the longest sequence for each genome
     longest_sequences = select_longest_gene(sequences, subgrouped)

@@ -539,23 +623,22 @@ def process_gene_groups(options, group_directory, sub_group_directory, paralog_g
     else:
         affix = '_dna.fasta'

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if options.align_core == True:
+        # Iterate over each gene family file
+        for gene_file in os.listdir(group_directory):
+            if gene_file.endswith(affix) and not gene_file.startswith('combined_group_sequences'):
+                current_group = int(gene_file.split('_')[3].split('.')[0])
+                gene_path = os.path.join(group_directory, gene_file)
+                # Could add more catches here to work with First and Secondary groups - This ensures only core '99/100' are aligned
+                if 'First_core_99' in gene_file or 'First_core_100' in gene_file:
+                    # Check for matching group in paralog_groups
+                    if sub_group_directory and paralog_groups and '>Group_'+str(current_group) in paralog_groups:
+                        for subgroup, size in enumerate(paralog_groups['>Group_' + str(current_group)]['sizes']):
+                            if size >= threshold_size:
+                                gene_path = os.path.join(sub_group_directory,f"Group_{current_group}_subgroup_{subgroup}{affix}")
+                                concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, True)
+                    else:
+                        concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, False)

     # Write the concatenated sequences to the output file
     with open(output_file, 'w') as out: