PyamilySeq 0.9.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Cluster_Summary.py +2 -2
- PyamilySeq/Group_Splitter.py +247 -58
- PyamilySeq/PyamilySeq.py +168 -148
- PyamilySeq/PyamilySeq_Genus.py +11 -11
- PyamilySeq/PyamilySeq_Species.py +51 -29
- PyamilySeq/Seq_Combiner.py +6 -7
- PyamilySeq/Seq_Extractor.py +64 -0
- PyamilySeq/Seq_Finder.py +56 -0
- PyamilySeq/clusterings.py +139 -49
- PyamilySeq/constants.py +2 -0
- PyamilySeq/utils.py +214 -56
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.1.dist-info}/METADATA +174 -138
- PyamilySeq-1.0.1.dist-info/RECORD +18 -0
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.1.dist-info}/WHEEL +1 -1
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.1.dist-info}/entry_points.txt +2 -0
- PyamilySeq/Constants.py +0 -2
- PyamilySeq-0.9.0.dist-info/RECORD +0 -16
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.1.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.1.dist-info}/top_level.txt +0 -0
PyamilySeq/utils.py
CHANGED
|
@@ -5,8 +5,8 @@ import glob
|
|
|
5
5
|
import collections
|
|
6
6
|
from tempfile import NamedTemporaryFile
|
|
7
7
|
import sys
|
|
8
|
-
from line_profiler_pycharm import profile
|
|
9
8
|
import re
|
|
9
|
+
import math
|
|
10
10
|
|
|
11
11
|
####
|
|
12
12
|
# Placeholder for the distance function
|
|
@@ -44,7 +44,7 @@ except (ModuleNotFoundError, ImportError):
|
|
|
44
44
|
#####
|
|
45
45
|
|
|
46
46
|
################### We are currently fixed using Table 11
|
|
47
|
-
|
|
47
|
+
codon_table = {
|
|
48
48
|
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
|
|
49
49
|
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
|
|
50
50
|
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
|
|
@@ -63,10 +63,44 @@ gencode = {
|
|
|
63
63
|
'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W'}
|
|
64
64
|
|
|
65
65
|
def translate_frame(sequence):
    """Translate a nucleotide string into amino acids using reading frame 1.

    The sequence is read as consecutive, non-overlapping triplets starting at
    the first base. Each triplet is looked up in the module-level
    ``codon_table``; a triplet with no entry contributes 'X'. One or two
    trailing bases that do not make up a full codon are ignored.

    Returns the translated residues joined into a single string.
    """
    codon_count = len(sequence) // 3
    residues = []
    for index in range(codon_count):
        offset = 3 * index
        residues.append(codon_table.get(sequence[offset:offset + 3], 'X'))
    return ''.join(residues)
|
|
68
68
|
|
|
69
|
+
def translate_dna_to_aa(dna_fasta, aa_fasta):
    """Write an amino-acid FASTA translation of every record in a DNA FASTA file.

    ``dna_fasta`` is read line by line. Lines starting with '>' are treated as
    headers and copied (stripped) to the output unchanged; all other lines are
    stripped and joined into the record's sequence. Each sequence is
    translated codon by codon with ``codon_table`` ('X' for codons that are not
    in the table), wrapped with ``wrap_sequence(..., 60)`` and written to
    ``aa_fasta``.

    A record whose sequence is empty is not written at all, header included.
    """
    def translate_dna_sequence(dna_seq):
        # Step through complete codons only; an incomplete trailing codon is dropped.
        residues = [codon_table.get(dna_seq[pos:pos + 3], 'X')
                    for pos in range(0, len(dna_seq) - 2, 3)]
        return "".join(residues)

    def flush_record(handle, record_header, pieces):
        # Emit the pending record, unless no sequence data was collected for it.
        record_seq = "".join(pieces)
        if not record_seq:
            return
        protein = translate_dna_sequence(record_seq)
        wrapped_protein = wrap_sequence(protein, 60)
        handle.write(f"{record_header}\n{wrapped_protein}\n")

    with open(dna_fasta, 'r') as infile, open(aa_fasta, 'w') as outfile:
        header = ""
        pieces = []
        for line in infile:
            if not line.startswith('>'):
                pieces.append(line.strip())
                continue
            # A new header closes the record collected so far.
            flush_record(outfile, header, pieces)
            header = line.strip()
            pieces = []
        # The final record has no following header to trigger its write.
        flush_record(outfile, header, pieces)
|
|
94
|
+
|
|
69
95
|
|
|
96
|
+
def detect_sequence_type(fasta_file):
    """Guess whether a FASTA file contains nucleotide or protein sequences.

    Every non-header line is scanned for the letters E, F, I, L, P, Q or Z.
    None of these is a nucleotide or IUPAC nucleotide ambiguity code, so the
    first occurrence marks the file as protein. The check is case-insensitive
    so that lower-case protein records are not mistaken for DNA.

    Parameters:
        fasta_file: path of the FASTA file to inspect.

    Returns:
        False as soon as a protein-only letter is found, otherwise True
        (which is also the result for a file with no sequence lines).
    """
    protein_only = frozenset('EFILPQZ')
    with open(fasta_file, 'r') as f:
        for line in f:
            # Header text is free-form and must not influence the decision.
            if line.startswith('>'):
                continue
            if not protein_only.isdisjoint(line.upper()):
                return False  # Contains amino acids
    return True  # Contains DNA
|
|
70
104
|
|
|
71
105
|
|
|
72
106
|
def is_tool_installed(tool_name):
|
|
@@ -113,6 +147,16 @@ def wrap_sequence(sequence, width=60):
|
|
|
113
147
|
return "\n".join(wrapped_sequence)
|
|
114
148
|
|
|
115
149
|
|
|
150
|
+
def read_genomes_from_fasta(fasta_file):
    """Collect the distinct genome names found in the headers of a FASTA file.

    For every header line (one starting with '>') the genome name is taken to
    be the second '|'-separated field. Names are returned once each, in the
    order they first appear, so the result is identical from run to run
    (a set would be iterated in an order that depends on string hash
    randomisation).

    Parameters:
        fasta_file: path of the FASTA file to scan.

    Returns:
        A list of unique genome names in first-seen order.

    Raises:
        IndexError: if a header line contains no '|' separator.
    """
    # A dict is used as an insertion-ordered set.
    genomes = {}
    with open(fasta_file, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith('>'):
                genome = line.split('|')[1]
                genomes.setdefault(genome, None)
    return list(genomes)
|
|
159
|
+
|
|
116
160
|
def read_fasta(fasta_file):
|
|
117
161
|
sequences = {}
|
|
118
162
|
current_sequence = None
|
|
@@ -138,11 +182,14 @@ def sort_keys_by_values(dict1, dict2):
|
|
|
138
182
|
sorted_keys = sorted(dict1.keys(), key=lambda k: custom_sort_key(k, dict1, dict2), reverse=True)
|
|
139
183
|
return sorted_keys
|
|
140
184
|
|
|
141
|
-
def select_longest_gene(sequences, subgrouped=False):
    """Select the longest sequence for each genome.

    Parameters:
        sequences: mapping of sequence ID -> sequence string. IDs are
            '|'-separated.
        subgrouped: when false (the default) the genome name is the first
            '|'-separated field of the ID; when true it is the second field.
            Any truthy/falsy value is accepted, so the genome name is always
            defined for every ID.

    Returns:
        A dict mapping genome name -> ``(seq_id, sequence)`` for the longest
        sequence of that genome. On equal length the first one seen is kept.

    Raises:
        IndexError: if ``subgrouped`` is true and an ID has no '|' separator.
    """
    genome_field = 1 if subgrouped else 0
    longest_sequences = {}
    for seq_id, sequence in sequences.items():
        genome = seq_id.split('|')[genome_field]
        current = longest_sequences.get(genome)
        # Strict comparison: a later sequence only wins when it is longer.
        if current is None or len(sequence) > len(current[1]):
            longest_sequences[genome] = (seq_id, sequence)
    return longest_sequences
|
|
@@ -182,7 +229,7 @@ def run_mafft_on_sequences(options, sequences, output_file):
|
|
|
182
229
|
|
|
183
230
|
|
|
184
231
|
def read_separate_files(input_dir, name_split, gene_ident, combined_out, translate):
|
|
185
|
-
with open(combined_out, 'w') as combined_out_file:
|
|
232
|
+
with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
|
|
186
233
|
for gff_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
|
|
187
234
|
genome_name = os.path.basename(gff_file).split(name_split)[0]
|
|
188
235
|
corresponding_fasta_file = os.path.splitext(gff_file)[0] + '.fa'
|
|
@@ -227,20 +274,27 @@ def read_separate_files(input_dir, name_split, gene_ident, combined_out, transla
|
|
|
227
274
|
if contig in fasta_dict:
|
|
228
275
|
if strand == '+':
|
|
229
276
|
full_sequence = fasta_dict[contig][0]
|
|
230
|
-
|
|
277
|
+
seq = full_sequence[start - 1:end]
|
|
231
278
|
elif strand == '-':
|
|
232
279
|
corrected_start = max(len(fasta_dict[contig][0]) - int(end), 1)
|
|
233
280
|
corrected_stop = max(len(fasta_dict[contig][0]) - int(start - 1), 1)
|
|
234
281
|
full_sequence = fasta_dict[contig][1]
|
|
235
|
-
|
|
282
|
+
seq = full_sequence[corrected_start:corrected_stop]
|
|
283
|
+
|
|
236
284
|
if translate == True:
|
|
237
|
-
|
|
238
|
-
|
|
285
|
+
seq_aa = translate_frame(seq)
|
|
286
|
+
wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
|
|
287
|
+
combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
|
|
288
|
+
wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
|
|
239
289
|
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
240
290
|
|
|
291
|
+
if translate == False:
|
|
292
|
+
#Clean up unused file
|
|
293
|
+
os.remove(combined_out_file_aa.name)
|
|
294
|
+
|
|
241
295
|
|
|
242
296
|
def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate):
|
|
243
|
-
with open(combined_out, 'w') as combined_out_file:
|
|
297
|
+
with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
|
|
244
298
|
for gff_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
|
|
245
299
|
genome_name = os.path.basename(gff_file).split(name_split)[0]
|
|
246
300
|
fasta_dict = collections.defaultdict(str)
|
|
@@ -284,21 +338,28 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla
|
|
|
284
338
|
if contig in fasta_dict:
|
|
285
339
|
if strand == '+':
|
|
286
340
|
full_sequence = fasta_dict[contig][0]
|
|
287
|
-
|
|
341
|
+
seq = full_sequence[start - 1:end]
|
|
288
342
|
elif strand == '-':
|
|
289
343
|
corrected_start = max(len(fasta_dict[contig][0]) - int(end), 1)
|
|
290
344
|
corrected_stop = max(len(fasta_dict[contig][0]) - int(start - 1), 1)
|
|
291
345
|
full_sequence = fasta_dict[contig][1]
|
|
292
|
-
|
|
346
|
+
seq = full_sequence[corrected_start:corrected_stop]
|
|
293
347
|
|
|
294
348
|
if translate == True:
|
|
295
|
-
|
|
296
|
-
|
|
349
|
+
seq_aa = translate_frame(seq)
|
|
350
|
+
wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
|
|
351
|
+
combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
|
|
352
|
+
wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
|
|
297
353
|
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
298
354
|
|
|
355
|
+
if translate == False:
|
|
356
|
+
#Clean up unused file
|
|
357
|
+
os.remove(combined_out_file_aa.name)
|
|
358
|
+
|
|
359
|
+
|
|
299
360
|
|
|
300
361
|
def read_fasta_files(input_dir, name_split, combined_out, translate):
|
|
301
|
-
with open(combined_out, 'w') as combined_out_file:
|
|
362
|
+
with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
|
|
302
363
|
for fasta_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
|
|
303
364
|
genome_name = os.path.basename(fasta_file).split(name_split)[0]
|
|
304
365
|
fasta_dict = collections.defaultdict(str)
|
|
@@ -310,14 +371,19 @@ def read_fasta_files(input_dir, name_split, combined_out, translate):
|
|
|
310
371
|
fasta_dict[current_seq] = ''
|
|
311
372
|
else:
|
|
312
373
|
fasta_dict[current_seq] +=line.strip()
|
|
313
|
-
for
|
|
374
|
+
for seq_id, seq in fasta_dict.items():
|
|
314
375
|
if translate == True:
|
|
315
|
-
|
|
376
|
+
seq_aa = translate_frame(seq)
|
|
377
|
+
wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
|
|
378
|
+
combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
|
|
316
379
|
wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
|
|
317
|
-
combined_out_file.write(f">{genome_name}|{
|
|
380
|
+
combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
318
381
|
|
|
382
|
+
if translate == False:
|
|
383
|
+
#Clean up unused file
|
|
384
|
+
os.remove(combined_out_file_aa)
|
|
319
385
|
|
|
320
|
-
def
|
|
386
|
+
def write_groups_func(options, output_dir, key_order, cores, sequences,
|
|
321
387
|
pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
|
|
322
388
|
"""
|
|
323
389
|
Writes individual FASTA files and a combined FASTA file for all sequences.
|
|
@@ -335,74 +401,166 @@ def write_groups(options, output_dir, key_order, cores, sequences,
|
|
|
335
401
|
if not os.path.exists(output_dir):
|
|
336
402
|
os.makedirs(output_dir)
|
|
337
403
|
|
|
338
|
-
combined_fasta_filename = os.path.join(output_dir, "
|
|
404
|
+
combined_fasta_filename = os.path.join(output_dir, "combined_group_sequences_dna.fasta")
|
|
339
405
|
|
|
340
406
|
# Open combined FASTA file for writing all sequences
|
|
341
|
-
with open(combined_fasta_filename, 'w') as combined_fasta:
|
|
407
|
+
with open(combined_fasta_filename, 'w') as combined_fasta, open(combined_fasta_filename.replace('_dna.fasta','_aa.fasta'), 'w') as combined_fasta_aa:
|
|
342
408
|
for key_prefix in key_order:
|
|
343
409
|
for key, values in cores.items():
|
|
344
410
|
if any(part in options.write_groups.split(',') for part in key.split('_')):
|
|
345
411
|
if key.startswith(key_prefix):
|
|
346
412
|
for value in values:
|
|
347
|
-
output_filename = f"{key}_{value}.fasta"
|
|
413
|
+
output_filename = f"{key}_{value}_dna.fasta"
|
|
348
414
|
if 'First' in key_prefix:
|
|
349
415
|
sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
|
|
350
416
|
else:
|
|
351
417
|
sequences_to_write = combined_pangenome_clusters_Second_sequences[value]
|
|
352
418
|
|
|
353
419
|
# Write individual FASTA file
|
|
354
|
-
with open(os.path.join(output_dir, output_filename), 'w') as
|
|
420
|
+
with open(os.path.join(output_dir,output_filename), 'w') as outfile, open(os.path.join(output_dir, output_filename.replace('_dna.fasta','_aa.fasta')), 'w') as outfile_aa:
|
|
355
421
|
for header in sequences_to_write:
|
|
356
422
|
if header in sequences:
|
|
357
423
|
sequence = sequences[header]
|
|
358
|
-
outfile.write(f">{header}\n")
|
|
359
424
|
wrapped_sequence = wrap_sequence(sequence)
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
425
|
+
# Handle Amino Acid Sequences (AA)
|
|
426
|
+
if options.sequence_type == 'AA':
|
|
427
|
+
seq_aa = translate_frame(sequence)
|
|
428
|
+
wrapped_sequence_aa = wrap_sequence(seq_aa)
|
|
429
|
+
# Write individual group file for AA, if option is enabled
|
|
430
|
+
if options.write_individual_groups:
|
|
431
|
+
outfile_aa.write(f">{header}\n")
|
|
432
|
+
outfile_aa.write(f"{wrapped_sequence_aa}\n")
|
|
433
|
+
else:
|
|
434
|
+
os.remove(outfile_aa.name) # Delete individual file if option is disabled
|
|
435
|
+
# Always write to the combined AA file
|
|
436
|
+
combined_fasta_aa.write(f">Group_{value}|{header}\n")
|
|
437
|
+
combined_fasta_aa.write(f"{wrapped_sequence_aa}\n")
|
|
438
|
+
# Handle Nucleotide Sequences
|
|
439
|
+
else:
|
|
440
|
+
# If the option is disabled, delete individual AA file (if created)
|
|
441
|
+
try:
|
|
442
|
+
os.remove(outfile_aa.name) # Ensure outfile_aa is removed when sequence_type isn't 'AA'
|
|
443
|
+
except FileNotFoundError:
|
|
444
|
+
pass
|
|
445
|
+
# Write individual group file for nucleotide sequence, if option is enabled
|
|
446
|
+
if options.write_individual_groups:
|
|
447
|
+
outfile.write(f">{header}\n")
|
|
448
|
+
outfile.write(f"{wrapped_sequence}\n")
|
|
449
|
+
else:
|
|
450
|
+
os.remove(outfile.name) # Delete individual file if option is disabled
|
|
451
|
+
# Always write to the combined nucleotide file
|
|
363
452
|
combined_fasta.write(f">Group_{value}|{header}\n")
|
|
364
453
|
combined_fasta.write(f"{wrapped_sequence}\n")
|
|
454
|
+
|
|
365
455
|
else:
|
|
366
|
-
if options.verbose:
|
|
456
|
+
if options.verbose == True:
|
|
367
457
|
print(f"Sequence {header} not found in original_fasta file.")
|
|
368
|
-
|
|
458
|
+
if options.sequence_type != 'AA':
|
|
459
|
+
#Clean up unused file
|
|
460
|
+
os.remove(combined_fasta_aa.name)
|
|
369
461
|
print(f"Combined FASTA file saved to: {combined_fasta_filename}")
|
|
370
462
|
|
|
371
463
|
|
|
372
|
-
def
|
|
464
|
+
# def process_gene_groups(options, group_directory, sub_group_directory, paralog_groups, output_file):
|
|
465
|
+
# """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
|
|
466
|
+
# concatenated_sequences = {}
|
|
467
|
+
# output_file = group_directory.replace('Gene_Groups_Output',output_file)
|
|
468
|
+
#
|
|
469
|
+
# # Iterate over each gene family file
|
|
470
|
+
# for gene_file in os.listdir(group_directory):
|
|
471
|
+
# if gene_file.endswith('.fasta') and not gene_file.endswith('combined_group_sequences.fasta') :
|
|
472
|
+
# gene_path = os.path.join(group_directory, gene_file)
|
|
473
|
+
#
|
|
474
|
+
# # Read sequences from the gene family file
|
|
475
|
+
# sequences = read_fasta(gene_path)
|
|
476
|
+
#
|
|
477
|
+
# # Select the longest sequence for each genome
|
|
478
|
+
# longest_sequences = select_longest_gene(sequences)
|
|
479
|
+
#
|
|
480
|
+
# # Run mafft on the longest sequences
|
|
481
|
+
# aligned_file = f"{group_directory}/{gene_file}_aligned.fasta.tmp"
|
|
482
|
+
# run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
|
|
483
|
+
#
|
|
484
|
+
# # Read aligned sequences and concatenate them
|
|
485
|
+
# aligned_sequences = read_fasta(aligned_file)
|
|
486
|
+
# for genome, aligned_seq in aligned_sequences.items():
|
|
487
|
+
# genome_name = genome.split('|')[0]
|
|
488
|
+
# if genome_name not in concatenated_sequences:
|
|
489
|
+
# concatenated_sequences[genome_name] = ""
|
|
490
|
+
# concatenated_sequences[genome_name] += aligned_seq
|
|
491
|
+
#
|
|
492
|
+
# # Clean up aligned file
|
|
493
|
+
# os.remove(aligned_file)
|
|
494
|
+
#
|
|
495
|
+
# # Write the concatenated sequences to the output file
|
|
496
|
+
# with open(output_file, 'w') as out:
|
|
497
|
+
# for genome, sequence in concatenated_sequences.items():
|
|
498
|
+
# out.write(f">{genome}\n")
|
|
499
|
+
# wrapped_sequence = wrap_sequence(sequence, 60)
|
|
500
|
+
# out.write(f"{wrapped_sequence}\n")
|
|
501
|
+
|
|
502
|
+
def perform_alignment(gene_path,group_directory, gene_file, options, concatenated_sequences, subgrouped):
    """Align one gene group and append the result to the running concatenation.

    The longest sequence per genome is selected from ``gene_path``, the
    selection is aligned through ``run_mafft_on_sequences`` into a temporary
    file inside ``group_directory``, and every genome in
    ``concatenated_sequences`` is extended with its aligned sequence, or with
    a run of '-' as long as the longest aligned sequence when the genome has
    no sequence in this group.

    Parameters:
        gene_path: FASTA file holding the sequences of the group (or subgroup).
        group_directory: directory in which the temporary alignment is written.
        gene_file: file name used to build the temporary alignment's name.
        options: passed through to ``run_mafft_on_sequences``.
        concatenated_sequences: dict of genome name -> concatenated alignment;
            updated in place and also returned.
        subgrouped: forwarded to ``select_longest_gene``; also selects which
            '|'-separated field of an aligned ID holds the genome name
            (second field when true, first field otherwise).

    Returns:
        ``concatenated_sequences``.

    Raises:
        ValueError: if no aligned sequences could be read back.
    """
    # Read sequences from the gene family file
    sequences = read_fasta(gene_path)

    # Select the longest sequence for each genome
    longest_sequences = select_longest_gene(sequences, subgrouped)

    # Run mafft on the longest sequences
    aligned_file = f"{group_directory}/{gene_file}_aligned.fasta.tmp"
    run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)

    # Read the alignment, then remove the temporary file straight away so it
    # does not linger if anything below fails.
    aligned_sequences = read_fasta(aligned_file)
    os.remove(aligned_file)

    if not aligned_sequences:
        raise ValueError(f"No aligned sequences were read from {aligned_file} (input: {gene_path})")

    # Length used to pad genomes that are absent from this group
    max_length = max(len(seq) for seq in aligned_sequences.values())
    gap_column = "-" * max_length

    # Index aligned records by the exact genome field of their ID. A substring
    # test would let e.g. 'strain1' pick up the record of 'strain10'.
    # NOTE(review): assumes aligned IDs keep the '|'-separated layout that
    # select_longest_gene relies on - confirm against read_fasta / MAFFT output.
    genome_field = 1 if subgrouped else 0
    aligned_by_genome = {}
    for aligned_id, aligned_seq in aligned_sequences.items():
        fields = aligned_id.lstrip('>').split('|')
        if len(fields) > genome_field:
            # First record wins if a genome somehow appears more than once.
            aligned_by_genome.setdefault(fields[genome_field], aligned_seq)

    for genome in concatenated_sequences:
        concatenated_sequences[genome] += aligned_by_genome.get(genome, gap_column)

    return concatenated_sequences
|
|
529
|
+
|
|
530
|
+
def process_gene_groups(options, group_directory, sub_group_directory, paralog_groups, genome_list, output_file):
    """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences.

    Parameters:
        options: run options; ``align_aa``, ``align_core`` and ``genome_num``
            are read here and the object is forwarded to ``perform_alignment``.
        group_directory: directory holding the per-group FASTA files. The
            output path is derived from it by replacing 'Gene_Groups_Output'
            with ``output_file``.
        sub_group_directory: directory holding per-subgroup FASTA files, or a
            falsy value when subgroups are not used.
        paralog_groups: mapping keyed by '>Group_<n>' whose values provide a
            'sizes' sequence (one size per subgroup), or None.
        genome_list: genome names; each becomes one record of the output.
        output_file: name substituted into ``group_directory`` to form the
            output path.

    Group files are visited in sorted name order so that the column order of
    the concatenated alignment is the same on every run and file system
    (``os.listdir`` returns entries in arbitrary order).
    """
    concatenated_sequences = {genome: "" for genome in genome_list}
    output_file = group_directory.replace('Gene_Groups_Output', output_file)

    # Minimum subgroup size for a subgroup to be aligned; only needed (and
    # only computable) when paralog information is supplied.
    threshold_size = None
    if paralog_groups is not None:
        threshold_size = math.floor(int(options.align_core) * int(options.genome_num) / 100)

    # Explicit comparison kept so non-boolean option values behave as before.
    if options.align_aa == True:
        affix = '_aa.fasta'
    else:
        affix = '_dna.fasta'

    # Iterate over each gene family file
    for gene_file in sorted(os.listdir(group_directory)):
        if not gene_file.endswith(affix) or gene_file.startswith('combined_group_sequences'):
            continue

        # The group number is the fourth '_'-separated part of the file name.
        current_group = int(gene_file.split('_')[3].split('.')[0])
        gene_path = os.path.join(group_directory, gene_file)
        group_key = '>Group_' + str(current_group)

        # Check for matching group in paralog_groups
        if sub_group_directory and paralog_groups and group_key in paralog_groups:
            # Align every sufficiently large subgroup instead of the whole group.
            for subgroup, size in enumerate(paralog_groups[group_key]['sizes']):
                if size >= threshold_size:
                    gene_path = os.path.join(sub_group_directory,f"Group_{current_group}_subgroup_{subgroup}{affix}")
                    concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, True)
        else:
            concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, False)

    # Write the concatenated sequences to the output file
    with open(output_file, 'w') as out:
        for genome, sequence in concatenated_sequences.items():
            out.write(f">{genome}\n")
            wrapped_sequence = wrap_sequence(sequence, 60)
            out.write(f"{wrapped_sequence}\n")
|
|
566
|
+
|