PyamilySeq 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq/utils.py ADDED
@@ -0,0 +1,598 @@
+ import subprocess
+ import shutil
+ import os
+ import glob
+ import collections
+ from tempfile import NamedTemporaryFile
+ import sys
+ import re
+ import math
+
+ ####
+ # Placeholder for the distance function
+ levenshtein_distance_calc = None
+ # Check for the Levenshtein library once
+ try:
+     import Levenshtein as LV
+     # Assign the optimized function
+     def levenshtein_distance_calc(seq1, seq2):
+         return LV.distance(seq1, seq2)
+ except (ModuleNotFoundError, ImportError):
+     print("Levenshtein package not installed - will fall back to a slower Python implementation.")
+     # Fallback implementation
+     def levenshtein_distance_calc(seq1, seq2):
+         # Slower pure-Python implementation of Levenshtein distance
+         len1, len2 = len(seq1), len(seq2)
+         dp = [[0] * (len2 + 1) for _ in range(len1 + 1)]
+
+         for i in range(len1 + 1):
+             dp[i][0] = i
+         for j in range(len2 + 1):
+             dp[0][j] = j
+
+         for i in range(1, len1 + 1):
+             for j in range(1, len2 + 1):
+                 if seq1[i - 1] == seq2[j - 1]:
+                     cost = 0
+                 else:
+                     cost = 1
+                 dp[i][j] = min(dp[i - 1][j] + 1,          # Deletion
+                                dp[i][j - 1] + 1,          # Insertion
+                                dp[i - 1][j - 1] + cost)   # Substitution
+
+         return dp[len1][len2]
+ #####
+
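Both definitions above should agree; as a quick hedged illustration (not part of the packaged file), the classic textbook pair gives:

    levenshtein_distance_calc("kitten", "sitting")   # -> 3 (two substitutions and one insertion)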
+ ################### Translation is currently fixed to NCBI table 11 (Bacterial, Archaeal and Plant Plastid Code)
+ codon_table = {
+     'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
+     'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
+     'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
+     'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
+     'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
+     'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
+     'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
+     'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
+     'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
+     'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
+     'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
+     'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
+     'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
+     'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
+     'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*',
+     'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W'}
+
+ def translate_frame(sequence):
+     translate = ''.join([codon_table.get(sequence[3 * i:3 * i + 3], 'X') for i in range(len(sequence) // 3)])
+     return translate
+
+ def translate_dna_to_aa(dna_fasta, aa_fasta):
+     def translate_dna_sequence(dna_seq):
+         aa_seq = ""
+         for i in range(0, len(dna_seq) - 2, 3):
+             codon = dna_seq[i:i+3]
+             aa_seq += codon_table.get(codon, 'X')  # 'X' for unknown codons
+         return aa_seq
+
+     with open(dna_fasta, 'r') as infile, open(aa_fasta, 'w') as outfile:
+         dna_seq = ""
+         header = ""
+         for line in infile:
+             if line.startswith('>'):
+                 if dna_seq:
+                     aa_seq = translate_dna_sequence(dna_seq)
+                     wrapped_aa_seq = wrap_sequence(aa_seq, 60)
+                     outfile.write(f"{header}\n{wrapped_aa_seq}\n")
+                 header = line.strip()
+                 dna_seq = ""
+             else:
+                 dna_seq += line.strip()
+         if dna_seq:
+             aa_seq = translate_dna_sequence(dna_seq)
+             wrapped_aa_seq = wrap_sequence(aa_seq, 60)
+             outfile.write(f"{header}\n{wrapped_aa_seq}\n")
+
+
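A small hedged example of the in-frame translator above (the toy sequences are invented; trailing partial codons are silently dropped):

    translate_frame("ATGAAATAA")   # -> 'MK*'
    translate_frame("ATGAAATA")    # -> 'MK' (the dangling 'TA' is ignored)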
+ def detect_sequence_type(fasta_file):
+     with open(fasta_file, 'r') as f:
+         for line in f:
+             if line.startswith('>'):
+                 continue
+             if any(base in line for base in 'EFILPQZ'):
+                 return False  # Contains amino acids
+     return True  # Contains DNA
+
+
+ def is_tool_installed(tool_name):
+     """Check if a tool is installed and available in PATH."""
+     # Check if the tool is in the system PATH
+     if shutil.which(tool_name) is None:
+         return False
+
+     # Try running the tool to ensure it's executable
+     try:
+         subprocess.run([tool_name, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+         return True
+     except subprocess.CalledProcessError:
+         return True  # The tool is installed and ran, even if it returns an error code
+     except FileNotFoundError:
+         return False  # This shouldn't happen due to the earlier check
+
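Since the alignment helpers in this module shell out to MAFFT, a caller could guard that step as in this hedged sketch (the error message is illustrative only):

    if not is_tool_installed('mafft'):
        sys.exit('MAFFT is required for the alignment step but was not found in PATH.')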
+ def reverse_complement(seq):
+     complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
+     return ''.join(complement[base] for base in reversed(seq))
+
+
+ def fix_path(path):
+     fixed_path = os.path.normpath(path)
+     fixed_path = os.path.realpath(fixed_path)
+     return fixed_path
+
+
+ def extract_identity(clustered_info):
+     # Use regex to capture the percentage, including an optional '+', '-' or '/' before it
+     match = re.search(r'at [+-/]*(\d+\.\d+)%', clustered_info)
+
+     if match:
+         percent_identity = float(match.group(1))  # Extract the percentage value
+         return percent_identity
+     else:
+         raise ValueError("Percent identity not found in the string.")
+
+
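For example (hedged; the exact wording of the cluster line depends on the upstream clustering tool's report format):

    extract_identity("... at +/99.20%")   # -> 99.2
    extract_identity("no identity here")  # raises ValueError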
+ def wrap_sequence(sequence, width=60):
+     wrapped_sequence = []
+     for i in range(0, len(sequence), width):
+         wrapped_sequence.append(sequence[i:i + width])
+     return "\n".join(wrapped_sequence)
+
+
+ def read_genomes_from_fasta(fasta_file):
+     genomes = set()
+     with open(fasta_file, 'r') as file:
+         for line in file:
+             line = line.strip()
+             if line.startswith('>'):
+                 genome = line.split('|')[1]
+                 genomes.add(genome)
+     return list(genomes)
+
+ def read_fasta(fasta_file):
+     sequences = {}
+     current_sequence = None
+     with open(fasta_file, 'r') as file:
+         for line in file:
+             line = line.strip()
+             if not line:
+                 continue  # Skip empty lines
+             if line.startswith('>'):
+                 current_sequence = line[1:]  # Remove '>' character
+                 sequences[current_sequence] = ''
+             else:
+                 sequences[current_sequence] += line
+     return sequences
+
+
+ def reorder_dict_by_keys(original_dict, sorted_keys):
+     return {k: original_dict[k] for k in sorted_keys}
+ def custom_sort_key(k, dict1, dict2):
+     return (len(dict1[k]), len(dict2[k]))
+
+ def sort_keys_by_values(dict1, dict2):
+     sorted_keys = sorted(dict1.keys(), key=lambda k: custom_sort_key(k, dict1, dict2), reverse=True)
+     return sorted_keys
+
+ def select_longest_gene(sequences, subgrouped):
+     """Select the longest sequence for each genome."""
+     longest_sequences = {}
+     for seq_id, sequence in sequences.items():
+         if subgrouped == False:
+             genome = seq_id.split('|')[0]  # Assuming genome name can be derived from the sequence ID
+         elif subgrouped == True:
+             genome = seq_id.split('|')[1]
+         if genome not in longest_sequences or len(sequence) > len(longest_sequences[genome][1]):
+             longest_sequences[genome] = (seq_id, sequence)
+     return longest_sequences
+
+
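A hedged illustration of the header layouts this expects: with subgrouped=False the genome name is the first '|'-separated field (as written by the readers above), with subgrouped=True it is the second field. The record names below are invented:

    select_longest_gene({'GenomeA|gene_1': 'ATGAAA', 'GenomeA|gene_2': 'ATGAAATTT'}, subgrouped=False)
    # -> {'GenomeA': ('GenomeA|gene_2', 'ATGAAATTT')}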
+ def run_mafft_on_sequences(options, sequences, output_file):
+     """Run mafft on the given sequences and write to output file."""
+     #print("Conducting MAFFT alignment.")
+     # Create a temporary input file for mafft
+     with NamedTemporaryFile('w', delete=False) as temp_input_file:
+         for header, sequence in sequences.items():
+             temp_input_file.write(f">{header}\n{sequence}\n")
+         temp_input_file_path = temp_input_file.name
+
+     # Run mafft
+     try:
+         with open(output_file, 'w') as output_f:
+             if options.verbose == True:
+                 subprocess.run(
+                     ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
+                     stdout=output_f,
+                     stderr=sys.stderr,
+                     check=True
+                 )
+
+             else:
+                 subprocess.run(
+                     ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
+                     stdout=output_f,
+                     stderr=subprocess.DEVNULL,  # Suppress stderr
+                     check=True
+                 )
+     finally:
+         os.remove(temp_input_file_path)  # Clean up the temporary file
+
+
+
+
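A minimal hedged sketch of driving the wrapper above; SimpleNamespace stands in for the real parsed-options object, and only the two attributes the function reads (verbose, threads) are set. Sequence names and values are invented:

    from types import SimpleNamespace
    opts = SimpleNamespace(verbose=False, threads=4)
    run_mafft_on_sequences(opts, {'GenomeA|gene_1': 'ATGAAATTT', 'GenomeB|gene_1': 'ATGAAATTC'}, 'gene_1_aligned.fasta')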
+ def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident, combined_out, translate):
+     paired_files_found = None
+     with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
+         gff_files = glob.glob(os.path.join(input_dir, '*' + name_split_gff))
+         if not gff_files:
+             sys.exit("Error: No GFF files found.")
+         for gff_file in gff_files:
+             genome_name = os.path.basename(gff_file).split(name_split_gff)[0]
+             if name_split_fasta == None:
+                 possible_extensions = ['.fa', '.fasta', '.fna']
+                 corresponding_fasta_file = None
+                 for ext in possible_extensions:
+                     temp_file = os.path.splitext(gff_file)[0] + ext
+                     if os.path.exists(temp_file):
+                         corresponding_fasta_file = temp_file
+                         break
+                 if corresponding_fasta_file is None:
+                     print("Corresponding FASTA file for GFF file '" + gff_file + "' not found. Skipping. - Try using the -name_split_fasta option.")
+                     continue
+             else:
+                 corresponding_fasta_file = os.path.join(input_dir, genome_name + name_split_fasta)
+                 if not os.path.exists(corresponding_fasta_file):
+                     print("Corresponding FASTA file for GFF file '" + gff_file + "' not found. Skipping. - Try using the -name_split_fasta option.")
+                     continue
+
+             gff_features = []
+             paired_files_found = True
+             with open(gff_file, 'r') as file:
+                 seen_seq_ids = collections.defaultdict(int)
+                 lines = file.readlines()
+                 for line in lines:
+                     line_data = line.split('\t')
+                     if len(line_data) == 9:
+                         if any(gene_type in line_data[2] for gene_type in gene_ident):
+                             seq_id = line_data[8].split('ID=')[1].split(';')[0]
+                             contig = line_data[0]
+                             feature = line_data[2]
+                             strand = line_data[6]
+                             start, end = int(line_data[3]), int(line_data[4])
+                             if seq_id in seen_seq_ids:
+                                 seen_seq_ids[seq_id] += 1  # Count the duplicate ID
+                                 seq_id += '_' + str(seen_seq_ids[seq_id] - 1)
+                             else:
+                                 seen_seq_ids[seq_id] = 1
+                             gff_features.append((contig, start, end, strand, feature, seq_id))
+             fasta_dict = collections.defaultdict(str)
+             with open(corresponding_fasta_file, 'r') as file:
+                 lines = file.readlines()
+                 for line in lines:
+                     if line.startswith('>'):
+                         current_contig = line[1:].split()[0]
+                         fasta_dict[current_contig] = ['', '']
+                     else:
+                         fasta_dict[current_contig][0] += line.strip()
+
+             for contig, fasta in fasta_dict.items():
+                 reverse_sequence = reverse_complement(fasta[0])
+                 fasta_dict[contig][1] = reverse_sequence
+
+             if fasta_dict and gff_features:
+                 for contig, start, end, strand, feature, seq_id in gff_features:
+                     if contig in fasta_dict:
+                         if strand == '+':
+                             full_sequence = fasta_dict[contig][0]
+                             seq = full_sequence[start - 1:end]
+                         elif strand == '-':
+                             corrected_start = max(len(fasta_dict[contig][0]) - int(end), 1)
+                             corrected_stop = max(len(fasta_dict[contig][0]) - int(start - 1), 1)
+                             full_sequence = fasta_dict[contig][1]
+                             seq = full_sequence[corrected_start:corrected_stop]
+
+                         if translate == True:
+                             seq_aa = translate_frame(seq)
+                             wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                             combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                         wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                         combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+
+     if not paired_files_found:
+         sys.exit("Could not find matching GFF/FASTA files - Please check input directory and -name_split_gff and -name_split_fasta parameters.")
+     if translate == False or translate == None:
+         #Clean up unused file
+         if combined_out_file.name != combined_out_file_aa.name:
+             os.remove(combined_out_file_aa.name)
+
+
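A hedged note on the pairing convention above (paths and suffixes invented for illustration): the genome name is whatever precedes name_split_gff in the GFF filename, and the companion FASTA is found either by swapping the extension or via genome_name + name_split_fasta:

    # name_split_gff='.gff3', name_split_fasta='.fna'
    # 'genomes/E_coli_K12.gff3' -> genome_name 'E_coli_K12' -> expects 'genomes/E_coli_K12.fna'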
+ def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate):
+     with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
+         gff_files = glob.glob(os.path.join(input_dir, '*' + name_split))
+         if not gff_files:
+             sys.exit("Error: No GFF files found - check input directory and -name_split_gff parameter.")
+         for gff_file in gff_files:
+             genome_name = os.path.basename(gff_file).split(name_split)[0]
+             fasta_dict = collections.defaultdict(str)
+             gff_features = []
+             with open(gff_file, 'r') as file:
+                 seen_seq_ids = collections.defaultdict(int)
+                 lines = file.readlines()
+                 fasta_section = False
+                 for line in lines:
+                     if line.startswith('##FASTA'):
+                         fasta_section = True
+                         continue
+                     if fasta_section:
+                         if line.startswith('>'):
+                             current_contig = line[1:].split()[0]
+                             fasta_dict[current_contig] = ['', '']
+                         else:
+                             fasta_dict[current_contig][0] += line.strip()
+                     else:
+                         line_data = line.split('\t')
+                         if len(line_data) == 9:
+                             if any(gene_type in line_data[2] for gene_type in gene_ident):
+                                 contig = line_data[0]
+                                 feature = line_data[2]
+                                 strand = line_data[6]
+                                 start, end = int(line_data[3]), int(line_data[4])
+                                 seq_id = line_data[8].split('ID=')[1].split(';')[0]
+                                 if seq_id in seen_seq_ids:
+                                     seen_seq_ids[seq_id] += 1  # Count the duplicate ID
+                                     seq_id += '_' + str(seen_seq_ids[seq_id] - 1)
+                                 else:
+                                     seen_seq_ids[seq_id] = 1
+                                 gff_features.append((contig, start, end, strand, feature, seq_id))
+
+             for contig, fasta in fasta_dict.items():
+                 reverse_sequence = reverse_complement(fasta[0])
+                 fasta_dict[contig][1] = reverse_sequence
+
+             if fasta_dict and gff_features:
+                 for contig, start, end, strand, feature, seq_id in gff_features:
+                     if contig in fasta_dict:
+                         if strand == '+':
+                             full_sequence = fasta_dict[contig][0]
+                             seq = full_sequence[start - 1:end]
+                         elif strand == '-':
+                             corrected_start = max(len(fasta_dict[contig][0]) - int(end), 1)
+                             corrected_stop = max(len(fasta_dict[contig][0]) - int(start - 1), 1)
+                             full_sequence = fasta_dict[contig][1]
+                             seq = full_sequence[corrected_start:corrected_stop]
+
+                         if translate == True:
+                             seq_aa = translate_frame(seq)
+                             wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                             combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                         wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                         combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+
+     if translate == False or translate == None:
+         #Clean up unused file
+         if combined_out_file.name != combined_out_file_aa.name:
+             os.remove(combined_out_file_aa.name)
+
+
+
+ def read_fasta_files(input_dir, name_split_fasta, combined_out, translate):
+     with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
+         fasta_files = glob.glob(os.path.join(input_dir, '*' + name_split_fasta))
+         if not fasta_files:
+             sys.exit("Error: No FASTA files found - check input directory and -name_split_fasta parameter.")
+         for fasta_file in fasta_files:
+             genome_name = os.path.basename(fasta_file).split(name_split_fasta)[0]
+             fasta_dict = collections.defaultdict(str)
+             with open(fasta_file, 'r') as file:
+                 lines = file.readlines()
+                 for line in lines:
+                     if line.startswith('>'):
+                         current_seq = line[1:].split()[0]
+                         fasta_dict[current_seq] = ''
+                     else:
+                         fasta_dict[current_seq] += line.strip()
+             for seq_id, seq in fasta_dict.items():
+                 if translate == True:
+                     seq_aa = translate_frame(seq)
+                     wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                     combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                 wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                 combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+
+     if translate == False or translate == None:
+         #Clean up unused file
+         if combined_out_file.name != combined_out_file_aa.name:
+             os.remove(combined_out_file_aa.name)
+
+ def write_groups_func(options, output_dir, key_order, cores, sequences,
+                       pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
+     """
+     Writes individual FASTA files and a combined FASTA file for all sequences.
+
+     Parameters:
+     - options: Command-line options.
+     - output_dir: Directory where output FASTA files will be saved.
+     - key_order: The order in which to process keys.
+     - cores: Dictionary of core genes.
+     - sequences: Dictionary mapping headers to sequences.
+     - pangenome_clusters_First_sequences_sorted: Dictionary of first sequence clusters.
+     - combined_pangenome_clusters_Second_sequences: Dictionary of second sequence clusters.
+     """
+     # Create output directory if it doesn't exist
+     if not os.path.exists(output_dir):
+         os.makedirs(output_dir)
+
+     for group in options.write_groups.split(','):
+
+         combined_fasta_filename = os.path.join(output_dir, "combined_group_sequences_" + group + "_dna.fasta")
+
+         # Open combined FASTA file for writing all sequences
+         with open(combined_fasta_filename, 'w') as combined_fasta, open(combined_fasta_filename.replace('_dna.fasta','_aa.fasta'), 'w') as combined_fasta_aa:
+             for key_prefix in key_order:
+                 for key, values in cores.items():
+                     if any(part in group for part in key.split('_')):
+                         if key.startswith(key_prefix):
+                             for value in values:
+                                 output_filename = f"{key}_{value}_dna.fasta"
+                                 if 'First' in key_prefix:
+                                     sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
+                                 else:
+                                     sequences_to_write = combined_pangenome_clusters_Second_sequences[value]
+
+                                 # Write individual FASTA file
+                                 with open(os.path.join(output_dir, output_filename), 'w') as outfile, open(os.path.join(output_dir, output_filename.replace('_dna.fasta','_aa.fasta')), 'w') as outfile_aa:
+                                     for header in sequences_to_write:
+                                         if header in sequences:
+                                             sequence = sequences[header]
+                                             wrapped_sequence = wrap_sequence(sequence)
+                                             # Handle Amino Acid Sequences (AA)
+                                             if options.sequence_type == 'AA':
+                                                 seq_aa = translate_frame(sequence)
+                                                 wrapped_sequence_aa = wrap_sequence(seq_aa)
+                                                 # Write individual group file for AA, if option is enabled
+                                                 if options.write_individual_groups:
+                                                     outfile_aa.write(f">{header}\n")
+                                                     outfile_aa.write(f"{wrapped_sequence_aa}\n")
+                                                 else:
+                                                     os.remove(outfile_aa.name)  # Delete individual file if option is disabled
+                                                 # Always write to the combined AA file
+                                                 combined_fasta_aa.write(f">Group_{value}|{header}\n")
+                                                 combined_fasta_aa.write(f"{wrapped_sequence_aa}\n")
+                                             # Handle Nucleotide Sequences
+                                             else:
+                                                 # If the option is disabled, delete individual AA file (if created)
+                                                 try:
+                                                     os.remove(outfile_aa.name)  # Ensure outfile_aa is removed when sequence_type isn't 'AA'
+                                                 except FileNotFoundError:
+                                                     pass
+                                             # Write individual group file for nucleotide sequence, if option is enabled
+                                             if options.write_individual_groups:
+                                                 outfile.write(f">{header}\n")
+                                                 outfile.write(f"{wrapped_sequence}\n")
+                                             else:
+                                                 os.remove(outfile.name)  # Delete individual file if option is disabled
+                                             # Always write to the combined nucleotide file
+                                             combined_fasta.write(f">Group_{value}|{header}\n")
+                                             combined_fasta.write(f"{wrapped_sequence}\n")
+
+                                         else:
+                                             if options.verbose == True:
+                                                 print(f"Sequence {header} not found in original_fasta file.")
+             if options.sequence_type != 'AA':
+                 #Clean up unused file
+                 os.remove(combined_fasta_aa.name)
+         print(f"Combined FASTA file saved to: {combined_fasta_filename}")
+
+
+ # def process_gene_groups(options, group_directory, sub_group_directory, paralog_groups, output_file):
+ #     """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
+ #     concatenated_sequences = {}
+ #     output_file = group_directory.replace('Gene_Groups_Output',output_file)
+ #
+ #     # Iterate over each gene family file
+ #     for gene_file in os.listdir(group_directory):
+ #         if gene_file.endswith('.fasta') and not gene_file.endswith('combined_group_sequences.fasta') :
+ #             gene_path = os.path.join(group_directory, gene_file)
+ #
+ #             # Read sequences from the gene family file
+ #             sequences = read_fasta(gene_path)
+ #
+ #             # Select the longest sequence for each genome
+ #             longest_sequences = select_longest_gene(sequences)
+ #
+ #             # Run mafft on the longest sequences
+ #             aligned_file = f"{group_directory}/{gene_file}_aligned.fasta.tmp"
+ #             run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
+ #
+ #             # Read aligned sequences and concatenate them
+ #             aligned_sequences = read_fasta(aligned_file)
+ #             for genome, aligned_seq in aligned_sequences.items():
+ #                 genome_name = genome.split('|')[0]
+ #                 if genome_name not in concatenated_sequences:
+ #                     concatenated_sequences[genome_name] = ""
+ #                 concatenated_sequences[genome_name] += aligned_seq
+ #
+ #             # Clean up aligned file
+ #             os.remove(aligned_file)
+ #
+ #     # Write the concatenated sequences to the output file
+ #     with open(output_file, 'w') as out:
+ #         for genome, sequence in concatenated_sequences.items():
+ #             out.write(f">{genome}\n")
+ #             wrapped_sequence = wrap_sequence(sequence, 60)
+ #             out.write(f"{wrapped_sequence}\n")
+
+ def perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, subgrouped):
+     # Read sequences from the gene family file
+     sequences = read_fasta(gene_path)
+
+     # Select the longest sequence for each genome
+     longest_sequences = select_longest_gene(sequences, subgrouped)
+
+     # Run mafft on the longest sequences
+     aligned_file = f"{group_directory}/{gene_file}_aligned.fasta.tmp"
+     run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
+
+     # Read aligned sequences and concatenate them
+     aligned_sequences = read_fasta(aligned_file)
+     # Find the length of the longest sequence in aligned_sequences
+     max_length = max(len(seq) for seq in aligned_sequences.values())
+
+     for genome, sequence in concatenated_sequences.items():
+         if any(genome in key for key in aligned_sequences.keys()):
+             genome_name_in_aligned = next(key for key in aligned_sequences.keys() if genome in key)
+             concatenated_sequences[genome] += aligned_sequences[genome_name_in_aligned]
+         else:
+             concatenated_sequences[genome] += "-" * max_length
+
+     # Clean up aligned file
+     os.remove(aligned_file)
+
+     return concatenated_sequences
+
+ def process_gene_groups(options, group_directory, sub_group_directory, paralog_groups, genome_list, output_file):
+     """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
+     concatenated_sequences = {genome: "" for genome in genome_list}
+     output_file = group_directory.replace('Gene_Groups_Output', output_file)
+     if paralog_groups != None:
+         threshold_size = math.floor(int(options.align_core) * int(options.genome_num) / 100)
+
+     if options.align_aa == True:
+         affix = '_aa.fasta'
+     else:
+         affix = '_dna.fasta'
+
+     if options.align_core == True:
+         # Iterate over each gene family file
+         for gene_file in os.listdir(group_directory):
+             if gene_file.endswith(affix) and not gene_file.startswith('combined_group_sequences'):
+                 #print(gene_file)
+                 current_group = int(gene_file.split('_')[3].split('.')[0])
+                 gene_path = os.path.join(group_directory, gene_file)
+
+                 # Check for matching group in paralog_groups
+                 if sub_group_directory and paralog_groups and '>Group_'+str(current_group) in paralog_groups:
+                     for subgroup, size in enumerate(paralog_groups['>Group_' + str(current_group)]['sizes']):
+                         if size >= threshold_size:
+                             gene_path = os.path.join(sub_group_directory, f"Group_{current_group}_subgroup_{subgroup}{affix}")
+                             concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, True)
+
+                 else:
+                     concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, False)
+
+
+     # Write the concatenated sequences to the output file
+     with open(output_file, 'w') as out:
+         for genome, sequence in concatenated_sequences.items():
+             out.write(f">{genome}\n")
+             wrapped_sequence = wrap_sequence(sequence, 60)
+             out.write(f"{wrapped_sequence}\n")
+