PyamilySeq 0.9.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq-1.0.0.dist-info/METADATA +17 -0
- PyamilySeq-1.0.0.dist-info/RECORD +6 -0
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.0.dist-info}/WHEEL +1 -1
- PyamilySeq-1.0.0.dist-info/entry_points.txt +2 -0
- PyamilySeq-1.0.0.dist-info/top_level.txt +1 -0
- PyamilySeq/Cluster_Summary.py +0 -163
- PyamilySeq/Constants.py +0 -2
- PyamilySeq/Group_Splitter.py +0 -382
- PyamilySeq/PyamilySeq.py +0 -296
- PyamilySeq/PyamilySeq_Genus.py +0 -242
- PyamilySeq/PyamilySeq_Species.py +0 -287
- PyamilySeq/Seq_Combiner.py +0 -67
- PyamilySeq/__init__.py +0 -0
- PyamilySeq/clusterings.py +0 -362
- PyamilySeq/utils.py +0 -408
- PyamilySeq-0.9.0.dist-info/METADATA +0 -345
- PyamilySeq-0.9.0.dist-info/RECORD +0 -16
- PyamilySeq-0.9.0.dist-info/entry_points.txt +0 -5
- PyamilySeq-0.9.0.dist-info/top_level.txt +0 -1
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.0.dist-info}/LICENSE +0 -0
PyamilySeq/utils.py
DELETED
|
@@ -1,408 +0,0 @@
|
|
|
1
|
-
import subprocess
|
|
2
|
-
import shutil
|
|
3
|
-
import os
|
|
4
|
-
import glob
|
|
5
|
-
import collections
|
|
6
|
-
from tempfile import NamedTemporaryFile
|
|
7
|
-
import sys
|
|
8
|
-
from line_profiler_pycharm import profile
|
|
9
|
-
import re
|
|
10
|
-
|
|
11
|
-
####
# Placeholder for the distance function; rebound below once we know whether
# the optional C-accelerated Levenshtein package is importable.
# BUGFIX: was misspelled `levenshtein_distance_cal`, so the placeholder never
# corresponded to the name actually defined and used (`levenshtein_distance_calc`).
levenshtein_distance_calc = None
# Check for the Levenshtein library once, at import time.
try:
    import Levenshtein as LV

    def levenshtein_distance_calc(seq1, seq2):
        """Return the Levenshtein edit distance between two strings (C-accelerated)."""
        return LV.distance(seq1, seq2)
except (ModuleNotFoundError, ImportError):
    print("Levenshtein package not installed - Will fallback to slower Python implementation.")

    def levenshtein_distance_calc(seq1, seq2):
        """Return the Levenshtein edit distance via pure-Python dynamic programming.

        O(len(seq1) * len(seq2)) time and space; used only when the optional
        `Levenshtein` package is unavailable.
        """
        len1, len2 = len(seq1), len(seq2)
        # dp[i][j] = edit distance between seq1[:i] and seq2[:j]
        dp = [[0] * (len2 + 1) for _ in range(len1 + 1)]

        for i in range(len1 + 1):
            dp[i][0] = i
        for j in range(len2 + 1):
            dp[0][j] = j

        for i in range(1, len1 + 1):
            for j in range(1, len2 + 1):
                cost = 0 if seq1[i - 1] == seq2[j - 1] else 1
                dp[i][j] = min(dp[i - 1][j] + 1,          # Deletion
                               dp[i][j - 1] + 1,          # Insertion
                               dp[i - 1][j - 1] + cost)   # Substitution

        return dp[len1][len2]
#####
|
|
45
|
-
|
|
46
|
-
################### We are currently fixed using Table 11
# Codon -> amino-acid lookup (the comment above pins this to translation
# table 11). '*' marks stop codons. Callers look codons up with
# gencode.get(codon, 'X'), so codons absent here (e.g. containing 'N')
# translate to 'X'.
gencode = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*',
    'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W'}
|
|
64
|
-
|
|
65
|
-
def translate_frame(sequence):
    """Translate *sequence* in reading frame 1 using the module's `gencode` table.

    Trailing bases that do not fill a complete codon are ignored; codons not
    present in `gencode` become 'X'.
    """
    usable = len(sequence) - len(sequence) % 3
    codons = (sequence[pos:pos + 3] for pos in range(0, usable, 3))
    return ''.join(gencode.get(codon, 'X') for codon in codons)
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def is_tool_installed(tool_name):
    """Return True when *tool_name* resolves on PATH and is executable.

    A non-zero exit code from running ``<tool> --version`` still counts as
    installed; only a missing executable yields False.
    """
    if shutil.which(tool_name) is None:
        return False

    try:
        subprocess.run([tool_name, '--version'],
                       stdout=subprocess.PIPE,
                       stderr=subprocess.PIPE,
                       check=True)
    except subprocess.CalledProcessError:
        # Tool exists and ran, it merely reported a failure status.
        pass
    except FileNotFoundError:
        # Should be unreachable given the which() check above.
        return False
    return True
|
|
86
|
-
|
|
87
|
-
def reverse_complement(seq):
    """Return the reverse complement of an upper-case A/T/G/C/N DNA string.

    Any other character raises KeyError (strict lookup, as before).
    """
    pairs = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
    out = []
    for base in seq[::-1]:
        out.append(pairs[base])
    return ''.join(out)
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def fix_path(path):
    """Return *path* normalised and with symlinks resolved (canonical absolute path)."""
    return os.path.realpath(os.path.normpath(path))
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
def extract_identity(clustered_info):
    """Extract the percent-identity value from a clustering description string.

    Expects a fragment like ``... at 98.75%`` (optionally with '+', '-' or '/'
    before the number). Raises ValueError when no such pattern is present.
    """
    match = re.search(r'at [+-/]*(\d+\.\d+)%', clustered_info)
    if match is None:
        raise ValueError("Percent identity not found in the string.")
    return float(match.group(1))
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
def wrap_sequence(sequence, width=60):
    """Return *sequence* split into newline-joined lines of at most *width* chars."""
    return "\n".join(sequence[start:start + width]
                     for start in range(0, len(sequence), width))
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
def read_fasta(fasta_file):
    """Parse a FASTA file into {header: sequence}.

    The key is the full header line minus the leading '>'; blank lines are
    skipped and multi-line sequences are concatenated. A sequence line before
    any header raises (KeyError on the None key), matching the original.
    """
    records = {}
    active_header = None
    with open(fasta_file, 'r') as handle:
        for raw in handle:
            stripped = raw.strip()
            if not stripped:
                continue  # Skip empty lines
            if stripped.startswith('>'):
                active_header = stripped[1:]  # Drop the '>' marker
                records[active_header] = ''
            else:
                records[active_header] += stripped
    return records
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
def reorder_dict_by_keys(original_dict, sorted_keys):
    """Return a new dict whose insertion order follows *sorted_keys*.

    Raises KeyError if a key in *sorted_keys* is absent from *original_dict*.
    """
    reordered = {}
    for key in sorted_keys:
        reordered[key] = original_dict[key]
    return reordered
|
|
134
|
-
def custom_sort_key(k, dict1, dict2):
    """Sort key for *k*: (size of its entry in dict1, size of its entry in dict2)."""
    primary = len(dict1[k])
    secondary = len(dict2[k])
    return primary, secondary
|
|
136
|
-
|
|
137
|
-
def sort_keys_by_values(dict1, dict2):
    """Return dict1's keys ordered by (len(dict1[k]), len(dict2[k])), largest first.

    The key computation mirrors `custom_sort_key`, inlined here.
    """
    return sorted(dict1, key=lambda k: (len(dict1[k]), len(dict2[k])), reverse=True)
|
|
140
|
-
|
|
141
|
-
def select_longest_gene(sequences):
    """Keep, per genome, the single longest sequence.

    *sequences* maps 'genome|gene_id' headers to sequence strings; the genome
    is the part before the first '|'. Returns {genome: (seq_id, sequence)}.
    On ties, the first-seen sequence wins (strict '>' comparison).
    """
    longest = {}
    for seq_id, sequence in sequences.items():
        genome = seq_id.split('|')[0]  # Genome name is encoded in the header
        best = longest.get(genome)
        if best is None or len(sequence) > len(best[1]):
            longest[genome] = (seq_id, sequence)
    return longest
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
def run_mafft_on_sequences(options, sequences, output_file):
    #print("Conducting MAFFT alignment.")
    """Run mafft on the given sequences and write to output file.

    Parameters:
    - options: parsed command-line options; reads options.verbose (bool) and
      options.threads (forwarded to mafft's --thread flag).
    - sequences: dict mapping FASTA header -> sequence string.
    - output_file: path that receives mafft's aligned FASTA on stdout.

    NOTE(review): requires the external `mafft` executable on PATH — a missing
    binary surfaces as FileNotFoundError; a failing run raises
    subprocess.CalledProcessError because check=True.
    """
    # Create a temporary input file for mafft. delete=False keeps the file on
    # disk after the `with` block so it can be handed to the subprocess by path.
    with NamedTemporaryFile('w', delete=False) as temp_input_file:
        for header, sequence in sequences.items():
            temp_input_file.write(f">{header}\n{sequence}\n")
        temp_input_file_path = temp_input_file.name

    # Run mafft
    try:
        with open(output_file, 'w') as output_f:
            if options.verbose == True:
                # Verbose mode: let mafft's progress chatter flow to our stderr.
                subprocess.run(
                    ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
                    stdout=output_f,
                    stderr=sys.stderr,
                    check=True
                )

            else:
                subprocess.run(
                    ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
                    stdout=output_f,
                    stderr=subprocess.DEVNULL, # Suppress stderr
                    check=True
                )
    finally:
        os.remove(temp_input_file_path) # Clean up the temporary file
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
def read_separate_files(input_dir, name_split, gene_ident, combined_out, translate):
    """Extract gene sequences from paired GFF + FASTA files into one FASTA.

    For every '*<name_split>' GFF file in *input_dir* with a sibling '.fa'
    file, pull out each feature whose type matches an entry in *gene_ident*
    and write it to *combined_out* as '>genome|seq_id' wrapped at 60 chars.
    When *translate* is True the nucleotide sequence is translated first.

    BUGFIXES vs. the original:
    - `seq_id` was tested for duplicates BEFORE being parsed from column 9,
      raising NameError on the first matching feature line.
    - The duplicate counter used `seen_seq_ids[seq_id] + 1` (a no-op) instead
      of incrementing, so every later duplicate collided on the same suffix.
    """
    with open(combined_out, 'w') as combined_out_file:
        for gff_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
            genome_name = os.path.basename(gff_file).split(name_split)[0]
            corresponding_fasta_file = os.path.splitext(gff_file)[0] + '.fa'
            if not os.path.exists(corresponding_fasta_file):
                continue  # No sequence data for this GFF — skip the genome.

            gff_features = []
            with open(gff_file, 'r') as file:
                seen_seq_ids = collections.defaultdict(int)
                for line in file:
                    line_data = line.split('\t')
                    if len(line_data) == 9:
                        if any(gene_type in line_data[2] for gene_type in gene_ident):
                            contig = line_data[0]
                            feature = line_data[2]
                            strand = line_data[6]
                            start, end = int(line_data[3]), int(line_data[4])
                            # Parse the ID first, then de-duplicate it.
                            seq_id = line_data[8].split('ID=')[1].split(';')[0]
                            if seq_id in seen_seq_ids:
                                suffix = seen_seq_ids[seq_id]
                                seen_seq_ids[seq_id] = suffix + 1
                                seq_id = seq_id + '_' + str(suffix)
                            else:
                                seen_seq_ids[seq_id] = 1
                            gff_features.append((contig, start, end, strand, feature, seq_id))

            # fasta_dict[contig] = [forward sequence, reverse complement]
            fasta_dict = collections.defaultdict(str)
            with open(corresponding_fasta_file, 'r') as file:
                for line in file:
                    if line.startswith('>'):
                        current_contig = line[1:].split()[0]
                        fasta_dict[current_contig] = ['', '']
                    else:
                        fasta_dict[current_contig][0] += line.strip()

            for contig, fasta in fasta_dict.items():
                fasta_dict[contig][1] = reverse_complement(fasta[0])

            if fasta_dict and gff_features:
                for contig, start, end, strand, feature, seq_id in gff_features:
                    if contig in fasta_dict:
                        if strand == '+':
                            full_sequence = fasta_dict[contig][0]
                            cds_sequence = full_sequence[start - 1:end]
                        elif strand == '-':
                            # Mirror the coordinates onto the reverse strand.
                            corrected_start = max(len(fasta_dict[contig][0]) - int(end), 1)
                            corrected_stop = max(len(fasta_dict[contig][0]) - int(start - 1), 1)
                            full_sequence = fasta_dict[contig][1]
                            cds_sequence = full_sequence[corrected_start:corrected_stop]
                        if translate == True:
                            cds_sequence = translate_frame(cds_sequence)
                        wrapped_sequence = '\n'.join([cds_sequence[i:i + 60] for i in range(0, len(cds_sequence), 60)])
                        combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate):
    """Extract gene sequences from GFF files that embed FASTA after '##FASTA'.

    For every '*<name_split>' file in *input_dir*, parse features whose type
    matches an entry in *gene_ident* from the annotation section and the
    contig sequences from the trailing '##FASTA' section, then write each
    gene to *combined_out* as '>genome|seq_id' wrapped at 60 characters.
    When *translate* is True the nucleotide sequence is translated first.

    BUGFIX vs. the original: the duplicate-ID counter used
    `seen_seq_ids[seq_id] + 1` (a no-op) after renaming, so the second and
    later duplicates of an ID all collided on the same '_1' suffix; the base
    counter is now actually incremented, yielding '_1', '_2', ...
    """
    with open(combined_out, 'w') as combined_out_file:
        for gff_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
            genome_name = os.path.basename(gff_file).split(name_split)[0]
            # fasta_dict[contig] = [forward sequence, reverse complement]
            fasta_dict = collections.defaultdict(str)
            gff_features = []
            with open(gff_file, 'r') as file:
                seen_seq_ids = collections.defaultdict(int)
                fasta_section = False
                for line in file:
                    if line.startswith('##FASTA'):
                        fasta_section = True
                        continue
                    if fasta_section:
                        if line.startswith('>'):
                            current_contig = line[1:].split()[0]
                            fasta_dict[current_contig] = ['', '']
                        else:
                            fasta_dict[current_contig][0] += line.strip()
                    else:
                        line_data = line.split('\t')
                        if len(line_data) == 9:
                            if any(gene_type in line_data[2] for gene_type in gene_ident):
                                contig = line_data[0]
                                feature = line_data[2]
                                strand = line_data[6]
                                start, end = int(line_data[3]), int(line_data[4])
                                seq_id = line_data[8].split('ID=')[1].split(';')[0]
                                if seq_id in seen_seq_ids:
                                    suffix = seen_seq_ids[seq_id]
                                    seen_seq_ids[seq_id] = suffix + 1
                                    seq_id = seq_id + '_' + str(suffix)
                                else:
                                    seen_seq_ids[seq_id] = 1
                                gff_features.append((contig, start, end, strand, feature, seq_id))

            for contig, fasta in fasta_dict.items():
                fasta_dict[contig][1] = reverse_complement(fasta[0])

            if fasta_dict and gff_features:
                for contig, start, end, strand, feature, seq_id in gff_features:
                    if contig in fasta_dict:
                        if strand == '+':
                            full_sequence = fasta_dict[contig][0]
                            cds_sequence = full_sequence[start - 1:end]
                        elif strand == '-':
                            # Mirror the coordinates onto the reverse strand.
                            corrected_start = max(len(fasta_dict[contig][0]) - int(end), 1)
                            corrected_stop = max(len(fasta_dict[contig][0]) - int(start - 1), 1)
                            full_sequence = fasta_dict[contig][1]
                            cds_sequence = full_sequence[corrected_start:corrected_stop]

                        if translate == True:
                            cds_sequence = translate_frame(cds_sequence)
                        wrapped_sequence = '\n'.join([cds_sequence[i:i + 60] for i in range(0, len(cds_sequence), 60)])
                        combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
def read_fasta_files(input_dir, name_split, combined_out, translate):
    """Merge every '*<name_split>' FASTA file in *input_dir* into *combined_out*.

    Each record is rewritten as '>genome|seq_id' (genome taken from the file
    name, seq_id from the first whitespace-delimited token of the header) with
    the sequence wrapped at 60 characters. When *translate* is True the
    sequence is translated via `translate_frame` first.
    """
    with open(combined_out, 'w') as merged:
        for fasta_path in glob.glob(os.path.join(input_dir, '*' + name_split)):
            genome_name = os.path.basename(fasta_path).split(name_split)[0]
            records = collections.defaultdict(str)
            with open(fasta_path, 'r') as handle:
                for raw in handle:
                    if raw.startswith('>'):
                        active_id = raw[1:].split()[0]
                        records[active_id] = ''
                    else:
                        records[active_id] += raw.strip()
            for seq_id, seq in records.items():
                if translate == True:
                    seq = translate_frame(seq)
                wrapped = '\n'.join(seq[i:i + 60] for i in range(0, len(seq), 60))
                merged.write(f">{genome_name}|{seq_id}\n{wrapped}\n")
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
def write_groups(options, output_dir, key_order, cores, sequences,
                 pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
    """
    Writes individual FASTA files and a combined FASTA file for all sequences.

    Parameters:
    - options: Command-line options; reads options.write_groups (a
      comma-separated string used to filter core-category keys) and
      options.verbose.
    - output_dir: Directory where output FASTA files will be saved.
    - key_order: The order in which to process keys (prefix match on core keys;
      a prefix containing 'First' selects the first cluster dictionary).
    - cores: Dictionary of core genes; keys are '_'-separated category names,
      values are iterables of cluster identifiers.
    - sequences: Dictionary mapping headers to sequences.
    - pangenome_clusters_First_sequences_sorted: Dictionary of first sequence clusters.
    - combined_pangenome_clusters_Second_sequences: Dictionary of second sequence clusters.
    """
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    combined_fasta_filename = os.path.join(output_dir, "combined_group_sequences.fasta")

    # Open combined FASTA file for writing all sequences
    with open(combined_fasta_filename, 'w') as combined_fasta:
        for key_prefix in key_order:
            for key, values in cores.items():
                # Keep this core key only if one of its '_'-separated parts was
                # requested via --write-groups.
                if any(part in options.write_groups.split(',') for part in key.split('_')):
                    if key.startswith(key_prefix):
                        for value in values:
                            output_filename = f"{key}_{value}.fasta"
                            # 'First'-type prefixes read from the first cluster
                            # dictionary, everything else from the second.
                            if 'First' in key_prefix:
                                sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
                            else:
                                sequences_to_write = combined_pangenome_clusters_Second_sequences[value]

                            # Write individual FASTA file
                            with open(os.path.join(output_dir, output_filename), 'w') as outfile:
                                for header in sequences_to_write:
                                    if header in sequences:
                                        sequence = sequences[header]
                                        outfile.write(f">{header}\n")
                                        wrapped_sequence = wrap_sequence(sequence)
                                        outfile.write(f"{wrapped_sequence}\n")

                                        # Also write to the combined FASTA file
                                        # (safe to reuse wrapped_sequence — both
                                        # writes happen in this same branch).
                                        combined_fasta.write(f">Group_{value}|{header}\n")
                                        combined_fasta.write(f"{wrapped_sequence}\n")
                                    else:
                                        if options.verbose:
                                            print(f"Sequence {header} not found in original_fasta file.")

    print(f"Combined FASTA file saved to: {combined_fasta_filename}")
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
def process_gene_families(options, directory, output_file):
    """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences.

    Reads every '*.fasta' family file in *directory* (except the combined
    output), keeps the longest gene per genome, aligns those with MAFFT, and
    appends each genome's aligned sequence into one concatenated record per
    genome, written to *output_file*.

    NOTE(review): output_file is derived by substituting 'Gene_Families_Output'
    inside *directory* — this assumes the directory path contains that segment;
    confirm against callers.
    """
    concatenated_sequences = {}
    output_file = directory.replace('Gene_Families_Output',output_file)

    # Iterate over each gene family file
    for gene_file in os.listdir(directory):
        if gene_file.endswith('.fasta') and not gene_file.endswith('combined_group_sequences.fasta'):
            gene_path = os.path.join(directory, gene_file)

            # Read sequences from the gene family file
            sequences = read_fasta(gene_path)

            # Select the longest sequence for each genome
            longest_sequences = select_longest_gene(sequences)

            # Run mafft on the longest sequences
            # (values are (seq_id, sequence) tuples — rebuild a plain dict)
            aligned_file = f"{directory}/{gene_file}_aligned.fasta.tmp"
            run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)

            # Read aligned sequences and concatenate them, keyed by the genome
            # prefix before '|' in each aligned header.
            aligned_sequences = read_fasta(aligned_file)
            for genome, aligned_seq in aligned_sequences.items():
                genome_name = genome.split('|')[0]
                if genome_name not in concatenated_sequences:
                    concatenated_sequences[genome_name] = ""
                concatenated_sequences[genome_name] += aligned_seq

            # Clean up aligned file
            os.remove(aligned_file)

    # Write the concatenated sequences to the output file
    with open(output_file, 'w') as out:
        for genome, sequence in concatenated_sequences.items():
            out.write(f">{genome}\n")
            wrapped_sequence = wrap_sequence(sequence, 60)
            out.write(f"{wrapped_sequence}\n")
|