levseq 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- levseq/IO_processor.py +565 -0
- levseq/__init__.py +34 -0
- levseq/barcoding/__init__.py +1 -0
- levseq/barcoding/demultiplex +0 -0
- levseq/barcoding/demultiplex-arm64 +0 -0
- levseq/barcoding/demultiplex-x86 +0 -0
- levseq/barcoding/minion_barcodes.fasta +386 -0
- levseq/basecaller.py +80 -0
- levseq/cmd.py +23 -0
- levseq/globals.py +66 -0
- levseq/interface.py +85 -0
- levseq/parser.py +82 -0
- levseq/run_levseq.py +558 -0
- levseq/screen.py +38 -0
- levseq/simulation.py +311 -0
- levseq/user.py +157 -0
- levseq/utils.py +474 -0
- levseq/variantcaller.py +252 -0
- levseq/visualization.py +1130 -0
- levseq-1.0.0.data/data/LICENSE +674 -0
- levseq-1.0.0.dist-info/LICENSE +674 -0
- levseq-1.0.0.dist-info/METADATA +180 -0
- levseq-1.0.0.dist-info/RECORD +26 -0
- levseq-1.0.0.dist-info/WHEEL +5 -0
- levseq-1.0.0.dist-info/entry_points.txt +2 -0
- levseq-1.0.0.dist-info/top_level.txt +1 -0
levseq/simulation.py
ADDED
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
###############################################################################
|
|
2
|
+
# #
|
|
3
|
+
# This program is free software: you can redistribute it and/or modify #
|
|
4
|
+
# it under the terms of the GNU General Public License as published by #
|
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or #
|
|
6
|
+
# (at your option) any later version. #
|
|
7
|
+
# #
|
|
8
|
+
# This program is distributed in the hope that it will be useful, #
|
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
11
|
+
# GNU General Public License for more details. #
|
|
12
|
+
# #
|
|
13
|
+
# You should have received a copy of the GNU General Public License #
|
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>. #
|
|
15
|
+
# #
|
|
16
|
+
###############################################################################
|
|
17
|
+
# Import all packages
|
|
18
|
+
import random
|
|
19
|
+
|
|
20
|
+
from levseq.variantcaller import *
|
|
21
|
+
import math
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_dummy_plate_df(plate_name='Plate', well_name='Well', number_of_wells=96):
    """
    Make a dummy plate holding one placeholder row per well.

    Every row starts with neutral defaults so that a simulation can fill in
    the per-well results afterwards.

    :param plate_name: value written to the ``Plate`` column of every row.
    :param well_name: value written to the ``Well`` column of every row.
    :param number_of_wells: number of rows to create; the index is named
        ``index`` and runs from 0 to ``number_of_wells - 1``.
    :return: DataFrame with the columns Plate, Well, Path, Alignment_count,
        P value, Mixed Well, Variant, mutation, frequency, P adj. and
        True Variant.
    """
    df = pd.DataFrame({'index': list(range(number_of_wells))})
    column_defaults = {
        'Plate': plate_name,
        'Well': well_name,
        'Path': '',
        'Alignment_count': 0,
        'P value': 1.0,
        'Mixed Well': False,
        'Variant': '',
        'mutation': '',
        # Both columns below later receive fractional values, so they are
        # created as floats: writing a float into an int64 column is
        # deprecated in recent pandas releases.
        'frequency': 0.0,
        'P adj.': 0.0,
        'True Variant': '',
    }
    for column, default in column_defaults.items():
        df[column] = default
    df.set_index('index', inplace=True)
    return df
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def mutate_sequence(sequence, mutation_frequency, bases=None):
    """
    Return a copy of ``sequence`` in which each position has been replaced,
    independently and with probability ``mutation_frequency``, by a different
    symbol drawn uniformly from ``bases``.

    :param sequence: the nucleotide string to mutate.
    :param mutation_frequency: per-position probability of a substitution.
    :param bases: symbols to draw replacements from; defaults to A, T, G, C
        and '-' (a deletion).
    :return: the mutated sequence as a string of the same length.
    """
    if bases is None:
        bases = ['A', 'T', 'G', 'C', '-']  # '-' stands for a deleted base

    def _maybe_substitute(nucleotide):
        # One random draw decides whether this position mutates at all
        if random.random() < mutation_frequency:
            alternatives = [candidate for candidate in bases if candidate != nucleotide]
            return random.choice(alternatives)
        return nucleotide

    return ''.join(_maybe_substitute(nucleotide) for nucleotide in list(sequence))
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def insert_nt(original_nt_seq, protein_mutations, codon_usage):
    """
    Apply amino-acid substitutions to a nucleotide sequence.

    :param original_nt_seq: the nucleotide sequence to edit.
    :param protein_mutations: iterable of ``(position, amino_acid)`` pairs,
        where ``position`` is the 0-based codon index.
    :param codon_usage: mapping from amino acid to the codon written for it.
    :return: the nucleotide sequence with every listed codon overwritten.
    """
    nucleotides = list(original_nt_seq)
    for aa_index, replacement_aa in protein_mutations:
        # The codon is assumed to start at three times the protein position
        codon_start = aa_index * 3
        codon_end = codon_start + 3
        nucleotides[codon_start:codon_end] = list(codon_usage[replacement_aa])
    return ''.join(nucleotides)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def generate_ssm_library(positions, parent_sequence_aa, parent_sequence_nt, codon_usage):
    """
    Generate a site-saturation mutagenesis (SSM) library for a parent.

    For every requested position, one variant is produced per standard amino
    acid that differs from the parent residue. Each variant is built from a
    fresh copy of the parent, so it carries exactly one substituted codon;
    substitutions made for one variant never leak into the next.

    :param positions: 0-based protein positions to saturate.
    :param parent_sequence_aa: parent protein sequence, indexed by position.
    :param parent_sequence_nt: parent nucleotide sequence; the codon of
        position ``p`` is assumed to start at ``p * 3``.
    :param codon_usage: mapping from amino acid to the codon written for it.
    :return: list of mutated nucleotide sequences, ordered by position and
        then by amino acid.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'  # 20 standard amino acids
    parent_nucleotides = list(parent_sequence_nt)
    library = []
    for position in positions:
        nt_pos = position * 3  # Assuming the codon starts at this position
        parent_aa = parent_sequence_aa[position]
        for aa in amino_acids:
            if aa == parent_aa:
                continue  # i.e. don't duplicate the original
            # Start from the untouched parent for every single variant
            variant_nucleotides = list(parent_nucleotides)
            variant_nucleotides[nt_pos:nt_pos + 3] = list(codon_usage[aa])
            library.append(''.join(variant_nucleotides))
    return library
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def make_experiment(run_label, read_depth, sequencing_error_rate, parent_sequence, library_number,
                    number_of_wells, epcr_mutation_rate, frequency_cutoff=0.5, number_wells_to_mix=0,
                    mixture_rate=0, qc_files_path=None):
    """
    Simulate a full plate: build an error-prone PCR library, simulate reads
    for each member, call a variant per well and score the calls.

    :param run_label: used as the plate name and as the prefix of QC files.
    :param read_depth: number of reads simulated per well; also recorded as
        the well's ``Alignment_count``.
    :param sequencing_error_rate: per-base error rate of the simulated reads.
    :param parent_sequence: parent nucleotide sequence.
    :param library_number: number of library members to generate.
    :param number_of_wells: number of rows in the plate dataframe.
    :param epcr_mutation_rate: per-base mutation rate of the library.
    :param frequency_cutoff: passed to ``get_variant_label_for_well``.
    :param number_wells_to_mix: when greater than 0 the mixed-well simulation
        is used and the true mixing state is recorded per well.
    :param mixture_rate: passed on to the mixed-well simulation.
    :param qc_files_path: when not None, a ``.fa`` and a ``.csv`` file are
        written into this directory for every well.
    :return: the plate dataframe after ``check_variants`` has been applied.
    """
    # Make a full experiment setup
    if number_wells_to_mix > 0:
        # mixed_wells tells us which wells are truly mixed
        mutated_sequence, mixed_wells = make_mixed_well_epcr_de_experiment(
            read_depth, sequencing_error_rate, parent_sequence, library_number,
            epcr_mutation_rate, number_wells_to_mix, mixture_rate)
    else:
        mixed_wells = None
        mutated_sequence = make_epcr_de_experiment(
            read_depth, sequencing_error_rate, parent_sequence, library_number,
            epcr_mutation_rate)

    pileup_columns = ['gene_name', 'position', 'ref', 'most_frequent', 'freq_non_ref', 'total_other',
                      'total_reads', 'p_value', 'percent_most_freq_mutation', 'A', 'p(a)', 'T', 'p(t)', 'G',
                      'p(g)', 'C', 'p(c)', 'N', 'p(n)']
    parent_name = 'Parent'
    variant_df = get_dummy_plate_df(run_label, 'Well', number_of_wells)
    variant_df['True Mixed Well'] = False
    mutant_to_well_df = {}
    for current_well, mutant in enumerate(tqdm(mutated_sequence)):
        reads = list(mutated_sequence[mutant])
        read_ids = [f'read_{i}' for i in range(len(reads))]
        quals = [100] * len(reads)  # Dummy qualities, the values are not needed

        well_df = make_well_df_from_reads(reads, read_ids, quals)
        rows_all = make_row_from_read_pileup_across_well(well_df, parent_sequence, parent_name)
        well_df = pd.DataFrame(rows_all)
        well_df.columns = pileup_columns
        well_df = calculate_mutation_significance_across_well(well_df)

        if qc_files_path is not None:
            # Save QC data
            qc_well_df = make_well_df_for_saving(reads, read_ids, quals)
            msa_file = os.path.join(qc_files_path, f'{run_label}_{current_well}.fa')
            csv_file = os.path.join(qc_files_path, f'{run_label}_{current_well}.csv')
            write_msa_for_df(qc_well_df, well_df, parent_name, parent_sequence, msa_file, csv_file)

        label, frequency, combined_p_value, mixed_well = get_variant_label_for_well(well_df, frequency_cutoff)
        mutant_to_well_df[f'{mutant}_{current_well}'] = well_df

        well_values = {
            "Mixed Well": mixed_well,
            "Variant": label,
            "True Variant": mutant,
            "frequency": frequency,
            "P value": combined_p_value,
            "Well": f'Well {current_well}',
            "Alignment_count": read_depth,
        }
        for column, value in well_values.items():
            variant_df.at[current_well, column] = value
        if mixed_wells is not None:
            # Save whether this well was truly mixed in the simulation
            variant_df.at[current_well, "True Mixed Well"] = mixed_wells[mutant]

    # Before returning adjust the p-values
    raw_p_values = list(variant_df["P value"].values)
    variant_df['P adj.'] = multipletests(raw_p_values, alpha=0.05, method='fdr_bh')[1]
    # Also get the accuracy
    return check_variants(variant_df, parent_sequence)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def make_well_df_for_saving(seqs, read_ids, read_quals):
    """
    Build the per-read dataframe that is saved for quality control.

    Each read is split into one column per base; the read ID, read quality
    and the full read string are appended as the columns ``read_id``,
    ``read_qual`` and ``seqs``. Rows are ordered by descending quality and,
    for every read ID, only the first row in that order is kept.
    """
    # One column per position so that individual bases line up nicely
    per_base_rows = list(map(list, seqs))
    reads_table = pd.DataFrame(per_base_rows)
    reads_table['read_id'] = read_ids
    reads_table['read_qual'] = read_quals
    reads_table['seqs'] = seqs
    # Highest quality first, so de-duplication keeps the best read per ID
    best_first = reads_table.sort_values(by='read_qual', ascending=False)
    return best_first.drop_duplicates(subset=['read_id'], keep='first')
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def write_msa_for_df(reads_across_well_df, well_df, parent_name, parent_sequence, msa_path, df_path):
    """
    Write the QC files for one well, for checking that the data are correct.

    :param reads_across_well_df: dataframe with the columns ``read_id`` and
        ``seqs`` (one row per read).
    :param well_df: per-position dataframe that is saved as CSV.
    :param parent_name: FASTA header used for the reference record.
    :param parent_sequence: reference sequence, written as the first record.
    :param msa_path: FASTA output path; when None no FASTA file is written.
    :param df_path: path handed to ``well_df.to_csv``.
    """
    read_ids = reads_across_well_df['read_id']
    seqs = reads_across_well_df['seqs']
    # Check if we want to write a MSA
    if msa_path is not None:
        with open(msa_path, 'w') as fout:
            # Write the reference first
            fout.write(f'>{parent_name}\n{parent_sequence}\n')
            # Pair IDs and reads row by row. Indexing read_ids with the loop
            # counter would be a label lookup, which mismatches or fails once
            # the frame has been sorted or de-duplicated.
            for read_id, seq in zip(read_ids, seqs):
                fout.write(f'>{read_id}\n{"".join(seq)}\n')
    well_df.to_csv(df_path)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def generate_epcr_library(parent_sequence, mutation_rate, library_number):
    """
    Simulate error-prone PCR on a parent: return ``library_number``
    independently mutated copies of ``parent_sequence``. Only substitutions
    between A, T, G and C are made (no deletions).
    """
    substitution_bases = ['A', 'T', 'G', 'C']
    library = []
    for _ in range(library_number):
        library.append(mutate_sequence(parent_sequence, mutation_rate, substitution_bases))
    return library
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def make_epcr_de_experiment(read_depth, sequencing_error_rate, parent_sequence, library_number, epcr_mutation_rate):
    """
    Simulate sequencing reads for an error-prone PCR library.

    ``library_number`` would normally be the number of wells. The result maps
    each library sequence to ``read_depth`` simulated reads of it; because the
    sequence itself is the key, identical library members share one entry.
    """
    # First make the library, i.e. the true sequence of every well
    library = generate_epcr_library(parent_sequence, epcr_mutation_rate, library_number)
    # Then simulate the reads of each member with the sequencing error rate
    return {
        seq: [mutate_sequence(seq, sequencing_error_rate) for _ in range(read_depth)]
        for seq in library
    }
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def make_ssm_de_experiment(read_depth, sequencing_error_rate, parent_sequence, positions, parent_sequence_aa,
                           codon_usage):
    """
    Simulate sequencing reads for a site-saturation mutagenesis library.

    The library is generated from the parent for the given protein
    ``positions``; the result maps each library sequence to ``read_depth``
    simulated reads of it.
    """
    library = generate_ssm_library(positions, parent_sequence_aa, parent_sequence, codon_usage)
    # i.e. the key is the true sequence and the value is its error-containing reads
    return {
        seq: [mutate_sequence(seq, sequencing_error_rate) for _ in range(read_depth)]
        for seq in library
    }
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def make_mixed_well_epcr_de_experiment(read_depth, sequencing_error_rate, parent_sequence, library_number,
                                       epcr_mutation_rate, number_wells_to_mix, mixture_rate):
    """
    Make a mixed well experiment to test code with.

    An error-prone PCR library is simulated as usual, then
    ``number_wells_to_mix`` wells are contaminated: the first
    ``floor(read_depth * mixture_rate)`` reads of each chosen well are
    replaced by the reads at the same positions of a different well.

    :param read_depth: number of reads simulated per well.
    :param sequencing_error_rate: per-base error rate of the simulated reads.
    :param parent_sequence: parent nucleotide sequence.
    :param library_number: number of library members to generate.
    :param epcr_mutation_rate: per-base mutation rate of the library.
    :param number_wells_to_mix: number of wells to contaminate.
    :param mixture_rate: fraction of a well's reads that is swapped out.
    :return: ``(reads_per_well, reads_mutated_label)``; both are keyed by the
        true well sequence, the second holds True for wells that really
        received foreign reads.
    :raises ValueError: from ``random.sample`` when ``number_wells_to_mix``
        exceeds the number of distinct wells.
    """
    # First make the library
    library = generate_epcr_library(parent_sequence, epcr_mutation_rate, library_number)
    # For each library member simulate the reads with a sequencing error rate
    reads_per_well = {}
    reads_mutated_label = {}
    for seq in library:
        reads_per_well[seq] = [mutate_sequence(seq, sequencing_error_rate) for _ in range(read_depth)]
        reads_mutated_label[seq] = False

    all_wells = list(reads_per_well.keys())
    wells_to_mix = random.sample(all_wells, number_wells_to_mix)
    # Never try to swap more reads than a well holds
    number_to_add_in = min(math.floor(read_depth * mixture_rate), read_depth)
    for well_seq in wells_to_mix:
        # The donor must be a different well, otherwise nothing is mixed.
        # Prefer the other wells picked for mixing and fall back to any other
        # well when this is the only one picked.
        donors = [seq for seq in wells_to_mix if seq != well_seq]
        if not donors:
            donors = [seq for seq in all_wells if seq != well_seq]
        if not donors or number_to_add_in <= 0:
            continue  # Nothing can be doped in, so the well stays pure
        dope_in_seq = random.choice(donors)
        # Just make the top X reads those of the doped-in sequence
        reads_per_well[well_seq][:number_to_add_in] = reads_per_well[dope_in_seq][:number_to_add_in]
        reads_mutated_label[well_seq] = True

    return reads_per_well, reads_mutated_label
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def check_variants(variant_df, parent_sequence):
    """
    Check whether the called variants are actually correct.

    The ``Variant`` column holds the predicted label: mutations joined by
    ``_``, each written as reference base, 1-based position and new base
    (``A1T``) or ``DEL`` for a deletion (``A1DEL``); a label containing
    ``PARENT`` means no mutation was called. The ``True Variant`` column
    holds the true sequence of the well, in which a deleted base is ``-``.

    For a parent call every position of the true sequence is compared with
    ``parent_sequence``; for any other call the true base at the mutated
    position is compared with the called base. Labels that cannot be parsed
    are printed and skipped; labels that are not strings contribute nothing.

    :param variant_df: dataframe with the columns ``Variant`` and
        ``True Variant``; it is modified in place.
    :param parent_sequence: the parent nucleotide sequence.
    :return: ``variant_df`` with the added columns ``correct``,
        ``incorrect`` and ``accuracy`` (NaN where nothing could be scored).
    """
    corrects = []
    incorrects = []
    for predicted_variant, true_variant in variant_df[['Variant', 'True Variant']].values:
        count_correct = 0
        count_incorrect = 0
        # A missing label (e.g. NaN) carries no calls that could be scored
        mutations = predicted_variant.split('_') if isinstance(predicted_variant, str) else []
        for mutation in mutations:
            try:
                if 'PARENT' in mutation:
                    # Check that the two sequences agree base by base
                    for i in range(0, len(true_variant)):
                        if true_variant[i] == parent_sequence[i]:
                            count_correct += 1
                        else:
                            count_incorrect += 1
                else:
                    # true_variant is a sequence while the predicted variant is just the mutations
                    if 'DEL' not in mutation:
                        mut_pos = int(mutation[1:-1])  # A1T
                        mut = mutation[-1]
                    else:
                        mut_pos = int(mutation[1:].replace('DEL', ''))
                        # A deletion is a '-' in the true sequence; comparing a
                        # single base with the string 'DEL' could never match
                        mut = '-'
                    if true_variant[mut_pos - 1] == mut:
                        count_correct += 1
                    else:
                        count_incorrect += 1
                        try:
                            # Flag calls whose reference base disagrees with the parent
                            if parent_sequence[mut_pos - 1] != mutation[0]:
                                print("WARNING!")
                        except IndexError:
                            print(mut_pos, len(parent_sequence))
            except (ValueError, IndexError, TypeError):
                # Unparsable label, position outside the sequence or a
                # non-sequence true variant: report the call and move on
                print(mutation)
        corrects.append(count_correct)
        incorrects.append(count_incorrect)
    variant_df['correct'] = corrects
    variant_df['incorrect'] = incorrects
    correct_counts = np.array(corrects, dtype=float)
    totals = correct_counts + np.array(incorrects, dtype=float)
    # Wells without any scored call get NaN instead of a 0/0 runtime warning
    accuracy = np.full(len(corrects), np.nan)
    np.divide(correct_counts, totals, out=accuracy, where=totals > 0)
    variant_df['accuracy'] = accuracy
    return variant_df
|
levseq/user.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
###############################################################################
|
|
2
|
+
# #
|
|
3
|
+
# This program is free software: you can redistribute it and/or modify #
|
|
4
|
+
# it under the terms of the GNU General Public License as published by #
|
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or #
|
|
6
|
+
# (at your option) any later version. #
|
|
7
|
+
# #
|
|
8
|
+
# This program is distributed in the hope that it will be useful, #
|
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
11
|
+
# GNU General Public License for more details. #
|
|
12
|
+
# #
|
|
13
|
+
# You should have received a copy of the GNU General Public License #
|
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>. #
|
|
15
|
+
# #
|
|
16
|
+
###############################################################################
|
|
17
|
+
import random
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
|
|
21
|
+
from levseq.variantcaller import *
|
|
22
|
+
from Bio import AlignIO
|
|
23
|
+
from sklearn.decomposition import PCA
|
|
24
|
+
|
|
25
|
+
"""
|
|
26
|
+
Functions for interfacing with users.
|
|
27
|
+
|
|
28
|
+
1. Converting the final sequences to MSA
|
|
29
|
+
2. Validation that same sequences got the same fitness
|
|
30
|
+
3. Info/summaries of which mutations performed well
|
|
31
|
+
4. Encoding etc. for the sequences
|
|
32
|
+
5. Generating figures.
|
|
33
|
+
|
|
34
|
+
Nonpolar (Hydrophobic)
|
|
35
|
+
Alanine (Ala, A) - Nonpolar, aliphatic side chain
|
|
36
|
+
Isoleucine (Ile, I) - Nonpolar, aliphatic side chain
|
|
37
|
+
Leucine (Leu, L) - Nonpolar, aliphatic side chain
|
|
38
|
+
Methionine (Met, M) - Nonpolar, sulfur-containing side chain
|
|
39
|
+
Phenylalanine (Phe, F) - Nonpolar, aromatic side chain
|
|
40
|
+
Proline (Pro, P) - Nonpolar, cyclic aliphatic side chain
|
|
41
|
+
Tryptophan (Trp, W) - Nonpolar, aromatic side chain
|
|
42
|
+
Valine (Val, V) - Nonpolar, aliphatic side chain
|
|
43
|
+
Polar, Uncharged
|
|
44
|
+
Asparagine (Asn, N) - Polar, amide-containing side chain
|
|
45
|
+
Cysteine (Cys, C) - Polar, sulfur-containing side chain
|
|
46
|
+
Glutamine (Gln, Q) - Polar, amide-containing side chain
|
|
47
|
+
Serine (Ser, S) - Polar, hydroxyl-containing side chain
|
|
48
|
+
Threonine (Thr, T) - Polar, hydroxyl-containing side chain
|
|
49
|
+
Tyrosine (Tyr, Y) - Polar, aromatic side chain with a hydroxyl group
|
|
50
|
+
Polar, Acidic (Negatively Charged at Physiological pH)
|
|
51
|
+
Aspartic Acid (Asp, D) - Acidic, carboxylate-containing side chain
|
|
52
|
+
Glutamic Acid (Glu, E) - Acidic, carboxylate-containing side chain
|
|
53
|
+
Polar, Basic (Positively Charged at Physiological pH)
|
|
54
|
+
Arginine (Arg, R) - Basic, contains a guanidinium group
|
|
55
|
+
Histidine (His, H) - Basic, contains an imidazole group
|
|
56
|
+
Lysine (Lys, K) - Basic, contains an amino group
|
|
57
|
+
Special Cases
|
|
58
|
+
Glycine (Gly, G) - The simplest amino acid, with a hydrogen as its side chain. It's often classified as nonpolar due to its minimal side chain, but its small size allows it to fit into both polar and nonpolar environments, making it quite versatile.
|
|
59
|
+
"""
|
|
60
|
+
|
|
61
|
+
# The 20 standard amino acids as one-letter codes
amino_acids = 'GAVCPLIMWFKRHSTYNQDE'
# Look-up table from each one-letter code to its position in `amino_acids`
aa_to_index = dict(zip(amino_acids, range(len(amino_acids))))
|
|
65
|
+
# init 0 as default value
|
|
66
|
+
|
|
67
|
+
from sciutil import SciUtil
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# Shared SciUtil instance, used only for pretty-printing status messages.
|
|
71
|
+
u = SciUtil()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def convert_variant_df_to_seqs(variant_df, parent_seq):
    """
    Convert the variant dataframe to protein sequences ready for a MSA.

    For every row the mutations of the ``Variant`` label (joined by ``_``,
    written like ``A1T`` or ``A1DEL`` with 1-based positions) are applied to
    a copy of the parent nucleotide sequence; deleted bases are removed and
    the result is translated. A label containing ``PARENT`` leaves the parent
    unchanged. Rows whose label is not a string are printed and skipped.

    :param variant_df: dataframe with the columns ``Plate``, ``Well`` and
        ``Variant``.
    :param parent_seq: the parent nucleotide sequence.
    :return: ``(seqs, seq_ids)``; the first entry is always the translated
        parent with the ID ``PARENT``, the other IDs are ``"<plate> <well>"``.
    """
    # Get the sequence from the DF, using the reference we'll convert this to the aa
    seqs = [translate(parent_seq)]  # always start with the parent
    seq_ids = ['PARENT']
    for plate, well, predicted_variant in variant_df[['Plate', 'Well', 'Variant']].values:
        label = f'{plate} {well}'
        if not isinstance(predicted_variant, str):
            print(label, predicted_variant)
            continue
        seq = list(parent_seq)
        # Reset for every well: the error report below must never hit an
        # unbound name or show a position left over from a previous well
        mut_pos = None
        try:
            for mutation in predicted_variant.split('_'):
                if 'PARENT' in mutation:
                    continue  # We skip this!
                if 'DEL' in mutation:
                    mut_pos = int(mutation[1:].replace('DEL', '')) - 1
                    seq[mut_pos] = '-'  # We'll remove these later
                else:
                    mut_pos = int(mutation[1:-1]) - 1  # A1T
                    seq[mut_pos] = mutation[-1]
        except (ValueError, IndexError) as e:
            # Keep the mutations applied so far and report the broken label
            print(e)
            print(plate, well, mut_pos, len(predicted_variant), len(parent_seq))
        seqs.append(translate(''.join(seq).replace('-', '')))  # Remove the gaps and translate the sequence
        seq_ids.append(label)
    return seqs, seq_ids
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def get_colour(aa):
    """
    Return the colour for an amino acid.

    Placeholder: the argument is ignored and 0 is returned for every input.
    """
    placeholder_colour = 0
    return placeholder_colour
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def make_msa(seqs, seq_ids, file_to_align='/tmp/msa.fa',
             clustalo_path='../software/./clustal-omega-1.2.3-macosx'):
    """
    Write the sequences to a FASTA file, align them with Clustal Omega and
    read the alignment back.

    Potentially change this so that the MSA file has a unique time stamp so
    that concurrent runs do not overwrite each other's files.

    :param seqs: sequences to align; entries that are not non-empty strings
        are left out.
    :param seq_ids: FASTA header for each entry of ``seqs``.
    :param file_to_align: path of the FASTA file that is written; the
        alignment is written next to it with ``_msa`` added before the
        extension.
    :param clustalo_path: path of the Clustal Omega executable.
    :return: the alignment as read by ``AlignIO.read(..., 'fasta')``.
    :raises subprocess.CalledProcessError: if Clustal Omega exits with an
        error, rather than continuing with a missing or stale alignment.
    """
    import subprocess  # local import keeps the module-level imports untouched

    with open(file_to_align, 'w') as fout:
        for i, seq in enumerate(seqs):
            if isinstance(seq, str) and len(seq) > 0:
                fout.write(f'>{seq_ids[i]}\n{seq}\n')

    # Derive the output name from the extension only, so that it can never
    # equal the input path (which --force would overwrite)
    root, extension = os.path.splitext(file_to_align)
    msa_file = f'{root}_msa{extension or ".fa"}'
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact
    subprocess.run([clustalo_path, '--force', '-i', file_to_align, '-o', msa_file, '-v'], check=True)
    u.dp(['Done MSA'])
    # Reading the alignment file
    return AlignIO.read(msa_file, 'fasta')
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def make_pca(encoded_sequences):
    """
    Fit a two-component PCA on the encoded sequences and return their
    coordinates in the reduced space.
    """
    return PCA(n_components=2).fit_transform(encoded_sequences)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def one_hot_encode(sequence):
    """
    One-hot encode a protein sequence into a flat integer vector.

    Each residue becomes a block of ``len(amino_acids)`` entries with a
    single 1 at the residue's index; symbols without an index (including
    'X' and '?', which are first turned into '-') stay all zero.
    """
    cleaned = sequence.replace('X', '-').replace('?', '-')

    # One row per residue, one column per amino acid
    encoding = np.zeros((len(cleaned), len(amino_acids)), dtype=int)
    for row, residue in enumerate(cleaned):
        column = aa_to_index.get(residue)
        if column is not None:
            encoding[row, column] = 1

    return encoding.flatten()
|