levseq 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
levseq/simulation.py ADDED
@@ -0,0 +1,311 @@
1
+ ###############################################################################
2
+ # #
3
+ # This program is free software: you can redistribute it and/or modify #
4
+ # it under the terms of the GNU General Public License as published by #
5
+ # the Free Software Foundation, either version 3 of the License, or #
6
+ # (at your option) any later version. #
7
+ # #
8
+ # This program is distributed in the hope that it will be useful, #
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
11
+ # GNU General Public License for more details. #
12
+ # #
13
+ # You should have received a copy of the GNU General Public License #
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>. #
15
+ # #
16
+ ###############################################################################
17
+ # Import all packages
18
+ import random
19
+
20
+ from levseq.variantcaller import *
21
+ import math
22
+
23
+
24
def get_dummy_plate_df(plate_name='Plate', well_name='Well', number_of_wells=96):
    """
    Build a placeholder plate table with one row per well.

    Every row starts out with the same default values and the rows are indexed
    0..number_of_wells - 1 under an index named 'index'.
    Columns (in order): Plate, Well, Path, Alignment_count, P value, Mixed Well, Variant,
    mutation, frequency, P adj., True Variant.
    """
    # (column, default value) pairs, listed in the order the columns must appear
    column_defaults = [
        ('Plate', plate_name),
        ('Well', well_name),
        ('Path', ''),
        ('Alignment_count', 0),
        ('P value', 1.0),
        ('Mixed Well', False),
        ('Variant', ''),
        ('mutation', ''),
        ('frequency', 0),
        ('P adj.', 0),
        ('True Variant', ''),
    ]
    plate_df = pd.DataFrame(list(range(number_of_wells)), columns=['index'])
    for column, default in column_defaults:
        plate_df[column] = default
    return plate_df.set_index('index')
43
+
44
+
45
def mutate_sequence(sequence, mutation_frequency, bases=None):
    """
    Return a copy of `sequence` in which every position has been substituted with
    probability `mutation_frequency`.

    A substituted position receives a symbol drawn uniformly from `bases`, excluding the symbol
    that is currently there. When `bases` is None the alphabet is A, T, G, C plus '-', so a
    substitution can also act as a deletion.
    """
    if bases is None:
        bases = ['A', 'T', 'G', 'C', '-']  # '-' is included so deletions can be simulated

    def maybe_substitute(current):
        # One uniform draw decides whether this position changes at all
        if random.random() < mutation_frequency:
            # Pick a replacement that differs from the current symbol
            return random.choice([candidate for candidate in bases if candidate != current])
        return current

    return ''.join(maybe_substitute(current) for current in sequence)
63
+
64
+
65
def insert_nt(original_nt_seq, protein_mutations, codon_usage):
    """
    Write amino-acid substitutions into a nucleotide sequence.

    `protein_mutations` is an iterable of (position, amino_acid) pairs. For each pair the codon
    stored in `codon_usage` under that amino acid overwrites the three nucleotides starting at
    position * 3. Returns the edited sequence as a string.
    """
    nucleotides = list(original_nt_seq)
    for aa_position, replacement_aa in protein_mutations:
        codon_start = aa_position * 3  # The codon is assumed to begin at this offset
        replacement_codon = codon_usage[replacement_aa]
        nucleotides[codon_start:codon_start + 3] = list(replacement_codon)
    return ''.join(nucleotides)
80
+
81
+
82
def generate_ssm_library(positions, parent_sequence_aa, parent_sequence_nt, codon_usage):
    """
    Generate a site-saturation mutagenesis (SSM) library for a parent sequence.

    For every position in `positions` and every standard amino acid that differs from the
    parent residue at that position, one nucleotide variant is produced in which only the
    codon at that position (nucleotides position * 3 .. position * 3 + 2) is replaced.

    :param positions: iterable of 0-based amino-acid positions to saturate
    :param parent_sequence_aa: parent protein sequence, indexed by those positions
    :param parent_sequence_nt: parent nucleotide sequence
    :param codon_usage: mapping from one-letter amino acid to the codon to insert
    :return: list of mutated nucleotide sequences, ordered by position and then amino acid
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'  # 20 standard amino acids
    library = []
    for position in positions:
        nt_pos = position * 3  # Assuming the codon starts at this position
        for aa in amino_acids:
            if parent_sequence_aa[position] == aa:
                continue  # i.e. don't duplicate the original
            # Start from a fresh copy of the parent for every variant so that substitutions
            # made for earlier variants/positions do not leak into this one.
            variant = list(parent_sequence_nt)
            variant[nt_pos:nt_pos + 3] = list(codon_usage[aa])
            library.append(''.join(variant))
    return library
103
+
104
+
105
def make_experiment(run_label, read_depth, sequencing_error_rate, parent_sequence, library_number,
                    number_of_wells, epcr_mutation_rate, frequency_cutoff=0.5, number_wells_to_mix=0,
                    mixture_rate=0, qc_files_path=None):
    """
    Simulate an error-prone PCR plate, call a variant for every simulated well and score the calls.

    :param run_label: used as the plate name and as the prefix of the QC file names
    :param read_depth: number of reads simulated per well (also recorded as Alignment_count)
    :param sequencing_error_rate: per-base error rate applied when simulating reads
    :param parent_sequence: parent nucleotide sequence the library is derived from
    :param library_number: number of library members to simulate
    :param number_of_wells: number of rows in the returned plate table
    :param epcr_mutation_rate: per-base mutation rate used to build the library
    :param frequency_cutoff: passed through to get_variant_label_for_well
    :param number_wells_to_mix: when > 0 the mixed-well simulation is used instead of the plain one
    :param mixture_rate: passed through to the mixed-well simulation
    :param qc_files_path: when not None, a .fa and a .csv file per well are written into this directory
    :return: the plate table after check_variants, with P adj. and 'True Mixed Well' filled in

    NOTE(review): only rows for simulated library members are filled in; any remaining rows of
    the plate table keep the defaults from get_dummy_plate_df. The helpers make_well_df_from_reads,
    make_row_from_read_pileup_across_well, calculate_mutation_significance_across_well,
    get_variant_label_for_well, multipletests and tqdm are not defined in this file (presumably
    they come from the levseq.variantcaller star import) - confirm their contracts there.
    """
    # Make a full experiment setup
    mixed_wells = None
    if number_wells_to_mix > 0:
        # mixed_wells tells us which wells are truly mixed
        mutated_sequence, mixed_wells = make_mixed_well_epcr_de_experiment(read_depth, sequencing_error_rate, parent_sequence, library_number,
                                                                           epcr_mutation_rate, number_wells_to_mix, mixture_rate)
    else:
        mutated_sequence = make_epcr_de_experiment(read_depth, sequencing_error_rate, parent_sequence, library_number,
                                                   epcr_mutation_rate)

    variant_df = get_dummy_plate_df(run_label, 'Well', number_of_wells)
    # Per-well pileup tables keyed by '<sequence>_<well>'; built here but not returned
    mutant_to_well_df = {}
    current_well = 0
    variant_df['True Mixed Well'] = False
    # mutated_sequence maps each library sequence to its list of simulated reads
    for mutant in tqdm(mutated_sequence):
        parent_name = 'Parent'
        reads = []
        read_ids = []
        quals = []
        for i, seq in enumerate(mutated_sequence[mutant]):
            read_ids.append(f'read_{i}')
            reads.append(seq)
            quals.append(100)  # Dummy don't need

        well_df = make_well_df_from_reads(reads, read_ids, quals)
        rows_all = make_row_from_read_pileup_across_well(well_df, parent_sequence, parent_name)
        # Replace the per-read table with the per-position pileup table
        well_df = pd.DataFrame(rows_all)
        well_df.columns = ['gene_name', 'position', 'ref', 'most_frequent', 'freq_non_ref', 'total_other',
                           'total_reads', 'p_value', 'percent_most_freq_mutation', 'A', 'p(a)', 'T', 'p(t)', 'G',
                           'p(g)', 'C', 'p(c)', 'N', 'p(n)']
        well_df = calculate_mutation_significance_across_well(well_df)
        if qc_files_path is not None:
            # Save QC data
            qc_well_df = make_well_df_for_saving(reads, read_ids, quals)
            write_msa_for_df(qc_well_df, well_df, parent_name, parent_sequence,
                             os.path.join(qc_files_path, f'{run_label}_{current_well}.fa'),
                             os.path.join(qc_files_path, f'{run_label}_{current_well}.csv'))
        label, frequency, combined_p_value, mixed_well = get_variant_label_for_well(well_df, frequency_cutoff)
        mutant_to_well_df[f'{mutant}_{current_well}'] = well_df
        # Record the call for this well next to the ground truth (the simulated library sequence)
        variant_df.at[current_well, "Mixed Well"] = mixed_well
        variant_df.at[current_well, "Variant"] = label
        variant_df.at[current_well, "True Variant"] = mutant
        variant_df.at[current_well, "frequency"] = frequency
        variant_df.at[current_well, "P value"] = combined_p_value
        variant_df.at[current_well, "Well"] = f'Well {current_well}'
        variant_df.at[current_well, "Alignment_count"] = read_depth
        if mixed_wells is not None:
            variant_df.at[current_well, "True Mixed Well"] = mixed_wells[mutant]  # Save this as a true mixed well
        current_well += 1

    # Before returning adjust the p-values (Benjamini-Hochberg FDR across all rows of the plate)
    variant_df['P adj.'] = multipletests(list(variant_df["P value"].values), alpha=0.05, method='fdr_bh')[1]
    # Also get the accuracy
    variant_df = check_variants(variant_df, parent_sequence)
    return variant_df
163
+
164
+
165
def make_well_df_for_saving(seqs, read_ids, read_quals):
    """
    Build a per-read table for one well, keeping a single row per read ID.

    Each read is split into one column per position (columns 0..n-1), followed by the columns
    'read_id', 'read_qual' and 'seqs' (the unsplit read). Rows are ordered from highest to
    lowest quality and, for a repeated read ID, only the highest-quality row is kept.
    """
    per_position = pd.DataFrame([list(read) for read in seqs])  # One column per base position
    return (
        per_position
        .assign(read_id=read_ids, read_qual=read_quals, seqs=seqs)
        .sort_values(by='read_qual', ascending=False)  # Best quality first ...
        .drop_duplicates(subset=['read_id'], keep='first')  # ... so 'first' keeps the best read
    )
179
+
180
+
181
def write_msa_for_df(reads_across_well_df, well_df, parent_name, parent_sequence, msa_path, df_path):
    """
    Write QC files for one well so the data can be checked by hand.

    :param reads_across_well_df: table with a 'read_id' and a 'seqs' column, one row per read
    :param well_df: per-well table that is saved as CSV to df_path
    :param parent_name: FASTA header used for the reference record
    :param parent_sequence: reference sequence, written as the first FASTA record
    :param msa_path: FASTA output path; when None no FASTA file is written
    :param df_path: CSV output path for well_df
    """
    read_ids = reads_across_well_df['read_id']
    seqs = reads_across_well_df['seqs']
    # Check if we want to write a MSA
    if msa_path is not None:
        with open(msa_path, 'w') as fout:
            # Write the reference first
            fout.write(f'>{parent_name}\n{parent_sequence}\n')
            # Pair ids and reads row by row. Indexing read_ids with the loop counter would be a
            # label lookup, which mismatches (or raises KeyError) once the table has been
            # sorted or de-duplicated and its index is no longer 0..n-1 in order.
            for read_id, seq in zip(read_ids, seqs):
                fout.write(f'>{read_id}\n{"".join(seq)}\n')
    well_df.to_csv(df_path)
194
+
195
+
196
def generate_epcr_library(parent_sequence, mutation_rate, library_number):
    """
    Simulate error-prone PCR: make `library_number` independently mutated copies of the parent.

    Only substitutions between A, T, G and C are introduced (no deletions).
    """
    nucleotides = ['A', 'T', 'G', 'C']
    library = []
    for _ in range(library_number):
        library.append(mutate_sequence(parent_sequence, mutation_rate, nucleotides))
    return library
201
+
202
+
203
def make_epcr_de_experiment(read_depth, sequencing_error_rate, parent_sequence, library_number, epcr_mutation_rate):
    """
    Simulate sequencing reads for an error-prone PCR library.

    `library_number` would normally be the number of wells. Returns a dict that maps each
    library sequence to a list of `read_depth` reads, each read being that sequence with
    sequencing errors applied at `sequencing_error_rate`.
    """
    # First make the library, then simulate the reads for every member of it
    library = generate_epcr_library(parent_sequence, epcr_mutation_rate, library_number)
    return {
        variant: [mutate_sequence(variant, sequencing_error_rate) for _ in range(read_depth)]
        for variant in library
    }
215
+
216
+
217
def make_ssm_de_experiment(read_depth, sequencing_error_rate, parent_sequence, positions, parent_sequence_aa,
                           codon_usage):
    """
    Simulate sequencing reads for a site-saturation mutagenesis library.

    The library is built by generate_ssm_library from the given positions. Returns a dict in
    which the key is the library sequence and the value is a list of `read_depth` reads, each
    read being that sequence with sequencing errors applied at `sequencing_error_rate`.
    """
    ssm_variants = generate_ssm_library(positions, parent_sequence_aa, parent_sequence, codon_usage)
    return {
        variant: [mutate_sequence(variant, sequencing_error_rate) for _ in range(read_depth)]
        for variant in ssm_variants
    }
230
+
231
+
232
def make_mixed_well_epcr_de_experiment(read_depth, sequencing_error_rate, parent_sequence, library_number,
                                       epcr_mutation_rate, number_wells_to_mix, mixture_rate):
    """
    Make a mixed well experiment to test code with.

    An error-prone PCR library is simulated as in make_epcr_de_experiment, then
    `number_wells_to_mix` wells are picked at random and each of them has the first
    floor(read_depth * mixture_rate) of its reads replaced by reads of another picked well.

    :return: (reads_per_well, reads_mutated_label) where reads_per_well maps each library
             sequence to its reads and reads_mutated_label maps it to True when foreign reads
             were doped into that well
    :raises ValueError: from random.sample when number_wells_to_mix exceeds the number of
                        distinct library sequences
    """
    # First make the library
    library = generate_epcr_library(parent_sequence, epcr_mutation_rate, library_number)
    # For each library member, simulate the reads with a sequencing error rate
    reads_per_well = {}
    reads_mutated_label = {}
    for seq in library:
        reads_per_well[seq] = [mutate_sequence(seq, sequencing_error_rate) for _ in range(read_depth)]
        reads_mutated_label[seq] = False

    wells_to_mix = random.sample(list(reads_per_well.keys()), number_wells_to_mix)
    # Keep the undoped reads of every well involved. Doping from the live lists could hand a
    # well its own reads back (A takes B's reads, then B takes them back from A) while still
    # labelling it as mixed.
    clean_reads = {seq: list(reads_per_well[seq]) for seq in wells_to_mix}
    # Number of reads to swap in, kept within [0, read_depth]
    number_to_add_in = max(0, min(read_depth, math.floor(read_depth * mixture_rate)))
    if number_to_add_in == 0:
        # Nothing would be doped in, so no well is truly mixed
        return reads_per_well, reads_mutated_label

    for well_seq in wells_to_mix:
        # For each well, randomly select one of the *other* wells, so it is never its own partner
        partners = [seq for seq in wells_to_mix if seq != well_seq]
        if not partners:
            continue  # A single well cannot be mixed with anything
        dope_in_seq = random.choice(partners)
        # Just make the top X reads the other doped in seq
        reads_per_well[well_seq][:number_to_add_in] = clean_reads[dope_in_seq][:number_to_add_in]
        reads_mutated_label[well_seq] = True

    return reads_per_well, reads_mutated_label
261
+
262
+
263
def check_variants(variant_df, parent_sequence):
    """
    Score every called variant against the true (simulated) sequence of its well.

    'Variant' holds the call as '_'-joined mutation labels (e.g. 'A1T', 'A1DEL', or a label
    containing 'PARENT'), while 'True Variant' holds the full true sequence. A parent call is
    scored base by base against parent_sequence; any other label is scored as one mutation at
    its 1-based position. Labels that cannot be parsed or that point outside the sequence are
    printed and skipped.

    :param variant_df: table with 'Variant' and 'True Variant' columns; modified in place
    :param parent_sequence: parent nucleotide sequence
    :return: variant_df with 'correct', 'incorrect' and 'accuracy' columns added; accuracy is
             NaN for rows where nothing could be scored
    """
    corrects = []
    incorrects = []
    for predicted_variant, true_variant in variant_df[['Variant', 'True Variant']].values:
        count_correct = 0
        count_incorrect = 0
        if not isinstance(predicted_variant, str):
            # No call for this well (e.g. NaN), so there is nothing to score
            corrects.append(count_correct)
            incorrects.append(count_incorrect)
            continue
        for mutation in predicted_variant.split('_'):
            try:
                if 'PARENT' in mutation:
                    # Called as parent: every base of the true sequence should match the parent
                    for i, true_base in enumerate(true_variant):
                        if true_base == parent_sequence[i]:
                            count_correct += 1
                        else:
                            count_incorrect += 1
                else:
                    # true_variant is a sequence while predicted variant is just the mutations
                    if 'DEL' not in mutation:
                        mut_pos = int(mutation[1:-1])  # A1T
                        mut = mutation[-1]
                    else:
                        mut_pos = int(mutation[1:].replace('DEL', ''))
                        # A deleted base is stored as '-' in the sequence, so compare with that
                        # and not with the label text 'DEL'
                        mut = '-'
                    if true_variant[mut_pos - 1] == mut:
                        count_correct += 1
                    else:
                        count_incorrect += 1
                        # NOTE(review): this reference check is assumed to belong to the
                        # incorrect branch - confirm against the upstream source
                        try:
                            if parent_sequence[mut_pos - 1] != mutation[0]:
                                print("WARNING!")
                        except IndexError:
                            print(mut_pos, len(parent_sequence))
            except (ValueError, IndexError, TypeError):
                # Unparsable label, position outside the sequence, or a non-string true variant
                print(mutation)
        corrects.append(count_correct)
        incorrects.append(count_incorrect)
    variant_df['correct'] = corrects
    variant_df['incorrect'] = incorrects
    correct_counts = np.array(corrects, dtype=float)
    totals = correct_counts + np.array(incorrects, dtype=float)
    # Rows with nothing scored get NaN without triggering a divide-by-zero warning
    variant_df['accuracy'] = np.divide(correct_counts, totals, out=np.full(totals.shape, np.nan),
                                       where=totals > 0)
    return variant_df
levseq/user.py ADDED
@@ -0,0 +1,157 @@
1
+ ###############################################################################
2
+ # #
3
+ # This program is free software: you can redistribute it and/or modify #
4
+ # it under the terms of the GNU General Public License as published by #
5
+ # the Free Software Foundation, either version 3 of the License, or #
6
+ # (at your option) any later version. #
7
+ # #
8
+ # This program is distributed in the hope that it will be useful, #
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
11
+ # GNU General Public License for more details. #
12
+ # #
13
+ # You should have received a copy of the GNU General Public License #
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>. #
15
+ # #
16
+ ###############################################################################
17
+ import random
18
+
19
+ import numpy as np
20
+
21
+ from levseq.variantcaller import *
22
+ from Bio import AlignIO
23
+ from sklearn.decomposition import PCA
24
+
25
+ """
26
+ Functions for interfacing with users.
27
+
28
+ 1. Converting the final sequences to MSA
29
+ 2. Validation that same sequences got the same fitness
30
+ 3. Info/summaries of which mutations performed well
31
+ 4. Encoding etc. for the sequences
32
+ 5. Generating figures.
33
+
34
+ Nonpolar (Hydrophobic)
35
+ Alanine (Ala, A) - Nonpolar, aliphatic side chain
36
+ Isoleucine (Ile, I) - Nonpolar, aliphatic side chain
37
+ Leucine (Leu, L) - Nonpolar, aliphatic side chain
38
+ Methionine (Met, M) - Nonpolar, sulfur-containing side chain
39
+ Phenylalanine (Phe, F) - Nonpolar, aromatic side chain
40
+ Proline (Pro, P) - Nonpolar, cyclic aliphatic side chain
41
+ Tryptophan (Trp, W) - Nonpolar, aromatic side chain
42
+ Valine (Val, V) - Nonpolar, aliphatic side chain
43
+ Polar, Uncharged
44
+ Asparagine (Asn, N) - Polar, amide-containing side chain
45
+ Cysteine (Cys, C) - Polar, sulfur-containing side chain
46
+ Glutamine (Gln, Q) - Polar, amide-containing side chain
47
+ Serine (Ser, S) - Polar, hydroxyl-containing side chain
48
+ Threonine (Thr, T) - Polar, hydroxyl-containing side chain
49
+ Tyrosine (Tyr, Y) - Polar, aromatic side chain with a hydroxyl group
50
+ Polar, Acidic (Negatively Charged at Physiological pH)
51
+ Aspartic Acid (Asp, D) - Acidic, carboxylate-containing side chain
52
+ Glutamic Acid (Glu, E) - Acidic, carboxylate-containing side chain
53
+ Polar, Basic (Positively Charged at Physiological pH)
54
+ Arginine (Arg, R) - Basic, contains a guanidinium group
55
+ Histidine (His, H) - Basic, contains an imidazole group
56
+ Lysine (Lys, K) - Basic, contains an amino group
57
+ Special Cases
58
+ Glycine (Gly, G) - The simplest amino acid, with a hydrogen as its side chain. It's often classified as nonpolar due to its minimal side chain, but its small size allows it to fit into both polar and nonpolar environments, making it quite versatile.
59
+ """
60
+
61
# The 20 standard amino acids; a residue's position in this string is its one-hot column
amino_acids = 'GAVCPLIMWFKRHSTYNQDE'
# Lookup table: one-letter amino acid code -> column index
aa_to_index = dict(zip(amino_acids, range(len(amino_acids))))
65
+ # init 0 as default value
66
+
67
+ from sciutil import SciUtil
68
+
69
+
70
# Just pretty printing: shared SciUtil instance, used below for status output via u.dp(...)
u = SciUtil()
72
+
73
+
74
def convert_variant_df_to_seqs(variant_df, parent_seq):
    """
    Converts the variant DF to translated sequences that can be written to a MSA.

    For every row the mutations in 'Variant' ('_'-joined labels such as 'A1T' or 'A1DEL', with
    1-based positions) are applied to a copy of the parent nucleotide sequence. Deleted bases
    are removed and the result is passed to translate(). Rows whose variant is not a string are
    printed and skipped. If a label cannot be applied, the problem is printed and the sequence
    is translated with the mutations applied up to that point.

    :param variant_df: table with 'Plate', 'Well' and 'Variant' columns
    :param parent_seq: parent nucleotide sequence
    :return: (seqs, seq_ids); the first entry is always the translated parent with id 'PARENT',
             the other ids are '<plate> <well>'
    """
    seqs = [translate(parent_seq)]  # always start with the parent
    seq_ids = ['PARENT']
    for plate, well, predicted_variant in variant_df[['Plate', 'Well', 'Variant']].values:
        label = f'{plate} {well}'
        if not isinstance(predicted_variant, str):
            print(label, predicted_variant)
            continue
        seq = list(parent_seq)
        mut_pos = None
        try:
            for mutation in predicted_variant.split('_'):
                # Reset per label so the handler below never reports a stale or unbound position
                mut_pos = None
                if 'PARENT' in mutation:
                    continue  # We skip this!
                # The predicted variant is just the mutations, e.g. A1T or A1DEL
                if 'DEL' in mutation:
                    mut_pos = int(mutation[1:].replace('DEL', '')) - 1
                    mut = '-'  # We'll remove these later
                else:
                    mut_pos = int(mutation[1:-1]) - 1  # A1T
                    mut = mutation[-1]
                if mut_pos < 0:
                    # Positions are 1-based; 0 would otherwise silently edit the last base
                    raise IndexError(f'mutation position out of range in {mutation}')
                # This is the mutation at that point so we add it to our seq
                seq[mut_pos] = mut
        except (ValueError, IndexError) as e:
            print(e)
            print(plate, well, mut_pos, len(predicted_variant), len(parent_seq))
        seqs.append(translate(''.join(seq).replace('-', '')))  # Remove the gaps and translate the sequence
        seq_ids.append(label)
    return seqs, seq_ids
110
+
111
+
112
def get_colour(aa):
    """
    Get colour for an amino acid sequence.

    Placeholder: the argument is currently ignored and 0 is returned for every input.
    """
    return 0
117
+
118
+
119
def make_msa(seqs, seq_ids, file_to_align='/tmp/msa.fa'):
    """
    Write the sequences to a FASTA file, align them with Clustal Omega and return the alignment.

    Potentially change this so that the MSA file has a unique time stamp so that we don't get
    overwriting issues.

    :param seqs: sequences to align; entries that are not non-empty strings are left out
    :param seq_ids: FASTA ids, one per entry of seqs
    :param file_to_align: path of the FASTA file to write; the alignment is written next to it
                          with '_msa' inserted before the file extension
    :return: the alignment as read by Bio.AlignIO
    :raises subprocess.CalledProcessError: if Clustal Omega exits with a non-zero status
    """
    import subprocess  # Local import so this function does not depend on the file-level imports

    with open(file_to_align, 'w') as fout:
        for i, seq in enumerate(seqs):
            if isinstance(seq, str) and len(seq) > 0:
                fout.write(f'>{seq_ids[i]}\n{seq}\n')

    # Now make the msa. Build the output name from the extension rather than with str.replace so
    # it can never be the same as the input file (which --force would then overwrite).
    root, ext = os.path.splitext(file_to_align)
    msa_file = f'{root}_msa{ext or ".fa"}'
    # Pass the arguments as a list (no shell), so paths with spaces or shell characters are safe.
    # check=True stops here on failure instead of reading a stale alignment from an earlier run.
    subprocess.run(['../software/./clustal-omega-1.2.3-macosx', '--force', '-i', file_to_align,
                    '-o', msa_file, '-v'], check=True)
    u.dp(['Done MSA'])
    # Reading the alignment file
    return AlignIO.read(msa_file, 'fasta')
136
+
137
+
138
def make_pca(encoded_sequences):
    """
    Fit a two-component PCA on the encoded sequences and return their 2D projection.
    """
    return PCA(n_components=2).fit_transform(encoded_sequences)
142
+
143
+
144
def one_hot_encode(sequence):
    """
    One-hot encode an amino-acid sequence and return it as a flat integer vector.

    Each residue becomes a block of len(amino_acids) values with a single 1 at the residue's
    index in `amino_acids`. Symbols without an index (gaps, 'X', '?', ...) give an all-zero block.
    """
    cleaned = sequence.replace('X', '-').replace('?', '-')

    # One row per residue, one column per amino acid
    encoding = np.zeros((len(cleaned), len(amino_acids)), dtype=int)
    for row, residue in enumerate(cleaned):
        column = aa_to_index.get(residue)
        if column is None:
            continue  # Unknown symbols keep their all-zero row
        encoding[row, column] = 1

    return encoding.flatten()