levseq 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
levseq/utils.py ADDED
@@ -0,0 +1,474 @@
1
+ ###############################################################################
2
+ # #
3
+ # This program is free software: you can redistribute it and/or modify #
4
+ # it under the terms of the GNU General Public License as published by #
5
+ # the Free Software Foundation, either version 3 of the License, or #
6
+ # (at your option) any later version. #
7
+ # #
8
+ # This program is distributed in the hope that it will be useful, #
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
11
+ # GNU General Public License for more details. #
12
+ # #
13
+ # You should have received a copy of the GNU General Public License #
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>. #
15
+ # #
16
+ ###############################################################################
17
+ # Import all packages
18
+ import pandas as pd
19
+ import pysam
20
+ import os
21
+ import numpy as np
22
+ from copy import deepcopy
23
+ from collections import defaultdict
24
+ from scipy.stats import binomtest
25
+ from statsmodels.stats.multitest import multipletests
26
+ from pathlib import Path
27
+ from scipy.stats import combine_pvalues
28
+ from Bio import SeqIO
29
+ from Bio.PDB.Polypeptide import aa1
30
+
31
# One representative codon per amino-acid one-letter code, with '*'
# mapping to a stop codon (TAA). Used when back-translating a protein
# change into DNA. NOTE(review): the specific codon choices look like a
# fixed preference table — confirm the intended codon-usage bias.
amino_acid_to_codon = {
    'A': 'GCT', 'R': 'CGT', 'N': 'AAT', 'D': 'GAT', 'C': 'TGT',
    'Q': 'CAA', 'E': 'GAA', 'G': 'GGT', 'H': 'CAT', 'I': 'ATT',
    'L': 'CTT', 'K': 'AAA', 'M': 'ATG', 'F': 'TTT', 'P': 'CCT',
    'S': 'TCT', 'T': 'ACT', 'W': 'TGG', 'Y': 'TAT', 'V': 'GTT',
    '*': 'TAA'
}
38
+
39
+
40
# List of the standard one-letter amino-acid codes taken from Biopython's
# `aa1` string; deepcopy keeps the list independent of the imported object.
ALL_AAS = deepcopy(list(aa1))
41
+
42
def translate(seq):
    """
    Translate a DNA coding sequence into a protein string using the
    standard genetic code (stop codons are rendered as '*').

    The input is upper-cased first, so lower-case sequences are accepted
    (previously these raised a KeyError).

    Args:
        seq: DNA sequence; length must be a multiple of 3 to be translated.

    Returns:
        The protein string, or '' when len(seq) is not a multiple of 3
        (preserves the original silent behaviour).

    Raises:
        KeyError: if a codon contains a character outside A/C/G/T
            (e.g. ambiguity codes such as 'N').
    """
    table = {
        'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
        'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
        'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
        'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
        'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
        'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
        'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
        'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
        'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
        'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
        'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
        'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*',
        'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W',
    }
    if len(seq) % 3 != 0:
        # Partial codons cannot be translated; keep the historical
        # behaviour of returning an empty string.
        return ""
    seq = seq.upper()
    return "".join(table[seq[i:i + 3]] for i in range(0, len(seq), 3))
67
+
68
+
69
+ # Get mutated sequence by splitting template sequence
70
# Get mutated sequence by splitting template sequence
def get_mut(temp_seq, aa_seq):
    """
    List the differences between a template sequence and a mutated one.

    Positions are compared pairwise up to the length of the shorter
    sequence; any overhang in either sequence is ignored.

    Args:
        temp_seq: template (parent) sequence.
        aa_seq: sequence to compare against the template.

    Returns:
        List of mutation strings formatted as
        '<template_char><1-based position><new_char>', e.g. 'A123T'.
    """
    # The original loop carried a dead `i = i + 1` (overwritten by the
    # for-loop) and materialised the zip twice; enumerate(zip(...)) is
    # the direct equivalent.
    return [
        f'{template_char}{i + 1}{mutated_char}'
        for i, (template_char, mutated_char) in enumerate(zip(temp_seq, aa_seq))
        if template_char != mutated_char
    ]
78
+
79
+
80
def check_demultiplexing(demultiplex_folder: Path, reverse_prefix="RB", forward_prefix="NB", verbose=True):
    """
    Check if the demultiplexing was done correctly. If not, return the user that the sequences were not demultiplexed.

    Args:
        - demultiplex_folder: Path to the folder containing the demultiplexed fastq files
        - reverse_prefix: directory-name prefix identifying reverse-barcode folders
        - forward_prefix: directory-name prefix identifying forward-barcode folders
        - verbose: If True, print the name of each parent folder and the count of child folders

    Return:
        - Tuple: Number of parent folders and child folders
    """
    demultiplex_path = Path(demultiplex_folder)
    parent_folder_count = 0
    child_folder_count = 0

    for child in demultiplex_path.iterdir():
        # startswith accepts a tuple, replacing the original `or` chain.
        if child.is_dir() and child.name.startswith((reverse_prefix, forward_prefix)):
            # List the directory once instead of twice (the original
            # called child.iterdir() a second time just for verbose output).
            # NOTE(review): this counts every entry, files included, not
            # only folders — confirm that is intended.
            entries = list(child.iterdir())
            parent_folder_count += 1
            child_folder_count += len(entries)
            if verbose:
                print(f"Parent folder '{child.name}' contains {len(entries)} folders.")

    return parent_folder_count, child_folder_count
103
+
104
+
105
def get_template_df(plate_numbers: list, barcode_dicts: dict = None, rowwise=True):
    """
    Build a template dataframe so every experiment shares a coherent
    plate/well layout, with plates and wells in the desired order.

    Args:
        plate_numbers: plate identifiers, one per reverse barcode
            (only used when rowwise is False).
        barcode_dicts: mapping of reverse barcodes; its size sets how
            many plates are generated in the plate-wise layout.
        rowwise: when True the reverse barcodes represent rows, so a
            single 96-well plate (hard-coded as plate 1) is produced.

    Returns:
        pd.DataFrame with 'Plate' and 'Well' columns.

    Raises:
        ValueError: if barcode_dicts is None.
    """
    if barcode_dicts is None:
        raise ValueError("No barcode dictionary provided")

    row_labels = ["A", "B", "C", "D", "E", "F", "G", "H"]
    # Row-major well order: A1..A12, B1..B12, ... H12.
    wells = [f"{row}{col}" for row in row_labels for col in range(1, 13)]

    if rowwise:
        # Single plate, always labelled 1 in the row-wise layout.
        plates = [1] * len(wells)
        well_column = list(wells)
    else:
        plates = []
        well_column = []
        for plate_idx in range(len(barcode_dicts.items())):
            plates.extend([plate_numbers[plate_idx]] * len(wells))
            well_column.extend(wells)

    return pd.DataFrame({"Plate": plates, "Well": well_column})
140
+
141
+
142
def get_dummy_plate_df(plate_name='Plate', well_name='Well', number_of_wells=96):
    """
    Build a placeholder plate dataframe: one row per well, with default
    (empty/zero) values in every downstream summary column.

    NOTE(review): the original docstring names columns like
    'P value' / 'P adj. value'; the actual columns produced are
    'P adj.' and 'value' separately — kept as-is for compatibility.

    Args:
        plate_name: value filled into the 'Plate' column.
        well_name: value filled into the 'Well' column.
        number_of_wells: number of rows (wells) to create.

    Returns:
        pd.DataFrame indexed 0..number_of_wells-1 (index named 'index').
    """
    column_defaults = [
        ('Plate', plate_name),
        ('Well', well_name),
        ('Path', ''),
        ('Alignment_count', 0),
        ('P value', 1.0),
        ('Mixed Well', False),
        ('Variant', ''),
        ('mutation', ''),
        ('frequency', 0),
        ('P adj.', 0),
        ('value', 0),
    ]
    df = pd.DataFrame({'index': list(range(number_of_wells))})
    for column, default in column_defaults:
        df[column] = default
    df.set_index('index', inplace=True)
    return df
161
+
162
+
163
def make_well_df_from_reads(seqs, read_ids, read_quals):
    """
    Build a per-position dataframe from a well's reads: one row per read,
    one column per position, de-duplicated on read ID.

    NOTE(review): the quality-based sort is currently commented out
    upstream, so for a duplicated read ID the FIRST occurrence is kept —
    not necessarily the highest-quality one. `read_quals` is accepted
    for interface compatibility but unused.

    Args:
        seqs: aligned read strings, all the same length.
        read_ids: read identifiers, parallel to seqs.
        read_quals: read qualities (currently ignored).

    Returns:
        Dataframe of single characters with integer position columns;
        helper columns removed.
    """
    position_df = pd.DataFrame([list(sequence) for sequence in seqs])
    position_df['read_id'] = read_ids
    position_df['seqs'] = seqs
    # Quality sorting disabled: keep='first' keeps the first-seen read.
    position_df = position_df.drop_duplicates(subset=['read_id'], keep='first')
    return position_df.drop(columns=['read_id', 'seqs'])
177
+
178
+
179
def calculate_mutation_significance_across_well(seq_df):
    """
    Annotate each position in a well with per-base and overall p-values.

    The background error rate is estimated as the mean non-reference
    frequency across all positions (this "smooths" out the induced
    mutations). Each base count at each position is then tested against
    that background with a binomial test, and every p-value family is
    Benjamini-Hochberg corrected into a matching '... adj.' column.

    Args:
        seq_df: per-position dataframe with 'ref', per-base counts
            ('A','T','G','C','N'), 'total_reads', 'total_other' and
            'freq_non_ref' columns. Modified in place and returned.

    Returns:
        The same dataframe with p-value, adjusted p-value,
        'percent_most_freq_mutation' and 'most_frequent' columns filled.
    """
    background_error = np.mean(seq_df['freq_non_ref'].values)
    seq_df.reset_index(inplace=True)
    if background_error > 0.4:
        print('-----------------------------------------')
        print("WARNING!!! Your mean error rate across was too high!!! It was: ", background_error)
        print('-----------------------------------------')

    stat_columns = ['ref', 'A', 'T', 'G', 'C', 'N', 'total_reads', 'total_other']
    for row_idx, counts in enumerate(seq_df[stat_columns].values):
        (actual_seq, val, p_value,
         p_a, p_t, p_g, p_c, p_n) = calc_mutation_significance_for_position_in_well(*counts, background_error)
        seq_df.at[row_idx, 'p(a)'] = p_a
        seq_df.at[row_idx, 'p(t)'] = p_t
        seq_df.at[row_idx, 'p(g)'] = p_g
        seq_df.at[row_idx, 'p(c)'] = p_c
        seq_df.at[row_idx, 'p(n)'] = p_n
        seq_df.at[row_idx, 'p_value'] = p_value
        seq_df.at[row_idx, 'percent_most_freq_mutation'] = val
        seq_df.at[row_idx, 'most_frequent'] = actual_seq

    # Benjamini-Hochberg correction for each family of p-values.
    # ToDo: expose alpha as a parameter instead of hard-coding 0.05.
    for p_column in ['p_value', 'p(a)', 'p(t)', 'p(g)', 'p(c)', 'p(n)']:
        corrected = multipletests(seq_df[p_column].values, alpha=0.05, method='fdr_bh')
        seq_df[f'{p_column} adj.'] = corrected[1]
    return seq_df
216
+
217
+
218
def get_reads_for_well(parent_name, bam_file_path: str, ref_str: str, min_coverage=5, msa_path=None):
    """
    Build a per-position summary of all reads in a well's BAM file.

    Rows are the reads, columns are the columns in the reference. Insertions are ignored.

    Args:
        parent_name: reference/parent name; used as the gene label and as
            the reference header if an MSA is written.
        bam_file_path: path to the BAM file (indexed on demand if no .bai exists).
        ref_str: reference sequence the reads were aligned to.
        min_coverage: minimum number of usable reads for the well to be processed.
        msa_path: optional fasta path; when given, the padded reads are
            written out as a pseudo-MSA for debugging.

    Returns:
        The annotated dataframe from calculate_mutation_significance_across_well,
        or None (implicit) when there were fewer than min_coverage reads or
        fewer than two summary rows. NOTE(review): callers must handle the
        implicit None return.
    """
    bam = pysam.AlignmentFile(bam_file_path, "rb")
    # Ensure the BAM file is indexed
    if not os.path.exists(bam_file_path + ".bai"):
        pysam.index(bam_file_path)

    cramHeader = bam.header.to_dict()  # NOTE(review): unused — candidate for removal
    rows_all = []
    seqs = []
    read_ids = []
    read_quals = []

    for read in bam.fetch(until_eof=True):
        # Keep only reads with a sequence and CIGAR that cover >90% of the reference length.
        if read.query_sequence is not None and len(read.query_sequence) > 0.9*len(ref_str) and read.cigartuples is not None:
            seq, ref, qual, ins = alignment_from_cigar(read.cigartuples, read.query_sequence, ref_str,
                                                       read.query_qualities)
            # Make it totally align: pad with '-' so the read spans the full reference.
            seq = "-" * read.reference_start + seq + "-" * (len(ref_str) - (read.reference_start + len(seq)))
            seqs.append(seq)
            read_ids.append(f'{read.query_name}')
            read_quals.append(read.qual)

    # Check if we want to write a MSA
    if msa_path is not None:
        print("Writing MSA")
        with open(msa_path, 'w+') as fout:
            # Write the reference first
            fout.write(f'>{parent_name}\n{ref_str}\n')
            for i, seq in enumerate(seqs):
                fout.write(f'>{read_ids[i]}\n{"".join(seq)}\n')
        # (For debugging, the written fasta can be re-aligned externally,
        # e.g. with clustal-omega, to inspect adapters/insertions.)
    # Again check that we actually had enough reads for this to be considered a good well
    if len(seqs) > min_coverage:
        seq_df = make_well_df_from_reads(seqs, read_ids, read_quals)
        # Seqs[0] is always the parent
        rows_all = make_row_from_read_pileup_across_well(seq_df, ref_str, parent_name)

    bam.close()

    if len(rows_all) > 1:  # Check if we have anything to return
        seq_df = pd.DataFrame(rows_all)
        seq_df.columns = ['gene_name', 'position', 'ref', 'most_frequent', 'freq_non_ref', 'total_other',
                          'total_reads', 'p_value', 'percent_most_freq_mutation', 'A', 'p(a)', 'T', 'p(t)', 'G', 'p(g)',
                          'C', 'p(c)', 'N', 'p(n)']
        return calculate_mutation_significance_across_well(seq_df)
271
+
272
+
273
def make_row_from_read_pileup_across_well(well_df, ref_str, label):
    """
    Summarise the read pileup column-by-column (one output row per
    reference position).

    For every position whose reference base is not '-', counts reads per
    nucleotide and how many disagree with the reference. P-values and
    the most-frequent call are placeholders (1.0 / the reference base);
    they are filled in later once the background error rate is known.

    Args:
        well_df: dataframe of reads (rows) x positions (integer columns).
        ref_str: reference sequence, indexed by column number.
        label: gene/well name recorded in each output row.

    Returns:
        List of rows matching the well summary column layout.
    """
    summary_rows = []
    for position in well_df:
        column_values = well_df[position].values
        reference_base = ref_str[position]
        if reference_base == '-':
            continue  # no reference base at this column, nothing to summarise
        read_count = len(column_values)
        non_ref_count = len(column_values[column_values != reference_base])
        summary_rows.append([
            label, position, reference_base, reference_base,
            non_ref_count / read_count, non_ref_count, read_count,
            1.0, 0.0,  # placeholder p_value / percent_most_freq_mutation
            len(column_values[column_values == 'A']), 1.0,
            len(column_values[column_values == 'T']), 1.0,
            len(column_values[column_values == 'G']), 1.0,
            len(column_values[column_values == 'C']), 1.0,
            len(column_values[column_values == '-']), 1.0,
        ])
    return summary_rows
293
+
294
+
295
def calc_mutation_significance_for_position_in_well(ref_seq, num_a, num_t, num_g, num_c, num_dels, num_reads,
                                                    num_total_non_ref_reads, background_error_rate):
    """
    Run one-sided binomial tests for each base at a single position.

    Each base count is tested against the background error rate with
    alternative='greater'. The most frequent NON-reference call (a base
    or 'DEL') is reported together with its frequency and p-value.

    Returns:
        Tuple (most_frequent_call, frequency_of_that_call, its_p_value,
        p_a, p_t, p_g, p_c, p_del).
    """
    def _greater(count):
        # Is this base observed more often than background error alone explains?
        return binomtest(count, num_reads, background_error_rate, 'greater').pvalue

    p_a = _greater(num_a)
    p_t = _greater(num_t)
    p_g = _greater(num_g)
    p_c = _greater(num_c)
    p_n = _greater(num_dels)

    if num_total_non_ref_reads == 0:
        # Every read matched the reference at this position.
        return ref_seq, 0.0, 1.0, p_a, p_t, p_g, p_c, p_n

    best_call = ref_seq
    best_freq = 0
    best_p = float('nan')  # stays NaN if no non-reference call exceeds frequency 0
    candidates = (('A', 'A', num_a, p_a),
                  ('T', 'T', num_t, p_t),
                  ('G', 'G', num_g, p_g),
                  ('C', 'C', num_c, p_c),
                  ('DEL', '-', num_dels, p_n))
    for call, ref_token, count, pval in candidates:
        # Strict '>' preserves the original tie-breaking (earlier entries win ties).
        if count > 0 and ref_token != ref_seq and count / num_reads > best_freq:
            best_freq = count / num_reads
            best_call = call
            best_p = pval
    return best_call, best_freq, best_p, p_a, p_t, p_g, p_c, p_n
333
+
334
+
335
def alignment_from_cigar(cigar: str, alignment: str, ref: str, query_qualities: list):
    """
    Generate the aligned query/reference pair from a CIGAR.

    Operation Description Consumes query Consumes reference
    0 M alignment match (can be a sequence match or mismatch) yes yes
    1 I insertion to the reference yes no
    2 D deletion from the reference no yes
    3 N skipped region from the reference no yes
    4 S soft clipping (clipped sequences present in SEQ) yes yes
    5 H hard clipping (clipped sequences NOT present in SEQ) no no
    6 P padding (silent deletion from padded reference) no no
    7 = sequence match yes yes
    8 X sequence mismatch yes yes

    Args:
        cigar: iterable of (operation, length) tuples, as produced by
            pysam's read.cigartuples (despite the `str` annotation).
        alignment: the query sequence.
        ref: the reference sequence.
        query_qualities: per-base qualities for the query.

    Returns:
        Tuple (aligned_query, aligned_reference, qualities, inserts).
        Deletions get quality -1 and skipped regions -2; insertions and
        soft-clipped bases are collected in `inserts`, not aligned.
    """
    new_seq = ''
    ref_seq = ''
    qual = []
    inserts = []
    pos = 0
    ref_pos = 0
    for op, op_len in cigar:
        if op == 0 or op == 7 or op == 8:
            # M / '=' / 'X' all consume both query and reference.
            # BUG FIX: op 8 ('X', sequence mismatch) was previously
            # unhandled, silently dropping those bases and
            # desynchronising the alignment for aligners emitting =/X
            # instead of M. (Op 7 is '=' sequence match, not mismatch.)
            new_seq += alignment[pos:pos + op_len]
            qual += query_qualities[pos:pos + op_len]
            ref_seq += ref[ref_pos:ref_pos + op_len]
            pos += op_len
            ref_pos += op_len
        elif op == 1:  # insertion to the reference
            # Keep the preceding anchor base together with the inserted bases.
            inserts.append(alignment[pos - 1:pos + op_len])
            pos += op_len
        elif op == 2:  # deletion from the reference
            new_seq += '-' * op_len
            qual += [-1] * op_len
            ref_seq += ref[ref_pos:ref_pos + op_len]
            ref_pos += op_len
        elif op == 3:  # skipped region from the reference
            # NOTE(review): ref_seq is not extended here although the
            # reference is consumed, so query/ref strings desynchronise
            # if an 'N' op ever occurs — kept as-is, confirm intent.
            new_seq += '*' * op_len
            qual += [-2] * op_len
            ref_pos += op_len
        elif op == 4:  # soft clipping (clipped sequences present in SEQ)
            inserts.append(alignment[pos:pos + op_len])
            pos += op_len
        elif op == 5:  # hard clipping (clipped sequences NOT present in SEQ)
            continue
        elif op == 6:  # padding (silent deletion from padded reference)
            continue
    return new_seq, ref_seq, qual, inserts
389
+
390
+
391
def postprocess_variant_df(df, cutoff=5, output_path=None):
    """
    Postprocess the variant DF to check for any positions that appear to have a higher than expected
    difference to the parent or that occur too many times.

    Args:
        df: variant dataframe with 'Plate' and 'Variant' columns;
            variants are '_'-joined mutation strings such as 'A123T'
            or 'A123DEL'.
        cutoff: warn when a single position is mutated in more than this
            many wells on one plate.
        output_path: optional path prefix; when given, one QC CSV is
            written per plate.

    Returns:
        Dataframe of per-plate, per-position mutation counts, sorted by
        how many wells each position was mutated in (descending) — this
        gives an idea about what might be wrong with the parent.
    """
    # NOTE(review): mutation_map is shared across plates, so per-plate
    # counts accumulate over previously processed plates — kept as-is,
    # confirm whether it should be reset per plate.
    mutation_map = defaultdict(lambda: defaultdict(int))
    all_plates = pd.DataFrame()
    for plate in set(df['Plate'].values):
        plate_df = df[df['Plate'] == plate]
        positions = []
        for m in plate_df['Variant'].values:
            if str(m) != 'nan':
                for mutation in m.split('_'):
                    if 'DEL' not in mutation:
                        # e.g. 'A123T' -> position '123', mutated to 'T'
                        position = mutation[1:-1]
                        mutation_map[position][mutation[-1]] += 1
                    else:
                        # e.g. 'A123DEL' -> position '123', deletion
                        position = mutation[1:].replace('DEL', '')
                        mutation_map[position]['DEL'] += 1
                    # BUG FIX: record EVERY mutation's position. This append
                    # was previously outside the per-mutation loop, so only
                    # the last mutation of a multi-mutation variant made it
                    # into the QC table.
                    positions.append(position)
        # Make into a DF that has positions and then the number and types of mutations.
        # (Positions are strings, so this sort is lexical — preserved as-is.)
        rows = []
        for position in sorted(set(positions)):
            counts = mutation_map[position]
            a_ = counts['A']
            t_ = counts['T']
            g_ = counts['G']
            c_ = counts['C']
            del_ = counts['DEL']
            total = a_ + t_ + g_ + c_ + del_
            rows.append([position, total, a_, t_, g_, c_, del_])
            # Check if the plate has a problem at a specific position
            if total > cutoff:
                print(f"Warning! Position {position} in plate {plate} was mutated: {total} times. "
                      f"This may be an error with your parent.")
        p_df = pd.DataFrame(rows, columns=['Position', 'Total wells mutated in', 'A', 'T', 'G', 'C', 'DEL'])
        if output_path:
            # Save QC file if the user specifies a path.
            p_df.to_csv(f'{output_path}Plate_{plate}_QC.csv', index=False)
        p_df['Plate'] = plate
        all_plates = pd.concat([all_plates, p_df])
    all_plates = all_plates.sort_values(by='Total wells mutated in', ascending=False)
    return all_plates
439
+
440
+
441
def get_variant_label_for_well(seq_df, threshold):
    """
    Classify/label the variants and identify whether there is a mixed well.

    Positions whose non-reference frequency exceeds `threshold` are
    reported as mutations '<ref><1-based position><call>' joined by '_';
    when none exceed it the well is labelled '#PARENT#'.

    Args:
        seq_df: per-position well dataframe; needs 'freq_non_ref',
            'position', 'ref', 'most_frequent',
            'percent_most_freq_mutation', 'p_value adj.' and the
            per-base '... adj.' columns.
        threshold: minimum non-reference frequency for a position to be
            called as a mutation.

    Returns:
        Tuple (label, probability, combined_p_value, mixed_well).
    """
    # Filter for positions with enough non-reference reads, in position order.
    non_refs = seq_df[seq_df['freq_non_ref'] > threshold].sort_values(by='position')
    mixed_well = False
    if len(non_refs) > 0:
        positions = non_refs['position'].values
        refs = non_refs['ref'].values
        label = [f'{refs[i]}{positions[i] + 1}{actual}' for i, actual in enumerate(non_refs['most_frequent'].values)]
        # A position with more than one significantly enriched base call
        # suggests a mixed well.
        padj_vals = non_refs[['p(a) adj.', 'p(t) adj.', 'p(g) adj.', 'p(c) adj.', 'p(n) adj.']].values
        for p in padj_vals:
            c_sig = 0
            for padj in p:
                if padj < 0.05:  # ToDo: expose the significance level as a variable
                    c_sig += 1
            if c_sig > 1:  # potential mixed well
                mixed_well = True
        label = '_'.join(label)
        # Only keep the frequency of the most frequent mutation
        probability = np.mean([x for x in non_refs['percent_most_freq_mutation'].values])
        # Combine the adjusted p-values across the mutated positions.
        chi2_statistic, combined_p_value = combine_pvalues([x for x in non_refs['p_value adj.'].values],
                                                           method='fisher')
    else:
        label = '#PARENT#'
        # BUG FIX: average over the whole well (seq_df), not the empty
        # filtered frame — np.mean of an empty list previously returned
        # NaN with a RuntimeWarning, so parent wells had no probability.
        probability = np.mean([1 - x for x in seq_df['freq_non_ref'].values])
        combined_p_value = float("nan")

    return label, probability, combined_p_value, mixed_well
474
+