levseq 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,252 @@
1
+ ###############################################################################
2
+ # #
3
+ # This program is free software: you can redistribute it and/or modify #
4
+ # it under the terms of the GNU General Public License as published by #
5
+ # the Free Software Foundation, either version 3 of the License, or #
6
+ # (at your option) any later version. #
7
+ # #
8
+ # This program is distributed in the hope that it will be useful, #
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
11
+ # GNU General Public License for more details. #
12
+ # #
13
+ # You should have received a copy of the GNU General Public License #
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>. #
15
+ # #
16
+ ###############################################################################
17
+ import pandas as pd
18
+
19
+ from levseq.utils import *
20
+ import subprocess
21
+ import os
22
+ import glob
23
+ from multiprocessing.dummy import Pool as ThreadPool
24
+ from pathlib import Path
25
+ from Bio import SeqIO
26
+ import re
27
+ from tqdm import tqdm
28
+
29
+ '''
30
+ Script for variant calling
31
+
32
+ The variant caller starts from demultiplexed fastq files.
33
+
34
+ 1) Before variant calling, check whether the alignment file exists in the demultiplexed folder. If it does not, inform the user that the sequences were not demultiplexed
35
+ 2) If the files exist, create MSA using minimap2
36
+ 3) Call variant with soft alignment
37
+
38
+ '''
39
+
40
+
41
class VariantCaller:
    """
    Variant caller for the wells of one demultiplexed experiment.

    On construction the template FASTA is read and one entry per
    (reverse barcode, forward barcode) pair is registered in ``variant_dict``
    and ``variant_df``. ``get_variant_df`` then aligns each well's reads with
    minimap2/samtools (when no BAM exists yet) and labels the well using the
    helpers ``get_reads_for_well`` and ``get_variant_label_for_well`` that the
    module imports from ``levseq.utils``.
    """

    def __init__(self, experiment_name, experiment_folder: Path, template_fasta: Path, barcode_path: Path,
                 padding_start: int = 0, padding_end: int = 0) -> None:
        """
        Store the experiment settings and build the per-well bookkeeping.

        Args:
            - experiment_name: Name of the experiment; also used as plate name and reference name.
            - experiment_folder (Path): Root folder holding the demultiplexed reads,
              laid out as ``<reverse barcode>/<forward barcode>``.
            - template_fasta (Path): FASTA file containing exactly one reference record.
            - barcode_path (Path): FASTA file listing the barcodes (ids starting with 'NB' or 'RB').
            - padding_start (int, optional): Stored on the instance; not used by this class.
            - padding_end (int, optional): Stored on the instance; not used by this class.
        """
        self.barcode_path = barcode_path
        self.experiment_name = experiment_name
        self.experiment_folder = experiment_folder
        self.padding_start = padding_start
        self.padding_end = padding_end
        self.template_fasta = template_fasta
        self.alignment_name = 'alignment_minimap'
        self.variant_dict = {}
        self.ref_name = experiment_name
        self.ref_str = str(SeqIO.read(template_fasta, 'fasta').seq)
        # Must run last: it reads self.experiment_folder and replaces self.variant_dict.
        self.variant_df = self.build_variant_df_from_barcodes(barcode_path, experiment_name)

    def build_variant_df_from_barcodes(self, barcode_path, experiment_name) -> pd.DataFrame:
        """
        Build the variant dataframe from the forward and reverse barcodes.

        Every reverse barcode ('RB…') is combined with every forward barcode
        ('NB…'). As a side effect ``self.variant_dict`` is replaced by a mapping
        from ``'<plate>_<well>'`` to that well's plate, well, barcode pair and
        read folder.

        Args:
            - barcode_path: FASTA file with the barcode records.
            - experiment_name: Name used as the plate label for every well.

        Returns:
            - pd.DataFrame: One row per barcode pair with the columns
              'Plate', 'Well', 'Barcode' and 'ID'.
        """
        # Imported locally: the module only gets this name through a star import.
        from collections import defaultdict

        forward_barcode_ids = []
        reverse_barcode_ids = []
        for record in SeqIO.parse(barcode_path, "fasta"):
            if record.id.startswith('NB'):
                forward_barcode_ids.append(record.id)
            elif record.id.startswith('RB'):
                reverse_barcode_ids.append(record.id)

        # Translate the barcode pairs into something more readable, i.e. the
        # name the user assigned to the plate plus the well position.
        barcode_ids = []
        renamed_ids = []
        plates = []
        wells = []
        self.variant_dict = defaultdict(dict)
        for reverse_barcode in reverse_barcode_ids:
            for forward_barcode in forward_barcode_ids:
                barcode_pair = f'{reverse_barcode}_{forward_barcode}'
                well = self._barcode_to_well(forward_barcode)
                well_id = f'{experiment_name}_{well}'
                barcode_ids.append(barcode_pair)
                renamed_ids.append(well_id)
                plates.append(experiment_name)
                wells.append(well)
                # NOTE(review): the key does not contain the reverse barcode, so
                # with more than one 'RB' record later plates overwrite earlier
                # ones here while the dataframe keeps duplicate IDs. Confirm
                # that a single reverse barcode per experiment is intended.
                self.variant_dict[well_id] = {
                    'Plate': experiment_name,
                    'Well': well,
                    'Barcodes': barcode_pair,
                    'Path': os.path.join(self.experiment_folder, f'{reverse_barcode}/{forward_barcode}'),
                }

        return pd.DataFrame({'Plate': plates, 'Well': wells, 'Barcode': barcode_ids, 'ID': renamed_ids})

    @staticmethod
    def load_reference(reference_path, experiment_name=None):
        """
        Read the parent sequence from a single-record FASTA file.

        The previous body referred to names that only exist inside ``__init__``
        and therefore always raised NameError; the reference is now read from
        ``reference_path``.

        Args:
            - reference_path: FASTA file containing exactly one record.
            - experiment_name (optional): Plate name to hand back unchanged. Defaults to None.

        Returns:
            - tuple: ``('Parent', reference sequence as str, experiment_name)``.
        """
        # WARNING: this assumes all the parents are the same
        ref_seq = str(SeqIO.read(reference_path, 'fasta').seq)
        return 'Parent', ref_seq, experiment_name

    @staticmethod
    def _barcode_to_well(barcode):
        """
        Convert a barcode id into a 96-well plate position.

        The first run of digits in ``barcode`` is read as a 1-based index that
        fills the plate row by row (1 -> 'A1', 12 -> 'A12', 13 -> 'B1', 96 -> 'H12').

        Returns:
            - str: The well name, or "NA" when the id holds no number or the
              number lies outside 1-96.
        """
        rows = 'ABCDEFGH'
        columns_per_row = 12
        match = re.search(r'\d+', barcode)
        if not match:
            return "NA"
        number = int(match.group())
        # 0 would wrap around to 'H12' through negative indexing and anything
        # above 96 would index past the last row.
        if not 1 <= number <= len(rows) * columns_per_row:
            return "NA"
        row = rows[(number - 1) // columns_per_row]
        col = (number - 1) % columns_per_row + 1
        return f"{row}{col}"

    def _align_sequences(self, output_dir: Path, filename, scores=(4, 2, 10),
                         alignment_name: str = "alignment_minimap") -> None:
        """
        Align a well's reads with minimap2 and produce a sorted, indexed BAM file.

        All ``*.fastq`` files in ``output_dir`` are merged into
        ``demultiplexed_<filename>.fastq`` (the individual files are deleted
        afterwards), aligned against ``self.template_fasta`` and written to
        ``<alignment_name>.bam`` plus its index. The intermediate SAM file is
        always removed.

        Args:
            - output_dir (str or Path): Folder holding the FASTQ files; output is written here too.
            - filename: Suffix used for the name of the merged FASTQ file.
            - scores (sequence, optional): Match score, mismatch penalty and gap opening
              penalty passed to minimap2. Defaults to (4, 2, 10).
            - alignment_name (str, optional): Base name of the alignment file. Defaults to "alignment_minimap".

        Returns:
            - None

        Raises:
            - FileNotFoundError: If the folder contains no FASTQ file, or a tool is not installed.
            - subprocess.CalledProcessError: If minimap2 or samtools exits with an error.
        """
        output_dir = str(output_dir)
        merged_fastq = os.path.join(output_dir, f"demultiplexed_{filename}.fastq")
        merged_abs = os.path.abspath(merged_fastq)
        # The merged file matches the glob as well; feeding it back into itself
        # would truncate it and then delete it.
        part_files = sorted(path for path in glob.glob(os.path.join(output_dir, '*.fastq'))
                            if os.path.abspath(path) != merged_abs)

        if not part_files and not os.path.exists(merged_fastq):
            raise FileNotFoundError("No FASTQ files found in the specified output directory.")

        if part_files:
            with open(merged_fastq, 'w') as outfile:
                for part in part_files:
                    with open(part, 'r') as infile:
                        outfile.writelines(infile)
            # Delete the inputs only once the merged file is complete, so an
            # interrupted merge never loses reads.
            for part in part_files:
                os.remove(part)

        sam_path = os.path.join(output_dir, f"{alignment_name}.sam")
        bam_path = os.path.join(output_dir, f"{alignment_name}.bam")
        # Argument lists (no shell) keep paths with spaces or quotes intact.
        minimap_cmd = ["minimap2", "-ax", "map-ont",
                       "-A", str(scores[0]), "-B", str(scores[1]), "-O", f"{scores[2]},24",
                       str(self.template_fasta), merged_fastq]
        quiet = {"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL, "check": True}
        try:
            with open(sam_path, 'w') as sam_out:
                subprocess.run(minimap_cmd, stdout=sam_out, stderr=subprocess.DEVNULL, check=True)
            # samtools sort reads SAM directly, which avoids sorting a BAM onto
            # the very file it is reading from.
            subprocess.run(["samtools", "sort", "-o", bam_path, sam_path], **quiet)
            subprocess.run(["samtools", "index", bam_path], **quiet)
        except (subprocess.CalledProcessError, OSError):
            # A half-written BAM would make later runs skip the alignment.
            if os.path.exists(bam_path):
                os.remove(bam_path)
            raise
        finally:
            # Remove SAM file
            if os.path.exists(sam_path):
                os.remove(sam_path)

    def _run_variant_thread(self, args):
        """
        Run variant calling for one batch of wells (one worker thread).

        Args:
            - args (sequence): ``[barcode_ids, threshold, min_depth, output_dir]``. Only the
              first two are used; the others are accepted so the job layout stays unchanged.

        Results are written into ``self.variant_dict`` under the keys
        'Alignment Count', 'Variant', 'Mixed Well', 'Average mutation frequency'
        and 'P value'. Wells whose folder does not exist are left untouched.
        """
        barcode_ids, threshold = args[0], args[1]
        for barcode_id in tqdm(barcode_ids):
            row = self.variant_dict.get(barcode_id)
            well_path = row["Path"]
            if not os.path.exists(well_path):
                continue

            bam_file = os.path.join(well_path, f'{self.alignment_name}.bam')
            if not os.path.exists(bam_file):
                if not glob.glob(os.path.join(well_path, '*.fastq')):
                    # Folder exists but holds no reads: nothing to align or call.
                    self.variant_dict[barcode_id]['Alignment Count'] = 0
                    continue
                # Try aligning the sequences
                print(f"Aligning sequences for {row['Path']}")
                self._align_sequences(well_path, row['Barcodes'], alignment_name=self.alignment_name)

            # Check alignment count
            well_df = get_reads_for_well(self.experiment_name, bam_file, self.ref_str,
                                         msa_path=f'{row["Path"]}/msa_{barcode_id}.fa')
            if well_df is None:
                self.variant_dict[barcode_id]['Alignment Count'] = 0
                continue

            self.variant_dict[barcode_id]['Alignment Count'] = well_df['total_reads'].values[0]
            label, freq, combined_p_value, mixed_well = get_variant_label_for_well(well_df, threshold)
            self.variant_dict[barcode_id]["Variant"] = label
            self.variant_dict[barcode_id]["Mixed Well"] = mixed_well
            self.variant_dict[barcode_id]["Average mutation frequency"] = freq
            self.variant_dict[barcode_id]["P value"] = combined_p_value

    @staticmethod
    def _chunk_ids(ids, num_chunks):
        """Split ``ids`` into at most ``num_chunks`` consecutive, non-empty slices."""
        total = len(ids)
        if total == 0:
            return []
        # Ceiling division; never 0, so range() below always gets a valid step.
        size = max(1, -(-total // max(1, num_chunks)))
        return [ids[start:start + size] for start in range(0, total, size)]

    def get_variant_df(self, threshold: float = 0.5, min_depth: int = 5, output_dir='', num_threads=10):
        """
        Get Variant Data Frame for all samples in the experiment

        Args:
            - threshold (float, optional): Threshold for calling a variant. Defaults to 0.5.
            - min_depth (int, optional): Forwarded to the worker jobs; currently unused there. Defaults to 5.
            - output_dir (str, optional): Forwarded to the worker jobs; currently unused there. Defaults to ''.
            - num_threads (int, optional): Number of worker threads. Defaults to 10.

        Returns:
            - pd.DataFrame: ``self.variant_df`` extended with the columns 'Variant', 'Mixed Well',
              'Average mutation frequency', 'P value', 'Alignment Count' and 'P adj. value'.
        """
        self.variant_df['P value'] = float("nan")
        self.variant_df['Mixed Well'] = False
        # Keep the historical 'index' column, but do not stack another one each
        # time the method is called again.
        if 'index' not in self.variant_df.columns:
            self.variant_df.reset_index(inplace=True)

        workers = max(1, int(num_threads))
        well_ids = self.variant_df['ID'].values
        jobs = [[chunk, threshold, min_depth, output_dir] for chunk in self._chunk_ids(well_ids, workers)]

        # Thread it; the context manager shuts the pool down afterwards.
        with ThreadPool(workers) as pool:
            pool.map(self._run_variant_thread, jobs)

        entries = [self.variant_dict[b_id] for b_id in well_ids]
        self.variant_df['Variant'] = [entry.get('Variant') for entry in entries]
        self.variant_df['Mixed Well'] = [entry.get('Mixed Well') for entry in entries]
        self.variant_df['Average mutation frequency'] = [entry.get('Average mutation frequency')
                                                         for entry in entries]
        # Only a missing p-value falls back to 1.0; a genuine 0.0 must be kept.
        p_values = []
        for entry in entries:
            p_value = entry.get('P value')
            p_values.append(1.0 if p_value is None else p_value)
        self.variant_df['P value'] = p_values
        self.variant_df['Alignment Count'] = [entry.get('Alignment Count') for entry in entries]

        # Adjust p-values using bonferroni make it simple
        self.variant_df['P adj. value'] = len(self.variant_df) * self.variant_df["P value"].values
        self.variant_df['P adj. value'] = [1 if x > 1 else x for x in self.variant_df["P adj. value"].values]

        return self.variant_df

    def _get_alignment_count(self, sample_folder_path: Path):
        """
        Get the number of alignments in a BAM file.

        Args:
            - sample_folder_path (Path): Path to the folder containing the BAM file.

        Returns:
            - int: Number of alignments in the BAM file; 0 when the file is
              missing or samtools could not produce a count.
        """
        # The alignment is stored as '<alignment_name>.bam'; without the
        # extension the file was never found and the count was always 0.
        bam_file = os.path.join(sample_folder_path, f'{self.alignment_name}.bam')

        if not os.path.exists(bam_file):
            return 0

        try:
            completed = subprocess.run(["samtools", "view", "-c", bam_file], capture_output=True)
            return int(completed.stdout.decode("utf-8").strip())
        except (OSError, ValueError):
            # OSError: samtools could not be started; ValueError: no number on stdout.
            # ToDo: Return a meaningful error here
            print(f'Warning! your bamfile: {bam_file} had no counts! Check the header manually.')
            return 0

    def _apply_alignment_count(self):
        """
        Get alignment count for each sample

        The count is stored under 'Alignment Count', the key the rest of the
        class reads, and under the older "Alignment_count" key as well so
        existing consumers keep working.
        """
        for barcode_id, entry in self.variant_dict.items():
            count = self._get_alignment_count(entry["Path"])
            entry["Alignment_count"] = count
            entry["Alignment Count"] = count