levseq 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- levseq/IO_processor.py +565 -0
- levseq/__init__.py +34 -0
- levseq/barcoding/__init__.py +1 -0
- levseq/barcoding/demultiplex +0 -0
- levseq/barcoding/demultiplex-arm64 +0 -0
- levseq/barcoding/demultiplex-x86 +0 -0
- levseq/barcoding/minion_barcodes.fasta +386 -0
- levseq/basecaller.py +80 -0
- levseq/cmd.py +23 -0
- levseq/globals.py +66 -0
- levseq/interface.py +85 -0
- levseq/parser.py +82 -0
- levseq/run_levseq.py +558 -0
- levseq/screen.py +38 -0
- levseq/simulation.py +311 -0
- levseq/user.py +157 -0
- levseq/utils.py +474 -0
- levseq/variantcaller.py +252 -0
- levseq/visualization.py +1130 -0
- levseq-1.0.0.data/data/LICENSE +674 -0
- levseq-1.0.0.dist-info/LICENSE +674 -0
- levseq-1.0.0.dist-info/METADATA +180 -0
- levseq-1.0.0.dist-info/RECORD +26 -0
- levseq-1.0.0.dist-info/WHEEL +5 -0
- levseq-1.0.0.dist-info/entry_points.txt +2 -0
- levseq-1.0.0.dist-info/top_level.txt +1 -0
levseq/variantcaller.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
###############################################################################
|
|
2
|
+
# #
|
|
3
|
+
# This program is free software: you can redistribute it and/or modify #
|
|
4
|
+
# it under the terms of the GNU General Public License as published by #
|
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or #
|
|
6
|
+
# (at your option) any later version. #
|
|
7
|
+
# #
|
|
8
|
+
# This program is distributed in the hope that it will be useful, #
|
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
11
|
+
# GNU General Public License for more details. #
|
|
12
|
+
# #
|
|
13
|
+
# You should have received a copy of the GNU General Public License #
|
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>. #
|
|
15
|
+
# #
|
|
16
|
+
###############################################################################
|
|
17
|
+
import pandas as pd
|
|
18
|
+
|
|
19
|
+
from levseq.utils import *
|
|
20
|
+
import subprocess
|
|
21
|
+
import os
|
|
22
|
+
import glob
|
|
23
|
+
from multiprocessing.dummy import Pool as ThreadPool
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from Bio import SeqIO
|
|
26
|
+
import re
|
|
27
|
+
from tqdm import tqdm
|
|
28
|
+
|
|
29
|
+
'''
|
|
30
|
+
Script for variant calling
|
|
31
|
+
|
|
32
|
+
The variant caller starts from demultiplexed fastq files.
|
|
33
|
+
|
|
34
|
+
1) Before variant calling, check the demultiplexed folder for the alignment file. If it does not exist, inform the user that the sequences were not demultiplexed
|
|
35
|
+
2) If the files exist, create MSA using minimap2
|
|
36
|
+
3) Call variant with soft alignment
|
|
37
|
+
|
|
38
|
+
'''
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class VariantCaller:
    """
    Variant caller class.

    Builds one entry per barcode combination found in the barcode FASTA,
    aligns the demultiplexed reads of each well against the template with
    minimap2/samtools, and labels the variant found in each well.

    Attributes set in ``__init__``:
        - variant_dict: maps '<plate>_<well>' to a dict holding 'Plate', 'Well',
          'Barcodes' and 'Path'; the per-well results are added to it later.
        - variant_df: one row per barcode combination ('Plate', 'Well',
          'Barcode', 'ID'); result columns are added by ``get_variant_df``.
    """

    def __init__(self, experiment_name, experiment_folder: Path, template_fasta: Path, barcode_path: Path, padding_start: int = 0, padding_end: int = 0) -> None:
        """
        Store the run configuration and build the per-well bookkeeping.

        Args:
            - experiment_name: Name of the experiment; also used as the plate name.
            - experiment_folder (Path): Folder holding the demultiplexed reads,
              laid out as <reverse barcode>/<forward barcode>.
            - template_fasta (Path): FASTA file with exactly one reference record.
            - barcode_path (Path): FASTA file with the barcodes ('NB...' / 'RB...').
            - padding_start (int, optional): Stored as-is. Defaults to 0.
            - padding_end (int, optional): Stored as-is. Defaults to 0.
        """
        self.barcode_path = barcode_path
        self.experiment_name = experiment_name
        self.experiment_folder = experiment_folder
        self.padding_start = padding_start
        self.padding_end = padding_end
        self.template_fasta = template_fasta
        self.alignment_name = 'alignment_minimap'
        self.variant_dict = {}
        self.ref_name = experiment_name
        # SeqIO.read raises unless the file holds exactly one record.
        self.ref_str = str(SeqIO.read(template_fasta, 'fasta').seq)
        self.variant_df = self.build_variant_df_from_barcodes(barcode_path, experiment_name)

    def build_variant_df_from_barcodes(self, barcode_path, experiment_name) -> pd.DataFrame:
        """
        Build variant dataframe from barcodes, forward and reverse barcodes.

        Records whose id starts with 'NB' are forward barcodes, 'RB' are reverse
        barcodes; any other record is ignored. Also (re)populates
        ``self.variant_dict``.

        Args:
            - barcode_path: Path to the barcode FASTA file.
            - experiment_name: Plate name used to build the readable IDs.

        Returns:
            - pd.DataFrame with the columns 'Plate', 'Well', 'Barcode' and 'ID'.
        """
        # Imported locally so the method does not depend on the star import
        # from levseq.utils to provide defaultdict.
        from collections import defaultdict

        forward_barcode_ids = []
        reverse_barcode_ids = []
        for record in SeqIO.parse(barcode_path, "fasta"):
            if record.id.startswith('NB'):
                forward_barcode_ids.append(record.id)
            elif record.id.startswith('RB'):
                reverse_barcode_ids.append(record.id)

        # Convert the barcode pairs into something more readable (i.e. the name
        # the user assigned to the plate plus the well).
        # NOTE(review): the ID is built from the plate name and the forward
        # barcode only, so if the barcode file holds more than one 'RB' record
        # the later ones overwrite the earlier entries in variant_dict and the
        # dataframe gets duplicated IDs - confirm callers always pass one.
        rows = []
        self.variant_dict = defaultdict(dict)
        for reverse_barcode in reverse_barcode_ids:
            for forward_barcode in forward_barcode_ids:
                barcodes = f'{reverse_barcode}_{forward_barcode}'
                well = self._barcode_to_well(forward_barcode)
                renamed_id = f'{experiment_name}_{well}'
                rows.append((experiment_name, well, barcodes, renamed_id))
                self.variant_dict[renamed_id] = {
                    'Plate': experiment_name,
                    'Well': well,
                    'Barcodes': barcodes,
                    'Path': os.path.join(self.experiment_folder, f'{reverse_barcode}/{forward_barcode}'),
                }

        return pd.DataFrame({
            'Plate': [row[0] for row in rows],
            'Well': [row[1] for row in rows],
            'Barcode': [row[2] for row in rows],
            'ID': [row[3] for row in rows],
        })

    @staticmethod
    def load_reference(reference_path, experiment_name=None):
        """
        Load the parent sequence from a single-record FASTA file.

        WARNING: this assumes all the parents are the same.

        Args:
            - reference_path: Path to the reference FASTA file.
            - experiment_name (optional): Plate name to hand back. Defaults to None.

        Returns:
            - tuple: ('Parent', reference sequence as str, experiment_name).
        """
        # The previous body read the undefined names `template_fasta` and
        # `experiment_name`, so every call raised NameError.
        ref_seq = str(SeqIO.read(reference_path, 'fasta').seq)
        barcode_to_plate_name = experiment_name
        return 'Parent', ref_seq, barcode_to_plate_name

    @staticmethod
    def _barcode_to_well(barcode):
        """
        Convert a barcode id to a 96-well plate position.

        The first run of digits in the id is the barcode number; numbers are
        laid out row by row, 12 per row (1 -> 'A1', 12 -> 'A12', 13 -> 'B1',
        96 -> 'H12').

        Args:
            - barcode (str): Barcode id, e.g. 'NB01'.

        Returns:
            - str: Well name, or "NA" when the id has no number or the number is
              outside 1-96.
        """
        match = re.search(r'\d+', barcode)
        if match is None:
            return "NA"
        number = int(match.group())
        rows = 'ABCDEFGH'
        columns_per_row = 12
        # 0 used to wrap around to 'H12' through negative indexing and anything
        # above 96 raised IndexError; neither is a position on the plate.
        if not 1 <= number <= len(rows) * columns_per_row:
            return "NA"
        row_index, column_index = divmod(number - 1, columns_per_row)
        return f"{rows[row_index]}{column_index + 1}"

    @staticmethod
    def _run_tool(cmd, stdout=subprocess.DEVNULL):
        """
        Run an external command without a shell and report whether it succeeded.

        Args:
            - cmd (list): Program name followed by its arguments.
            - stdout: Where to send standard output. Defaults to subprocess.DEVNULL.

        Returns:
            - bool: True when the command ran and exited with status 0.
        """
        try:
            completed = subprocess.run(cmd, stdout=stdout, stderr=subprocess.DEVNULL)
        except OSError as err:
            # Raised e.g. when the executable is not installed / not on PATH.
            print(f"Warning! could not run {cmd[0]}: {err}")
            return False
        if completed.returncode != 0:
            print(f"Warning! {cmd[0]} exited with status {completed.returncode}")
            return False
        return True

    def _align_sequences(self, output_dir: Path, filename, scores: list = None,
                         alignment_name: str = "alignment_minimap") -> None:
        """
        Aligns sequences using minimap2, converts to BAM, sorts and indexes the BAM file.

        All '*.fastq' files in output_dir are first merged into
        'demultiplexed_<filename>.fastq' and the merged parts are deleted. A
        merged file left by an earlier run is kept and appended to.

        Args:
            - output_dir (str or Path): Directory holding the reads; output files are stored here too.
            - filename (str): Suffix used for the merged FASTQ file name.
            - scores (list, optional): List of match, mismatch and gap opening scores. Defaults to [4,2,10].
            - alignment_name (str, optional): Name of the alignment file. Defaults to "alignment_minimap".

        Returns:
            - None

        Raises:
            - FileNotFoundError: If output_dir contains no FASTQ file at all.
        """
        if scores is None:
            scores = [4, 2, 10]

        merged_fastq = os.path.join(output_dir, f"demultiplexed_{filename}.fastq")
        merged_abs = os.path.abspath(merged_fastq)
        # The merged file itself matches '*.fastq'; leaving it in the list would
        # truncate and then delete it on a second run.
        pattern = os.path.join(glob.escape(str(output_dir)), '*.fastq')
        part_files = sorted(path for path in glob.glob(pattern)
                            if os.path.abspath(path) != merged_abs)

        if not part_files and not os.path.exists(merged_fastq):
            raise FileNotFoundError("No FASTQ files found in the specified output directory.")

        if part_files:
            with open(merged_fastq, 'a') as outfile:
                for part in part_files:
                    with open(part, 'r') as infile:
                        # Streams line by line instead of loading the whole file.
                        outfile.writelines(infile)
                    os.remove(part)

        sam_path = os.path.join(output_dir, f"{alignment_name}.sam")
        unsorted_bam = os.path.join(output_dir, f"{alignment_name}.unsorted.bam")
        bam_path = os.path.join(output_dir, f"{alignment_name}.bam")

        # Alignment using minimap2 (argument lists, so paths need no quoting)
        minimap_cmd = ["minimap2", "-ax", "map-ont",
                       "-A", str(scores[0]), "-B", str(scores[1]), "-O", f"{scores[2]},24",
                       str(self.template_fasta), merged_fastq]
        succeeded = False
        try:
            with open(sam_path, 'w') as sam_out:
                succeeded = self._run_tool(minimap_cmd, stdout=sam_out)
            # Sort from a separate unsorted file rather than reading and writing
            # the same BAM path in one samtools call.
            succeeded = succeeded and self._run_tool(
                ["samtools", "view", "-bS", "-o", unsorted_bam, sam_path])
            succeeded = succeeded and self._run_tool(
                ["samtools", "sort", unsorted_bam, "-o", bam_path])
            succeeded = succeeded and self._run_tool(["samtools", "index", bam_path])
        finally:
            # Remove SAM file and the intermediate BAM
            for leftover in (sam_path, unsorted_bam):
                if os.path.exists(leftover):
                    os.remove(leftover)
            # A half-written BAM would stop the next run from retrying.
            if not succeeded and os.path.exists(bam_path):
                os.remove(bam_path)

    def _run_variant_thread(self, args):
        """
        Runs a thread of variant calling.

        Args:
            - args (list): [barcode_ids, threshold, min_depth, output_dir].
              min_depth and output_dir are accepted but not used here.

        The results are written into ``self.variant_dict`` under the keys
        'Alignment Count', 'Variant', 'Mixed Well',
        'Average mutation frequency' and 'P value'. Wells whose folder does not
        exist are left untouched.
        """
        barcode_ids, threshold, _min_depth, _output_dir = args[0], args[1], args[2], args[3]
        for barcode_id in tqdm(barcode_ids):
            row = self.variant_dict.get(barcode_id)
            well_path = row["Path"]
            if not os.path.exists(well_path):
                continue

            bam_file = os.path.join(well_path, f'{self.alignment_name}.bam')
            # Check if the alignment file exists
            if not os.path.exists(bam_file):
                # Try aligning the sequences
                print(f"Aligning sequences for {row['Path']}")
                try:
                    self._align_sequences(well_path, row['Barcodes'])
                except FileNotFoundError:
                    # The well folder exists but holds no reads.
                    pass

            if not os.path.exists(bam_file):
                # No reads or the alignment failed: nothing to call.
                self.variant_dict[barcode_id]['Alignment Count'] = 0
                continue

            # Check alignment count
            well_df = get_reads_for_well(self.experiment_name, bam_file, self.ref_str,
                                         msa_path=f'{row["Path"]}/msa_{barcode_id}.fa')
            if well_df is None:
                self.variant_dict[barcode_id]['Alignment Count'] = 0
                continue

            self.variant_dict[barcode_id]['Alignment Count'] = well_df['total_reads'].values[0]
            label, freq, combined_p_value, mixed_well = get_variant_label_for_well(well_df, threshold)
            self.variant_dict[barcode_id]["Variant"] = label
            self.variant_dict[barcode_id]["Mixed Well"] = mixed_well
            self.variant_dict[barcode_id]["Average mutation frequency"] = freq
            self.variant_dict[barcode_id]["P value"] = combined_p_value

    @staticmethod
    def _chunk_ids(ids, num_threads):
        """
        Split ids into at most num_threads consecutive, near-equal slices.

        Args:
            - ids: Sliceable sequence of IDs.
            - num_threads (int): Number of worker threads; values below 1 count as 1.

        Returns:
            - list: The slices in order; empty when ids is empty.
        """
        total = len(ids)
        if total == 0:
            return []
        workers = max(1, int(num_threads))
        # Ceiling division: never 0, so range() below always has a valid step.
        size = -(-total // workers)
        return [ids[start:start + size] for start in range(0, total, size)]

    def get_variant_df(self, threshold: float = 0.5, min_depth: int = 5, output_dir='', num_threads=10):
        """
        Get Variant Data Frame for all samples in the experiment

        Args:
            - threshold (float, optional): Threshold for calling a variant. Defaults to 0.5.
            - min_depth (int, optional): Passed on to the worker threads. Defaults to 5.
            - output_dir (str, optional): Passed on to the worker threads. Defaults to ''.
            - num_threads (int, optional): Number of worker threads. Defaults to 10.

        Returns:
            - pd.DataFrame: ``self.variant_df`` with the columns 'Variant',
              'Mixed Well', 'Average mutation frequency', 'P value',
              'Alignment Count' and 'P adj. value' filled in.
        """
        self.variant_df['P value'] = float("nan")
        self.variant_df['Mixed Well'] = False
        # The first call keeps the old index as an 'index' column (as before);
        # later calls drop it instead of failing on the existing column.
        self.variant_df.reset_index(drop='index' in self.variant_df.columns, inplace=True)

        ids = self.variant_df['ID'].values
        data = [[chunk, threshold, min_depth, output_dir]
                for chunk in self._chunk_ids(ids, num_threads)]

        # Thread it
        with ThreadPool(max(1, int(num_threads))) as pool:
            pool.map(self._run_variant_thread, data)

        results = [self.variant_dict[b_id] for b_id in ids]
        self.variant_df['Variant'] = [entry.get('Variant') for entry in results]
        self.variant_df['Mixed Well'] = [entry.get('Mixed Well') for entry in results]
        self.variant_df['Average mutation frequency'] = [entry.get('Average mutation frequency') for entry in results]
        # Only a missing p-value becomes 1.0; a truthiness test would also turn
        # a computed p-value of exactly 0.0 into 1.0.
        p_values = [entry.get('P value') for entry in results]
        self.variant_df['P value'] = [1.0 if p is None else p for p in p_values]
        self.variant_df['Alignment Count'] = [entry.get('Alignment Count') for entry in results]

        # Adjust p-values using bonferroni make it simple
        adjusted = len(self.variant_df) * self.variant_df["P value"].values
        self.variant_df['P adj. value'] = [1 if x > 1 else x for x in adjusted]

        return self.variant_df

    def _get_alignment_count(self, sample_folder_path: Path):
        """
        Get the number of alignments in a BAM file.

        Args:
            - sample_folder_path (Path): Path to the folder containing the BAM file.

        Returns:
            - int: Number of alignments in the BAM file; 0 when the file is
              missing or samtools cannot count it.
        """
        # The file on disk is '<alignment_name>.bam'; without the extension the
        # existence check below never succeeded.
        bam_file = os.path.join(sample_folder_path, f'{self.alignment_name}.bam')

        if not os.path.exists(bam_file):
            return 0

        try:
            completed = subprocess.run(["samtools", "view", "-c", bam_file], capture_output=True)
            return int(completed.stdout.decode("utf-8").strip())
        except (OSError, ValueError):
            # OSError: samtools could not be started; ValueError: no number printed.
            print(f'Warning! your bamfile: {bam_file} had no counts! Check the header manually.')
            return 0

    def _apply_alignment_count(self):
        """
        Get alignment count for each sample

        Stores the count under 'Alignment Count', the key read by
        ``get_variant_df``, and under the previously used 'Alignment_count'.
        """
        for entry in self.variant_dict.values():
            count = self._get_alignment_count(entry["Path"])
            entry["Alignment Count"] = count
            # Legacy spelling kept for any code still reading it.
            entry["Alignment_count"] = count
|