levseq 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- levseq/IO_processor.py +565 -0
- levseq/__init__.py +34 -0
- levseq/barcoding/__init__.py +1 -0
- levseq/barcoding/demultiplex +0 -0
- levseq/barcoding/demultiplex-arm64 +0 -0
- levseq/barcoding/demultiplex-x86 +0 -0
- levseq/barcoding/minion_barcodes.fasta +386 -0
- levseq/basecaller.py +80 -0
- levseq/cmd.py +23 -0
- levseq/globals.py +66 -0
- levseq/interface.py +85 -0
- levseq/parser.py +82 -0
- levseq/run_levseq.py +558 -0
- levseq/screen.py +38 -0
- levseq/simulation.py +311 -0
- levseq/user.py +157 -0
- levseq/utils.py +474 -0
- levseq/variantcaller.py +252 -0
- levseq/visualization.py +1130 -0
- levseq-1.0.0.data/data/LICENSE +674 -0
- levseq-1.0.0.dist-info/LICENSE +674 -0
- levseq-1.0.0.dist-info/METADATA +180 -0
- levseq-1.0.0.dist-info/RECORD +26 -0
- levseq-1.0.0.dist-info/WHEEL +5 -0
- levseq-1.0.0.dist-info/entry_points.txt +2 -0
- levseq-1.0.0.dist-info/top_level.txt +1 -0
levseq/parser.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
###############################################################################
|
|
2
|
+
# #
|
|
3
|
+
# This program is free software: you can redistribute it and/or modify #
|
|
4
|
+
# it under the terms of the GNU General Public License as published by #
|
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or #
|
|
6
|
+
# (at your option) any later version. #
|
|
7
|
+
# #
|
|
8
|
+
# This program is distributed in the hope that it will be useful, #
|
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
11
|
+
# GNU General Public License for more details. #
|
|
12
|
+
# #
|
|
13
|
+
# You should have received a copy of the GNU General Public License #
|
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>. #
|
|
15
|
+
# #
|
|
16
|
+
###############################################################################
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def create_parser():
    """Build the command-line argument parser for the LevSeq pipeline.

    Returns:
        argparse.ArgumentParser: parser configured with all pipeline options.
    """
    p = argparse.ArgumentParser(
        description='evSeq levseq pipeline. Enter the experiment name from your run'
    )

    # Required run identifiers.
    p.add_argument(
        '--experiment_name', metavar='n', type=str, required=True,
        help='Name of experiment. The name must overlap with the name given for Sequencing',
    )
    p.add_argument(
        '--ref', metavar='r', type=Path, required=True,
        help='Path to reference sequence.',
    )

    # Optional output destinations.
    p.add_argument(
        '--output_path', metavar='o', default=None, type=Path, required=False,
        help='Path to output folder. If not given, the output folder will be created in the current directory',
    )
    p.add_argument(
        '--output_name', metavar='on', type=str,
        help="Name of the output folder. If not given, the name will be the same as the experiment name",
    )

    # Flags to skip individual pipeline stages.
    p.add_argument("--skip_basecalling", action="store_true",
                   help="Skip the basecalling step.")
    p.add_argument("--skip_demultiplex", action="store_true",
                   help="Skip the demultiplexing step.")
    p.add_argument("--skip_consensus", action="store_true",
                   help="Skip the consensus step.")

    p.add_argument(
        '--json_file', metavar='j', type=Path, required=False,
        help='Path to json file. If not given, default json file will be used',
    )

    return p
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def check_parser(parser, args):
    """Check if the parser arguments are valid.

    Input: parser object, parser arguments
    Output: True if valid, else exit

    Bug fix: the original used ``assert "Please enter the experiment name"``,
    which is a no-op (a non-empty string literal is always truthy, and asserts
    are stripped under ``-O``), so the message was never shown before exiting.
    """
    if args.experiment_name is None:
        parser.print_help()
        print("Please enter the experiment name")
        exit(1)
    else:
        return True
|
levseq/run_levseq.py
ADDED
|
@@ -0,0 +1,558 @@
|
|
|
1
|
+
###############################################################################
|
|
2
|
+
# #
|
|
3
|
+
# This program is free software: you can redistribute it and/or modify #
|
|
4
|
+
# it under the terms of the GNU General Public License as published by #
|
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or #
|
|
6
|
+
# (at your option) any later version. #
|
|
7
|
+
# #
|
|
8
|
+
# This program is distributed in the hope that it will be useful, #
|
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
11
|
+
# GNU General Public License for more details. #
|
|
12
|
+
# #
|
|
13
|
+
# You should have received a copy of the GNU General Public License #
|
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>. #
|
|
15
|
+
# #
|
|
16
|
+
###############################################################################
|
|
17
|
+
|
|
18
|
+
# Import MinION objects
|
|
19
|
+
from levseq import *
|
|
20
|
+
|
|
21
|
+
# Import external packages
|
|
22
|
+
import logging
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
import numpy as np
|
|
25
|
+
import pandas as pd
|
|
26
|
+
from importlib import resources
|
|
27
|
+
import subprocess
|
|
28
|
+
from Bio import SeqIO
|
|
29
|
+
import tqdm
|
|
30
|
+
import platform
|
|
31
|
+
import subprocess
|
|
32
|
+
import os
|
|
33
|
+
import re
|
|
34
|
+
import gzip
|
|
35
|
+
import shutil
|
|
36
|
+
|
|
37
|
+
import panel as pn
|
|
38
|
+
import holoviews as hv
|
|
39
|
+
from holoviews.streams import Tap
|
|
40
|
+
|
|
41
|
+
# Initialize plotting back-ends at import time.
# NOTE(review): `output_notebook` is not imported in this module directly —
# presumably re-exported via `from levseq import *` (bokeh.io); confirm.
output_notebook()

pn.extension()
# NOTE(review): forces Panel comms into VS Code mode unconditionally, even
# when not running under VS Code — confirm this is intended.
pn.config.comms = "vscode"

# Use the bokeh rendering back-end for all HoloViews objects below.
hv.extension("bokeh")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Get barcode used
def barcode_user(cl_args, i):
    """Look up the barcode plate for row *i* of the user-supplied summary CSV.

    Parameters:
    - cl_args (dict): command-line arguments; must contain "summary", the path
      to a CSV with a "barcode_plate" column.
    - i (int): row index into the summary CSV.

    Returns:
    - (int, int, int): the default front-barcode range (1, 96) and the
      reverse barcode plate number for row i.
    """
    try:
        # Set some default values if user did not provide barcodes
        fmin = 1
        fmax = 96
        bc_df = pd.read_csv(cl_args["summary"])
        rbc = bc_df["barcode_plate"][i]
        logging.info(f"Demultiplex executed successfully for index {i}.")

        return int(fmin), int(fmax), int(rbc)

    except Exception as e:
        # Bug fix: the original message lacked the f-prefix, so the literal
        # text "{i}" was logged instead of the actual index.
        logging.error(f"Demultiplex failed to execute for index {i}.", exc_info=True)
        raise
|
|
64
|
+
|
|
65
|
+
# Split fastq file into 4000 reads per chunk
def split_fastq_file(fastq_file: Path, output_dir: Path, reads_per_file: int):
    """
    Splits a FASTQ file into multiple files, each containing up to a specified number of reads.

    Parameters:
    - fastq_file (Path): The input FASTQ file to be split.
    - output_dir (Path): The directory where the split files will be saved.
    - reads_per_file (int): The number of reads per output file.

    Bug fix: the original duplicated the chunk-writing code in two places and
    did not count the trailing partial chunk in the final "parts created" log.
    """
    def _write_chunk(chunk, part_index):
        # One output file per chunk, named <stem>_part<N>.fastq as before.
        output_file = output_dir / f"{fastq_file.stem}_part{part_index}.fastq"
        with open(output_file, "wt") as out_handle:
            SeqIO.write(chunk, out_handle, "fastq")
        logging.info(f"Created {output_file} with {len(chunk)} reads")

    try:
        with open(fastq_file, "rt") as handle:
            chunk = []
            file_count = 0
            for record in SeqIO.parse(handle, "fastq"):
                chunk.append(record)
                if len(chunk) == reads_per_file:
                    _write_chunk(chunk, file_count)
                    file_count += 1
                    chunk = []
            if chunk:
                # Trailing partial chunk (now also counted in the tally).
                _write_chunk(chunk, file_count)
                file_count += 1

        logging.info(f"Splitting complete for {fastq_file}. {file_count} parts created.")
    except Exception as e:
        logging.error(f"Failed to split FASTQ file {fastq_file}: {str(e)}", exc_info=True)
        raise
|
|
102
|
+
|
|
103
|
+
def cat_fastq_files(folder_path: str, output_path: str, reads_per_file: int = 4000):
    """
    Collect every .fastq / .fastq.gz file under *folder_path* into *output_path*.

    A single plain .fastq file is split into chunks of *reads_per_file* reads
    each; otherwise every file is copied verbatim. Anything inside a
    "fastq_fail" folder is skipped.

    Parameters:
    - folder_path (str): directory containing .fastq and .fastq.gz files.
    - output_path (str): directory where the files are copied or split to.
    - reads_per_file (int): reads per output file when splitting.

    Returns:
    - str: the path of the output directory.

    Raises:
    - ValueError: if folder_path is not a directory or contains no FASTQ files.
    - Exception: for any other error during copying or splitting.
    """
    try:
        src_dir = Path(folder_path)
        dst_dir = Path(output_path)

        if not src_dir.is_dir():
            raise ValueError(f"The provided path {src_dir} is not a valid directory")

        if not dst_dir.exists():
            dst_dir.mkdir(parents=True, exist_ok=True)

        # Gather candidate files, skipping failed-read folders.
        fastq_files = [
            Path(root) / fname
            for root, _dirs, files in os.walk(src_dir)
            if "fastq_fail" not in root
            for fname in files
            if fname.endswith((".fastq", ".fastq.gz"))
        ]

        if not fastq_files:
            raise ValueError(f"No FASTQ files found in {src_dir}")

        if len(fastq_files) == 1 and fastq_files[0].suffix == '.fastq':
            # A single uncompressed file is chunked for downstream processing.
            logging.info(f"Splitting single FASTQ file into {reads_per_file} reads per file")
            split_fastq_file(fastq_files[0], dst_dir, reads_per_file)
        else:
            for src in fastq_files:
                dst = dst_dir / src.name
                shutil.copy(src, dst)
                logging.info(f"Copied {src} to {dst}")

        logging.info(f"All FASTQ files processed successfully to {dst_dir}")
        return str(dst_dir)

    except Exception as e:
        logging.error(f"Failed to copy or split fastq files. An error occurred: {str(e)}", exc_info=True)
        raise
|
|
159
|
+
|
|
160
|
+
# Create result folder
def create_result_folder(cl_args: dict) -> str:
    """Create (if needed) and return the run's result folder.

    Parameters:
    - cl_args (dict): must contain "name"; may contain "output" (parent dir).

    Returns:
    - str: path of the created result folder.

    Raises:
    - ValueError: if the "name" key is missing or empty.
    """
    folder_name = cl_args.get("name")
    if not folder_name:
        raise ValueError("The 'name' key is required in cl_args")
    # Bug fix: use `or` so an explicit "output": None (the argparse default)
    # falls back to the current directory instead of crashing in Path(None).
    output_path = cl_args.get("output") or os.getcwd()
    result_folder = Path(output_path) / folder_name
    # Create the directory if it doesn't exist
    result_folder.mkdir(parents=True, exist_ok=True)
    return str(result_folder)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# Return and create filtered barcodes
def filter_bc(cl_args: dict, name_folder: Path, i: int) -> Path:
    """Write a barcode FASTA filtered to the plate of summary row *i*.

    Looks up the packaged master barcode file, keeps only NB front barcodes
    in the allowed range plus the single RB back barcode for this row, and
    writes the result into *name_folder*.

    Returns the filtered FASTA path (an os.path.join string, despite the
    ``Path`` annotation).
    """
    front_min, front_max, rbc = barcode_user(cl_args, i)

    # Locate the packaged master barcode file; fall back to a path relative
    # to this module if importlib.resources cannot resolve it.
    try:
        with resources.path('levseq.barcoding', 'minion_barcodes.fasta') as barcode_path:
            barcode_path = Path(barcode_path)
    except ImportError:
        package_root = Path(__file__).resolve().parent.parent
        barcode_path = package_root / "levseq" / "barcoding" / "minion_barcodes.fasta"

    # Ensure the barcode file exists
    if not barcode_path.exists():
        raise FileNotFoundError(f"Barcode file not found: {barcode_path}")

    barcode_path_filter = os.path.join(name_folder, "levseq_barcodes_filtered.fasta")

    # "NB" = front barcode prefix, "RB" = reverse/back barcode prefix.
    filter_barcodes(
        str(barcode_path),
        str(barcode_path_filter),
        (front_min, front_max),
        rbc,
        "NB",
        "RB",
    )

    return barcode_path_filter
|
|
201
|
+
|
|
202
|
+
# Filter barcodes
def filter_barcodes(
    input_fasta, output_fasta, barcode_range, rbc, front_prefix, back_prefix
):
    """Copy records from *input_fasta* to *output_fasta*, keeping front
    barcodes whose numeric suffix lies in *barcode_range* and the single
    back barcode numbered *rbc*."""
    front_min, front_max = barcode_range

    def _keep(rid):
        # Front barcodes pass when their number is in range; back barcodes
        # only when they match the requested reverse plate.
        return (
            rid.startswith(front_prefix)
            and front_min <= int(rid[len(front_prefix):]) <= front_max
        ) or (
            rid.startswith(back_prefix)
            and int(rid[len(back_prefix):]) == rbc
        )

    kept = [rec for rec in SeqIO.parse(input_fasta, "fasta") if _keep(rec.id)]

    with open(output_fasta, "w") as output_handle:
        SeqIO.write(kept, output_handle, "fasta")
|
|
221
|
+
|
|
222
|
+
# Demultiplex fastq reads into plate and wells format
def demux_fastq(file_to_fastq, result_folder, barcode_path):
    """Run the bundled demultiplexing binary on basecalled reads.

    Parameters:
    - file_to_fastq: directory of FASTQ files to demultiplex.
    - result_folder: destination folder for demultiplexed output.
    - barcode_path: path to the filtered barcode FASTA.

    Raises:
    - ValueError: on an unsupported CPU architecture.
    - FileNotFoundError: if the bundled executable is missing.
    - subprocess.CalledProcessError: if the demultiplexer exits non-zero.
    """
    # Determine the system architecture
    system_architecture = platform.machine().lower()

    # Choose the appropriate executable based on the architecture
    if system_architecture == 'arm64':
        executable_name = "demultiplex-arm64"
    elif system_architecture == 'aarch64':
        executable_name = "demultiplex"
    elif system_architecture == 'x86_64':
        executable_name = "demultiplex-x86"
    else:
        raise ValueError(f"Unsupported architecture: {system_architecture}")

    # Use importlib.resources to get the path to the executable
    try:
        with resources.path('levseq.barcoding', executable_name) as executable_path:
            executable_path = Path(executable_path)
    except ImportError:
        # Fallback method if the above fails
        package_root = Path(__file__).resolve().parent.parent
        executable_path = package_root / "levseq" / "barcoding" / executable_name

    # Ensure the executable exists
    if not executable_path.exists():
        raise FileNotFoundError(f"Executable not found: {executable_path}")

    # Get min and max sequence length if user specified, otherwise use default
    seq_min = 800
    seq_max = 5000

    # Bug fix: pass an argument list with shell=False instead of an
    # interpolated shell string, so paths containing spaces or shell
    # metacharacters cannot break (or inject into) the command.
    cmd = [
        str(executable_path),
        "-f", str(file_to_fastq),
        "-d", str(result_folder),
        "-b", str(barcode_path),
        "-w", "100",
        "-r", "100",
        "-m", str(seq_min),
        "-x", str(seq_max),
    ]
    subprocess.run(cmd, check=True)
|
|
257
|
+
|
|
258
|
+
# Variant calling using VariantCaller class and generate dataframe
def call_variant(experiment_name, experiment_folder, template_fasta, filtered_barcodes):
    """Run consensus variant calling for one plate and return the dataframe.

    Builds a VariantCaller over the demultiplexed reads (no padding) and
    extracts variants at threshold 0.5 with a minimum depth of 5 reads.
    Logs and re-raises on any failure.
    """
    try:
        caller = VariantCaller(
            experiment_name,
            experiment_folder,
            template_fasta,
            filtered_barcodes,
            padding_start=0,
            padding_end=0,
        )
        result_df = caller.get_variant_df(threshold=0.5, min_depth=5)
        logging.info("Variant calling to create consensus reads successful")
        return result_df
    except Exception as e:
        logging.error("Variant calling failed", exc_info=True)
        raise
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
# Saving heatmaps and csv in the results folder
def save_platemap_to_file(heatmaps, outputdir, name, show_msa):
    """Persist plate-map heatmaps under <outputdir>/Platemaps/<name>.

    With show_msa the object is saved as a standalone HTML document via its
    own save method; otherwise the bokeh renderer writes it.
    """
    platemap_dir = os.path.join(outputdir, "Platemaps")
    os.makedirs(platemap_dir, exist_ok=True)
    file_path = os.path.join(platemap_dir, name)
    if show_msa:
        heatmaps.save(file_path + "_msa.html", embed=True)
    else:
        hv.renderer("bokeh").save(heatmaps, file_path)
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def save_csv(df, outputdir, name):
    """Write *df* to <outputdir>/Results/<name>.csv, creating the folder."""
    results_dir = os.path.join(outputdir, "Results")
    os.makedirs(results_dir, exist_ok=True)
    df.to_csv(os.path.join(results_dir, name + ".csv"))
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
# Generate dataframe for visualization
def create_df_v(variants_df):
    """Prepare variant data for visualization and reporting.

    Takes the raw variant dataframe (columns include "Variant", "refseq",
    "Well", "name", "Alignment Count", "Average mutation frequency") and
    returns two dataframes: a restructured report view with a fixed column
    order, and the full working dataframe with Row/Column/Plate and
    nc/aa-variant columns added.
    """
    # Make copy of dataframe
    df_variants_ = variants_df.copy()

    # Fill in empty cells
    # NOTE(review): the .tolist() result is discarded — this line is a no-op.
    df_variants_["Variant"].tolist()
    df_variants_["Variant"] = df_variants_["Variant"].replace(np.nan, "", regex=True)

    # Create nc_variant column
    df_variants_["nc_variant"] = df_variants_.apply(
        lambda row: create_nc_variant(row["Variant"], row["refseq"]), axis=1
    )

    # Translate nc_variant to aa_variant
    df_variants_["aa_variant"] = df_variants_["nc_variant"].apply(
        lambda x: "Deletion" if x == "Deletion" else translate(x)
    )
    # Fill in 'Deletion' in 'aa_variant' column
    df_variants_.loc[
        df_variants_["nc_variant"] == "Deletion", "aa_variant"
    ] = "Deletion"

    # Compare aa_variant with translated refseq and generate mutations column
    df_variants_["Mutations"] = df_variants_.apply(get_mutations, axis=1)

    # Fill in empty empty values
    df_variants_["Alignment Probability"] = df_variants_[
        "Average mutation frequency"
    ].fillna(0.0)
    df_variants_["Alignment Count"] = df_variants_["Alignment Count"].fillna(0.0)

    # Fill in Deletion into mutations Column
    # NOTE(review): this iterates index *labels* but indexes with .iloc/.iat
    # (positional). After pd.concat without ignore_index the labels are not
    # 0..n-1, so rows can be misaddressed — confirm the caller resets the index.
    for i in df_variants_.index:
        if df_variants_["nc_variant"].iloc[i] == "Deletion":
            df_variants_.Mutations.iat[i] = df_variants_.Mutations.iat[i].replace(
                "", "-"
            )

    # Add row and columns
    # Split each well ID like "A12" into plate row letter and column number.
    Well = df_variants_["Well"].tolist()
    row = []
    column = []
    for well in Well:
        if len(well) >= 2:
            row.append(well[0])
            if well[1:].isdigit():
                column.append(well[1:])
            else:
                column.append("")
        else:
            row.append("")
            column.append("")

    df_variants_["Row"] = row
    df_variants_["Column"] = column
    df_variants_["Plate"] = df_variants_["name"].astype(str)

    # Update 'Plate' column from '1'-'9' to '01'-'09'
    df_variants_["Plate"] = df_variants_["Plate"].apply(
        lambda x: f"0{x}" if len(x) == 1 else x
    )
    # Select the desired columns in the desired order
    # NOTE(review): this is a slice of df_variants_; the .loc writes below
    # may trigger SettingWithCopyWarning — consider .copy().
    restructured_df = df_variants_[
        [
            "barcode_plate",
            "Plate",
            "Well",
            "Variant",
            "Alignment Count",
            "Average mutation frequency",
            "P value",
            "P adj. value",
            "Mutations",
            "nc_variant",
            "aa_variant",
        ]
    ]
    # Set 'Mutations' and 'Variant' columns to '#N.A.#' if 'Alignment Count' is smaller than 5
    # NOTE(review): the comment says "smaller than 5" but the code uses < 6
    # (i.e. counts up to 5 inclusive) — confirm which threshold is intended.
    restructured_df.loc[
        restructured_df["Alignment Count"] < 6, ["Mutations", "Variant"]
    ] = "#N.A.#"
    df_variants_.loc[
        df_variants_["Alignment Count"] < 6, ["Mutations", "Variant"]
    ] = "#N.A.#"
    # Parent sequences get a fixed alignment probability of 1.0.
    restructured_df.loc[
        restructured_df["Mutations"] == "#PARENT#", ["Alignment Probability"]
    ] = 1.0
    df_variants_.loc[
        df_variants_["Mutations"] == "#PARENT#", ["Alignment Probability"]
    ] = 1.0

    return restructured_df, df_variants_
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def create_nc_variant(variant, refseq):
    """Reconstruct the nucleotide variant sequence from a mutation string.

    *variant* is either empty/NaN or "#PARENT#" (the reference is returned
    unchanged), contains "DEL" (returns "Deletion"), or is an
    underscore-separated list of substitutions like "A123T" applied to
    *refseq* at 1-based positions.
    """
    if isinstance(variant, np.ndarray):
        variant = variant.tolist()

    # Missing or parent calls keep the reference sequence as-is.
    if variant == "" or pd.isnull(variant) or variant == "#PARENT#":
        return refseq
    if "DEL" in variant:
        return "Deletion"

    seq = list(refseq)
    for mutation in variant.split("_"):
        if len(mutation) < 2:
            continue
        pos = int(re.findall(r"\d+", mutation)[0]) - 1
        ref_base, new_base = mutation[0], mutation[-1]
        # Apply only when the reference base matches, guarding bad positions.
        if pos < len(seq) and seq[pos] == ref_base:
            seq[pos] = new_base
    return "".join(seq)
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def is_valid_dna_sequence(sequence):
    """Return True when *sequence* uses only A/T/G/C and encodes whole codons."""
    return len(sequence) % 3 == 0 and set(sequence) <= {"A", "T", "G", "C"}
|
|
414
|
+
|
|
415
|
+
def get_mutations(row):
    """Derive the amino-acid mutation string for one variant row.

    Expects the row to carry "refseq" (nucleotide template), "aa_variant"
    (translated variant or "Deletion") and "Alignment Count".

    Returns one of:
    - "" for deletions,
    - "#N.A.#" when no mutation is found but coverage is below 5 reads,
    - "#PARENT#" when the variant matches the template,
    - "LEN" when the translated lengths differ (indel/frameshift),
    - an underscore-joined list like "A12V_K40R" otherwise,
    - an explanatory message when refseq is not a valid DNA sequence.
    """
    try:
        refseq = row["refseq"]

        if not is_valid_dna_sequence(refseq):
            return "Invalid refseq provided, check template sequence. Only A, T, G, C and sequence dividable by 3 are accepted."

        # NOTE(review): `translate` comes from the levseq package namespace —
        # presumably codon-to-amino-acid translation; confirm its source.
        refseq_aa = translate(refseq)
        variant_aa = row["aa_variant"]
        alignment_count = row["Alignment Count"]

        if variant_aa == "Deletion":
            return ""
        else:
            mutations = []
            if len(refseq_aa) == len(variant_aa):
                # Position-by-position comparison; mutation positions are 1-based.
                for i in range(len(refseq_aa)):
                    if refseq_aa[i] != variant_aa[i]:
                        mutations.append(f"{refseq_aa[i]}{i+1}{variant_aa[i]}")
                if not mutations:
                    if alignment_count < 5:
                        return "#N.A.#"
                    else:
                        return "#PARENT#"
            else:
                # Length mismatch cannot be described site-by-site.
                return "LEN"
            # The `else ""` arm is unreachable here: an empty mutation list
            # already returned above.
            return "_".join(mutations) if mutations else ""

    except Exception as e:
        logging.error(
            "Translation to amino acids failed, check template sequence. Only A, T, G, C and sequence dividable by 3 are accepted.",
            exc_info=True,
        )
        raise
|
|
449
|
+
|
|
450
|
+
# Process the summary file
def process_ref_csv(cl_args):
    """Process the summary CSV row by row.

    For each plate row: writes a per-plate reference FASTA, filters the
    barcode file, collects basecalled reads, demultiplexes them (unless
    skipped) and runs variant calling (unless skipped). Results accumulate
    into <result_folder>/variants.csv.

    Returns the combined variant dataframe.
    """
    ref_df = pd.read_csv(cl_args["summary"])

    result_folder = create_result_folder(cl_args)

    # Resume from an existing variants.csv when present.
    variant_csv_path = os.path.join(result_folder, "variants.csv")
    if os.path.exists(variant_csv_path):
        variant_df = pd.read_csv(variant_csv_path)
    else:
        variant_df = pd.DataFrame(
            columns=["barcode_plate", "name", "refseq", "variant"]
        )
    for i, row in ref_df.iterrows():
        barcode_plate = row["barcode_plate"]
        name = row["name"]
        refseq = row["refseq"].upper()

        # Create a subfolder for the current iteration using the name value
        name_folder = os.path.join(result_folder, name)
        os.makedirs(name_folder, exist_ok=True)

        # Write the refseq to a temporary fasta file
        temp_fasta_path = os.path.join(name_folder, f"temp_{name}.fasta")
        with open(temp_fasta_path, "w") as f:
            f.write(f">{name}\n{refseq}\n")
        # Create filtered barcode path
        barcode_path = filter_bc(cl_args, name_folder, i)
        # Find fastq.gz files
        output_dir = Path(result_folder) / "basecalled_reads"
        output_dir.mkdir(parents=True, exist_ok=True)

        # NOTE(review): reads are re-collected on every summary row and the
        # returned path is unused (demux below reads output_dir directly) —
        # confirm this per-row repetition is intended.
        file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)

        if not cl_args["skip_demultiplexing"]:
            demux_fastq(output_dir, name_folder, barcode_path)
        if not cl_args["skip_variantcalling"]:
            variant_result = call_variant(
                f"{name}", name_folder, temp_fasta_path, barcode_path
            )
            # Attach the plate metadata before accumulating.
            variant_result["barcode_plate"] = barcode_plate
            variant_result["name"] = name
            variant_result["refseq"] = refseq

            variant_df = pd.concat([variant_df, variant_result])

    # Remove the temporary fasta file
    # os.remove(temp_fasta_path)
    variant_df.to_csv(variant_csv_path, index=False)
    return variant_df
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
# Run LevSeq
def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
    """Execute the full LevSeq pipeline for a run described by *cl_args*.

    Sets up per-run INFO and ERROR log files in the result folder, processes
    the summary CSV (demultiplex + variant calling), then writes the
    visualization CSV, plate-map heatmaps and the final results CSV.
    Logs and re-raises any failure.
    """
    # Create output folder
    result_folder = create_result_folder(cl_args)

    # Configure logging to save in the output directory
    log_format = "%(asctime)s:%(levelname)s:%(message)s"

    # INFO level logger
    info_handler = logging.FileHandler(os.path.join(result_folder, "LevSeq_run.log"))
    info_handler.setLevel(logging.INFO)
    info_handler.setFormatter(logging.Formatter(log_format))

    # ERROR level logger
    error_handler = logging.FileHandler(os.path.join(result_folder, "LevSeq_error.log"))
    error_handler.setLevel(logging.ERROR)
    error_handler.setFormatter(logging.Formatter(log_format))

    # Configure the root logger
    logging.basicConfig(level=logging.INFO, handlers=[info_handler, error_handler])
    try:
        # Process summary file by row using demux, call_variant function
        variant_df = process_ref_csv(cl_args)

        # Prefer the on-disk variants.csv if one exists (resumed runs);
        # otherwise keep the in-memory result.
        variant_csv_path = os.path.join(result_folder, "variants.csv")
        if os.path.exists(variant_csv_path):
            variant_df = pd.read_csv(variant_csv_path)

        # Clean up and prepare dataframe for visualization. (The original
        # duplicated this call in both branches of the if/else above.)
        df_variants, df_vis = create_df_v(variant_df)

        processed_csv = os.path.join(result_folder, "visualization.csv")
        df_vis.to_csv(processed_csv, index=False)

        layout = generate_platemaps(
            max_combo_data=df_vis,
            result_folder=result_folder,
            show_msa=cl_args["show_msa"],
        )

        # Saving heatmap and csv
        save_platemap_to_file(
            heatmaps=layout,
            outputdir=result_folder,
            name=cl_args["name"],
            show_msa=cl_args["show_msa"],
        )
        save_csv(df_variants, result_folder, cl_args["name"])
        logging.info("Run successful, see visualization and results")
    except Exception as e:
        # Bug fix: corrected the misspelled "occured" in the user-facing log.
        logging.error(
            "An error occurred while executing LevSeq, check log file for detail",
            exc_info=True,
        )
        raise
|
levseq/screen.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Build an HPLC injection sheet from a LevSeq results CSV.

Filters out stop-codon/deletion variants, samples up to eight #PARENT#
wells per plate, prefixes sample names with their plate, and writes the
columns an autosampler expects. Edit file_path/output_path placeholders
before running.
"""
import pandas as pd
import numpy as np

# Load the data
file_path = 'path_to_your_csv_file.csv'
data = pd.read_csv(file_path)

# Filter out rows with '*' or '-' in the "Mutations" column
filtered_data = data[~data["Mutations"].str.contains(r"[*-]", na=False)]

# Rename columns
filtered_data = filtered_data.rename(columns={"Mutations": "Sample name", "Well": "Vial"})

# Group by plate and randomly select 8 'PARENT' rows per plate
parent_data = filtered_data[filtered_data['Sample name'] == '#PARENT#']
filtered_parent_data = parent_data.groupby('Plate').apply(lambda x: x.sample(min(8, len(x)))).reset_index(drop=True)

# Update the "Sample name" column to include plate information
filtered_parent_data['Sample name'] = filtered_parent_data['Plate'] + '_' + filtered_parent_data['Sample name']

# Combine the filtered parent data with the rest of the data
# Bug fix: take an explicit copy before assigning into the slice; writing
# into a boolean-indexed view raises SettingWithCopyWarning and the
# assignment is not guaranteed to stick.
non_parent_data = filtered_data[filtered_data['Sample name'] != '#PARENT#'].copy()
non_parent_data['Sample name'] = non_parent_data['Plate'] + '_' + non_parent_data['Sample name']

final_data = pd.concat([filtered_parent_data, non_parent_data])

# Add the new columns
final_data['Action'] = 'Inject'
final_data['Sample type'] = 'Sample'
final_data['Injection source'] = 'HipAls'

# Select relevant columns
final_result = final_data[['Sample name', 'Vial', 'Action', 'Sample type', 'Injection source']]

# Save the resulting DataFrame to a new CSV file
output_path = 'path_to_output_csv_file.csv'
final_result.to_csv(output_path, index=False)
|
|
38
|
+
|