levseq 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
levseq/parser.py ADDED
@@ -0,0 +1,82 @@
1
+ ###############################################################################
2
+ # #
3
+ # This program is free software: you can redistribute it and/or modify #
4
+ # it under the terms of the GNU General Public License as published by #
5
+ # the Free Software Foundation, either version 3 of the License, or #
6
+ # (at your option) any later version. #
7
+ # #
8
+ # This program is distributed in the hope that it will be useful, #
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
11
+ # GNU General Public License for more details. #
12
+ # #
13
+ # You should have received a copy of the GNU General Public License #
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>. #
15
+ # #
16
+ ###############################################################################
17
+
18
+ import argparse
19
+ from pathlib import Path
20
+
21
+
22
def create_parser():
    """Build and return the argparse parser for the levseq pipeline CLI.

    Returns:
        argparse.ArgumentParser: parser exposing --experiment_name, --ref,
        --output_path, --output_name, the three --skip_* flags, and
        --json_file.
    """
    parser = argparse.ArgumentParser(
        description='evSeq levseq pipeline. Enter the experiment name from your run'
    )

    # Required identifiers for the run.
    parser.add_argument(
        '--experiment_name', metavar='n', type=str, required=True,
        help='Name of experiment. The name must overlap with the name given for Sequencing')
    parser.add_argument(
        '--ref', metavar='r', type=Path, required=True,
        help='Path to reference sequence.')

    # Optional output locations.
    parser.add_argument(
        '--output_path', metavar='o', default=None, type=Path, required=False,
        help='Path to output folder. If not given, the output folder will be created in the current directory')
    parser.add_argument(
        "--output_name", metavar='on', type=str,
        help="Name of the output folder. If not given, the name will be the same as the experiment name")

    # Pipeline stage toggles (all default to False).
    parser.add_argument(
        "--skip_basecalling", action="store_true",
        help="Skip the basecalling step.")
    parser.add_argument(
        "--skip_demultiplex", action="store_true",
        help="Skip the demultiplexing step.")
    parser.add_argument(
        "--skip_consensus", action="store_true",
        help="Skip the consensus step.")

    # Optional configuration file.
    parser.add_argument(
        '--json_file', metavar='j', type=Path, required=False,
        help='Path to json file. If not given, default json file will be used')

    return parser
70
+
71
+
72
def check_parser(parser, args):
    """Check if the parser arguments are valid.

    Parameters:
        parser: the argparse.ArgumentParser, used to print help on failure.
        args: the parsed argument namespace.

    Returns:
        True if the arguments are valid; otherwise prints help plus an
        error message and exits with status 1.
    """
    # --experiment_name is declared required=True in create_parser, so
    # argparse normally rejects a missing value before this runs; this is a
    # defensive check for namespaces built by other means.
    if args.experiment_name is None:
        parser.print_help()
        # Bug fix: the original `assert "Please enter the experiment name"`
        # was a no-op (a non-empty string literal is always truthy), so the
        # message was never shown. Report it explicitly instead.
        print("Please enter the experiment name")
        exit(1)
    else:
        return True
levseq/run_levseq.py ADDED
@@ -0,0 +1,558 @@
1
+ ###############################################################################
2
+ # #
3
+ # This program is free software: you can redistribute it and/or modify #
4
+ # it under the terms of the GNU General Public License as published by #
5
+ # the Free Software Foundation, either version 3 of the License, or #
6
+ # (at your option) any later version. #
7
+ # #
8
+ # This program is distributed in the hope that it will be useful, #
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
11
+ # GNU General Public License for more details. #
12
+ # #
13
+ # You should have received a copy of the GNU General Public License #
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>. #
15
+ # #
16
+ ###############################################################################
17
+
18
# Import MinION objects
from levseq import *

# Import external packages
# Standard library
import gzip
import itertools
import logging
import os
import platform
import re
import shutil
import subprocess
from importlib import resources
from pathlib import Path

# Third-party
import numpy as np
import pandas as pd
import tqdm
from Bio import SeqIO
36
+
37
+ import panel as pn
38
+ import holoviews as hv
39
+ from holoviews.streams import Tap
40
+
41
# Notebook/visualization environment setup; these run as import-time side
# effects of this module.
# NOTE(review): `output_notebook` is not imported anywhere in this module's
# visible imports — presumably it is re-exported by `from levseq import *`
# (it is originally a bokeh.io function). Confirm; otherwise importing this
# module raises NameError.
output_notebook()

pn.extension()
# Pin Panel's comm protocol to the VS Code notebook transport.
pn.config.comms = "vscode"

# Use the bokeh backend for all holoviews renders in this module.
hv.extension("bokeh")
47
+
48
+
49
+ # Get barcode used
50
def barcode_user(cl_args, i):
    """Look up the reverse barcode plate for row *i* of the summary CSV.

    Parameters:
        cl_args (dict): command-line arguments; must contain key "summary",
            the path to a CSV with a "barcode_plate" column.
        i (int): row index into the summary CSV.

    Returns:
        tuple[int, int, int]: (front_min, front_max, reverse_barcode) where
        the front-barcode range defaults to 1-96 (a full 96-well plate).

    Raises:
        Exception: re-raises anything that fails while reading the CSV,
        after logging it.
    """
    try:
        # Default front-barcode range if the user did not provide one.
        fmin = 1
        fmax = 96
        bc_df = pd.read_csv(cl_args["summary"])
        rbc = bc_df["barcode_plate"][i]
        logging.info(f"Demultiplex executed successfully for index {i}.")

        return int(fmin), int(fmax), int(rbc)

    except Exception as e:
        # Bug fix: the original message lacked the f-prefix, so the literal
        # text "{i}" was logged instead of the actual index.
        logging.error(f"Demultiplex failed to execute for index {i}.", exc_info=True)
        raise
64
+
65
+ # Split fastq file into 4000 reads per chunk
66
def split_fastq_file(fastq_file: Path, output_dir: Path, reads_per_file: int):
    """
    Splits a FASTQ file into multiple files, each containing up to a specified number of reads.

    Parameters:
    - fastq_file (Path): The input FASTQ file to be split.
    - output_dir (Path): The directory where the split files will be saved.
    - reads_per_file (int): The number of reads per output file.

    Raises:
    - Exception: re-raises any parse/write failure after logging it.
    """
    try:
        with open(fastq_file, "rt") as handle:
            record_iter = SeqIO.parse(handle, "fastq")
            file_count = 0
            while True:
                # Take the next chunk of up to reads_per_file records.
                # (Replaces the original duplicated write-on-StopIteration
                # logic, which also failed to count the final partial chunk
                # in file_count, undercounting parts in the summary log.)
                chunk = list(itertools.islice(record_iter, reads_per_file))
                if not chunk:
                    break
                output_file = output_dir / f"{fastq_file.stem}_part{file_count}.fastq"
                with open(output_file, "wt") as out_handle:
                    SeqIO.write(chunk, out_handle, "fastq")
                logging.info(f"Created {output_file} with {len(chunk)} reads")
                file_count += 1

        logging.info(f"Splitting complete for {fastq_file}. {file_count} parts created.")
    except Exception as e:
        logging.error(f"Failed to split FASTQ file {fastq_file}: {str(e)}", exc_info=True)
        raise
102
+
103
def cat_fastq_files(folder_path: str, output_path: str, reads_per_file: int = 4000):
    """
    Copies all .fastq and .fastq.gz files from the provided folder_path to the output_path.
    If there's only one .fastq file, it will be split into smaller files of a specified number
    of reads each.

    Parameters:
    - folder_path (str): The path to the directory containing .fastq and .fastq.gz files.
    - output_path (str): The path to the directory where the files should be copied or split.
    - reads_per_file (int): The number of reads per output file when splitting a FASTQ file.

    Returns:
    - str: The path to the output directory where the files were copied.

    Raises:
    - ValueError: If the folder_path is not a valid directory or if no .fastq/.fastq.gz files are found.
    - Exception: For any other errors that occur during file copying or splitting.
    """
    try:
        folder_path = Path(folder_path)
        output_path = Path(output_path)

        if not folder_path.is_dir():
            raise ValueError(f"The provided path {folder_path} is not a valid directory")
        if not output_path.exists():
            output_path.mkdir(parents=True, exist_ok=True)

        # Collect every fastq/fastq.gz, skipping anything under a
        # "fastq_fail" directory (failed-QC reads from the sequencer).
        fastq_files = [
            Path(root) / fname
            for root, _dirs, files in os.walk(folder_path)
            if "fastq_fail" not in root
            for fname in files
            if fname.endswith((".fastq", ".fastq.gz"))
        ]
        if not fastq_files:
            raise ValueError(f"No FASTQ files found in {folder_path}")

        if len(fastq_files) == 1 and fastq_files[0].suffix == '.fastq':
            # A single uncompressed file is split into fixed-size chunks so
            # downstream steps can parallelize.
            logging.info(f"Splitting single FASTQ file into {reads_per_file} reads per file")
            split_fastq_file(fastq_files[0], output_path, reads_per_file)
        else:
            # Multiple files (or a single .gz): copy them over unchanged.
            for fastq_file in fastq_files:
                destination = output_path / fastq_file.name
                shutil.copy(fastq_file, destination)
                logging.info(f"Copied {fastq_file} to {destination}")

        logging.info(f"All FASTQ files processed successfully to {output_path}")
        return str(output_path)

    except Exception as e:
        logging.error(f"Failed to copy or split fastq files. An error occurred: {str(e)}", exc_info=True)
        raise
159
+
160
+ # Create result folder
161
def create_result_folder(cl_args: dict) -> str:
    """Create (if needed) the experiment result folder and return its path.

    Parameters:
        cl_args (dict): must contain "name" (experiment/folder name);
            "output" optionally overrides the parent directory (defaults
            to the current working directory).

    Returns:
        str: path to the (now existing) result folder.

    Raises:
        ValueError: if "name" is missing or empty.
    """
    experiment = cl_args.get("name")
    if not experiment:
        raise ValueError("The 'name' key is required in cl_args")
    parent = cl_args.get("output", os.getcwd())
    target = Path(parent) / experiment
    target.mkdir(parents=True, exist_ok=True)
    return str(target)
170
+
171
+
172
+ # Return and create filtered barcodes
173
def filter_bc(cl_args: dict, name_folder: Path, i: int) -> Path:
    """Write a filtered barcode fasta for summary row *i* and return its path.

    Looks up the packaged minion_barcodes.fasta, keeps only the front
    barcodes in the default range plus the single reverse barcode for this
    row, and writes the result into name_folder.
    """
    front_min, front_max, rbc = barcode_user(cl_args, i)

    # Resolve the packaged barcode fasta; fall back to a path relative to
    # this file when importlib.resources cannot resolve the package.
    try:
        with resources.path('levseq.barcoding', 'minion_barcodes.fasta') as packaged:
            barcode_path = Path(packaged)
    except ImportError:
        package_root = Path(__file__).resolve().parent.parent
        barcode_path = package_root / "levseq" / "barcoding" / "minion_barcodes.fasta"

    if not barcode_path.exists():
        raise FileNotFoundError(f"Barcode file not found: {barcode_path}")

    barcode_path_filter = os.path.join(name_folder, "levseq_barcodes_filtered.fasta")

    # "NB" = forward (front) barcode prefix, "RB" = reverse (back) prefix.
    filter_barcodes(
        str(barcode_path),
        str(barcode_path_filter),
        (front_min, front_max),
        rbc,
        "NB",
        "RB",
    )

    return barcode_path_filter
201
+
202
+ # Filter barcodes
203
def filter_barcodes(
    input_fasta, output_fasta, barcode_range, rbc, front_prefix, back_prefix
):
    """Filter a barcode fasta down to a front-barcode range plus one back barcode.

    Keeps records whose id is front_prefix + N with N inside barcode_range
    (inclusive), or back_prefix + N with N equal to rbc, and writes them to
    output_fasta.
    """
    lo, hi = barcode_range

    def _wanted(record_id):
        # Front barcodes: numeric suffix must fall inside [lo, hi].
        if record_id.startswith(front_prefix) and lo <= int(record_id[len(front_prefix):]) <= hi:
            return True
        # Back barcodes: numeric suffix must match the single plate barcode.
        return record_id.startswith(back_prefix) and int(record_id[len(back_prefix):]) == rbc

    kept = [rec for rec in SeqIO.parse(input_fasta, "fasta") if _wanted(rec.id)]

    with open(output_fasta, "w") as output_handle:
        SeqIO.write(kept, output_handle, "fasta")
221
+
222
+ # Demultiplex fastq reads into plate and wells format
223
def demux_fastq(file_to_fastq, result_folder, barcode_path):
    """Demultiplex fastq reads into plate/well folders via the bundled binary.

    Parameters:
        file_to_fastq: directory containing the basecalled fastq files.
        result_folder: destination directory for demultiplexed output.
        barcode_path: path to the filtered barcode fasta.

    Raises:
        ValueError: for an unsupported CPU architecture.
        FileNotFoundError: if the bundled executable is missing.
        subprocess.CalledProcessError: if the demultiplexer exits non-zero.
    """
    # Determine the system architecture.
    system_architecture = platform.machine().lower()

    # Choose the appropriate bundled executable for the architecture.
    if system_architecture == 'arm64':
        executable_name = "demultiplex-arm64"
    elif system_architecture == 'aarch64':
        executable_name = "demultiplex"
    elif system_architecture == 'x86_64':
        executable_name = "demultiplex-x86"
    else:
        raise ValueError(f"Unsupported architecture: {system_architecture}")

    # Use importlib.resources to get the path to the executable.
    try:
        with resources.path('levseq.barcoding', executable_name) as executable_path:
            executable_path = Path(executable_path)
    except ImportError:
        # Fallback method if the above fails.
        package_root = Path(__file__).resolve().parent.parent
        executable_path = package_root / "levseq" / "barcoding" / executable_name

    # Ensure the executable exists.
    if not executable_path.exists():
        raise FileNotFoundError(f"Executable not found: {executable_path}")

    # Default read-length window accepted by the demultiplexer.
    seq_min = 800
    seq_max = 5000

    # Security/robustness fix: the original built a shell string with
    # f-string interpolation and ran it with shell=True, which breaks on
    # paths containing spaces and allows shell injection through the
    # experiment/summary-derived paths. Pass an argument list instead.
    cmd = [
        str(executable_path),
        "-f", str(file_to_fastq),
        "-d", str(result_folder),
        "-b", str(barcode_path),
        "-w", "100",
        "-r", "100",
        "-m", str(seq_min),
        "-x", str(seq_max),
    ]
    subprocess.run(cmd, check=True)
257
+
258
+ # Variant calling using VariantCaller class and generate dataframe
259
def call_variant(experiment_name, experiment_folder, template_fasta, filtered_barcodes):
    """Run VariantCaller over an experiment and return its variant dataframe.

    Parameters:
        experiment_name: label for this run.
        experiment_folder: folder holding demultiplexed reads.
        template_fasta: reference/template fasta path.
        filtered_barcodes: path to the filtered barcode fasta.

    Returns:
        The dataframe produced by VariantCaller.get_variant_df.
    """
    try:
        caller = VariantCaller(
            experiment_name,
            experiment_folder,
            template_fasta,
            filtered_barcodes,
            padding_start=0,
            padding_end=0,
        )
        # Thresholds: call a variant at >=0.5 frequency with >=5 reads depth.
        variant_df = caller.get_variant_df(threshold=0.5, min_depth=5)
        logging.info("Variant calling to create consensus reads successful")
    except Exception:
        logging.error("Variant calling failed", exc_info=True)
        raise
    return variant_df
275
+
276
+
277
+ # Saving heatmaps and csv in the results folder
278
def save_platemap_to_file(heatmaps, outputdir, name, show_msa):
    """Save the plate-map visualization under <outputdir>/Platemaps.

    With show_msa the layout saves itself as an embedded HTML file named
    <name>_msa.html; otherwise the holoviews bokeh renderer writes it.
    """
    platemap_dir = os.path.join(outputdir, "Platemaps")
    if not os.path.exists(platemap_dir):
        os.makedirs(platemap_dir)
    file_path = os.path.join(platemap_dir, name)
    if show_msa:
        heatmaps.save(file_path + "_msa.html", embed=True)
    else:
        hv.renderer("bokeh").save(heatmaps, file_path)
286
+
287
+
288
def save_csv(df, outputdir, name):
    """Write df to <outputdir>/Results/<name>.csv, creating Results if needed."""
    results_dir = os.path.join(outputdir, "Results")
    os.makedirs(results_dir, exist_ok=True)
    df.to_csv(os.path.join(results_dir, name + ".csv"))
293
+
294
+
295
+ # Generate dataframe for visualization
296
def create_df_v(variants_df):
    """Restructure a raw variant dataframe for visualization and reporting.

    Parameters:
        variants_df (pd.DataFrame): variant calls; the code reads columns
            "Variant", "refseq", "Well", "name", "barcode_plate",
            "Alignment Count", "Average mutation frequency", "P value",
            "P adj. value".

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: (restructured_df, df_variants_)
        where restructured_df is the column subset written to the results
        CSV and df_variants_ is the full working copy (with Row/Column/
        Plate added) used for plotting.
    """
    # Make copy of dataframe
    df_variants_ = variants_df.copy()

    # Fill in empty cells
    # NOTE(review): the .tolist() result below is discarded — likely a
    # leftover from debugging.
    df_variants_["Variant"].tolist()
    df_variants_["Variant"] = df_variants_["Variant"].replace(np.nan, "", regex=True)

    # Create nc_variant column
    df_variants_["nc_variant"] = df_variants_.apply(
        lambda row: create_nc_variant(row["Variant"], row["refseq"]), axis=1
    )

    # Translate nc_variant to aa_variant
    df_variants_["aa_variant"] = df_variants_["nc_variant"].apply(
        lambda x: "Deletion" if x == "Deletion" else translate(x)
    )
    # Fill in 'Deletion' in 'aa_variant' column
    # (redundant with the lambda above, but harmless)
    df_variants_.loc[
        df_variants_["nc_variant"] == "Deletion", "aa_variant"
    ] = "Deletion"

    # Compare aa_variant with translated refseq and generate mutations column
    df_variants_["Mutations"] = df_variants_.apply(get_mutations, axis=1)

    # Fill in empty empty values
    df_variants_["Alignment Probability"] = df_variants_[
        "Average mutation frequency"
    ].fillna(0.0)
    df_variants_["Alignment Count"] = df_variants_["Alignment Count"].fillna(0.0)

    # Fill in Deletion into mutations Column
    # NOTE(review): str.replace("", "-") turns the empty string into "-",
    # but for any NON-empty Mutations value it inserts "-" between every
    # character; deletion rows carry "" from get_mutations so the intended
    # case works — confirm no other value reaches this branch.
    for i in df_variants_.index:
        if df_variants_["nc_variant"].iloc[i] == "Deletion":
            df_variants_.Mutations.iat[i] = df_variants_.Mutations.iat[i].replace(
                "", "-"
            )

    # Add row and columns
    # Split each well label like "A01" into Row="A", Column="01"; anything
    # malformed gets empty strings.
    Well = df_variants_["Well"].tolist()
    row = []
    column = []
    for well in Well:
        if len(well) >= 2:
            row.append(well[0])
            if well[1:].isdigit():
                column.append(well[1:])
            else:
                column.append("")
        else:
            row.append("")
            column.append("")

    df_variants_["Row"] = row
    df_variants_["Column"] = column
    df_variants_["Plate"] = df_variants_["name"].astype(str)

    # Update 'Plate' column from '1'-'9' to '01'-'09'
    df_variants_["Plate"] = df_variants_["Plate"].apply(
        lambda x: f"0{x}" if len(x) == 1 else x
    )
    # Select the desired columns in the desired order
    # NOTE(review): this is a slice of df_variants_; the .loc assignments
    # below on it trigger pandas SettingWithCopyWarning.
    restructured_df = df_variants_[
        [
            "barcode_plate",
            "Plate",
            "Well",
            "Variant",
            "Alignment Count",
            "Average mutation frequency",
            "P value",
            "P adj. value",
            "Mutations",
            "nc_variant",
            "aa_variant",
        ]
    ]
    # Set 'Mutations' and 'Variant' columns to '#N.A.#' if 'Alignment Count' is smaller than 5
    # NOTE(review): the code actually masks counts < 6 (i.e. <= 5), not
    # "smaller than 5" — confirm which threshold is intended.
    restructured_df.loc[
        restructured_df["Alignment Count"] < 6, ["Mutations", "Variant"]
    ] = "#N.A.#"
    df_variants_.loc[
        df_variants_["Alignment Count"] < 6, ["Mutations", "Variant"]
    ] = "#N.A.#"
    # Parent (unmutated) wells get probability 1.0 by definition. Note this
    # also ADDS "Alignment Probability" to restructured_df, which was not in
    # the selected column list above.
    restructured_df.loc[
        restructured_df["Mutations"] == "#PARENT#", ["Alignment Probability"]
    ] = 1.0
    df_variants_.loc[
        df_variants_["Mutations"] == "#PARENT#", ["Alignment Probability"]
    ] = 1.0

    return restructured_df, df_variants_
388
+
389
+
390
def create_nc_variant(variant, refseq):
    """Apply the nucleotide substitutions encoded in *variant* to *refseq*.

    A variant string is underscore-joined mutations like "A12T" (original
    base, 1-based position, new base). Empty/NaN/"#PARENT#" means the
    parent sequence; any variant containing "DEL" maps to "Deletion".
    """
    if isinstance(variant, np.ndarray):
        variant = variant.tolist()
    if variant == "" or pd.isnull(variant):
        return refseq
    if variant == "#PARENT#":
        return refseq
    if "DEL" in variant:
        return "Deletion"

    sequence = list(refseq)
    for mutation in variant.split("_"):
        if len(mutation) < 2:
            continue
        # 1-based position embedded in the mutation token.
        position = int(re.findall(r"\d+", mutation)[0]) - 1
        ref_base, new_base = mutation[0], mutation[-1]
        # Only apply when the reference base matches at that position.
        if position < len(sequence) and sequence[position] == ref_base:
            sequence[position] = new_base
    return "".join(sequence)
410
+
411
+
412
def is_valid_dna_sequence(sequence):
    """Return True iff *sequence* uses only A/T/G/C and its length is a multiple of 3."""
    if len(sequence) % 3 != 0:
        return False
    return set(sequence) <= set("ATGC")
414
+
415
def get_mutations(row):
    """Derive the amino-acid mutation string for one variant row.

    Parameters:
        row: mapping with "refseq" (template DNA), "aa_variant" (translated
            variant, or "Deletion") and "Alignment Count".

    Returns:
        str: underscore-joined mutations like "M1V_K3R"; "#PARENT#" or
        "#N.A.#" when unmutated (depending on read depth); "" for a
        deletion; "LEN" on a length mismatch; or an error message for an
        invalid template sequence.
    """
    try:
        refseq = row["refseq"]

        if not is_valid_dna_sequence(refseq):
            return "Invalid refseq provided, check template sequence. Only A, T, G, C and sequence dividable by 3 are accepted."

        refseq_aa = translate(refseq)
        variant_aa = row["aa_variant"]
        alignment_count = row["Alignment Count"]

        if variant_aa == "Deletion":
            return ""
        if len(refseq_aa) != len(variant_aa):
            # Translated lengths differ: cannot align position by position.
            return "LEN"

        mutations = [
            f"{ref_aa}{pos + 1}{var_aa}"
            for pos, (ref_aa, var_aa) in enumerate(zip(refseq_aa, variant_aa))
            if ref_aa != var_aa
        ]
        if mutations:
            return "_".join(mutations)
        # No differences: parent sequence, unless coverage is too shallow.
        return "#N.A.#" if alignment_count < 5 else "#PARENT#"

    except Exception as e:
        logging.error(
            "Translation to amino acids failed, check template sequence. Only A, T, G, C and sequence dividable by 3 are accepted.",
            exc_info=True,
        )
        raise
449
+
450
+ # Process the summary file
451
def process_ref_csv(cl_args):
    """Process each row of the summary CSV: demultiplex and call variants.

    Parameters:
        cl_args (dict): command-line arguments; reads "summary" (CSV with
            barcode_plate/name/refseq columns), "path" (raw read folder),
            "skip_demultiplexing" and "skip_variantcalling" flags, plus the
            keys create_result_folder uses ("name", "output").

    Returns:
        pd.DataFrame: accumulated variant calls, also written to
        <result_folder>/variants.csv.
    """
    ref_df = pd.read_csv(cl_args["summary"])

    result_folder = create_result_folder(cl_args)

    # Resume support: start from an existing variants.csv if present.
    variant_csv_path = os.path.join(result_folder, "variants.csv")
    if os.path.exists(variant_csv_path):
        variant_df = pd.read_csv(variant_csv_path)
    else:
        variant_df = pd.DataFrame(
            columns=["barcode_plate", "name", "refseq", "variant"]
        )
    for i, row in ref_df.iterrows():
        barcode_plate = row["barcode_plate"]
        name = row["name"]
        refseq = row["refseq"].upper()

        # Create a subfolder for the current iteration using the name value
        name_folder = os.path.join(result_folder, name)
        os.makedirs(name_folder, exist_ok=True)

        # Write the refseq to a temporary fasta file
        temp_fasta_path = os.path.join(name_folder, f"temp_{name}.fasta")
        with open(temp_fasta_path, "w") as f:
            f.write(f">{name}\n{refseq}\n")
        # Create filtered barcode path
        barcode_path = filter_bc(cl_args, name_folder, i)
        # Find fastq.gz files
        output_dir = Path(result_folder) / "basecalled_reads"
        output_dir.mkdir(parents=True, exist_ok=True)

        # NOTE(review): this re-copies/splits the same raw reads on every
        # row of the summary CSV, and file_to_fastq itself is never used
        # afterward (demux_fastq below reads output_dir) — candidate for
        # hoisting out of the loop.
        file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)

        # NOTE(review): these keys differ from the CLI flag names in
        # parser.py (--skip_demultiplex / --skip_consensus); presumably the
        # caller remaps them — confirm.
        if not cl_args["skip_demultiplexing"]:
            demux_fastq(output_dir, name_folder, barcode_path)
        if not cl_args["skip_variantcalling"]:
            variant_result = call_variant(
                f"{name}", name_folder, temp_fasta_path, barcode_path
            )
            variant_result["barcode_plate"] = barcode_plate
            variant_result["name"] = name
            variant_result["refseq"] = refseq

            variant_df = pd.concat([variant_df, variant_result])

    # Remove the temporary fasta file
    # os.remove(temp_fasta_path)
    variant_df.to_csv(variant_csv_path, index=False)
    return variant_df
500
+
501
+
502
+ # Run LevSeq
503
def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
    """Top-level pipeline entry point: demultiplex, call variants, visualize.

    Parameters:
        cl_args (dict): command-line arguments; reads "name", "show_msa"
            and everything process_ref_csv/create_result_folder consume.
        tqdm_fn: progress-bar factory. NOTE(review): currently unused in
            this function body.

    Side effects:
        Creates the result folder, attaches INFO/ERROR file handlers to the
        root logger, and writes variants.csv, visualization.csv, plate-map
        files and the results CSV into the result folder.
    """
    # Create output folder
    result_folder = create_result_folder(cl_args)

    # Configure logging to save in the output directory
    log_format = "%(asctime)s:%(levelname)s:%(message)s"

    # INFO level logger
    info_handler = logging.FileHandler(os.path.join(result_folder, "LevSeq_run.log"))
    info_handler.setLevel(logging.INFO)
    info_handler.setFormatter(logging.Formatter(log_format))

    # ERROR level logger
    error_handler = logging.FileHandler(os.path.join(result_folder, "LevSeq_error.log"))
    error_handler.setLevel(logging.ERROR)
    error_handler.setFormatter(logging.Formatter(log_format))

    # Configure the root logger
    # NOTE(review): basicConfig is a no-op if the root logger is already
    # configured, and calling run_LevSeq twice in one process would not
    # swap handlers — confirm this is only invoked once per process.
    logging.basicConfig(level=logging.INFO, handlers=[info_handler, error_handler])
    try:
        # Process summary file by row using demux, call_variant function
        variant_df = process_ref_csv(cl_args)

        # Check if variants.csv already exist
        # (process_ref_csv writes variants.csv, so this branch normally
        # reloads the freshly written file.)
        variant_csv_path = os.path.join(result_folder, "variants.csv")
        if os.path.exists(variant_csv_path):
            variant_df = pd.read_csv(variant_csv_path)
            df_variants, df_vis = create_df_v(variant_df)
            # Clean up and prepare dataframe for visualization
        else:
            df_variants, df_vis = create_df_v(variant_df)

        processed_csv = os.path.join(result_folder, "visualization.csv")
        df_vis.to_csv(processed_csv, index=False)

        # Build the interactive plate-map layout.
        layout = generate_platemaps(
            max_combo_data=df_vis,
            result_folder=result_folder,
            show_msa=cl_args["show_msa"],
        )

        # Saving heatmap and csv
        save_platemap_to_file(
            heatmaps=layout,
            outputdir=result_folder,
            name=cl_args["name"],
            show_msa=cl_args["show_msa"],
        )
        save_csv(df_variants, result_folder, cl_args["name"])
        logging.info("Run successful, see visualization and results")
    except Exception as e:
        # NOTE(review): "occured" is a typo in this runtime log message
        # (should be "occurred") — left unchanged here.
        logging.error(
            "An error occured while executing LevSeq, check log file for detail",
            exc_info=True,
        )
        raise
levseq/screen.py ADDED
@@ -0,0 +1,38 @@
1
# Standalone script: turn a LevSeq results CSV into an injection worklist
# (sample name / vial / action columns) for an autosampler.
# NOTE(review): runs at import time with hard-coded placeholder paths —
# 'path_to_your_csv_file.csv' must be edited before use.
import pandas as pd
import numpy as np

# Load the data
file_path = 'path_to_your_csv_file.csv'
data = pd.read_csv(file_path)

# Filter out rows with '*' or '-' in the "Mutations" column
# (i.e. drop stop-codon and deletion variants).
filtered_data = data[~data["Mutations"].str.contains(r"[*-]", na=False)]

# Rename columns
filtered_data = filtered_data.rename(columns={"Mutations": "Sample name", "Well": "Vial"})

# Group by plate and randomly select 8 'PARENT' rows per plate
# (min(8, len) guards plates with fewer than 8 parent wells).
parent_data = filtered_data[filtered_data['Sample name'] == '#PARENT#']
filtered_parent_data = parent_data.groupby('Plate').apply(lambda x: x.sample(min(8, len(x)))).reset_index(drop=True)

# Update the "Sample name" column to include plate information
filtered_parent_data['Sample name'] = filtered_parent_data['Plate'] + '_' + filtered_parent_data['Sample name']

# Combine the filtered parent data with the rest of the data
# NOTE(review): non_parent_data is a slice of filtered_data; the assignment
# below triggers pandas SettingWithCopyWarning.
non_parent_data = filtered_data[filtered_data['Sample name'] != '#PARENT#']
non_parent_data['Sample name'] = non_parent_data['Plate'] + '_' + non_parent_data['Sample name']

final_data = pd.concat([filtered_parent_data, non_parent_data])

# Add the new columns (constant worklist fields for every sample).
final_data['Action'] = 'Inject'
final_data['Sample type'] = 'Sample'
final_data['Injection source'] = 'HipAls'

# Select relevant columns
final_result = final_data[['Sample name', 'Vial', 'Action', 'Sample type', 'Injection source']]

# Save the resulting DataFrame to a new CSV file
output_path = 'path_to_output_csv_file.csv'
final_result.to_csv(output_path, index=False)
38
+