levseq 1.3.2__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. {levseq-1.3.2/levseq.egg-info → levseq-1.4.0}/PKG-INFO +15 -2
  2. {levseq-1.3.2 → levseq-1.4.0}/README.md +15 -2
  3. {levseq-1.3.2 → levseq-1.4.0}/levseq/__init__.py +2 -2
  4. {levseq-1.3.2 → levseq-1.4.0}/levseq/barcoding/demultiplex-arm64 +0 -0
  5. levseq-1.4.0/levseq/barcoding/demultiplex-x86 +0 -0
  6. levseq-1.4.0/levseq/filter_orientation.py +115 -0
  7. {levseq-1.3.2 → levseq-1.4.0}/levseq/interface.py +3 -0
  8. {levseq-1.3.2 → levseq-1.4.0}/levseq/run_levseq.py +89 -8
  9. {levseq-1.3.2 → levseq-1.4.0}/levseq/utils.py +14 -13
  10. {levseq-1.3.2 → levseq-1.4.0}/levseq/variantcaller.py +58 -42
  11. {levseq-1.3.2 → levseq-1.4.0}/levseq/visualization.py +52 -0
  12. {levseq-1.3.2 → levseq-1.4.0/levseq.egg-info}/PKG-INFO +15 -2
  13. {levseq-1.3.2 → levseq-1.4.0}/levseq.egg-info/SOURCES.txt +1 -0
  14. {levseq-1.3.2 → levseq-1.4.0}/tests/test_deploy.py +26 -17
  15. {levseq-1.3.2 → levseq-1.4.0}/tests/test_opligopools.py +7 -33
  16. levseq-1.3.2/levseq/barcoding/demultiplex-x86 +0 -0
  17. {levseq-1.3.2 → levseq-1.4.0}/LICENSE +0 -0
  18. {levseq-1.3.2 → levseq-1.4.0}/MANIFEST.in +0 -0
  19. {levseq-1.3.2 → levseq-1.4.0}/levseq/IO_processor.py +0 -0
  20. {levseq-1.3.2 → levseq-1.4.0}/levseq/barcoding/__init__.py +0 -0
  21. {levseq-1.3.2 → levseq-1.4.0}/levseq/barcoding/demultiplex +0 -0
  22. {levseq-1.3.2 → levseq-1.4.0}/levseq/barcoding/minion_barcodes.fasta +0 -0
  23. {levseq-1.3.2 → levseq-1.4.0}/levseq/basecaller.py +0 -0
  24. {levseq-1.3.2 → levseq-1.4.0}/levseq/cmd.py +0 -0
  25. {levseq-1.3.2 → levseq-1.4.0}/levseq/coordinates.py +0 -0
  26. {levseq-1.3.2 → levseq-1.4.0}/levseq/globals.py +0 -0
  27. {levseq-1.3.2 → levseq-1.4.0}/levseq/parser.py +0 -0
  28. {levseq-1.3.2 → levseq-1.4.0}/levseq/screen.py +0 -0
  29. {levseq-1.3.2 → levseq-1.4.0}/levseq/seqfit.py +0 -0
  30. {levseq-1.3.2 → levseq-1.4.0}/levseq/simulation.py +0 -0
  31. {levseq-1.3.2 → levseq-1.4.0}/levseq/user.py +0 -0
  32. {levseq-1.3.2 → levseq-1.4.0}/levseq.egg-info/dependency_links.txt +0 -0
  33. {levseq-1.3.2 → levseq-1.4.0}/levseq.egg-info/entry_points.txt +0 -0
  34. {levseq-1.3.2 → levseq-1.4.0}/levseq.egg-info/requires.txt +0 -0
  35. {levseq-1.3.2 → levseq-1.4.0}/levseq.egg-info/top_level.txt +0 -0
  36. {levseq-1.3.2 → levseq-1.4.0}/setup.cfg +0 -0
  37. {levseq-1.3.2 → levseq-1.4.0}/setup.py +0 -0
  38. {levseq-1.3.2 → levseq-1.4.0}/tests/test_demultiplex_docker.py +0 -0
  39. {levseq-1.3.2 → levseq-1.4.0}/tests/test_seqfitvis.py +0 -0
  40. {levseq-1.3.2 → levseq-1.4.0}/tests/test_seqs.py +0 -0
  41. {levseq-1.3.2 → levseq-1.4.0}/tests/test_statistics.py +0 -0
  42. {levseq-1.3.2 → levseq-1.4.0}/tests/test_variant_calling.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: levseq
3
- Version: 1.3.2
3
+ Version: 1.4.0
4
4
  Home-page: https://github.com/fhalab/levseq/
5
5
  Author: Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy
6
6
  Author-email: ylong@caltech.edu
@@ -52,9 +52,13 @@ In directed evolution, sequencing every variant enhances data insight and create
52
52
  ![Figure 1: LevSeq Workflow](manuscript/figures/LevSeq_Figure-1.jpeg)
53
53
  Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
54
54
 
55
+ ## Website
56
+ A beta website is available [here](https://levseqdb.streamlit.app/): load your LevSeq output and your LCMS results directly to get visualisations and per-plate normalizations.
57
+
58
+ ## Data
55
59
 
56
60
  - Data to reproduce the results and to test are available on zenodo [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13694463.svg)](https://doi.org/10.5281/zenodo.13694463)
57
- - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://levseqdb.streamlit.app/) and code to host locally [here](https://github.com/fhalab/LevSeq_db)
61
+ - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://levseqdb.streamlit.app/) and code to host locally [here](https://github.com/fhalab/LevSeq_db)
58
62
 
59
63
  ## Setup
60
64
 
@@ -87,6 +91,10 @@ conda create --name levseq python=3.12 -y
87
91
  conda activate levseq
88
92
  ```
89
93
 
94
+ ```
95
+ pip install levseq
96
+ ```
97
+
90
98
  #### Dependencies
91
99
 
92
100
  1. Samtools: https://www.htslib.org/download/
@@ -100,6 +108,11 @@ conda install -c bioconda -c conda-forge samtools
100
108
  ```
101
109
  conda install -c bioconda -c conda-forge minimap2
102
110
  ```
111
+ 3. gcc 13 and 14 on Mac M1 through M4 chips
112
+ ```
113
+ brew install gcc@13
114
+ brew install gcc@14
115
+ ```
103
116
  ### Docker Installation (Recommended for full pipeline)
104
117
  For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
105
118
  operating system (https://docs.docker.com/engine/install/).
@@ -5,9 +5,13 @@ In directed evolution, sequencing every variant enhances data insight and create
5
5
  ![Figure 1: LevSeq Workflow](manuscript/figures/LevSeq_Figure-1.jpeg)
6
6
  Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
7
7
 
8
+ ## Website
9
+ A beta website is available [here](https://levseqdb.streamlit.app/): load your LevSeq output and your LCMS results directly to get visualisations and per-plate normalizations.
10
+
11
+ ## Data
8
12
 
9
13
  - Data to reproduce the results and to test are available on zenodo [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13694463.svg)](https://doi.org/10.5281/zenodo.13694463)
10
- - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://levseqdb.streamlit.app/) and code to host locally [here](https://github.com/fhalab/LevSeq_db)
14
+ - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://levseqdb.streamlit.app/) and code to host locally [here](https://github.com/fhalab/LevSeq_db)
11
15
 
12
16
  ## Setup
13
17
 
@@ -40,6 +44,10 @@ conda create --name levseq python=3.12 -y
40
44
  conda activate levseq
41
45
  ```
42
46
 
47
+ ```
48
+ pip install levseq
49
+ ```
50
+
43
51
  #### Dependencies
44
52
 
45
53
  1. Samtools: https://www.htslib.org/download/
@@ -53,6 +61,11 @@ conda install -c bioconda -c conda-forge samtools
53
61
  ```
54
62
  conda install -c bioconda -c conda-forge minimap2
55
63
  ```
64
+ 3. gcc 13 and 14 on Mac M1 through M4 chips
65
+ ```
66
+ brew install gcc@13
67
+ brew install gcc@14
68
+ ```
56
69
  ### Docker Installation (Recommended for full pipeline)
57
70
  For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
58
71
  operating system (https://docs.docker.com/engine/install/).
@@ -138,4 +151,4 @@ If you have found LevSeq useful, please cite our [paper](https://pubs.acs.org/do
138
151
 
139
152
  #### Contact
140
153
 
141
- Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
154
+ Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
@@ -18,7 +18,7 @@
18
18
  __title__ = 'levseq'
19
19
  __description__ = 'LevSeq nanopore sequencing'
20
20
  __url__ = 'https://github.com/fhalab/levseq/'
21
- __version__ = '1.3.2'
21
+ __version__ = '1.4.0'
22
22
  __author__ = 'Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy'
23
23
  __author_email__ = 'ylong@caltech.edu'
24
24
  __license__ = 'GPL3'
@@ -31,4 +31,4 @@ from levseq.cmd import *
31
31
  from levseq.utils import *
32
32
  from levseq.simulation import *
33
33
  from levseq.user import *
34
-
34
+ from levseq.filter_orientation import *
@@ -0,0 +1,115 @@
1
+ from Bio import SeqIO
2
+ from Bio.Seq import Seq
3
+ import os
4
+ from pathlib import Path
5
+ import logging
6
+ from Bio.Align import PairwiseAligner
7
+ import shutil
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ from tqdm import tqdm
10
+
11
def calculate_alignment_score(seq1, seq2):
    """Return the length-normalised global alignment score of two sequences.

    The score is computed with a ``PairwiseAligner`` in ``global`` mode and
    divided by the length of the longer sequence.

    Args:
        seq1: First sequence (anything with a length that the aligner accepts,
            e.g. ``str``).
        seq2: Second sequence, same requirements as ``seq1``.

    Returns:
        float: ``score / max(len(seq1), len(seq2))``, or ``0.0`` when either
        sequence is empty.
    """
    # An empty sequence has nothing to align, and two empty sequences would
    # make the normalisation below divide by zero.
    if not seq1 or not seq2:
        return 0.0

    aligner = PairwiseAligner()
    aligner.mode = 'global'
    # score() gives the optimal alignment score directly; only the score is
    # needed here, so the alignments themselves are never materialised.
    return aligner.score(seq1, seq2) / max(len(seq1), len(seq2))
17
+
18
def filter_single_file(args):
    """
    Filter a single fastq file by read orientation. Used for parallel processing.

    Each read is scored against the parent sequence and against its reverse
    complement. For a file whose name contains "forward" a read is kept when
    the reverse-complement score is strictly higher; for any other file a read
    is kept when the forward score is strictly higher. Ties are dropped.

    Args:
        args: tuple containing (input_file, parent_seq, parent_rev_comp)
    Returns:
        tuple: (file_path, total_reads, kept_reads, filtered_records) where
        file_path is ``str(input_file)``, total_reads and kept_reads are counts
        and filtered_records is the list of kept records.
    """
    input_file, parent_seq, parent_rev_comp = args

    # Convert the references once instead of once per read.
    forward_ref = str(parent_seq)
    reverse_ref = str(parent_rev_comp)

    # Look at the file name only: a parent directory that happens to contain
    # "forward" must not flip the orientation of every file underneath it.
    is_forward = "forward" in Path(input_file).name.lower()

    kept_reads = []
    total_reads = 0

    for record in SeqIO.parse(input_file, "fastq"):
        total_reads += 1
        seq = str(record.seq)

        forward_score = calculate_alignment_score(seq, forward_ref)
        reverse_score = calculate_alignment_score(seq, reverse_ref)

        if is_forward:
            # Forward file (plate barcode was rev comp): the read should align
            # to the reverse complement of the parent sequence.
            keep = reverse_score > forward_score
        else:
            # Reverse file (plate barcode was forward): the read was already
            # reverse complemented by the demultiplexer, so it should align to
            # the forward parent sequence.
            keep = forward_score > reverse_score

        if keep:
            kept_reads.append(record)

    return str(input_file), total_reads, len(kept_reads), kept_reads
54
+
55
def filter_demultiplexed_folder(experiment_folder, parent_sequence, num_threads=8):
    """
    Filter demultiplexed files using multiple threads.

    Every ``*.fastq`` file found under ``<experiment_folder>/RB*/NB*/`` is
    passed to ``filter_single_file`` and then overwritten in place with the
    reads that were kept.

    Args:
        experiment_folder (str): Path to experiment folder containing RBC/FBC structure
        parent_sequence (str): Parent sequence for alignment checking
        num_threads (int): Number of threads to use

    Returns:
        dict: maps each successfully processed file path to a dict with the
        keys ``'total'``, ``'kept'`` and ``'filtered'``. Files that failed are
        logged and left out. Empty when no fastq files are found.
    """
    exp_path = Path(experiment_folder)
    filtered_counts = {}

    # Collect all fastq files first so nothing else is done for an empty folder
    fastq_files = [
        fastq
        for rbc_dir in exp_path.glob("RB*") if rbc_dir.is_dir()
        for fbc_dir in rbc_dir.glob("NB*") if fbc_dir.is_dir()
        for fastq in fbc_dir.glob("*.fastq")
    ]

    if not fastq_files:
        logging.warning(f"No fastq files found in {experiment_folder}")
        return filtered_counts

    # Prepare parent sequences once
    parent_seq = Seq(parent_sequence)
    parent_rev_comp = parent_seq.reverse_complement()

    # Process files in parallel with progress bar
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Remember which file each future belongs to, so a failure can be
        # reported against the right file even when result() itself raises.
        future_to_path = {
            executor.submit(filter_single_file, (f, parent_seq, parent_rev_comp)): str(f)
            for f in fastq_files
        }

        with tqdm(total=len(fastq_files), desc="Filtering files") as pbar:
            for future in as_completed(future_to_path):
                source_path = future_to_path[future]
                temp_file = Path(source_path).parent / f"temp_{Path(source_path).name}"
                try:
                    file_path, total, kept, filtered_records = future.result()

                    # Write filtered reads next to the original, then replace it
                    SeqIO.write(filtered_records, temp_file, "fastq")
                    shutil.move(str(temp_file), file_path)

                    filtered_counts[file_path] = {
                        'total': total,
                        'kept': kept,
                        'filtered': total - kept
                    }

                    logging.info(f"Processed {file_path}: {kept}/{total} reads kept")

                except Exception as e:
                    logging.error(f"Error processing file {source_path}: {str(e)}")
                    # A leftover temp_*.fastq would match the "*.fastq" glob on
                    # a later run, so remove it (best effort).
                    try:
                        temp_file.unlink(missing_ok=True)
                    except OSError as cleanup_error:
                        logging.warning(f"Could not remove {temp_file}: {cleanup_error}")
                finally:
                    pbar.update(1)

    return filtered_counts
@@ -63,6 +63,9 @@ def build_cli_parser():
63
63
  optional_args_group.add_argument("--skip_variantcalling",
64
64
  action="store_true",
65
65
  help="Skip the variant calling step, default is false")
66
+ optional_args_group.add_argument("--oligopool",
67
+ action="store_true",
68
+ help="Whether this experiment came from an oligopool, default is false.")
66
69
  optional_args_group.add_argument("--show_msa",
67
70
  default=False,
68
71
  help="Skip showing msa")
@@ -17,7 +17,7 @@
17
17
 
18
18
  # Import MinION objects
19
19
  from levseq import *
20
-
20
+ from levseq.filter_orientation import filter_demultiplexed_folder
21
21
  # Import external packages
22
22
  import logging
23
23
  from pathlib import Path
@@ -221,13 +221,14 @@ def demux_fastq(file_to_fastq, result_folder, barcode_path):
221
221
  executable_path = package_root / "levseq" / "barcoding" / executable_name
222
222
  if not executable_path.exists():
223
223
  raise FileNotFoundError(f"Executable not found: {executable_path}")
224
- seq_min = 200
224
+ seq_min = 200
225
225
  seq_max = 10000
226
226
  prompt = f"{executable_path} -f {file_to_fastq} -d {result_folder} -b {barcode_path} -w 100 -r 100 -m {seq_min} -x {seq_max}"
227
227
  subprocess.run(prompt, shell=True, check=True)
228
228
 
229
229
  # Variant calling using VariantCaller class
230
- def call_variant(experiment_name, experiment_folder, template_fasta, filtered_barcodes):
230
+
231
+ def call_variant(experiment_name, experiment_folder, template_fasta, filtered_barcodes, threshold=0.5, oligopool=False):
231
232
  try:
232
233
  vc = VariantCaller(
233
234
  experiment_name,
@@ -236,8 +237,9 @@ def call_variant(experiment_name, experiment_folder, template_fasta, filtered_ba
236
237
  filtered_barcodes,
237
238
  padding_start=0,
238
239
  padding_end=0,
240
+ oligopool=oligopool
239
241
  )
240
- variant_df = vc.get_variant_df(threshold=0.5, min_depth=5)
242
+ variant_df = vc.get_variant_df(threshold=threshold, min_depth=5)
241
243
  logging.info("Variant calling to create consensus reads successful")
242
244
  return variant_df
243
245
  except Exception as e:
@@ -441,6 +443,63 @@ def save_csv(df, outputdir, name):
441
443
  file_path = os.path.join(outputdir, "Results", name + ".csv")
442
444
  df.to_csv(file_path)
443
445
 
446
+ # Function to process the reference CSV and generate variants
447
def process_ref_csv_oligopool(cl_args, tqdm_fn=tqdm.tqdm):
    """Process the reference CSV for an oligopool experiment.

    Runs in two phases: first demultiplexing once per unique barcode plate
    (unless ``cl_args["skip_demultiplexing"]`` is set), then variant calling
    once per row of the reference CSV (unless ``cl_args["skip_variantcalling"]``
    is set). The combined variants are written to ``variants.csv`` in the
    result folder.

    Args:
        cl_args: Mapping of command line arguments. This function reads the
            keys ``"summary"``, ``"skip_demultiplexing"``, ``"name"``,
            ``"path"`` and ``"skip_variantcalling"``; the helpers it is passed
            to may read more.
        tqdm_fn: Progress-bar factory wrapped around the per-row loop.

    Returns:
        tuple: ``(variant_df, ref_df)`` - the accumulated variant table and the
        reference table (with an added ``barcode_index`` column).
    """
    ref_df = pd.read_csv(cl_args["summary"])
    result_folder = create_result_folder(cl_args)
    variant_csv_path = os.path.join(result_folder, "variants.csv")
    variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])

    # First get the different barcode plates (these will be unique)
    barcode_plates = ref_df["barcode_plate"].unique()
    # Positional row number for every row of the reference CSV
    ref_df["barcode_index"] = [i for i in range(len(ref_df))]
    # NOTE(review): when a plate appears in several rows, the dict keeps the
    # index of the last such row - confirm that is the intended one.
    barcode_to_index = dict(zip(ref_df.barcode_plate, ref_df.barcode_index))
    for barcode_plate in barcode_plates:
        if not cl_args["skip_demultiplexing"]:
            i = barcode_to_index[barcode_plate]
            # One output folder per plate, named RB<barcode_plate>
            name_folder = os.path.join(result_folder, f'RB{barcode_plate}')
            os.makedirs(name_folder, exist_ok=True)
            # presumably returns the path of a barcode file filtered for row i - verify against filter_bc
            barcode_path = filter_bc(cl_args, name_folder, i)
            output_dir = Path(result_folder) / f"{cl_args['name']}_fastq"
            output_dir.mkdir(parents=True, exist_ok=True)

            # The return value is not used below; demux_fastq is given output_dir instead
            file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)
            try:
                demux_fastq(output_dir, name_folder, barcode_path)
            except Exception as e:
                # Log with traceback and move on to the next plate
                logging.error("An error occurred during demultiplexing for sample {}. Skipping this sample.".format(barcode_plate), exc_info=True)
                continue
    # Check this - need to see if the code works... ToDo: Ariane
    # Now they are all demultiplexed, we can call variants
    if not cl_args["skip_variantcalling"]:
        for i, row in tqdm_fn(ref_df.iterrows(), total=len(ref_df), desc="Processing Samples"):
            barcode_plate = row["barcode_plate"]
            name = row["name"]
            refseq = row["refseq"].upper()
            # Get the name folder and barcode path
            # Single-record fasta with this row's reference; an existing file is reused as-is
            temp_fasta_path = os.path.join(result_folder, f"temp_{name}.fasta")
            if not os.path.exists(temp_fasta_path):
                with open(temp_fasta_path, "w") as f:
                    f.write(f">{name}\n{refseq}\n")
            else:
                logging.info(f"Fasta file for {name} already exists. Skipping write.")
            try:
                # i is the DataFrame index label from iterrows()
                filtered_barcodes = filter_bc(cl_args, result_folder, i)
                variant_result = call_variant(f"{name}", result_folder, temp_fasta_path, filtered_barcodes,
                                              oligopool=True)
                # Tag the result rows with the reference row they came from
                variant_result["barcode_plate"] = barcode_plate
                variant_result["name"] = name
                variant_result["refseq"] = refseq
                variant_df = pd.concat([variant_df, variant_result])
            except Exception as e:
                # Log with traceback and continue with the next reference row
                logging.error("An error occurred during variant calling for sample {}. Skipping this sample.".format(name), exc_info=True)
                continue

    variant_df.to_csv(variant_csv_path, index=False)
    # visualize it as well
    return variant_df, ref_df
501
+
502
+
444
503
  # Function to process the reference CSV and generate variants
445
504
  def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
446
505
  ref_df = pd.read_csv(cl_args["summary"])
@@ -472,14 +531,30 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
472
531
  file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)
473
532
  try:
474
533
  demux_fastq(output_dir, name_folder, barcode_path)
534
+
535
+ # Add filtering step here with multithreading
536
+ filtered_counts = filter_demultiplexed_folder(
537
+ name_folder,
538
+ refseq,
539
+ num_threads=10
540
+ )
541
+ logging.info(f"Orientation filtering completed for {name}")
542
+ total_reads = sum(counts['total'] for counts in filtered_counts.values())
543
+ kept_reads = sum(counts['kept'] for counts in filtered_counts.values())
544
+ logging.info(f"Total filtering results: {kept_reads}/{total_reads} reads kept ({kept_reads/total_reads*100:.2f}%)")
545
+ for file, counts in filtered_counts.items():
546
+ logging.info(f"{file}: {counts['kept']}/{counts['total']} reads kept")
547
+
548
+
475
549
  except Exception as e:
476
- logging.error("An error occurred during demultiplexing for sample {}. Skipping this sample.".format(name), exc_info=True)
550
+ logging.error("An error occurred during demultiplexing/filtering for sample {}. Skipping this sample.".format(name), exc_info=True)
477
551
  continue
478
552
 
479
553
  if not cl_args["skip_variantcalling"]:
480
554
  try:
555
+ threshold = cl_args.get("threshold") if cl_args.get("threshold") is not None else 0.5
481
556
  variant_result = call_variant(
482
- f"{name}", name_folder, temp_fasta_path, barcode_path
557
+ f"{name}", name_folder, temp_fasta_path, barcode_path, threshold=threshold
483
558
  )
484
559
  variant_result["barcode_plate"] = barcode_plate
485
560
  variant_result["name"] = name
@@ -493,6 +568,7 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
493
568
  variant_df.to_csv(variant_csv_path, index=False)
494
569
  return variant_df, ref_df
495
570
 
571
+
496
572
  # Main function to run LevSeq and ensure saving of intermediate results if an error occurs
497
573
  def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
498
574
  result_folder = create_result_folder(cl_args)
@@ -504,9 +580,12 @@ def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
504
580
  logging.info("Logging configured. Starting program.")
505
581
 
506
582
  variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
507
-
583
+
508
584
  try:
509
- variant_df, ref_df = process_ref_csv(cl_args, tqdm_fn)
585
+ if cl_args["oligopool"]:
586
+ variant_df, ref_df = process_ref_csv_oligopool(cl_args, tqdm_fn)
587
+ else:
588
+ variant_df, ref_df = process_ref_csv(cl_args, tqdm_fn)
510
589
  ref_df_path = os.path.join(ref_folder, cl_args["name"]+".csv")
511
590
  ref_df.to_csv(ref_df_path, index=False)
512
591
 
@@ -529,6 +608,8 @@ def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
529
608
  df_variants, df_vis = create_df_v(variant_df)
530
609
  processed_csv = os.path.join(result_folder, "visualization_partial.csv")
531
610
  df_vis.to_csv(processed_csv, index=False)
611
+ if cl_args["oligopool"]:
612
+ make_oligopool_plates(df_vis, result_folder=result_folder, save_files=True)
532
613
  except Exception as e:
533
614
  processed_csv = os.path.join(result_folder, "visualization_partial.csv")
534
615
  if 'df_vis' in locals():
@@ -59,12 +59,13 @@ def translate(seq):
59
59
  'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
60
60
  'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*',
61
61
  'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W',
62
+ 'GTS': "X"
62
63
  }
63
64
  protein = ""
64
65
  if len(seq) % 3 == 0:
65
66
  for i in range(0, len(seq), 3):
66
67
  codon = seq[i:i + 3]
67
- protein += table[codon]
68
+ protein += table.get(codon, 'X')
68
69
  return protein
69
70
 
70
71
 
@@ -290,8 +291,7 @@ def get_reads_for_well(parent_name, bam_file_path: str, ref_str: str, msa_path=N
290
291
  insert_map = defaultdict(list)
291
292
  for read in bam.fetch(until_eof=True):
292
293
  # Ensure we have at least 75% coverage
293
- if read.query_sequence is not None and len(read.query_sequence) > 0.75 * len(
294
- ref_str) and read.cigartuples is not None:
294
+ if read.query_sequence is not None and read.cigartuples is not None: # and len(read.query_sequence) > 0.75 * len(ref_str) and read.cigartuples is not None:
295
295
  seq, ref, qual, ins = alignment_from_cigar(read.cigartuples, read.query_sequence, ref_str,
296
296
  read.query_qualities)
297
297
  # Make it totally align
@@ -313,16 +313,17 @@ def get_reads_for_well(parent_name, bam_file_path: str, ref_str: str, msa_path=N
313
313
  # Do this for all wells
314
314
  seq_df = make_well_df_from_reads(seqs, read_ids, read_quals)
315
315
  alignment_count = len(seq_df.values)
316
- rows_all = make_row_from_read_pileup_across_well(seq_df, ref_str, parent_name, insert_map)
317
- bam.close()
318
-
319
- if len(rows_all) > 2: # Check if we have anything to return
320
- seq_df = pd.DataFrame(rows_all)
321
- seq_df.columns = ['gene_name', 'position', 'ref', 'most_frequent', 'freq_non_ref', 'total_other',
322
- 'total_reads', 'p_value', 'percent_most_freq_mutation', 'A', 'p(a)', 'T', 'p(t)', 'G', 'p(g)',
323
- 'C', 'p(c)', 'N', 'p(n)', 'I', 'p(i)', 'Warnings']
324
- return calculate_mutation_significance_across_well(seq_df), alignment_count
325
-
316
+ if alignment_count > 0:
317
+ rows_all = make_row_from_read_pileup_across_well(seq_df, ref_str, parent_name, insert_map)
318
+ bam.close()
319
+
320
+ if len(rows_all) > 2: # Check if we have anything to return
321
+ seq_df = pd.DataFrame(rows_all)
322
+ seq_df.columns = ['gene_name', 'position', 'ref', 'most_frequent', 'freq_non_ref', 'total_other',
323
+ 'total_reads', 'p_value', 'percent_most_freq_mutation', 'A', 'p(a)', 'T', 'p(t)', 'G', 'p(g)',
324
+ 'C', 'p(c)', 'N', 'p(n)', 'I', 'p(i)', 'Warnings']
325
+ return calculate_mutation_significance_across_well(seq_df), alignment_count
326
+ return None, 0
326
327
  def make_row_from_read_pileup_across_well(well_df, ref_str, label, insert_map):
327
328
  """
328
329
  Given a pileup of reads, we want to get some summary information about that sequence
@@ -51,11 +51,13 @@ class VariantCaller:
51
51
 
52
52
  """
53
53
 
54
- def __init__(self, experiment_name, experiment_folder: Path, template_fasta: Path, barcode_path: Path, padding_start: int = 0, padding_end: int = 0) -> None:
54
+ def __init__(self, experiment_name, experiment_folder: Path, template_fasta: Path, barcode_path: Path,
55
+ padding_start: int = 0, padding_end: int = 0, oligopool=True) -> None:
55
56
  self.barcode_path = barcode_path
56
57
  self.experiment_name = experiment_name
57
58
  self.experiment_folder = experiment_folder
58
59
  self.padding_start = padding_start
60
+ self.oligopool = oligopool
59
61
  self.padding_end = padding_end
60
62
  self.template_fasta = template_fasta
61
63
  self.alignment_name = 'alignment_minimap'
@@ -90,9 +92,15 @@ class VariantCaller:
90
92
  renamed_ids.append(f'{plate}_{well}')
91
93
  plates.append(experiment_name)
92
94
  wells.append(well)
93
- self.variant_dict[f'{plate}_{well}'] = {'Plate': experiment_name, 'Well': well,
94
- 'Barcodes': f'{reverse_barcode}_{forward_barcode}',
95
- 'Path': os.path.join(self.experiment_folder, f'{reverse_barcode}/{forward_barcode}')}
95
+ if self.oligopool:
96
+ self.variant_dict[f'{plate}_{well}'] = {'Plate': experiment_name, 'Well': well,
97
+ 'Barcodes': f'{reverse_barcode}_{forward_barcode}',
98
+ 'Path': os.path.join(self.experiment_folder,
99
+ f'{reverse_barcode}/{reverse_barcode}/{forward_barcode}')}
100
+ else:
101
+ self.variant_dict[f'{plate}_{well}'] = {'Plate': experiment_name, 'Well': well,
102
+ 'Barcodes': f'{reverse_barcode}_{forward_barcode}',
103
+ 'Path': os.path.join(self.experiment_folder, f'{reverse_barcode}/{forward_barcode}')}
96
104
  df = pd.DataFrame()
97
105
  df['Plate'] = plates
98
106
  df['Well'] = wells
@@ -100,14 +108,6 @@ class VariantCaller:
100
108
  df['ID'] = renamed_ids
101
109
  return df
102
110
 
103
- @staticmethod
104
- def load_reference(reference_path):
105
- # The reference enables multiple parents to be used for different
106
- # WARNING: this assumes all the parents are the same
107
- ref_seq = str(SeqIO.read(template_fasta,'fasta').seq)
108
- barcode_to_plate_name = experiment_name
109
- return 'Parent', ref_seq, barcode_to_plate_name
110
-
111
111
  @staticmethod
112
112
  def _barcode_to_well(barcode):
113
113
  match = re.search(r'\d+', barcode)
@@ -124,28 +124,32 @@ class VariantCaller:
124
124
  try:
125
125
  all_fastq = os.path.join(output_dir, '*.fastq')
126
126
  fastq_list = glob.glob(all_fastq)
127
- fastq_files = os.path.join(output_dir, f"demultiplexed_{filename}.fastq")
127
+ fastq_files = all_fastq # os.path.join(output_dir, f"demultiplexed_{filename}.fastq")
128
128
 
129
- if not fastq_list:
129
+ if not all_fastq:
130
130
  logger.error("No FASTQ files found in the specified output directory.")
131
131
  return
132
132
 
133
- # Combining fastq files into one
134
- with open(fastq_files, 'w') as outfile:
135
- for fastq in fastq_list:
136
- with open(fastq, 'r') as infile:
137
- outfile.write(infile.read())
138
-
133
+ # Combining fastq files into one if there are more than 1
134
+ if len(fastq_list) > 1:
135
+ with open(fastq_files, 'w') as outfile:
136
+ for fastq in fastq_list:
137
+ with open(fastq, 'r') as infile:
138
+ outfile.write(infile.read())
139
+ else:
140
+ fastq_files = fastq_list[0]
139
141
  # Alignment using minimap2
140
142
  minimap_cmd = f"minimap2 -ax map-ont -A {scores[0]} -B {scores[1]} -O {scores[2]},24 '{self.template_fasta}' '{fastq_files}' > '{output_dir}/{alignment_name}.sam'"
141
143
  subprocess.run(minimap_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
142
-
144
+ print(minimap_cmd)
143
145
  # Convert SAM to BAM and sort
144
146
  view_cmd = f"samtools view -bS '{output_dir}/{alignment_name}.sam' > '{output_dir}/{alignment_name}.bam'"
145
147
  subprocess.run(view_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
148
+ print(view_cmd)
146
149
 
147
150
  sort_cmd = f"samtools sort '{output_dir}/{alignment_name}.bam' -o '{output_dir}/{alignment_name}.bam'"
148
151
  subprocess.run(sort_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
152
+ print(sort_cmd)
149
153
 
150
154
  # Index the BAM file
151
155
  index_cmd = f"samtools index '{output_dir}/{alignment_name}.bam'"
@@ -163,18 +167,22 @@ class VariantCaller:
163
167
  for barcode_id in pbar:
164
168
  try:
165
169
  row = self.variant_dict.get(barcode_id)
166
- bam_file = os.path.join(row["Path"], f'{self.alignment_name}.bam')
170
+ bam_file = os.path.join(row["Path"], f'{self.alignment_name}_{barcode_id}.bam')
167
171
 
168
172
  # Check if alignment file exists, if not, align sequences
169
173
  if not os.path.exists(bam_file):
170
- logger.info(f"Aligning sequences for {row['Path']}")
171
- self._align_sequences(row["Path"], row['Barcodes'])
174
+ logger.info(f"Aligning sequences for {row['Path']}")
175
+ self._align_sequences(row["Path"], row['Barcodes'],
176
+ alignment_name=f'{self.alignment_name}_{barcode_id}')
172
177
 
173
178
  # Placeholder function calls to demonstrate workflow
174
179
  well_df, alignment_count = get_reads_for_well(self.experiment_name, bam_file,
175
- self.ref_str, f'{row["Path"]}/msa.fa')
176
- self.variant_dict[barcode_id]['Alignment Count'] = alignment_count
180
+ self.ref_str, f'{row["Path"]}/{self.alignment_name}_{barcode_id}.fa')
177
181
  if well_df is not None:
182
+ if self.oligopool:
183
+ if len(well_df.values) < 10:
184
+ continue
185
+ self.variant_dict[barcode_id]['Alignment Count'] = alignment_count
178
186
  well_df.to_csv(f"{row['Path']}/seq_{barcode_id}.csv", index=False)
179
187
  label, freq, combined_p_value, mixed_well, avg_error_rate = get_variant_label_for_well(well_df, threshold)
180
188
  self.variant_dict[barcode_id]['Variant'] = label
@@ -187,7 +195,7 @@ class VariantCaller:
187
195
  finally:
188
196
  pbar.update(1)
189
197
 
190
- def get_variant_df(self, threshold: float = 0.5, min_depth: int = 5, output_dir='', num_threads=10):
198
+ def get_variant_df(self, threshold: float = 0.5, min_depth: int = 5, output_dir='', num_threads=20):
191
199
  """
192
200
  Get Variant Data Frame for all samples in the experiment
193
201
 
@@ -202,26 +210,34 @@ class VariantCaller:
202
210
  data = []
203
211
  num = int(len(self.variant_df) / num_threads)
204
212
  self.variant_df.reset_index(inplace=True)
205
- for i in range(0, len(self.variant_df), num):
206
- end_i = i + num if i + num < len(self.variant_df) else len(self.variant_df)
207
- sub_df = self.variant_df.iloc[i: end_i]['ID'].values
208
- sub_data = [sub_df, threshold, min_depth, output_dir]
209
- data.append(sub_data)
213
+ if num_threads > 1:
214
+ for i in range(0, len(self.variant_df), num):
215
+ end_i = i + num if i + num < len(self.variant_df) else len(self.variant_df)
216
+ sub_df = self.variant_df.iloc[i: end_i]['ID'].values
217
+ sub_data = [sub_df, threshold, min_depth, output_dir]
218
+ data.append(sub_data)
210
219
 
211
- # Thread it
212
- pool.map(self._run_variant_thread, data)
220
+ # Thread it
221
+ pool.map(self._run_variant_thread, data)
222
+ else:
223
+ self._run_variant_thread([self.variant_df, threshold, min_depth, output_dir])
213
224
 
214
225
  self.variant_df['Variant'] = [self.variant_dict[b_id].get('Variant') for b_id in self.variant_df['ID'].values]
215
- self.variant_df['Mixed Well'] = [self.variant_dict[b_id].get('Mixed Well') for b_id in self.variant_df['ID'].values]
216
- self.variant_df['Average mutation frequency'] = [self.variant_dict[b_id].get('Average mutation frequency') for b_id in self.variant_df['ID'].values]
226
+ self.variant_df['Mixed Well'] = [self.variant_dict[b_id].get('Mixed Well') for b_id in
227
+ self.variant_df['ID'].values]
228
+ self.variant_df['Average mutation frequency'] = [self.variant_dict[b_id].get('Average mutation frequency') for
229
+ b_id in self.variant_df['ID'].values]
217
230
  self.variant_df['P value'] = [self.variant_dict[b_id].get('P value') for b_id in self.variant_df['ID'].values]
218
- self.variant_df['Alignment Count'] = [self.variant_dict[b_id].get('Alignment Count') for b_id in self.variant_df['ID'].values]
219
- self.variant_df['Average error rate'] = [self.variant_dict[b_id].get('Average error rate') for b_id in self.variant_df['ID'].values]
220
-
231
+ self.variant_df['Alignment Count'] = [self.variant_dict[b_id].get('Alignment Count') for b_id in
232
+ self.variant_df['ID'].values]
233
+ self.variant_df['Average error rate'] = [self.variant_dict[b_id].get('Average error rate') for b_id in
234
+ self.variant_df['ID'].values]
221
235
  # Adjust p-values using bonferroni make it simple
222
- self.variant_df['P adj. value'] = len(self.variant_df) * self.variant_df["P value"].values
223
- self.variant_df['P adj. value'] = [1 if x > 1 else x for x in self.variant_df["P adj. value"].values]
224
-
236
+ self.variant_df['P adj. value'] = [len(self.variant_df) * p if p else None for p in self.variant_df["P value"].values]
237
+ self.variant_df['P adj. value'] = [1 if x and x > 1 else x for x in self.variant_df["P adj. value"].values]
238
+ if self.oligopool:
239
+ # Filter this so we don't get all the junk
240
+ self.variant_df = self.variant_df[self.variant_df['Alignment Count'] > 2]
225
241
  return self.variant_df
226
242
 
227
243
  def _get_alignment_count(self, sample_folder_path: Path):
@@ -63,6 +63,7 @@ from bokeh.events import Tap
63
63
  from bokeh.io import save, show, output_file, output_notebook
64
64
 
65
65
  import panel as pn
66
+ import seaborn as sns
66
67
 
67
68
  from levseq.utils import *
68
69
 
@@ -1147,3 +1148,54 @@ def plot_sequence_alignment(
1147
1148
  toolbar_location=None,
1148
1149
  sizing_mode=sizing_mode,
1149
1150
  )
1151
+
1152
+
1153
def make_oligopool_plates(vis_df, result_folder, save_files=False):
    """Draw one alignment-count heatmap per barcode plate of an oligopool run.

    Rows whose ``amino_acid_substitutions`` equals ``'#PARENT#'`` are
    relabelled with the value of their ``name`` column so the heatmap
    annotation shows which reference the well matched. This relabelling is
    written back into ``vis_df`` itself, so the caller sees the changed column.

    For every distinct ``barcode_plate`` the row with the highest
    ``Alignment Count`` per ``Well`` is kept, the wells are pivoted into a
    Row x Column grid and the grid is passed to ``plot_seaborn_heatmap``.

    Args:
        vis_df: DataFrame providing the columns ``amino_acid_substitutions``,
            ``name``, ``Alignment Count``, ``barcode_plate`` and ``Well``.
            Well IDs are parsed as one row character followed by an integer
            column number.
        result_folder: Directory handed to ``plot_seaborn_heatmap`` and used
            for the optional CSV.
        save_files: When true, also write ``best_aligned_parents.csv`` with,
            for each ``name``, the parent row that has the most alignments.
    """
    # Best-supported parent well per reference name.
    is_parent = vis_df['amino_acid_substitutions'] == '#PARENT#'
    top_well_df = (
        vis_df[is_parent]
        .sort_values(by='Alignment Count', ascending=False)
        .drop_duplicates('name', keep='first')
    )
    if save_files:
        top_well_df.to_csv(os.path.join(result_folder, 'best_aligned_parents.csv'), index=False)

    # Show the reference name instead of the '#PARENT#' placeholder.
    relabelled = []
    for ref_name, substitution in vis_df[['name', 'amino_acid_substitutions']].values:
        relabelled.append(ref_name if substitution == '#PARENT#' else substitution)
    vis_df['amino_acid_substitutions'] = relabelled

    for plate in set(vis_df['barcode_plate'].values):
        # Sorting by alignment count first means drop_duplicates keeps the
        # dominant variant of each well.
        plate_df = (
            vis_df[vis_df['barcode_plate'] == plate]
            .sort_values(by='Alignment Count', ascending=False)
            .drop_duplicates('Well')
        )
        # Split the well ID into its plate coordinates.
        plate_df['Column'] = [int(well[1:]) for well in plate_df['Well'].values]
        plate_df['Row'] = [well[0] for well in plate_df['Well'].values]
        plate_df = plate_df.sort_values(by=['Column', 'Row'], ascending=[False, True])

        # Wide-form grids: one for the colour values, one for the cell text.
        platemap = plate_df.pivot(index="Row", columns="Column", values="Alignment Count")
        platemap_labels = plate_df.pivot(index="Row", columns="Column", values="amino_acid_substitutions")
        plot_seaborn_heatmap(platemap, platemap_labels, f'{plate}', result_folder)
1186
+
1187
def plot_seaborn_heatmap(platemap, platemap_labels, label: str, result_folder):
    """Render one plate as an annotated heatmap and save it as an SVG.

    Args:
        platemap: DataFrame of the values used for the cell colours; its index
            and columns become the tick labels. Missing cells are drawn as 0.
        platemap_labels: DataFrame of the same shape as ``platemap`` holding
            the text printed in each cell. Missing cells are left blank.
        label: Suffix of the output file name.
        result_folder: Directory in which ``platemap_<label>.svg`` is written.
    """
    platemap = platemap.fillna(0)
    # fmt='' formats every annotation verbatim, so a missing label would be
    # printed as the text "nan" unless it is blanked out first.
    annotations = platemap_labels.fillna('').values
    row_labels = [str(s) for s in platemap.index]
    col_labels = [str(s) for s in platemap.columns]

    sns.set_theme()
    plt.rcParams['svg.fonttype'] = 'none'  # Ensure text is saved as text
    fig, ax = plt.subplots(figsize=(16, 8))
    try:
        # Draw on the axes created above rather than on whatever happens to be
        # the current axes.
        sns.heatmap(platemap.values, cmap='Reds', annot=annotations, xticklabels=col_labels,
                    yticklabels=row_labels, fmt='', linewidths=.1, ax=ax)
        plt.setp(ax.get_yticklabels(), rotation=0, ha="center")
        fig.savefig(os.path.join(result_folder, f'platemap_{label}.svg'))
    finally:
        # This function runs once per plate; pyplot keeps every figure alive
        # until it is closed explicitly.
        plt.close(fig)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: levseq
3
- Version: 1.3.2
3
+ Version: 1.4.0
4
4
  Home-page: https://github.com/fhalab/levseq/
5
5
  Author: Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy
6
6
  Author-email: ylong@caltech.edu
@@ -52,9 +52,13 @@ In directed evolution, sequencing every variant enhances data insight and create
52
52
  ![Figure 1: LevSeq Workflow](manuscript/figures/LevSeq_Figure-1.jpeg)
53
53
  Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
54
54
 
55
+ ## Website
56
+ A beta website is available [here](https://levseqdb.streamlit.app/): load your LevSeq output and your LCMS results directly to get visualisations and per-plate normalizations.
57
+
58
+ ## Data
55
59
 
56
60
  - Data to reproduce the results and to test are available on zenodo [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13694463.svg)](https://doi.org/10.5281/zenodo.13694463)
57
- - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://levseqdb.streamlit.app/) and code to host locally [here](https://github.com/fhalab/LevSeq_db)
61
+ - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://levseqdb.streamlit.app/) and code to host locally [here](https://github.com/fhalab/LevSeq_db)
58
62
 
59
63
  ## Setup
60
64
 
@@ -87,6 +91,10 @@ conda create --name levseq python=3.12 -y
87
91
  conda activate levseq
88
92
  ```
89
93
 
94
+ ```
95
+ pip install levseq
96
+ ```
97
+
90
98
  #### Dependencies
91
99
 
92
100
  1. Samtools: https://www.htslib.org/download/
@@ -100,6 +108,11 @@ conda install -c bioconda -c conda-forge samtools
100
108
  ```
101
109
  conda install -c bioconda -c conda-forge minimap2
102
110
  ```
111
+ 3. gcc 13 and 14 on Mac M1 through M4 chips
112
+ ```
113
+ brew install gcc@13
114
+ brew install gcc@14
115
+ ```
103
116
  ### Docker Installation (Recommended for full pipeline)
104
117
  For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
105
118
  operating system (https://docs.docker.com/engine/install/).
@@ -7,6 +7,7 @@ levseq/__init__.py
7
7
  levseq/basecaller.py
8
8
  levseq/cmd.py
9
9
  levseq/coordinates.py
10
+ levseq/filter_orientation.py
10
11
  levseq/globals.py
11
12
  levseq/interface.py
12
13
  levseq/parser.py
@@ -21,9 +21,11 @@ import unittest
21
21
  import matplotlib.pyplot as plt
22
22
  from levseq import *
23
23
  from levseq.run_levseq import process_ref_csv
24
+
24
25
  u = SciUtil()
25
26
  import math
26
27
 
28
+
27
29
  class TestClass(unittest.TestCase):
28
30
 
29
31
  @classmethod
@@ -45,47 +47,54 @@ class TestClass(unittest.TestCase):
45
47
  def teardown_class(self):
46
48
  shutil.rmtree(self.tmp_dir)
47
49
 
50
+
48
51
  class TestDeploy(TestClass):
49
-
52
+
50
53
  def test_deploy(self):
51
54
  cmd_list = [
52
55
  'docker', # Needs to be installed as vina.
53
56
  'run',
54
57
  '--rm',
55
58
  '-v',
56
- f'{os.getcwd()}:/levseq_results',
59
+ f'{os.getcwd()}/test_data/laragen_run:/levseq_results',
57
60
  'levseq',
58
61
  'test_deploy',
59
- 'test_data/laragen_run/levseq-1.2.7/',
60
- 'test_data/laragen_run/20241116-LevSeq-Review-Validation-levseq_ref.csv'
62
+ 'levseq_results/levseq-1.2.7/',
63
+ 'levseq_results/20241116-LevSeq-Review-Validation-levseq_ref.csv'
61
64
  ]
65
+ print(' '.join(cmd_list))
62
66
  # ToDo: add in scoring function for ad4
63
- cmd_return = subprocess.run(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
64
- print(cmd_return.stdout, cmd_return)
65
-
67
+
68
+ # cmd_return = subprocess.run(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
69
+ # print(cmd_return.stdout, cmd_return)
70
+
66
71
  def test_variant_calling(self):
67
72
  # Take as input the demultiplexed fastq files and the reference csv file
68
- cl_args = {'skip_demultiplexing': True, 'skip_variantcalling': False}
73
+ cl_args = {'skip_demultiplexing': True, 'skip_variantcalling': False, 'threshold': 0.5}
69
74
  cl_args["name"] = 'test_deploy'
70
75
  cl_args['path'] = 'test_data/laragen_run/levseq-1.2.7/'
71
76
  cl_args["summary"] = 'test_data/laragen_run/20241116-LevSeq-Review-Validation-levseq_ref.csv'
72
77
  variant_df, ref_df = process_ref_csv(cl_args)
78
+ variant_df.to_csv('laragen_test_run.csv')
73
79
  # Now we want to check all the variants are the same as in the original case:
74
80
  checked_variants_df = pd.read_csv('test_data/laragen_run/levseq-1.2.7/variants_gold_standard.csv')
75
81
  checked_variants = checked_variants_df['Variant'].values
76
- checked_sig = checked_variants_df['P adj. value'].values
82
+ checked_sig = checked_variants_df['Average mutation frequency'].values
83
+ checked_alignments = checked_variants_df['Alignment Count'].values
84
+
77
85
  i = 0
78
- for variant, pval in variant_df[['Variant', 'P adj. value']].values:
86
+ for variant, freq, alignment_count, pval in variant_df[['Variant', 'Average mutation frequency',
87
+ 'Alignment Count', 'P adj. value']].values:
79
88
  print(variant, checked_variants[i])
80
89
  if checked_variants[i]:
81
90
  if variant:
82
91
  assert variant == checked_variants[i]
83
- # if pval < 0.05:
84
- # assert checked_sig[i] < 0.05
85
- # elif math.isnan(pval):
86
- # assert math.isnan(checked_sig[i])
87
- # else:
88
- # assert checked_sig[i] >= 0.05
89
- print(pval, checked_sig[i])
92
+ assert alignment_count == checked_alignments[i]
93
+ if freq != checked_sig[i]:
94
+ print(freq, checked_sig[i])
90
95
  i += 1
91
96
 
97
+
98
+ # docker run --rm -v /Users/arianemora/Documents/code/LevSeq/data/degradeo/20250121-JR-IM-HS:/levseq_results levseq 20250121-JR-IM-HS_oligopool levseq_results/ levseq_results/ref_seq_oligopools_single.csv --skip_variantcalling
99
+ # levseq oligpool_20250121-JR-IM-HS /Users/arianemora/Documents/code/LevSeq/data/degradeo/20250121-JR-IM-HS/ /Users/arianemora/Documents/code/LevSeq/data/degradeo/20250121-JR-IM-HS/ref_seq_oligopools_all.csv --skip_variantcalling
100
+ # levseq results results/ /Users/arianemora/Documents/code/LevSeq/data/degradeo/20250121-JR-IM-HS/ref_seq_oligopools_all.csv --skip_demultiplexing --oligopool
@@ -58,36 +58,10 @@ class TestClass(unittest.TestCase):
58
58
  def teardown_class(self):
59
59
  shutil.rmtree(self.tmp_dir)
60
60
 
61
- def test_making_pools(self):
62
- u.dp(["Testing SSM"])
63
- cl_args = {'skip_demultiplexing': True, 'skip_variantcalling': False}
64
- cl_args["name"] = 'oligopools'
65
- cl_args['path'] = '/Users/arianemora/Documents/projects/LevSeq/oligopools/'
66
- cl_args["summary"] = '/Users/arianemora/Documents/projects/LevSeq/oligopools/oligopool_seqs.csv'
67
- variant_df = process_ref_csv(cl_args)
68
-
69
- # Check if variants.csv already exist
70
- variant_csv_path = os.path.join('oligopools', "variants.csv")
71
- if os.path.exists(variant_csv_path):
72
- variant_df = pd.read_csv(variant_csv_path)
73
- df_variants, df_vis = create_df_v(variant_df)
74
- # Clean up and prepare dataframe for visualization
75
- else:
76
- df_variants, df_vis = create_df_v(variant_df)
77
-
78
- def test_pools(self):
79
- u.dp(["Testing SSM"])
80
- cl_args = {'skip_demultiplexing': True, 'skip_variantcalling': False}
81
- cl_args["name"] = 'oligopools'
82
- cl_args['path'] = '/Users/arianemora/Documents/projects/LevSeq/oligopools/'
83
- cl_args["summary"] = '/Users/arianemora/Documents/projects/LevSeq/oligopools/oligopool_seqs.csv'
84
- variant_df = process_ref_csv(cl_args)
85
-
86
- # Check if variants.csv already exist
87
- variant_csv_path = os.path.join('oligopools', "variants.csv")
88
- if os.path.exists(variant_csv_path):
89
- variant_df = pd.read_csv(variant_csv_path)
90
- df_variants, df_vis = create_df_v(variant_df)
91
- # Clean up and prepare dataframe for visualization
92
- else:
93
- df_variants, df_vis = create_df_v(variant_df)
61
def test_demultipluxing_pools(self):
    """Run LevSeq on the oligopool test folder with demultiplexing and variant calling enabled."""
    # Same keys, values and insertion order as building the dict step by step.
    cl_args = {
        'skip_demultiplexing': False,
        'skip_variantcalling': False,
        'threshold': 0.5,
        'oligopool': True,
        'show_msa': False,
        "name": 'oligotest_21032025',
        'path': 'test_oligopool_2103/',
        "summary": 'test_oligopool_2103/test_oligopool_2103.csv',
    }
    run_LevSeq(cl_args)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes