levseq 1.3.3__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. {levseq-1.3.3/levseq.egg-info → levseq-1.4.0}/PKG-INFO +11 -2
  2. {levseq-1.3.3 → levseq-1.4.0}/README.md +10 -1
  3. {levseq-1.3.3 → levseq-1.4.0}/levseq/__init__.py +1 -1
  4. {levseq-1.3.3 → levseq-1.4.0}/levseq/interface.py +3 -0
  5. {levseq-1.3.3 → levseq-1.4.0}/levseq/run_levseq.py +72 -6
  6. {levseq-1.3.3 → levseq-1.4.0}/levseq/utils.py +14 -13
  7. {levseq-1.3.3 → levseq-1.4.0}/levseq/variantcaller.py +58 -42
  8. {levseq-1.3.3 → levseq-1.4.0}/levseq/visualization.py +52 -0
  9. {levseq-1.3.3 → levseq-1.4.0/levseq.egg-info}/PKG-INFO +11 -2
  10. {levseq-1.3.3 → levseq-1.4.0}/tests/test_deploy.py +26 -17
  11. {levseq-1.3.3 → levseq-1.4.0}/tests/test_opligopools.py +7 -33
  12. {levseq-1.3.3 → levseq-1.4.0}/LICENSE +0 -0
  13. {levseq-1.3.3 → levseq-1.4.0}/MANIFEST.in +0 -0
  14. {levseq-1.3.3 → levseq-1.4.0}/levseq/IO_processor.py +0 -0
  15. {levseq-1.3.3 → levseq-1.4.0}/levseq/barcoding/__init__.py +0 -0
  16. {levseq-1.3.3 → levseq-1.4.0}/levseq/barcoding/demultiplex +0 -0
  17. {levseq-1.3.3 → levseq-1.4.0}/levseq/barcoding/demultiplex-arm64 +0 -0
  18. {levseq-1.3.3 → levseq-1.4.0}/levseq/barcoding/demultiplex-x86 +0 -0
  19. {levseq-1.3.3 → levseq-1.4.0}/levseq/barcoding/minion_barcodes.fasta +0 -0
  20. {levseq-1.3.3 → levseq-1.4.0}/levseq/basecaller.py +0 -0
  21. {levseq-1.3.3 → levseq-1.4.0}/levseq/cmd.py +0 -0
  22. {levseq-1.3.3 → levseq-1.4.0}/levseq/coordinates.py +0 -0
  23. {levseq-1.3.3 → levseq-1.4.0}/levseq/filter_orientation.py +0 -0
  24. {levseq-1.3.3 → levseq-1.4.0}/levseq/globals.py +0 -0
  25. {levseq-1.3.3 → levseq-1.4.0}/levseq/parser.py +0 -0
  26. {levseq-1.3.3 → levseq-1.4.0}/levseq/screen.py +0 -0
  27. {levseq-1.3.3 → levseq-1.4.0}/levseq/seqfit.py +0 -0
  28. {levseq-1.3.3 → levseq-1.4.0}/levseq/simulation.py +0 -0
  29. {levseq-1.3.3 → levseq-1.4.0}/levseq/user.py +0 -0
  30. {levseq-1.3.3 → levseq-1.4.0}/levseq.egg-info/SOURCES.txt +0 -0
  31. {levseq-1.3.3 → levseq-1.4.0}/levseq.egg-info/dependency_links.txt +0 -0
  32. {levseq-1.3.3 → levseq-1.4.0}/levseq.egg-info/entry_points.txt +0 -0
  33. {levseq-1.3.3 → levseq-1.4.0}/levseq.egg-info/requires.txt +0 -0
  34. {levseq-1.3.3 → levseq-1.4.0}/levseq.egg-info/top_level.txt +0 -0
  35. {levseq-1.3.3 → levseq-1.4.0}/setup.cfg +0 -0
  36. {levseq-1.3.3 → levseq-1.4.0}/setup.py +0 -0
  37. {levseq-1.3.3 → levseq-1.4.0}/tests/test_demultiplex_docker.py +0 -0
  38. {levseq-1.3.3 → levseq-1.4.0}/tests/test_seqfitvis.py +0 -0
  39. {levseq-1.3.3 → levseq-1.4.0}/tests/test_seqs.py +0 -0
  40. {levseq-1.3.3 → levseq-1.4.0}/tests/test_statistics.py +0 -0
  41. {levseq-1.3.3 → levseq-1.4.0}/tests/test_variant_calling.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: levseq
3
- Version: 1.3.3
3
+ Version: 1.4.0
4
4
  Home-page: https://github.com/fhalab/levseq/
5
5
  Author: Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy
6
6
  Author-email: ylong@caltech.edu
@@ -52,9 +52,13 @@ In directed evolution, sequencing every variant enhances data insight and create
52
52
  ![Figure 1: LevSeq Workflow](manuscript/figures/LevSeq_Figure-1.jpeg)
53
53
  Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
54
54
 
55
+ ## Website
56
+ A beta website is available [here](https://levseqdb.streamlit.app/): load your LevSeq output and your LCMS results directly to get visualizations and per-plate normalizations.
57
+
58
+ ## Data
55
59
 
56
60
  - Data to reproduce the results and to test are available on zenodo [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13694463.svg)](https://doi.org/10.5281/zenodo.13694463)
57
- - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://levseqdb.streamlit.app/) and code to host locally [here](https://github.com/fhalab/LevSeq_db)
61
+ - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://levseqdb.streamlit.app/) and code to host locally [here](https://github.com/fhalab/LevSeq_db)
58
62
 
59
63
  ## Setup
60
64
 
@@ -104,6 +108,11 @@ conda install -c bioconda -c conda-forge samtools
104
108
  ```
105
109
  conda install -c bioconda -c conda-forge minimap2
106
110
  ```
111
+ 3. gcc 13 and 14 on Mac M1 through M4 chips
112
+ ```
113
+ brew install gcc@13
114
+ brew install gcc@14
115
+ ```
107
116
  ### Docker Installation (Recommended for full pipeline)
108
117
  For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
109
118
  operating system (https://docs.docker.com/engine/install/).
@@ -5,9 +5,13 @@ In directed evolution, sequencing every variant enhances data insight and create
5
5
  ![Figure 1: LevSeq Workflow](manuscript/figures/LevSeq_Figure-1.jpeg)
6
6
  Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
7
7
 
8
+ ## Website
9
+ A beta website is available [here](https://levseqdb.streamlit.app/): load your LevSeq output and your LCMS results directly to get visualizations and per-plate normalizations.
10
+
11
+ ## Data
8
12
 
9
13
  - Data to reproduce the results and to test are available on zenodo [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13694463.svg)](https://doi.org/10.5281/zenodo.13694463)
10
- - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://levseqdb.streamlit.app/) and code to host locally [here](https://github.com/fhalab/LevSeq_db)
14
+ - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://levseqdb.streamlit.app/) and code to host locally [here](https://github.com/fhalab/LevSeq_db)
11
15
 
12
16
  ## Setup
13
17
 
@@ -57,6 +61,11 @@ conda install -c bioconda -c conda-forge samtools
57
61
  ```
58
62
  conda install -c bioconda -c conda-forge minimap2
59
63
  ```
64
+ 3. gcc 13 and 14 on Mac M1 through M4 chips
65
+ ```
66
+ brew install gcc@13
67
+ brew install gcc@14
68
+ ```
60
69
  ### Docker Installation (Recommended for full pipeline)
61
70
  For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
62
71
  operating system (https://docs.docker.com/engine/install/).
@@ -18,7 +18,7 @@
18
18
  __title__ = 'levseq'
19
19
  __description__ = 'LevSeq nanopore sequencing'
20
20
  __url__ = 'https://github.com/fhalab/levseq/'
21
- __version__ = '1.3.3'
21
+ __version__ = '1.4.0'
22
22
  __author__ = 'Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy'
23
23
  __author_email__ = 'ylong@caltech.edu'
24
24
  __license__ = 'GPL3'
@@ -63,6 +63,9 @@ def build_cli_parser():
63
63
  optional_args_group.add_argument("--skip_variantcalling",
64
64
  action="store_true",
65
65
  help="Skip the variant calling step, default is false")
66
+ optional_args_group.add_argument("--oligopool",
67
+ action="store_true",
68
+ help="Whether this experiment came from an oligopool, default is false.")
66
69
  optional_args_group.add_argument("--show_msa",
67
70
  default=False,
68
71
  help="Skip showing msa")
@@ -221,13 +221,14 @@ def demux_fastq(file_to_fastq, result_folder, barcode_path):
221
221
  executable_path = package_root / "levseq" / "barcoding" / executable_name
222
222
  if not executable_path.exists():
223
223
  raise FileNotFoundError(f"Executable not found: {executable_path}")
224
- seq_min = 200
224
+ seq_min = 200
225
225
  seq_max = 10000
226
226
  prompt = f"{executable_path} -f {file_to_fastq} -d {result_folder} -b {barcode_path} -w 100 -r 100 -m {seq_min} -x {seq_max}"
227
227
  subprocess.run(prompt, shell=True, check=True)
228
228
 
229
229
  # Variant calling using VariantCaller class
230
- def call_variant(experiment_name, experiment_folder, template_fasta, filtered_barcodes):
230
+
231
+ def call_variant(experiment_name, experiment_folder, template_fasta, filtered_barcodes, threshold=0.5, oligopool=False):
231
232
  try:
232
233
  vc = VariantCaller(
233
234
  experiment_name,
@@ -236,8 +237,9 @@ def call_variant(experiment_name, experiment_folder, template_fasta, filtered_ba
236
237
  filtered_barcodes,
237
238
  padding_start=0,
238
239
  padding_end=0,
240
+ oligopool=oligopool
239
241
  )
240
- variant_df = vc.get_variant_df(threshold=0.5, min_depth=5)
242
+ variant_df = vc.get_variant_df(threshold=threshold, min_depth=5)
241
243
  logging.info("Variant calling to create consensus reads successful")
242
244
  return variant_df
243
245
  except Exception as e:
@@ -441,6 +443,63 @@ def save_csv(df, outputdir, name):
441
443
  file_path = os.path.join(outputdir, "Results", name + ".csv")
442
444
  df.to_csv(file_path)
443
445
 
446
+ # Function to process the reference CSV and generate variants
447
+ def process_ref_csv_oligopool(cl_args, tqdm_fn=tqdm.tqdm):
448
+ ref_df = pd.read_csv(cl_args["summary"])
449
+ result_folder = create_result_folder(cl_args)
450
+ variant_csv_path = os.path.join(result_folder, "variants.csv")
451
+ variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
452
+
453
+ # First get the different barcode plates (these will be unique)
454
+ barcode_plates = ref_df["barcode_plate"].unique()
455
+ ref_df["barcode_index"] = [i for i in range(len(ref_df))]
456
+ barcode_to_index = dict(zip(ref_df.barcode_plate, ref_df.barcode_index))
457
+ for barcode_plate in barcode_plates:
458
+ if not cl_args["skip_demultiplexing"]:
459
+ i = barcode_to_index[barcode_plate]
460
+ name_folder = os.path.join(result_folder, f'RB{barcode_plate}')
461
+ os.makedirs(name_folder, exist_ok=True)
462
+ barcode_path = filter_bc(cl_args, name_folder, i)
463
+ output_dir = Path(result_folder) / f"{cl_args['name']}_fastq"
464
+ output_dir.mkdir(parents=True, exist_ok=True)
465
+
466
+ file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)
467
+ try:
468
+ demux_fastq(output_dir, name_folder, barcode_path)
469
+ except Exception as e:
470
+ logging.error("An error occurred during demultiplexing for sample {}. Skipping this sample.".format(barcode_plate), exc_info=True)
471
+ continue
472
+ # Check this - need to see if the code works... ToDo: Ariane
473
+ # Now they are all demultiplexed, we can call variants
474
+ if not cl_args["skip_variantcalling"]:
475
+ for i, row in tqdm_fn(ref_df.iterrows(), total=len(ref_df), desc="Processing Samples"):
476
+ barcode_plate = row["barcode_plate"]
477
+ name = row["name"]
478
+ refseq = row["refseq"].upper()
479
+ # Get the name folder and barcode path
480
+ temp_fasta_path = os.path.join(result_folder, f"temp_{name}.fasta")
481
+ if not os.path.exists(temp_fasta_path):
482
+ with open(temp_fasta_path, "w") as f:
483
+ f.write(f">{name}\n{refseq}\n")
484
+ else:
485
+ logging.info(f"Fasta file for {name} already exists. Skipping write.")
486
+ try:
487
+ filtered_barcodes = filter_bc(cl_args, result_folder, i)
488
+ variant_result = call_variant(f"{name}", result_folder, temp_fasta_path, filtered_barcodes,
489
+ oligopool=True)
490
+ variant_result["barcode_plate"] = barcode_plate
491
+ variant_result["name"] = name
492
+ variant_result["refseq"] = refseq
493
+ variant_df = pd.concat([variant_df, variant_result])
494
+ except Exception as e:
495
+ logging.error("An error occurred during variant calling for sample {}. Skipping this sample.".format(name), exc_info=True)
496
+ continue
497
+
498
+ variant_df.to_csv(variant_csv_path, index=False)
499
+ # visualize it as well
500
+ return variant_df, ref_df
501
+
502
+
444
503
  # Function to process the reference CSV and generate variants
445
504
  def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
446
505
  ref_df = pd.read_csv(cl_args["summary"])
@@ -493,8 +552,9 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
493
552
 
494
553
  if not cl_args["skip_variantcalling"]:
495
554
  try:
555
+ threshold = cl_args.get("threshold") if cl_args.get("threshold") is not None else 0.5
496
556
  variant_result = call_variant(
497
- f"{name}", name_folder, temp_fasta_path, barcode_path
557
+ f"{name}", name_folder, temp_fasta_path, barcode_path, threshold=threshold
498
558
  )
499
559
  variant_result["barcode_plate"] = barcode_plate
500
560
  variant_result["name"] = name
@@ -508,6 +568,7 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
508
568
  variant_df.to_csv(variant_csv_path, index=False)
509
569
  return variant_df, ref_df
510
570
 
571
+
511
572
  # Main function to run LevSeq and ensure saving of intermediate results if an error occurs
512
573
  def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
513
574
  result_folder = create_result_folder(cl_args)
@@ -519,9 +580,12 @@ def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
519
580
  logging.info("Logging configured. Starting program.")
520
581
 
521
582
  variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
522
-
583
+
523
584
  try:
524
- variant_df, ref_df = process_ref_csv(cl_args, tqdm_fn)
585
+ if cl_args["oligopool"]:
586
+ variant_df, ref_df = process_ref_csv_oligopool(cl_args, tqdm_fn)
587
+ else:
588
+ variant_df, ref_df = process_ref_csv(cl_args, tqdm_fn)
525
589
  ref_df_path = os.path.join(ref_folder, cl_args["name"]+".csv")
526
590
  ref_df.to_csv(ref_df_path, index=False)
527
591
 
@@ -544,6 +608,8 @@ def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
544
608
  df_variants, df_vis = create_df_v(variant_df)
545
609
  processed_csv = os.path.join(result_folder, "visualization_partial.csv")
546
610
  df_vis.to_csv(processed_csv, index=False)
611
+ if cl_args["oligopool"]:
612
+ make_oligopool_plates(df_vis, result_folder=result_folder, save_files=True)
547
613
  except Exception as e:
548
614
  processed_csv = os.path.join(result_folder, "visualization_partial.csv")
549
615
  if 'df_vis' in locals():
@@ -59,12 +59,13 @@ def translate(seq):
59
59
  'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
60
60
  'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*',
61
61
  'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W',
62
+ 'GTS': "X"
62
63
  }
63
64
  protein = ""
64
65
  if len(seq) % 3 == 0:
65
66
  for i in range(0, len(seq), 3):
66
67
  codon = seq[i:i + 3]
67
- protein += table[codon]
68
+ protein += table.get(codon, 'X')
68
69
  return protein
69
70
 
70
71
 
@@ -290,8 +291,7 @@ def get_reads_for_well(parent_name, bam_file_path: str, ref_str: str, msa_path=N
290
291
  insert_map = defaultdict(list)
291
292
  for read in bam.fetch(until_eof=True):
292
293
  # Ensure we have at least 75% coverage
293
- if read.query_sequence is not None and len(read.query_sequence) > 0.75 * len(
294
- ref_str) and read.cigartuples is not None:
294
+ if read.query_sequence is not None and read.cigartuples is not None: # and len(read.query_sequence) > 0.75 * len(ref_str) and read.cigartuples is not None:
295
295
  seq, ref, qual, ins = alignment_from_cigar(read.cigartuples, read.query_sequence, ref_str,
296
296
  read.query_qualities)
297
297
  # Make it totally align
@@ -313,16 +313,17 @@ def get_reads_for_well(parent_name, bam_file_path: str, ref_str: str, msa_path=N
313
313
  # Do this for all wells
314
314
  seq_df = make_well_df_from_reads(seqs, read_ids, read_quals)
315
315
  alignment_count = len(seq_df.values)
316
- rows_all = make_row_from_read_pileup_across_well(seq_df, ref_str, parent_name, insert_map)
317
- bam.close()
318
-
319
- if len(rows_all) > 2: # Check if we have anything to return
320
- seq_df = pd.DataFrame(rows_all)
321
- seq_df.columns = ['gene_name', 'position', 'ref', 'most_frequent', 'freq_non_ref', 'total_other',
322
- 'total_reads', 'p_value', 'percent_most_freq_mutation', 'A', 'p(a)', 'T', 'p(t)', 'G', 'p(g)',
323
- 'C', 'p(c)', 'N', 'p(n)', 'I', 'p(i)', 'Warnings']
324
- return calculate_mutation_significance_across_well(seq_df), alignment_count
325
-
316
+ if alignment_count > 0:
317
+ rows_all = make_row_from_read_pileup_across_well(seq_df, ref_str, parent_name, insert_map)
318
+ bam.close()
319
+
320
+ if len(rows_all) > 2: # Check if we have anything to return
321
+ seq_df = pd.DataFrame(rows_all)
322
+ seq_df.columns = ['gene_name', 'position', 'ref', 'most_frequent', 'freq_non_ref', 'total_other',
323
+ 'total_reads', 'p_value', 'percent_most_freq_mutation', 'A', 'p(a)', 'T', 'p(t)', 'G', 'p(g)',
324
+ 'C', 'p(c)', 'N', 'p(n)', 'I', 'p(i)', 'Warnings']
325
+ return calculate_mutation_significance_across_well(seq_df), alignment_count
326
+ return None, 0
326
327
  def make_row_from_read_pileup_across_well(well_df, ref_str, label, insert_map):
327
328
  """
328
329
  Given a pileup of reads, we want to get some summary information about that sequence
@@ -51,11 +51,13 @@ class VariantCaller:
51
51
 
52
52
  """
53
53
 
54
- def __init__(self, experiment_name, experiment_folder: Path, template_fasta: Path, barcode_path: Path, padding_start: int = 0, padding_end: int = 0) -> None:
54
+ def __init__(self, experiment_name, experiment_folder: Path, template_fasta: Path, barcode_path: Path,
55
+ padding_start: int = 0, padding_end: int = 0, oligopool=True) -> None:
55
56
  self.barcode_path = barcode_path
56
57
  self.experiment_name = experiment_name
57
58
  self.experiment_folder = experiment_folder
58
59
  self.padding_start = padding_start
60
+ self.oligopool = oligopool
59
61
  self.padding_end = padding_end
60
62
  self.template_fasta = template_fasta
61
63
  self.alignment_name = 'alignment_minimap'
@@ -90,9 +92,15 @@ class VariantCaller:
90
92
  renamed_ids.append(f'{plate}_{well}')
91
93
  plates.append(experiment_name)
92
94
  wells.append(well)
93
- self.variant_dict[f'{plate}_{well}'] = {'Plate': experiment_name, 'Well': well,
94
- 'Barcodes': f'{reverse_barcode}_{forward_barcode}',
95
- 'Path': os.path.join(self.experiment_folder, f'{reverse_barcode}/{forward_barcode}')}
95
+ if self.oligopool:
96
+ self.variant_dict[f'{plate}_{well}'] = {'Plate': experiment_name, 'Well': well,
97
+ 'Barcodes': f'{reverse_barcode}_{forward_barcode}',
98
+ 'Path': os.path.join(self.experiment_folder,
99
+ f'{reverse_barcode}/{reverse_barcode}/{forward_barcode}')}
100
+ else:
101
+ self.variant_dict[f'{plate}_{well}'] = {'Plate': experiment_name, 'Well': well,
102
+ 'Barcodes': f'{reverse_barcode}_{forward_barcode}',
103
+ 'Path': os.path.join(self.experiment_folder, f'{reverse_barcode}/{forward_barcode}')}
96
104
  df = pd.DataFrame()
97
105
  df['Plate'] = plates
98
106
  df['Well'] = wells
@@ -100,14 +108,6 @@ class VariantCaller:
100
108
  df['ID'] = renamed_ids
101
109
  return df
102
110
 
103
- @staticmethod
104
- def load_reference(reference_path):
105
- # The reference enables multiple parents to be used for different
106
- # WARNING: this assumes all the parents are the same
107
- ref_seq = str(SeqIO.read(template_fasta,'fasta').seq)
108
- barcode_to_plate_name = experiment_name
109
- return 'Parent', ref_seq, barcode_to_plate_name
110
-
111
111
  @staticmethod
112
112
  def _barcode_to_well(barcode):
113
113
  match = re.search(r'\d+', barcode)
@@ -124,28 +124,32 @@ class VariantCaller:
124
124
  try:
125
125
  all_fastq = os.path.join(output_dir, '*.fastq')
126
126
  fastq_list = glob.glob(all_fastq)
127
- fastq_files = os.path.join(output_dir, f"demultiplexed_{filename}.fastq")
127
+ fastq_files = all_fastq # os.path.join(output_dir, f"demultiplexed_{filename}.fastq")
128
128
 
129
- if not fastq_list:
129
+ if not all_fastq:
130
130
  logger.error("No FASTQ files found in the specified output directory.")
131
131
  return
132
132
 
133
- # Combining fastq files into one
134
- with open(fastq_files, 'w') as outfile:
135
- for fastq in fastq_list:
136
- with open(fastq, 'r') as infile:
137
- outfile.write(infile.read())
138
-
133
+ # Combining fastq files into one if there are more than 1
134
+ if len(fastq_list) > 1:
135
+ with open(fastq_files, 'w') as outfile:
136
+ for fastq in fastq_list:
137
+ with open(fastq, 'r') as infile:
138
+ outfile.write(infile.read())
139
+ else:
140
+ fastq_files = fastq_list[0]
139
141
  # Alignment using minimap2
140
142
  minimap_cmd = f"minimap2 -ax map-ont -A {scores[0]} -B {scores[1]} -O {scores[2]},24 '{self.template_fasta}' '{fastq_files}' > '{output_dir}/{alignment_name}.sam'"
141
143
  subprocess.run(minimap_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
142
-
144
+ print(minimap_cmd)
143
145
  # Convert SAM to BAM and sort
144
146
  view_cmd = f"samtools view -bS '{output_dir}/{alignment_name}.sam' > '{output_dir}/{alignment_name}.bam'"
145
147
  subprocess.run(view_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
148
+ print(view_cmd)
146
149
 
147
150
  sort_cmd = f"samtools sort '{output_dir}/{alignment_name}.bam' -o '{output_dir}/{alignment_name}.bam'"
148
151
  subprocess.run(sort_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
152
+ print(sort_cmd)
149
153
 
150
154
  # Index the BAM file
151
155
  index_cmd = f"samtools index '{output_dir}/{alignment_name}.bam'"
@@ -163,18 +167,22 @@ class VariantCaller:
163
167
  for barcode_id in pbar:
164
168
  try:
165
169
  row = self.variant_dict.get(barcode_id)
166
- bam_file = os.path.join(row["Path"], f'{self.alignment_name}.bam')
170
+ bam_file = os.path.join(row["Path"], f'{self.alignment_name}_{barcode_id}.bam')
167
171
 
168
172
  # Check if alignment file exists, if not, align sequences
169
173
  if not os.path.exists(bam_file):
170
- logger.info(f"Aligning sequences for {row['Path']}")
171
- self._align_sequences(row["Path"], row['Barcodes'])
174
+ logger.info(f"Aligning sequences for {row['Path']}")
175
+ self._align_sequences(row["Path"], row['Barcodes'],
176
+ alignment_name=f'{self.alignment_name}_{barcode_id}')
172
177
 
173
178
  # Placeholder function calls to demonstrate workflow
174
179
  well_df, alignment_count = get_reads_for_well(self.experiment_name, bam_file,
175
- self.ref_str, f'{row["Path"]}/msa.fa')
176
- self.variant_dict[barcode_id]['Alignment Count'] = alignment_count
180
+ self.ref_str, f'{row["Path"]}/{self.alignment_name}_{barcode_id}.fa')
177
181
  if well_df is not None:
182
+ if self.oligopool:
183
+ if len(well_df.values) < 10:
184
+ continue
185
+ self.variant_dict[barcode_id]['Alignment Count'] = alignment_count
178
186
  well_df.to_csv(f"{row['Path']}/seq_{barcode_id}.csv", index=False)
179
187
  label, freq, combined_p_value, mixed_well, avg_error_rate = get_variant_label_for_well(well_df, threshold)
180
188
  self.variant_dict[barcode_id]['Variant'] = label
@@ -187,7 +195,7 @@ class VariantCaller:
187
195
  finally:
188
196
  pbar.update(1)
189
197
 
190
- def get_variant_df(self, threshold: float = 0.5, min_depth: int = 5, output_dir='', num_threads=10):
198
+ def get_variant_df(self, threshold: float = 0.5, min_depth: int = 5, output_dir='', num_threads=20):
191
199
  """
192
200
  Get Variant Data Frame for all samples in the experiment
193
201
 
@@ -202,26 +210,34 @@ class VariantCaller:
202
210
  data = []
203
211
  num = int(len(self.variant_df) / num_threads)
204
212
  self.variant_df.reset_index(inplace=True)
205
- for i in range(0, len(self.variant_df), num):
206
- end_i = i + num if i + num < len(self.variant_df) else len(self.variant_df)
207
- sub_df = self.variant_df.iloc[i: end_i]['ID'].values
208
- sub_data = [sub_df, threshold, min_depth, output_dir]
209
- data.append(sub_data)
213
+ if num_threads > 1:
214
+ for i in range(0, len(self.variant_df), num):
215
+ end_i = i + num if i + num < len(self.variant_df) else len(self.variant_df)
216
+ sub_df = self.variant_df.iloc[i: end_i]['ID'].values
217
+ sub_data = [sub_df, threshold, min_depth, output_dir]
218
+ data.append(sub_data)
210
219
 
211
- # Thread it
212
- pool.map(self._run_variant_thread, data)
220
+ # Thread it
221
+ pool.map(self._run_variant_thread, data)
222
+ else:
223
+ self._run_variant_thread([self.variant_df, threshold, min_depth, output_dir])
213
224
 
214
225
  self.variant_df['Variant'] = [self.variant_dict[b_id].get('Variant') for b_id in self.variant_df['ID'].values]
215
- self.variant_df['Mixed Well'] = [self.variant_dict[b_id].get('Mixed Well') for b_id in self.variant_df['ID'].values]
216
- self.variant_df['Average mutation frequency'] = [self.variant_dict[b_id].get('Average mutation frequency') for b_id in self.variant_df['ID'].values]
226
+ self.variant_df['Mixed Well'] = [self.variant_dict[b_id].get('Mixed Well') for b_id in
227
+ self.variant_df['ID'].values]
228
+ self.variant_df['Average mutation frequency'] = [self.variant_dict[b_id].get('Average mutation frequency') for
229
+ b_id in self.variant_df['ID'].values]
217
230
  self.variant_df['P value'] = [self.variant_dict[b_id].get('P value') for b_id in self.variant_df['ID'].values]
218
- self.variant_df['Alignment Count'] = [self.variant_dict[b_id].get('Alignment Count') for b_id in self.variant_df['ID'].values]
219
- self.variant_df['Average error rate'] = [self.variant_dict[b_id].get('Average error rate') for b_id in self.variant_df['ID'].values]
220
-
231
+ self.variant_df['Alignment Count'] = [self.variant_dict[b_id].get('Alignment Count') for b_id in
232
+ self.variant_df['ID'].values]
233
+ self.variant_df['Average error rate'] = [self.variant_dict[b_id].get('Average error rate') for b_id in
234
+ self.variant_df['ID'].values]
221
235
  # Adjust p-values using a simple Bonferroni correction
222
- self.variant_df['P adj. value'] = len(self.variant_df) * self.variant_df["P value"].values
223
- self.variant_df['P adj. value'] = [1 if x > 1 else x for x in self.variant_df["P adj. value"].values]
224
-
236
+ self.variant_df['P adj. value'] = [len(self.variant_df) * p if p else None for p in self.variant_df["P value"].values]
237
+ self.variant_df['P adj. value'] = [1 if x and x > 1 else x for x in self.variant_df["P adj. value"].values]
238
+ if self.oligopool:
239
+ # Filter this so we don't get all the junk
240
+ self.variant_df = self.variant_df[self.variant_df['Alignment Count'] > 2]
225
241
  return self.variant_df
226
242
 
227
243
  def _get_alignment_count(self, sample_folder_path: Path):
@@ -63,6 +63,7 @@ from bokeh.events import Tap
63
63
  from bokeh.io import save, show, output_file, output_notebook
64
64
 
65
65
  import panel as pn
66
+ import seaborn as sns
66
67
 
67
68
  from levseq.utils import *
68
69
 
@@ -1147,3 +1148,54 @@ def plot_sequence_alignment(
1147
1148
  toolbar_location=None,
1148
1149
  sizing_mode=sizing_mode,
1149
1150
  )
1151
+
1152
+
1153
+ def make_oligopool_plates(vis_df, result_folder, save_files=False):
1154
+ """ Simple heatmaps saved as SVGs for oligopool plates."""
1155
+ parents = vis_df[vis_df['amino_acid_substitutions'] == '#PARENT#']
1156
+ top_well_df = parents.sort_values(by='Alignment Count', ascending=False)
1157
+ top_well_df = top_well_df.drop_duplicates('name', keep='first')
1158
+ # This is one of the things that they will want returned
1159
+ if save_files:
1160
+ top_well_df.to_csv(os.path.join(result_folder, 'best_aligned_parents.csv'), index=False)
1161
+ # Now for each plate we make a heatmap
1162
+ vis_df['amino_acid_substitutions'] = [n if x == '#PARENT#' else x for n, x in
1163
+ vis_df[['name', 'amino_acid_substitutions']].values]
1164
+
1165
+ plates = set(vis_df['barcode_plate'].values)
1166
+ # Build one heatmap per barcode plate
1167
+ for plate in plates:
1168
+ df = vis_df[vis_df['barcode_plate'] == plate]
1169
+ df = df.sort_values(by='Alignment Count', ascending=False)
1170
+ # Keep only one of the variants per well (i.e. the dominant one)
1171
+ df = df.drop_duplicates('Well')
1172
+ # Reshape into a well format
1173
+ df['Column'] = [int(i[1:]) for i in df['Well'].values]
1174
+ df['Row'] = [i[0] for i in df['Well'].values]
1175
+ df.sort_values(by=['Column', 'Row'], inplace=True, ascending=[False, True])
1176
+ # Pivot the per-well table into a Row x Column plate grid of alignment counts
1177
+ platemap = (
1178
+ df
1179
+ .pivot(index="Row", columns="Column", values="Alignment Count")
1180
+ )
1181
+ platemap_labels = (
1182
+ df
1183
+ .pivot(index="Row", columns="Column", values="amino_acid_substitutions")
1184
+ )
1185
+ plot_seaborn_heatmap(platemap, platemap_labels,f'{plate}', result_folder)
1186
+
1187
+ def plot_seaborn_heatmap(platemap, platemap_labels, label: str, result_folder):
1188
+ """ Plot the seaborn platemap using the data"""
1189
+ platemap = platemap.fillna(0)
1190
+ sns.set_theme()
1191
+ f, ax = plt.subplots(figsize=(16, 8))
1192
+ plt.rcParams['svg.fonttype'] = 'none' # Ensure text is saved as text
1193
+ row_labels = [str(s) for s in list(platemap.index)]
1194
+ col_labels = [str(s) for s in list(platemap.columns)]
1195
+ data = platemap.values
1196
+ pc = sns.heatmap(data, cmap='Reds', annot=platemap_labels.values, xticklabels=col_labels, yticklabels=row_labels,
1197
+ fmt='', linewidths=.1)
1198
+ ax = pc.axes
1199
+ plt.yticks(rotation=0)
1200
+ plt.setp(ax.get_yticklabels(), ha="center")
1201
+ plt.savefig(os.path.join(result_folder, f'platemap_{label}.svg'))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: levseq
3
- Version: 1.3.3
3
+ Version: 1.4.0
4
4
  Home-page: https://github.com/fhalab/levseq/
5
5
  Author: Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy
6
6
  Author-email: ylong@caltech.edu
@@ -52,9 +52,13 @@ In directed evolution, sequencing every variant enhances data insight and create
52
52
  ![Figure 1: LevSeq Workflow](manuscript/figures/LevSeq_Figure-1.jpeg)
53
53
  Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
54
54
 
55
+ ## Website
56
+ A beta website is available [here](https://levseqdb.streamlit.app/): load your LevSeq output and your LCMS results directly to get visualizations and per-plate normalizations.
57
+
58
+ ## Data
55
59
 
56
60
  - Data to reproduce the results and to test are available on zenodo [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13694463.svg)](https://doi.org/10.5281/zenodo.13694463)
57
- - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://levseqdb.streamlit.app/) and code to host locally [here](https://github.com/fhalab/LevSeq_db)
61
+ - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://levseqdb.streamlit.app/) and code to host locally [here](https://github.com/fhalab/LevSeq_db)
58
62
 
59
63
  ## Setup
60
64
 
@@ -104,6 +108,11 @@ conda install -c bioconda -c conda-forge samtools
104
108
  ```
105
109
  conda install -c bioconda -c conda-forge minimap2
106
110
  ```
111
+ 3. gcc 13 and 14 on Mac M1 through M4 chips
112
+ ```
113
+ brew install gcc@13
114
+ brew install gcc@14
115
+ ```
107
116
  ### Docker Installation (Recommended for full pipeline)
108
117
  For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
109
118
  operating system (https://docs.docker.com/engine/install/).
@@ -21,9 +21,11 @@ import unittest
21
21
  import matplotlib.pyplot as plt
22
22
  from levseq import *
23
23
  from levseq.run_levseq import process_ref_csv
24
+
24
25
  u = SciUtil()
25
26
  import math
26
27
 
28
+
27
29
  class TestClass(unittest.TestCase):
28
30
 
29
31
  @classmethod
@@ -45,47 +47,54 @@ class TestClass(unittest.TestCase):
45
47
  def teardown_class(self):
46
48
  shutil.rmtree(self.tmp_dir)
47
49
 
50
+
48
51
  class TestDeploy(TestClass):
49
-
52
+
50
53
  def test_deploy(self):
51
54
  cmd_list = [
52
55
  'docker', # Needs to be installed as vina.
53
56
  'run',
54
57
  '--rm',
55
58
  '-v',
56
- f'{os.getcwd()}:/levseq_results',
59
+ f'{os.getcwd()}/test_data/laragen_run:/levseq_results',
57
60
  'levseq',
58
61
  'test_deploy',
59
- 'test_data/laragen_run/levseq-1.2.7/',
60
- 'test_data/laragen_run/20241116-LevSeq-Review-Validation-levseq_ref.csv'
62
+ 'levseq_results/levseq-1.2.7/',
63
+ 'levseq_results/20241116-LevSeq-Review-Validation-levseq_ref.csv'
61
64
  ]
65
+ print(' '.join(cmd_list))
62
66
  # ToDo: add in scoring function for ad4
63
- cmd_return = subprocess.run(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
64
- print(cmd_return.stdout, cmd_return)
65
-
67
+
68
+ # cmd_return = subprocess.run(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
69
+ # print(cmd_return.stdout, cmd_return)
70
+
66
71
  def test_variant_calling(self):
67
72
  # Take as input the demultiplexed fastq files and the reference csv file
68
- cl_args = {'skip_demultiplexing': True, 'skip_variantcalling': False}
73
+ cl_args = {'skip_demultiplexing': True, 'skip_variantcalling': False, 'threshold': 0.5}
69
74
  cl_args["name"] = 'test_deploy'
70
75
  cl_args['path'] = 'test_data/laragen_run/levseq-1.2.7/'
71
76
  cl_args["summary"] = 'test_data/laragen_run/20241116-LevSeq-Review-Validation-levseq_ref.csv'
72
77
  variant_df, ref_df = process_ref_csv(cl_args)
78
+ variant_df.to_csv('laragen_test_run.csv')
73
79
  # Now we want to check all the variants are the same as in the original case:
74
80
  checked_variants_df = pd.read_csv('test_data/laragen_run/levseq-1.2.7/variants_gold_standard.csv')
75
81
  checked_variants = checked_variants_df['Variant'].values
76
- checked_sig = checked_variants_df['P adj. value'].values
82
+ checked_sig = checked_variants_df['Average mutation frequency'].values
83
+ checked_alignments = checked_variants_df['Alignment Count'].values
84
+
77
85
  i = 0
78
- for variant, pval in variant_df[['Variant', 'P adj. value']].values:
86
+ for variant, freq, alignment_count, pval in variant_df[['Variant', 'Average mutation frequency',
87
+ 'Alignment Count', 'P adj. value']].values:
79
88
  print(variant, checked_variants[i])
80
89
  if checked_variants[i]:
81
90
  if variant:
82
91
  assert variant == checked_variants[i]
83
- # if pval < 0.05:
84
- # assert checked_sig[i] < 0.05
85
- # elif math.isnan(pval):
86
- # assert math.isnan(checked_sig[i])
87
- # else:
88
- # assert checked_sig[i] >= 0.05
89
- print(pval, checked_sig[i])
92
+ assert alignment_count == checked_alignments[i]
93
+ if freq != checked_sig[i]:
94
+ print(freq, checked_sig[i])
90
95
  i += 1
91
96
 
97
+
98
+ # docker run --rm -v /Users/arianemora/Documents/code/LevSeq/data/degradeo/20250121-JR-IM-HS:/levseq_results levseq 20250121-JR-IM-HS_oligopool levseq_results/ levseq_results/ref_seq_oligopools_single.csv --skip_variantcalling
99
+ # levseq oligpool_20250121-JR-IM-HS /Users/arianemora/Documents/code/LevSeq/data/degradeo/20250121-JR-IM-HS/ /Users/arianemora/Documents/code/LevSeq/data/degradeo/20250121-JR-IM-HS/ref_seq_oligopools_all.csv --skip_variantcalling
100
+ # levseq results results/ /Users/arianemora/Documents/code/LevSeq/data/degradeo/20250121-JR-IM-HS/ref_seq_oligopools_all.csv --skip_demultiplexing --oligopool
@@ -58,36 +58,10 @@ class TestClass(unittest.TestCase):
58
58
  def teardown_class(self):
59
59
  shutil.rmtree(self.tmp_dir)
60
60
 
61
- def test_making_pools(self):
62
- u.dp(["Testing SSM"])
63
- cl_args = {'skip_demultiplexing': True, 'skip_variantcalling': False}
64
- cl_args["name"] = 'oligopools'
65
- cl_args['path'] = '/Users/arianemora/Documents/projects/LevSeq/oligopools/'
66
- cl_args["summary"] = '/Users/arianemora/Documents/projects/LevSeq/oligopools/oligopool_seqs.csv'
67
- variant_df = process_ref_csv(cl_args)
68
-
69
- # Check if variants.csv already exist
70
- variant_csv_path = os.path.join('oligopools', "variants.csv")
71
- if os.path.exists(variant_csv_path):
72
- variant_df = pd.read_csv(variant_csv_path)
73
- df_variants, df_vis = create_df_v(variant_df)
74
- # Clean up and prepare dataframe for visualization
75
- else:
76
- df_variants, df_vis = create_df_v(variant_df)
77
-
78
- def test_pools(self):
79
- u.dp(["Testing SSM"])
80
- cl_args = {'skip_demultiplexing': True, 'skip_variantcalling': False}
81
- cl_args["name"] = 'oligopools'
82
- cl_args['path'] = '/Users/arianemora/Documents/projects/LevSeq/oligopools/'
83
- cl_args["summary"] = '/Users/arianemora/Documents/projects/LevSeq/oligopools/oligopool_seqs.csv'
84
- variant_df = process_ref_csv(cl_args)
85
-
86
- # Check if variants.csv already exist
87
- variant_csv_path = os.path.join('oligopools', "variants.csv")
88
- if os.path.exists(variant_csv_path):
89
- variant_df = pd.read_csv(variant_csv_path)
90
- df_variants, df_vis = create_df_v(variant_df)
91
- # Clean up and prepare dataframe for visualization
92
- else:
93
- df_variants, df_vis = create_df_v(variant_df)
61
+ def test_demultipluxing_pools(self):
62
+ # Take as input the demultiplexed fastq files and the reference csv file
63
+ cl_args = {'skip_demultiplexing': False, 'skip_variantcalling': False, 'threshold': 0.5, 'oligopool': True, 'show_msa': False}
64
+ cl_args["name"] = 'oligotest_21032025'
65
+ cl_args['path'] = 'test_oligopool_2103/'
66
+ cl_args["summary"] = 'test_oligopool_2103/test_oligopool_2103.csv'
67
+ run_LevSeq(cl_args)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes