levseq 1.4.2__tar.gz → 1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. {levseq-1.4.2/levseq.egg-info → levseq-1.5}/PKG-INFO +51 -2
  2. {levseq-1.4.2 → levseq-1.5}/README.md +38 -1
  3. {levseq-1.4.2 → levseq-1.5}/levseq/__init__.py +1 -1
  4. levseq-1.5/levseq/filter_orientation.py +221 -0
  5. {levseq-1.4.2 → levseq-1.5}/levseq/run_levseq.py +103 -29
  6. {levseq-1.4.2 → levseq-1.5}/levseq/utils.py +6 -4
  7. {levseq-1.4.2 → levseq-1.5}/levseq/variantcaller.py +107 -33
  8. {levseq-1.4.2 → levseq-1.5}/levseq/visualization.py +53 -22
  9. {levseq-1.4.2 → levseq-1.5/levseq.egg-info}/PKG-INFO +51 -2
  10. {levseq-1.4.2 → levseq-1.5}/tests/test_variant_calling.py +1 -1
  11. levseq-1.4.2/levseq/filter_orientation.py +0 -115
  12. {levseq-1.4.2 → levseq-1.5}/LICENSE +0 -0
  13. {levseq-1.4.2 → levseq-1.5}/MANIFEST.in +0 -0
  14. {levseq-1.4.2 → levseq-1.5}/levseq/IO_processor.py +0 -0
  15. {levseq-1.4.2 → levseq-1.5}/levseq/barcoding/__init__.py +0 -0
  16. {levseq-1.4.2 → levseq-1.5}/levseq/barcoding/demultiplex +0 -0
  17. {levseq-1.4.2 → levseq-1.5}/levseq/barcoding/demultiplex-arm64 +0 -0
  18. {levseq-1.4.2 → levseq-1.5}/levseq/barcoding/demultiplex-x86 +0 -0
  19. {levseq-1.4.2 → levseq-1.5}/levseq/barcoding/minion_barcodes.fasta +0 -0
  20. {levseq-1.4.2 → levseq-1.5}/levseq/basecaller.py +0 -0
  21. {levseq-1.4.2 → levseq-1.5}/levseq/cmd.py +0 -0
  22. {levseq-1.4.2 → levseq-1.5}/levseq/coordinates.py +0 -0
  23. {levseq-1.4.2 → levseq-1.5}/levseq/globals.py +0 -0
  24. {levseq-1.4.2 → levseq-1.5}/levseq/interface.py +0 -0
  25. {levseq-1.4.2 → levseq-1.5}/levseq/parser.py +0 -0
  26. {levseq-1.4.2 → levseq-1.5}/levseq/screen.py +0 -0
  27. {levseq-1.4.2 → levseq-1.5}/levseq/seqfit.py +0 -0
  28. {levseq-1.4.2 → levseq-1.5}/levseq/simulation.py +0 -0
  29. {levseq-1.4.2 → levseq-1.5}/levseq/user.py +0 -0
  30. {levseq-1.4.2 → levseq-1.5}/levseq.egg-info/SOURCES.txt +0 -0
  31. {levseq-1.4.2 → levseq-1.5}/levseq.egg-info/dependency_links.txt +0 -0
  32. {levseq-1.4.2 → levseq-1.5}/levseq.egg-info/entry_points.txt +0 -0
  33. {levseq-1.4.2 → levseq-1.5}/levseq.egg-info/requires.txt +0 -0
  34. {levseq-1.4.2 → levseq-1.5}/levseq.egg-info/top_level.txt +0 -0
  35. {levseq-1.4.2 → levseq-1.5}/setup.cfg +0 -0
  36. {levseq-1.4.2 → levseq-1.5}/setup.py +0 -0
  37. {levseq-1.4.2 → levseq-1.5}/tests/test_copy_fastq.py +0 -0
  38. {levseq-1.4.2 → levseq-1.5}/tests/test_demultiplex_docker.py +0 -0
  39. {levseq-1.4.2 → levseq-1.5}/tests/test_deploy.py +0 -0
  40. {levseq-1.4.2 → levseq-1.5}/tests/test_opligopools.py +0 -0
  41. {levseq-1.4.2 → levseq-1.5}/tests/test_seqfitvis.py +0 -0
  42. {levseq-1.4.2 → levseq-1.5}/tests/test_seqs.py +0 -0
  43. {levseq-1.4.2 → levseq-1.5}/tests/test_statistics.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: levseq
3
- Version: 1.4.2
3
+ Version: 1.5
4
4
  Home-page: https://github.com/fhalab/levseq/
5
5
  Author: Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy
6
6
  Author-email: ylong@caltech.edu
@@ -44,6 +44,18 @@ Requires-Dist: scikit-learn
44
44
  Requires-Dist: statsmodels
45
45
  Requires-Dist: tqdm
46
46
  Requires-Dist: biopandas
47
+ Dynamic: author
48
+ Dynamic: author-email
49
+ Dynamic: classifier
50
+ Dynamic: description
51
+ Dynamic: description-content-type
52
+ Dynamic: home-page
53
+ Dynamic: keywords
54
+ Dynamic: license
55
+ Dynamic: license-file
56
+ Dynamic: project-url
57
+ Dynamic: requires-dist
58
+ Dynamic: requires-python
47
59
 
48
60
  # Variant Sequencing with Nanopore (LevSeq)
49
61
 
@@ -52,8 +64,35 @@ LevSeq provides a streamlined pipeline for sequencing and analyzing genetic vari
52
64
  ![Figure 1: LevSeq Workflow](manuscript/figures/LevSeq_Figure-1.jpeg)
53
65
  Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
54
66
 
67
+ ## <span style="color: orange;">**Important: Barcode Improvements and LevSeq 2.0 Development**</span>
68
+
69
+ **We have identified and resolved demultiplexing challenges in the original barcode set.** Version 1.4 introduced alignment-aware variant calling to address these issues and significantly improve accuracy.
70
+
71
+ **We are actively developing LevSeq 2.0** in collaboration with DTU and AITHYRA to fundamentally redesign the barcode system. The updated approach includes:
72
+
73
+ - **Enhanced barcode design**: New barcodes will be strain-aware and sequence-aware, generated using an advanced barcode design tool
74
+ - **Reversed workflow architecture**: LevSeq 2.0 will perform alignment first, then demultiplexing (rather than the current demultiplexing-first approach), resolving issues with forward and reverse read handling
75
+ - **Improved accuracy**: These changes will provide more robust demultiplexing and variant calling across diverse experimental conditions
76
+
77
+ **Please reach out to us at ylong@caltech.edu if you are planning to order barcoded primers now**
78
+
79
+ ## Notes
80
+
81
+ LevSeq was designed for epPCR and SSM experiments; however, we are currently extending it to work for other enzyme engineering designs as well. The following features are currently under development:
82
+
83
+ 1. Insertion handling (see version 1.4.3) - thanks to Brian Zhong for his contributions to this section!
84
+ 2. Gene calling (handling different genes, use the `--oligopool` flag)
85
+
86
+ If you notice any issues with new features or have adapted the LevSeq code for your own use cases, we would love community contributions! Please submit either an issue or a pull request and we will aim to incorporate the changes.
87
+
88
+ Performance update: demultiplexing now runs in parallel batches of 8 plates and input FASTQs are staged once per run, improving throughput on multi-core systems.
89
+
55
90
  ## Quick Start
56
91
 
92
+ Note the current stable version is: `1.5`, the latest version is `1.5`.
93
+
94
+ Stable releases are made available via Docker and pip. For the latest version, please clone the repo and install locally (see *Local development or install of latest version* below).
95
+
57
96
  ### Docker Installation (Recommended)
58
97
 
59
98
  1. Install Docker: [https://docs.docker.com/engine/install/](https://docs.docker.com/engine/install/)
@@ -183,6 +222,16 @@ For the wet lab protocol:
183
222
  - **Advanced Usage**: See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb)
184
223
  - **Troubleshooting**: See our [computational protocols wiki](https://github.com/fhalab/LevSeq/wiki/Computational-protocols)
185
224
 
225
+ ### Local development or install of latest version
226
+
227
+ ```
228
+ conda create --name levseq python=3.10
229
+ git clone git@github.com:fhalab/LevSeq.git
230
+ cd LevSeq
231
+ python setup.py sdist bdist_wheel
232
+ pip install dist/levseq-1.4.3.tar.gz
233
+ ```
234
+
186
235
  ## Citing LevSeq
187
236
 
188
237
  If you find LevSeq useful, please cite our paper:
@@ -5,8 +5,35 @@ LevSeq provides a streamlined pipeline for sequencing and analyzing genetic vari
5
5
  ![Figure 1: LevSeq Workflow](manuscript/figures/LevSeq_Figure-1.jpeg)
6
6
  Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
7
7
 
8
+ ## <span style="color: orange;">**Important: Barcode Improvements and LevSeq 2.0 Development**</span>
9
+
10
+ **We have identified and resolved demultiplexing challenges in the original barcode set.** Version 1.4 introduced alignment-aware variant calling to address these issues and significantly improve accuracy.
11
+
12
+ **We are actively developing LevSeq 2.0** in collaboration with DTU and AITHYRA to fundamentally redesign the barcode system. The updated approach includes:
13
+
14
+ - **Enhanced barcode design**: New barcodes will be strain-aware and sequence-aware, generated using an advanced barcode design tool
15
+ - **Reversed workflow architecture**: LevSeq 2.0 will perform alignment first, then demultiplexing (rather than the current demultiplexing-first approach), resolving issues with forward and reverse read handling
16
+ - **Improved accuracy**: These changes will provide more robust demultiplexing and variant calling across diverse experimental conditions
17
+
18
+ **Please reach out to us at ylong@caltech.edu if you are planning to order barcoded primers now**
19
+
20
+ ## Notes
21
+
22
+ LevSeq was designed for epPCR and SSM experiments; however, we are currently extending it to work for other enzyme engineering designs as well. The following features are currently under development:
23
+
24
+ 1. Insertion handling (see version 1.4.3) - thanks to Brian Zhong for his contributions to this section!
25
+ 2. Gene calling (handling different genes, use the `--oligopool` flag)
26
+
27
+ If you notice any issues with new features or have adapted the LevSeq code for your own use cases, we would love community contributions! Please submit either an issue or a pull request and we will aim to incorporate the changes.
28
+
29
+ Performance update: demultiplexing now runs in parallel batches of 8 plates and input FASTQs are staged once per run, improving throughput on multi-core systems.
30
+
8
31
  ## Quick Start
9
32
 
33
+ Note the current stable version is: `1.5`, the latest version is `1.5`.
34
+
35
+ Stable releases are made available via Docker and pip. For the latest version, please clone the repo and install locally (see *Local development or install of latest version* below).
36
+
10
37
  ### Docker Installation (Recommended)
11
38
 
12
39
  1. Install Docker: [https://docs.docker.com/engine/install/](https://docs.docker.com/engine/install/)
@@ -136,6 +163,16 @@ For the wet lab protocol:
136
163
  - **Advanced Usage**: See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb)
137
164
  - **Troubleshooting**: See our [computational protocols wiki](https://github.com/fhalab/LevSeq/wiki/Computational-protocols)
138
165
 
166
+ ### Local development or install of latest version
167
+
168
+ ```
169
+ conda create --name levseq python=3.10
170
+ git clone git@github.com:fhalab/LevSeq.git
171
+ cd LevSeq
172
+ python setup.py sdist bdist_wheel
173
+ pip install dist/levseq-1.4.3.tar.gz
174
+ ```
175
+
139
176
  ## Citing LevSeq
140
177
 
141
178
  If you find LevSeq useful, please cite our paper:
@@ -152,4 +189,4 @@ If you find LevSeq useful, please cite our paper:
152
189
 
153
190
  ## Contact
154
191
 
155
- Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
192
+ Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
@@ -18,7 +18,7 @@
18
18
  __title__ = 'levseq'
19
19
  __description__ = 'LevSeq nanopore sequencing'
20
20
  __url__ = 'https://github.com/fhalab/levseq/'
21
- __version__ = '1.4.2'
21
+ __version__ = '1.5'
22
22
  __author__ = 'Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy'
23
23
  __author_email__ = 'ylong@caltech.edu'
24
24
  __license__ = 'GPL3'
@@ -0,0 +1,221 @@
1
+ from Bio import SeqIO
2
+ from Bio.Seq import Seq
3
+ from pathlib import Path
4
+ import gzip
5
+ import logging
6
+ import math
7
+ import shutil
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ from tqdm import tqdm
10
+
11
+ _VALID_BASES = set("ACGT")
12
+
13
+
14
def build_kmer_set(seq, kmer_size):
    """Collect the distinct k-mers of ``seq`` that contain only A, C, G or T.

    Args:
        seq (str): Sequence to scan; it is upper-cased before scanning.
        kmer_size (int): Window length.

    Returns:
        set[str]: Every window of ``kmer_size`` characters made up solely of
        the characters in ``_VALID_BASES``. Empty when ``kmer_size`` is not
        positive or the sequence is shorter than one window.
    """
    if kmer_size <= 0:
        return set()
    upper_seq = seq.upper()
    window_count = len(upper_seq) - kmer_size + 1
    if window_count <= 0:
        return set()
    windows = (
        upper_seq[start:start + kmer_size] for start in range(window_count)
    )
    # Windows holding any other character (e.g. N) are left out.
    return {window for window in windows if _VALID_BASES.issuperset(window)}
27
+
28
+
29
def sample_kmer_positions(seq_len, kmer_size, samples, skip_front, skip_back):
    """Pick evenly spaced k-mer start offsets inside a read.

    Offsets are spread between ``skip_front`` and the last offset at which a
    full k-mer still fits before the final ``skip_back`` bases.

    Args:
        seq_len (int): Length of the read.
        kmer_size (int): Length of each k-mer window.
        samples (int): Number of offsets requested.
        skip_front (int): Bases to leave out at the start (negative counts as 0).
        skip_back (int): Bases to leave out at the end (negative counts as 0).

    Returns:
        list[int]: Unique offsets in ascending order. Empty when nothing can
        be sampled: non-positive ``kmer_size`` or ``samples``, or a read too
        short to hold one k-mer between the skipped regions. Fewer than
        ``samples`` offsets are returned when rounding makes some coincide.
    """
    # A non-positive k-mer length has no meaningful window; build_kmer_set
    # treats it the same way, so both sides agree on "nothing to compare".
    if kmer_size <= 0 or samples <= 0:
        return []

    start_min = max(skip_front, 0)
    start_max = seq_len - max(skip_back, 0) - kmer_size
    if start_max < start_min:
        return []

    span = start_max - start_min
    if span == 0:
        return [start_min]
    if samples == 1:
        # A single sample is taken from the middle of the usable region.
        return [start_min + span // 2]

    step = span / (samples - 1)
    seen = set()
    uniq_positions = []
    for i in range(samples):
        pos = int(round(start_min + i * step))
        # Defensive clamp against rounding; also drop offsets that collapsed
        # onto an earlier one because step < 1.
        if pos < start_min or pos > start_max or pos in seen:
            continue
        seen.add(pos)
        uniq_positions.append(pos)
    return uniq_positions
51
+
52
+
53
def count_kmer_hits(seq, positions, kmer_size, parent_kmers, rev_kmers):
    """Count sampled k-mers of ``seq`` found in each of two k-mer sets.

    Args:
        seq (str): Read sequence to sample from.
        positions (iterable of int): Start offsets of the windows to test.
        kmer_size (int): Window length.
        parent_kmers: Container of k-mers counted as forward hits.
        rev_kmers: Container of k-mers counted as reverse hits.

    Returns:
        tuple[int, int, int]: ``(forward_hits, reverse_hits, sampled)`` where
        ``sampled`` is the number of windows that were full length and made
        up only of characters in ``_VALID_BASES``.
    """
    fwd_total = 0
    rev_total = 0
    usable = 0
    for start in positions:
        window = seq[start:start + kmer_size]
        # Skip windows truncated by the end of the read or holding a
        # character outside _VALID_BASES; they count towards nothing.
        if len(window) != kmer_size or not _VALID_BASES.issuperset(window):
            continue
        usable += 1
        if window in parent_kmers:
            fwd_total += 1
        if window in rev_kmers:
            rev_total += 1
    return fwd_total, rev_total, usable
68
+
69
+
70
def iter_fastq_records(handle):
    """Yield successive groups of four lines read from ``handle``.

    Args:
        handle: Object exposing ``readline()``.

    Yields:
        tuple: ``(header, seq, plus, qual)`` exactly as read, line endings
        included. Iteration ends at end of input; a final group with fewer
        than four lines is not yielded.
    """
    next_line = handle.readline
    header = next_line()
    while header:
        sequence, separator, quality = next_line(), next_line(), next_line()
        if not (sequence and separator and quality):
            # Input ended in the middle of a record.
            return
        yield header, sequence, separator, quality
        header = next_line()
81
+
82
def filter_single_file(args):
    """
    Filter a single fastq file. Used for parallel processing.

    Reads whose sampled k-mers favour the expected orientation by at least
    the required margin are written to a sibling ``temp_<name>`` file; the
    input file itself is not modified here.

    Args:
        args: tuple containing (input_file, parent_kmers, rev_kmers, kmer_size, samples,
              skip_front, skip_back, min_delta, min_ratio)
    Returns:
        tuple: (file_path, total_reads, kept_reads, temp_file)
    Raises:
        Any exception raised while reading or writing is propagated after the
        partially written temp file has been removed.
    """
    (input_file, parent_kmers, rev_kmers, kmer_size, samples,
     skip_front, skip_back, min_delta, min_ratio) = args
    total_reads = 0
    kept_count = 0

    # NOTE(review): this matches "forward" anywhere in the full path, not only
    # in the file name - confirm against the demultiplexer's output naming.
    is_forward = "forward" in str(input_file).lower()

    input_path = Path(input_file)
    temp_file = input_path.parent / f"temp_{input_path.name}"
    # Sampling offsets depend only on read length, so compute them once per length.
    position_cache = {}
    # Use the same opener for input and output so a gzip input yields a gzip
    # temp file; otherwise plain text would end up under a ".gz" name.
    open_fn = gzip.open if input_path.suffix == ".gz" else open
    try:
        with open_fn(input_path, "rt") as input_handle, \
                open_fn(temp_file, "wt") as output_handle:
            for header, seq_line, plus, qual in iter_fastq_records(input_handle):
                total_reads += 1
                seq = seq_line.strip().upper()
                seq_len = len(seq)
                if seq_len not in position_cache:
                    position_cache[seq_len] = sample_kmer_positions(
                        seq_len, kmer_size, samples, skip_front, skip_back
                    )
                positions = position_cache[seq_len]
                forward_hits, reverse_hits, sampled = count_kmer_hits(
                    seq, positions, kmer_size, parent_kmers, rev_kmers
                )
                if sampled == 0:
                    # Nothing could be sampled; the read is dropped.
                    continue
                # The margin is the larger of the absolute and relative
                # thresholds, capped at the number of k-mers actually sampled.
                required_delta = min(
                    max(min_delta, int(math.ceil(min_ratio * sampled))),
                    sampled,
                )

                if is_forward:
                    # Forward file (plate barcode was rev comp): the read
                    # should align to the reverse complement parent sequence.
                    delta = reverse_hits - forward_hits
                else:
                    # Reverse file (plate barcode was forward): the read was
                    # already reverse complemented by the demultiplexer, so it
                    # should align to the forward parent sequence.
                    delta = forward_hits - reverse_hits

                if delta >= required_delta:
                    output_handle.writelines((header, seq_line, plus, qual))
                    kept_count += 1
    except BaseException:
        # Do not leave a half-written temp file next to the input.
        try:
            temp_file.unlink()
        except FileNotFoundError:
            pass
        raise

    return str(input_file), total_reads, kept_count, str(temp_file)
141
+
142
def filter_demultiplexed_folder(
    experiment_folder,
    parent_sequence,
    num_threads=8,
    kmer_size=6,
    samples=40,
    skip_front=100,
    skip_back=0,
    min_delta=4,
    min_ratio=0.1,
):
    """
    Filter demultiplexed files using a k-mer orientation heuristic.

    Every ``*.fastq`` file found under ``RB*/NB*`` sub-directories is filtered
    by ``filter_single_file`` in a thread pool and then replaced in place by
    its filtered version.

    Args:
        experiment_folder (str): Path to experiment folder containing RBC/FBC structure
        parent_sequence (str): Parent sequence for alignment checking
        num_threads (int): Number of threads to use
        kmer_size (int): Length of k-mer used for orientation checks
        samples (int): Number of k-mers sampled per read
        skip_front (int): Bases to skip from the front of the read
        skip_back (int): Bases to skip from the end of the read
        min_delta (int): Minimum hit difference to keep a read
        min_ratio (float): Minimum hit difference as a ratio of sampled k-mers

    Returns:
        dict: Maps each successfully processed file path (str) to a dict with
        keys ``'total'``, ``'kept'`` and ``'filtered'``. Files that failed are
        logged and left out. Empty when no fastq files are found.
    """
    exp_path = Path(experiment_folder)
    filtered_counts = {}

    # Prepare parent k-mer sets once; they are shared by every worker.
    parent_seq_obj = Seq(parent_sequence)
    parent_seq = str(parent_seq_obj).upper()
    parent_rev_comp = str(parent_seq_obj.reverse_complement()).upper()
    parent_kmers = build_kmer_set(parent_seq, kmer_size)
    rev_kmers = build_kmer_set(parent_rev_comp, kmer_size)

    # Collect all fastq files
    fastq_files = []
    for rbc_dir in exp_path.glob("RB*"):
        if not rbc_dir.is_dir():
            continue
        for fbc_dir in rbc_dir.glob("NB*"):
            if not fbc_dir.is_dir():
                continue
            fastq_files.extend(fbc_dir.glob("*.fastq"))

    if not fastq_files:
        logging.warning(f"No fastq files found in {experiment_folder}")
        return filtered_counts

    # Process files in parallel with progress bar
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Remember which input each future belongs to, so a failure can be
        # reported against the right file even when result() raises before
        # any path has been returned.
        future_to_file = {
            executor.submit(
                filter_single_file,
                (f, parent_kmers, rev_kmers, kmer_size, samples,
                 skip_front, skip_back, min_delta, min_ratio),
            ): f
            for f in fastq_files
        }

        with tqdm(total=len(fastq_files), desc="Filtering files") as pbar:
            for future in as_completed(future_to_file):
                source_file = future_to_file[future]
                try:
                    file_path, total, kept, temp_file = future.result()

                    # Replace the original file with its filtered version.
                    shutil.move(temp_file, file_path)

                    filtered_counts[file_path] = {
                        'total': total,
                        'kept': kept,
                        'filtered': total - kept
                    }

                    logging.info(f"Processed {file_path}: {kept}/{total} reads kept")

                except Exception as e:
                    # One bad file must not abort the rest of the batch.
                    logging.error(f"Error processing file {source_file}: {str(e)}")
                finally:
                    pbar.update(1)

    return filtered_counts
@@ -62,6 +62,7 @@ import numpy as np
62
62
  import tqdm
63
63
  import panel as pn
64
64
  import holoviews as hv
65
+ from concurrent.futures import ThreadPoolExecutor, as_completed
65
66
  from importlib import resources
66
67
  from holoviews.streams import Tap
67
68
 
@@ -485,6 +486,11 @@ def process_ref_csv_oligopool(cl_args, tqdm_fn=tqdm.tqdm):
485
486
  result_folder = create_result_folder(cl_args)
486
487
  variant_csv_path = os.path.join(result_folder, "variants.csv")
487
488
  variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
489
+
490
+ output_dir = Path(result_folder) / f"{cl_args['name']}_fastq"
491
+ output_dir.mkdir(parents=True, exist_ok=True)
492
+ if not cl_args["skip_demultiplexing"]:
493
+ cat_fastq_files(cl_args.get("path"), output_dir)
488
494
 
489
495
  # First get the different barcode plates (these will be unique)
490
496
  barcode_plates = ref_df["barcode_plate"].unique()
@@ -496,10 +502,7 @@ def process_ref_csv_oligopool(cl_args, tqdm_fn=tqdm.tqdm):
496
502
  name_folder = os.path.join(result_folder, f'RB{barcode_plate}')
497
503
  os.makedirs(name_folder, exist_ok=True)
498
504
  barcode_path = filter_bc(cl_args, name_folder, i)
499
- output_dir = Path(result_folder) / f"{cl_args['name']}_fastq"
500
- output_dir.mkdir(parents=True, exist_ok=True)
501
505
 
502
- file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)
503
506
  try:
504
507
  demux_fastq(output_dir, name_folder, barcode_path)
505
508
  except Exception as e:
@@ -543,62 +546,134 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
543
546
  variant_csv_path = os.path.join(result_folder, "variants.csv")
544
547
 
545
548
  variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
546
-
547
- for i, row in tqdm_fn(ref_df.iterrows(), total=len(ref_df), desc="Processing Samples"):
549
+
550
+ output_dir = Path(result_folder) / f"{cl_args['name']}_fastq"
551
+ output_dir.mkdir(parents=True, exist_ok=True)
552
+ if not cl_args["skip_demultiplexing"]:
553
+ cat_fastq_files(cl_args.get("path"), output_dir)
554
+
555
+ samples = []
556
+ for i, row in ref_df.iterrows():
548
557
  barcode_plate = row["barcode_plate"]
549
558
  name = row["name"]
550
559
  refseq = row["refseq"].upper()
551
560
 
552
561
  name_folder = os.path.join(result_folder, name)
553
562
  os.makedirs(name_folder, exist_ok=True)
554
-
563
+
555
564
  temp_fasta_path = os.path.join(name_folder, f"temp_{name}.fasta")
556
565
  if not os.path.exists(temp_fasta_path):
557
566
  with open(temp_fasta_path, "w") as f:
558
567
  f.write(f">{name}\n{refseq}\n")
559
568
  else:
560
569
  logging.info(f"Fasta file for {name} already exists. Skipping write.")
561
-
562
- barcode_path = filter_bc(cl_args, name_folder, i)
563
- output_dir = Path(result_folder) / f"{cl_args['name']}_fastq"
564
- output_dir.mkdir(parents=True, exist_ok=True)
565
570
 
566
- if not cl_args["skip_demultiplexing"]:
567
- file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)
571
+ barcode_path = filter_bc(cl_args, name_folder, i)
572
+ samples.append({
573
+ "barcode_plate": barcode_plate,
574
+ "name": name,
575
+ "refseq": refseq,
576
+ "name_folder": name_folder,
577
+ "temp_fasta_path": temp_fasta_path,
578
+ "barcode_path": barcode_path,
579
+ "demux_ok": True,
580
+ })
581
+
582
+ def _demux_only(sample):
583
+ name = sample["name"]
584
+ name_folder = sample["name_folder"]
585
+ barcode_path = sample["barcode_path"]
586
+ try:
587
+ demux_fastq(output_dir, name_folder, barcode_path)
588
+ return True
589
+ except Exception:
590
+ logging.error(
591
+ "An error occurred during demultiplexing for sample {}. Skipping this sample.".format(name),
592
+ exc_info=True,
593
+ )
594
+ return False
595
+
596
+ if not cl_args["skip_demultiplexing"]:
597
+ batch_size = 8
598
+ if samples:
599
+ pbar = tqdm_fn(total=len(samples), desc="Demultiplex plates")
568
600
  try:
569
- demux_fastq(output_dir, name_folder, barcode_path)
601
+ for i in range(0, len(samples), batch_size):
602
+ batch = samples[i:i + batch_size]
603
+ max_workers = len(batch)
604
+ if max_workers <= 1:
605
+ sample = batch[0]
606
+ sample["demux_ok"] = _demux_only(sample)
607
+ pbar.update(1)
608
+ continue
609
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
610
+ futures = {executor.submit(_demux_only, sample): sample for sample in batch}
611
+ for future in as_completed(futures):
612
+ sample = futures[future]
613
+ sample["demux_ok"] = future.result()
614
+ pbar.update(1)
615
+ finally:
616
+ pbar.close()
617
+ else:
618
+ for sample in samples:
619
+ sample["demux_ok"] = True
570
620
 
571
- # Add filtering step here with multithreading
621
+ if not cl_args["skip_demultiplexing"]:
622
+ for sample in tqdm_fn(samples, total=len(samples), desc="Filter plates"):
623
+ if not sample["demux_ok"]:
624
+ continue
625
+ name = sample["name"]
626
+ refseq = sample["refseq"]
627
+ name_folder = sample["name_folder"]
628
+ try:
572
629
  filtered_counts = filter_demultiplexed_folder(
573
- name_folder,
574
- refseq,
575
- num_threads=10
630
+ name_folder,
631
+ refseq,
632
+ num_threads=10,
576
633
  )
577
634
  logging.info(f"Orientation filtering completed for {name}")
578
635
  total_reads = sum(counts['total'] for counts in filtered_counts.values())
579
636
  kept_reads = sum(counts['kept'] for counts in filtered_counts.values())
580
- logging.info(f"Total filtering results: {kept_reads}/{total_reads} reads kept ({kept_reads/total_reads*100:.2f}%)")
637
+ if total_reads:
638
+ logging.info(
639
+ "Total filtering results: %d/%d reads kept (%.2f%%)",
640
+ kept_reads,
641
+ total_reads,
642
+ kept_reads / total_reads * 100,
643
+ )
581
644
  for file, counts in filtered_counts.items():
582
645
  logging.info(f"{file}: {counts['kept']}/{counts['total']} reads kept")
646
+ except Exception:
647
+ logging.error(
648
+ "An error occurred during filtering for sample {}. Skipping this sample.".format(name),
649
+ exc_info=True,
650
+ )
651
+ sample["demux_ok"] = False
652
+ continue
583
653
 
584
-
585
- except Exception as e:
586
- logging.error("An error occurred during demultiplexing/filtering for sample {}. Skipping this sample.".format(name), exc_info=True)
654
+ if not cl_args["skip_variantcalling"]:
655
+ for sample in tqdm_fn(samples, total=len(samples), desc="Calling variants"):
656
+ if not sample["demux_ok"] and not cl_args["skip_demultiplexing"]:
587
657
  continue
588
-
589
- if not cl_args["skip_variantcalling"]:
590
658
  try:
591
659
  threshold = cl_args.get("threshold") if cl_args.get("threshold") is not None else 0.5
592
660
  variant_result = call_variant(
593
- f"{name}", name_folder, temp_fasta_path, barcode_path, threshold=threshold
661
+ f"{sample['name']}",
662
+ sample["name_folder"],
663
+ sample["temp_fasta_path"],
664
+ sample["barcode_path"],
665
+ threshold=threshold,
594
666
  )
595
- variant_result["barcode_plate"] = barcode_plate
596
- variant_result["name"] = name
597
- variant_result["refseq"] = refseq
667
+ variant_result["barcode_plate"] = sample["barcode_plate"]
668
+ variant_result["name"] = sample["name"]
669
+ variant_result["refseq"] = sample["refseq"]
598
670
 
599
671
  variant_df = pd.concat([variant_df, variant_result])
600
672
  except Exception as e:
601
- logging.error("An error occurred during variant calling for sample {}. Skipping this sample.".format(name), exc_info=True)
673
+ logging.error(
674
+ "An error occurred during variant calling for sample {}. Skipping this sample.".format(sample["name"]),
675
+ exc_info=True,
676
+ )
602
677
  continue
603
678
 
604
679
  variant_df.to_csv(variant_csv_path, index=False)
@@ -676,4 +751,3 @@ def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
676
751
 
677
752
  # This modification saves the results at each critical stage, ensuring that even in the case of failure,
678
753
  # the user has access to intermediate results and does not lose all the progress.
679
-
@@ -205,7 +205,7 @@ def calculate_mutation_significance_across_well(seq_df):
205
205
  seq_df.at[i, 'p(g)'] = p_g
206
206
  seq_df.at[i, 'p(c)'] = p_c
207
207
  seq_df.at[i, 'p(n)'] = p_n
208
- seq_df.at[i, 'p(i)'] = p_n
208
+ seq_df.at[i, 'p(i)'] = p_i
209
209
  seq_df.at[i, 'p_value'] = p_value
210
210
  seq_df.at[i, 'percent_most_freq_mutation'] = val
211
211
  seq_df.at[i, 'most_frequent'] = actual_seq
@@ -324,6 +324,8 @@ def get_reads_for_well(parent_name, bam_file_path: str, ref_str: str, msa_path=N
324
324
  'C', 'p(c)', 'N', 'p(n)', 'I', 'p(i)', 'Warnings']
325
325
  return calculate_mutation_significance_across_well(seq_df), alignment_count
326
326
  return None, 0
327
+
328
+
327
329
  def make_row_from_read_pileup_across_well(well_df, ref_str, label, insert_map):
328
330
  """
329
331
  Given a pileup of reads, we want to get some summary information about that sequence
@@ -349,12 +351,12 @@ def make_row_from_read_pileup_across_well(well_df, ref_str, label, insert_map):
349
351
  warning = f'WARNING: INSERT.'
350
352
  rows.append([label, col, ref_seq, actual_seq, freq_non_ref, total_other, total_reads, 1.0, 0.0,
351
353
  len(vc[vc == 'A']), 1.0, len(vc[vc == 'T']), 1.0, len(vc[vc == 'G']), 1.0,
352
- len(vc[vc == 'C']), 1.0, len(vc[vc == '-']), 1.0, len(insert_map.get(col)),
354
+ len(vc[vc == 'C']), 1.0, len(vc[vc == '-']), 1.0, len(vc[vc == 'I']),
353
355
  1.0, warning])
354
- if ref_seq != '-':
356
+ elif ref_seq != '-':
355
357
  rows.append([label, col, ref_seq, actual_seq, freq_non_ref, total_other, total_reads, 1.0, 0.0,
356
358
  len(vc[vc == 'A']), 1.0, len(vc[vc == 'T']), 1.0, len(vc[vc == 'G']), 1.0,
357
- len(vc[vc == 'C']), 1.0, len(vc[vc == '-']), 1.0, 0,
359
+ len(vc[vc == 'C']), 1.0, len(vc[vc == '-']), 1.0, len(vc[vc == 'I']),
358
360
  1.0, warning])
359
361
  return rows
360
362
 
@@ -18,6 +18,7 @@ import pandas as pd
18
18
  import logging
19
19
  from levseq.utils import *
20
20
  import subprocess
21
+ import shutil
21
22
  import os
22
23
  from collections import defaultdict
23
24
  import glob
@@ -41,9 +42,7 @@ The variant caller starts from demultiplexed fastq files.
41
42
 
42
43
  logger = logging.getLogger(__name__)
43
44
  logger.setLevel(logging.WARNING) # Set default level for this module
44
- # Use the logger in this file
45
- logger.warning("This is a warning message.")
46
- logger.info("This won't show unless logging is configured to INFO elsewhere.")
45
+ # Use the logger in this file.
47
46
 
48
47
  class VariantCaller:
49
48
  """
@@ -123,47 +122,108 @@ class VariantCaller:
123
122
  def _align_sequences(self, output_dir, filename, scores=[4, 2, 10], alignment_name="alignment_minimap"):
124
123
  try:
125
124
  all_fastq = os.path.join(output_dir, '*.fastq')
126
- fastq_list = glob.glob(all_fastq)
127
- fastq_files = all_fastq # os.path.join(output_dir, f"demultiplexed_{filename}.fastq")
128
-
129
- if not all_fastq:
125
+ fastq_list = sorted(glob.glob(all_fastq))
126
+ if not fastq_list:
130
127
  logger.error("No FASTQ files found in the specified output directory.")
131
128
  return
132
129
 
133
- # Combining fastq files into one if there are more than 1
134
- if len(fastq_list) > 1:
135
- with open(fastq_files, 'w') as outfile:
136
- for fastq in fastq_list:
137
- with open(fastq, 'r') as infile:
138
- outfile.write(infile.read())
139
- else:
140
- fastq_files = fastq_list[0]
130
+ sam_path = os.path.join(output_dir, f"{alignment_name}.sam")
141
131
  # Alignment using minimap2
142
- minimap_cmd = f"minimap2 -ax map-ont -A {scores[0]} -B {scores[1]} -O {scores[2]},24 '{self.template_fasta}' '{fastq_files}' > '{output_dir}/{alignment_name}.sam'"
143
- subprocess.run(minimap_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
132
+ minimap_cmd = [
133
+ "minimap2", "-ax", "map-ont",
134
+ "-A", str(scores[0]),
135
+ "-B", str(scores[1]),
136
+ "-O", f"{scores[2]},24",
137
+ str(self.template_fasta),
138
+ *fastq_list,
139
+ ]
140
+ with open(sam_path, "w") as sam_handle:
141
+ minimap_result = subprocess.run(
142
+ minimap_cmd,
143
+ stdout=sam_handle,
144
+ stderr=subprocess.PIPE,
145
+ text=True,
146
+ )
147
+ if minimap_result.returncode != 0:
148
+ logger.error(
149
+ "minimap2 failed for %s: %s",
150
+ filename,
151
+ minimap_result.stderr.strip(),
152
+ )
153
+ return
144
154
  # print(minimap_cmd)
145
- # Convert SAM to BAM and sort
146
- view_cmd = f"samtools view -bS '{output_dir}/{alignment_name}.sam' > '{output_dir}/{alignment_name}.bam'"
147
- subprocess.run(view_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
148
- # print(view_cmd)
155
+ # Convert SAM to BAM
156
+ unsorted_bam = os.path.join(output_dir, f"{alignment_name}.unsorted.bam")
157
+ with open(unsorted_bam, "wb") as bam_handle:
158
+ view_result = subprocess.run(
159
+ ["samtools", "view", "-bS", sam_path],
160
+ stdout=bam_handle,
161
+ stderr=subprocess.PIPE,
162
+ )
163
+ if view_result.returncode != 0:
164
+ logger.error(
165
+ "samtools view failed for %s: %s",
166
+ filename,
167
+ view_result.stderr.decode().strip(),
168
+ )
169
+ return
149
170
 
150
- sort_cmd = f"samtools sort '{output_dir}/{alignment_name}.bam' -o '{output_dir}/{alignment_name}.bam'"
151
- subprocess.run(sort_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
152
- # print(sort_cmd)
171
+ # Sort BAM (support both modern and legacy samtools syntax)
172
+ sorted_bam = os.path.join(output_dir, f"{alignment_name}.bam")
173
+ sort_result = subprocess.run(
174
+ ["samtools", "sort", "-o", sorted_bam, unsorted_bam],
175
+ stdout=subprocess.DEVNULL,
176
+ stderr=subprocess.PIPE,
177
+ )
178
+ if sort_result.returncode != 0 or not os.path.exists(sorted_bam):
179
+ legacy_prefix = os.path.join(output_dir, f"{alignment_name}.sorted")
180
+ legacy_result = subprocess.run(
181
+ ["samtools", "sort", unsorted_bam, legacy_prefix],
182
+ stdout=subprocess.DEVNULL,
183
+ stderr=subprocess.PIPE,
184
+ )
185
+ if legacy_result.returncode != 0:
186
+ logger.error(
187
+ "samtools sort failed for %s: %s",
188
+ filename,
189
+ legacy_result.stderr.decode().strip(),
190
+ )
191
+ return
192
+ legacy_bam = f"{legacy_prefix}.bam"
193
+ if not os.path.exists(legacy_bam):
194
+ logger.error("samtools sort did not produce %s", legacy_bam)
195
+ return
196
+ shutil.move(legacy_bam, sorted_bam)
153
197
 
154
198
  # Index the BAM file
155
- index_cmd = f"samtools index '{output_dir}/{alignment_name}.bam'"
156
- subprocess.run(index_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
199
+ if not os.path.exists(sorted_bam):
200
+ logger.error("samtools sort did not produce %s", sorted_bam)
201
+ return
202
+ index_cmd = ["samtools", "index", sorted_bam]
203
+ index_result = subprocess.run(
204
+ index_cmd,
205
+ stdout=subprocess.DEVNULL,
206
+ stderr=subprocess.PIPE,
207
+ )
208
+ if index_result.returncode != 0:
209
+ logger.error(
210
+ "samtools index failed for %s: %s",
211
+ filename,
212
+ index_result.stderr.decode().strip(),
213
+ )
214
+ return
157
215
 
158
216
  # Cleanup SAM file to save space
159
- os.remove(f"{output_dir}/{alignment_name}.sam")
217
+ os.remove(sam_path)
218
+ os.remove(unsorted_bam)
160
219
  except Exception as e:
161
220
  logger.error(f"Error during alignment for {filename}: {e}")
162
221
 
163
222
  def _run_variant_thread(self, args):
164
223
  barcode_ids, threshold, min_depth, output_dir = args
165
- # Overall progress bar for all barcodes in this thread
166
- with tqdm(barcode_ids, desc="Processing barcodes", leave=False) as pbar:
224
+ logger.info("Variant calling: processing %d barcodes", len(barcode_ids))
225
+ # Overall progress bar for all barcodes in this thread (disabled to reduce console spam)
226
+ with tqdm(barcode_ids, desc="Processing barcodes", leave=False, disable=True) as pbar:
167
227
  for barcode_id in pbar:
168
228
  try:
169
229
  row = self.variant_dict.get(barcode_id)
@@ -171,9 +231,18 @@ class VariantCaller:
171
231
 
172
232
  # Check if alignment file exists, if not, align sequences
173
233
  if not os.path.exists(bam_file):
174
- logger.info(f"Aligning sequences for {row['Path']}")
175
- self._align_sequences(row["Path"], row['Barcodes'],
176
- alignment_name=f'{self.alignment_name}_{barcode_id}')
234
+ logger.info(f"Aligning sequences for {row['Path']}")
235
+ self._align_sequences(
236
+ row["Path"],
237
+ row['Barcodes'],
238
+ alignment_name=f'{self.alignment_name}_{barcode_id}',
239
+ )
240
+ elif not os.path.exists(f"{bam_file}.bai"):
241
+ subprocess.run(
242
+ ["samtools", "index", bam_file],
243
+ stdout=subprocess.DEVNULL,
244
+ stderr=subprocess.DEVNULL,
245
+ )
177
246
 
178
247
  # Placeholder function calls to demonstrate workflow
179
248
  well_df, alignment_count = get_reads_for_well(self.experiment_name, bam_file,
@@ -184,7 +253,12 @@ class VariantCaller:
184
253
  continue
185
254
  self.variant_dict[barcode_id]['Alignment Count'] = alignment_count
186
255
  well_df.to_csv(f"{row['Path']}/seq_{barcode_id}.csv", index=False)
187
- label, freq, combined_p_value, mixed_well, avg_error_rate = get_variant_label_for_well(well_df, threshold)
256
+ # Suppress noisy numerical warnings from downstream stats on sparse wells.
257
+ with warnings.catch_warnings():
258
+ warnings.filterwarnings("ignore", category=RuntimeWarning)
259
+ label, freq, combined_p_value, mixed_well, avg_error_rate = get_variant_label_for_well(
260
+ well_df, threshold
261
+ )
188
262
  self.variant_dict[barcode_id]['Variant'] = label
189
263
  self.variant_dict[barcode_id]['Mixed Well'] = mixed_well
190
264
  self.variant_dict[barcode_id]['Average mutation frequency'] = freq
@@ -67,13 +67,36 @@ import seaborn as sns
67
67
 
68
68
  from levseq.utils import *
69
69
 
70
- output_notebook()
70
+ def _in_notebook():
71
+ try:
72
+ from IPython import get_ipython
73
+ except Exception:
74
+ return False
75
+ ip = get_ipython()
76
+ if ip is None:
77
+ return False
78
+ return ip.__class__.__name__ == "ZMQInteractiveShell"
79
+
80
+
81
+ def _should_init_notebook():
82
+ if os.environ.get("LEVSEQ_DISABLE_NOTEBOOK_INIT") == "1":
83
+ return False
84
+ if os.environ.get("LEVSEQ_FORCE_NOTEBOOK_INIT") == "1":
85
+ return True
86
+ return _in_notebook()
87
+
71
88
 
72
- pn.extension()
73
- pn.config.comms = "vscode"
89
+ def init_notebook_env():
90
+ # Avoid notebook UI side-effects during plain imports/tests.
91
+ output_notebook()
92
+ pn.extension()
93
+ pn.config.comms = "vscode"
94
+ hv.extension("bokeh")
95
+ hv.renderer("bokeh").webgl = True
74
96
 
75
- hv.extension("bokeh")
76
- hv.renderer("bokeh").webgl = True
97
+
98
+ if _should_init_notebook():
99
+ init_notebook_env()
77
100
 
78
101
  # warnings.filterwarnings("ignore")
79
102
  #warnings.filterwarnings("ignore", category=Warning)
@@ -392,25 +415,33 @@ def generate_platemaps(
392
415
  out=np.zeros_like(max_combo_df["Alignment Count"], dtype=float),
393
416
  where=max_combo_df["Alignment Count"] != 0,
394
417
  )
395
- # Set the center
396
- center = np.log(10)
418
+ max_combo_df["logseqdepth"] = max_combo_df["logseqdepth"].fillna(0.0)
419
+
420
+ min_val = max_combo_df["logseqdepth"].min()
421
+ max_val = max_combo_df["logseqdepth"].max()
422
+ if not np.isfinite(min_val) or not np.isfinite(max_val) or min_val == max_val:
423
+ # Avoid invalid colormap centers when data has no range.
424
+ color_levels = [min_val - 0.1, min_val, min_val + 0.1]
425
+ else:
426
+ # Set the center
427
+ center = np.log(10)
397
428
 
398
- add_min = False
399
- if max_combo_df["logseqdepth"].min() >= center:
400
- add_min = True
429
+ add_min = False
430
+ if min_val >= center:
431
+ add_min = True
401
432
 
402
- # Adjust if it is greater than max of data (avoids ValueError)
403
- if max_combo_df["logseqdepth"].max() <= center:
404
- # Adjust the center
405
- center = max_combo_df["logseqdepth"].median()
433
+ # Adjust if it is greater than max of data (avoids ValueError)
434
+ if max_val <= center:
435
+ # Adjust the center
436
+ center = max_combo_df["logseqdepth"].median()
406
437
 
407
- # center colormap
408
- if not add_min:
409
- color_levels = ns.viz._center_colormap(max_combo_df["logseqdepth"], center)
410
- else:
411
- color_levels = ns.viz._center_colormap(
412
- list(max_combo_df["logseqdepth"]) + [np.log(1)], center
413
- )
438
+ # center colormap
439
+ if not add_min:
440
+ color_levels = ns.viz._center_colormap(max_combo_df["logseqdepth"], center)
441
+ else:
442
+ color_levels = ns.viz._center_colormap(
443
+ list(max_combo_df["logseqdepth"]) + [np.log(1)], center
444
+ )
414
445
 
415
446
  # dictionary for storing plots
416
447
  hm_dict = {}
@@ -1198,4 +1229,4 @@ def plot_seaborn_heatmap(platemap, platemap_labels, label: str, result_folder):
1198
1229
  ax = pc.axes
1199
1230
  plt.yticks(rotation=0)
1200
1231
  plt.setp(ax.get_yticklabels(), ha="center")
1201
- plt.savefig(os.path.join(result_folder, f'platemap_{label}.svg'))
1232
+ plt.savefig(os.path.join(result_folder, f'platemap_{label}.svg'))
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: levseq
3
- Version: 1.4.2
3
+ Version: 1.5
4
4
  Home-page: https://github.com/fhalab/levseq/
5
5
  Author: Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy
6
6
  Author-email: ylong@caltech.edu
@@ -44,6 +44,18 @@ Requires-Dist: scikit-learn
44
44
  Requires-Dist: statsmodels
45
45
  Requires-Dist: tqdm
46
46
  Requires-Dist: biopandas
47
+ Dynamic: author
48
+ Dynamic: author-email
49
+ Dynamic: classifier
50
+ Dynamic: description
51
+ Dynamic: description-content-type
52
+ Dynamic: home-page
53
+ Dynamic: keywords
54
+ Dynamic: license
55
+ Dynamic: license-file
56
+ Dynamic: project-url
57
+ Dynamic: requires-dist
58
+ Dynamic: requires-python
47
59
 
48
60
  # Variant Sequencing with Nanopore (LevSeq)
49
61
 
@@ -52,8 +64,35 @@ LevSeq provides a streamlined pipeline for sequencing and analyzing genetic vari
52
64
  ![Figure 1: LevSeq Workflow](manuscript/figures/LevSeq_Figure-1.jpeg)
53
65
  Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
54
66
 
67
+ ## <span style="color: orange;">**Important: Barcode Improvements and LevSeq 2.0 Development**</span>
68
+
69
+ **We have identified and resolved demultiplexing challenges in the original barcode set.** Version 1.4 introduced alignment-aware variant calling to address these issues and significantly improve accuracy.
70
+
71
+ **We are actively developing LevSeq 2.0** in collaboration with DTU and AITHYRA to fundamentally redesign the barcode system. The updated approach includes:
72
+
73
+ - **Enhanced barcode design**: New barcodes will be strain-aware and sequence-aware, generated using an advanced barcode design tool
74
+ - **Reversed workflow architecture**: LevSeq 2.0 will perform alignment first, then demultiplexing (rather than the current demultiplexing-first approach), resolving issues with forward and reverse read handling
75
+ - **Improved accuracy**: These changes will provide more robust demultiplexing and variant calling across diverse experimental conditions
76
+
77
+ **Please reach out to us at ylong@caltech.edu if you are planning to order barcoded primers now**
78
+
79
+ ## Notes
80
+
81
+ LevSeq was designed for epPCR and SSM experiments; however, we are currently extending it to work for other enzyme engineering designs as well. The following features are under development:
82
+
83
+ 1. Insertion handling (see version 4.1.3) - thanks to Brian Zhong for his contributions to this section!
84
+ 2. Gene calling (handling different genes, use the `--oligopool` flag)
85
+
86
+ If you notice any issues with new features or have adapted the LevSeq code for your own use cases, we would love community contributions! Please submit either an issue or a pull request, and we will aim to incorporate the changes.
87
+
88
+ Performance update: demultiplexing now runs in parallel batches of 8 plates and input FASTQs are staged once per run, improving throughput on multi-core systems.
89
+
55
90
  ## Quick Start
56
91
 
92
+ Note: the current stable version is `1.5`, and the latest version is `1.5`.
93
+
94
+ Stable releases are made available via Docker and pip. For the latest version, please clone the repo and install locally (see *Local development or install of latest version* below).
95
+
57
96
  ### Docker Installation (Recommended)
58
97
 
59
98
  1. Install Docker: [https://docs.docker.com/engine/install/](https://docs.docker.com/engine/install/)
@@ -183,6 +222,16 @@ For the wet lab protocol:
183
222
  - **Advanced Usage**: See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb)
184
223
  - **Troubleshooting**: See our [computational protocols wiki](https://github.com/fhalab/LevSeq/wiki/Computational-protocols)
185
224
 
225
+ ### Local development or install of latest version
226
+
227
+ ```
228
+ conda create --name levseq python=3.10
229
+ git clone git@github.com:fhalab/LevSeq.git
230
+ cd LevSeq
231
+ python setup.py sdist bdist_wheel
232
+ pip install dist/levseq-1.5.tar.gz
233
+ ```
234
+
186
235
  ## Citing LevSeq
187
236
 
188
237
  If you find LevSeq useful, please cite our paper:
@@ -283,7 +283,7 @@ class TestVariantCalling(TestClass):
283
283
 
284
284
  def test_calling_variant_with_insert(self):
285
285
  u.dp(["Testing calling variants using SSM with error"])
286
-
286
+ # TODO: Update this test for the new variant calling; a new test is needed for this case.
287
287
  parent_sequence = "ATGAGT"
288
288
  mutated_sequence = 'ATGAGT' # Not actually mutated
289
289
  parent_name = 'parent'
@@ -1,115 +0,0 @@
1
- from Bio import SeqIO
2
- from Bio.Seq import Seq
3
- import os
4
- from pathlib import Path
5
- import logging
6
- from Bio.Align import PairwiseAligner
7
- import shutil
8
- from concurrent.futures import ThreadPoolExecutor, as_completed
9
- from tqdm import tqdm
10
-
11
- def calculate_alignment_score(seq1, seq2):
12
- """Calculate alignment score between two sequences using PairwiseAligner."""
13
- aligner = PairwiseAligner()
14
- aligner.mode = 'global'
15
- alignment = aligner.align(seq1, seq2)[0]
16
- return alignment.score / max(len(seq1), len(seq2))
17
-
18
- def filter_single_file(args):
19
- """
20
- Filter a single fastq file. Used for parallel processing.
21
-
22
- Args:
23
- args: tuple containing (input_file, parent_seq, parent_rev_comp)
24
- Returns:
25
- tuple: (file_path, total_reads, kept_reads, filtered_records)
26
- """
27
- input_file, parent_seq, parent_rev_comp = args
28
- kept_reads = []
29
- total_reads = 0
30
- kept_count = 0
31
-
32
- is_forward = "forward" in str(input_file).lower()
33
-
34
- for record in SeqIO.parse(input_file, "fastq"):
35
- total_reads += 1
36
- seq = str(record.seq)
37
-
38
- forward_score = calculate_alignment_score(seq, str(parent_seq))
39
- reverse_score = calculate_alignment_score(seq, str(parent_rev_comp))
40
-
41
- # If it's in forward file (plate barcode was rev comp)
42
- # Then read should align to reverse complement parent sequence
43
- if is_forward and reverse_score > forward_score:
44
- kept_reads.append(record)
45
- kept_count += 1
46
- # If it's in reverse file (plate barcode was forward)
47
- # Then read was already reverse complemented by demultiplexer
48
- # So it should align to forward parent sequence
49
- elif not is_forward and forward_score > reverse_score:
50
- kept_reads.append(record)
51
- kept_count += 1
52
-
53
- return str(input_file), total_reads, kept_count, kept_reads
54
-
55
- def filter_demultiplexed_folder(experiment_folder, parent_sequence, num_threads=8):
56
- """
57
- Filter demultiplexed files using multiple threads.
58
-
59
- Args:
60
- experiment_folder (str): Path to experiment folder containing RBC/FBC structure
61
- parent_sequence (str): Parent sequence for alignment checking
62
- num_threads (int): Number of threads to use
63
- """
64
- exp_path = Path(experiment_folder)
65
- filtered_counts = {}
66
-
67
- # Prepare parent sequences once
68
- parent_seq = Seq(parent_sequence)
69
- parent_rev_comp = parent_seq.reverse_complement()
70
-
71
- # Collect all fastq files
72
- fastq_files = []
73
- for rbc_dir in exp_path.glob("RB*"):
74
- if not rbc_dir.is_dir():
75
- continue
76
- for fbc_dir in rbc_dir.glob("NB*"):
77
- if not fbc_dir.is_dir():
78
- continue
79
- fastq_files.extend(list(fbc_dir.glob("*.fastq")))
80
-
81
- if not fastq_files:
82
- logging.warning(f"No fastq files found in {experiment_folder}")
83
- return filtered_counts
84
-
85
- # Prepare arguments for parallel processing
86
- file_args = [(f, parent_seq, parent_rev_comp) for f in fastq_files]
87
-
88
- # Process files in parallel with progress bar
89
- with ThreadPoolExecutor(max_workers=num_threads) as executor:
90
- futures = [executor.submit(filter_single_file, args) for args in file_args]
91
-
92
- with tqdm(total=len(fastq_files), desc="Filtering files") as pbar:
93
- for future in as_completed(futures):
94
- try:
95
- file_path, total, kept, filtered_records = future.result()
96
-
97
- # Write filtered reads
98
- temp_file = Path(file_path).parent / f"temp_{Path(file_path).name}"
99
- SeqIO.write(filtered_records, temp_file, "fastq")
100
- shutil.move(str(temp_file), file_path)
101
-
102
- filtered_counts[file_path] = {
103
- 'total': total,
104
- 'kept': kept,
105
- 'filtered': total - kept
106
- }
107
-
108
- logging.info(f"Processed {file_path}: {kept}/{total} reads kept")
109
- pbar.update(1)
110
-
111
- except Exception as e:
112
- logging.error(f"Error processing file {file_path}: {str(e)}")
113
- pbar.update(1)
114
-
115
- return filtered_counts
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes