levseq 1.4.2__tar.gz → 1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {levseq-1.4.2/levseq.egg-info → levseq-1.5}/PKG-INFO +51 -2
- {levseq-1.4.2 → levseq-1.5}/README.md +38 -1
- {levseq-1.4.2 → levseq-1.5}/levseq/__init__.py +1 -1
- levseq-1.5/levseq/filter_orientation.py +221 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/run_levseq.py +103 -29
- {levseq-1.4.2 → levseq-1.5}/levseq/utils.py +6 -4
- {levseq-1.4.2 → levseq-1.5}/levseq/variantcaller.py +107 -33
- {levseq-1.4.2 → levseq-1.5}/levseq/visualization.py +53 -22
- {levseq-1.4.2 → levseq-1.5/levseq.egg-info}/PKG-INFO +51 -2
- {levseq-1.4.2 → levseq-1.5}/tests/test_variant_calling.py +1 -1
- levseq-1.4.2/levseq/filter_orientation.py +0 -115
- {levseq-1.4.2 → levseq-1.5}/LICENSE +0 -0
- {levseq-1.4.2 → levseq-1.5}/MANIFEST.in +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/IO_processor.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/barcoding/__init__.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/barcoding/demultiplex +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/barcoding/demultiplex-arm64 +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/barcoding/demultiplex-x86 +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/barcoding/minion_barcodes.fasta +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/basecaller.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/cmd.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/coordinates.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/globals.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/interface.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/parser.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/screen.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/seqfit.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/simulation.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq/user.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq.egg-info/SOURCES.txt +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq.egg-info/dependency_links.txt +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq.egg-info/entry_points.txt +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq.egg-info/requires.txt +0 -0
- {levseq-1.4.2 → levseq-1.5}/levseq.egg-info/top_level.txt +0 -0
- {levseq-1.4.2 → levseq-1.5}/setup.cfg +0 -0
- {levseq-1.4.2 → levseq-1.5}/setup.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/tests/test_copy_fastq.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/tests/test_demultiplex_docker.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/tests/test_deploy.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/tests/test_opligopools.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/tests/test_seqfitvis.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/tests/test_seqs.py +0 -0
- {levseq-1.4.2 → levseq-1.5}/tests/test_statistics.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: levseq
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.5
|
|
4
4
|
Home-page: https://github.com/fhalab/levseq/
|
|
5
5
|
Author: Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy
|
|
6
6
|
Author-email: ylong@caltech.edu
|
|
@@ -44,6 +44,18 @@ Requires-Dist: scikit-learn
|
|
|
44
44
|
Requires-Dist: statsmodels
|
|
45
45
|
Requires-Dist: tqdm
|
|
46
46
|
Requires-Dist: biopandas
|
|
47
|
+
Dynamic: author
|
|
48
|
+
Dynamic: author-email
|
|
49
|
+
Dynamic: classifier
|
|
50
|
+
Dynamic: description
|
|
51
|
+
Dynamic: description-content-type
|
|
52
|
+
Dynamic: home-page
|
|
53
|
+
Dynamic: keywords
|
|
54
|
+
Dynamic: license
|
|
55
|
+
Dynamic: license-file
|
|
56
|
+
Dynamic: project-url
|
|
57
|
+
Dynamic: requires-dist
|
|
58
|
+
Dynamic: requires-python
|
|
47
59
|
|
|
48
60
|
# Variant Sequencing with Nanopore (LevSeq)
|
|
49
61
|
|
|
@@ -52,8 +64,35 @@ LevSeq provides a streamlined pipeline for sequencing and analyzing genetic vari
|
|
|
52
64
|

|
|
53
65
|
Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
|
|
54
66
|
|
|
67
|
+
## <span style="color: orange;">**Important: Barcode Improvements and LevSeq 2.0 Development**</span>
|
|
68
|
+
|
|
69
|
+
**We have identified and resolved demultiplexing challenges in the original barcode set.** Version 1.4 introduced alignment-aware variant calling to address these issues and significantly improve accuracy.
|
|
70
|
+
|
|
71
|
+
**We are actively developing LevSeq 2.0** in collaboration with DTU and AITHYRA to fundamentally redesign the barcode system. The updated approach includes:
|
|
72
|
+
|
|
73
|
+
- **Enhanced barcode design**: New barcodes will be strain-aware and sequence-aware, generated using an advanced barcode design tool
|
|
74
|
+
- **Reversed workflow architecture**: LevSeq 2.0 will perform alignment first, then demultiplexing (rather than the current demultiplexing-first approach), resolving issues with forward and reverse read handling
|
|
75
|
+
- **Improved accuracy**: These changes will provide more robust demultiplexing and variant calling across diverse experimental conditions
|
|
76
|
+
|
|
77
|
+
**Please reach out to us at ylong@caltech.edu if you are planning to order barcoded primers now**
|
|
78
|
+
|
|
79
|
+
## Notes
|
|
80
|
+
|
|
81
|
+
LevSeq was designed for epPCR and SSM experiments; however, we are currently extending it to work for other enzyme engineering designs as well. The following features are currently under development:
|
|
82
|
+
|
|
83
|
+
1. Insertion handling (see version 1.4.3) - thanks to Brian Zhong for his contributions to this section!
|
|
84
|
+
2. Gene calling (handling different genes, use the `--oligopool` flag)
|
|
85
|
+
|
|
86
|
+
If you notice any issues with new features or have adapted the LevSeq code for your own use cases, we would love community contributions! Please submit either an issue or a pull request, and we will aim to incorporate the changes.
|
|
87
|
+
|
|
88
|
+
Performance update: demultiplexing now runs in parallel batches of 8 plates and input FASTQs are staged once per run, improving throughput on multi-core systems.
|
|
89
|
+
|
|
55
90
|
## Quick Start
|
|
56
91
|
|
|
92
|
+
Note the current stable version is: `1.5`, the latest version is `1.5`.
|
|
93
|
+
|
|
94
|
+
Stable releases are made available via Docker and pip. For the latest version, please clone the repo and install locally (see *Local development or install of latest version* below).
|
|
95
|
+
|
|
57
96
|
### Docker Installation (Recommended)
|
|
58
97
|
|
|
59
98
|
1. Install Docker: [https://docs.docker.com/engine/install/](https://docs.docker.com/engine/install/)
|
|
@@ -183,6 +222,16 @@ For the wet lab protocol:
|
|
|
183
222
|
- **Advanced Usage**: See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb)
|
|
184
223
|
- **Troubleshooting**: See our [computational protocols wiki](https://github.com/fhalab/LevSeq/wiki/Computational-protocols)
|
|
185
224
|
|
|
225
|
+
### Local development or install of latest version
|
|
226
|
+
|
|
227
|
+
```
|
|
228
|
+
conda create --name levseq python=3.10
|
|
229
|
+
git clone git@github.com:fhalab/LevSeq.git
|
|
230
|
+
cd LevSeq
|
|
231
|
+
python setup.py sdist bdist_wheel
|
|
232
|
+
pip install dist/levseq-1.5.tar.gz
|
|
233
|
+
```
|
|
234
|
+
|
|
186
235
|
## Citing LevSeq
|
|
187
236
|
|
|
188
237
|
If you find LevSeq useful, please cite our paper:
|
|
@@ -5,8 +5,35 @@ LevSeq provides a streamlined pipeline for sequencing and analyzing genetic vari
|
|
|
5
5
|

|
|
6
6
|
Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
|
|
7
7
|
|
|
8
|
+
## <span style="color: orange;">**Important: Barcode Improvements and LevSeq 2.0 Development**</span>
|
|
9
|
+
|
|
10
|
+
**We have identified and resolved demultiplexing challenges in the original barcode set.** Version 1.4 introduced alignment-aware variant calling to address these issues and significantly improve accuracy.
|
|
11
|
+
|
|
12
|
+
**We are actively developing LevSeq 2.0** in collaboration with DTU and AITHYRA to fundamentally redesign the barcode system. The updated approach includes:
|
|
13
|
+
|
|
14
|
+
- **Enhanced barcode design**: New barcodes will be strain-aware and sequence-aware, generated using an advanced barcode design tool
|
|
15
|
+
- **Reversed workflow architecture**: LevSeq 2.0 will perform alignment first, then demultiplexing (rather than the current demultiplexing-first approach), resolving issues with forward and reverse read handling
|
|
16
|
+
- **Improved accuracy**: These changes will provide more robust demultiplexing and variant calling across diverse experimental conditions
|
|
17
|
+
|
|
18
|
+
**Please reach out to us at ylong@caltech.edu if you are planning to order barcoded primers now**
|
|
19
|
+
|
|
20
|
+
## Notes
|
|
21
|
+
|
|
22
|
+
LevSeq was designed for epPCR and SSM experiments; however, we are currently extending it to work for other enzyme engineering designs as well. The following features are currently under development:
|
|
23
|
+
|
|
24
|
+
1. Insertion handling (see version 1.4.3) - thanks to Brian Zhong for his contributions to this section!
|
|
25
|
+
2. Gene calling (handling different genes, use the `--oligopool` flag)
|
|
26
|
+
|
|
27
|
+
If you notice any issues with new features or have adapted the LevSeq code for your own use cases, we would love community contributions! Please submit either an issue or a pull request, and we will aim to incorporate the changes.
|
|
28
|
+
|
|
29
|
+
Performance update: demultiplexing now runs in parallel batches of 8 plates and input FASTQs are staged once per run, improving throughput on multi-core systems.
|
|
30
|
+
|
|
8
31
|
## Quick Start
|
|
9
32
|
|
|
33
|
+
Note the current stable version is: `1.5`, the latest version is `1.5`.
|
|
34
|
+
|
|
35
|
+
Stable releases are made available via Docker and pip. For the latest version, please clone the repo and install locally (see *Local development or install of latest version* below).
|
|
36
|
+
|
|
10
37
|
### Docker Installation (Recommended)
|
|
11
38
|
|
|
12
39
|
1. Install Docker: [https://docs.docker.com/engine/install/](https://docs.docker.com/engine/install/)
|
|
@@ -136,6 +163,16 @@ For the wet lab protocol:
|
|
|
136
163
|
- **Advanced Usage**: See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb)
|
|
137
164
|
- **Troubleshooting**: See our [computational protocols wiki](https://github.com/fhalab/LevSeq/wiki/Computational-protocols)
|
|
138
165
|
|
|
166
|
+
### Local development or install of latest version
|
|
167
|
+
|
|
168
|
+
```
|
|
169
|
+
conda create --name levseq python=3.10
|
|
170
|
+
git clone git@github.com:fhalab/LevSeq.git
|
|
171
|
+
cd LevSeq
|
|
172
|
+
python setup.py sdist bdist_wheel
|
|
173
|
+
pip install dist/levseq-1.5.tar.gz
|
|
174
|
+
```
|
|
175
|
+
|
|
139
176
|
## Citing LevSeq
|
|
140
177
|
|
|
141
178
|
If you find LevSeq useful, please cite our paper:
|
|
@@ -152,4 +189,4 @@ If you find LevSeq useful, please cite our paper:
|
|
|
152
189
|
|
|
153
190
|
## Contact
|
|
154
191
|
|
|
155
|
-
Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
|
|
192
|
+
Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
__title__ = 'levseq'
|
|
19
19
|
__description__ = 'LevSeq nanopore sequencing'
|
|
20
20
|
__url__ = 'https://github.com/fhalab/levseq/'
|
|
21
|
-
__version__ = '1.
|
|
21
|
+
__version__ = '1.5'
|
|
22
22
|
__author__ = 'Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy'
|
|
23
23
|
__author_email__ = 'ylong@caltech.edu'
|
|
24
24
|
__license__ = 'GPL3'
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
from Bio import SeqIO
|
|
2
|
+
from Bio.Seq import Seq
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import gzip
|
|
5
|
+
import logging
|
|
6
|
+
import math
|
|
7
|
+
import shutil
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
9
|
+
from tqdm import tqdm
|
|
10
|
+
|
|
11
|
+
_VALID_BASES = set("ACGT")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def build_kmer_set(seq, kmer_size):
    """Collect every distinct k-mer of ``seq`` made only of valid bases.

    Args:
        seq (str): Sequence to decompose; it is upper-cased before use.
        kmer_size (int): Window length. Non-positive values yield an empty set.

    Returns:
        set: The upper-case k-mers whose characters are all members of
        ``_VALID_BASES``. Empty when ``seq`` is shorter than ``kmer_size``.
    """
    if kmer_size <= 0:
        return set()
    upper_seq = seq.upper()
    window_count = len(upper_seq) - kmer_size + 1
    if window_count <= 0:
        return set()
    windows = (upper_seq[start:start + kmer_size] for start in range(window_count))
    # Windows containing any character outside _VALID_BASES are dropped.
    return {window for window in windows if _VALID_BASES.issuperset(window)}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def sample_kmer_positions(seq_len, kmer_size, samples, skip_front, skip_back):
    """Pick evenly spaced k-mer start offsets inside a read.

    Args:
        seq_len (int): Length of the read.
        kmer_size (int): Length of each k-mer window.
        samples (int): Number of start positions requested.
        skip_front (int): Bases excluded at the start (negative treated as 0).
        skip_back (int): Bases excluded at the end (negative treated as 0).

    Returns:
        list: Unique start positions in ascending sampling order, all within
        the allowed range. Empty when no window fits or ``samples <= 0``.
    """
    first = max(skip_front, 0)
    last = seq_len - max(skip_back, 0) - kmer_size
    if samples <= 0 or last < first:
        return []

    width = last - first
    if width == 0:
        # Exactly one window fits.
        return [first]
    if samples == 1:
        # A single sample is taken from the middle of the allowed range.
        return [first + width // 2]

    stride = width / (samples - 1)
    candidates = (int(round(first + idx * stride)) for idx in range(samples))
    in_range = (pos for pos in candidates if first <= pos <= last)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(in_range))
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def count_kmer_hits(seq, positions, kmer_size, parent_kmers, rev_kmers):
    """Count sampled k-mers of ``seq`` found in the forward and reverse sets.

    Args:
        seq (str): Read sequence.
        positions (iterable of int): Start offsets of the k-mers to test.
        kmer_size (int): Length of each k-mer.
        parent_kmers (set): k-mers of the parent sequence.
        rev_kmers (set): k-mers of the reverse-complemented parent sequence.

    Returns:
        tuple: ``(forward_hits, reverse_hits, sampled)`` where ``sampled`` is
        the number of full-length windows made only of ``_VALID_BASES``.
    """
    fwd_total = 0
    rev_total = 0
    usable = 0
    for start in positions:
        window = seq[start:start + kmer_size]
        # Skip truncated windows and windows with characters outside _VALID_BASES.
        if len(window) != kmer_size or not set(window) <= _VALID_BASES:
            continue
        usable += 1
        fwd_total += 1 if window in parent_kmers else 0
        rev_total += 1 if window in rev_kmers else 0
    return fwd_total, rev_total, usable
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def iter_fastq_records(handle):
    """Yield raw four-line FASTQ records from an open file handle.

    Args:
        handle: Object exposing ``readline()``.

    Yields:
        tuple: ``(header, seq, plus, qual)`` exactly as read, line endings
        included. Iteration stops at end of input or at the first record
        that has fewer than four lines.
    """
    next_line = handle.readline
    header = next_line()
    while header:
        record = (header, next_line(), next_line(), next_line())
        if not all(record[1:]):
            # Truncated trailing record: stop without yielding it.
            return
        yield record
        header = next_line()
|
|
81
|
+
|
|
82
|
+
def filter_single_file(args):
    """
    Filter a single fastq file. Used for parallel processing.

    Reads that pass the k-mer orientation check are written to a sibling
    file named ``temp_<original name>``; the input file itself is not
    modified here. The temp file is written with the same opener as the
    input, so a ``.gz`` input produces a gzip-compressed temp file. If
    processing fails, the partial temp file is removed and the exception
    is re-raised.

    Args:
        args: tuple containing (input_file, parent_kmers, rev_kmers, kmer_size, samples,
              skip_front, skip_back, min_delta, min_ratio)
    Returns:
        tuple: (file_path, total_reads, kept_reads, temp_file)
    """
    (input_file, parent_kmers, rev_kmers, kmer_size, samples,
     skip_front, skip_back, min_delta, min_ratio) = args
    total_reads = 0
    kept_count = 0

    # Orientation is decided from the path text (case-insensitive).
    is_forward = "forward" in str(input_file).lower()

    input_path = Path(input_file)
    temp_file = input_path.parent / f"temp_{input_path.name}"
    # Sampling positions depend only on read length, so cache them per length.
    position_cache = {}
    open_fn = gzip.open if input_path.suffix == ".gz" else open
    try:
        with open_fn(input_path, "rt") as input_handle, \
                open_fn(temp_file, "wt") as output_handle:
            for header, seq_line, plus, qual in iter_fastq_records(input_handle):
                total_reads += 1
                seq = seq_line.strip().upper()
                seq_len = len(seq)
                if seq_len not in position_cache:
                    position_cache[seq_len] = sample_kmer_positions(
                        seq_len, kmer_size, samples, skip_front, skip_back
                    )
                positions = position_cache[seq_len]
                forward_hits, reverse_hits, sampled = count_kmer_hits(
                    seq, positions, kmer_size, parent_kmers, rev_kmers
                )
                if sampled == 0:
                    # Nothing could be sampled from this read: drop it.
                    continue
                # Required margin: the larger of the absolute and relative
                # thresholds, capped at the number of sampled k-mers.
                required_delta = min(
                    sampled,
                    max(min_delta, int(math.ceil(min_ratio * sampled))),
                )

                # If it's in forward file (plate barcode was rev comp)
                # then the read should align to the reverse complement parent sequence.
                # If it's in reverse file (plate barcode was forward)
                # then the read was already reverse complemented by the demultiplexer,
                # so it should align to the forward parent sequence.
                if is_forward:
                    delta = reverse_hits - forward_hits
                else:
                    delta = forward_hits - reverse_hits
                if delta >= required_delta:
                    output_handle.writelines((header, seq_line, plus, qual))
                    kept_count += 1
    except Exception:
        # Do not leave a half-written temp file next to the input.
        if temp_file.exists():
            temp_file.unlink()
        raise

    return str(input_file), total_reads, kept_count, str(temp_file)
|
|
141
|
+
|
|
142
|
+
def filter_demultiplexed_folder(
    experiment_folder,
    parent_sequence,
    num_threads=8,
    kmer_size=6,
    samples=40,
    skip_front=100,
    skip_back=0,
    min_delta=4,
    min_ratio=0.1,
):
    """
    Filter demultiplexed files using a k-mer orientation heuristic.

    Every ``*.fastq`` file found under ``<experiment_folder>/RB*/NB*/`` is
    filtered with ``filter_single_file`` and then replaced in place by its
    filtered version. A failure on one file is logged and does not stop the
    remaining files from being processed.

    Args:
        experiment_folder (str): Path to experiment folder containing RBC/FBC structure
        parent_sequence (str): Parent sequence for alignment checking
        num_threads (int): Number of threads to use
        kmer_size (int): Length of k-mer used for orientation checks
        samples (int): Number of k-mers sampled per read
        skip_front (int): Bases to skip from the front of the read
        skip_back (int): Bases to skip from the end of the read
        min_delta (int): Minimum hit difference to keep a read
        min_ratio (float): Minimum hit difference as a ratio of sampled k-mers

    Returns:
        dict: Maps each successfully processed file path to a dict with the
        keys ``'total'``, ``'kept'`` and ``'filtered'``. Empty when no fastq
        files are found.
    """
    exp_path = Path(experiment_folder)
    filtered_counts = {}

    # Prepare parent sequences once
    parent_seq_obj = Seq(parent_sequence)
    parent_seq = str(parent_seq_obj).upper()
    parent_rev_comp = str(parent_seq_obj.reverse_complement()).upper()
    parent_kmers = build_kmer_set(parent_seq, kmer_size)
    rev_kmers = build_kmer_set(parent_rev_comp, kmer_size)

    # Collect all fastq files
    fastq_files = []
    for rbc_dir in exp_path.glob("RB*"):
        if not rbc_dir.is_dir():
            continue
        for fbc_dir in rbc_dir.glob("NB*"):
            if not fbc_dir.is_dir():
                continue
            fastq_files.extend(fbc_dir.glob("*.fastq"))

    if not fastq_files:
        logging.warning(f"No fastq files found in {experiment_folder}")
        return filtered_counts

    # Process files in parallel with progress bar
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Remember which input each future belongs to, so a failure can be
        # reported against the right file even when result() raises.
        future_to_file = {
            executor.submit(
                filter_single_file,
                (f, parent_kmers, rev_kmers, kmer_size, samples,
                 skip_front, skip_back, min_delta, min_ratio),
            ): str(f)
            for f in fastq_files
        }

        with tqdm(total=len(fastq_files), desc="Filtering files") as pbar:
            for future in as_completed(future_to_file):
                source_file = future_to_file[future]
                try:
                    file_path, total, kept, temp_file = future.result()

                    # Replace the original file with its filtered version.
                    shutil.move(temp_file, file_path)

                    filtered_counts[file_path] = {
                        'total': total,
                        'kept': kept,
                        'filtered': total - kept
                    }

                    logging.info(f"Processed {file_path}: {kept}/{total} reads kept")
                except Exception as e:
                    logging.error(f"Error processing file {source_file}: {str(e)}")
                finally:
                    pbar.update(1)

    return filtered_counts
|
|
@@ -62,6 +62,7 @@ import numpy as np
|
|
|
62
62
|
import tqdm
|
|
63
63
|
import panel as pn
|
|
64
64
|
import holoviews as hv
|
|
65
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
65
66
|
from importlib import resources
|
|
66
67
|
from holoviews.streams import Tap
|
|
67
68
|
|
|
@@ -485,6 +486,11 @@ def process_ref_csv_oligopool(cl_args, tqdm_fn=tqdm.tqdm):
|
|
|
485
486
|
result_folder = create_result_folder(cl_args)
|
|
486
487
|
variant_csv_path = os.path.join(result_folder, "variants.csv")
|
|
487
488
|
variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
|
|
489
|
+
|
|
490
|
+
output_dir = Path(result_folder) / f"{cl_args['name']}_fastq"
|
|
491
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
492
|
+
if not cl_args["skip_demultiplexing"]:
|
|
493
|
+
cat_fastq_files(cl_args.get("path"), output_dir)
|
|
488
494
|
|
|
489
495
|
# First get the different barcode plates (these will be unique)
|
|
490
496
|
barcode_plates = ref_df["barcode_plate"].unique()
|
|
@@ -496,10 +502,7 @@ def process_ref_csv_oligopool(cl_args, tqdm_fn=tqdm.tqdm):
|
|
|
496
502
|
name_folder = os.path.join(result_folder, f'RB{barcode_plate}')
|
|
497
503
|
os.makedirs(name_folder, exist_ok=True)
|
|
498
504
|
barcode_path = filter_bc(cl_args, name_folder, i)
|
|
499
|
-
output_dir = Path(result_folder) / f"{cl_args['name']}_fastq"
|
|
500
|
-
output_dir.mkdir(parents=True, exist_ok=True)
|
|
501
505
|
|
|
502
|
-
file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)
|
|
503
506
|
try:
|
|
504
507
|
demux_fastq(output_dir, name_folder, barcode_path)
|
|
505
508
|
except Exception as e:
|
|
@@ -543,62 +546,134 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
|
|
|
543
546
|
variant_csv_path = os.path.join(result_folder, "variants.csv")
|
|
544
547
|
|
|
545
548
|
variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
|
|
546
|
-
|
|
547
|
-
|
|
549
|
+
|
|
550
|
+
output_dir = Path(result_folder) / f"{cl_args['name']}_fastq"
|
|
551
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
552
|
+
if not cl_args["skip_demultiplexing"]:
|
|
553
|
+
cat_fastq_files(cl_args.get("path"), output_dir)
|
|
554
|
+
|
|
555
|
+
samples = []
|
|
556
|
+
for i, row in ref_df.iterrows():
|
|
548
557
|
barcode_plate = row["barcode_plate"]
|
|
549
558
|
name = row["name"]
|
|
550
559
|
refseq = row["refseq"].upper()
|
|
551
560
|
|
|
552
561
|
name_folder = os.path.join(result_folder, name)
|
|
553
562
|
os.makedirs(name_folder, exist_ok=True)
|
|
554
|
-
|
|
563
|
+
|
|
555
564
|
temp_fasta_path = os.path.join(name_folder, f"temp_{name}.fasta")
|
|
556
565
|
if not os.path.exists(temp_fasta_path):
|
|
557
566
|
with open(temp_fasta_path, "w") as f:
|
|
558
567
|
f.write(f">{name}\n{refseq}\n")
|
|
559
568
|
else:
|
|
560
569
|
logging.info(f"Fasta file for {name} already exists. Skipping write.")
|
|
561
|
-
|
|
562
|
-
barcode_path = filter_bc(cl_args, name_folder, i)
|
|
563
|
-
output_dir = Path(result_folder) / f"{cl_args['name']}_fastq"
|
|
564
|
-
output_dir.mkdir(parents=True, exist_ok=True)
|
|
565
570
|
|
|
566
|
-
|
|
567
|
-
|
|
571
|
+
barcode_path = filter_bc(cl_args, name_folder, i)
|
|
572
|
+
samples.append({
|
|
573
|
+
"barcode_plate": barcode_plate,
|
|
574
|
+
"name": name,
|
|
575
|
+
"refseq": refseq,
|
|
576
|
+
"name_folder": name_folder,
|
|
577
|
+
"temp_fasta_path": temp_fasta_path,
|
|
578
|
+
"barcode_path": barcode_path,
|
|
579
|
+
"demux_ok": True,
|
|
580
|
+
})
|
|
581
|
+
|
|
582
|
+
def _demux_only(sample):
|
|
583
|
+
name = sample["name"]
|
|
584
|
+
name_folder = sample["name_folder"]
|
|
585
|
+
barcode_path = sample["barcode_path"]
|
|
586
|
+
try:
|
|
587
|
+
demux_fastq(output_dir, name_folder, barcode_path)
|
|
588
|
+
return True
|
|
589
|
+
except Exception:
|
|
590
|
+
logging.error(
|
|
591
|
+
"An error occurred during demultiplexing for sample {}. Skipping this sample.".format(name),
|
|
592
|
+
exc_info=True,
|
|
593
|
+
)
|
|
594
|
+
return False
|
|
595
|
+
|
|
596
|
+
if not cl_args["skip_demultiplexing"]:
|
|
597
|
+
batch_size = 8
|
|
598
|
+
if samples:
|
|
599
|
+
pbar = tqdm_fn(total=len(samples), desc="Demultiplex plates")
|
|
568
600
|
try:
|
|
569
|
-
|
|
601
|
+
for i in range(0, len(samples), batch_size):
|
|
602
|
+
batch = samples[i:i + batch_size]
|
|
603
|
+
max_workers = len(batch)
|
|
604
|
+
if max_workers <= 1:
|
|
605
|
+
sample = batch[0]
|
|
606
|
+
sample["demux_ok"] = _demux_only(sample)
|
|
607
|
+
pbar.update(1)
|
|
608
|
+
continue
|
|
609
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
610
|
+
futures = {executor.submit(_demux_only, sample): sample for sample in batch}
|
|
611
|
+
for future in as_completed(futures):
|
|
612
|
+
sample = futures[future]
|
|
613
|
+
sample["demux_ok"] = future.result()
|
|
614
|
+
pbar.update(1)
|
|
615
|
+
finally:
|
|
616
|
+
pbar.close()
|
|
617
|
+
else:
|
|
618
|
+
for sample in samples:
|
|
619
|
+
sample["demux_ok"] = True
|
|
570
620
|
|
|
571
|
-
|
|
621
|
+
if not cl_args["skip_demultiplexing"]:
|
|
622
|
+
for sample in tqdm_fn(samples, total=len(samples), desc="Filter plates"):
|
|
623
|
+
if not sample["demux_ok"]:
|
|
624
|
+
continue
|
|
625
|
+
name = sample["name"]
|
|
626
|
+
refseq = sample["refseq"]
|
|
627
|
+
name_folder = sample["name_folder"]
|
|
628
|
+
try:
|
|
572
629
|
filtered_counts = filter_demultiplexed_folder(
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
630
|
+
name_folder,
|
|
631
|
+
refseq,
|
|
632
|
+
num_threads=10,
|
|
576
633
|
)
|
|
577
634
|
logging.info(f"Orientation filtering completed for {name}")
|
|
578
635
|
total_reads = sum(counts['total'] for counts in filtered_counts.values())
|
|
579
636
|
kept_reads = sum(counts['kept'] for counts in filtered_counts.values())
|
|
580
|
-
|
|
637
|
+
if total_reads:
|
|
638
|
+
logging.info(
|
|
639
|
+
"Total filtering results: %d/%d reads kept (%.2f%%)",
|
|
640
|
+
kept_reads,
|
|
641
|
+
total_reads,
|
|
642
|
+
kept_reads / total_reads * 100,
|
|
643
|
+
)
|
|
581
644
|
for file, counts in filtered_counts.items():
|
|
582
645
|
logging.info(f"{file}: {counts['kept']}/{counts['total']} reads kept")
|
|
646
|
+
except Exception:
|
|
647
|
+
logging.error(
|
|
648
|
+
"An error occurred during filtering for sample {}. Skipping this sample.".format(name),
|
|
649
|
+
exc_info=True,
|
|
650
|
+
)
|
|
651
|
+
sample["demux_ok"] = False
|
|
652
|
+
continue
|
|
583
653
|
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
654
|
+
if not cl_args["skip_variantcalling"]:
|
|
655
|
+
for sample in tqdm_fn(samples, total=len(samples), desc="Calling variants"):
|
|
656
|
+
if not sample["demux_ok"] and not cl_args["skip_demultiplexing"]:
|
|
587
657
|
continue
|
|
588
|
-
|
|
589
|
-
if not cl_args["skip_variantcalling"]:
|
|
590
658
|
try:
|
|
591
659
|
threshold = cl_args.get("threshold") if cl_args.get("threshold") is not None else 0.5
|
|
592
660
|
variant_result = call_variant(
|
|
593
|
-
f"{name}",
|
|
661
|
+
f"{sample['name']}",
|
|
662
|
+
sample["name_folder"],
|
|
663
|
+
sample["temp_fasta_path"],
|
|
664
|
+
sample["barcode_path"],
|
|
665
|
+
threshold=threshold,
|
|
594
666
|
)
|
|
595
|
-
variant_result["barcode_plate"] = barcode_plate
|
|
596
|
-
variant_result["name"] = name
|
|
597
|
-
variant_result["refseq"] = refseq
|
|
667
|
+
variant_result["barcode_plate"] = sample["barcode_plate"]
|
|
668
|
+
variant_result["name"] = sample["name"]
|
|
669
|
+
variant_result["refseq"] = sample["refseq"]
|
|
598
670
|
|
|
599
671
|
variant_df = pd.concat([variant_df, variant_result])
|
|
600
672
|
except Exception as e:
|
|
601
|
-
logging.error(
|
|
673
|
+
logging.error(
|
|
674
|
+
"An error occurred during variant calling for sample {}. Skipping this sample.".format(sample["name"]),
|
|
675
|
+
exc_info=True,
|
|
676
|
+
)
|
|
602
677
|
continue
|
|
603
678
|
|
|
604
679
|
variant_df.to_csv(variant_csv_path, index=False)
|
|
@@ -676,4 +751,3 @@ def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
|
|
|
676
751
|
|
|
677
752
|
# This modification saves the results at each critical stage, ensuring that even in the case of failure,
|
|
678
753
|
# the user has access to intermediate results and does not lose all the progress.
|
|
679
|
-
|
|
@@ -205,7 +205,7 @@ def calculate_mutation_significance_across_well(seq_df):
|
|
|
205
205
|
seq_df.at[i, 'p(g)'] = p_g
|
|
206
206
|
seq_df.at[i, 'p(c)'] = p_c
|
|
207
207
|
seq_df.at[i, 'p(n)'] = p_n
|
|
208
|
-
seq_df.at[i, 'p(i)'] =
|
|
208
|
+
seq_df.at[i, 'p(i)'] = p_i
|
|
209
209
|
seq_df.at[i, 'p_value'] = p_value
|
|
210
210
|
seq_df.at[i, 'percent_most_freq_mutation'] = val
|
|
211
211
|
seq_df.at[i, 'most_frequent'] = actual_seq
|
|
@@ -324,6 +324,8 @@ def get_reads_for_well(parent_name, bam_file_path: str, ref_str: str, msa_path=N
|
|
|
324
324
|
'C', 'p(c)', 'N', 'p(n)', 'I', 'p(i)', 'Warnings']
|
|
325
325
|
return calculate_mutation_significance_across_well(seq_df), alignment_count
|
|
326
326
|
return None, 0
|
|
327
|
+
|
|
328
|
+
|
|
327
329
|
def make_row_from_read_pileup_across_well(well_df, ref_str, label, insert_map):
|
|
328
330
|
"""
|
|
329
331
|
Given a pileup of reads, we want to get some summary information about that sequence
|
|
@@ -349,12 +351,12 @@ def make_row_from_read_pileup_across_well(well_df, ref_str, label, insert_map):
|
|
|
349
351
|
warning = f'WARNING: INSERT.'
|
|
350
352
|
rows.append([label, col, ref_seq, actual_seq, freq_non_ref, total_other, total_reads, 1.0, 0.0,
|
|
351
353
|
len(vc[vc == 'A']), 1.0, len(vc[vc == 'T']), 1.0, len(vc[vc == 'G']), 1.0,
|
|
352
|
-
len(vc[vc == 'C']), 1.0, len(vc[vc == '-']), 1.0, len(
|
|
354
|
+
len(vc[vc == 'C']), 1.0, len(vc[vc == '-']), 1.0, len(vc[vc == 'I']),
|
|
353
355
|
1.0, warning])
|
|
354
|
-
|
|
356
|
+
elif ref_seq != '-':
|
|
355
357
|
rows.append([label, col, ref_seq, actual_seq, freq_non_ref, total_other, total_reads, 1.0, 0.0,
|
|
356
358
|
len(vc[vc == 'A']), 1.0, len(vc[vc == 'T']), 1.0, len(vc[vc == 'G']), 1.0,
|
|
357
|
-
len(vc[vc == 'C']), 1.0, len(vc[vc == '-']), 1.0,
|
|
359
|
+
len(vc[vc == 'C']), 1.0, len(vc[vc == '-']), 1.0, len(vc[vc == 'I']),
|
|
358
360
|
1.0, warning])
|
|
359
361
|
return rows
|
|
360
362
|
|
|
@@ -18,6 +18,7 @@ import pandas as pd
|
|
|
18
18
|
import logging
|
|
19
19
|
from levseq.utils import *
|
|
20
20
|
import subprocess
|
|
21
|
+
import shutil
|
|
21
22
|
import os
|
|
22
23
|
from collections import defaultdict
|
|
23
24
|
import glob
|
|
@@ -41,9 +42,7 @@ The variant caller starts from demultiplexed fastq files.
|
|
|
41
42
|
|
|
42
43
|
logger = logging.getLogger(__name__)
|
|
43
44
|
logger.setLevel(logging.WARNING) # Set default level for this module
|
|
44
|
-
# Use the logger in this file
|
|
45
|
-
logger.warning("This is a warning message.")
|
|
46
|
-
logger.info("This won't show unless logging is configured to INFO elsewhere.")
|
|
45
|
+
# Use the logger in this file.
|
|
47
46
|
|
|
48
47
|
class VariantCaller:
|
|
49
48
|
"""
|
|
@@ -123,47 +122,108 @@ class VariantCaller:
|
|
|
123
122
|
def _align_sequences(self, output_dir, filename, scores=[4, 2, 10], alignment_name="alignment_minimap"):
|
|
124
123
|
try:
|
|
125
124
|
all_fastq = os.path.join(output_dir, '*.fastq')
|
|
126
|
-
fastq_list = glob.glob(all_fastq)
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
if not all_fastq:
|
|
125
|
+
fastq_list = sorted(glob.glob(all_fastq))
|
|
126
|
+
if not fastq_list:
|
|
130
127
|
logger.error("No FASTQ files found in the specified output directory.")
|
|
131
128
|
return
|
|
132
129
|
|
|
133
|
-
|
|
134
|
-
if len(fastq_list) > 1:
|
|
135
|
-
with open(fastq_files, 'w') as outfile:
|
|
136
|
-
for fastq in fastq_list:
|
|
137
|
-
with open(fastq, 'r') as infile:
|
|
138
|
-
outfile.write(infile.read())
|
|
139
|
-
else:
|
|
140
|
-
fastq_files = fastq_list[0]
|
|
130
|
+
sam_path = os.path.join(output_dir, f"{alignment_name}.sam")
|
|
141
131
|
# Alignment using minimap2
|
|
142
|
-
minimap_cmd =
|
|
143
|
-
|
|
132
|
+
minimap_cmd = [
|
|
133
|
+
"minimap2", "-ax", "map-ont",
|
|
134
|
+
"-A", str(scores[0]),
|
|
135
|
+
"-B", str(scores[1]),
|
|
136
|
+
"-O", f"{scores[2]},24",
|
|
137
|
+
str(self.template_fasta),
|
|
138
|
+
*fastq_list,
|
|
139
|
+
]
|
|
140
|
+
with open(sam_path, "w") as sam_handle:
|
|
141
|
+
minimap_result = subprocess.run(
|
|
142
|
+
minimap_cmd,
|
|
143
|
+
stdout=sam_handle,
|
|
144
|
+
stderr=subprocess.PIPE,
|
|
145
|
+
text=True,
|
|
146
|
+
)
|
|
147
|
+
if minimap_result.returncode != 0:
|
|
148
|
+
logger.error(
|
|
149
|
+
"minimap2 failed for %s: %s",
|
|
150
|
+
filename,
|
|
151
|
+
minimap_result.stderr.strip(),
|
|
152
|
+
)
|
|
153
|
+
return
|
|
144
154
|
# print(minimap_cmd)
|
|
145
|
-
# Convert SAM to BAM
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
155
|
+
# Convert SAM to BAM
|
|
156
|
+
unsorted_bam = os.path.join(output_dir, f"{alignment_name}.unsorted.bam")
|
|
157
|
+
with open(unsorted_bam, "wb") as bam_handle:
|
|
158
|
+
view_result = subprocess.run(
|
|
159
|
+
["samtools", "view", "-bS", sam_path],
|
|
160
|
+
stdout=bam_handle,
|
|
161
|
+
stderr=subprocess.PIPE,
|
|
162
|
+
)
|
|
163
|
+
if view_result.returncode != 0:
|
|
164
|
+
logger.error(
|
|
165
|
+
"samtools view failed for %s: %s",
|
|
166
|
+
filename,
|
|
167
|
+
view_result.stderr.decode().strip(),
|
|
168
|
+
)
|
|
169
|
+
return
|
|
149
170
|
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
171
|
+
# Sort BAM (support both modern and legacy samtools syntax)
|
|
172
|
+
sorted_bam = os.path.join(output_dir, f"{alignment_name}.bam")
|
|
173
|
+
sort_result = subprocess.run(
|
|
174
|
+
["samtools", "sort", "-o", sorted_bam, unsorted_bam],
|
|
175
|
+
stdout=subprocess.DEVNULL,
|
|
176
|
+
stderr=subprocess.PIPE,
|
|
177
|
+
)
|
|
178
|
+
if sort_result.returncode != 0 or not os.path.exists(sorted_bam):
|
|
179
|
+
legacy_prefix = os.path.join(output_dir, f"{alignment_name}.sorted")
|
|
180
|
+
legacy_result = subprocess.run(
|
|
181
|
+
["samtools", "sort", unsorted_bam, legacy_prefix],
|
|
182
|
+
stdout=subprocess.DEVNULL,
|
|
183
|
+
stderr=subprocess.PIPE,
|
|
184
|
+
)
|
|
185
|
+
if legacy_result.returncode != 0:
|
|
186
|
+
logger.error(
|
|
187
|
+
"samtools sort failed for %s: %s",
|
|
188
|
+
filename,
|
|
189
|
+
legacy_result.stderr.decode().strip(),
|
|
190
|
+
)
|
|
191
|
+
return
|
|
192
|
+
legacy_bam = f"{legacy_prefix}.bam"
|
|
193
|
+
if not os.path.exists(legacy_bam):
|
|
194
|
+
logger.error("samtools sort did not produce %s", legacy_bam)
|
|
195
|
+
return
|
|
196
|
+
shutil.move(legacy_bam, sorted_bam)
|
|
153
197
|
|
|
154
198
|
# Index the BAM file
|
|
155
|
-
|
|
156
|
-
|
|
199
|
+
if not os.path.exists(sorted_bam):
|
|
200
|
+
logger.error("samtools sort did not produce %s", sorted_bam)
|
|
201
|
+
return
|
|
202
|
+
index_cmd = ["samtools", "index", sorted_bam]
|
|
203
|
+
index_result = subprocess.run(
|
|
204
|
+
index_cmd,
|
|
205
|
+
stdout=subprocess.DEVNULL,
|
|
206
|
+
stderr=subprocess.PIPE,
|
|
207
|
+
)
|
|
208
|
+
if index_result.returncode != 0:
|
|
209
|
+
logger.error(
|
|
210
|
+
"samtools index failed for %s: %s",
|
|
211
|
+
filename,
|
|
212
|
+
index_result.stderr.decode().strip(),
|
|
213
|
+
)
|
|
214
|
+
return
|
|
157
215
|
|
|
158
216
|
# Cleanup SAM file to save space
|
|
159
|
-
os.remove(
|
|
217
|
+
os.remove(sam_path)
|
|
218
|
+
os.remove(unsorted_bam)
|
|
160
219
|
except Exception as e:
|
|
161
220
|
logger.error(f"Error during alignment for {filename}: {e}")
|
|
162
221
|
|
|
163
222
|
def _run_variant_thread(self, args):
|
|
164
223
|
barcode_ids, threshold, min_depth, output_dir = args
|
|
165
|
-
|
|
166
|
-
|
|
224
|
+
logger.info("Variant calling: processing %d barcodes", len(barcode_ids))
|
|
225
|
+
# Overall progress bar for all barcodes in this thread (disabled to reduce console spam)
|
|
226
|
+
with tqdm(barcode_ids, desc="Processing barcodes", leave=False, disable=True) as pbar:
|
|
167
227
|
for barcode_id in pbar:
|
|
168
228
|
try:
|
|
169
229
|
row = self.variant_dict.get(barcode_id)
|
|
@@ -171,9 +231,18 @@ class VariantCaller:
|
|
|
171
231
|
|
|
172
232
|
# Check if alignment file exists, if not, align sequences
|
|
173
233
|
if not os.path.exists(bam_file):
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
234
|
+
logger.info(f"Aligning sequences for {row['Path']}")
|
|
235
|
+
self._align_sequences(
|
|
236
|
+
row["Path"],
|
|
237
|
+
row['Barcodes'],
|
|
238
|
+
alignment_name=f'{self.alignment_name}_{barcode_id}',
|
|
239
|
+
)
|
|
240
|
+
elif not os.path.exists(f"{bam_file}.bai"):
|
|
241
|
+
subprocess.run(
|
|
242
|
+
["samtools", "index", bam_file],
|
|
243
|
+
stdout=subprocess.DEVNULL,
|
|
244
|
+
stderr=subprocess.DEVNULL,
|
|
245
|
+
)
|
|
177
246
|
|
|
178
247
|
# Placeholder function calls to demonstrate workflow
|
|
179
248
|
well_df, alignment_count = get_reads_for_well(self.experiment_name, bam_file,
|
|
@@ -184,7 +253,12 @@ class VariantCaller:
|
|
|
184
253
|
continue
|
|
185
254
|
self.variant_dict[barcode_id]['Alignment Count'] = alignment_count
|
|
186
255
|
well_df.to_csv(f"{row['Path']}/seq_{barcode_id}.csv", index=False)
|
|
187
|
-
|
|
256
|
+
# Suppress noisy numerical warnings from downstream stats on sparse wells.
|
|
257
|
+
with warnings.catch_warnings():
|
|
258
|
+
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
|
259
|
+
label, freq, combined_p_value, mixed_well, avg_error_rate = get_variant_label_for_well(
|
|
260
|
+
well_df, threshold
|
|
261
|
+
)
|
|
188
262
|
self.variant_dict[barcode_id]['Variant'] = label
|
|
189
263
|
self.variant_dict[barcode_id]['Mixed Well'] = mixed_well
|
|
190
264
|
self.variant_dict[barcode_id]['Average mutation frequency'] = freq
|
|
@@ -67,13 +67,36 @@ import seaborn as sns
|
|
|
67
67
|
|
|
68
68
|
from levseq.utils import *
|
|
69
69
|
|
|
70
|
-
|
|
70
|
+
def _in_notebook():
|
|
71
|
+
try:
|
|
72
|
+
from IPython import get_ipython
|
|
73
|
+
except Exception:
|
|
74
|
+
return False
|
|
75
|
+
ip = get_ipython()
|
|
76
|
+
if ip is None:
|
|
77
|
+
return False
|
|
78
|
+
return ip.__class__.__name__ == "ZMQInteractiveShell"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _should_init_notebook():
|
|
82
|
+
if os.environ.get("LEVSEQ_DISABLE_NOTEBOOK_INIT") == "1":
|
|
83
|
+
return False
|
|
84
|
+
if os.environ.get("LEVSEQ_FORCE_NOTEBOOK_INIT") == "1":
|
|
85
|
+
return True
|
|
86
|
+
return _in_notebook()
|
|
87
|
+
|
|
71
88
|
|
|
72
|
-
|
|
73
|
-
|
|
89
|
+
def init_notebook_env():
|
|
90
|
+
# Avoid notebook UI side-effects during plain imports/tests.
|
|
91
|
+
output_notebook()
|
|
92
|
+
pn.extension()
|
|
93
|
+
pn.config.comms = "vscode"
|
|
94
|
+
hv.extension("bokeh")
|
|
95
|
+
hv.renderer("bokeh").webgl = True
|
|
74
96
|
|
|
75
|
-
|
|
76
|
-
|
|
97
|
+
|
|
98
|
+
if _should_init_notebook():
|
|
99
|
+
init_notebook_env()
|
|
77
100
|
|
|
78
101
|
# warnings.filterwarnings("ignore")
|
|
79
102
|
#warnings.filterwarnings("ignore", category=Warning)
|
|
@@ -392,25 +415,33 @@ def generate_platemaps(
|
|
|
392
415
|
out=np.zeros_like(max_combo_df["Alignment Count"], dtype=float),
|
|
393
416
|
where=max_combo_df["Alignment Count"] != 0,
|
|
394
417
|
)
|
|
395
|
-
|
|
396
|
-
|
|
418
|
+
max_combo_df["logseqdepth"] = max_combo_df["logseqdepth"].fillna(0.0)
|
|
419
|
+
|
|
420
|
+
min_val = max_combo_df["logseqdepth"].min()
|
|
421
|
+
max_val = max_combo_df["logseqdepth"].max()
|
|
422
|
+
if not np.isfinite(min_val) or not np.isfinite(max_val) or min_val == max_val:
|
|
423
|
+
# Avoid invalid colormap centers when data has no range.
|
|
424
|
+
color_levels = [min_val - 0.1, min_val, min_val + 0.1]
|
|
425
|
+
else:
|
|
426
|
+
# Set the center
|
|
427
|
+
center = np.log(10)
|
|
397
428
|
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
429
|
+
add_min = False
|
|
430
|
+
if min_val >= center:
|
|
431
|
+
add_min = True
|
|
401
432
|
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
433
|
+
# Adjust if it is greater than max of data (avoids ValueError)
|
|
434
|
+
if max_val <= center:
|
|
435
|
+
# Adjust the center
|
|
436
|
+
center = max_combo_df["logseqdepth"].median()
|
|
406
437
|
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
438
|
+
# center colormap
|
|
439
|
+
if not add_min:
|
|
440
|
+
color_levels = ns.viz._center_colormap(max_combo_df["logseqdepth"], center)
|
|
441
|
+
else:
|
|
442
|
+
color_levels = ns.viz._center_colormap(
|
|
443
|
+
list(max_combo_df["logseqdepth"]) + [np.log(1)], center
|
|
444
|
+
)
|
|
414
445
|
|
|
415
446
|
# dictionary for storing plots
|
|
416
447
|
hm_dict = {}
|
|
@@ -1198,4 +1229,4 @@ def plot_seaborn_heatmap(platemap, platemap_labels, label: str, result_folder):
|
|
|
1198
1229
|
ax = pc.axes
|
|
1199
1230
|
plt.yticks(rotation=0)
|
|
1200
1231
|
plt.setp(ax.get_yticklabels(), ha="center")
|
|
1201
|
-
plt.savefig(os.path.join(result_folder, f'platemap_{label}.svg'))
|
|
1232
|
+
plt.savefig(os.path.join(result_folder, f'platemap_{label}.svg'))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: levseq
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.5
|
|
4
4
|
Home-page: https://github.com/fhalab/levseq/
|
|
5
5
|
Author: Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy
|
|
6
6
|
Author-email: ylong@caltech.edu
|
|
@@ -44,6 +44,18 @@ Requires-Dist: scikit-learn
|
|
|
44
44
|
Requires-Dist: statsmodels
|
|
45
45
|
Requires-Dist: tqdm
|
|
46
46
|
Requires-Dist: biopandas
|
|
47
|
+
Dynamic: author
|
|
48
|
+
Dynamic: author-email
|
|
49
|
+
Dynamic: classifier
|
|
50
|
+
Dynamic: description
|
|
51
|
+
Dynamic: description-content-type
|
|
52
|
+
Dynamic: home-page
|
|
53
|
+
Dynamic: keywords
|
|
54
|
+
Dynamic: license
|
|
55
|
+
Dynamic: license-file
|
|
56
|
+
Dynamic: project-url
|
|
57
|
+
Dynamic: requires-dist
|
|
58
|
+
Dynamic: requires-python
|
|
47
59
|
|
|
48
60
|
# Variant Sequencing with Nanopore (LevSeq)
|
|
49
61
|
|
|
@@ -52,8 +64,35 @@ LevSeq provides a streamlined pipeline for sequencing and analyzing genetic vari
|
|
|
52
64
|

|
|
53
65
|
Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
|
|
54
66
|
|
|
67
|
+
## <span style="color: orange;">**Important: Barcode Improvements and LevSeq 2.0 Development**</span>
|
|
68
|
+
|
|
69
|
+
**We have identified and resolved demultiplexing challenges in the original barcode set.** Version 1.4 introduced alignment-aware variant calling to address these issues and significantly improve accuracy.
|
|
70
|
+
|
|
71
|
+
**We are actively developing LevSeq 2.0** in collaboration with DTU and AITHYRA to fundamentally redesign the barcode system. The updated approach includes:
|
|
72
|
+
|
|
73
|
+
- **Enhanced barcode design**: New barcodes will be strain-aware and sequence-aware, generated using an advanced barcode design tool
|
|
74
|
+
- **Reversed workflow architecture**: LevSeq 2.0 will perform alignment first, then demultiplexing (rather than the current demultiplexing-first approach), resolving issues with forward and reverse read handling
|
|
75
|
+
- **Improved accuracy**: These changes will provide more robust demultiplexing and variant calling across diverse experimental conditions
|
|
76
|
+
|
|
77
|
+
**Please reach out to us at ylong@caltech.edu if you are planning to order barcoded primers now.**
|
|
78
|
+
|
|
79
|
+
## Notes
|
|
80
|
+
|
|
81
|
+
LevSeq was designed for epPCR and SSM experiments; however, we are currently extending it to work for other enzyme engineering designs as well. The following features are under development:
|
|
82
|
+
|
|
83
|
+
1. Insertion handling (see version 4.1.3) - thanks to Brian Zhong for his contributions to this section!
|
|
84
|
+
2. Gene calling (handling different genes, use the `--oligopool` flag)
|
|
85
|
+
|
|
86
|
+
If you notice any issues with new features or have adapted the LevSeq code for your own use cases, we would love community contributions! Please submit either an issue or a pull request and we will aim to incorporate the changes.
|
|
87
|
+
|
|
88
|
+
Performance update: demultiplexing now runs in parallel batches of 8 plates and input FASTQs are staged once per run, improving throughput on multi-core systems.
|
|
89
|
+
|
|
55
90
|
## Quick Start
|
|
56
91
|
|
|
92
|
+
Note: the current stable version is `1.5`; the latest version is also `1.5`.
|
|
93
|
+
|
|
94
|
+
Stable releases are made available via Docker and pip. For the latest version, please clone the repo and install locally (see *Local development or install of latest version* below).
|
|
95
|
+
|
|
57
96
|
### Docker Installation (Recommended)
|
|
58
97
|
|
|
59
98
|
1. Install Docker: [https://docs.docker.com/engine/install/](https://docs.docker.com/engine/install/)
|
|
@@ -183,6 +222,16 @@ For the wet lab protocol:
|
|
|
183
222
|
- **Advanced Usage**: See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb)
|
|
184
223
|
- **Troubleshooting**: See our [computational protocols wiki](https://github.com/fhalab/LevSeq/wiki/Computational-protocols)
|
|
185
224
|
|
|
225
|
+
### Local development or install of latest version
|
|
226
|
+
|
|
227
|
+
```
|
|
228
|
+
conda create --name levseq python=3.10
|
|
229
|
+
git clone git@github.com:fhalab/LevSeq.git
|
|
230
|
+
cd LevSeq
|
|
231
|
+
python setup.py sdist bdist_wheel
|
|
232
|
+
pip install dist/levseq-1.5.tar.gz
|
|
233
|
+
```
|
|
234
|
+
|
|
186
235
|
## Citing LevSeq
|
|
187
236
|
|
|
188
237
|
If you find LevSeq useful, please cite our paper:
|
|
@@ -283,7 +283,7 @@ class TestVariantCalling(TestClass):
|
|
|
283
283
|
|
|
284
284
|
def test_calling_variant_with_insert(self):
|
|
285
285
|
u.dp(["Testing calling variants using SSM with error"])
|
|
286
|
-
|
|
286
|
+
# ToDo: Update this with new calling need a new test for this
|
|
287
287
|
parent_sequence = "ATGAGT"
|
|
288
288
|
mutated_sequence = 'ATGAGT' # Not actually mutated
|
|
289
289
|
parent_name = 'parent'
|
|
@@ -1,115 +0,0 @@
|
|
|
1
|
-
from Bio import SeqIO
|
|
2
|
-
from Bio.Seq import Seq
|
|
3
|
-
import os
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
import logging
|
|
6
|
-
from Bio.Align import PairwiseAligner
|
|
7
|
-
import shutil
|
|
8
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
9
|
-
from tqdm import tqdm
|
|
10
|
-
|
|
11
|
-
def calculate_alignment_score(seq1, seq2):
|
|
12
|
-
"""Calculate alignment score between two sequences using PairwiseAligner."""
|
|
13
|
-
aligner = PairwiseAligner()
|
|
14
|
-
aligner.mode = 'global'
|
|
15
|
-
alignment = aligner.align(seq1, seq2)[0]
|
|
16
|
-
return alignment.score / max(len(seq1), len(seq2))
|
|
17
|
-
|
|
18
|
-
def filter_single_file(args):
|
|
19
|
-
"""
|
|
20
|
-
Filter a single fastq file. Used for parallel processing.
|
|
21
|
-
|
|
22
|
-
Args:
|
|
23
|
-
args: tuple containing (input_file, parent_seq, parent_rev_comp)
|
|
24
|
-
Returns:
|
|
25
|
-
tuple: (file_path, total_reads, kept_reads, filtered_records)
|
|
26
|
-
"""
|
|
27
|
-
input_file, parent_seq, parent_rev_comp = args
|
|
28
|
-
kept_reads = []
|
|
29
|
-
total_reads = 0
|
|
30
|
-
kept_count = 0
|
|
31
|
-
|
|
32
|
-
is_forward = "forward" in str(input_file).lower()
|
|
33
|
-
|
|
34
|
-
for record in SeqIO.parse(input_file, "fastq"):
|
|
35
|
-
total_reads += 1
|
|
36
|
-
seq = str(record.seq)
|
|
37
|
-
|
|
38
|
-
forward_score = calculate_alignment_score(seq, str(parent_seq))
|
|
39
|
-
reverse_score = calculate_alignment_score(seq, str(parent_rev_comp))
|
|
40
|
-
|
|
41
|
-
# If it's in forward file (plate barcode was rev comp)
|
|
42
|
-
# Then read should align to reverse complement parent sequence
|
|
43
|
-
if is_forward and reverse_score > forward_score:
|
|
44
|
-
kept_reads.append(record)
|
|
45
|
-
kept_count += 1
|
|
46
|
-
# If it's in reverse file (plate barcode was forward)
|
|
47
|
-
# Then read was already reverse complemented by demultiplexer
|
|
48
|
-
# So it should align to forward parent sequence
|
|
49
|
-
elif not is_forward and forward_score > reverse_score:
|
|
50
|
-
kept_reads.append(record)
|
|
51
|
-
kept_count += 1
|
|
52
|
-
|
|
53
|
-
return str(input_file), total_reads, kept_count, kept_reads
|
|
54
|
-
|
|
55
|
-
def filter_demultiplexed_folder(experiment_folder, parent_sequence, num_threads=8):
|
|
56
|
-
"""
|
|
57
|
-
Filter demultiplexed files using multiple threads.
|
|
58
|
-
|
|
59
|
-
Args:
|
|
60
|
-
experiment_folder (str): Path to experiment folder containing RBC/FBC structure
|
|
61
|
-
parent_sequence (str): Parent sequence for alignment checking
|
|
62
|
-
num_threads (int): Number of threads to use
|
|
63
|
-
"""
|
|
64
|
-
exp_path = Path(experiment_folder)
|
|
65
|
-
filtered_counts = {}
|
|
66
|
-
|
|
67
|
-
# Prepare parent sequences once
|
|
68
|
-
parent_seq = Seq(parent_sequence)
|
|
69
|
-
parent_rev_comp = parent_seq.reverse_complement()
|
|
70
|
-
|
|
71
|
-
# Collect all fastq files
|
|
72
|
-
fastq_files = []
|
|
73
|
-
for rbc_dir in exp_path.glob("RB*"):
|
|
74
|
-
if not rbc_dir.is_dir():
|
|
75
|
-
continue
|
|
76
|
-
for fbc_dir in rbc_dir.glob("NB*"):
|
|
77
|
-
if not fbc_dir.is_dir():
|
|
78
|
-
continue
|
|
79
|
-
fastq_files.extend(list(fbc_dir.glob("*.fastq")))
|
|
80
|
-
|
|
81
|
-
if not fastq_files:
|
|
82
|
-
logging.warning(f"No fastq files found in {experiment_folder}")
|
|
83
|
-
return filtered_counts
|
|
84
|
-
|
|
85
|
-
# Prepare arguments for parallel processing
|
|
86
|
-
file_args = [(f, parent_seq, parent_rev_comp) for f in fastq_files]
|
|
87
|
-
|
|
88
|
-
# Process files in parallel with progress bar
|
|
89
|
-
with ThreadPoolExecutor(max_workers=num_threads) as executor:
|
|
90
|
-
futures = [executor.submit(filter_single_file, args) for args in file_args]
|
|
91
|
-
|
|
92
|
-
with tqdm(total=len(fastq_files), desc="Filtering files") as pbar:
|
|
93
|
-
for future in as_completed(futures):
|
|
94
|
-
try:
|
|
95
|
-
file_path, total, kept, filtered_records = future.result()
|
|
96
|
-
|
|
97
|
-
# Write filtered reads
|
|
98
|
-
temp_file = Path(file_path).parent / f"temp_{Path(file_path).name}"
|
|
99
|
-
SeqIO.write(filtered_records, temp_file, "fastq")
|
|
100
|
-
shutil.move(str(temp_file), file_path)
|
|
101
|
-
|
|
102
|
-
filtered_counts[file_path] = {
|
|
103
|
-
'total': total,
|
|
104
|
-
'kept': kept,
|
|
105
|
-
'filtered': total - kept
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
logging.info(f"Processed {file_path}: {kept}/{total} reads kept")
|
|
109
|
-
pbar.update(1)
|
|
110
|
-
|
|
111
|
-
except Exception as e:
|
|
112
|
-
logging.error(f"Error processing file {file_path}: {str(e)}")
|
|
113
|
-
pbar.update(1)
|
|
114
|
-
|
|
115
|
-
return filtered_counts
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|