levseq 1.3.3__tar.gz → 1.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- levseq-1.4.1/PKG-INFO +187 -0
- levseq-1.4.1/README.md +140 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/__init__.py +1 -1
- {levseq-1.3.3 → levseq-1.4.1}/levseq/interface.py +8 -2
- {levseq-1.3.3 → levseq-1.4.1}/levseq/run_levseq.py +115 -13
- {levseq-1.3.3 → levseq-1.4.1}/levseq/utils.py +14 -13
- {levseq-1.3.3 → levseq-1.4.1}/levseq/variantcaller.py +58 -42
- {levseq-1.3.3 → levseq-1.4.1}/levseq/visualization.py +52 -0
- levseq-1.4.1/levseq.egg-info/PKG-INFO +187 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq.egg-info/SOURCES.txt +1 -0
- {levseq-1.3.3 → levseq-1.4.1}/setup.py +1 -0
- levseq-1.4.1/tests/test_copy_fastq.py +75 -0
- {levseq-1.3.3 → levseq-1.4.1}/tests/test_deploy.py +27 -17
- {levseq-1.3.3 → levseq-1.4.1}/tests/test_opligopools.py +7 -33
- levseq-1.3.3/PKG-INFO +0 -192
- levseq-1.3.3/README.md +0 -145
- levseq-1.3.3/levseq.egg-info/PKG-INFO +0 -192
- {levseq-1.3.3 → levseq-1.4.1}/LICENSE +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/MANIFEST.in +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/IO_processor.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/barcoding/__init__.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/barcoding/demultiplex +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/barcoding/demultiplex-arm64 +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/barcoding/demultiplex-x86 +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/barcoding/minion_barcodes.fasta +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/basecaller.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/cmd.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/coordinates.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/filter_orientation.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/globals.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/parser.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/screen.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/seqfit.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/simulation.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq/user.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq.egg-info/dependency_links.txt +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq.egg-info/entry_points.txt +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq.egg-info/requires.txt +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/levseq.egg-info/top_level.txt +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/setup.cfg +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/tests/test_demultiplex_docker.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/tests/test_seqfitvis.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/tests/test_seqs.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/tests/test_statistics.py +0 -0
- {levseq-1.3.3 → levseq-1.4.1}/tests/test_variant_calling.py +0 -0
levseq-1.4.1/PKG-INFO
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: levseq
|
|
3
|
+
Version: 1.4.1
|
|
4
|
+
Home-page: https://github.com/fhalab/levseq/
|
|
5
|
+
Author: Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy
|
|
6
|
+
Author-email: ylong@caltech.edu
|
|
7
|
+
License: GPL3
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/fhalab/levseq/
|
|
9
|
+
Project-URL: Documentation, https://github.com/fhalab/levseq/
|
|
10
|
+
Project-URL: Source Code, https://github.com/fhalab/levseq/
|
|
11
|
+
Keywords: Nanopore,ONT,evSeq
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
15
|
+
Classifier: Natural Language :: English
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
21
|
+
Requires-Python: >=3.8
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: Bio
|
|
25
|
+
Requires-Dist: biopython
|
|
26
|
+
Requires-Dist: fsspec
|
|
27
|
+
Requires-Dist: h5py
|
|
28
|
+
Requires-Dist: holoviews
|
|
29
|
+
Requires-Dist: jupyterlab
|
|
30
|
+
Requires-Dist: mappy
|
|
31
|
+
Requires-Dist: matplotlib
|
|
32
|
+
Requires-Dist: ninetysix
|
|
33
|
+
Requires-Dist: numpy
|
|
34
|
+
Requires-Dist: pandas
|
|
35
|
+
Requires-Dist: pybedtools
|
|
36
|
+
Requires-Dist: pycoQC
|
|
37
|
+
Requires-Dist: pyfaidx
|
|
38
|
+
Requires-Dist: pyparsing
|
|
39
|
+
Requires-Dist: pysam
|
|
40
|
+
Requires-Dist: scipy
|
|
41
|
+
Requires-Dist: sciutil
|
|
42
|
+
Requires-Dist: seaborn
|
|
43
|
+
Requires-Dist: scikit-learn
|
|
44
|
+
Requires-Dist: statsmodels
|
|
45
|
+
Requires-Dist: tqdm
|
|
46
|
+
Requires-Dist: biopandas
|
|
47
|
+
|
|
48
|
+
# Variant Sequencing with Nanopore (LevSeq)
|
|
49
|
+
|
|
50
|
+
LevSeq provides a streamlined pipeline for sequencing and analyzing genetic variants using Oxford Nanopore technology. In directed evolution experiments, LevSeq enables sequencing of every variant, enhancing data insight and creating datasets suitable for AI/ML methods. Sequence variants can be generated within a day at an extremely low cost.
|
|
51
|
+
|
|
52
|
+

|
|
53
|
+
Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
|
|
54
|
+
|
|
55
|
+
## Quick Start
|
|
56
|
+
|
|
57
|
+
### Docker Installation (Recommended)
|
|
58
|
+
|
|
59
|
+
1. Install Docker: [https://docs.docker.com/engine/install/](https://docs.docker.com/engine/install/)
|
|
60
|
+
2. Pull the appropriate image:
|
|
61
|
+
```bash
|
|
62
|
+
# For Linux/Windows x86 systems:
|
|
63
|
+
docker pull yueminglong/levseq:levseq-1.4-x86
|
|
64
|
+
|
|
65
|
+
# For Mac M-series chips (M1, M2, M3, M4):
|
|
66
|
+
docker pull yueminglong/levseq:levseq-1.4-arm64
|
|
67
|
+
```
|
|
68
|
+
3. Run LevSeq:
|
|
69
|
+
```bash
|
|
70
|
+
docker run --rm -v "/full/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Pip Installation (Mac/Linux only)
|
|
74
|
+
|
|
75
|
+
**IMPORTANT**: On Mac M-series chips (M1-M4), gcc 13 and 14 are **REQUIRED**:
|
|
76
|
+
```bash
|
|
77
|
+
brew install gcc@13 gcc@14
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
1. Create and activate conda environment:
|
|
81
|
+
```bash
|
|
82
|
+
conda create --name levseq python=3.12 -y
|
|
83
|
+
conda activate levseq
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
2. Install dependencies:
|
|
87
|
+
```bash
|
|
88
|
+
conda install -c bioconda -c conda-forge samtools minimap2
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
3. Install LevSeq:
|
|
92
|
+
```bash
|
|
93
|
+
pip install levseq
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
4. Run LevSeq:
|
|
97
|
+
```bash
|
|
98
|
+
levseq my_experiment /path/to/data/ /path/to/ref.csv
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Data and Visualization
|
|
102
|
+
|
|
103
|
+
- **Test Data**: Sample data is available on Zenodo: [https://doi.org/10.5281/zenodo.13694463](https://doi.org/10.5281/zenodo.13694463)
|
|
104
|
+
- **Visualization Tool**: A web application is available at [https://levseqdb.streamlit.app/](https://levseqdb.streamlit.app/) - simply upload your LevSeq output and LCMS results
|
|
105
|
+
- **Self-hosted Solution**: You can deploy your own instance using our [LevSeq_db repository](https://github.com/fhalab/LevSeq_db)
|
|
106
|
+
|
|
107
|
+
## Reference File Format (ref.csv)
|
|
108
|
+
|
|
109
|
+
Your reference CSV file must contain the following columns:
|
|
110
|
+
|
|
111
|
+
| barcode_plate | name | refseq |
|
|
112
|
+
|---------------|--------|-----------|
|
|
113
|
+
| 33 | Q97A76 | ATGCGC... |
|
|
114
|
+
|
|
115
|
+
For oligopool experiments (multiple proteins per plate), use:
|
|
116
|
+
|
|
117
|
+
| barcode_plate | name | refseq |
|
|
118
|
+
|---------------|--------|-----------|
|
|
119
|
+
| 33 | Q97A76 | ATGCGCAAG |
|
|
120
|
+
| 33 | P96084 | ATGGATCA |
|
|
121
|
+
| 34 | P46209 | ATGGGGCAA |
|
|
122
|
+
| 34 | Q60336 | ATGGGGCC |
|
|
123
|
+
|
|
124
|
+
## Command Line Arguments
|
|
125
|
+
|
|
126
|
+
### Required Arguments
|
|
127
|
+
1. **name**: Name of the experiment (output folder)
|
|
128
|
+
2. **path**: Location of basecalled fastq files
|
|
129
|
+
3. **summary**: Path to reference CSV file
|
|
130
|
+
|
|
131
|
+
### Optional Arguments
|
|
132
|
+
- `--skip_demultiplexing`: Skip the demultiplexing step
|
|
133
|
+
- `--skip_variantcalling`: Skip the variant calling step
|
|
134
|
+
- `--output`: Custom save location (defaults to current directory)
|
|
135
|
+
- `--show_msa`: Show multiple sequence alignment for each well
|
|
136
|
+
- `--oligopool`: Process data as oligopool experiment
|
|
137
|
+
|
|
138
|
+
## Step-by-Step Tutorial
|
|
139
|
+
|
|
140
|
+
1. **Prepare your sequencing data**:
|
|
141
|
+
- Your fastq files should be in a directory structure similar to Nanopore's output
|
|
142
|
+
- Prepare a reference CSV file with barcode plates, sample names, and reference sequences
|
|
143
|
+
|
|
144
|
+
2. **Run LevSeq**:
|
|
145
|
+
```bash
|
|
146
|
+
# Via Docker
|
|
147
|
+
docker run --rm -v "/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
|
|
148
|
+
|
|
149
|
+
# Via pip
|
|
150
|
+
levseq my_experiment /path/to/data/ /path/to/ref.csv
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
3. **Analyze results**:
|
|
154
|
+
- Output includes variant data (CSV) and interactive visualizations (HTML)
|
|
155
|
+
- Upload results to the LevSeq visualization tool for further analysis
|
|
156
|
+
|
|
157
|
+
## Experimental Setup
|
|
158
|
+
|
|
159
|
+
For the wet lab protocol:
|
|
160
|
+
- Refer to the [wiki](https://github.com/fhalab/LevSeq/wiki/Experimental-protocols)
|
|
161
|
+
- See the methods section of [our paper](https://pubs.acs.org/doi/10.1021/acssynbio.4c00625)
|
|
162
|
+
- Order forward and reverse primers compatible with your plasmid
|
|
163
|
+
- Install Oxford Nanopore's software for basecalling if needed
|
|
164
|
+
|
|
165
|
+
## Additional Resources
|
|
166
|
+
|
|
167
|
+
- **Example Notebook**: See `example/Example.ipynb` for a walkthrough
|
|
168
|
+
- **Advanced Usage**: See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb)
|
|
169
|
+
- **Troubleshooting**: See our [computational protocols wiki](https://github.com/fhalab/LevSeq/wiki/Computational-protocols)
|
|
170
|
+
|
|
171
|
+
## Citing LevSeq
|
|
172
|
+
|
|
173
|
+
If you find LevSeq useful, please cite our paper:
|
|
174
|
+
|
|
175
|
+
```bibtex
|
|
176
|
+
@article{long2024levseq,
|
|
177
|
+
title={LevSeq: Rapid Generation of Sequence-Function Data for Directed Evolution and Machine Learning},
|
|
178
|
+
author={Long, Yueming and Mora, Ariane and Li, Francesca-Zhoufan and Gürsoy, Emre and Johnston, Kadina E and Arnold, Frances H},
|
|
179
|
+
journal={ACS Synthetic Biology},
|
|
180
|
+
year={2024},
|
|
181
|
+
publisher={American Chemical Society}
|
|
182
|
+
}
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## Contact
|
|
186
|
+
|
|
187
|
+
Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
|
levseq-1.4.1/README.md
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# Variant Sequencing with Nanopore (LevSeq)
|
|
2
|
+
|
|
3
|
+
LevSeq provides a streamlined pipeline for sequencing and analyzing genetic variants using Oxford Nanopore technology. In directed evolution experiments, LevSeq enables sequencing of every variant, enhancing data insight and creating datasets suitable for AI/ML methods. Sequence variants can be generated within a day at an extremely low cost.
|
|
4
|
+
|
|
5
|
+

|
|
6
|
+
Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
|
|
7
|
+
|
|
8
|
+
## Quick Start
|
|
9
|
+
|
|
10
|
+
### Docker Installation (Recommended)
|
|
11
|
+
|
|
12
|
+
1. Install Docker: [https://docs.docker.com/engine/install/](https://docs.docker.com/engine/install/)
|
|
13
|
+
2. Pull the appropriate image:
|
|
14
|
+
```bash
|
|
15
|
+
# For Linux/Windows x86 systems:
|
|
16
|
+
docker pull yueminglong/levseq:levseq-1.4-x86
|
|
17
|
+
|
|
18
|
+
# For Mac M-series chips (M1, M2, M3, M4):
|
|
19
|
+
docker pull yueminglong/levseq:levseq-1.4-arm64
|
|
20
|
+
```
|
|
21
|
+
3. Run LevSeq:
|
|
22
|
+
```bash
|
|
23
|
+
docker run --rm -v "/full/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### Pip Installation (Mac/Linux only)
|
|
27
|
+
|
|
28
|
+
**IMPORTANT**: On Mac M-series chips (M1-M4), gcc 13 and 14 are **REQUIRED**:
|
|
29
|
+
```bash
|
|
30
|
+
brew install gcc@13 gcc@14
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
1. Create and activate conda environment:
|
|
34
|
+
```bash
|
|
35
|
+
conda create --name levseq python=3.12 -y
|
|
36
|
+
conda activate levseq
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
2. Install dependencies:
|
|
40
|
+
```bash
|
|
41
|
+
conda install -c bioconda -c conda-forge samtools minimap2
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
3. Install LevSeq:
|
|
45
|
+
```bash
|
|
46
|
+
pip install levseq
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
4. Run LevSeq:
|
|
50
|
+
```bash
|
|
51
|
+
levseq my_experiment /path/to/data/ /path/to/ref.csv
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Data and Visualization
|
|
55
|
+
|
|
56
|
+
- **Test Data**: Sample data is available on Zenodo: [https://doi.org/10.5281/zenodo.13694463](https://doi.org/10.5281/zenodo.13694463)
|
|
57
|
+
- **Visualization Tool**: A web application is available at [https://levseqdb.streamlit.app/](https://levseqdb.streamlit.app/) - simply upload your LevSeq output and LCMS results
|
|
58
|
+
- **Self-hosted Solution**: You can deploy your own instance using our [LevSeq_db repository](https://github.com/fhalab/LevSeq_db)
|
|
59
|
+
|
|
60
|
+
## Reference File Format (ref.csv)
|
|
61
|
+
|
|
62
|
+
Your reference CSV file must contain the following columns:
|
|
63
|
+
|
|
64
|
+
| barcode_plate | name | refseq |
|
|
65
|
+
|---------------|--------|-----------|
|
|
66
|
+
| 33 | Q97A76 | ATGCGC... |
|
|
67
|
+
|
|
68
|
+
For oligopool experiments (multiple proteins per plate), use:
|
|
69
|
+
|
|
70
|
+
| barcode_plate | name | refseq |
|
|
71
|
+
|---------------|--------|-----------|
|
|
72
|
+
| 33 | Q97A76 | ATGCGCAAG |
|
|
73
|
+
| 33 | P96084 | ATGGATCA |
|
|
74
|
+
| 34 | P46209 | ATGGGGCAA |
|
|
75
|
+
| 34 | Q60336 | ATGGGGCC |
|
|
76
|
+
|
|
77
|
+
## Command Line Arguments
|
|
78
|
+
|
|
79
|
+
### Required Arguments
|
|
80
|
+
1. **name**: Name of the experiment (output folder)
|
|
81
|
+
2. **path**: Location of basecalled fastq files
|
|
82
|
+
3. **summary**: Path to reference CSV file
|
|
83
|
+
|
|
84
|
+
### Optional Arguments
|
|
85
|
+
- `--skip_demultiplexing`: Skip the demultiplexing step
|
|
86
|
+
- `--skip_variantcalling`: Skip the variant calling step
|
|
87
|
+
- `--output`: Custom save location (defaults to current directory)
|
|
88
|
+
- `--show_msa`: Show multiple sequence alignment for each well
|
|
89
|
+
- `--oligopool`: Process data as oligopool experiment
|
|
90
|
+
|
|
91
|
+
## Step-by-Step Tutorial
|
|
92
|
+
|
|
93
|
+
1. **Prepare your sequencing data**:
|
|
94
|
+
- Your fastq files should be in a directory structure similar to Nanopore's output
|
|
95
|
+
- Prepare a reference CSV file with barcode plates, sample names, and reference sequences
|
|
96
|
+
|
|
97
|
+
2. **Run LevSeq**:
|
|
98
|
+
```bash
|
|
99
|
+
# Via Docker
|
|
100
|
+
docker run --rm -v "/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
|
|
101
|
+
|
|
102
|
+
# Via pip
|
|
103
|
+
levseq my_experiment /path/to/data/ /path/to/ref.csv
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
3. **Analyze results**:
|
|
107
|
+
- Output includes variant data (CSV) and interactive visualizations (HTML)
|
|
108
|
+
- Upload results to the LevSeq visualization tool for further analysis
|
|
109
|
+
|
|
110
|
+
## Experimental Setup
|
|
111
|
+
|
|
112
|
+
For the wet lab protocol:
|
|
113
|
+
- Refer to the [wiki](https://github.com/fhalab/LevSeq/wiki/Experimental-protocols)
|
|
114
|
+
- See the methods section of [our paper](https://pubs.acs.org/doi/10.1021/acssynbio.4c00625)
|
|
115
|
+
- Order forward and reverse primers compatible with your plasmid
|
|
116
|
+
- Install Oxford Nanopore's software for basecalling if needed
|
|
117
|
+
|
|
118
|
+
## Additional Resources
|
|
119
|
+
|
|
120
|
+
- **Example Notebook**: See `example/Example.ipynb` for a walkthrough
|
|
121
|
+
- **Advanced Usage**: See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb)
|
|
122
|
+
- **Troubleshooting**: See our [computational protocols wiki](https://github.com/fhalab/LevSeq/wiki/Computational-protocols)
|
|
123
|
+
|
|
124
|
+
## Citing LevSeq
|
|
125
|
+
|
|
126
|
+
If you find LevSeq useful, please cite our paper:
|
|
127
|
+
|
|
128
|
+
```bibtex
|
|
129
|
+
@article{long2024levseq,
|
|
130
|
+
title={LevSeq: Rapid Generation of Sequence-Function Data for Directed Evolution and Machine Learning},
|
|
131
|
+
author={Long, Yueming and Mora, Ariane and Li, Francesca-Zhoufan and Gürsoy, Emre and Johnston, Kadina E and Arnold, Frances H},
|
|
132
|
+
journal={ACS Synthetic Biology},
|
|
133
|
+
year={2024},
|
|
134
|
+
publisher={American Chemical Society}
|
|
135
|
+
}
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Contact
|
|
139
|
+
|
|
140
|
+
Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
__title__ = 'levseq'
|
|
19
19
|
__description__ = 'LevSeq nanopore sequencing'
|
|
20
20
|
__url__ = 'https://github.com/fhalab/levseq/'
|
|
21
|
-
__version__ = '1.
|
|
21
|
+
__version__ = '1.4.1'
|
|
22
22
|
__author__ = 'Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy'
|
|
23
23
|
__author_email__ = 'ylong@caltech.edu'
|
|
24
24
|
__license__ = 'GPL3'
|
|
@@ -63,6 +63,9 @@ def build_cli_parser():
|
|
|
63
63
|
optional_args_group.add_argument("--skip_variantcalling",
|
|
64
64
|
action="store_true",
|
|
65
65
|
help="Skip the variant calling step, default is false")
|
|
66
|
+
optional_args_group.add_argument("--oligopool",
|
|
67
|
+
action="store_true",
|
|
68
|
+
help="Whether this experiment came from an oligopool, default is false.")
|
|
66
69
|
optional_args_group.add_argument("--show_msa",
|
|
67
70
|
default=False,
|
|
68
71
|
help="Skip showing msa")
|
|
@@ -79,7 +82,10 @@ def execute_LevSeq():
|
|
|
79
82
|
tqdm_fn = tqdm.tqdm
|
|
80
83
|
# Run LevSeq
|
|
81
84
|
try:
|
|
85
|
+
from levseq import __version__
|
|
86
|
+
print(f"Starting LevSeq v{__version__}...")
|
|
82
87
|
run_LevSeq(CL_ARGS, tqdm_fn)
|
|
88
|
+
print(f"Run completed successfully. Results and logs stored in {os.path.join(CL_ARGS.get('output', CWD), CL_ARGS.get('name', ''))}")
|
|
83
89
|
except Exception as e:
|
|
84
|
-
print(e)
|
|
85
|
-
|
|
90
|
+
print(f"Error: {e}")
|
|
91
|
+
print(f"Check error logs for details in {os.path.join(CL_ARGS.get('output', CWD), CL_ARGS.get('name', ''))}")
|
|
@@ -66,8 +66,14 @@ from importlib import resources
|
|
|
66
66
|
from holoviews.streams import Tap
|
|
67
67
|
|
|
68
68
|
# Utility function to configure logging
|
|
69
|
-
def configure_logging(result_folder):
|
|
70
|
-
|
|
69
|
+
def configure_logging(result_folder, cl_args):
|
|
70
|
+
import sys
|
|
71
|
+
from levseq import __version__
|
|
72
|
+
|
|
73
|
+
# Define a more detailed log format with clean separation
|
|
74
|
+
log_format = "%(asctime)s : %(levelname)s : %(message)s"
|
|
75
|
+
|
|
76
|
+
# Create log handlers
|
|
71
77
|
info_handler = logging.FileHandler(os.path.join(result_folder, "LevSeq_run.log"))
|
|
72
78
|
info_handler.setLevel(logging.INFO)
|
|
73
79
|
info_handler.setFormatter(logging.Formatter(log_format))
|
|
@@ -76,7 +82,30 @@ def configure_logging(result_folder):
|
|
|
76
82
|
error_handler.setLevel(logging.ERROR)
|
|
77
83
|
error_handler.setFormatter(logging.Formatter(log_format))
|
|
78
84
|
|
|
85
|
+
# Set up basic configuration with both handlers
|
|
79
86
|
logging.basicConfig(level=logging.INFO, handlers=[info_handler, error_handler])
|
|
87
|
+
|
|
88
|
+
# Log version information and command used to run
|
|
89
|
+
command_used = " ".join(sys.argv)
|
|
90
|
+
logging.info(f"LevSeq Version: {__version__}")
|
|
91
|
+
logging.info(f"Command: {command_used}")
|
|
92
|
+
|
|
93
|
+
# Log essential run parameters
|
|
94
|
+
logging.info(f"Run name: {cl_args.get('name', 'Not specified')}")
|
|
95
|
+
logging.info(f"Input path: {cl_args.get('path', 'Not specified')}")
|
|
96
|
+
logging.info(f"Summary file: {cl_args.get('summary', 'Not specified')}")
|
|
97
|
+
|
|
98
|
+
# Log optional parameters if specified
|
|
99
|
+
if cl_args.get('output') and cl_args.get('output') != os.getcwd():
|
|
100
|
+
logging.info(f"Output directory: {cl_args.get('output')}")
|
|
101
|
+
if cl_args.get('oligopool'):
|
|
102
|
+
logging.info("Running in oligopool mode")
|
|
103
|
+
if cl_args.get('skip_demultiplexing'):
|
|
104
|
+
logging.info("Skipping demultiplexing step")
|
|
105
|
+
if cl_args.get('skip_variantcalling'):
|
|
106
|
+
logging.info("Skipping variant calling step")
|
|
107
|
+
if cl_args.get('threshold'):
|
|
108
|
+
logging.info(f"Using variant threshold: {cl_args.get('threshold')}")
|
|
80
109
|
|
|
81
110
|
# Create result folder
|
|
82
111
|
def create_result_folder(cl_args):
|
|
@@ -139,8 +168,15 @@ def cat_fastq_files(folder_path: str, output_path: str, reads_per_file: int = 40
|
|
|
139
168
|
else:
|
|
140
169
|
for fastq_file in fastq_files:
|
|
141
170
|
destination = output_path / fastq_file.name
|
|
142
|
-
|
|
143
|
-
|
|
171
|
+
# Skip copying if source and destination are identical
|
|
172
|
+
if str(fastq_file) == str(destination):
|
|
173
|
+
logging.info("Skipping copy of %s (source and destination are identical)", fastq_file)
|
|
174
|
+
continue
|
|
175
|
+
try:
|
|
176
|
+
shutil.copy(fastq_file, destination)
|
|
177
|
+
logging.info("Copied %s to %s", fastq_file, destination)
|
|
178
|
+
except shutil.SameFileError:
|
|
179
|
+
logging.info("Skipping copy of %s (source and destination are identical files)", fastq_file)
|
|
144
180
|
logging.info("All FASTQ files processed successfully to %s", output_path)
|
|
145
181
|
return str(output_path)
|
|
146
182
|
except Exception as e:
|
|
@@ -221,13 +257,14 @@ def demux_fastq(file_to_fastq, result_folder, barcode_path):
|
|
|
221
257
|
executable_path = package_root / "levseq" / "barcoding" / executable_name
|
|
222
258
|
if not executable_path.exists():
|
|
223
259
|
raise FileNotFoundError(f"Executable not found: {executable_path}")
|
|
224
|
-
seq_min = 200
|
|
260
|
+
seq_min = 200
|
|
225
261
|
seq_max = 10000
|
|
226
262
|
prompt = f"{executable_path} -f {file_to_fastq} -d {result_folder} -b {barcode_path} -w 100 -r 100 -m {seq_min} -x {seq_max}"
|
|
227
263
|
subprocess.run(prompt, shell=True, check=True)
|
|
228
264
|
|
|
229
265
|
# Variant calling using VariantCaller class
|
|
230
|
-
|
|
266
|
+
|
|
267
|
+
def call_variant(experiment_name, experiment_folder, template_fasta, filtered_barcodes, threshold=0.5, oligopool=False):
|
|
231
268
|
try:
|
|
232
269
|
vc = VariantCaller(
|
|
233
270
|
experiment_name,
|
|
@@ -236,8 +273,9 @@ def call_variant(experiment_name, experiment_folder, template_fasta, filtered_ba
|
|
|
236
273
|
filtered_barcodes,
|
|
237
274
|
padding_start=0,
|
|
238
275
|
padding_end=0,
|
|
276
|
+
oligopool=oligopool
|
|
239
277
|
)
|
|
240
|
-
variant_df = vc.get_variant_df(threshold=
|
|
278
|
+
variant_df = vc.get_variant_df(threshold=threshold, min_depth=5)
|
|
241
279
|
logging.info("Variant calling to create consensus reads successful")
|
|
242
280
|
return variant_df
|
|
243
281
|
except Exception as e:
|
|
@@ -332,7 +370,7 @@ def create_df_v(variants_df):
|
|
|
332
370
|
|
|
333
371
|
# Create a copy for restructuring to avoid affecting the original
|
|
334
372
|
restructured_df = df_variants_.copy()
|
|
335
|
-
restructured_df.columns = restructured_df.columns.str.lower().str.replace('[\s-]', '_', regex=True)
|
|
373
|
+
restructured_df.columns = restructured_df.columns.str.lower().str.replace(r'[\s-]', '_', regex=True)
|
|
336
374
|
# Fix the specific column name
|
|
337
375
|
restructured_df.columns = restructured_df.columns.str.replace('p_adj._value', 'p_adj_value')
|
|
338
376
|
|
|
@@ -441,6 +479,63 @@ def save_csv(df, outputdir, name):
|
|
|
441
479
|
file_path = os.path.join(outputdir, "Results", name + ".csv")
|
|
442
480
|
df.to_csv(file_path)
|
|
443
481
|
|
|
482
|
+
# Function to process the reference CSV and generate variants
|
|
483
|
+
def process_ref_csv_oligopool(cl_args, tqdm_fn=tqdm.tqdm):
|
|
484
|
+
ref_df = pd.read_csv(cl_args["summary"])
|
|
485
|
+
result_folder = create_result_folder(cl_args)
|
|
486
|
+
variant_csv_path = os.path.join(result_folder, "variants.csv")
|
|
487
|
+
variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
|
|
488
|
+
|
|
489
|
+
# First get the different barcode plates (these will be unique)
|
|
490
|
+
barcode_plates = ref_df["barcode_plate"].unique()
|
|
491
|
+
ref_df["barcode_index"] = [i for i in range(len(ref_df))]
|
|
492
|
+
barcode_to_index = dict(zip(ref_df.barcode_plate, ref_df.barcode_index))
|
|
493
|
+
for barcode_plate in barcode_plates:
|
|
494
|
+
if not cl_args["skip_demultiplexing"]:
|
|
495
|
+
i = barcode_to_index[barcode_plate]
|
|
496
|
+
name_folder = os.path.join(result_folder, f'RB{barcode_plate}')
|
|
497
|
+
os.makedirs(name_folder, exist_ok=True)
|
|
498
|
+
barcode_path = filter_bc(cl_args, name_folder, i)
|
|
499
|
+
output_dir = Path(result_folder) / f"{cl_args['name']}_fastq"
|
|
500
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
501
|
+
|
|
502
|
+
file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)
|
|
503
|
+
try:
|
|
504
|
+
demux_fastq(output_dir, name_folder, barcode_path)
|
|
505
|
+
except Exception as e:
|
|
506
|
+
logging.error("An error occurred during demultiplexing for sample {}. Skipping this sample.".format(barcode_plate), exc_info=True)
|
|
507
|
+
continue
|
|
508
|
+
# Check this - need to see if the code works... ToDo: Ariane
|
|
509
|
+
# Now they are all demultiplexed, we can call variants
|
|
510
|
+
if not cl_args["skip_variantcalling"]:
|
|
511
|
+
for i, row in tqdm_fn(ref_df.iterrows(), total=len(ref_df), desc="Processing Samples"):
|
|
512
|
+
barcode_plate = row["barcode_plate"]
|
|
513
|
+
name = row["name"]
|
|
514
|
+
refseq = row["refseq"].upper()
|
|
515
|
+
# Get the name folder and barcode path
|
|
516
|
+
temp_fasta_path = os.path.join(result_folder, f"temp_{name}.fasta")
|
|
517
|
+
if not os.path.exists(temp_fasta_path):
|
|
518
|
+
with open(temp_fasta_path, "w") as f:
|
|
519
|
+
f.write(f">{name}\n{refseq}\n")
|
|
520
|
+
else:
|
|
521
|
+
logging.info(f"Fasta file for {name} already exists. Skipping write.")
|
|
522
|
+
try:
|
|
523
|
+
filtered_barcodes = filter_bc(cl_args, result_folder, i)
|
|
524
|
+
variant_result = call_variant(f"{name}", result_folder, temp_fasta_path, filtered_barcodes,
|
|
525
|
+
oligopool=True)
|
|
526
|
+
variant_result["barcode_plate"] = barcode_plate
|
|
527
|
+
variant_result["name"] = name
|
|
528
|
+
variant_result["refseq"] = refseq
|
|
529
|
+
variant_df = pd.concat([variant_df, variant_result])
|
|
530
|
+
except Exception as e:
|
|
531
|
+
logging.error("An error occurred during variant calling for sample {}. Skipping this sample.".format(name), exc_info=True)
|
|
532
|
+
continue
|
|
533
|
+
|
|
534
|
+
variant_df.to_csv(variant_csv_path, index=False)
|
|
535
|
+
# visualize it as well
|
|
536
|
+
return variant_df, ref_df
|
|
537
|
+
|
|
538
|
+
|
|
444
539
|
# Function to process the reference CSV and generate variants
|
|
445
540
|
def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
|
|
446
541
|
ref_df = pd.read_csv(cl_args["summary"])
|
|
@@ -493,8 +588,9 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
|
|
|
493
588
|
|
|
494
589
|
if not cl_args["skip_variantcalling"]:
|
|
495
590
|
try:
|
|
591
|
+
threshold = cl_args.get("threshold") if cl_args.get("threshold") is not None else 0.5
|
|
496
592
|
variant_result = call_variant(
|
|
497
|
-
f"{name}", name_folder, temp_fasta_path, barcode_path
|
|
593
|
+
f"{name}", name_folder, temp_fasta_path, barcode_path, threshold=threshold
|
|
498
594
|
)
|
|
499
595
|
variant_result["barcode_plate"] = barcode_plate
|
|
500
596
|
variant_result["name"] = name
|
|
@@ -508,6 +604,7 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
|
|
|
508
604
|
variant_df.to_csv(variant_csv_path, index=False)
|
|
509
605
|
return variant_df, ref_df
|
|
510
606
|
|
|
607
|
+
|
|
511
608
|
# Main function to run LevSeq and ensure saving of intermediate results if an error occurs
|
|
512
609
|
def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
|
|
513
610
|
result_folder = create_result_folder(cl_args)
|
|
@@ -515,13 +612,16 @@ def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
|
|
|
515
612
|
ref_folder = os.path.join(result_folder, "ref")
|
|
516
613
|
os.makedirs(ref_folder, exist_ok=True)
|
|
517
614
|
|
|
518
|
-
configure_logging(result_folder)
|
|
519
|
-
logging.info("Logging configured. Starting
|
|
615
|
+
configure_logging(result_folder, cl_args)
|
|
616
|
+
logging.info("Logging configured. Starting analysis...")
|
|
520
617
|
|
|
521
618
|
variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
|
|
522
|
-
|
|
619
|
+
|
|
523
620
|
try:
|
|
524
|
-
|
|
621
|
+
if cl_args["oligopool"]:
|
|
622
|
+
variant_df, ref_df = process_ref_csv_oligopool(cl_args, tqdm_fn)
|
|
623
|
+
else:
|
|
624
|
+
variant_df, ref_df = process_ref_csv(cl_args, tqdm_fn)
|
|
525
625
|
ref_df_path = os.path.join(ref_folder, cl_args["name"]+".csv")
|
|
526
626
|
ref_df.to_csv(ref_df_path, index=False)
|
|
527
627
|
|
|
@@ -544,6 +644,8 @@ def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
|
|
|
544
644
|
df_variants, df_vis = create_df_v(variant_df)
|
|
545
645
|
processed_csv = os.path.join(result_folder, "visualization_partial.csv")
|
|
546
646
|
df_vis.to_csv(processed_csv, index=False)
|
|
647
|
+
if cl_args["oligopool"]:
|
|
648
|
+
make_oligopool_plates(df_vis, result_folder=result_folder, save_files=True)
|
|
547
649
|
except Exception as e:
|
|
548
650
|
processed_csv = os.path.join(result_folder, "visualization_partial.csv")
|
|
549
651
|
if 'df_vis' in locals():
|
|
@@ -59,12 +59,13 @@ def translate(seq):
|
|
|
59
59
|
'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
|
|
60
60
|
'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*',
|
|
61
61
|
'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W',
|
|
62
|
+
'GTS': "X"
|
|
62
63
|
}
|
|
63
64
|
protein = ""
|
|
64
65
|
if len(seq) % 3 == 0:
|
|
65
66
|
for i in range(0, len(seq), 3):
|
|
66
67
|
codon = seq[i:i + 3]
|
|
67
|
-
protein += table
|
|
68
|
+
protein += table.get(codon, 'X')
|
|
68
69
|
return protein
|
|
69
70
|
|
|
70
71
|
|
|
@@ -290,8 +291,7 @@ def get_reads_for_well(parent_name, bam_file_path: str, ref_str: str, msa_path=N
|
|
|
290
291
|
insert_map = defaultdict(list)
|
|
291
292
|
for read in bam.fetch(until_eof=True):
|
|
292
293
|
# Ensure we have at least 75% coverage
|
|
293
|
-
if read.query_sequence is not None and len(read.query_sequence) > 0.75 * len(
|
|
294
|
-
ref_str) and read.cigartuples is not None:
|
|
294
|
+
if read.query_sequence is not None and read.cigartuples is not None: # and len(read.query_sequence) > 0.75 * len(ref_str) and read.cigartuples is not None:
|
|
295
295
|
seq, ref, qual, ins = alignment_from_cigar(read.cigartuples, read.query_sequence, ref_str,
|
|
296
296
|
read.query_qualities)
|
|
297
297
|
# Make it totally align
|
|
@@ -313,16 +313,17 @@ def get_reads_for_well(parent_name, bam_file_path: str, ref_str: str, msa_path=N
|
|
|
313
313
|
# Do this for all wells
|
|
314
314
|
seq_df = make_well_df_from_reads(seqs, read_ids, read_quals)
|
|
315
315
|
alignment_count = len(seq_df.values)
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
316
|
+
if alignment_count > 0:
|
|
317
|
+
rows_all = make_row_from_read_pileup_across_well(seq_df, ref_str, parent_name, insert_map)
|
|
318
|
+
bam.close()
|
|
319
|
+
|
|
320
|
+
if len(rows_all) > 2: # Check if we have anything to return
|
|
321
|
+
seq_df = pd.DataFrame(rows_all)
|
|
322
|
+
seq_df.columns = ['gene_name', 'position', 'ref', 'most_frequent', 'freq_non_ref', 'total_other',
|
|
323
|
+
'total_reads', 'p_value', 'percent_most_freq_mutation', 'A', 'p(a)', 'T', 'p(t)', 'G', 'p(g)',
|
|
324
|
+
'C', 'p(c)', 'N', 'p(n)', 'I', 'p(i)', 'Warnings']
|
|
325
|
+
return calculate_mutation_significance_across_well(seq_df), alignment_count
|
|
326
|
+
return None, 0
|
|
326
327
|
def make_row_from_read_pileup_across_well(well_df, ref_str, label, insert_map):
|
|
327
328
|
"""
|
|
328
329
|
Given a pileup of reads, we want to get some summary information about that sequence
|