levseq 1.4.0__tar.gz → 1.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- levseq-1.4.2/PKG-INFO +202 -0
- levseq-1.4.2/README.md +155 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/__init__.py +1 -1
- levseq-1.4.2/levseq/interface.py +150 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/run_levseq.py +44 -7
- {levseq-1.4.0 → levseq-1.4.2}/levseq/variantcaller.py +3 -3
- levseq-1.4.2/levseq.egg-info/PKG-INFO +202 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq.egg-info/SOURCES.txt +1 -0
- {levseq-1.4.0 → levseq-1.4.2}/setup.py +1 -0
- levseq-1.4.2/tests/test_copy_fastq.py +75 -0
- {levseq-1.4.0 → levseq-1.4.2}/tests/test_deploy.py +2 -1
- levseq-1.4.0/PKG-INFO +0 -201
- levseq-1.4.0/README.md +0 -154
- levseq-1.4.0/levseq/interface.py +0 -88
- levseq-1.4.0/levseq.egg-info/PKG-INFO +0 -201
- {levseq-1.4.0 → levseq-1.4.2}/LICENSE +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/MANIFEST.in +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/IO_processor.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/barcoding/__init__.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/barcoding/demultiplex +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/barcoding/demultiplex-arm64 +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/barcoding/demultiplex-x86 +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/barcoding/minion_barcodes.fasta +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/basecaller.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/cmd.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/coordinates.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/filter_orientation.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/globals.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/parser.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/screen.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/seqfit.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/simulation.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/user.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/utils.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq/visualization.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq.egg-info/dependency_links.txt +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq.egg-info/entry_points.txt +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq.egg-info/requires.txt +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/levseq.egg-info/top_level.txt +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/setup.cfg +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/tests/test_demultiplex_docker.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/tests/test_opligopools.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/tests/test_seqfitvis.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/tests/test_seqs.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/tests/test_statistics.py +0 -0
- {levseq-1.4.0 → levseq-1.4.2}/tests/test_variant_calling.py +0 -0
levseq-1.4.2/PKG-INFO
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: levseq
|
|
3
|
+
Version: 1.4.2
|
|
4
|
+
Home-page: https://github.com/fhalab/levseq/
|
|
5
|
+
Author: Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy
|
|
6
|
+
Author-email: ylong@caltech.edu
|
|
7
|
+
License: GPL3
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/fhalab/levseq/
|
|
9
|
+
Project-URL: Documentation, https://github.com/fhalab/levseq/
|
|
10
|
+
Project-URL: Source Code, https://github.com/fhalab/levseq/
|
|
11
|
+
Keywords: Nanopore,ONT,evSeq
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
15
|
+
Classifier: Natural Language :: English
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
21
|
+
Requires-Python: >=3.8
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: Bio
|
|
25
|
+
Requires-Dist: biopython
|
|
26
|
+
Requires-Dist: fsspec
|
|
27
|
+
Requires-Dist: h5py
|
|
28
|
+
Requires-Dist: holoviews
|
|
29
|
+
Requires-Dist: jupyterlab
|
|
30
|
+
Requires-Dist: mappy
|
|
31
|
+
Requires-Dist: matplotlib
|
|
32
|
+
Requires-Dist: ninetysix
|
|
33
|
+
Requires-Dist: numpy
|
|
34
|
+
Requires-Dist: pandas
|
|
35
|
+
Requires-Dist: pybedtools
|
|
36
|
+
Requires-Dist: pycoQC
|
|
37
|
+
Requires-Dist: pyfaidx
|
|
38
|
+
Requires-Dist: pyparsing
|
|
39
|
+
Requires-Dist: pysam
|
|
40
|
+
Requires-Dist: scipy
|
|
41
|
+
Requires-Dist: sciutil
|
|
42
|
+
Requires-Dist: seaborn
|
|
43
|
+
Requires-Dist: scikit-learn
|
|
44
|
+
Requires-Dist: statsmodels
|
|
45
|
+
Requires-Dist: tqdm
|
|
46
|
+
Requires-Dist: biopandas
|
|
47
|
+
|
|
48
|
+
# Variant Sequencing with Nanopore (LevSeq)
|
|
49
|
+
|
|
50
|
+
LevSeq provides a streamlined pipeline for sequencing and analyzing genetic variants using Oxford Nanopore technology. In directed evolution experiments, LevSeq enables sequencing of every variant, enhancing data insight and creating datasets suitable for AI/ML methods. Sequence variants can be generated within a day at an extremely low cost.
|
|
51
|
+
|
|
52
|
+

|
|
53
|
+
Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
|
|
54
|
+
|
|
55
|
+
## Quick Start
|
|
56
|
+
|
|
57
|
+
### Docker Installation (Recommended)
|
|
58
|
+
|
|
59
|
+
1. Install Docker: [https://docs.docker.com/engine/install/](https://docs.docker.com/engine/install/)
|
|
60
|
+
2. Pull the appropriate image:
|
|
61
|
+
```bash
|
|
62
|
+
# For Linux/Windows x86 systems:
|
|
63
|
+
docker pull yueminglong/levseq:levseq-1.4-x86
|
|
64
|
+
|
|
65
|
+
# For Mac M-series chips (M1, M2, M3, M4):
|
|
66
|
+
docker pull yueminglong/levseq:levseq-1.4-arm64
|
|
67
|
+
```
|
|
68
|
+
3. Run LevSeq:
|
|
69
|
+
```bash
|
|
70
|
+
docker run --rm -v "/full/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
|
|
71
|
+
```
|
|
72
|
+
4. Connect function data to your sequence data
|
|
73
|
+
```bash
|
|
74
|
+
docker run --rm -v "/full/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv --fitness_files "levseq_results/20250712_epPCR_Q06714_37.csv,levseq_results/20250712_epPCR_Q06714_39.csv,levseq_results/20250712_epPCR_Q06714_40.csv" --smiles 'O=P(OC1=CC=CC=C1)(OC2=CC=CC=C2)OC3=CC=CC=C3>>O=P(O)(OC4=CC=CC=C4)OC5=CC=CC=C5' --compound dPPi --variant_df "levseq_results/visualization_partial.csv"
|
|
75
|
+
```
|
|
76
|
+
### Pip Installation (Mac/Linux only)
|
|
77
|
+
|
|
78
|
+
**IMPORTANT**: On Mac M-series chips (M1-M4), gcc 13 and 14 are **REQUIRED**:
|
|
79
|
+
```bash
|
|
80
|
+
brew install gcc@13 gcc@14
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
1. Create and activate conda environment:
|
|
84
|
+
```bash
|
|
85
|
+
conda create --name levseq python=3.12 -y
|
|
86
|
+
conda activate levseq
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
2. Install dependencies:
|
|
90
|
+
```bash
|
|
91
|
+
conda install -c bioconda -c conda-forge samtools minimap2
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
3. Install LevSeq:
|
|
95
|
+
```bash
|
|
96
|
+
pip install levseq
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
4. Run LevSeq:
|
|
100
|
+
```bash
|
|
101
|
+
levseq my_experiment /path/to/data/ /path/to/ref.csv
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
5. Combine function data:
|
|
105
|
+
```bash
|
|
106
|
+
levseq my_experiment /path/to/data/ /path/to/ref.csv --fitness_files "LCMS_file_{barcode1}.csv,LCMS_file_{barcode2}.csv" --smiles 'reaction_smiles_string' --compound "name_of_compound_in_LCMS_file" --variant_df "visualization_partial.csv"
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Note: for function data we currently expect an LCMS file, e.g. with the columns:
|
|
110
|
+
- `Sample Vial Number` (corresponding to the well that the sample was from).
|
|
111
|
+
- `Area` (which becomes fitness value).
|
|
112
|
+
- `Compound Name` which is the name of the compound we filter for that is passed as a parameter.
|
|
113
|
+
- The file name must end in `_X.csv`, where `X` is the barcode number used to match that file to your plate. For example, if you ran LevSeq with barcode 33 for plate 2, the fitness file for plate 2 must end in `_33.csv`, e.g. `some_fitness_for_plate_2_33.csv`.
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
## Data and Visualization
|
|
117
|
+
|
|
118
|
+
- **Test Data**: Sample data is available on Zenodo: [10.5281/zenodo.13694463](https://doi.org/10.5281/zenodo.13694463)
|
|
119
|
+
- **Visualization Tool**: A web application is available at [https://levseqdb.streamlit.app/](https://levseqdb.streamlit.app/) - simply upload your LevSeq output and LCMS results
|
|
120
|
+
- **Self-hosted Solution**: You can deploy your own instance using our [LevSeq_db repository](https://github.com/fhalab/LevSeq_db)
|
|
121
|
+
|
|
122
|
+
## Reference File Format (ref.csv)
|
|
123
|
+
|
|
124
|
+
Your reference CSV file must contain the following columns:
|
|
125
|
+
|
|
126
|
+
| barcode_plate | name | refseq |
|
|
127
|
+
|---------------|--------|-----------|
|
|
128
|
+
| 33 | Q97A76 | ATGCGC... |
|
|
129
|
+
|
|
130
|
+
For oligopool experiments (multiple proteins per plate), use:
|
|
131
|
+
|
|
132
|
+
| barcode_plate | name | refseq |
|
|
133
|
+
|---------------|--------|-----------|
|
|
134
|
+
| 33 | Q97A76 | ATGCGCAAG |
|
|
135
|
+
| 33 | P96084 | ATGGATCA |
|
|
136
|
+
| 34 | P46209 | ATGGGGCAA |
|
|
137
|
+
| 34 | Q60336 | ATGGGGCC |
|
|
138
|
+
|
|
139
|
+
## Command Line Arguments
|
|
140
|
+
|
|
141
|
+
### Required Arguments
|
|
142
|
+
1. **name**: Name of the experiment (output folder)
|
|
143
|
+
2. **path**: Location of basecalled fastq files
|
|
144
|
+
3. **summary**: Path to reference CSV file
|
|
145
|
+
|
|
146
|
+
### Optional Arguments
|
|
147
|
+
- `--skip_demultiplexing`: Skip the demultiplexing step
|
|
148
|
+
- `--skip_variantcalling`: Skip the variant calling step
|
|
149
|
+
- `--output`: Custom save location (defaults to current directory)
|
|
150
|
+
- `--show_msa`: Show multiple sequence alignment for each well
|
|
151
|
+
- `--oligopool`: Process data as oligopool experiment
|
|
152
|
+
|
|
153
|
+
## Step-by-Step Tutorial
|
|
154
|
+
|
|
155
|
+
1. **Prepare your sequencing data**:
|
|
156
|
+
- Your fastq files should be in a directory structure similar to Nanopore's output
|
|
157
|
+
- Prepare a reference CSV file with barcode plates, sample names, and reference sequences
|
|
158
|
+
|
|
159
|
+
2. **Run LevSeq**:
|
|
160
|
+
```bash
|
|
161
|
+
# Via Docker
|
|
162
|
+
docker run --rm -v "/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
|
|
163
|
+
|
|
164
|
+
# Via pip
|
|
165
|
+
levseq my_experiment /path/to/data/ /path/to/ref.csv
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
3. **Analyze results**:
|
|
169
|
+
- Output includes variant data (CSV) and interactive visualizations (HTML)
|
|
170
|
+
- Upload results to the LevSeq visualization tool for further analysis
|
|
171
|
+
|
|
172
|
+
## Experimental Setup
|
|
173
|
+
|
|
174
|
+
For the wet lab protocol:
|
|
175
|
+
- Refer to the [wiki](https://github.com/fhalab/LevSeq/wiki/Experimental-protocols)
|
|
176
|
+
- See the methods section of [our paper](https://pubs.acs.org/doi/10.1021/acssynbio.4c00625)
|
|
177
|
+
- Order forward and reverse primers compatible with your plasmid
|
|
178
|
+
- Install Oxford Nanopore's software for basecalling if needed
|
|
179
|
+
|
|
180
|
+
## Additional Resources
|
|
181
|
+
|
|
182
|
+
- **Example Notebook**: See `example/Example.ipynb` for a walkthrough
|
|
183
|
+
- **Advanced Usage**: See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb)
|
|
184
|
+
- **Troubleshooting**: See our [computational protocols wiki](https://github.com/fhalab/LevSeq/wiki/Computational-protocols)
|
|
185
|
+
|
|
186
|
+
## Citing LevSeq
|
|
187
|
+
|
|
188
|
+
If you find LevSeq useful, please cite our paper:
|
|
189
|
+
|
|
190
|
+
```bibtex
|
|
191
|
+
@article{long2024levseq,
|
|
192
|
+
title={LevSeq: Rapid Generation of Sequence-Function Data for Directed Evolution and Machine Learning},
|
|
193
|
+
author={Long, Yueming and Mora, Ariane and Li, Francesca-Zhoufan and Gürsoy, Emre and Johnston, Kadina E and Arnold, Frances H},
|
|
194
|
+
journal={ACS Synthetic Biology},
|
|
195
|
+
year={2024},
|
|
196
|
+
publisher={American Chemical Society}
|
|
197
|
+
}
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
## Contact
|
|
201
|
+
|
|
202
|
+
Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
|
levseq-1.4.2/README.md
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# Variant Sequencing with Nanopore (LevSeq)
|
|
2
|
+
|
|
3
|
+
LevSeq provides a streamlined pipeline for sequencing and analyzing genetic variants using Oxford Nanopore technology. In directed evolution experiments, LevSeq enables sequencing of every variant, enhancing data insight and creating datasets suitable for AI/ML methods. Sequence variants can be generated within a day at an extremely low cost.
|
|
4
|
+
|
|
5
|
+

|
|
6
|
+
Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
|
|
7
|
+
|
|
8
|
+
## Quick Start
|
|
9
|
+
|
|
10
|
+
### Docker Installation (Recommended)
|
|
11
|
+
|
|
12
|
+
1. Install Docker: [https://docs.docker.com/engine/install/](https://docs.docker.com/engine/install/)
|
|
13
|
+
2. Pull the appropriate image:
|
|
14
|
+
```bash
|
|
15
|
+
# For Linux/Windows x86 systems:
|
|
16
|
+
docker pull yueminglong/levseq:levseq-1.4-x86
|
|
17
|
+
|
|
18
|
+
# For Mac M-series chips (M1, M2, M3, M4):
|
|
19
|
+
docker pull yueminglong/levseq:levseq-1.4-arm64
|
|
20
|
+
```
|
|
21
|
+
3. Run LevSeq:
|
|
22
|
+
```bash
|
|
23
|
+
docker run --rm -v "/full/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
|
|
24
|
+
```
|
|
25
|
+
4. Connect function data to your sequence data
|
|
26
|
+
```bash
|
|
27
|
+
docker run --rm -v "/full/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv --fitness_files "levseq_results/20250712_epPCR_Q06714_37.csv,levseq_results/20250712_epPCR_Q06714_39.csv,levseq_results/20250712_epPCR_Q06714_40.csv" --smiles 'O=P(OC1=CC=CC=C1)(OC2=CC=CC=C2)OC3=CC=CC=C3>>O=P(O)(OC4=CC=CC=C4)OC5=CC=CC=C5' --compound dPPi --variant_df "levseq_results/visualization_partial.csv"
|
|
28
|
+
```
|
|
29
|
+
### Pip Installation (Mac/Linux only)
|
|
30
|
+
|
|
31
|
+
**IMPORTANT**: On Mac M-series chips (M1-M4), gcc 13 and 14 are **REQUIRED**:
|
|
32
|
+
```bash
|
|
33
|
+
brew install gcc@13 gcc@14
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
1. Create and activate conda environment:
|
|
37
|
+
```bash
|
|
38
|
+
conda create --name levseq python=3.12 -y
|
|
39
|
+
conda activate levseq
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
2. Install dependencies:
|
|
43
|
+
```bash
|
|
44
|
+
conda install -c bioconda -c conda-forge samtools minimap2
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
3. Install LevSeq:
|
|
48
|
+
```bash
|
|
49
|
+
pip install levseq
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
4. Run LevSeq:
|
|
53
|
+
```bash
|
|
54
|
+
levseq my_experiment /path/to/data/ /path/to/ref.csv
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
5. Combine function data:
|
|
58
|
+
```bash
|
|
59
|
+
levseq my_experiment /path/to/data/ /path/to/ref.csv --fitness_files "LCMS_file_{barcode1}.csv,LCMS_file_{barcode2}.csv" --smiles 'reaction_smiles_string' --compound "name_of_compound_in_LCMS_file" --variant_df "visualization_partial.csv"
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Note: for function data we currently expect an LCMS file, e.g. with the columns:
|
|
63
|
+
- `Sample Vial Number` (corresponding to the well that the sample was from).
|
|
64
|
+
- `Area` (which becomes fitness value).
|
|
65
|
+
- `Compound Name` which is the name of the compound we filter for that is passed as a parameter.
|
|
66
|
+
- The file name must end in `_X.csv`, where `X` is the barcode number used to match that file to your plate. For example, if you ran LevSeq with barcode 33 for plate 2, the fitness file for plate 2 must end in `_33.csv`, e.g. `some_fitness_for_plate_2_33.csv`.
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
## Data and Visualization
|
|
70
|
+
|
|
71
|
+
- **Test Data**: Sample data is available on Zenodo: [10.5281/zenodo.13694463](https://doi.org/10.5281/zenodo.13694463)
|
|
72
|
+
- **Visualization Tool**: A web application is available at [https://levseqdb.streamlit.app/](https://levseqdb.streamlit.app/) - simply upload your LevSeq output and LCMS results
|
|
73
|
+
- **Self-hosted Solution**: You can deploy your own instance using our [LevSeq_db repository](https://github.com/fhalab/LevSeq_db)
|
|
74
|
+
|
|
75
|
+
## Reference File Format (ref.csv)
|
|
76
|
+
|
|
77
|
+
Your reference CSV file must contain the following columns:
|
|
78
|
+
|
|
79
|
+
| barcode_plate | name | refseq |
|
|
80
|
+
|---------------|--------|-----------|
|
|
81
|
+
| 33 | Q97A76 | ATGCGC... |
|
|
82
|
+
|
|
83
|
+
For oligopool experiments (multiple proteins per plate), use:
|
|
84
|
+
|
|
85
|
+
| barcode_plate | name | refseq |
|
|
86
|
+
|---------------|--------|-----------|
|
|
87
|
+
| 33 | Q97A76 | ATGCGCAAG |
|
|
88
|
+
| 33 | P96084 | ATGGATCA |
|
|
89
|
+
| 34 | P46209 | ATGGGGCAA |
|
|
90
|
+
| 34 | Q60336 | ATGGGGCC |
|
|
91
|
+
|
|
92
|
+
## Command Line Arguments
|
|
93
|
+
|
|
94
|
+
### Required Arguments
|
|
95
|
+
1. **name**: Name of the experiment (output folder)
|
|
96
|
+
2. **path**: Location of basecalled fastq files
|
|
97
|
+
3. **summary**: Path to reference CSV file
|
|
98
|
+
|
|
99
|
+
### Optional Arguments
|
|
100
|
+
- `--skip_demultiplexing`: Skip the demultiplexing step
|
|
101
|
+
- `--skip_variantcalling`: Skip the variant calling step
|
|
102
|
+
- `--output`: Custom save location (defaults to current directory)
|
|
103
|
+
- `--show_msa`: Show multiple sequence alignment for each well
|
|
104
|
+
- `--oligopool`: Process data as oligopool experiment
|
|
105
|
+
|
|
106
|
+
## Step-by-Step Tutorial
|
|
107
|
+
|
|
108
|
+
1. **Prepare your sequencing data**:
|
|
109
|
+
- Your fastq files should be in a directory structure similar to Nanopore's output
|
|
110
|
+
- Prepare a reference CSV file with barcode plates, sample names, and reference sequences
|
|
111
|
+
|
|
112
|
+
2. **Run LevSeq**:
|
|
113
|
+
```bash
|
|
114
|
+
# Via Docker
|
|
115
|
+
docker run --rm -v "/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
|
|
116
|
+
|
|
117
|
+
# Via pip
|
|
118
|
+
levseq my_experiment /path/to/data/ /path/to/ref.csv
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
3. **Analyze results**:
|
|
122
|
+
- Output includes variant data (CSV) and interactive visualizations (HTML)
|
|
123
|
+
- Upload results to the LevSeq visualization tool for further analysis
|
|
124
|
+
|
|
125
|
+
## Experimental Setup
|
|
126
|
+
|
|
127
|
+
For the wet lab protocol:
|
|
128
|
+
- Refer to the [wiki](https://github.com/fhalab/LevSeq/wiki/Experimental-protocols)
|
|
129
|
+
- See the methods section of [our paper](https://pubs.acs.org/doi/10.1021/acssynbio.4c00625)
|
|
130
|
+
- Order forward and reverse primers compatible with your plasmid
|
|
131
|
+
- Install Oxford Nanopore's software for basecalling if needed
|
|
132
|
+
|
|
133
|
+
## Additional Resources
|
|
134
|
+
|
|
135
|
+
- **Example Notebook**: See `example/Example.ipynb` for a walkthrough
|
|
136
|
+
- **Advanced Usage**: See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb)
|
|
137
|
+
- **Troubleshooting**: See our [computational protocols wiki](https://github.com/fhalab/LevSeq/wiki/Computational-protocols)
|
|
138
|
+
|
|
139
|
+
## Citing LevSeq
|
|
140
|
+
|
|
141
|
+
If you find LevSeq useful, please cite our paper:
|
|
142
|
+
|
|
143
|
+
```bibtex
|
|
144
|
+
@article{long2024levseq,
|
|
145
|
+
title={LevSeq: Rapid Generation of Sequence-Function Data for Directed Evolution and Machine Learning},
|
|
146
|
+
author={Long, Yueming and Mora, Ariane and Li, Francesca-Zhoufan and Gürsoy, Emre and Johnston, Kadina E and Arnold, Frances H},
|
|
147
|
+
journal={ACS Synthetic Biology},
|
|
148
|
+
year={2024},
|
|
149
|
+
publisher={American Chemical Society}
|
|
150
|
+
}
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Contact
|
|
154
|
+
|
|
155
|
+
Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
__title__ = 'levseq'
|
|
19
19
|
__description__ = 'LevSeq nanopore sequencing'
|
|
20
20
|
__url__ = 'https://github.com/fhalab/levseq/'
|
|
21
|
-
__version__ = '1.4.
|
|
21
|
+
__version__ = '1.4.2'
|
|
22
22
|
__author__ = 'Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy'
|
|
23
23
|
__author_email__ = 'ylong@caltech.edu'
|
|
24
24
|
__license__ = 'GPL3'
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
###############################################################################
|
|
2
|
+
# #
|
|
3
|
+
# This program is free software: you can redistribute it and/or modify #
|
|
4
|
+
# it under the terms of the GNU General Public License as published by #
|
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or #
|
|
6
|
+
# (at your option) any later version. #
|
|
7
|
+
# #
|
|
8
|
+
# This program is distributed in the hope that it will be useful, #
|
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
11
|
+
# GNU General Public License for more details. #
|
|
12
|
+
# #
|
|
13
|
+
# You should have received a copy of the GNU General Public License #
|
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>. #
|
|
15
|
+
# #
|
|
16
|
+
###############################################################################
|
|
17
|
+
"""
|
|
18
|
+
Contain argument parsers used for command line interface and web interface
|
|
19
|
+
"""
|
|
20
|
+
# Import packages
|
|
21
|
+
import os
|
|
22
|
+
import tqdm
|
|
23
|
+
import argparse
|
|
24
|
+
import pandas as pd
|
|
25
|
+
|
|
26
|
+
# Import local packages
|
|
27
|
+
from levseq.run_levseq import run_LevSeq
|
|
28
|
+
|
|
29
|
+
# Get the working directory
|
|
30
|
+
CWD = os.getcwd()  # captured once at import time; build_cli_parser uses it as the default for --output
|
|
31
|
+
|
|
32
|
+
# Set default arguments
# NOTE(review): nothing in the visible part of this module reads the five values below - confirm they are still used elsewhere.
|
|
33
|
+
padding_start = 0
|
|
34
|
+
padding_end = 0
|
|
35
|
+
min_depth = 5
|
|
36
|
+
threshold = 0.2
|
|
37
|
+
basecall_model = 'sup'
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Build the CLI argparser
|
|
41
|
+
def build_cli_parser():
    """Build the argument parser for the LevSeq command line interface.

    The parser takes three positional arguments (``name``, ``path``,
    ``summary``) and a set of optional flags. The options
    ``--fitness_files``, ``--smiles``, ``--compound`` and ``--variant_df``
    carry the inputs for combining sequence and function data.

    Returns:
        argparse.ArgumentParser: the configured parser; call ``parse_args()``
        on it to obtain the run configuration.
    """
    parser = argparse.ArgumentParser()

    # Positional arguments that every run needs
    required_args_group = parser.add_argument_group("Required Arguments", "Arguments required for each run")
    required_args_group.add_argument('name',
                                     help='User defined name for the output folder')
    required_args_group.add_argument("path",
                                     help="Path to folder containing fastq.pass or pod5_pass files.")
    required_args_group.add_argument("summary",
                                     help="CSV file containing barcodes used, name of each plate and reference sequence in string")

    # Optional arguments
    optional_args_group = parser.add_argument_group("Optional Arguments", "Additional arguments")
    optional_args_group.add_argument("--output",
                                     help="Save location for run. Defaults to current working directory.",
                                     required=False,
                                     default=CWD)
    # The flag turns basecalling ON, so the help text must say "perform", not "skip".
    optional_args_group.add_argument("--perform_basecalling",
                                     action="store_true",
                                     help="Perform the basecalling step, default is false")
    optional_args_group.add_argument("--skip_demultiplexing",
                                     action="store_true",
                                     help="Skip the demultiplexing step, default is false")
    optional_args_group.add_argument("--skip_variantcalling",
                                     action="store_true",
                                     help="Skip the variant calling step, default is false")
    optional_args_group.add_argument("--oligopool",
                                     action="store_true",
                                     help="Whether this experiment came from an oligopool, default is false.")
    # nargs='?' lets the bare flag (`--show_msa`) work and yield True, while an
    # explicit value (`--show_msa <value>`) is still accepted and passed through
    # unchanged, exactly as before.
    optional_args_group.add_argument("--show_msa",
                                     nargs='?',
                                     const=True,
                                     default=False,
                                     help="Show the multiple sequence alignment for each well, default is false")

    # Inputs for combining sequence data with function (fitness) data
    optional_args_group.add_argument("--fitness_files",
                                     default=None,
                                     help="A comma separated list of fitness files (full path) with string quotation marks around them.")
    optional_args_group.add_argument("--smiles",
                                     default=None,
                                     help="A smiles string of the reaction with quotation marks around.")
    optional_args_group.add_argument("--compound",
                                     default=None,
                                     help="The compound in the fitness files (e.g. pDT or pdt - case sensitive).")
    optional_args_group.add_argument("--variant_df",
                                     default=None,
                                     help="The variant dataframe to combine with fitness data.")
    return parser
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def combine_seq_func_data(cl_args):
    """Join LevSeq variant calls with per-plate function (fitness) measurements.

    Args:
        cl_args: Mapping of parsed command line arguments. The keys read are
            ``variant_df`` (path of the variant CSV), ``fitness_files`` (comma
            separated paths of the fitness CSV files), ``smiles`` (reaction
            SMILES string) and ``compound`` (value of ``Compound Name`` to
            keep; falls back to ``'pdt'`` when empty).

    Returns:
        The variant table left-joined with the selected fitness rows and
        indexed by ``barcode_well``, or ``None`` when ``fitness_files``,
        ``smiles`` or ``variant_df`` is missing.

    Raises:
        ValueError: if ``fitness_files`` contains no file name at all.

    Side effects:
        Writes the combined table to ``<variant file without extension>_seqfunc.csv``
        and prints progress information to stdout.
    """
    variant_filename = cl_args.get('variant_df')
    function_files = cl_args.get('fitness_files')
    reaction_smiles = cl_args.get('smiles')
    # All three inputs are needed; without them there is nothing to combine.
    if not (function_files and reaction_smiles and variant_filename):
        return None

    variant_df = pd.read_csv(variant_filename)
    compound_name = cl_args.get('compound') or 'pdt'
    print(function_files, compound_name)

    function_frames = []
    for function_file in function_files.split(','):
        # Tolerate spaces after commas and a trailing comma in the list.
        function_file = function_file.strip()
        if not function_file:
            continue
        # The barcode is the text after the last underscore, before ".csv".
        barcode = function_file.split('.csv')[0].split('_')[-1]
        function_df = pd.read_csv(function_file)
        function_df.columns = [c.replace('\n', ' ') for c in function_df.columns]
        # The well is whatever follows the last '-' of the vial number.
        function_df['function_well'] = [x.split('-')[-1] if isinstance(x, str) else None
                                        for x in function_df['Sample Vial Number'].values]
        function_df['function_barcode_plate'] = barcode
        # Keep only the requested compound; copy so the assignments below do
        # not write into a view of the unfiltered frame.
        function_df = function_df[function_df['Compound Name'] == compound_name].copy()
        # Non-numeric areas become NaN instead of raising.
        function_df['Area'] = pd.to_numeric(function_df['Area'], errors='coerce')
        function_df['barcode_well'] = [f'{p}_{w}' for w, p in
                                       function_df[['function_well', 'function_barcode_plate']].values]
        function_df['filename'] = function_file
        print(function_df.head())
        function_frames.append(function_df)

    if not function_frames:
        raise ValueError('No fitness files were provided in --fitness_files.')
    # Concatenate once instead of growing a frame inside the loop.
    all_function_df = pd.concat(function_frames)

    # Both tables are keyed on "<barcode plate>_<well>" for the join.
    variant_df['barcode_well'] = [f'{p}_{w}' for w, p in variant_df[['Well', 'barcode_plate']].values]
    variant_df.set_index('barcode_well', inplace=True)
    all_function_df.set_index('barcode_well', inplace=True)
    variant_df = variant_df.join(all_function_df, how='left')

    # The product side of the reaction SMILES is stored separately.
    variant_df['smiles_string'] = reaction_smiles.split('>>')[-1]
    variant_df['reaction_smiles'] = reaction_smiles
    variant_df.columns = [c.lower().replace(' ', '_') for c in variant_df.columns]
    variant_df.rename(columns={'area': 'fitness_value'}, inplace=True)

    # Build the output name from the extension only, so the input file is never
    # overwritten and directory names containing ".csv" are left untouched.
    output_filename = f'{os.path.splitext(variant_filename)[0]}_seqfunc.csv'
    variant_df.to_csv(output_filename)
    return variant_df
|
|
130
|
+
|
|
131
|
+
# Execute LevSeq
|
|
132
|
+
def execute_LevSeq():
    """Command line entry point: parse the arguments and dispatch the run.

    When ``fitness_files``, ``smiles`` and ``variant_df`` are all supplied,
    only the sequence/function combination is performed and its result is
    returned. Otherwise the LevSeq pipeline is run; any ``Exception`` it
    raises is printed together with the run folder instead of propagating,
    and nothing is returned.
    """
    args = vars(build_cli_parser().parse_args())

    def run_location():
        # Folder that holds the results and logs of this run
        return os.path.join(args.get('output', CWD), args.get('name', ''))

    # Combination mode needs all three inputs
    if all(args.get(key) for key in ('fitness_files', 'smiles', 'variant_df')):
        print('Combining LevSeq')
        return combine_seq_func_data(args)

    # Progress bar factory handed to the pipeline
    progress_bar = tqdm.tqdm
    try:
        from levseq import __version__
        print(f"Starting LevSeq v{__version__}...")
        run_LevSeq(args, progress_bar)
        print(f"Run completed successfully. Results and logs stored in {run_location()}")
    except Exception as e:
        print(f"Error: {e}")
        print(f"Check error logs for details in {run_location()}")
|
|
@@ -66,8 +66,14 @@ from importlib import resources
|
|
|
66
66
|
from holoviews.streams import Tap
|
|
67
67
|
|
|
68
68
|
# Utility function to configure logging
|
|
69
|
-
def configure_logging(result_folder):
|
|
70
|
-
|
|
69
|
+
def configure_logging(result_folder, cl_args):
|
|
70
|
+
import sys
|
|
71
|
+
from levseq import __version__
|
|
72
|
+
|
|
73
|
+
# Define a more detailed log format with clean separation
|
|
74
|
+
log_format = "%(asctime)s : %(levelname)s : %(message)s"
|
|
75
|
+
|
|
76
|
+
# Create log handlers
|
|
71
77
|
info_handler = logging.FileHandler(os.path.join(result_folder, "LevSeq_run.log"))
|
|
72
78
|
info_handler.setLevel(logging.INFO)
|
|
73
79
|
info_handler.setFormatter(logging.Formatter(log_format))
|
|
@@ -76,7 +82,30 @@ def configure_logging(result_folder):
|
|
|
76
82
|
error_handler.setLevel(logging.ERROR)
|
|
77
83
|
error_handler.setFormatter(logging.Formatter(log_format))
|
|
78
84
|
|
|
85
|
+
# Set up basic configuration with both handlers
|
|
79
86
|
logging.basicConfig(level=logging.INFO, handlers=[info_handler, error_handler])
|
|
87
|
+
|
|
88
|
+
# Log version information and command used to run
|
|
89
|
+
command_used = " ".join(sys.argv)
|
|
90
|
+
logging.info(f"LevSeq Version: {__version__}")
|
|
91
|
+
logging.info(f"Command: {command_used}")
|
|
92
|
+
|
|
93
|
+
# Log essential run parameters
|
|
94
|
+
logging.info(f"Run name: {cl_args.get('name', 'Not specified')}")
|
|
95
|
+
logging.info(f"Input path: {cl_args.get('path', 'Not specified')}")
|
|
96
|
+
logging.info(f"Summary file: {cl_args.get('summary', 'Not specified')}")
|
|
97
|
+
|
|
98
|
+
# Log optional parameters if specified
|
|
99
|
+
if cl_args.get('output') and cl_args.get('output') != os.getcwd():
|
|
100
|
+
logging.info(f"Output directory: {cl_args.get('output')}")
|
|
101
|
+
if cl_args.get('oligopool'):
|
|
102
|
+
logging.info("Running in oligopool mode")
|
|
103
|
+
if cl_args.get('skip_demultiplexing'):
|
|
104
|
+
logging.info("Skipping demultiplexing step")
|
|
105
|
+
if cl_args.get('skip_variantcalling'):
|
|
106
|
+
logging.info("Skipping variant calling step")
|
|
107
|
+
if cl_args.get('threshold'):
|
|
108
|
+
logging.info(f"Using variant threshold: {cl_args.get('threshold')}")
|
|
80
109
|
|
|
81
110
|
# Create result folder
|
|
82
111
|
def create_result_folder(cl_args):
|
|
@@ -139,8 +168,15 @@ def cat_fastq_files(folder_path: str, output_path: str, reads_per_file: int = 40
|
|
|
139
168
|
else:
|
|
140
169
|
for fastq_file in fastq_files:
|
|
141
170
|
destination = output_path / fastq_file.name
|
|
142
|
-
|
|
143
|
-
|
|
171
|
+
# Skip copying if source and destination are identical
|
|
172
|
+
if str(fastq_file) == str(destination):
|
|
173
|
+
logging.info("Skipping copy of %s (source and destination are identical)", fastq_file)
|
|
174
|
+
continue
|
|
175
|
+
try:
|
|
176
|
+
shutil.copy(fastq_file, destination)
|
|
177
|
+
logging.info("Copied %s to %s", fastq_file, destination)
|
|
178
|
+
except shutil.SameFileError:
|
|
179
|
+
logging.info("Skipping copy of %s (source and destination are identical files)", fastq_file)
|
|
144
180
|
logging.info("All FASTQ files processed successfully to %s", output_path)
|
|
145
181
|
return str(output_path)
|
|
146
182
|
except Exception as e:
|
|
@@ -334,7 +370,7 @@ def create_df_v(variants_df):
|
|
|
334
370
|
|
|
335
371
|
# Create a copy for restructuring to avoid affecting the original
|
|
336
372
|
restructured_df = df_variants_.copy()
|
|
337
|
-
restructured_df.columns = restructured_df.columns.str.lower().str.replace('[\s-]', '_', regex=True)
|
|
373
|
+
restructured_df.columns = restructured_df.columns.str.lower().str.replace(r'[\s-]', '_', regex=True)
|
|
338
374
|
# Fix the specific column name
|
|
339
375
|
restructured_df.columns = restructured_df.columns.str.replace('p_adj._value', 'p_adj_value')
|
|
340
376
|
|
|
@@ -566,6 +602,7 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
|
|
|
566
602
|
continue
|
|
567
603
|
|
|
568
604
|
variant_df.to_csv(variant_csv_path, index=False)
|
|
605
|
+
|
|
569
606
|
return variant_df, ref_df
|
|
570
607
|
|
|
571
608
|
|
|
@@ -576,8 +613,8 @@ def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
|
|
|
576
613
|
ref_folder = os.path.join(result_folder, "ref")
|
|
577
614
|
os.makedirs(ref_folder, exist_ok=True)
|
|
578
615
|
|
|
579
|
-
configure_logging(result_folder)
|
|
580
|
-
logging.info("Logging configured. Starting
|
|
616
|
+
configure_logging(result_folder, cl_args)
|
|
617
|
+
logging.info("Logging configured. Starting analysis...")
|
|
581
618
|
|
|
582
619
|
variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
|
|
583
620
|
|
|
@@ -141,15 +141,15 @@ class VariantCaller:
|
|
|
141
141
|
# Alignment using minimap2
|
|
142
142
|
minimap_cmd = f"minimap2 -ax map-ont -A {scores[0]} -B {scores[1]} -O {scores[2]},24 '{self.template_fasta}' '{fastq_files}' > '{output_dir}/{alignment_name}.sam'"
|
|
143
143
|
subprocess.run(minimap_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
|
144
|
-
print(minimap_cmd)
|
|
144
|
+
# print(minimap_cmd)
|
|
145
145
|
# Convert SAM to BAM and sort
|
|
146
146
|
view_cmd = f"samtools view -bS '{output_dir}/{alignment_name}.sam' > '{output_dir}/{alignment_name}.bam'"
|
|
147
147
|
subprocess.run(view_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
|
148
|
-
print(view_cmd)
|
|
148
|
+
# print(view_cmd)
|
|
149
149
|
|
|
150
150
|
sort_cmd = f"samtools sort '{output_dir}/{alignment_name}.bam' -o '{output_dir}/{alignment_name}.bam'"
|
|
151
151
|
subprocess.run(sort_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
|
152
|
-
print(sort_cmd)
|
|
152
|
+
# print(sort_cmd)
|
|
153
153
|
|
|
154
154
|
# Index the BAM file
|
|
155
155
|
index_cmd = f"samtools index '{output_dir}/{alignment_name}.bam'"
|