levseq 1.4.0__tar.gz → 1.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. levseq-1.4.2/PKG-INFO +202 -0
  2. levseq-1.4.2/README.md +155 -0
  3. {levseq-1.4.0 → levseq-1.4.2}/levseq/__init__.py +1 -1
  4. levseq-1.4.2/levseq/interface.py +150 -0
  5. {levseq-1.4.0 → levseq-1.4.2}/levseq/run_levseq.py +44 -7
  6. {levseq-1.4.0 → levseq-1.4.2}/levseq/variantcaller.py +3 -3
  7. levseq-1.4.2/levseq.egg-info/PKG-INFO +202 -0
  8. {levseq-1.4.0 → levseq-1.4.2}/levseq.egg-info/SOURCES.txt +1 -0
  9. {levseq-1.4.0 → levseq-1.4.2}/setup.py +1 -0
  10. levseq-1.4.2/tests/test_copy_fastq.py +75 -0
  11. {levseq-1.4.0 → levseq-1.4.2}/tests/test_deploy.py +2 -1
  12. levseq-1.4.0/PKG-INFO +0 -201
  13. levseq-1.4.0/README.md +0 -154
  14. levseq-1.4.0/levseq/interface.py +0 -88
  15. levseq-1.4.0/levseq.egg-info/PKG-INFO +0 -201
  16. {levseq-1.4.0 → levseq-1.4.2}/LICENSE +0 -0
  17. {levseq-1.4.0 → levseq-1.4.2}/MANIFEST.in +0 -0
  18. {levseq-1.4.0 → levseq-1.4.2}/levseq/IO_processor.py +0 -0
  19. {levseq-1.4.0 → levseq-1.4.2}/levseq/barcoding/__init__.py +0 -0
  20. {levseq-1.4.0 → levseq-1.4.2}/levseq/barcoding/demultiplex +0 -0
  21. {levseq-1.4.0 → levseq-1.4.2}/levseq/barcoding/demultiplex-arm64 +0 -0
  22. {levseq-1.4.0 → levseq-1.4.2}/levseq/barcoding/demultiplex-x86 +0 -0
  23. {levseq-1.4.0 → levseq-1.4.2}/levseq/barcoding/minion_barcodes.fasta +0 -0
  24. {levseq-1.4.0 → levseq-1.4.2}/levseq/basecaller.py +0 -0
  25. {levseq-1.4.0 → levseq-1.4.2}/levseq/cmd.py +0 -0
  26. {levseq-1.4.0 → levseq-1.4.2}/levseq/coordinates.py +0 -0
  27. {levseq-1.4.0 → levseq-1.4.2}/levseq/filter_orientation.py +0 -0
  28. {levseq-1.4.0 → levseq-1.4.2}/levseq/globals.py +0 -0
  29. {levseq-1.4.0 → levseq-1.4.2}/levseq/parser.py +0 -0
  30. {levseq-1.4.0 → levseq-1.4.2}/levseq/screen.py +0 -0
  31. {levseq-1.4.0 → levseq-1.4.2}/levseq/seqfit.py +0 -0
  32. {levseq-1.4.0 → levseq-1.4.2}/levseq/simulation.py +0 -0
  33. {levseq-1.4.0 → levseq-1.4.2}/levseq/user.py +0 -0
  34. {levseq-1.4.0 → levseq-1.4.2}/levseq/utils.py +0 -0
  35. {levseq-1.4.0 → levseq-1.4.2}/levseq/visualization.py +0 -0
  36. {levseq-1.4.0 → levseq-1.4.2}/levseq.egg-info/dependency_links.txt +0 -0
  37. {levseq-1.4.0 → levseq-1.4.2}/levseq.egg-info/entry_points.txt +0 -0
  38. {levseq-1.4.0 → levseq-1.4.2}/levseq.egg-info/requires.txt +0 -0
  39. {levseq-1.4.0 → levseq-1.4.2}/levseq.egg-info/top_level.txt +0 -0
  40. {levseq-1.4.0 → levseq-1.4.2}/setup.cfg +0 -0
  41. {levseq-1.4.0 → levseq-1.4.2}/tests/test_demultiplex_docker.py +0 -0
  42. {levseq-1.4.0 → levseq-1.4.2}/tests/test_opligopools.py +0 -0
  43. {levseq-1.4.0 → levseq-1.4.2}/tests/test_seqfitvis.py +0 -0
  44. {levseq-1.4.0 → levseq-1.4.2}/tests/test_seqs.py +0 -0
  45. {levseq-1.4.0 → levseq-1.4.2}/tests/test_statistics.py +0 -0
  46. {levseq-1.4.0 → levseq-1.4.2}/tests/test_variant_calling.py +0 -0
levseq-1.4.2/PKG-INFO ADDED
@@ -0,0 +1,202 @@
1
+ Metadata-Version: 2.1
2
+ Name: levseq
3
+ Version: 1.4.2
4
+ Home-page: https://github.com/fhalab/levseq/
5
+ Author: Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy
6
+ Author-email: ylong@caltech.edu
7
+ License: GPL3
8
+ Project-URL: Bug Tracker, https://github.com/fhalab/levseq/
9
+ Project-URL: Documentation, https://github.com/fhalab/levseq/
10
+ Project-URL: Source Code, https://github.com/fhalab/levseq/
11
+ Keywords: Nanopore,ONT,evSeq
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
15
+ Classifier: Natural Language :: English
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3.6
18
+ Classifier: Programming Language :: Python :: 3.7
19
+ Classifier: Programming Language :: Python :: 3.8
20
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
21
+ Requires-Python: >=3.8
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: Bio
25
+ Requires-Dist: biopython
26
+ Requires-Dist: fsspec
27
+ Requires-Dist: h5py
28
+ Requires-Dist: holoviews
29
+ Requires-Dist: jupyterlab
30
+ Requires-Dist: mappy
31
+ Requires-Dist: matplotlib
32
+ Requires-Dist: ninetysix
33
+ Requires-Dist: numpy
34
+ Requires-Dist: pandas
35
+ Requires-Dist: pybedtools
36
+ Requires-Dist: pycoQC
37
+ Requires-Dist: pyfaidx
38
+ Requires-Dist: pyparsing
39
+ Requires-Dist: pysam
40
+ Requires-Dist: scipy
41
+ Requires-Dist: sciutil
42
+ Requires-Dist: seaborn
43
+ Requires-Dist: scikit-learn
44
+ Requires-Dist: statsmodels
45
+ Requires-Dist: tqdm
46
+ Requires-Dist: biopandas
47
+
48
+ # Variant Sequencing with Nanopore (LevSeq)
49
+
50
+ LevSeq provides a streamlined pipeline for sequencing and analyzing genetic variants using Oxford Nanopore technology. In directed evolution experiments, LevSeq enables sequencing of every variant, enhancing data insight and creating datasets suitable for AI/ML methods. Sequence variants can be generated within a day at an extremely low cost.
51
+
52
+ ![Figure 1: LevSeq Workflow](manuscript/figures/LevSeq_Figure-1.jpeg)
53
+ Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
54
+
55
+ ## Quick Start
56
+
57
+ ### Docker Installation (Recommended)
58
+
59
+ 1. Install Docker: [https://docs.docker.com/engine/install/](https://docs.docker.com/engine/install/)
60
+ 2. Pull the appropriate image:
61
+ ```bash
62
+ # For Linux/Windows x86 systems:
63
+ docker pull yueminglong/levseq:levseq-1.4-x86
64
+
65
+ # For Mac M-series chips (M1, M2, M3, M4):
66
+ docker pull yueminglong/levseq:levseq-1.4-arm64
67
+ ```
68
+ 3. Run LevSeq:
69
+ ```bash
70
+ docker run --rm -v "/full/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
71
+ ```
72
+ 4. Connect function data to your sequence data
73
+ ```bash
74
+ docker run --rm -v "/full/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv --fitness_files "levseq_results/20250712_epPCR_Q06714_37.csv,levseq_results/20250712_epPCR_Q06714_39.csv,levseq_results/20250712_epPCR_Q06714_40.csv" --smiles 'O=P(OC1=CC=CC=C1)(OC2=CC=CC=C2)OC3=CC=CC=C3>>O=P(O)(OC4=CC=CC=C4)OC5=CC=CC=C5' --compound dPPi --variant_df "levseq_results/visualization_partial.csv"
75
+ ```
76
+ ### Pip Installation (Mac/Linux only)
77
+
78
+ **IMPORTANT**: On Mac M-series chips (M1-M4), gcc 13 and 14 are **REQUIRED**:
79
+ ```bash
80
+ brew install gcc@13 gcc@14
81
+ ```
82
+
83
+ 1. Create and activate conda environment:
84
+ ```bash
85
+ conda create --name levseq python=3.12 -y
86
+ conda activate levseq
87
+ ```
88
+
89
+ 2. Install dependencies:
90
+ ```bash
91
+ conda install -c bioconda -c conda-forge samtools minimap2
92
+ ```
93
+
94
+ 3. Install LevSeq:
95
+ ```bash
96
+ pip install levseq
97
+ ```
98
+
99
+ 4. Run LevSeq:
100
+ ```bash
101
+ levseq my_experiment /path/to/data/ /path/to/ref.csv
102
+ ```
103
+
104
+ 5. Combine function data:
105
+ ```bash
106
+ levseq my_experiment /path/to/data/ /path/to/ref.csv --fitness_files "LCMS_file_{barcode1}.csv,LCMS_file_{barcode2}.csv" --smiles 'reaction_smiles_string' --compound "name_of_compound_in_LCMS_file" --variant_df "visualization_partial.csv"
107
+ ```
108
+
109
+ Note: for function data we currently expect an LCMS file, e.g. one with the following columns and naming:
110
+ - `Sample Vial Number` (corresponding to the well that the sample was from).
111
+ - `Area` (which becomes fitness value).
112
+ - `Compound Name` which is the name of the compound we filter for that is passed as a parameter.
113
+ - The last `_X.csv` needs to be the barcode number to match that sample to your plate, e.g. if you ran LevSeq with barcode 33 for plate 2, the fitness file for plate 2 must end in `_33.csv` (for example `some_fitness_for_plate_2_33.csv`).
114
+
115
+
116
+ ## Data and Visualization
117
+
118
+ - **Test Data**: Sample data is available on Zenodo [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13694463.svg)](https://doi.org/10.5281/zenodo.13694463)
119
+ - **Visualization Tool**: A web application is available at [https://levseqdb.streamlit.app/](https://levseqdb.streamlit.app/) - simply upload your LevSeq output and LCMS results
120
+ - **Self-hosted Solution**: You can deploy your own instance using our [LevSeq_db repository](https://github.com/fhalab/LevSeq_db)
121
+
122
+ ## Reference File Format (ref.csv)
123
+
124
+ Your reference CSV file must contain the following columns:
125
+
126
+ | barcode_plate | name | refseq |
127
+ |---------------|--------|-----------|
128
+ | 33 | Q97A76 | ATGCGC... |
129
+
130
+ For oligopool experiments (multiple proteins per plate), use:
131
+
132
+ | barcode_plate | name | refseq |
133
+ |---------------|--------|-----------|
134
+ | 33 | Q97A76 | ATGCGCAAG |
135
+ | 33 | P96084 | ATGGATCA |
136
+ | 34 | P46209 | ATGGGGCAA |
137
+ | 34 | Q60336 | ATGGGGCC |
138
+
139
+ ## Command Line Arguments
140
+
141
+ ### Required Arguments
142
+ 1. **name**: Name of the experiment (output folder)
143
+ 2. **path**: Location of basecalled fastq files
144
+ 3. **summary**: Path to reference CSV file
145
+
146
+ ### Optional Arguments
147
+ - `--skip_demultiplexing`: Skip the demultiplexing step
148
+ - `--skip_variantcalling`: Skip the variant calling step
149
+ - `--output`: Custom save location (defaults to current directory)
150
+ - `--show_msa`: Show multiple sequence alignment for each well
151
+ - `--oligopool`: Process data as oligopool experiment
152
+
153
+ ## Step-by-Step Tutorial
154
+
155
+ 1. **Prepare your sequencing data**:
156
+ - Your fastq files should be in a directory structure similar to Nanopore's output
157
+ - Prepare a reference CSV file with barcode plates, sample names, and reference sequences
158
+
159
+ 2. **Run LevSeq**:
160
+ ```bash
161
+ # Via Docker
162
+ docker run --rm -v "/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
163
+
164
+ # Via pip
165
+ levseq my_experiment /path/to/data/ /path/to/ref.csv
166
+ ```
167
+
168
+ 3. **Analyze results**:
169
+ - Output includes variant data (CSV) and interactive visualizations (HTML)
170
+ - Upload results to the LevSeq visualization tool for further analysis
171
+
172
+ ## Experimental Setup
173
+
174
+ For the wet lab protocol:
175
+ - Refer to the [wiki](https://github.com/fhalab/LevSeq/wiki/Experimental-protocols)
176
+ - See the methods section of [our paper](https://pubs.acs.org/doi/10.1021/acssynbio.4c00625)
177
+ - Order forward and reverse primers compatible with your plasmid
178
+ - Install Oxford Nanopore's software for basecalling if needed
179
+
180
+ ## Additional Resources
181
+
182
+ - **Example Notebook**: See `example/Example.ipynb` for a walkthrough
183
+ - **Advanced Usage**: See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb)
184
+ - **Troubleshooting**: See our [computational protocols wiki](https://github.com/fhalab/LevSeq/wiki/Computational-protocols)
185
+
186
+ ## Citing LevSeq
187
+
188
+ If you find LevSeq useful, please cite our paper:
189
+
190
+ ```bibtex
191
+ @article{long2024levseq,
192
+ title={LevSeq: Rapid Generation of Sequence-Function Data for Directed Evolution and Machine Learning},
193
+ author={Long, Yueming and Mora, Ariane and Li, Francesca-Zhoufan and Gürsoy, Emre and Johnston, Kadina E and Arnold, Frances H},
194
+ journal={ACS Synthetic Biology},
195
+ year={2024},
196
+ publisher={American Chemical Society}
197
+ }
198
+ ```
199
+
200
+ ## Contact
201
+
202
+ Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
levseq-1.4.2/README.md ADDED
@@ -0,0 +1,155 @@
1
+ # Variant Sequencing with Nanopore (LevSeq)
2
+
3
+ LevSeq provides a streamlined pipeline for sequencing and analyzing genetic variants using Oxford Nanopore technology. In directed evolution experiments, LevSeq enables sequencing of every variant, enhancing data insight and creating datasets suitable for AI/ML methods. Sequence variants can be generated within a day at an extremely low cost.
4
+
5
+ ![Figure 1: LevSeq Workflow](manuscript/figures/LevSeq_Figure-1.jpeg)
6
+ Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
7
+
8
+ ## Quick Start
9
+
10
+ ### Docker Installation (Recommended)
11
+
12
+ 1. Install Docker: [https://docs.docker.com/engine/install/](https://docs.docker.com/engine/install/)
13
+ 2. Pull the appropriate image:
14
+ ```bash
15
+ # For Linux/Windows x86 systems:
16
+ docker pull yueminglong/levseq:levseq-1.4-x86
17
+
18
+ # For Mac M-series chips (M1, M2, M3, M4):
19
+ docker pull yueminglong/levseq:levseq-1.4-arm64
20
+ ```
21
+ 3. Run LevSeq:
22
+ ```bash
23
+ docker run --rm -v "/full/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
24
+ ```
25
+ 4. Connect function data to your sequence data
26
+ ```bash
27
+ docker run --rm -v "/full/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv --fitness_files "levseq_results/20250712_epPCR_Q06714_37.csv,levseq_results/20250712_epPCR_Q06714_39.csv,levseq_results/20250712_epPCR_Q06714_40.csv" --smiles 'O=P(OC1=CC=CC=C1)(OC2=CC=CC=C2)OC3=CC=CC=C3>>O=P(O)(OC4=CC=CC=C4)OC5=CC=CC=C5' --compound dPPi --variant_df "levseq_results/visualization_partial.csv"
28
+ ```
29
+ ### Pip Installation (Mac/Linux only)
30
+
31
+ **IMPORTANT**: On Mac M-series chips (M1-M4), gcc 13 and 14 are **REQUIRED**:
32
+ ```bash
33
+ brew install gcc@13 gcc@14
34
+ ```
35
+
36
+ 1. Create and activate conda environment:
37
+ ```bash
38
+ conda create --name levseq python=3.12 -y
39
+ conda activate levseq
40
+ ```
41
+
42
+ 2. Install dependencies:
43
+ ```bash
44
+ conda install -c bioconda -c conda-forge samtools minimap2
45
+ ```
46
+
47
+ 3. Install LevSeq:
48
+ ```bash
49
+ pip install levseq
50
+ ```
51
+
52
+ 4. Run LevSeq:
53
+ ```bash
54
+ levseq my_experiment /path/to/data/ /path/to/ref.csv
55
+ ```
56
+
57
+ 5. Combine function data:
58
+ ```bash
59
+ levseq my_experiment /path/to/data/ /path/to/ref.csv --fitness_files "LCMS_file_{barcode1}.csv,LCMS_file_{barcode2}.csv" --smiles 'reaction_smiles_string' --compound "name_of_compound_in_LCMS_file" --variant_df "visualization_partial.csv"
60
+ ```
61
+
62
+ Note: for function data we currently expect an LCMS file, e.g. one with the following columns and naming:
63
+ - `Sample Vial Number` (corresponding to the well that the sample was from).
64
+ - `Area` (which becomes fitness value).
65
+ - `Compound Name` which is the name of the compound we filter for that is passed as a parameter.
66
+ - The last `_X.csv` needs to be the barcode number to match that sample to your plate, e.g. if you ran LevSeq with barcode 33 for plate 2, the fitness file for plate 2 must end in `_33.csv` (for example `some_fitness_for_plate_2_33.csv`).
67
+
68
+
69
+ ## Data and Visualization
70
+
71
+ - **Test Data**: Sample data is available on Zenodo [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13694463.svg)](https://doi.org/10.5281/zenodo.13694463)
72
+ - **Visualization Tool**: A web application is available at [https://levseqdb.streamlit.app/](https://levseqdb.streamlit.app/) - simply upload your LevSeq output and LCMS results
73
+ - **Self-hosted Solution**: You can deploy your own instance using our [LevSeq_db repository](https://github.com/fhalab/LevSeq_db)
74
+
75
+ ## Reference File Format (ref.csv)
76
+
77
+ Your reference CSV file must contain the following columns:
78
+
79
+ | barcode_plate | name | refseq |
80
+ |---------------|--------|-----------|
81
+ | 33 | Q97A76 | ATGCGC... |
82
+
83
+ For oligopool experiments (multiple proteins per plate), use:
84
+
85
+ | barcode_plate | name | refseq |
86
+ |---------------|--------|-----------|
87
+ | 33 | Q97A76 | ATGCGCAAG |
88
+ | 33 | P96084 | ATGGATCA |
89
+ | 34 | P46209 | ATGGGGCAA |
90
+ | 34 | Q60336 | ATGGGGCC |
91
+
92
+ ## Command Line Arguments
93
+
94
+ ### Required Arguments
95
+ 1. **name**: Name of the experiment (output folder)
96
+ 2. **path**: Location of basecalled fastq files
97
+ 3. **summary**: Path to reference CSV file
98
+
99
+ ### Optional Arguments
100
+ - `--skip_demultiplexing`: Skip the demultiplexing step
101
+ - `--skip_variantcalling`: Skip the variant calling step
102
+ - `--output`: Custom save location (defaults to current directory)
103
+ - `--show_msa`: Show multiple sequence alignment for each well
104
+ - `--oligopool`: Process data as oligopool experiment
105
+
106
+ ## Step-by-Step Tutorial
107
+
108
+ 1. **Prepare your sequencing data**:
109
+ - Your fastq files should be in a directory structure similar to Nanopore's output
110
+ - Prepare a reference CSV file with barcode plates, sample names, and reference sequences
111
+
112
+ 2. **Run LevSeq**:
113
+ ```bash
114
+ # Via Docker
115
+ docker run --rm -v "/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
116
+
117
+ # Via pip
118
+ levseq my_experiment /path/to/data/ /path/to/ref.csv
119
+ ```
120
+
121
+ 3. **Analyze results**:
122
+ - Output includes variant data (CSV) and interactive visualizations (HTML)
123
+ - Upload results to the LevSeq visualization tool for further analysis
124
+
125
+ ## Experimental Setup
126
+
127
+ For the wet lab protocol:
128
+ - Refer to the [wiki](https://github.com/fhalab/LevSeq/wiki/Experimental-protocols)
129
+ - See the methods section of [our paper](https://pubs.acs.org/doi/10.1021/acssynbio.4c00625)
130
+ - Order forward and reverse primers compatible with your plasmid
131
+ - Install Oxford Nanopore's software for basecalling if needed
132
+
133
+ ## Additional Resources
134
+
135
+ - **Example Notebook**: See `example/Example.ipynb` for a walkthrough
136
+ - **Advanced Usage**: See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb)
137
+ - **Troubleshooting**: See our [computational protocols wiki](https://github.com/fhalab/LevSeq/wiki/Computational-protocols)
138
+
139
+ ## Citing LevSeq
140
+
141
+ If you find LevSeq useful, please cite our paper:
142
+
143
+ ```bibtex
144
+ @article{long2024levseq,
145
+ title={LevSeq: Rapid Generation of Sequence-Function Data for Directed Evolution and Machine Learning},
146
+ author={Long, Yueming and Mora, Ariane and Li, Francesca-Zhoufan and Gürsoy, Emre and Johnston, Kadina E and Arnold, Frances H},
147
+ journal={ACS Synthetic Biology},
148
+ year={2024},
149
+ publisher={American Chemical Society}
150
+ }
151
+ ```
152
+
153
+ ## Contact
154
+
155
+ Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
@@ -18,7 +18,7 @@
18
18
  __title__ = 'levseq'
19
19
  __description__ = 'LevSeq nanopore sequencing'
20
20
  __url__ = 'https://github.com/fhalab/levseq/'
21
- __version__ = '1.4.0'
21
+ __version__ = '1.4.2'
22
22
  __author__ = 'Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy'
23
23
  __author_email__ = 'ylong@caltech.edu'
24
24
  __license__ = 'GPL3'
@@ -0,0 +1,150 @@
1
+ ###############################################################################
2
+ # #
3
+ # This program is free software: you can redistribute it and/or modify #
4
+ # it under the terms of the GNU General Public License as published by #
5
+ # the Free Software Foundation, either version 3 of the License, or #
6
+ # (at your option) any later version. #
7
+ # #
8
+ # This program is distributed in the hope that it will be useful, #
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
11
+ # GNU General Public License for more details. #
12
+ # #
13
+ # You should have received a copy of the GNU General Public License #
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>. #
15
+ # #
16
+ ###############################################################################
17
+ """
18
+ Contain argument parsers used for command line interface and web interface
19
+ """
20
+ # Import packages
21
+ import os
22
+ import tqdm
23
+ import argparse
24
+ import pandas as pd
25
+
26
+ # Import local packages
27
+ from levseq.run_levseq import run_LevSeq
28
+
29
# Working directory captured once at import time; build_cli_parser uses it
# as the default for --output and execute_LevSeq as a fallback in messages.
CWD = os.getcwd()

# Module-level default run parameters.
# NOTE(review): nothing in this module reads these values; presumably they
# are kept for other callers -- confirm before removing.
padding_start = padding_end = 0
min_depth = 5
threshold = 0.2
basecall_model = 'sup'
38
+
39
+
40
+ # Build the CLI argparser
41
def build_cli_parser():
    """Build the argparse parser for the ``levseq`` command line.

    The parser takes three positional arguments (``name``, ``path``,
    ``summary``) plus optional flags controlling which pipeline steps run
    and, optionally, the inputs needed to combine sequence data with
    function (fitness) data.

    Returns:
        argparse.ArgumentParser: the configured parser; call
        ``parse_args()`` on it to obtain the options.
    """
    parser = argparse.ArgumentParser()

    # Positional arguments that every run needs
    required_args_group = parser.add_argument_group("Required Arguments", "Arguments required for each run")
    required_args_group.add_argument('name',
                                     help='User defined name for the output folder')
    required_args_group.add_argument("path",
                                     help="Path to folder containing fastq.pass or pod5_pass files.")
    required_args_group.add_argument("summary",
                                     help="CSV file containing barcodes used, name of each plate and reference sequence in string")

    # Optional switches and inputs
    optional_args_group = parser.add_argument_group("Optional Arguments", "Additional arguments")
    optional_args_group.add_argument("--output",
                                     help="Save location for run. Defaults to current working directory.",
                                     required=False,
                                     default=CWD)
    optional_args_group.add_argument("--perform_basecalling",
                                     action="store_true",
                                     help="Perform the basecalling step, default is false")
    optional_args_group.add_argument("--skip_demultiplexing",
                                     action="store_true",
                                     help="Skip the demultiplexing step, default is false")
    optional_args_group.add_argument("--skip_variantcalling",
                                     action="store_true",
                                     help="Skip the variant calling step, default is false")
    optional_args_group.add_argument("--oligopool",
                                     action="store_true",
                                     help="Whether this experiment came from an oligopool, default is false.")
    # nargs='?' lets the flag be given bare (-> True) while still accepting
    # the older explicit form "--show_msa True/False"; the type converter
    # makes "False" actually falsy instead of a truthy string.
    optional_args_group.add_argument("--show_msa",
                                     nargs='?',
                                     const=True,
                                     default=False,
                                     type=_parse_bool_flag,
                                     help="Show the multiple sequence alignment for each well, default is false")
    # Inputs used to join function (fitness) data onto the variant table
    optional_args_group.add_argument("--fitness_files",
                                     default=None,
                                     help="A comma separated list of fitness files (full path) with string quotation marks around them.")
    optional_args_group.add_argument("--smiles",
                                     default=None,
                                     help="A smiles string of the reaction with quotation marks around.")
    optional_args_group.add_argument("--compound",
                                     default=None,
                                     help="The compound in the fitness files (e.g. pDT or pdt - case sensitive).")
    optional_args_group.add_argument("--variant_df",
                                     default=None,
                                     help="The variant dataframe to combine with fitness data.")
    return parser


def _parse_bool_flag(value):
    """Convert a command-line token such as "true", "no" or "1" to a bool.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised
        boolean spelling.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("1", "true", "t", "yes", "y", "on"):
        return True
    if token in ("0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")
88
+
89
+
90
def combine_seq_func_data(cl_args):
    """Join LCMS function (fitness) measurements onto a variant table.

    Each fitness file is read, restricted to the rows whose
    ``Compound Name`` equals the requested compound, and keyed by
    ``"<barcode>_<well>"``. The barcode is the last ``_``-separated token
    of the file name before ``.csv``; the well is the part of
    ``Sample Vial Number`` after the last ``-``. The variant table is keyed
    the same way from its ``barcode_plate`` and ``Well`` columns and
    left-joined with the fitness rows. The result is written next to the
    variant file with a ``_seqfunc.csv`` suffix.

    Args:
        cl_args: mapping of parsed command-line options. Reads
            ``fitness_files`` (comma-separated paths), ``smiles``
            (reaction SMILES, ``reactants>>products``), ``variant_df``
            (path to the variant CSV) and optionally ``compound``
            (defaults to ``'pdt'``).

    Returns:
        pandas.DataFrame indexed by ``barcode_well`` with lower-cased,
        underscore-separated column names and the ``area`` column renamed
        to ``fitness_value``; ``None`` when any of ``fitness_files``,
        ``smiles`` or ``variant_df`` is missing.

    Raises:
        ValueError: if ``fitness_files`` contains no file names.
    """
    function_files = cl_args.get('fitness_files')
    reaction_smiles = cl_args.get('smiles')
    variant_filename = cl_args.get('variant_df')
    # Nothing to combine unless all three inputs were supplied
    if not (function_files and reaction_smiles and variant_filename):
        return None

    compound_name = cl_args.get('compound') or 'pdt'
    print(function_files, compound_name)

    # Tolerate stray whitespace and trailing commas in the list, which would
    # otherwise turn into empty file names
    file_list = [name.strip() for name in function_files.split(',') if name.strip()]
    if not file_list:
        raise ValueError(f'No fitness files found in --fitness_files value: {function_files!r}')

    variant_df = pd.read_csv(variant_filename)

    function_frames = []
    for function_file in file_list:
        # The barcode has to be the last _[barcode] before the extension
        barcode = function_file.split('.csv')[0].split('_')[-1]
        function_df = pd.read_csv(function_file)
        # Some headers contain embedded newlines; normalise them to spaces
        function_df.columns = [c.replace('\n', ' ') for c in function_df.columns]
        function_df['function_well'] = [
            x.split('-')[-1] if isinstance(x, str) else None
            for x in function_df['Sample Vial Number'].values
        ]
        function_df['function_barcode_plate'] = barcode
        # Keep only the requested compound; copy so that the column
        # assignments below act on an independent frame, not a slice
        function_df = function_df[function_df['Compound Name'] == compound_name].copy()
        # Non-numeric areas become NaN rather than raising
        function_df['Area'] = pd.to_numeric(function_df['Area'], errors='coerce')
        function_df['barcode_well'] = [
            f'{plate}_{well}'
            for plate, well in zip(function_df['function_barcode_plate'], function_df['function_well'])
        ]
        function_df['filename'] = function_file
        print(function_df.head())
        function_frames.append(function_df)
    # Concatenate once instead of growing a frame inside the loop
    all_function_df = pd.concat(function_frames)

    # Key the variant table the same way and join the two
    variant_df['barcode_well'] = [
        f'{plate}_{well}'
        for plate, well in zip(variant_df['barcode_plate'], variant_df['Well'])
    ]
    variant_df.set_index('barcode_well', inplace=True)
    all_function_df.set_index('barcode_well', inplace=True)
    variant_df = variant_df.join(all_function_df, how='left')

    # The product side of the reaction SMILES is stored separately
    variant_df['smiles_string'] = reaction_smiles.split('>>')[-1]
    variant_df['reaction_smiles'] = reaction_smiles
    variant_df.columns = [c.lower().replace(' ', '_') for c in variant_df.columns]
    variant_df.rename(columns={'area': 'fitness_value'}, inplace=True)

    output_filename = variant_filename.replace(".csv", "_seqfunc.csv")
    if output_filename == variant_filename:
        # No ".csv" in the name: append the suffix instead of overwriting
        # the input file
        output_filename = f'{variant_filename}_seqfunc.csv'
    variant_df.to_csv(output_filename)
    return variant_df
130
+
131
+ # Execute LevSeq
132
def execute_LevSeq():
    """Command-line entry point for LevSeq.

    Parses the command line with ``build_cli_parser``. When fitness files,
    a reaction SMILES and a variant table are all supplied, only the
    sequence/function combination is performed and its result returned.
    Otherwise the full pipeline is run through ``run_LevSeq``; any
    exception it raises is caught and reported on stdout rather than
    propagated.
    """
    cli_options = vars(build_cli_parser().parse_args())

    def _run_location():
        # Evaluated lazily at each reporting point so the message reflects
        # the options as they are at that moment
        return os.path.join(cli_options.get('output', CWD), cli_options.get('name', ''))

    # Combination mode: all three inputs present means skip the pipeline
    combine_keys = ('fitness_files', 'smiles', 'variant_df')
    if all(cli_options.get(key) for key in combine_keys):
        print('Combining LevSeq')
        return combine_seq_func_data(cli_options)

    # Progress bar factory handed to the pipeline
    progress_factory = tqdm.tqdm
    try:
        # Imported here so a failing import is reported like any other error
        from levseq import __version__
        print(f"Starting LevSeq v{__version__}...")
        run_LevSeq(cli_options, progress_factory)
        print(f"Run completed successfully. Results and logs stored in {_run_location()}")
    except Exception as e:
        print(f"Error: {e}")
        print(f"Check error logs for details in {_run_location()}")
@@ -66,8 +66,14 @@ from importlib import resources
66
66
  from holoviews.streams import Tap
67
67
 
68
68
  # Utility function to configure logging
69
- def configure_logging(result_folder):
70
- log_format = "%(asctime)s:%(levelname)s:%(message)s"
69
+ def configure_logging(result_folder, cl_args):
70
+ import sys
71
+ from levseq import __version__
72
+
73
+ # Define a more detailed log format with clean separation
74
+ log_format = "%(asctime)s : %(levelname)s : %(message)s"
75
+
76
+ # Create log handlers
71
77
  info_handler = logging.FileHandler(os.path.join(result_folder, "LevSeq_run.log"))
72
78
  info_handler.setLevel(logging.INFO)
73
79
  info_handler.setFormatter(logging.Formatter(log_format))
@@ -76,7 +82,30 @@ def configure_logging(result_folder):
76
82
  error_handler.setLevel(logging.ERROR)
77
83
  error_handler.setFormatter(logging.Formatter(log_format))
78
84
 
85
+ # Set up basic configuration with both handlers
79
86
  logging.basicConfig(level=logging.INFO, handlers=[info_handler, error_handler])
87
+
88
+ # Log version information and command used to run
89
+ command_used = " ".join(sys.argv)
90
+ logging.info(f"LevSeq Version: {__version__}")
91
+ logging.info(f"Command: {command_used}")
92
+
93
+ # Log essential run parameters
94
+ logging.info(f"Run name: {cl_args.get('name', 'Not specified')}")
95
+ logging.info(f"Input path: {cl_args.get('path', 'Not specified')}")
96
+ logging.info(f"Summary file: {cl_args.get('summary', 'Not specified')}")
97
+
98
+ # Log optional parameters if specified
99
+ if cl_args.get('output') and cl_args.get('output') != os.getcwd():
100
+ logging.info(f"Output directory: {cl_args.get('output')}")
101
+ if cl_args.get('oligopool'):
102
+ logging.info("Running in oligopool mode")
103
+ if cl_args.get('skip_demultiplexing'):
104
+ logging.info("Skipping demultiplexing step")
105
+ if cl_args.get('skip_variantcalling'):
106
+ logging.info("Skipping variant calling step")
107
+ if cl_args.get('threshold'):
108
+ logging.info(f"Using variant threshold: {cl_args.get('threshold')}")
80
109
 
81
110
  # Create result folder
82
111
  def create_result_folder(cl_args):
@@ -139,8 +168,15 @@ def cat_fastq_files(folder_path: str, output_path: str, reads_per_file: int = 40
139
168
  else:
140
169
  for fastq_file in fastq_files:
141
170
  destination = output_path / fastq_file.name
142
- shutil.copy(fastq_file, destination)
143
- logging.info("Copied %s to %s", fastq_file, destination)
171
+ # Skip copying if source and destination are identical
172
+ if str(fastq_file) == str(destination):
173
+ logging.info("Skipping copy of %s (source and destination are identical)", fastq_file)
174
+ continue
175
+ try:
176
+ shutil.copy(fastq_file, destination)
177
+ logging.info("Copied %s to %s", fastq_file, destination)
178
+ except shutil.SameFileError:
179
+ logging.info("Skipping copy of %s (source and destination are identical files)", fastq_file)
144
180
  logging.info("All FASTQ files processed successfully to %s", output_path)
145
181
  return str(output_path)
146
182
  except Exception as e:
@@ -334,7 +370,7 @@ def create_df_v(variants_df):
334
370
 
335
371
  # Create a copy for restructuring to avoid affecting the original
336
372
  restructured_df = df_variants_.copy()
337
- restructured_df.columns = restructured_df.columns.str.lower().str.replace('[\s-]', '_', regex=True)
373
+ restructured_df.columns = restructured_df.columns.str.lower().str.replace(r'[\s-]', '_', regex=True)
338
374
  # Fix the specific column name
339
375
  restructured_df.columns = restructured_df.columns.str.replace('p_adj._value', 'p_adj_value')
340
376
 
@@ -566,6 +602,7 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
566
602
  continue
567
603
 
568
604
  variant_df.to_csv(variant_csv_path, index=False)
605
+
569
606
  return variant_df, ref_df
570
607
 
571
608
 
@@ -576,8 +613,8 @@ def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
576
613
  ref_folder = os.path.join(result_folder, "ref")
577
614
  os.makedirs(ref_folder, exist_ok=True)
578
615
 
579
- configure_logging(result_folder)
580
- logging.info("Logging configured. Starting program.")
616
+ configure_logging(result_folder, cl_args)
617
+ logging.info("Logging configured. Starting analysis...")
581
618
 
582
619
  variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
583
620
 
@@ -141,15 +141,15 @@ class VariantCaller:
141
141
  # Alignment using minimap2
142
142
  minimap_cmd = f"minimap2 -ax map-ont -A {scores[0]} -B {scores[1]} -O {scores[2]},24 '{self.template_fasta}' '{fastq_files}' > '{output_dir}/{alignment_name}.sam'"
143
143
  subprocess.run(minimap_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
144
- print(minimap_cmd)
144
+ # print(minimap_cmd)
145
145
  # Convert SAM to BAM and sort
146
146
  view_cmd = f"samtools view -bS '{output_dir}/{alignment_name}.sam' > '{output_dir}/{alignment_name}.bam'"
147
147
  subprocess.run(view_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
148
- print(view_cmd)
148
+ # print(view_cmd)
149
149
 
150
150
  sort_cmd = f"samtools sort '{output_dir}/{alignment_name}.bam' -o '{output_dir}/{alignment_name}.bam'"
151
151
  subprocess.run(sort_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
152
- print(sort_cmd)
152
+ # print(sort_cmd)
153
153
 
154
154
  # Index the BAM file
155
155
  index_cmd = f"samtools index '{output_dir}/{alignment_name}.bam'"