levseq 1.2.1__tar.gz → 1.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {levseq-1.2.1/levseq.egg-info → levseq-1.2.6}/PKG-INFO +60 -71
- levseq-1.2.6/README.md +127 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/__init__.py +1 -1
- levseq-1.2.6/levseq/coordinates.py +76 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/run_levseq.py +63 -52
- {levseq-1.2.1 → levseq-1.2.6}/levseq/seqfit.py +601 -103
- {levseq-1.2.1 → levseq-1.2.6}/levseq/utils.py +26 -23
- {levseq-1.2.1 → levseq-1.2.6}/levseq/variantcaller.py +2 -2
- {levseq-1.2.1 → levseq-1.2.6}/levseq/visualization.py +4 -4
- {levseq-1.2.1 → levseq-1.2.6/levseq.egg-info}/PKG-INFO +60 -71
- {levseq-1.2.1 → levseq-1.2.6}/levseq.egg-info/SOURCES.txt +1 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq.egg-info/requires.txt +1 -0
- {levseq-1.2.1 → levseq-1.2.6}/setup.py +2 -1
- {levseq-1.2.1 → levseq-1.2.6}/tests/test_variant_calling.py +48 -6
- levseq-1.2.1/README.md +0 -139
- {levseq-1.2.1 → levseq-1.2.6}/LICENSE +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/MANIFEST.in +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/IO_processor.py +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/barcoding/__init__.py +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/barcoding/demultiplex +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/barcoding/demultiplex-arm64 +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/barcoding/demultiplex-x86 +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/barcoding/minion_barcodes.fasta +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/basecaller.py +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/cmd.py +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/globals.py +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/interface.py +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/parser.py +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/screen.py +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/simulation.py +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq/user.py +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq.egg-info/dependency_links.txt +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq.egg-info/entry_points.txt +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/levseq.egg-info/top_level.txt +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/setup.cfg +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/tests/test_demultiplex_docker.py +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/tests/test_opligopools.py +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/tests/test_seqfitvis.py +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/tests/test_seqs.py +0 -0
- {levseq-1.2.1 → levseq-1.2.6}/tests/test_statistics.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: levseq
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.6
|
|
4
4
|
Home-page: https://github.com/fhalab/levseq/
|
|
5
5
|
Author: Yueming Long, Emreay Gursoy, Ariane Mora, Francesca-Zhoufan Li
|
|
6
6
|
Author-email: ylong@caltech.edu
|
|
@@ -43,6 +43,7 @@ Requires-Dist: seaborn
|
|
|
43
43
|
Requires-Dist: scikit-learn
|
|
44
44
|
Requires-Dist: statsmodels
|
|
45
45
|
Requires-Dist: tqdm
|
|
46
|
+
Requires-Dist: biopandas
|
|
46
47
|
|
|
47
48
|
# Variant Sequencing with Nanopore
|
|
48
49
|
|
|
@@ -53,8 +54,7 @@ Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore tech
|
|
|
53
54
|
|
|
54
55
|
|
|
55
56
|
- Data to reproduce the results and to test are available on zenodo [](https://doi.org/10.5281/zenodo.13694463)
|
|
56
|
-
|
|
57
|
-
- A dockerized website and database for labs to locally host and visualize their data: website is available at: https://levseq.caltech.edu/ and code to host locally at: https://github.com/fhalab/LevSeq_VDB/
|
|
57
|
+
- A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://github.com/ArianeMora/LevSeq_vis/) and code to host locally at: https://github.com/fhalab/LevSeq_VDB/
|
|
58
58
|
|
|
59
59
|
## Setup
|
|
60
60
|
|
|
@@ -65,76 +65,92 @@ For setting up the experimental side of LevSeq we suggest the following preparat
|
|
|
65
65
|
|
|
66
66
|
## How to Use LevSeq
|
|
67
67
|
|
|
68
|
-
The wet lab part is detailed in the method section of the paper.
|
|
68
|
+
The wet lab part is detailed in the method section of the paper or via the [wiki](https://github.com/fhalab/LevSeq/wiki/Experimental-protocols).
|
|
69
69
|
|
|
70
70
|
Once samples are prepared, the multiplexed sample is used for sequencing, and the sequencing data is stored in the `../data` folder as per the typical Nanopore flow (refer to Nanopore documentation for this).
|
|
71
71
|
|
|
72
72
|
After sequencing, you can identify variants, demultiplex, and combine with your variant function here! For simple applications, we recommend using the notebook `example/Example.ipynb`.
|
|
73
73
|
|
|
74
|
-
###
|
|
75
|
-
|
|
76
|
-
1. **Basecalling**: This step converts Nanopore's FAST5 files to sequences. For basecalling, we use Nanopore's basecaller, Medaka, which can run in parallel with sequencing (recommended) or afterward.
|
|
77
|
-
|
|
78
|
-
2. **Demultiplexing**: After sequencing, the reads, stored as bulk FASTQ files, are sorted. During demultiplexing, each read is assigned to its correct plate/well combination and stored as a FASTQ file.
|
|
79
|
-
|
|
80
|
-
3. **Variant Calling**: For each sample, the consensus sequence is compared to the reference sequence. A variant is called if it differs from the reference sequence. The success of variant calling depends on the number of reads sequenced and their quality.
|
|
74
|
+
### Installation
|
|
81
75
|
|
|
76
|
+
We aimed to make LevSeq as simple to use as possible, this means you should be able to run it all using pip (note you need `samtools`
|
|
77
|
+
and `minimap2` installed on your path). However, if you have issues we recommend using the Docker instance!
|
|
78
|
+
(the pip version doesn't work well with mac M3 but docker does.)
|
|
82
79
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
We aimed to make LevSeq as simple to use as possible, this means you should be able to run it all using pip. However, if you have issues we recomend using the Docker instance!
|
|
86
|
-
|
|
87
|
-
We recommend using command line interface(Terminal) and a conda environment for installation:
|
|
88
|
-
```
|
|
89
|
-
git clone https://github.com/fhalab/LevSeq.git
|
|
90
|
-
```
|
|
80
|
+
We recommend using terminal and a conda environment for installation:
|
|
91
81
|
|
|
92
82
|
```
|
|
93
|
-
conda create --name levseq python=3.
|
|
83
|
+
conda create --name levseq python=3.12 -y
|
|
94
84
|
```
|
|
95
85
|
|
|
96
86
|
```
|
|
97
87
|
conda activate levseq
|
|
98
88
|
```
|
|
99
89
|
|
|
100
|
-
From the LevSeq folder, install the package using pip:
|
|
101
|
-
|
|
102
|
-
```
|
|
103
|
-
pip install levseq
|
|
104
|
-
```
|
|
105
90
|
#### Dependencies
|
|
106
91
|
|
|
107
92
|
1. Samtools: https://www.htslib.org/download/
|
|
108
93
|
```
|
|
109
94
|
conda install -c bioconda -c conda-forge samtools
|
|
110
95
|
```
|
|
111
|
-
|
|
96
|
+
|
|
112
97
|
|
|
113
98
|
2. Minimap2: https://github.com/lh3/minimap2
|
|
99
|
+
|
|
114
100
|
```
|
|
115
101
|
conda install -c bioconda -c conda-forge minimap2
|
|
116
102
|
```
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
103
|
+
### Docker Installation (Recommended for full pipeline)
|
|
104
|
+
For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
|
|
105
|
+
operating system (https://docs.docker.com/engine/install/).
|
|
106
|
+
|
|
107
|
+
### Usage
|
|
108
|
+
|
|
109
|
+
#### Run via pip
|
|
110
|
+
```
|
|
111
|
+
levseq <name of the run you can make this whatever> <location to data folder> <location of reference csv file>
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
#### Run via docker
|
|
115
|
+
If using a Linux system
|
|
121
116
|
```
|
|
122
|
-
|
|
123
|
-
brew install gcc@13
|
|
117
|
+
docker pull yueminglong/levseq:levseq-1.2.5-x86
|
|
124
118
|
```
|
|
125
|
-
|
|
119
|
+
If using Mac M chips (image tested on M1, M3, and M4)
|
|
126
120
|
```
|
|
127
|
-
|
|
128
|
-
conda install conda-forge::gcc=13
|
|
121
|
+
docker pull yueminglong/levseq:levseq-1.2.5-arm64
|
|
129
122
|
```
|
|
130
123
|
|
|
131
|
-
### Usage
|
|
132
|
-
#### Command Line Interface
|
|
133
|
-
LevSeq can be run using the command line interface. Here's the basic structure of the command:
|
|
134
|
-
|
|
135
124
|
```
|
|
136
|
-
|
|
125
|
+
docker run --rm -v "$(pwd):/levseq_results" yueminglong/levseq:levseq-1.2.5-<architecture> <name> <location to data folder> <location of reference csv file>
|
|
137
126
|
```
|
|
127
|
+
Explanation:
|
|
128
|
+
|
|
129
|
+
--rm: Automatically removes the container after the command finishes.
|
|
130
|
+
|
|
131
|
+
-v "$(pwd):/levseq\_results": Mounts the current directory ($(pwd)) to /levseq\_results inside the container, ensuring the results are saved to your current directory.
|
|
132
|
+
|
|
133
|
+
yueminglong/levseq:levseq-1.2.5-\<architecture\>: Specifies the Docker image to run. Replace \<architecture\> with the appropriate platform (e.g., x86).
|
|
134
|
+
|
|
135
|
+
\<name\>: The name or identifier for the analysis.
|
|
136
|
+
|
|
137
|
+
\<location to data folder\>: Path to the folder containing input data.
|
|
138
|
+
|
|
139
|
+
\<location of reference csv file\>: Path to the reference .csv file.
|
|
140
|
+
|
|
141
|
+
Important Notes:
|
|
142
|
+
|
|
143
|
+
If the current directory is mounted to the container (via -v "$(pwd):/levseq\_results"), the basecalled result in FASTQ format and the ref.csv file must be located in the current directory.
|
|
144
|
+
|
|
145
|
+
If these files are not present in the current directory, they will not be processed by the tool.
|
|
146
|
+
|
|
147
|
+
Output:
|
|
148
|
+
|
|
149
|
+
The results of the analysis will be saved to your current working directory.
|
|
150
|
+
|
|
151
|
+
See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb) for an example.
|
|
152
|
+
*Note: if using docker, the html and csv final output will be saved in the directory that you are running from instead of in the Platemaps or Results subfolder.
|
|
153
|
+
|
|
138
154
|
#### Required Arguments
|
|
139
155
|
1. Name of the experiment, this will be the name of the output folder
|
|
140
156
|
2. Location of basecalled fastq files, this is the direct output from using the MinKnow software for sequencing
|
|
@@ -149,37 +165,10 @@ levseq <name of the run you can make this whatever> <location to data folder> <l
|
|
|
149
165
|
|
|
150
166
|
--show\_msa Showing multiple sequence alignment for each well
|
|
151
167
|
|
|
152
|
-
|
|
153
|
-
For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
|
|
154
|
-
operating system (https://docs.docker.com/engine/install/).
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
To build the docker image run (within the main folder that contains the `Dockerfile`). Note building does **not** work
|
|
158
|
-
on Mac M3 chip, please use a ubuntu machine to build the docker image!
|
|
159
|
-
|
|
160
|
-
```
|
|
161
|
-
docker build -t levseq .
|
|
162
|
-
```
|
|
163
|
-
|
|
164
|
-
This gives us the access to the lebSeq command line interface via:
|
|
165
|
-
|
|
166
|
-
```
|
|
167
|
-
docker run levseq
|
|
168
|
-
```
|
|
169
|
-
Note! The docker image should work with linux, and mac, however, different mac architectures may have issues (owing to the different M1/M3 processers.)
|
|
170
|
-
|
|
171
|
-
Basically the -v connects a folder on your computer with the output from the minION sequencer with the docker image that will take these results and then perform
|
|
172
|
-
demultiplexing and variant calling.
|
|
173
|
-
|
|
174
|
-
docker run -v /disk1/ariane/vscode/LevSeq/manuscript/Data/20241116-YL-LevSeq-parlqep400-1-2-P25-28:/levseq_results/ levseq docker-test levseq_results/ levseq_results/LevSeq-T1.csv
|
|
175
|
-
```
|
|
176
|
-
docker run -v /Users/XXXX/Documents/LevSeq/data:/levseq_results/ levseq 20240502 levseq_results/20240502/ levseq_results/20240502-YL-ParLQ-ep2.csv
|
|
177
|
-
```
|
|
178
|
-
|
|
179
|
-
In this command: `/Users/XXXX/Documents/LevSeq/data` is a folder on your computer, which contains a subfolder `20240502`
|
|
168
|
+
Great you should be all done!
|
|
180
169
|
|
|
181
|
-
|
|
170
|
+
For more details or troubleshooting please look at our [computational_protocols](https://github.com/fhalab/LevSeq/wiki/Computational-protocols).
|
|
182
171
|
|
|
183
|
-
|
|
172
|
+
#### Citing
|
|
184
173
|
|
|
185
|
-
If you
|
|
174
|
+
If you have found LevSeq useful, please cite our [paper](https://doi.org/10.1101/2024.09.04.611255).
|
levseq-1.2.6/README.md
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# Variant Sequencing with Nanopore
|
|
2
|
+
|
|
3
|
+
In directed evolution, sequencing every variant enhances data insight and creates datasets suitable for AI/ML methods. This method is presented as an extension of the original Every Variant Sequencer using Illumina technology. With this approach, sequence variants can be generated within a day at an extremely low cost.
|
|
4
|
+
|
|
5
|
+

|
|
6
|
+
Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
- Data to reproduce the results and to test are available on zenodo [](https://doi.org/10.5281/zenodo.13694463)
|
|
10
|
+
- A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://github.com/ArianeMora/LevSeq_vis/) and code to host locally at: https://github.com/fhalab/LevSeq_VDB/
|
|
11
|
+
|
|
12
|
+
## Setup
|
|
13
|
+
|
|
14
|
+
For setting up the experimental side of LevSeq we suggest the following preparations:
|
|
15
|
+
|
|
16
|
+
- Order forward and reverse primers compatible with the desired plasmid, see methods section of [our paper](http://biorxiv.org/cgi/content/short/2024.09.04.611255v1?rss=1).
|
|
17
|
+
- Successfully install Oxford Nanopore's software (this is only for if you are doing basecalling/minION processing). [Link to installation guide](https://nanoporetech.com/).
|
|
18
|
+
|
|
19
|
+
## How to Use LevSeq
|
|
20
|
+
|
|
21
|
+
The wet lab part is detailed in the method section of the paper or via the [wiki](https://github.com/fhalab/LevSeq/wiki/Experimental-protocols).
|
|
22
|
+
|
|
23
|
+
Once samples are prepared, the multiplexed sample is used for sequencing, and the sequencing data is stored in the `../data` folder as per the typical Nanopore flow (refer to Nanopore documentation for this).
|
|
24
|
+
|
|
25
|
+
After sequencing, you can identify variants, demultiplex, and combine with your variant function here! For simple applications, we recommend using the notebook `example/Example.ipynb`.
|
|
26
|
+
|
|
27
|
+
### Installation
|
|
28
|
+
|
|
29
|
+
We aimed to make LevSeq as simple to use as possible, this means you should be able to run it all using pip (note you need `samtools`
|
|
30
|
+
and `minimap2` installed on your path). However, if you have issues we recommend using the Docker instance!
|
|
31
|
+
(the pip version doesn't work well with mac M3 but docker does.)
|
|
32
|
+
|
|
33
|
+
We recommend using terminal and a conda environment for installation:
|
|
34
|
+
|
|
35
|
+
```
|
|
36
|
+
conda create --name levseq python=3.12 -y
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
conda activate levseq
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
#### Dependencies
|
|
44
|
+
|
|
45
|
+
1. Samtools: https://www.htslib.org/download/
|
|
46
|
+
```
|
|
47
|
+
conda install -c bioconda -c conda-forge samtools
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
2. Minimap2: https://github.com/lh3/minimap2
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
conda install -c bioconda -c conda-forge minimap2
|
|
55
|
+
```
|
|
56
|
+
### Docker Installation (Recommended for full pipeline)
|
|
57
|
+
For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
|
|
58
|
+
operating system (https://docs.docker.com/engine/install/).
|
|
59
|
+
|
|
60
|
+
### Usage
|
|
61
|
+
|
|
62
|
+
#### Run via pip
|
|
63
|
+
```
|
|
64
|
+
levseq <name of the run you can make this whatever> <location to data folder> <location of reference csv file>
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
#### Run via docker
|
|
68
|
+
If using a Linux system
|
|
69
|
+
```
|
|
70
|
+
docker pull yueminglong/levseq:levseq-1.2.5-x86
|
|
71
|
+
```
|
|
72
|
+
If using Mac M chips (image tested on M1, M3, and M4)
|
|
73
|
+
```
|
|
74
|
+
docker pull yueminglong/levseq:levseq-1.2.5-arm64
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
```
|
|
78
|
+
docker run --rm -v "$(pwd):/levseq_results" yueminglong/levseq:levseq-1.2.5-<architecture> <name> <location to data folder> <location of reference csv file>
|
|
79
|
+
```
|
|
80
|
+
Explanation:
|
|
81
|
+
|
|
82
|
+
--rm: Automatically removes the container after the command finishes.
|
|
83
|
+
|
|
84
|
+
-v "$(pwd):/levseq\_results": Mounts the current directory ($(pwd)) to /levseq\_results inside the container, ensuring the results are saved to your current directory.
|
|
85
|
+
|
|
86
|
+
yueminglong/levseq:levseq-1.2.5-\<architecture\>: Specifies the Docker image to run. Replace \<architecture\> with the appropriate platform (e.g., x86).
|
|
87
|
+
|
|
88
|
+
\<name\>: The name or identifier for the analysis.
|
|
89
|
+
|
|
90
|
+
\<location to data folder\>: Path to the folder containing input data.
|
|
91
|
+
|
|
92
|
+
\<location of reference csv file\>: Path to the reference .csv file.
|
|
93
|
+
|
|
94
|
+
Important Notes:
|
|
95
|
+
|
|
96
|
+
If the current directory is mounted to the container (via -v "$(pwd):/levseq\_results"), the basecalled result in FASTQ format and the ref.csv file must be located in the current directory.
|
|
97
|
+
|
|
98
|
+
If these files are not present in the current directory, they will not be processed by the tool.
|
|
99
|
+
|
|
100
|
+
Output:
|
|
101
|
+
|
|
102
|
+
The results of the analysis will be saved to your current working directory.
|
|
103
|
+
|
|
104
|
+
See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb) for an example.
|
|
105
|
+
*Note: if using docker, the html and csv final output will be saved in the directory that you are running from instead of in the Platemaps or Results subfolder.
|
|
106
|
+
|
|
107
|
+
#### Required Arguments
|
|
108
|
+
1. Name of the experiment, this will be the name of the output folder
|
|
109
|
+
2. Location of basecalled fastq files, this is the direct output from using the MinKnow software for sequencing
|
|
110
|
+
3. Location of reference csv file, this file includes information of barcodes used for each plate and the DNA sequence used for reference for each plate
|
|
111
|
+
|
|
112
|
+
#### Optional Arguments
|
|
113
|
+
--skip\_demultiplexing If enabled, demultiplexing step will be skipped
|
|
114
|
+
|
|
115
|
+
--skip\_variantcalling If enabled, variant calling step will be skipped
|
|
116
|
+
|
|
117
|
+
--output Save location for output; if not provided, defaults to where the program is executed
|
|
118
|
+
|
|
119
|
+
--show\_msa Showing multiple sequence alignment for each well
|
|
120
|
+
|
|
121
|
+
Great you should be all done!
|
|
122
|
+
|
|
123
|
+
For more details or troubleshooting please look at our [computational_protocols](https://github.com/fhalab/LevSeq/wiki/Computational-protocols).
|
|
124
|
+
|
|
125
|
+
#### Citing
|
|
126
|
+
|
|
127
|
+
If you have found LevSeq useful, please cite our [paper](https://doi.org/10.1101/2024.09.04.611255).
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
__title__ = 'levseq'
|
|
19
19
|
__description__ = 'LevSeq nanopore sequencing'
|
|
20
20
|
__url__ = 'https://github.com/fhalab/levseq/'
|
|
21
|
-
__version__ = '1.2.
|
|
21
|
+
__version__ = '1.2.6'
|
|
22
22
|
__author__ = 'Yueming Long, Emreay Gursoy, Ariane Mora, Francesca-Zhoufan Li'
|
|
23
23
|
__author_email__ = 'ylong@caltech.edu'
|
|
24
24
|
__license__ = 'GPL3'
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import esm
|
|
2
|
+
import torch
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from sklearn.decomposition import PCA
|
|
5
|
+
import os
|
|
6
|
+
import argparse
|
|
7
|
+
|
|
8
|
+
def preprocess_sequence(sequence):
|
|
9
|
+
"""
|
|
10
|
+
Preprocesses the amino acid sequence by removing everything after the first '*' (stop codon).
|
|
11
|
+
"""
|
|
12
|
+
if '*' in sequence:
|
|
13
|
+
sequence = sequence.split('*')[0] # Take everything before the first '*'
|
|
14
|
+
return sequence
|
|
15
|
+
|
|
16
|
+
def process_file(input_file, output_file=None):
|
|
17
|
+
# Load the dataset
|
|
18
|
+
data = pd.read_csv(input_file)
|
|
19
|
+
|
|
20
|
+
# Remove the "Unnamed: 0" column if it exists
|
|
21
|
+
if 'Unnamed: 0' in data.columns:
|
|
22
|
+
data = data.drop(columns=['Unnamed: 0'])
|
|
23
|
+
|
|
24
|
+
# Create the ID column as the combination of `Plate` and `Well`
|
|
25
|
+
data['ID'] = data['Plate'] + '-' + data['Well']
|
|
26
|
+
data = data[['ID'] + [col for col in data.columns if col != 'ID']] # Reorder to make ID the first column
|
|
27
|
+
|
|
28
|
+
# Filter valid sequences from the `aa_sequence` column
|
|
29
|
+
valid_sequences = data['aa_sequence'].dropna()
|
|
30
|
+
valid_sequences = valid_sequences[~valid_sequences.str.contains('#N.A.#|Deletion')]
|
|
31
|
+
|
|
32
|
+
# Preprocess sequences to handle stop codons
|
|
33
|
+
valid_sequences = valid_sequences.apply(preprocess_sequence)
|
|
34
|
+
|
|
35
|
+
# Load the ESM-2 model
|
|
36
|
+
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
|
|
37
|
+
batch_converter = alphabet.get_batch_converter()
|
|
38
|
+
|
|
39
|
+
# Prepare sequences for embedding
|
|
40
|
+
sequences = valid_sequences.tolist()
|
|
41
|
+
sequence_names = [f"Sequence {i}" for i in range(len(sequences))]
|
|
42
|
+
batch_labels, batch_strs, batch_tokens = batch_converter(list(zip(sequence_names, sequences)))
|
|
43
|
+
|
|
44
|
+
# Extract embeddings
|
|
45
|
+
with torch.no_grad():
|
|
46
|
+
results = model(batch_tokens, repr_layers=[33])
|
|
47
|
+
embeddings = results["representations"][33] # Use the top (last) layer representations
|
|
48
|
+
|
|
49
|
+
# Average embeddings across residues for sequence-level representation
|
|
50
|
+
sequence_embeddings = embeddings.mean(1).numpy()
|
|
51
|
+
|
|
52
|
+
# Dimensionality Reduction using PCA
|
|
53
|
+
pca = PCA(n_components=2)
|
|
54
|
+
xy_coordinates = pca.fit_transform(sequence_embeddings)
|
|
55
|
+
|
|
56
|
+
# Add x, y coordinates back to the dataframe
|
|
57
|
+
xy_df = pd.DataFrame(xy_coordinates, columns=['x_coordinate', 'y_coordinate'], index=valid_sequences.index)
|
|
58
|
+
data = pd.concat([data, xy_df], axis=1)
|
|
59
|
+
|
|
60
|
+
# Determine output file location
|
|
61
|
+
if output_file is None:
|
|
62
|
+
input_name, input_ext = os.path.splitext(input_file)
|
|
63
|
+
output_file = f"{input_name}_xy{input_ext}"
|
|
64
|
+
|
|
65
|
+
# Save the updated dataframe to a file
|
|
66
|
+
data.to_csv(output_file, index=False)
|
|
67
|
+
print(f"Processed data with x, y coordinates saved to: {output_file}")
|
|
68
|
+
|
|
69
|
+
if __name__ == "__main__":
|
|
70
|
+
parser = argparse.ArgumentParser(description="Generate x, y coordinates for amino acid sequences")
|
|
71
|
+
parser.add_argument('input_file', type=str, help="Path to the input CSV file")
|
|
72
|
+
parser.add_argument('--output_file', type=str, default=None, help="Path to save the output CSV file (optional)")
|
|
73
|
+
args = parser.parse_args()
|
|
74
|
+
|
|
75
|
+
process_file(args.input_file, args.output_file)
|
|
76
|
+
|
|
@@ -243,6 +243,19 @@ def call_variant(experiment_name, experiment_folder, template_fasta, filtered_ba
|
|
|
243
243
|
except Exception as e:
|
|
244
244
|
logging.error("Variant calling failed", exc_info=True)
|
|
245
245
|
raise
|
|
246
|
+
|
|
247
|
+
def assign_alignment_probability(row):
|
|
248
|
+
if row["Variant"] == "#PARENT#":
|
|
249
|
+
if row["Alignment Count"] > 20:
|
|
250
|
+
return 1
|
|
251
|
+
elif 10 <= row["Alignment Count"] <= 20:
|
|
252
|
+
return (row["Alignment Count"] - 10) / 10 # Ranges from 0 to 1 linearly
|
|
253
|
+
else:
|
|
254
|
+
return 0
|
|
255
|
+
else:
|
|
256
|
+
return row["Average mutation frequency"]
|
|
257
|
+
|
|
258
|
+
|
|
246
259
|
# Full version of create_df_v function
|
|
247
260
|
def create_df_v(variants_df):
|
|
248
261
|
# Make copy of dataframe
|
|
@@ -258,28 +271,19 @@ def create_df_v(variants_df):
|
|
|
258
271
|
|
|
259
272
|
# Translate nc_variant to aa_variant
|
|
260
273
|
df_variants_["aa_variant"] = df_variants_["nc_variant"].apply(
|
|
261
|
-
|
|
274
|
+
lambda x: x if x in ["Deletion", "#N.A.#", 'Insertion'] else translate(x)
|
|
262
275
|
)
|
|
263
276
|
# Fill in 'Deletion' in 'aa_variant' column
|
|
264
277
|
df_variants_.loc[
|
|
265
278
|
df_variants_["nc_variant"] == "Deletion", "aa_variant"
|
|
266
279
|
] = "Deletion"
|
|
280
|
+
df_variants_.loc[
|
|
281
|
+
df_variants_["nc_variant"] == "Insertion", "aa_variant"
|
|
282
|
+
] = "Insertion"
|
|
267
283
|
|
|
268
284
|
# Compare aa_variant with translated refseq and generate Substitutions column
|
|
269
285
|
df_variants_["Substitutions"] = df_variants_.apply(get_mutations, axis=1)
|
|
270
|
-
|
|
271
286
|
# Adding sequence quality to Alignment Probability before filling in empty values
|
|
272
|
-
def assign_alignment_probability(row):
|
|
273
|
-
if row["Variant"] == "#PARENT#":
|
|
274
|
-
if row["Alignment Count"] > 20:
|
|
275
|
-
return 1
|
|
276
|
-
elif 10 <= row["Alignment Count"] <= 20:
|
|
277
|
-
return (row["Alignment Count"] - 10) / 10 # Ranges from 0 to 1 linearly
|
|
278
|
-
else:
|
|
279
|
-
return 0
|
|
280
|
-
else:
|
|
281
|
-
return row["Average mutation frequency"]
|
|
282
|
-
|
|
283
287
|
df_variants_["Alignment Probability"] = df_variants_.apply(assign_alignment_probability, axis=1)
|
|
284
288
|
df_variants_["Alignment Probability"] = df_variants_["Alignment Probability"].fillna(0.0)
|
|
285
289
|
df_variants_["Alignment Count"] = df_variants_["Alignment Count"].fillna(0.0)
|
|
@@ -291,7 +295,9 @@ def create_df_v(variants_df):
|
|
|
291
295
|
elif df_variants_["nc_variant"].iloc[i] == "#N.A.#":
|
|
292
296
|
df_variants_.Substitutions.iat[i] = "#N.A.#"
|
|
293
297
|
|
|
294
|
-
|
|
298
|
+
# Low read counts override low mutations
|
|
299
|
+
df_variants_["Substitutions"] = ["#LOW#" if a < 10 and a > 0 else s for a, s in df_variants_[["Alignment Count", "Substitutions"]].values]
|
|
300
|
+
|
|
295
301
|
# Add row and columns
|
|
296
302
|
Well = df_variants_["Well"].tolist()
|
|
297
303
|
row = []
|
|
@@ -315,32 +321,36 @@ def create_df_v(variants_df):
|
|
|
315
321
|
df_variants_["Plate"] = df_variants_["Plate"].apply(
|
|
316
322
|
lambda x: f"0{x}" if len(x) == 1 else x
|
|
317
323
|
)
|
|
318
|
-
|
|
324
|
+
|
|
325
|
+
# First rename columns as before
|
|
319
326
|
df_variants_.rename(columns={
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
restructured_df =
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
327
|
+
"Variant": "nucleotide_mutation",
|
|
328
|
+
"Substitutions": "amino_acid_substitutions",
|
|
329
|
+
"nc_variant": "nt_sequence",
|
|
330
|
+
"aa_variant": "aa_sequence"
|
|
331
|
+
}, inplace=True)
|
|
332
|
+
|
|
333
|
+
# Create a copy for restructuring to avoid affecting the original
|
|
334
|
+
restructured_df = df_variants_.copy()
|
|
335
|
+
restructured_df.columns = restructured_df.columns.str.lower().str.replace('[\s-]', '_', regex=True)
|
|
336
|
+
# Fix the specific column name
|
|
337
|
+
restructured_df.columns = restructured_df.columns.str.replace('p_adj._value', 'p_adj_value')
|
|
338
|
+
|
|
339
|
+
# Select the desired columns in the desired order
|
|
340
|
+
restructured_df = restructured_df[[
|
|
341
|
+
"barcode_plate",
|
|
342
|
+
"plate",
|
|
343
|
+
"well",
|
|
344
|
+
"alignment_count",
|
|
345
|
+
"nucleotide_mutation",
|
|
346
|
+
"amino_acid_substitutions",
|
|
347
|
+
"alignment_probability",
|
|
348
|
+
"average_mutation_frequency",
|
|
349
|
+
"p_value",
|
|
350
|
+
"p_adj_value",
|
|
351
|
+
"nt_sequence",
|
|
352
|
+
"aa_sequence"
|
|
353
|
+
]]
|
|
344
354
|
|
|
345
355
|
return restructured_df, df_variants_
|
|
346
356
|
|
|
@@ -354,16 +364,21 @@ def create_nc_variant(variant, refseq):
|
|
|
354
364
|
return refseq
|
|
355
365
|
elif "DEL" in variant:
|
|
356
366
|
return "Deletion"
|
|
367
|
+
elif variant == '+':
|
|
368
|
+
return "Insertion"
|
|
357
369
|
else:
|
|
358
370
|
mutations = variant.split("_")
|
|
359
371
|
nc_variant = list(refseq)
|
|
360
372
|
for mutation in mutations:
|
|
361
|
-
|
|
373
|
+
try:
|
|
362
374
|
position = int(re.findall(r"\d+", mutation)[0]) - 1
|
|
363
375
|
original = mutation[0]
|
|
364
376
|
new = mutation[-1]
|
|
365
|
-
|
|
366
|
-
|
|
377
|
+
if position < len(nc_variant) and nc_variant[position] == original:
|
|
378
|
+
nc_variant[position] = new
|
|
379
|
+
except:
|
|
380
|
+
print('WARNING! UNABLE TO PROCESS THIS')
|
|
381
|
+
print(mutation)
|
|
367
382
|
return "".join(nc_variant)
|
|
368
383
|
|
|
369
384
|
|
|
@@ -377,7 +392,9 @@ def get_mutations(row):
|
|
|
377
392
|
# Check if alignment_count is zero and return "#N.A.#" if true
|
|
378
393
|
if alignment_count == 0:
|
|
379
394
|
return "#N.A.#"
|
|
380
|
-
|
|
395
|
+
if alignment_count <= 10:
|
|
396
|
+
return "#LOW#"
|
|
397
|
+
|
|
381
398
|
refseq = row["refseq"]
|
|
382
399
|
|
|
383
400
|
if not is_valid_dna_sequence(refseq):
|
|
@@ -395,10 +412,7 @@ def get_mutations(row):
|
|
|
395
412
|
if refseq_aa[i] != variant_aa[i]:
|
|
396
413
|
mutations.append(f"{refseq_aa[i]}{i+1}{variant_aa[i]}")
|
|
397
414
|
if not mutations:
|
|
398
|
-
|
|
399
|
-
return "#N.A.#"
|
|
400
|
-
else:
|
|
401
|
-
return "#PARENT#"
|
|
415
|
+
return "#PARENT#"
|
|
402
416
|
else:
|
|
403
417
|
return "LEN"
|
|
404
418
|
return "_".join(mutations) if mutations else ""
|
|
@@ -433,10 +447,7 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
|
|
|
433
447
|
result_folder = create_result_folder(cl_args)
|
|
434
448
|
variant_csv_path = os.path.join(result_folder, "variants.csv")
|
|
435
449
|
|
|
436
|
-
|
|
437
|
-
variant_df = pd.read_csv(variant_csv_path)
|
|
438
|
-
else:
|
|
439
|
-
variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
|
|
450
|
+
variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
|
|
440
451
|
|
|
441
452
|
for i, row in tqdm_fn(ref_df.iterrows(), total=len(ref_df), desc="Processing Samples"):
|
|
442
453
|
barcode_plate = row["barcode_plate"]
|
|
@@ -456,9 +467,9 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
|
|
|
456
467
|
barcode_path = filter_bc(cl_args, name_folder, i)
|
|
457
468
|
output_dir = Path(result_folder) / "basecalled_reads"
|
|
458
469
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
459
|
-
file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)
|
|
460
470
|
|
|
461
471
|
if not cl_args["skip_demultiplexing"]:
|
|
472
|
+
file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)
|
|
462
473
|
try:
|
|
463
474
|
demux_fastq(output_dir, name_folder, barcode_path)
|
|
464
475
|
except Exception as e:
|