levseq 1.2.1__tar.gz → 1.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {levseq-1.2.1/levseq.egg-info → levseq-1.2.5}/PKG-INFO +24 -70
- {levseq-1.2.1 → levseq-1.2.5}/README.md +22 -69
- {levseq-1.2.1 → levseq-1.2.5}/levseq/__init__.py +1 -1
- {levseq-1.2.1 → levseq-1.2.5}/levseq/run_levseq.py +53 -48
- {levseq-1.2.1 → levseq-1.2.5}/levseq/seqfit.py +86 -7
- {levseq-1.2.1 → levseq-1.2.5}/levseq/utils.py +26 -23
- {levseq-1.2.1 → levseq-1.2.5}/levseq/variantcaller.py +2 -2
- {levseq-1.2.1 → levseq-1.2.5/levseq.egg-info}/PKG-INFO +24 -70
- {levseq-1.2.1 → levseq-1.2.5}/levseq.egg-info/requires.txt +1 -0
- {levseq-1.2.1 → levseq-1.2.5}/setup.py +2 -1
- {levseq-1.2.1 → levseq-1.2.5}/tests/test_variant_calling.py +48 -6
- {levseq-1.2.1 → levseq-1.2.5}/LICENSE +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/MANIFEST.in +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq/IO_processor.py +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq/barcoding/__init__.py +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq/barcoding/demultiplex +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq/barcoding/demultiplex-arm64 +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq/barcoding/demultiplex-x86 +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq/barcoding/minion_barcodes.fasta +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq/basecaller.py +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq/cmd.py +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq/globals.py +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq/interface.py +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq/parser.py +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq/screen.py +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq/simulation.py +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq/user.py +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq/visualization.py +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq.egg-info/SOURCES.txt +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq.egg-info/dependency_links.txt +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq.egg-info/entry_points.txt +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/levseq.egg-info/top_level.txt +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/setup.cfg +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/tests/test_demultiplex_docker.py +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/tests/test_opligopools.py +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/tests/test_seqfitvis.py +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/tests/test_seqs.py +0 -0
- {levseq-1.2.1 → levseq-1.2.5}/tests/test_statistics.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: levseq
|
|
3
|
-
Version: 1.2.1
|
|
3
|
+
Version: 1.2.5
|
|
4
4
|
Home-page: https://github.com/fhalab/levseq/
|
|
5
5
|
Author: Yueming Long, Emreay Gursoy, Ariane Mora, Francesca-Zhoufan Li
|
|
6
6
|
Author-email: ylong@caltech.edu
|
|
@@ -43,6 +43,7 @@ Requires-Dist: seaborn
|
|
|
43
43
|
Requires-Dist: scikit-learn
|
|
44
44
|
Requires-Dist: statsmodels
|
|
45
45
|
Requires-Dist: tqdm
|
|
46
|
+
Requires-Dist: biopandas
|
|
46
47
|
|
|
47
48
|
# Variant Sequencing with Nanopore
|
|
48
49
|
|
|
@@ -53,8 +54,7 @@ Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore tech
|
|
|
53
54
|
|
|
54
55
|
|
|
55
56
|
- Data to reproduce the results and to test are available on zenodo [](https://doi.org/10.5281/zenodo.13694463)
|
|
56
|
-
|
|
57
|
-
- A dockerized website and database for labs to locally host and visualize their data: website is available at: https://levseq.caltech.edu/ and code to host locally at: https://github.com/fhalab/LevSeq_VDB/
|
|
57
|
+
- A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://github.com/ArianeMora/LevSeq_vis/) and code to host locally at: https://github.com/fhalab/LevSeq_VDB/
|
|
58
58
|
|
|
59
59
|
## Setup
|
|
60
60
|
|
|
@@ -65,29 +65,19 @@ For setting up the experimental side of LevSeq we suggest the following preparat
|
|
|
65
65
|
|
|
66
66
|
## How to Use LevSeq
|
|
67
67
|
|
|
68
|
-
The wet lab part is detailed in the method section of the paper.
|
|
68
|
+
The wet lab part is detailed in the method section of the paper or via the [wiki](https://github.com/fhalab/LevSeq/wiki/Experimental-protocols).
|
|
69
69
|
|
|
70
70
|
Once samples are prepared, the multiplexed sample is used for sequencing, and the sequencing data is stored in the `../data` folder as per the typical Nanopore flow (refer to Nanopore documentation for this).
|
|
71
71
|
|
|
72
72
|
After sequencing, you can identify variants, demultiplex, and combine with your variant function here! For simple applications, we recommend using the notebook `example/Example.ipynb`.
|
|
73
73
|
|
|
74
|
-
###
|
|
75
|
-
|
|
76
|
-
1. **Basecalling**: This step converts Nanopore's FAST5 files to sequences. For basecalling, we use Nanopore's basecaller, Medaka, which can run in parallel with sequencing (recommended) or afterward.
|
|
77
|
-
|
|
78
|
-
2. **Demultiplexing**: After sequencing, the reads, stored as bulk FASTQ files, are sorted. During demultiplexing, each read is assigned to its correct plate/well combination and stored as a FASTQ file.
|
|
79
|
-
|
|
80
|
-
3. **Variant Calling**: For each sample, the consensus sequence is compared to the reference sequence. A variant is called if it differs from the reference sequence. The success of variant calling depends on the number of reads sequenced and their quality.
|
|
81
|
-
|
|
74
|
+
### Installation
|
|
82
75
|
|
|
83
|
-
|
|
76
|
+
We aimed to make LevSeq as simple to use as possible, this means you should be able to run it all using pip (note you need `samtools`
|
|
77
|
+
and `minimap2` installed on your path). However, if you have issues we recommend using the Docker instance!
|
|
78
|
+
(the pip version doesn't work well with mac M3 but docker does.)
|
|
84
79
|
|
|
85
|
-
We
|
|
86
|
-
|
|
87
|
-
We recommend using command line interface(Terminal) and a conda environment for installation:
|
|
88
|
-
```
|
|
89
|
-
git clone https://github.com/fhalab/LevSeq.git
|
|
90
|
-
```
|
|
80
|
+
We recommend using terminal and a conda environment for installation:
|
|
91
81
|
|
|
92
82
|
```
|
|
93
83
|
conda create --name levseq python=3.10 -y
|
|
@@ -97,11 +87,6 @@ conda create --name levseq python=3.10 -y
|
|
|
97
87
|
conda activate levseq
|
|
98
88
|
```
|
|
99
89
|
|
|
100
|
-
From the LevSeq folder, install the package using pip:
|
|
101
|
-
|
|
102
|
-
```
|
|
103
|
-
pip install levseq
|
|
104
|
-
```
|
|
105
90
|
#### Dependencies
|
|
106
91
|
|
|
107
92
|
1. Samtools: https://www.htslib.org/download/
|
|
@@ -111,30 +96,26 @@ conda install -c bioconda -c conda-forge samtools
|
|
|
111
96
|
or for mac users you can use: `brew install samtools`
|
|
112
97
|
|
|
113
98
|
2. Minimap2: https://github.com/lh3/minimap2
|
|
99
|
+
|
|
114
100
|
```
|
|
115
101
|
conda install -c bioconda -c conda-forge minimap2
|
|
116
102
|
```
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
For Mac M chip users: installation via homebrew
|
|
121
|
-
```
|
|
122
|
-
brew install gcc@14
|
|
123
|
-
brew install gcc@13
|
|
124
|
-
```
|
|
125
|
-
For Linux users: installation via conda
|
|
126
|
-
```
|
|
127
|
-
conda install conda-forge::gcc=14
|
|
128
|
-
conda install conda-forge::gcc=13
|
|
129
|
-
```
|
|
103
|
+
### Docker Installation (Recommended for full pipeline)
|
|
104
|
+
For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
|
|
105
|
+
operating system (https://docs.docker.com/engine/install/).
|
|
130
106
|
|
|
131
107
|
### Usage
|
|
132
|
-
#### Command Line Interface
|
|
133
|
-
LevSeq can be run using the command line interface. Here's the basic structure of the command:
|
|
134
108
|
|
|
109
|
+
#### Run via pip
|
|
135
110
|
```
|
|
136
111
|
levseq <name of the run you can make this whatever> <location to data folder> <location of reference csv file>
|
|
137
112
|
```
|
|
113
|
+
#### Run via docker
|
|
114
|
+
```
|
|
115
|
+
docker run --rm -v "$(pwd):/levseq_results" levseq <name> <location to data folder> <location of reference csv file>
|
|
116
|
+
```
|
|
117
|
+
See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb) for an example.
|
|
118
|
+
|
|
138
119
|
#### Required Arguments
|
|
139
120
|
1. Name of the experiment, this will be the name of the output folder
|
|
140
121
|
2. Location of basecalled fastq files, this is the direct output from using the MinKnow software for sequencing
|
|
@@ -149,37 +130,10 @@ levseq <name of the run you can make this whatever> <location to data folder> <l
|
|
|
149
130
|
|
|
150
131
|
--show\_msa Showing multiple sequence alignment for each well
|
|
151
132
|
|
|
152
|
-
|
|
153
|
-
For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
|
|
154
|
-
operating system (https://docs.docker.com/engine/install/).
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
To build the docker image run (within the main folder that contains the `Dockerfile`). Note building does **not** work
|
|
158
|
-
on Mac M3 chip, please use a ubuntu machine to build the docker image!
|
|
159
|
-
|
|
160
|
-
```
|
|
161
|
-
docker build -t levseq .
|
|
162
|
-
```
|
|
163
|
-
|
|
164
|
-
This gives us the access to the lebSeq command line interface via:
|
|
165
|
-
|
|
166
|
-
```
|
|
167
|
-
docker run levseq
|
|
168
|
-
```
|
|
169
|
-
Note! The docker image should work with linux, and mac, however, different mac architectures may have issues (owing to the different M1/M3 processers.)
|
|
170
|
-
|
|
171
|
-
Basically the -v connects a folder on your computer with the output from the minION sequencer with the docker image that will take these results and then perform
|
|
172
|
-
demultiplexing and variant calling.
|
|
173
|
-
|
|
174
|
-
docker run -v /disk1/ariane/vscode/LevSeq/manuscript/Data/20241116-YL-LevSeq-parlqep400-1-2-P25-28:/levseq_results/ levseq docker-test levseq_results/ levseq_results/LevSeq-T1.csv
|
|
175
|
-
```
|
|
176
|
-
docker run -v /Users/XXXX/Documents/LevSeq/data:/levseq_results/ levseq 20240502 levseq_results/20240502/ levseq_results/20240502-YL-ParLQ-ep2.csv
|
|
177
|
-
```
|
|
178
|
-
|
|
179
|
-
In this command: `/Users/XXXX/Documents/LevSeq/data` is a folder on your computer, which contains a subfolder `20240502`
|
|
133
|
+
Great, you should be all done!
|
|
180
134
|
|
|
181
|
-
|
|
135
|
+
For more details or troubleshooting, please look at our [computational_protocols](https://github.com/fhalab/LevSeq/wiki/Computational-protocols).
|
|
182
136
|
|
|
183
|
-
|
|
137
|
+
#### Citing
|
|
184
138
|
|
|
185
|
-
If you
|
|
139
|
+
If you have found LevSeq useful, please cite our [paper](https://doi.org/10.1101/2024.09.04.611255).
|
|
@@ -7,8 +7,7 @@ Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore tech
|
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
- Data to reproduce the results and to test are available on zenodo [](https://doi.org/10.5281/zenodo.13694463)
|
|
10
|
-
|
|
11
|
-
- A dockerized website and database for labs to locally host and visualize their data: website is available at: https://levseq.caltech.edu/ and code to host locally at: https://github.com/fhalab/LevSeq_VDB/
|
|
10
|
+
- A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://github.com/ArianeMora/LevSeq_vis/) and code to host locally at: https://github.com/fhalab/LevSeq_VDB/
|
|
12
11
|
|
|
13
12
|
## Setup
|
|
14
13
|
|
|
@@ -19,29 +18,19 @@ For setting up the experimental side of LevSeq we suggest the following preparat
|
|
|
19
18
|
|
|
20
19
|
## How to Use LevSeq
|
|
21
20
|
|
|
22
|
-
The wet lab part is detailed in the method section of the paper.
|
|
21
|
+
The wet lab part is detailed in the method section of the paper or via the [wiki](https://github.com/fhalab/LevSeq/wiki/Experimental-protocols).
|
|
23
22
|
|
|
24
23
|
Once samples are prepared, the multiplexed sample is used for sequencing, and the sequencing data is stored in the `../data` folder as per the typical Nanopore flow (refer to Nanopore documentation for this).
|
|
25
24
|
|
|
26
25
|
After sequencing, you can identify variants, demultiplex, and combine with your variant function here! For simple applications, we recommend using the notebook `example/Example.ipynb`.
|
|
27
26
|
|
|
28
|
-
###
|
|
29
|
-
|
|
30
|
-
1. **Basecalling**: This step converts Nanopore's FAST5 files to sequences. For basecalling, we use Nanopore's basecaller, Medaka, which can run in parallel with sequencing (recommended) or afterward.
|
|
31
|
-
|
|
32
|
-
2. **Demultiplexing**: After sequencing, the reads, stored as bulk FASTQ files, are sorted. During demultiplexing, each read is assigned to its correct plate/well combination and stored as a FASTQ file.
|
|
33
|
-
|
|
34
|
-
3. **Variant Calling**: For each sample, the consensus sequence is compared to the reference sequence. A variant is called if it differs from the reference sequence. The success of variant calling depends on the number of reads sequenced and their quality.
|
|
35
|
-
|
|
27
|
+
### Installation
|
|
36
28
|
|
|
37
|
-
|
|
29
|
+
We aimed to make LevSeq as simple to use as possible, this means you should be able to run it all using pip (note you need `samtools`
|
|
30
|
+
and `minimap2` installed on your path). However, if you have issues we recommend using the Docker instance!
|
|
31
|
+
(the pip version doesn't work well with mac M3 but docker does.)
|
|
38
32
|
|
|
39
|
-
We
|
|
40
|
-
|
|
41
|
-
We recommend using command line interface(Terminal) and a conda environment for installation:
|
|
42
|
-
```
|
|
43
|
-
git clone https://github.com/fhalab/LevSeq.git
|
|
44
|
-
```
|
|
33
|
+
We recommend using terminal and a conda environment for installation:
|
|
45
34
|
|
|
46
35
|
```
|
|
47
36
|
conda create --name levseq python=3.10 -y
|
|
@@ -51,11 +40,6 @@ conda create --name levseq python=3.10 -y
|
|
|
51
40
|
conda activate levseq
|
|
52
41
|
```
|
|
53
42
|
|
|
54
|
-
From the LevSeq folder, install the package using pip:
|
|
55
|
-
|
|
56
|
-
```
|
|
57
|
-
pip install levseq
|
|
58
|
-
```
|
|
59
43
|
#### Dependencies
|
|
60
44
|
|
|
61
45
|
1. Samtools: https://www.htslib.org/download/
|
|
@@ -65,30 +49,26 @@ conda install -c bioconda -c conda-forge samtools
|
|
|
65
49
|
or for mac users you can use: `brew install samtools`
|
|
66
50
|
|
|
67
51
|
2. Minimap2: https://github.com/lh3/minimap2
|
|
52
|
+
|
|
68
53
|
```
|
|
69
54
|
conda install -c bioconda -c conda-forge minimap2
|
|
70
55
|
```
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
For Mac M chip users: installation via homebrew
|
|
75
|
-
```
|
|
76
|
-
brew install gcc@14
|
|
77
|
-
brew install gcc@13
|
|
78
|
-
```
|
|
79
|
-
For Linux users: installation via conda
|
|
80
|
-
```
|
|
81
|
-
conda install conda-forge::gcc=14
|
|
82
|
-
conda install conda-forge::gcc=13
|
|
83
|
-
```
|
|
56
|
+
### Docker Installation (Recommended for full pipeline)
|
|
57
|
+
For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
|
|
58
|
+
operating system (https://docs.docker.com/engine/install/).
|
|
84
59
|
|
|
85
60
|
### Usage
|
|
86
|
-
#### Command Line Interface
|
|
87
|
-
LevSeq can be run using the command line interface. Here's the basic structure of the command:
|
|
88
61
|
|
|
62
|
+
#### Run via pip
|
|
89
63
|
```
|
|
90
64
|
levseq <name of the run you can make this whatever> <location to data folder> <location of reference csv file>
|
|
91
65
|
```
|
|
66
|
+
#### Run via docker
|
|
67
|
+
```
|
|
68
|
+
docker run --rm -v "$(pwd):/levseq_results" levseq <name> <location to data folder> <location of reference csv file>
|
|
69
|
+
```
|
|
70
|
+
See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb) for an example.
|
|
71
|
+
|
|
92
72
|
#### Required Arguments
|
|
93
73
|
1. Name of the experiment, this will be the name of the output folder
|
|
94
74
|
2. Location of basecalled fastq files, this is the direct output from using the MinKnow software for sequencing
|
|
@@ -103,37 +83,10 @@ levseq <name of the run you can make this whatever> <location to data folder> <l
|
|
|
103
83
|
|
|
104
84
|
--show\_msa Showing multiple sequence alignment for each well
|
|
105
85
|
|
|
106
|
-
|
|
107
|
-
For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
|
|
108
|
-
operating system (https://docs.docker.com/engine/install/).
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
To build the docker image run (within the main folder that contains the `Dockerfile`). Note building does **not** work
|
|
112
|
-
on Mac M3 chip, please use a ubuntu machine to build the docker image!
|
|
113
|
-
|
|
114
|
-
```
|
|
115
|
-
docker build -t levseq .
|
|
116
|
-
```
|
|
117
|
-
|
|
118
|
-
This gives us the access to the lebSeq command line interface via:
|
|
119
|
-
|
|
120
|
-
```
|
|
121
|
-
docker run levseq
|
|
122
|
-
```
|
|
123
|
-
Note! The docker image should work with linux, and mac, however, different mac architectures may have issues (owing to the different M1/M3 processers.)
|
|
124
|
-
|
|
125
|
-
Basically the -v connects a folder on your computer with the output from the minION sequencer with the docker image that will take these results and then perform
|
|
126
|
-
demultiplexing and variant calling.
|
|
127
|
-
|
|
128
|
-
docker run -v /disk1/ariane/vscode/LevSeq/manuscript/Data/20241116-YL-LevSeq-parlqep400-1-2-P25-28:/levseq_results/ levseq docker-test levseq_results/ levseq_results/LevSeq-T1.csv
|
|
129
|
-
```
|
|
130
|
-
docker run -v /Users/XXXX/Documents/LevSeq/data:/levseq_results/ levseq 20240502 levseq_results/20240502/ levseq_results/20240502-YL-ParLQ-ep2.csv
|
|
131
|
-
```
|
|
132
|
-
|
|
133
|
-
In this command: `/Users/XXXX/Documents/LevSeq/data` is a folder on your computer, which contains a subfolder `20240502`
|
|
86
|
+
Great, you should be all done!
|
|
134
87
|
|
|
135
|
-
|
|
88
|
+
For more details or troubleshooting, please look at our [computational_protocols](https://github.com/fhalab/LevSeq/wiki/Computational-protocols).
|
|
136
89
|
|
|
137
|
-
|
|
90
|
+
#### Citing
|
|
138
91
|
|
|
139
|
-
If you
|
|
92
|
+
If you have found LevSeq useful, please cite our [paper](https://doi.org/10.1101/2024.09.04.611255).
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
__title__ = 'levseq'
|
|
19
19
|
__description__ = 'LevSeq nanopore sequencing'
|
|
20
20
|
__url__ = 'https://github.com/fhalab/levseq/'
|
|
21
|
-
__version__ = '1.2.1'
|
|
21
|
+
__version__ = '1.2.5'
|
|
22
22
|
__author__ = 'Yueming Long, Emreay Gursoy, Ariane Mora, Francesca-Zhoufan Li'
|
|
23
23
|
__author_email__ = 'ylong@caltech.edu'
|
|
24
24
|
__license__ = 'GPL3'
|
|
@@ -243,6 +243,19 @@ def call_variant(experiment_name, experiment_folder, template_fasta, filtered_ba
|
|
|
243
243
|
except Exception as e:
|
|
244
244
|
logging.error("Variant calling failed", exc_info=True)
|
|
245
245
|
raise
|
|
246
|
+
|
|
247
|
+
def assign_alignment_probability(row):
|
|
248
|
+
if row["Variant"] == "#PARENT#":
|
|
249
|
+
if row["Alignment Count"] > 20:
|
|
250
|
+
return 1
|
|
251
|
+
elif 10 <= row["Alignment Count"] <= 20:
|
|
252
|
+
return (row["Alignment Count"] - 10) / 10 # Ranges from 0 to 1 linearly
|
|
253
|
+
else:
|
|
254
|
+
return 0
|
|
255
|
+
else:
|
|
256
|
+
return row["Average mutation frequency"]
|
|
257
|
+
|
|
258
|
+
|
|
246
259
|
# Full version of create_df_v function
|
|
247
260
|
def create_df_v(variants_df):
|
|
248
261
|
# Make copy of dataframe
|
|
@@ -258,28 +271,19 @@ def create_df_v(variants_df):
|
|
|
258
271
|
|
|
259
272
|
# Translate nc_variant to aa_variant
|
|
260
273
|
df_variants_["aa_variant"] = df_variants_["nc_variant"].apply(
|
|
261
|
-
|
|
274
|
+
lambda x: x if x in ["Deletion", "#N.A.#", 'Insertion'] else translate(x)
|
|
262
275
|
)
|
|
263
276
|
# Fill in 'Deletion' in 'aa_variant' column
|
|
264
277
|
df_variants_.loc[
|
|
265
278
|
df_variants_["nc_variant"] == "Deletion", "aa_variant"
|
|
266
279
|
] = "Deletion"
|
|
280
|
+
df_variants_.loc[
|
|
281
|
+
df_variants_["nc_variant"] == "Insertion", "aa_variant"
|
|
282
|
+
] = "Insertion"
|
|
267
283
|
|
|
268
284
|
# Compare aa_variant with translated refseq and generate Substitutions column
|
|
269
285
|
df_variants_["Substitutions"] = df_variants_.apply(get_mutations, axis=1)
|
|
270
|
-
|
|
271
286
|
# Adding sequence quality to Alignment Probability before filling in empty values
|
|
272
|
-
def assign_alignment_probability(row):
|
|
273
|
-
if row["Variant"] == "#PARENT#":
|
|
274
|
-
if row["Alignment Count"] > 20:
|
|
275
|
-
return 1
|
|
276
|
-
elif 10 <= row["Alignment Count"] <= 20:
|
|
277
|
-
return (row["Alignment Count"] - 10) / 10 # Ranges from 0 to 1 linearly
|
|
278
|
-
else:
|
|
279
|
-
return 0
|
|
280
|
-
else:
|
|
281
|
-
return row["Average mutation frequency"]
|
|
282
|
-
|
|
283
287
|
df_variants_["Alignment Probability"] = df_variants_.apply(assign_alignment_probability, axis=1)
|
|
284
288
|
df_variants_["Alignment Probability"] = df_variants_["Alignment Probability"].fillna(0.0)
|
|
285
289
|
df_variants_["Alignment Count"] = df_variants_["Alignment Count"].fillna(0.0)
|
|
@@ -291,7 +295,9 @@ def create_df_v(variants_df):
|
|
|
291
295
|
elif df_variants_["nc_variant"].iloc[i] == "#N.A.#":
|
|
292
296
|
df_variants_.Substitutions.iat[i] = "#N.A.#"
|
|
293
297
|
|
|
294
|
-
|
|
298
|
+
# Low read counts override low mutations
|
|
299
|
+
df_variants_["Substitutions"] = ["#LOW#" if a < 10 and a > 0 else s for a, s in df_variants_[["Alignment Count", "Substitutions"]].values]
|
|
300
|
+
|
|
295
301
|
# Add row and columns
|
|
296
302
|
Well = df_variants_["Well"].tolist()
|
|
297
303
|
row = []
|
|
@@ -317,29 +323,27 @@ def create_df_v(variants_df):
|
|
|
317
323
|
)
|
|
318
324
|
# Rename columns as per the request
|
|
319
325
|
df_variants_.rename(columns={
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
+
"Variant": "nucleotide_mutation",
|
|
327
|
+
"Substitutions": "amino-acid_substitutions",
|
|
328
|
+
"nc_variant": "nt_sequence",
|
|
329
|
+
"aa_variant": "aa_sequence"
|
|
330
|
+
},inplace=True)
|
|
326
331
|
|
|
327
332
|
# Select the desired columns in the desired order
|
|
328
|
-
restructured_df = df_variants_[
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
]
|
|
333
|
+
restructured_df = df_variants_[[
|
|
334
|
+
"barcode_plate",
|
|
335
|
+
"Plate",
|
|
336
|
+
"Well",
|
|
337
|
+
"Alignment Count",
|
|
338
|
+
"nucleotide_mutation",
|
|
339
|
+
"amino-acid_substitutions",
|
|
340
|
+
"Alignment Probability",
|
|
341
|
+
"Average mutation frequency",
|
|
342
|
+
"P value",
|
|
343
|
+
"P adj. value",
|
|
344
|
+
"nt_sequence",
|
|
345
|
+
"aa_sequence",
|
|
346
|
+
]
|
|
343
347
|
]
|
|
344
348
|
|
|
345
349
|
return restructured_df, df_variants_
|
|
@@ -354,16 +358,21 @@ def create_nc_variant(variant, refseq):
|
|
|
354
358
|
return refseq
|
|
355
359
|
elif "DEL" in variant:
|
|
356
360
|
return "Deletion"
|
|
361
|
+
elif variant == '+':
|
|
362
|
+
return "Insertion"
|
|
357
363
|
else:
|
|
358
364
|
mutations = variant.split("_")
|
|
359
365
|
nc_variant = list(refseq)
|
|
360
366
|
for mutation in mutations:
|
|
361
|
-
|
|
367
|
+
try:
|
|
362
368
|
position = int(re.findall(r"\d+", mutation)[0]) - 1
|
|
363
369
|
original = mutation[0]
|
|
364
370
|
new = mutation[-1]
|
|
365
|
-
|
|
366
|
-
|
|
371
|
+
if position < len(nc_variant) and nc_variant[position] == original:
|
|
372
|
+
nc_variant[position] = new
|
|
373
|
+
except:
|
|
374
|
+
print('WARNING! UNABLE TO PROCESS THIS')
|
|
375
|
+
print(mutation)
|
|
367
376
|
return "".join(nc_variant)
|
|
368
377
|
|
|
369
378
|
|
|
@@ -377,7 +386,9 @@ def get_mutations(row):
|
|
|
377
386
|
# Check if alignment_count is zero and return "#N.A.#" if true
|
|
378
387
|
if alignment_count == 0:
|
|
379
388
|
return "#N.A.#"
|
|
380
|
-
|
|
389
|
+
if alignment_count <= 10:
|
|
390
|
+
return "#LOW#"
|
|
391
|
+
|
|
381
392
|
refseq = row["refseq"]
|
|
382
393
|
|
|
383
394
|
if not is_valid_dna_sequence(refseq):
|
|
@@ -395,10 +406,7 @@ def get_mutations(row):
|
|
|
395
406
|
if refseq_aa[i] != variant_aa[i]:
|
|
396
407
|
mutations.append(f"{refseq_aa[i]}{i+1}{variant_aa[i]}")
|
|
397
408
|
if not mutations:
|
|
398
|
-
|
|
399
|
-
return "#N.A.#"
|
|
400
|
-
else:
|
|
401
|
-
return "#PARENT#"
|
|
409
|
+
return "#PARENT#"
|
|
402
410
|
else:
|
|
403
411
|
return "LEN"
|
|
404
412
|
return "_".join(mutations) if mutations else ""
|
|
@@ -433,10 +441,7 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
|
|
|
433
441
|
result_folder = create_result_folder(cl_args)
|
|
434
442
|
variant_csv_path = os.path.join(result_folder, "variants.csv")
|
|
435
443
|
|
|
436
|
-
|
|
437
|
-
variant_df = pd.read_csv(variant_csv_path)
|
|
438
|
-
else:
|
|
439
|
-
variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
|
|
444
|
+
variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
|
|
440
445
|
|
|
441
446
|
for i, row in tqdm_fn(ref_df.iterrows(), total=len(ref_df), desc="Processing Samples"):
|
|
442
447
|
barcode_plate = row["barcode_plate"]
|
|
@@ -456,9 +461,9 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
|
|
|
456
461
|
barcode_path = filter_bc(cl_args, name_folder, i)
|
|
457
462
|
output_dir = Path(result_folder) / "basecalled_reads"
|
|
458
463
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
459
|
-
file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)
|
|
460
464
|
|
|
461
465
|
if not cl_args["skip_demultiplexing"]:
|
|
466
|
+
file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)
|
|
462
467
|
try:
|
|
463
468
|
demux_fastq(output_dir, name_folder, barcode_path)
|
|
464
469
|
except Exception as e:
|
|
@@ -115,7 +115,7 @@ def calculate_mutation_combinations(stats_df):
|
|
|
115
115
|
|
|
116
116
|
|
|
117
117
|
def normalise_calculate_stats(processed_plate_df, value_columns, normalise='standard', stats_method='mannwhitneyu',
|
|
118
|
-
parent_label='#PARENT#'):
|
|
118
|
+
parent_label='#PARENT#', normalise_method='median'):
|
|
119
119
|
parent = parent_label
|
|
120
120
|
# if nomrliase normalize with standard normalisation
|
|
121
121
|
normalised_value_columns = []
|
|
@@ -125,7 +125,11 @@ def normalise_calculate_stats(processed_plate_df, value_columns, normalise='stan
|
|
|
125
125
|
for value_column in value_columns:
|
|
126
126
|
sub_df = processed_plate_df[processed_plate_df['Plate'] == plate]
|
|
127
127
|
parent_values = sub_df[sub_df['amino-acid_substitutions'] == parent][value_column].values
|
|
128
|
-
|
|
128
|
+
# By default use the median
|
|
129
|
+
if normalise_method == 'median':
|
|
130
|
+
parent_mean = np.median(parent_values)
|
|
131
|
+
else:
|
|
132
|
+
parent_mean = np.mean(parent_values)
|
|
129
133
|
parent_sd = np.std(parent_values)
|
|
130
134
|
|
|
131
135
|
# For each plate we normalise to the parent of that plate
|
|
@@ -148,7 +152,10 @@ def normalise_calculate_stats(processed_plate_df, value_columns, normalise='stan
|
|
|
148
152
|
if mutation != parent:
|
|
149
153
|
for value_column in normalised_value_columns:
|
|
150
154
|
parent_values = list(processed_plate_df[processed_plate_df['amino-acid_substitutions'] == parent][value_column].values)
|
|
151
|
-
|
|
155
|
+
if normalise_method == 'median':
|
|
156
|
+
parent_mean = np.median(parent_values)
|
|
157
|
+
else:
|
|
158
|
+
parent_mean = np.mean(parent_values)
|
|
152
159
|
parent_sd = np.std(parent_values)
|
|
153
160
|
|
|
154
161
|
vals = list(grp[value_column].values)
|
|
@@ -254,8 +261,12 @@ def work_up_lcms(
|
|
|
254
261
|
--------
|
|
255
262
|
plate: ns.Plate object (DataFrame-like)
|
|
256
263
|
"""
|
|
257
|
-
|
|
258
|
-
|
|
264
|
+
if isinstance(file, str):
|
|
265
|
+
# Read in the data
|
|
266
|
+
df = pd.read_csv(file, header=[1])
|
|
267
|
+
else:
|
|
268
|
+
# Change to handling both
|
|
269
|
+
df = file
|
|
259
270
|
# Convert nans to 0
|
|
260
271
|
df = df.fillna(0)
|
|
261
272
|
# Only grab the Sample Acq Order No.s that have a numeric value
|
|
@@ -304,6 +315,72 @@ def work_up_lcms(
|
|
|
304
315
|
return plate
|
|
305
316
|
|
|
306
317
|
|
|
318
|
+
def process_files(results_df, plate_df, plate: str, product: list) -> pd.DataFrame:
|
|
319
|
+
"""
|
|
320
|
+
Process and combine a single plate file
|
|
321
|
+
|
|
322
|
+
Args:
|
|
323
|
+
- product : str
|
|
324
|
+
The name of the product to be analyzed. ie pdt
|
|
325
|
+
- plate : str, ie 'HMC0225_HMC0226.csv'
|
|
326
|
+
The name of the input CSV file containing the plate data.
|
|
327
|
+
|
|
328
|
+
Returns:
|
|
329
|
+
- pd.DataFrame
|
|
330
|
+
A pandas DataFrame containing the processed data.
|
|
331
|
+
- str
|
|
332
|
+
The path of the output CSV file containing the processed data.
|
|
333
|
+
"""
|
|
334
|
+
filtered_df = results_df[["Plate", "Well", "amino-acid_substitutions", "nt_sequence", "aa_sequence"]]
|
|
335
|
+
filtered_df = filtered_df[(filtered_df["amino-acid_substitutions"] != "#N.A.#")].dropna()
|
|
336
|
+
|
|
337
|
+
# Extract the unique entries of Plate
|
|
338
|
+
unique_plates = filtered_df["Plate"].unique()
|
|
339
|
+
|
|
340
|
+
# Create an empty list to store the processed plate data
|
|
341
|
+
processed_data = []
|
|
342
|
+
|
|
343
|
+
# Iterate over unique Plates and search for corresponding CSV files in the current directory
|
|
344
|
+
plate_object = work_up_lcms(plate_df, product)
|
|
345
|
+
|
|
346
|
+
# Extract attributes from plate_object as needed for downstream processes
|
|
347
|
+
if hasattr(plate_object, "df"):
|
|
348
|
+
# Assuming plate_object has a dataframe-like attribute 'df' that we can work with
|
|
349
|
+
plate_df = plate_object.df
|
|
350
|
+
plate_df["Plate"] = plate # Add the plate identifier for reference
|
|
351
|
+
|
|
352
|
+
# Merge filtered_df with plate_df to retain amino-acid_substitutionss and nt_sequence columns
|
|
353
|
+
merged_df = pd.merge(
|
|
354
|
+
plate_df, filtered_df, on=["Plate", "Well"], how="left"
|
|
355
|
+
)
|
|
356
|
+
columns_order = (
|
|
357
|
+
["Plate", "Well", "Row", "Column", "amino-acid_substitutions"]
|
|
358
|
+
+ product
|
|
359
|
+
+ ["nt_sequence", "aa_sequence"]
|
|
360
|
+
)
|
|
361
|
+
merged_df = merged_df[columns_order]
|
|
362
|
+
processed_data.append(merged_df)
|
|
363
|
+
|
|
364
|
+
# Concatenate all dataframes if available
|
|
365
|
+
if processed_data:
|
|
366
|
+
processed_df = pd.concat(processed_data, ignore_index=True)
|
|
367
|
+
else:
|
|
368
|
+
processed_df = pd.DataFrame(
|
|
369
|
+
columns=["Plate", "Well", "Row", "Column", "amino-acid_substitutions"]
|
|
370
|
+
+ product
|
|
371
|
+
+ ["nt_sequence", "aa_sequence"]
|
|
372
|
+
)
|
|
373
|
+
|
|
374
|
+
# Ensure all entries in 'Mutations' are treated as strings
|
|
375
|
+
processed_df["amino-acid_substitutions"] = processed_df["amino-acid_substitutions"].astype(str)
|
|
376
|
+
|
|
377
|
+
# Remove any rows with empty values
|
|
378
|
+
processed_df = processed_df.dropna()
|
|
379
|
+
|
|
380
|
+
# Return the processed DataFrame for downstream processes
|
|
381
|
+
return processed_df
|
|
382
|
+
|
|
383
|
+
|
|
307
384
|
# Function to process the plate files
|
|
308
385
|
def process_plate_files(product: str, input_csv: str) -> pd.DataFrame:
|
|
309
386
|
|
|
@@ -1146,8 +1223,10 @@ def gen_seqfitvis(
|
|
|
1146
1223
|
):
|
|
1147
1224
|
|
|
1148
1225
|
# normalized per plate to parent
|
|
1149
|
-
|
|
1150
|
-
|
|
1226
|
+
if isinstance(seqfit_path, str):
|
|
1227
|
+
df = pd.read_csv(seqfit_path)
|
|
1228
|
+
else:
|
|
1229
|
+
df = seqfit_path
|
|
1151
1230
|
# ignore deletion meaning "Mutations" == "-"
|
|
1152
1231
|
df = df[df["amino-acid_substitutions"] != "-"].copy()
|
|
1153
1232
|
# count number of sites mutated and append mutation details
|
|
@@ -170,12 +170,12 @@ def make_well_df_from_reads(seqs, read_ids, read_quals):
|
|
|
170
170
|
seq_df = pd.DataFrame([list(s) for s in seqs]) # Convert each string to a list so that we get positions nicely
|
|
171
171
|
# Also add in the read_ids and sort by the quality to only take the highest quality one
|
|
172
172
|
seq_df['read_id'] = read_ids
|
|
173
|
-
#seq_df['read_qual'] = read_quals
|
|
174
173
|
seq_df['seqs'] = seqs
|
|
175
|
-
|
|
174
|
+
seq_df['read_qual'] = [0 if isinstance(r, str) else r for r in read_quals]
|
|
175
|
+
seq_df = seq_df.sort_values(by='read_qual', ascending=False)
|
|
176
176
|
# Should now be sorted by the highest quality
|
|
177
177
|
seq_df = seq_df.drop_duplicates(subset=['read_id'], keep='first')
|
|
178
|
-
return seq_df.drop(columns=['read_id', 'seqs'])
|
|
178
|
+
return seq_df.drop(columns=['read_id', 'seqs', 'read_qual'])
|
|
179
179
|
|
|
180
180
|
|
|
181
181
|
def calculate_mutation_significance_across_well(seq_df):
|
|
@@ -273,7 +273,7 @@ def alignment_from_cigar(cigar: str, alignment: str, ref: str, query_qualities:
|
|
|
273
273
|
ref_pos += op_len
|
|
274
274
|
return new_seq, ref_seq, qual, inserts
|
|
275
275
|
|
|
276
|
-
def get_reads_for_well(parent_name, bam_file_path: str, ref_str: str,
|
|
276
|
+
def get_reads_for_well(parent_name, bam_file_path: str, ref_str: str, msa_path=None):
|
|
277
277
|
"""
|
|
278
278
|
Rows are the reads, columns are the columns in the reference. Insertions are ignored.
|
|
279
279
|
"""
|
|
@@ -298,22 +298,19 @@ def get_reads_for_well(parent_name, bam_file_path: str, ref_str: str, min_covera
|
|
|
298
298
|
for i, insert in ins.items():
|
|
299
299
|
insert_map[i].append(insert)
|
|
300
300
|
read_ids.append(f'{read.query_name}')
|
|
301
|
-
read_quals.append(
|
|
301
|
+
read_quals.append(qual)
|
|
302
302
|
|
|
303
303
|
# Check if we want to write a MSA
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
# for i, seq in enumerate(seqs):
|
|
312
|
-
# fout.write(f'>{read_ids[i]}\n{"".join(seq)}\n')
|
|
313
|
-
# # Align using clustal for debugging if you need the adapter! Here you would change above to use a different version
|
|
314
|
-
# os.system(f'clustal-omega --force -i "{msa_path}" -o "{msa_path.replace(".fa", "_msa.fa")}"')
|
|
304
|
+
if msa_path is not None:
|
|
305
|
+
with open(msa_path, 'w+') as fout:
|
|
306
|
+
# Write the reference first
|
|
307
|
+
fout.write(f'>{parent_name}\n{ref_str}\n')
|
|
308
|
+
for i, seq in enumerate(seqs):
|
|
309
|
+
fout.write(f'>{read_ids[i]}\n{str(seq)}\n')
|
|
310
|
+
|
|
315
311
|
# Do this for all wells
|
|
316
312
|
seq_df = make_well_df_from_reads(seqs, read_ids, read_quals)
|
|
313
|
+
alignment_count = len(seq_df.values)
|
|
317
314
|
rows_all = make_row_from_read_pileup_across_well(seq_df, ref_str, parent_name, insert_map)
|
|
318
315
|
bam.close()
|
|
319
316
|
|
|
@@ -322,7 +319,7 @@ def get_reads_for_well(parent_name, bam_file_path: str, ref_str: str, min_covera
|
|
|
322
319
|
seq_df.columns = ['gene_name', 'position', 'ref', 'most_frequent', 'freq_non_ref', 'total_other',
|
|
323
320
|
'total_reads', 'p_value', 'percent_most_freq_mutation', 'A', 'p(a)', 'T', 'p(t)', 'G', 'p(g)',
|
|
324
321
|
'C', 'p(c)', 'N', 'p(n)', 'I', 'p(i)', 'Warnings']
|
|
325
|
-
return calculate_mutation_significance_across_well(seq_df),
|
|
322
|
+
return calculate_mutation_significance_across_well(seq_df), alignment_count
|
|
326
323
|
|
|
327
324
|
def make_row_from_read_pileup_across_well(well_df, ref_str, label, insert_map):
|
|
328
325
|
"""
|
|
@@ -339,13 +336,12 @@ def make_row_from_read_pileup_across_well(well_df, ref_str, label, insert_map):
|
|
|
339
336
|
|
|
340
337
|
# Dummy values that will be filled in later once we calculate the background error rate
|
|
341
338
|
warning = ''
|
|
342
|
-
if total_reads <
|
|
343
|
-
warning =
|
|
344
|
-
f'second sequencing method on this well.')
|
|
339
|
+
if total_reads < 20:
|
|
340
|
+
warning = f'WARNING: you had: {total_reads}, we recommend looking at the BAM file or using a second sequencing method on this well.'
|
|
345
341
|
# Check if there was an insert
|
|
346
|
-
if insert_map.get(col) and len(insert_map[col]) > total_reads/2: # i.e. at least half have the insert
|
|
342
|
+
if insert_map.get(col) and len(insert_map[col][0]) > total_reads/2: # i.e. at least half have the insert
|
|
347
343
|
if warning:
|
|
348
|
-
warning += '
|
|
344
|
+
warning += '\nINSERT'
|
|
349
345
|
else:
|
|
350
346
|
warning = f'WARNING: INSERT.'
|
|
351
347
|
rows.append([label, col, ref_seq, actual_seq, freq_non_ref, total_other, total_reads, 1.0, 0.0,
|
|
@@ -468,12 +464,19 @@ def get_variant_label_for_well(seq_df, threshold):
|
|
|
468
464
|
# Filter based on significance to determine whether there is a
|
|
469
465
|
non_refs = seq_df[seq_df['freq_non_ref'] > threshold].sort_values(by='position')
|
|
470
466
|
mixed_well = False
|
|
471
|
-
if
|
|
467
|
+
# Have section for inserts to check if they are > 50% of the reads
|
|
468
|
+
if seq_df['p(i) adj.'].min() < 0.05 and seq_df['I'].max() > len(seq_df) / 2:
|
|
469
|
+
label = '+'
|
|
470
|
+
probability = np.mean([1 - x for x in non_refs['freq_non_ref'].values])
|
|
471
|
+
combined_p_value = float("nan")
|
|
472
|
+
|
|
473
|
+
elif len(non_refs) > 0:
|
|
472
474
|
positions = non_refs['position'].values
|
|
473
475
|
refs = non_refs['ref'].values
|
|
474
476
|
label = [f'{refs[i]}{positions[i] + 1}{actual}' for i, actual in enumerate(non_refs['most_frequent'].values)]
|
|
475
477
|
# Check if it is a mixed well i.e. there were multiple with significant greater than 0.05
|
|
476
478
|
padj_vals = non_refs[['p(a) adj.', 'p(t) adj.', 'p(g) adj.', 'p(c) adj.', 'p(n) adj.', 'p(i) adj.']].values
|
|
479
|
+
|
|
477
480
|
for p in padj_vals:
|
|
478
481
|
c_sig = 0
|
|
479
482
|
for padj in p:
|
|
@@ -169,9 +169,9 @@ class VariantCaller:
|
|
|
169
169
|
self._align_sequences(row["Path"], row['Barcodes'])
|
|
170
170
|
|
|
171
171
|
# Placeholder function calls to demonstrate workflow
|
|
172
|
-
well_df, alignment_count = get_reads_for_well(self.experiment_name, bam_file,
|
|
172
|
+
well_df, alignment_count = get_reads_for_well(self.experiment_name, bam_file,
|
|
173
|
+
self.ref_str, f'{row["Path"]}/msa.fa')
|
|
173
174
|
self.variant_dict[barcode_id]['Alignment Count'] = alignment_count
|
|
174
|
-
|
|
175
175
|
if well_df is not None:
|
|
176
176
|
well_df.to_csv(f"{row['Path']}/seq_{barcode_id}.csv", index=False)
|
|
177
177
|
label, freq, combined_p_value, mixed_well, avg_error_rate = get_variant_label_for_well(well_df, threshold)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: levseq
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.5
|
|
4
4
|
Home-page: https://github.com/fhalab/levseq/
|
|
5
5
|
Author: Yueming Long, Emreay Gursoy, Ariane Mora, Francesca-Zhoufan Li
|
|
6
6
|
Author-email: ylong@caltech.edu
|
|
@@ -43,6 +43,7 @@ Requires-Dist: seaborn
|
|
|
43
43
|
Requires-Dist: scikit-learn
|
|
44
44
|
Requires-Dist: statsmodels
|
|
45
45
|
Requires-Dist: tqdm
|
|
46
|
+
Requires-Dist: biopandas
|
|
46
47
|
|
|
47
48
|
# Variant Sequencing with Nanopore
|
|
48
49
|
|
|
@@ -53,8 +54,7 @@ Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore tech
|
|
|
53
54
|
|
|
54
55
|
|
|
55
56
|
- Data to reproduce the results and to test are available on zenodo [](https://doi.org/10.5281/zenodo.13694463)
|
|
56
|
-
|
|
57
|
-
- A dockerized website and database for labs to locally host and visualize their data: website is available at: https://levseq.caltech.edu/ and code to host locally at: https://github.com/fhalab/LevSeq_VDB/
|
|
57
|
+
- A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://github.com/ArianeMora/LevSeq_vis/) and code to host locally at: https://github.com/fhalab/LevSeq_VDB/
|
|
58
58
|
|
|
59
59
|
## Setup
|
|
60
60
|
|
|
@@ -65,29 +65,19 @@ For setting up the experimental side of LevSeq we suggest the following preparat
|
|
|
65
65
|
|
|
66
66
|
## How to Use LevSeq
|
|
67
67
|
|
|
68
|
-
The wet lab part is detailed in the method section of the paper.
|
|
68
|
+
The wet lab part is detailed in the method section of the paper or via the [wiki](https://github.com/fhalab/LevSeq/wiki/Experimental-protocols).
|
|
69
69
|
|
|
70
70
|
Once samples are prepared, the multiplexed sample is used for sequencing, and the sequencing data is stored in the `../data` folder as per the typical Nanopore flow (refer to Nanopore documentation for this).
|
|
71
71
|
|
|
72
72
|
After sequencing, you can identify variants, demultiplex, and combine with your variant function here! For simple applications, we recommend using the notebook `example/Example.ipynb`.
|
|
73
73
|
|
|
74
|
-
###
|
|
75
|
-
|
|
76
|
-
1. **Basecalling**: This step converts Nanopore's FAST5 files to sequences. For basecalling, we use Nanopore's basecaller, Medaka, which can run in parallel with sequencing (recommended) or afterward.
|
|
77
|
-
|
|
78
|
-
2. **Demultiplexing**: After sequencing, the reads, stored as bulk FASTQ files, are sorted. During demultiplexing, each read is assigned to its correct plate/well combination and stored as a FASTQ file.
|
|
79
|
-
|
|
80
|
-
3. **Variant Calling**: For each sample, the consensus sequence is compared to the reference sequence. A variant is called if it differs from the reference sequence. The success of variant calling depends on the number of reads sequenced and their quality.
|
|
81
|
-
|
|
74
|
+
### Installation
|
|
82
75
|
|
|
83
|
-
|
|
76
|
+
We aimed to make LevSeq as simple to use as possible; this means you should be able to run it all using pip (note you need `samtools`
|
|
77
|
+
and `minimap2` installed on your path). However, if you have issues, we recommend using the Docker instance!
|
|
78
|
+
(The pip version doesn't work well on Mac M3, but Docker does.)
|
|
84
79
|
|
|
85
|
-
We
|
|
86
|
-
|
|
87
|
-
We recommend using command line interface(Terminal) and a conda environment for installation:
|
|
88
|
-
```
|
|
89
|
-
git clone https://github.com/fhalab/LevSeq.git
|
|
90
|
-
```
|
|
80
|
+
We recommend using a terminal and a conda environment for installation:
|
|
91
81
|
|
|
92
82
|
```
|
|
93
83
|
conda create --name levseq python=3.10 -y
|
|
@@ -97,11 +87,6 @@ conda create --name levseq python=3.10 -y
|
|
|
97
87
|
conda activate levseq
|
|
98
88
|
```
|
|
99
89
|
|
|
100
|
-
From the LevSeq folder, install the package using pip:
|
|
101
|
-
|
|
102
|
-
```
|
|
103
|
-
pip install levseq
|
|
104
|
-
```
|
|
105
90
|
#### Dependencies
|
|
106
91
|
|
|
107
92
|
1. Samtools: https://www.htslib.org/download/
|
|
@@ -111,30 +96,26 @@ conda install -c bioconda -c conda-forge samtools
|
|
|
111
96
|
or for mac users you can use: `brew install samtools`
|
|
112
97
|
|
|
113
98
|
2. Minimap2: https://github.com/lh3/minimap2
|
|
99
|
+
|
|
114
100
|
```
|
|
115
101
|
conda install -c bioconda -c conda-forge minimap2
|
|
116
102
|
```
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
For Mac M chip users: installation via homebrew
|
|
121
|
-
```
|
|
122
|
-
brew install gcc@14
|
|
123
|
-
brew install gcc@13
|
|
124
|
-
```
|
|
125
|
-
For Linux users: installation via conda
|
|
126
|
-
```
|
|
127
|
-
conda install conda-forge::gcc=14
|
|
128
|
-
conda install conda-forge::gcc=13
|
|
129
|
-
```
|
|
103
|
+
### Docker Installation (Recommended for full pipeline)
|
|
104
|
+
For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
|
|
105
|
+
operating system (https://docs.docker.com/engine/install/).
|
|
130
106
|
|
|
131
107
|
### Usage
|
|
132
|
-
#### Command Line Interface
|
|
133
|
-
LevSeq can be run using the command line interface. Here's the basic structure of the command:
|
|
134
108
|
|
|
109
|
+
#### Run via pip
|
|
135
110
|
```
|
|
136
111
|
levseq <name of the run you can make this whatever> <location to data folder> <location of reference csv file>
|
|
137
112
|
```
|
|
113
|
+
#### Run via docker
|
|
114
|
+
```
|
|
115
|
+
docker run --rm -v "$(pwd):/levseq_results" levseq <name> <location to data folder> <location of reference csv file>
|
|
116
|
+
```
|
|
117
|
+
See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb) for an example.
|
|
118
|
+
|
|
138
119
|
#### Required Arguments
|
|
139
120
|
1. Name of the experiment, this will be the name of the output folder
|
|
140
121
|
2. Location of basecalled fastq files, this is the direct output from using the MinKnow software for sequencing
|
|
@@ -149,37 +130,10 @@ levseq <name of the run you can make this whatever> <location to data folder> <l
|
|
|
149
130
|
|
|
150
131
|
--show\_msa Showing multiple sequence alignment for each well
|
|
151
132
|
|
|
152
|
-
|
|
153
|
-
For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
|
|
154
|
-
operating system (https://docs.docker.com/engine/install/).
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
To build the docker image run (within the main folder that contains the `Dockerfile`). Note building does **not** work
|
|
158
|
-
on Mac M3 chip, please use a ubuntu machine to build the docker image!
|
|
159
|
-
|
|
160
|
-
```
|
|
161
|
-
docker build -t levseq .
|
|
162
|
-
```
|
|
163
|
-
|
|
164
|
-
This gives us the access to the lebSeq command line interface via:
|
|
165
|
-
|
|
166
|
-
```
|
|
167
|
-
docker run levseq
|
|
168
|
-
```
|
|
169
|
-
Note! The docker image should work with linux, and mac, however, different mac architectures may have issues (owing to the different M1/M3 processers.)
|
|
170
|
-
|
|
171
|
-
Basically the -v connects a folder on your computer with the output from the minION sequencer with the docker image that will take these results and then perform
|
|
172
|
-
demultiplexing and variant calling.
|
|
173
|
-
|
|
174
|
-
docker run -v /disk1/ariane/vscode/LevSeq/manuscript/Data/20241116-YL-LevSeq-parlqep400-1-2-P25-28:/levseq_results/ levseq docker-test levseq_results/ levseq_results/LevSeq-T1.csv
|
|
175
|
-
```
|
|
176
|
-
docker run -v /Users/XXXX/Documents/LevSeq/data:/levseq_results/ levseq 20240502 levseq_results/20240502/ levseq_results/20240502-YL-ParLQ-ep2.csv
|
|
177
|
-
```
|
|
178
|
-
|
|
179
|
-
In this command: `/Users/XXXX/Documents/LevSeq/data` is a folder on your computer, which contains a subfolder `20240502`
|
|
133
|
+
Great, you should be all done!
|
|
180
134
|
|
|
181
|
-
|
|
135
|
+
For more details or troubleshooting, please look at our [computational_protocols](https://github.com/fhalab/LevSeq/wiki/Computational-protocols).
|
|
182
136
|
|
|
183
|
-
|
|
137
|
+
#### Citing
|
|
184
138
|
|
|
185
|
-
If you
|
|
139
|
+
If you have found LevSeq useful, please cite our [paper](https://doi.org/10.1101/2024.09.04.611255).
|
|
@@ -263,14 +263,14 @@ class TestVariantCalling(TestClass):
|
|
|
263
263
|
quals.append(100) # Dummy don't need
|
|
264
264
|
|
|
265
265
|
well_df = make_well_df_from_reads(reads, read_ids, quals)
|
|
266
|
-
rows_all = make_row_from_read_pileup_across_well(well_df, parent_sequence, parent_name)
|
|
266
|
+
rows_all = make_row_from_read_pileup_across_well(well_df, parent_sequence, parent_name, defaultdict(list))
|
|
267
267
|
well_df = pd.DataFrame(rows_all)
|
|
268
268
|
well_df.columns = ['gene_name', 'position', 'ref', 'most_frequent', 'freq_non_ref', 'total_other',
|
|
269
269
|
'total_reads', 'p_value', 'percent_most_freq_mutation', 'A', 'p(a)', 'T', 'p(t)', 'G',
|
|
270
270
|
'p(g)',
|
|
271
|
-
'C', 'p(c)', 'N', 'p(n)']
|
|
271
|
+
'C', 'p(c)', 'N', 'p(n)', 'I', 'p(i)', 'Warning']
|
|
272
272
|
well_df = calculate_mutation_significance_across_well(well_df)
|
|
273
|
-
label, frequency, combined_p_value, mixed_well = get_variant_label_for_well(well_df, 0.5)
|
|
273
|
+
label, frequency, combined_p_value, mixed_well, mean_mutation_rate = get_variant_label_for_well(well_df, 0.5)
|
|
274
274
|
# This should be mutated at 100% - the rate of our sequencing errror
|
|
275
275
|
u.dp([f"Input parent: {parent_sequence}", f"Variant: {mutant}"])
|
|
276
276
|
u.dp(["label", label, f"frequency", frequency, f"combined_p_value", combined_p_value, "mixed_well", mixed_well])
|
|
@@ -280,6 +280,38 @@ class TestVariantCalling(TestClass):
|
|
|
280
280
|
assert combined_p_value < 0.05
|
|
281
281
|
assert mixed_well is False
|
|
282
282
|
|
|
283
|
+
|
|
284
|
+
def test_calling_variant_with_insert(self):
|
|
285
|
+
u.dp(["Testing calling variants using SSM with error"])
|
|
286
|
+
|
|
287
|
+
parent_sequence = "ATGAGT"
|
|
288
|
+
mutated_sequence = 'ATGAGT' # Not actually mutated
|
|
289
|
+
parent_name = 'parent'
|
|
290
|
+
reads = []
|
|
291
|
+
read_ids = []
|
|
292
|
+
quals = []
|
|
293
|
+
insert_map = defaultdict(list)
|
|
294
|
+
for i in range(0, 30):
|
|
295
|
+
read_ids.append(f'read_{i}')
|
|
296
|
+
reads.append(mutated_sequence)
|
|
297
|
+
insert_map[1].append('C') # Making them all have an insert at C
|
|
298
|
+
quals.append(100) # Dummy don't need
|
|
299
|
+
|
|
300
|
+
well_df = make_well_df_from_reads(reads, read_ids, quals)
|
|
301
|
+
rows_all = make_row_from_read_pileup_across_well(well_df, parent_sequence, parent_name, insert_map)
|
|
302
|
+
well_df = pd.DataFrame(rows_all)
|
|
303
|
+
well_df.columns = ['gene_name', 'position', 'ref', 'most_frequent', 'freq_non_ref', 'total_other',
|
|
304
|
+
'total_reads', 'p_value', 'percent_most_freq_mutation', 'A', 'p(a)', 'T', 'p(t)', 'G',
|
|
305
|
+
'p(g)',
|
|
306
|
+
'C', 'p(c)', 'N', 'p(n)', 'I', 'p(i)', 'Warning']
|
|
307
|
+
print(well_df['I'].describe())
|
|
308
|
+
well_df = calculate_mutation_significance_across_well(well_df)
|
|
309
|
+
label, frequency, combined_p_value, mixed_well, mean_mutation_rate = get_variant_label_for_well(well_df, 0.5)
|
|
310
|
+
# This should be mutated at 100% - the rate of our sequencing error
|
|
311
|
+
u.dp([f"Input parent: {parent_sequence}"])
|
|
312
|
+
u.dp(["label", label, f"frequency", frequency, f"combined_p_value", combined_p_value, "mixed_well", mixed_well])
|
|
313
|
+
assert label == '+'
|
|
314
|
+
|
|
283
315
|
def test_mixed_wells(self):
|
|
284
316
|
# Test whether we're able to call mixed well populations
|
|
285
317
|
u.dp(["Testing ePCR with error"])
|
|
@@ -303,9 +335,19 @@ class TestVariantCalling(TestClass):
|
|
|
303
335
|
def test_variant_calling_main(self):
|
|
304
336
|
cl_args = {'skip_demultiplexing': True, 'skip_variantcalling': False}
|
|
305
337
|
cl_args["name"] = 'Laragen_Validation'
|
|
306
|
-
cl_args["output"] = '
|
|
307
|
-
cl_args['path'] = '
|
|
308
|
-
cl_args["summary"] = '
|
|
338
|
+
cl_args["output"] = '/Users/arianemora/Documents/code/MinION/manuscript/Data/'
|
|
339
|
+
cl_args['path'] = '/Users/arianemora/Documents/code/MinION/manuscript/Data/20241116-YL-LevSeq-parlqep400-1-2-P25-28/no_sample_id/'
|
|
340
|
+
cl_args["summary"] = '/Users/arianemora/Documents/code/MinION/manuscript/Data/20241116-YL-LevSeq-parlqep400-1-2-P25-28/LevSeq-T1.csv'
|
|
341
|
+
variant_df = process_ref_csv(cl_args)
|
|
342
|
+
variant_df.to_csv('variant_NEW.csv', index=False)
|
|
343
|
+
print(variant_df.head())
|
|
344
|
+
|
|
345
|
+
def test_variant_calling_for_LevSeq(self):
|
|
346
|
+
cl_args = {'skip_demultiplexing': True, 'skip_variantcalling': False}
|
|
347
|
+
cl_args["name"] = 'parLQ_20240502'
|
|
348
|
+
cl_args["output"] = '/Users/arianemora/Documents/code/LevSeq/data/epPCR/epPCR_main_manuscript/ParLQ-ep2/'
|
|
349
|
+
cl_args['path'] = '/Users/arianemora/Documents/code/LevSeq/data/epPCR/epPCR_main_manuscript/ParLQ-ep2/'
|
|
350
|
+
cl_args["summary"] = '/Users/arianemora/Documents/code/LevSeq/data/epPCR/epPCR_main_manuscript/ParLQ-ep2/20240502-YL-ParLQ-ep2.csv'
|
|
309
351
|
variant_df = process_ref_csv(cl_args)
|
|
310
352
|
variant_df.to_csv('variant_NEW.csv', index=False)
|
|
311
353
|
print(variant_df.head())
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|