levseq 1.2.1__tar.gz → 1.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. {levseq-1.2.1/levseq.egg-info → levseq-1.2.6}/PKG-INFO +60 -71
  2. levseq-1.2.6/README.md +127 -0
  3. {levseq-1.2.1 → levseq-1.2.6}/levseq/__init__.py +1 -1
  4. levseq-1.2.6/levseq/coordinates.py +76 -0
  5. {levseq-1.2.1 → levseq-1.2.6}/levseq/run_levseq.py +63 -52
  6. {levseq-1.2.1 → levseq-1.2.6}/levseq/seqfit.py +601 -103
  7. {levseq-1.2.1 → levseq-1.2.6}/levseq/utils.py +26 -23
  8. {levseq-1.2.1 → levseq-1.2.6}/levseq/variantcaller.py +2 -2
  9. {levseq-1.2.1 → levseq-1.2.6}/levseq/visualization.py +4 -4
  10. {levseq-1.2.1 → levseq-1.2.6/levseq.egg-info}/PKG-INFO +60 -71
  11. {levseq-1.2.1 → levseq-1.2.6}/levseq.egg-info/SOURCES.txt +1 -0
  12. {levseq-1.2.1 → levseq-1.2.6}/levseq.egg-info/requires.txt +1 -0
  13. {levseq-1.2.1 → levseq-1.2.6}/setup.py +2 -1
  14. {levseq-1.2.1 → levseq-1.2.6}/tests/test_variant_calling.py +48 -6
  15. levseq-1.2.1/README.md +0 -139
  16. {levseq-1.2.1 → levseq-1.2.6}/LICENSE +0 -0
  17. {levseq-1.2.1 → levseq-1.2.6}/MANIFEST.in +0 -0
  18. {levseq-1.2.1 → levseq-1.2.6}/levseq/IO_processor.py +0 -0
  19. {levseq-1.2.1 → levseq-1.2.6}/levseq/barcoding/__init__.py +0 -0
  20. {levseq-1.2.1 → levseq-1.2.6}/levseq/barcoding/demultiplex +0 -0
  21. {levseq-1.2.1 → levseq-1.2.6}/levseq/barcoding/demultiplex-arm64 +0 -0
  22. {levseq-1.2.1 → levseq-1.2.6}/levseq/barcoding/demultiplex-x86 +0 -0
  23. {levseq-1.2.1 → levseq-1.2.6}/levseq/barcoding/minion_barcodes.fasta +0 -0
  24. {levseq-1.2.1 → levseq-1.2.6}/levseq/basecaller.py +0 -0
  25. {levseq-1.2.1 → levseq-1.2.6}/levseq/cmd.py +0 -0
  26. {levseq-1.2.1 → levseq-1.2.6}/levseq/globals.py +0 -0
  27. {levseq-1.2.1 → levseq-1.2.6}/levseq/interface.py +0 -0
  28. {levseq-1.2.1 → levseq-1.2.6}/levseq/parser.py +0 -0
  29. {levseq-1.2.1 → levseq-1.2.6}/levseq/screen.py +0 -0
  30. {levseq-1.2.1 → levseq-1.2.6}/levseq/simulation.py +0 -0
  31. {levseq-1.2.1 → levseq-1.2.6}/levseq/user.py +0 -0
  32. {levseq-1.2.1 → levseq-1.2.6}/levseq.egg-info/dependency_links.txt +0 -0
  33. {levseq-1.2.1 → levseq-1.2.6}/levseq.egg-info/entry_points.txt +0 -0
  34. {levseq-1.2.1 → levseq-1.2.6}/levseq.egg-info/top_level.txt +0 -0
  35. {levseq-1.2.1 → levseq-1.2.6}/setup.cfg +0 -0
  36. {levseq-1.2.1 → levseq-1.2.6}/tests/test_demultiplex_docker.py +0 -0
  37. {levseq-1.2.1 → levseq-1.2.6}/tests/test_opligopools.py +0 -0
  38. {levseq-1.2.1 → levseq-1.2.6}/tests/test_seqfitvis.py +0 -0
  39. {levseq-1.2.1 → levseq-1.2.6}/tests/test_seqs.py +0 -0
  40. {levseq-1.2.1 → levseq-1.2.6}/tests/test_statistics.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: levseq
3
- Version: 1.2.1
3
+ Version: 1.2.6
4
4
  Home-page: https://github.com/fhalab/levseq/
5
5
  Author: Yueming Long, Emreay Gursoy, Ariane Mora, Francesca-Zhoufan Li
6
6
  Author-email: ylong@caltech.edu
@@ -43,6 +43,7 @@ Requires-Dist: seaborn
43
43
  Requires-Dist: scikit-learn
44
44
  Requires-Dist: statsmodels
45
45
  Requires-Dist: tqdm
46
+ Requires-Dist: biopandas
46
47
 
47
48
  # Variant Sequencing with Nanopore
48
49
 
@@ -53,8 +54,7 @@ Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore tech
53
54
 
54
55
 
55
56
  - Data to reproduce the results and to test are available on zenodo [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13694463.svg)](https://doi.org/10.5281/zenodo.13694463)
56
-
57
- - A dockerized website and database for labs to locally host and visualize their data: website is available at: https://levseq.caltech.edu/ and code to host locally at: https://github.com/fhalab/LevSeq_VDB/
57
+ - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://github.com/ArianeMora/LevSeq_vis/) and code to host locally at: https://github.com/fhalab/LevSeq_VDB/
58
58
 
59
59
  ## Setup
60
60
 
@@ -65,76 +65,92 @@ For setting up the experimental side of LevSeq we suggest the following preparat
65
65
 
66
66
  ## How to Use LevSeq
67
67
 
68
- The wet lab part is detailed in the method section of the paper.
68
+ The wet lab part is detailed in the method section of the paper or via the [wiki](https://github.com/fhalab/LevSeq/wiki/Experimental-protocols).
69
69
 
70
70
  Once samples are prepared, the multiplexed sample is used for sequencing, and the sequencing data is stored in the `../data` folder as per the typical Nanopore flow (refer to Nanopore documentation for this).
71
71
 
72
72
  After sequencing, you can identify variants, demultiplex, and combine with your variant function here! For simple applications, we recommend using the notebook `example/Example.ipynb`.
73
73
 
74
- ### Steps of LevSeq:
75
-
76
- 1. **Basecalling**: This step converts Nanopore's FAST5 files to sequences. For basecalling, we use Nanopore's basecaller, Medaka, which can run in parallel with sequencing (recommended) or afterward.
77
-
78
- 2. **Demultiplexing**: After sequencing, the reads, stored as bulk FASTQ files, are sorted. During demultiplexing, each read is assigned to its correct plate/well combination and stored as a FASTQ file.
79
-
80
- 3. **Variant Calling**: For each sample, the consensus sequence is compared to the reference sequence. A variant is called if it differs from the reference sequence. The success of variant calling depends on the number of reads sequenced and their quality.
74
+ ### Installation
81
75
 
76
+ We aimed to make LevSeq as simple to use as possible, this means you should be able to run it all using pip (note you need `samtools`
77
+ and `minimap2` installed on your path). However, if you have issues we recommend using the Docker instance!
78
+ (the pip version doesn't work well with mac M3 but docker does.)
82
79
 
83
- ### Installation:
84
-
85
- We aimed to make LevSeq as simple to use as possible, this means you should be able to run it all using pip. However, if you have issues we recomend using the Docker instance!
86
-
87
- We recommend using command line interface(Terminal) and a conda environment for installation:
88
- ```
89
- git clone https://github.com/fhalab/LevSeq.git
90
- ```
80
+ We recommend using terminal and a conda environment for installation:
91
81
 
92
82
  ```
93
- conda create --name levseq python=3.10 -y
83
+ conda create --name levseq python=3.12 -y
94
84
  ```
95
85
 
96
86
  ```
97
87
  conda activate levseq
98
88
  ```
99
89
 
100
- From the LevSeq folder, install the package using pip:
101
-
102
- ```
103
- pip install levseq
104
- ```
105
90
  #### Dependencies
106
91
 
107
92
  1. Samtools: https://www.htslib.org/download/
108
93
  ```
109
94
  conda install -c bioconda -c conda-forge samtools
110
95
  ```
111
- or for mac users you can use: `brew install samtools`
96
+
112
97
 
113
98
  2. Minimap2: https://github.com/lh3/minimap2
99
+
114
100
  ```
115
101
  conda install -c bioconda -c conda-forge minimap2
116
102
  ```
117
- or for mac users you can use: `brew install minimap2`
118
- Once dependencies are all installed, you can run LevSeq using command line.
119
- 3. GCC version 13 and 14 are both needed
120
- For Mac M chip users: installation via homebrew
103
+ ### Docker Installation (Recommended for full pipeline)
104
+ For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
105
+ operating system (https://docs.docker.com/engine/install/).
106
+
107
+ ### Usage
108
+
109
+ #### Run via pip
110
+ ```
111
+ levseq <name of the run you can make this whatever> <location to data folder> <location of reference csv file>
112
+ ```
113
+
114
+ #### Run via docker
115
+ If using linux system
121
116
  ```
122
- brew install gcc@14
123
- brew install gcc@13
117
+ docker pull yueminglong/levseq:levseq-1.2.5-x86
124
118
  ```
125
- For Linux users: installation via conda
119
+ If using Mac M chips (image tested on M1, M3, and M4)
126
120
  ```
127
- conda install conda-forge::gcc=14
128
- conda install conda-forge::gcc=13
121
+ docker pull yueminglong/levseq:levseq-1.2.5-arm64
129
122
  ```
130
123
 
131
- ### Usage
132
- #### Command Line Interface
133
- LevSeq can be run using the command line interface. Here's the basic structure of the command:
134
-
135
124
  ```
136
- levseq <name of the run you can make this whatever> <location to data folder> <location of reference csv file>
125
+ docker run --rm -v "$(pwd):/levseq_results" yueminglong/levseq:levseq-1.2.5-<architecture> <name> <location to data folder> <location of reference csv file>
137
126
  ```
127
+ Explanation:
128
+
129
+ --rm: Automatically removes the container after the command finishes.
130
+
131
+ -v "$(pwd):/levseq\_results": Mounts the current directory ($(pwd)) to /levseq\_results inside the container, ensuring the results are saved to your current directory.
132
+
133
+ yueminglong/levseq:levseq-1.2.5-\<architecture\>: Specifies the Docker image to run. Replace \<architecture\> with the appropriate platform (e.g., x86).
134
+
135
+ \<name\>: The name or identifier for the analysis.
136
+
137
+ \<location to data folder\>: Path to the folder containing input data.
138
+
139
+ \<location of reference csv file\>: Path to the reference .csv file.
140
+
141
+ Important Notes:
142
+
143
+ If the current directory is mounted to the container (via -v "$(pwd):/levseq\_results"), the basecalled result in FASTQ format and the ref.csv file must be located in the current directory.
144
+
145
+ If these files are not present in the current directory, they will not be processed by the tool.
146
+
147
+ Output:
148
+
149
+ The results of the analysis will be saved to your current working directory.
150
+
151
+ See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb) for an example.
152
+ *Note: if using docker, the html and csv final output will be saved in the directory that you are running from instead of in the Platemaps or Results subfolder.*
153
+
138
154
  #### Required Arguments
139
155
  1. Name of the experiment, this will be the name of the output folder
140
156
  2. Location of basecalled fastq files, this is the direct output from using the MinKnow software for sequencing
@@ -149,37 +165,10 @@ levseq <name of the run you can make this whatever> <location to data folder> <l
149
165
 
150
166
  --show\_msa Showing multiple sequence alignment for each well
151
167
 
152
- ### Docker Installation (Recommended for full pipeline)
153
- For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
154
- operating system (https://docs.docker.com/engine/install/).
155
-
156
-
157
- To build the docker image run (within the main folder that contains the `Dockerfile`). Note building does **not** work
158
- on Mac M3 chip, please use a ubuntu machine to build the docker image!
159
-
160
- ```
161
- docker build -t levseq .
162
- ```
163
-
164
- This gives us the access to the lebSeq command line interface via:
165
-
166
- ```
167
- docker run levseq
168
- ```
169
- Note! The docker image should work with linux, and mac, however, different mac architectures may have issues (owing to the different M1/M3 processers.)
170
-
171
- Basically the -v connects a folder on your computer with the output from the minION sequencer with the docker image that will take these results and then perform
172
- demultiplexing and variant calling.
173
-
174
- docker run -v /disk1/ariane/vscode/LevSeq/manuscript/Data/20241116-YL-LevSeq-parlqep400-1-2-P25-28:/levseq_results/ levseq docker-test levseq_results/ levseq_results/LevSeq-T1.csv
175
- ```
176
- docker run -v /Users/XXXX/Documents/LevSeq/data:/levseq_results/ levseq 20240502 levseq_results/20240502/ levseq_results/20240502-YL-ParLQ-ep2.csv
177
- ```
178
-
179
- In this command: `/Users/XXXX/Documents/LevSeq/data` is a folder on your computer, which contains a subfolder `20240502`
168
+ Great, you should be all done!
180
169
 
181
- ### Issues and Troubleshooting
170
+ For more details or troubleshooting, please look at our [computational_protocols](https://github.com/fhalab/LevSeq/wiki/Computational-protocols).
182
171
 
183
- If you have any issues, please check the LevSeq\_error.log find in the output direectory and report the issue. If the problem persists, please open an issue on the GitHub repository with the error details.
172
+ #### Citing
184
173
 
185
- If you solve something code wise, submit a pull request! We would love community input.
174
+ If you have found LevSeq useful, please cite our [paper](https://doi.org/10.1101/2024.09.04.611255).
levseq-1.2.6/README.md ADDED
@@ -0,0 +1,127 @@
1
+ # Variant Sequencing with Nanopore
2
+
3
+ In directed evolution, sequencing every variant enhances data insight and creates datasets suitable for AI/ML methods. This method is presented as an extension of the original Every Variant Sequencer using Illumina technology. With this approach, sequence variants can be generated within a day at an extremely low cost.
4
+
5
+ ![Figure 1: LevSeq Workflow](manuscript/figures/LevSeq_Figure-1.png)
6
+ Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
7
+
8
+
9
+ - Data to reproduce the results and to test are available on zenodo [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13694463.svg)](https://doi.org/10.5281/zenodo.13694463)
10
+ - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://github.com/ArianeMora/LevSeq_vis/) and code to host locally at: https://github.com/fhalab/LevSeq_VDB/
11
+
12
+ ## Setup
13
+
14
+ For setting up the experimental side of LevSeq we suggest the following preparations:
15
+
16
+ - Order forward and reverse primers compatible with the desired plasmid, see methods section of [our paper](http://biorxiv.org/cgi/content/short/2024.09.04.611255v1?rss=1).
17
+ - Successfully install Oxford Nanopore's software (this is only needed if you are doing basecalling/MinION processing). [Link to installation guide](https://nanoporetech.com/).
18
+
19
+ ## How to Use LevSeq
20
+
21
+ The wet lab part is detailed in the method section of the paper or via the [wiki](https://github.com/fhalab/LevSeq/wiki/Experimental-protocols).
22
+
23
+ Once samples are prepared, the multiplexed sample is used for sequencing, and the sequencing data is stored in the `../data` folder as per the typical Nanopore flow (refer to Nanopore documentation for this).
24
+
25
+ After sequencing, you can identify variants, demultiplex, and combine with your variant function here! For simple applications, we recommend using the notebook `example/Example.ipynb`.
26
+
27
+ ### Installation
28
+
29
+ We aimed to make LevSeq as simple to use as possible, this means you should be able to run it all using pip (note you need `samtools`
30
+ and `minimap2` installed on your path). However, if you have issues we recommend using the Docker instance!
31
+ (the pip version doesn't work well with mac M3 but docker does.)
32
+
33
+ We recommend using terminal and a conda environment for installation:
34
+
35
+ ```
36
+ conda create --name levseq python=3.12 -y
37
+ ```
38
+
39
+ ```
40
+ conda activate levseq
41
+ ```
42
+
43
+ #### Dependencies
44
+
45
+ 1. Samtools: https://www.htslib.org/download/
46
+ ```
47
+ conda install -c bioconda -c conda-forge samtools
48
+ ```
49
+
50
+
51
+ 2. Minimap2: https://github.com/lh3/minimap2
52
+
53
+ ```
54
+ conda install -c bioconda -c conda-forge minimap2
55
+ ```
56
+ ### Docker Installation (Recommended for full pipeline)
57
+ For installing the whole pipeline, you'll need to use the docker image. For this, install docker as required for your
58
+ operating system (https://docs.docker.com/engine/install/).
59
+
60
+ ### Usage
61
+
62
+ #### Run via pip
63
+ ```
64
+ levseq <name of the run you can make this whatever> <location to data folder> <location of reference csv file>
65
+ ```
66
+
67
+ #### Run via docker
68
+ If using linux system
69
+ ```
70
+ docker pull yueminglong/levseq:levseq-1.2.5-x86
71
+ ```
72
+ If using Mac M chips (image tested on M1, M3, and M4)
73
+ ```
74
+ docker pull yueminglong/levseq:levseq-1.2.5-arm64
75
+ ```
76
+
77
+ ```
78
+ docker run --rm -v "$(pwd):/levseq_results" yueminglong/levseq:levseq-1.2.5-<architecture> <name> <location to data folder> <location of reference csv file>
79
+ ```
80
+ Explanation:
81
+
82
+ --rm: Automatically removes the container after the command finishes.
83
+
84
+ -v "$(pwd):/levseq\_results": Mounts the current directory ($(pwd)) to /levseq\_results inside the container, ensuring the results are saved to your current directory.
85
+
86
+ yueminglong/levseq:levseq-1.2.5-\<architecture\>: Specifies the Docker image to run. Replace \<architecture\> with the appropriate platform (e.g., x86).
87
+
88
+ \<name\>: The name or identifier for the analysis.
89
+
90
+ \<location to data folder\>: Path to the folder containing input data.
91
+
92
+ \<location of reference csv file\>: Path to the reference .csv file.
93
+
94
+ Important Notes:
95
+
96
+ If the current directory is mounted to the container (via -v "$(pwd):/levseq\_results"), the basecalled result in FASTQ format and the ref.csv file must be located in the current directory.
97
+
98
+ If these files are not present in the current directory, they will not be processed by the tool.
99
+
100
+ Output:
101
+
102
+ The results of the analysis will be saved to your current working directory.
103
+
104
+ See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb) for an example.
105
+ *Note: if using docker, the html and csv final output will be saved in the directory that you are running from instead of in the Platemaps or Results subfolder.*
106
+
107
+ #### Required Arguments
108
+ 1. Name of the experiment, this will be the name of the output folder
109
+ 2. Location of basecalled fastq files, this is the direct output from using the MinKnow software for sequencing
110
+ 3. Location of reference csv file, this file includes information of barcodes used for each plate and the DNA sequence used for reference for each plate
111
+
112
+ #### Optional Arguments
113
+ --skip\_demultiplexing If enabled, demultiplexing step will be skipped
114
+
115
+ --skip\_variantcalling If enabled, variant calling step will be skipped
116
+
117
+ --output Save location for output, if not provided default to where the program is executed
118
+
119
+ --show\_msa Showing multiple sequence alignment for each well
120
+
121
+ Great, you should be all done!
122
+
123
+ For more details or troubleshooting, please look at our [computational_protocols](https://github.com/fhalab/LevSeq/wiki/Computational-protocols).
124
+
125
+ #### Citing
126
+
127
+ If you have found LevSeq useful, please cite our [paper](https://doi.org/10.1101/2024.09.04.611255).
@@ -18,7 +18,7 @@
18
18
  __title__ = 'levseq'
19
19
  __description__ = 'LevSeq nanopore sequencing'
20
20
  __url__ = 'https://github.com/fhalab/levseq/'
21
- __version__ = '1.2.1'
21
+ __version__ = '1.2.6'
22
22
  __author__ = 'Yueming Long, Emreay Gursoy, Ariane Mora, Francesca-Zhoufan Li'
23
23
  __author_email__ = 'ylong@caltech.edu'
24
24
  __license__ = 'GPL3'
@@ -0,0 +1,76 @@
1
+ import esm
2
+ import torch
3
+ import pandas as pd
4
+ from sklearn.decomposition import PCA
5
+ import os
6
+ import argparse
7
+
8
+ def preprocess_sequence(sequence):
9
+ """
10
+ Preprocesses the amino acid sequence by removing everything after the first '*' (stop codon).
11
+ """
12
+ if '*' in sequence:
13
+ sequence = sequence.split('*')[0] # Take everything before the first '*'
14
+ return sequence
15
+
16
+ def process_file(input_file, output_file=None):
17
+ # Load the dataset
18
+ data = pd.read_csv(input_file)
19
+
20
+ # Remove the "Unnamed: 0" column if it exists
21
+ if 'Unnamed: 0' in data.columns:
22
+ data = data.drop(columns=['Unnamed: 0'])
23
+
24
+ # Create the ID column as the combination of `Plate` and `Well`
25
+ data['ID'] = data['Plate'] + '-' + data['Well']
26
+ data = data[['ID'] + [col for col in data.columns if col != 'ID']] # Reorder to make ID the first column
27
+
28
+ # Filter valid sequences from the `aa_sequence` column
29
+ valid_sequences = data['aa_sequence'].dropna()
30
+ valid_sequences = valid_sequences[~valid_sequences.str.contains('#N.A.#|Deletion')]
31
+
32
+ # Preprocess sequences to handle stop codons
33
+ valid_sequences = valid_sequences.apply(preprocess_sequence)
34
+
35
+ # Load the ESM-2 model
36
+ model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
37
+ batch_converter = alphabet.get_batch_converter()
38
+
39
+ # Prepare sequences for embedding
40
+ sequences = valid_sequences.tolist()
41
+ sequence_names = [f"Sequence {i}" for i in range(len(sequences))]
42
+ batch_labels, batch_strs, batch_tokens = batch_converter(list(zip(sequence_names, sequences)))
43
+
44
+ # Extract embeddings
45
+ with torch.no_grad():
46
+ results = model(batch_tokens, repr_layers=[33])
47
+ embeddings = results["representations"][33] # Use the top (last) layer representations
48
+
49
+ # Average embeddings across residues for sequence-level representation
50
+ sequence_embeddings = embeddings.mean(1).numpy()
51
+
52
+ # Dimensionality Reduction using PCA
53
+ pca = PCA(n_components=2)
54
+ xy_coordinates = pca.fit_transform(sequence_embeddings)
55
+
56
+ # Add x, y coordinates back to the dataframe
57
+ xy_df = pd.DataFrame(xy_coordinates, columns=['x_coordinate', 'y_coordinate'], index=valid_sequences.index)
58
+ data = pd.concat([data, xy_df], axis=1)
59
+
60
+ # Determine output file location
61
+ if output_file is None:
62
+ input_name, input_ext = os.path.splitext(input_file)
63
+ output_file = f"{input_name}_xy{input_ext}"
64
+
65
+ # Save the updated dataframe to a file
66
+ data.to_csv(output_file, index=False)
67
+ print(f"Processed data with x, y coordinates saved to: {output_file}")
68
+
69
+ if __name__ == "__main__":
70
+ parser = argparse.ArgumentParser(description="Generate x, y coordinates for amino acid sequences")
71
+ parser.add_argument('input_file', type=str, help="Path to the input CSV file")
72
+ parser.add_argument('--output_file', type=str, default=None, help="Path to save the output CSV file (optional)")
73
+ args = parser.parse_args()
74
+
75
+ process_file(args.input_file, args.output_file)
76
+
@@ -243,6 +243,19 @@ def call_variant(experiment_name, experiment_folder, template_fasta, filtered_ba
243
243
  except Exception as e:
244
244
  logging.error("Variant calling failed", exc_info=True)
245
245
  raise
246
+
247
+ def assign_alignment_probability(row):
248
+ if row["Variant"] == "#PARENT#":
249
+ if row["Alignment Count"] > 20:
250
+ return 1
251
+ elif 10 <= row["Alignment Count"] <= 20:
252
+ return (row["Alignment Count"] - 10) / 10 # Ranges from 0 to 1 linearly
253
+ else:
254
+ return 0
255
+ else:
256
+ return row["Average mutation frequency"]
257
+
258
+
246
259
  # Full version of create_df_v function
247
260
  def create_df_v(variants_df):
248
261
  # Make copy of dataframe
@@ -258,28 +271,19 @@ def create_df_v(variants_df):
258
271
 
259
272
  # Translate nc_variant to aa_variant
260
273
  df_variants_["aa_variant"] = df_variants_["nc_variant"].apply(
261
- lambda x: x if x in ["Deletion", "#N.A.#"] else translate(x)
274
+ lambda x: x if x in ["Deletion", "#N.A.#", 'Insertion'] else translate(x)
262
275
  )
263
276
  # Fill in 'Deletion' in 'aa_variant' column
264
277
  df_variants_.loc[
265
278
  df_variants_["nc_variant"] == "Deletion", "aa_variant"
266
279
  ] = "Deletion"
280
+ df_variants_.loc[
281
+ df_variants_["nc_variant"] == "Insertion", "aa_variant"
282
+ ] = "Insertion"
267
283
 
268
284
  # Compare aa_variant with translated refseq and generate Substitutions column
269
285
  df_variants_["Substitutions"] = df_variants_.apply(get_mutations, axis=1)
270
-
271
286
  # Adding sequence quality to Alignment Probability before filling in empty values
272
- def assign_alignment_probability(row):
273
- if row["Variant"] == "#PARENT#":
274
- if row["Alignment Count"] > 20:
275
- return 1
276
- elif 10 <= row["Alignment Count"] <= 20:
277
- return (row["Alignment Count"] - 10) / 10 # Ranges from 0 to 1 linearly
278
- else:
279
- return 0
280
- else:
281
- return row["Average mutation frequency"]
282
-
283
287
  df_variants_["Alignment Probability"] = df_variants_.apply(assign_alignment_probability, axis=1)
284
288
  df_variants_["Alignment Probability"] = df_variants_["Alignment Probability"].fillna(0.0)
285
289
  df_variants_["Alignment Count"] = df_variants_["Alignment Count"].fillna(0.0)
@@ -291,7 +295,9 @@ def create_df_v(variants_df):
291
295
  elif df_variants_["nc_variant"].iloc[i] == "#N.A.#":
292
296
  df_variants_.Substitutions.iat[i] = "#N.A.#"
293
297
 
294
-
298
+ # Low read counts override low mutations
299
+ df_variants_["Substitutions"] = ["#LOW#" if a < 10 and a > 0 else s for a, s in df_variants_[["Alignment Count", "Substitutions"]].values]
300
+
295
301
  # Add row and columns
296
302
  Well = df_variants_["Well"].tolist()
297
303
  row = []
@@ -315,32 +321,36 @@ def create_df_v(variants_df):
315
321
  df_variants_["Plate"] = df_variants_["Plate"].apply(
316
322
  lambda x: f"0{x}" if len(x) == 1 else x
317
323
  )
318
- # Rename columns as per the request
324
+
325
+ # First rename columns as before
319
326
  df_variants_.rename(columns={
320
- "Variant": "nucleotide_mutation",
321
- "Substitutions": "amino-acid_substitutions",
322
- "nc_variant": "nt_sequence",
323
- "aa_variant": "aa_sequence"
324
- }, inplace=True)
325
-
326
-
327
- # Select the desired columns in the desired order
328
- restructured_df = df_variants_[
329
- [
330
- "barcode_plate",
331
- "Plate",
332
- "Well",
333
- "Alignment Count",
334
- "nucleotide_mutation",
335
- "amino-acid_substitutions",
336
- "Alignment Probability",
337
- "Average mutation frequency",
338
- "P value",
339
- "P adj. value",
340
- "nt_sequence",
341
- "aa_sequence",
342
- ]
343
- ]
327
+ "Variant": "nucleotide_mutation",
328
+ "Substitutions": "amino_acid_substitutions",
329
+ "nc_variant": "nt_sequence",
330
+ "aa_variant": "aa_sequence"
331
+ }, inplace=True)
332
+
333
+ # Create a copy for restructuring to avoid affecting the original
334
+ restructured_df = df_variants_.copy()
335
+ restructured_df.columns = restructured_df.columns.str.lower().str.replace('[\s-]', '_', regex=True)
336
+ # Fix the specific column name
337
+ restructured_df.columns = restructured_df.columns.str.replace('p_adj._value', 'p_adj_value')
338
+
339
+ # Select the desired columns in the desired order
340
+ restructured_df = restructured_df[[
341
+ "barcode_plate",
342
+ "plate",
343
+ "well",
344
+ "alignment_count",
345
+ "nucleotide_mutation",
346
+ "amino_acid_substitutions",
347
+ "alignment_probability",
348
+ "average_mutation_frequency",
349
+ "p_value",
350
+ "p_adj_value",
351
+ "nt_sequence",
352
+ "aa_sequence"
353
+ ]]
344
354
 
345
355
  return restructured_df, df_variants_
346
356
 
@@ -354,16 +364,21 @@ def create_nc_variant(variant, refseq):
354
364
  return refseq
355
365
  elif "DEL" in variant:
356
366
  return "Deletion"
367
+ elif variant == '+':
368
+ return "Insertion"
357
369
  else:
358
370
  mutations = variant.split("_")
359
371
  nc_variant = list(refseq)
360
372
  for mutation in mutations:
361
- if len(mutation) >= 2:
373
+ try:
362
374
  position = int(re.findall(r"\d+", mutation)[0]) - 1
363
375
  original = mutation[0]
364
376
  new = mutation[-1]
365
- if position < len(nc_variant) and nc_variant[position] == original:
366
- nc_variant[position] = new
377
+ if position < len(nc_variant) and nc_variant[position] == original:
378
+ nc_variant[position] = new
379
+ except:
380
+ print('WARNING! UNABLE TO PROCESS THIS')
381
+ print(mutation)
367
382
  return "".join(nc_variant)
368
383
 
369
384
 
@@ -377,7 +392,9 @@ def get_mutations(row):
377
392
  # Check if alignment_count is zero and return "#N.A.#" if true
378
393
  if alignment_count == 0:
379
394
  return "#N.A.#"
380
-
395
+ if alignment_count <= 10:
396
+ return "#LOW#"
397
+
381
398
  refseq = row["refseq"]
382
399
 
383
400
  if not is_valid_dna_sequence(refseq):
@@ -395,10 +412,7 @@ def get_mutations(row):
395
412
  if refseq_aa[i] != variant_aa[i]:
396
413
  mutations.append(f"{refseq_aa[i]}{i+1}{variant_aa[i]}")
397
414
  if not mutations:
398
- if alignment_count < 15:
399
- return "#N.A.#"
400
- else:
401
- return "#PARENT#"
415
+ return "#PARENT#"
402
416
  else:
403
417
  return "LEN"
404
418
  return "_".join(mutations) if mutations else ""
@@ -433,10 +447,7 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
433
447
  result_folder = create_result_folder(cl_args)
434
448
  variant_csv_path = os.path.join(result_folder, "variants.csv")
435
449
 
436
- if os.path.exists(variant_csv_path):
437
- variant_df = pd.read_csv(variant_csv_path)
438
- else:
439
- variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
450
+ variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
440
451
 
441
452
  for i, row in tqdm_fn(ref_df.iterrows(), total=len(ref_df), desc="Processing Samples"):
442
453
  barcode_plate = row["barcode_plate"]
@@ -456,9 +467,9 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
456
467
  barcode_path = filter_bc(cl_args, name_folder, i)
457
468
  output_dir = Path(result_folder) / "basecalled_reads"
458
469
  output_dir.mkdir(parents=True, exist_ok=True)
459
- file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)
460
470
 
461
471
  if not cl_args["skip_demultiplexing"]:
472
+ file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)
462
473
  try:
463
474
  demux_fastq(output_dir, name_folder, barcode_path)
464
475
  except Exception as e: