levseq 1.2.5__tar.gz → 1.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {levseq-1.2.5/levseq.egg-info → levseq-1.2.6}/PKG-INFO +39 -4
- {levseq-1.2.5 → levseq-1.2.6}/README.md +39 -4
- {levseq-1.2.5 → levseq-1.2.6}/levseq/__init__.py +1 -1
- levseq-1.2.6/levseq/coordinates.py +76 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/run_levseq.py +26 -20
- {levseq-1.2.5 → levseq-1.2.6}/levseq/seqfit.py +536 -117
- {levseq-1.2.5 → levseq-1.2.6}/levseq/visualization.py +4 -4
- {levseq-1.2.5 → levseq-1.2.6/levseq.egg-info}/PKG-INFO +39 -4
- {levseq-1.2.5 → levseq-1.2.6}/levseq.egg-info/SOURCES.txt +1 -0
- {levseq-1.2.5 → levseq-1.2.6}/LICENSE +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/MANIFEST.in +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/IO_processor.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/barcoding/__init__.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/barcoding/demultiplex +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/barcoding/demultiplex-arm64 +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/barcoding/demultiplex-x86 +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/barcoding/minion_barcodes.fasta +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/basecaller.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/cmd.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/globals.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/interface.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/parser.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/screen.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/simulation.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/user.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/utils.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq/variantcaller.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq.egg-info/dependency_links.txt +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq.egg-info/entry_points.txt +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq.egg-info/requires.txt +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/levseq.egg-info/top_level.txt +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/setup.cfg +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/setup.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/tests/test_demultiplex_docker.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/tests/test_opligopools.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/tests/test_seqfitvis.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/tests/test_seqs.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/tests/test_statistics.py +0 -0
- {levseq-1.2.5 → levseq-1.2.6}/tests/test_variant_calling.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: levseq
|
|
3
|
-
Version: 1.2.5
|
|
3
|
+
Version: 1.2.6
|
|
4
4
|
Home-page: https://github.com/fhalab/levseq/
|
|
5
5
|
Author: Yueming Long, Emreay Gursoy, Ariane Mora, Francesca-Zhoufan Li
|
|
6
6
|
Author-email: ylong@caltech.edu
|
|
@@ -80,7 +80,7 @@ and `minimap2` installed on your path. However, if you have issues we recommend
|
|
|
80
80
|
We recommend using terminal and a conda environment for installation:
|
|
81
81
|
|
|
82
82
|
```
|
|
83
|
-
conda create --name levseq python=3.
|
|
83
|
+
conda create --name levseq python=3.12 -y
|
|
84
84
|
```
|
|
85
85
|
|
|
86
86
|
```
|
|
@@ -93,7 +93,7 @@ conda activate levseq
|
|
|
93
93
|
```
|
|
94
94
|
conda install -c bioconda -c conda-forge samtools
|
|
95
95
|
```
|
|
96
|
-
|
|
96
|
+
|
|
97
97
|
|
|
98
98
|
2. Minimap2: https://github.com/lh3/minimap2
|
|
99
99
|
|
|
@@ -110,11 +110,46 @@ operating system (https://docs.docker.com/engine/install/).
|
|
|
110
110
|
```
|
|
111
111
|
levseq <name of the run you can make this whatever> <location to data folder> <location of reference csv file>
|
|
112
112
|
```
|
|
113
|
+
|
|
113
114
|
#### Run via docker
|
|
115
|
+
If using a Linux system:
|
|
116
|
+
```
|
|
117
|
+
docker pull yueminglong/levseq:levseq-1.2.5-x86
|
|
118
|
+
```
|
|
119
|
+
If using Mac M chips (image tested on M1, M3, and M4)
|
|
120
|
+
```
|
|
121
|
+
docker pull yueminglong/levseq:levseq-1.2.5-arm64
|
|
114
122
|
```
|
|
115
|
-
|
|
123
|
+
|
|
124
|
+
```
|
|
125
|
+
docker run --rm -v "$(pwd):/levseq_results" yueminglong/levseq:levseq-1.2.5-<architecture> <name> <location to data folder> <location of reference csv file>
|
|
116
126
|
```
|
|
127
|
+
Explanation:
|
|
128
|
+
|
|
129
|
+
--rm: Automatically removes the container after the command finishes.
|
|
130
|
+
|
|
131
|
+
-v "$(pwd):/levseq\_results": Mounts the current directory ($(pwd)) to /levseq\_results inside the container, ensuring the results are saved to your current directory.
|
|
132
|
+
|
|
133
|
+
yueminglong/levseq:levseq-1.2.5-\<architecture\>: Specifies the Docker image to run. Replace \<architecture\> with the appropriate platform (e.g., x86).
|
|
134
|
+
|
|
135
|
+
\<name\>: The name or identifier for the analysis.
|
|
136
|
+
|
|
137
|
+
\<location to data folder\>: Path to the folder containing input data.
|
|
138
|
+
|
|
139
|
+
\<location of reference csv file\>: Path to the reference .csv file.
|
|
140
|
+
|
|
141
|
+
Important Notes:
|
|
142
|
+
|
|
143
|
+
If the current directory is mounted to the container (via -v "$(pwd):/levseq\_results"), the basecalled result in FASTQ format and the ref.csv file must be located in the current directory.
|
|
144
|
+
|
|
145
|
+
If these files are not present in the current directory, they will not be processed by the tool.
|
|
146
|
+
|
|
147
|
+
Output:
|
|
148
|
+
|
|
149
|
+
The results of the analysis will be saved to your current working directory.
|
|
150
|
+
|
|
117
151
|
See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb) for an example.
|
|
152
|
+
*Note: if using Docker, the final HTML and CSV outputs will be saved in the directory that you are running from instead of in the Platemaps or Results subfolder.*
|
|
118
153
|
|
|
119
154
|
#### Required Arguments
|
|
120
155
|
1. Name of the experiment, this will be the name of the output folder
|
|
@@ -33,7 +33,7 @@ and `minimap2` installed on your path. However, if you have issues we recommend
|
|
|
33
33
|
We recommend using terminal and a conda environment for installation:
|
|
34
34
|
|
|
35
35
|
```
|
|
36
|
-
conda create --name levseq python=3.
|
|
36
|
+
conda create --name levseq python=3.12 -y
|
|
37
37
|
```
|
|
38
38
|
|
|
39
39
|
```
|
|
@@ -46,7 +46,7 @@ conda activate levseq
|
|
|
46
46
|
```
|
|
47
47
|
conda install -c bioconda -c conda-forge samtools
|
|
48
48
|
```
|
|
49
|
-
|
|
49
|
+
|
|
50
50
|
|
|
51
51
|
2. Minimap2: https://github.com/lh3/minimap2
|
|
52
52
|
|
|
@@ -63,11 +63,46 @@ operating system (https://docs.docker.com/engine/install/).
|
|
|
63
63
|
```
|
|
64
64
|
levseq <name of the run you can make this whatever> <location to data folder> <location of reference csv file>
|
|
65
65
|
```
|
|
66
|
+
|
|
66
67
|
#### Run via docker
|
|
68
|
+
If using a Linux system:
|
|
69
|
+
```
|
|
70
|
+
docker pull yueminglong/levseq:levseq-1.2.5-x86
|
|
71
|
+
```
|
|
72
|
+
If using Mac M chips (image tested on M1, M3, and M4)
|
|
73
|
+
```
|
|
74
|
+
docker pull yueminglong/levseq:levseq-1.2.5-arm64
|
|
67
75
|
```
|
|
68
|
-
|
|
76
|
+
|
|
77
|
+
```
|
|
78
|
+
docker run --rm -v "$(pwd):/levseq_results" yueminglong/levseq:levseq-1.2.5-<architecture> <name> <location to data folder> <location of reference csv file>
|
|
69
79
|
```
|
|
80
|
+
Explanation:
|
|
81
|
+
|
|
82
|
+
--rm: Automatically removes the container after the command finishes.
|
|
83
|
+
|
|
84
|
+
-v "$(pwd):/levseq\_results": Mounts the current directory ($(pwd)) to /levseq\_results inside the container, ensuring the results are saved to your current directory.
|
|
85
|
+
|
|
86
|
+
yueminglong/levseq:levseq-1.2.5-\<architecture\>: Specifies the Docker image to run. Replace \<architecture\> with the appropriate platform (e.g., x86).
|
|
87
|
+
|
|
88
|
+
\<name\>: The name or identifier for the analysis.
|
|
89
|
+
|
|
90
|
+
\<location to data folder\>: Path to the folder containing input data.
|
|
91
|
+
|
|
92
|
+
\<location of reference csv file\>: Path to the reference .csv file.
|
|
93
|
+
|
|
94
|
+
Important Notes:
|
|
95
|
+
|
|
96
|
+
If the current directory is mounted to the container (via -v "$(pwd):/levseq\_results"), the basecalled result in FASTQ format and the ref.csv file must be located in the current directory.
|
|
97
|
+
|
|
98
|
+
If these files are not present in the current directory, they will not be processed by the tool.
|
|
99
|
+
|
|
100
|
+
Output:
|
|
101
|
+
|
|
102
|
+
The results of the analysis will be saved to your current working directory.
|
|
103
|
+
|
|
70
104
|
See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb) for an example.
|
|
105
|
+
*Note: if using Docker, the final HTML and CSV outputs will be saved in the directory that you are running from instead of in the Platemaps or Results subfolder.*
|
|
71
106
|
|
|
72
107
|
#### Required Arguments
|
|
73
108
|
1. Name of the experiment, this will be the name of the output folder
|
|
@@ -89,4 +124,4 @@ For more details or trouble shooting please look at our [computational_protocols
|
|
|
89
124
|
|
|
90
125
|
#### Citing
|
|
91
126
|
|
|
92
|
-
If you have found LevSeq useful, please cite out [paper](https://doi.org/10.1101/2024.09.04.611255).
|
|
127
|
+
If you have found LevSeq useful, please cite our [paper](https://doi.org/10.1101/2024.09.04.611255).
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
__title__ = 'levseq'
|
|
19
19
|
__description__ = 'LevSeq nanopore sequencing'
|
|
20
20
|
__url__ = 'https://github.com/fhalab/levseq/'
|
|
21
|
-
__version__ = '1.2.
|
|
21
|
+
__version__ = '1.2.6'
|
|
22
22
|
__author__ = 'Yueming Long, Emreay Gursoy, Ariane Mora, Francesca-Zhoufan Li'
|
|
23
23
|
__author_email__ = 'ylong@caltech.edu'
|
|
24
24
|
__license__ = 'GPL3'
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import esm
|
|
2
|
+
import torch
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from sklearn.decomposition import PCA
|
|
5
|
+
import os
|
|
6
|
+
import argparse
|
|
7
|
+
|
|
8
|
+
def preprocess_sequence(sequence):
    """
    Truncate an amino acid sequence at its first stop codon.

    Everything from the first '*' onward is discarded. A sequence that
    contains no '*' is handed back unchanged.
    """
    # Guard clause: nothing to trim when there is no stop codon marker.
    if '*' not in sequence:
        return sequence

    before_stop, *_ = sequence.split('*')
    return before_stop
|
|
15
|
+
|
|
16
|
+
def _embed_sequences(sequences, model, alphabet, batch_size):
    """Return a (n_sequences, embedding_dim) array of mean-pooled layer-33 residue embeddings."""
    batch_converter = alphabet.get_batch_converter()
    pooled = []
    with torch.no_grad():
        # Embed in small chunks so memory use does not grow with the size of the dataset.
        for start in range(0, len(sequences), batch_size):
            chunk = sequences[start:start + batch_size]
            names = [f"Sequence {start + offset}" for offset in range(len(chunk))]
            _, _, batch_tokens = batch_converter(list(zip(names, chunk)))
            results = model(batch_tokens, repr_layers=[33])
            embeddings = results["representations"][33]  # Use the top (last) layer representations

            # Number of non-padding tokens per row (start token + residues + end token).
            token_counts = (batch_tokens != alphabet.padding_idx).sum(1).tolist()
            for row, token_count in enumerate(token_counts):
                # Average over residue positions only: skip the start token at index 0,
                # the end token, and any padding. Otherwise a sequence's embedding would
                # depend on how long the other sequences in its batch are.
                pooled.append(embeddings[row, 1:token_count - 1].mean(0))
    return torch.stack(pooled).numpy()


def process_file(input_file, output_file=None, batch_size=8):
    """
    Add 2D PCA coordinates of ESM-2 sequence embeddings to a variant CSV.

    The input CSV must contain the columns `Plate`, `Well` and `aa_sequence`.
    An `ID` column (`<Plate>-<Well>`) is inserted as the first column, and
    `x_coordinate` / `y_coordinate` columns are appended. Rows whose
    `aa_sequence` is missing, contains '#N.A.#' or 'Deletion', or is empty
    once truncated at the first stop codon get NaN coordinates.

    Args:
        input_file: Path to the input CSV file.
        output_file: Path for the output CSV. Defaults to the input path with
            `_xy` inserted before the extension.
        batch_size: Number of sequences embedded per forward pass.

    Raises:
        ValueError: If `batch_size` is smaller than 1, or if fewer than two
            usable sequences remain (a 2-component PCA cannot be fitted).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Load the dataset
    data = pd.read_csv(input_file)

    # Remove the "Unnamed: 0" column if it exists
    if 'Unnamed: 0' in data.columns:
        data = data.drop(columns=['Unnamed: 0'])

    # Create the ID column as the combination of `Plate` and `Well`.
    # Cast to str first: read_csv parses purely numeric plate names as integers,
    # and integer + '-' raises.
    data['ID'] = data['Plate'].astype(str) + '-' + data['Well'].astype(str)
    data = data[['ID'] + [col for col in data.columns if col != 'ID']]  # Reorder to make ID the first column

    # Filter valid sequences from the `aa_sequence` column.
    # astype(str) keeps the .str accessor usable even when the column holds no strings.
    valid_sequences = data['aa_sequence'].dropna().astype(str)
    # Dots are escaped so only the literal '#N.A.#' placeholder is excluded.
    valid_sequences = valid_sequences[~valid_sequences.str.contains(r'#N\.A\.#|Deletion')]

    # Preprocess sequences to handle stop codons
    valid_sequences = valid_sequences.apply(preprocess_sequence)
    # A sequence that starts with a stop codon has no residues left to embed.
    valid_sequences = valid_sequences[valid_sequences.str.len() > 0]

    # Fail fast, before the large model is loaded, when PCA cannot be fitted.
    if len(valid_sequences) < 2:
        raise ValueError(
            f"Need at least 2 valid amino acid sequences to compute 2D coordinates, "
            f"found {len(valid_sequences)} in {input_file}"
        )

    # Load the ESM-2 model
    model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
    model.eval()  # inference mode, so results are deterministic

    # Extract one residue-averaged embedding per sequence
    sequence_embeddings = _embed_sequences(valid_sequences.tolist(), model, alphabet, batch_size)

    # Dimensionality Reduction using PCA
    pca = PCA(n_components=2)
    xy_coordinates = pca.fit_transform(sequence_embeddings)

    # Add x, y coordinates back to the dataframe; index alignment leaves NaN for skipped rows
    xy_df = pd.DataFrame(xy_coordinates, columns=['x_coordinate', 'y_coordinate'], index=valid_sequences.index)
    data = pd.concat([data, xy_df], axis=1)

    # Determine output file location
    if output_file is None:
        input_name, input_ext = os.path.splitext(input_file)
        output_file = f"{input_name}_xy{input_ext}"

    # Save the updated dataframe to a file
    data.to_csv(output_file, index=False)
    print(f"Processed data with x, y coordinates saved to: {output_file}")
|
|
68
|
+
|
|
69
|
+
if __name__ == "__main__":
    # Command-line entry point: one required input CSV path plus an optional output path.
    cli = argparse.ArgumentParser(description="Generate x, y coordinates for amino acid sequences")
    cli.add_argument('input_file', type=str, help="Path to the input CSV file")
    cli.add_argument('--output_file', type=str, default=None, help="Path to save the output CSV file (optional)")
    options = cli.parse_args()

    process_file(options.input_file, options.output_file)
|
|
76
|
+
|
|
@@ -321,30 +321,36 @@ def create_df_v(variants_df):
|
|
|
321
321
|
df_variants_["Plate"] = df_variants_["Plate"].apply(
|
|
322
322
|
lambda x: f"0{x}" if len(x) == 1 else x
|
|
323
323
|
)
|
|
324
|
-
|
|
324
|
+
|
|
325
|
+
# First rename columns as before
|
|
325
326
|
df_variants_.rename(columns={
|
|
326
327
|
"Variant": "nucleotide_mutation",
|
|
327
|
-
"Substitutions": "
|
|
328
|
+
"Substitutions": "amino_acid_substitutions",
|
|
328
329
|
"nc_variant": "nt_sequence",
|
|
329
330
|
"aa_variant": "aa_sequence"
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
restructured_df = df_variants_
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
331
|
+
}, inplace=True)
|
|
332
|
+
|
|
333
|
+
# Create a copy for restructuring to avoid affecting the original
|
|
334
|
+
restructured_df = df_variants_.copy()
|
|
335
|
+
restructured_df.columns = restructured_df.columns.str.lower().str.replace('[\s-]', '_', regex=True)
|
|
336
|
+
# Fix the specific column name
|
|
337
|
+
restructured_df.columns = restructured_df.columns.str.replace('p_adj._value', 'p_adj_value')
|
|
338
|
+
|
|
339
|
+
# Select the desired columns in the desired order
|
|
340
|
+
restructured_df = restructured_df[[
|
|
341
|
+
"barcode_plate",
|
|
342
|
+
"plate",
|
|
343
|
+
"well",
|
|
344
|
+
"alignment_count",
|
|
345
|
+
"nucleotide_mutation",
|
|
346
|
+
"amino_acid_substitutions",
|
|
347
|
+
"alignment_probability",
|
|
348
|
+
"average_mutation_frequency",
|
|
349
|
+
"p_value",
|
|
350
|
+
"p_adj_value",
|
|
351
|
+
"nt_sequence",
|
|
352
|
+
"aa_sequence"
|
|
353
|
+
]]
|
|
348
354
|
|
|
349
355
|
return restructured_df, df_variants_
|
|
350
356
|
|