levseq 1.2.5__tar.gz → 1.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. {levseq-1.2.5/levseq.egg-info → levseq-1.2.6}/PKG-INFO +39 -4
  2. {levseq-1.2.5 → levseq-1.2.6}/README.md +39 -4
  3. {levseq-1.2.5 → levseq-1.2.6}/levseq/__init__.py +1 -1
  4. levseq-1.2.6/levseq/coordinates.py +76 -0
  5. {levseq-1.2.5 → levseq-1.2.6}/levseq/run_levseq.py +26 -20
  6. {levseq-1.2.5 → levseq-1.2.6}/levseq/seqfit.py +536 -117
  7. {levseq-1.2.5 → levseq-1.2.6}/levseq/visualization.py +4 -4
  8. {levseq-1.2.5 → levseq-1.2.6/levseq.egg-info}/PKG-INFO +39 -4
  9. {levseq-1.2.5 → levseq-1.2.6}/levseq.egg-info/SOURCES.txt +1 -0
  10. {levseq-1.2.5 → levseq-1.2.6}/LICENSE +0 -0
  11. {levseq-1.2.5 → levseq-1.2.6}/MANIFEST.in +0 -0
  12. {levseq-1.2.5 → levseq-1.2.6}/levseq/IO_processor.py +0 -0
  13. {levseq-1.2.5 → levseq-1.2.6}/levseq/barcoding/__init__.py +0 -0
  14. {levseq-1.2.5 → levseq-1.2.6}/levseq/barcoding/demultiplex +0 -0
  15. {levseq-1.2.5 → levseq-1.2.6}/levseq/barcoding/demultiplex-arm64 +0 -0
  16. {levseq-1.2.5 → levseq-1.2.6}/levseq/barcoding/demultiplex-x86 +0 -0
  17. {levseq-1.2.5 → levseq-1.2.6}/levseq/barcoding/minion_barcodes.fasta +0 -0
  18. {levseq-1.2.5 → levseq-1.2.6}/levseq/basecaller.py +0 -0
  19. {levseq-1.2.5 → levseq-1.2.6}/levseq/cmd.py +0 -0
  20. {levseq-1.2.5 → levseq-1.2.6}/levseq/globals.py +0 -0
  21. {levseq-1.2.5 → levseq-1.2.6}/levseq/interface.py +0 -0
  22. {levseq-1.2.5 → levseq-1.2.6}/levseq/parser.py +0 -0
  23. {levseq-1.2.5 → levseq-1.2.6}/levseq/screen.py +0 -0
  24. {levseq-1.2.5 → levseq-1.2.6}/levseq/simulation.py +0 -0
  25. {levseq-1.2.5 → levseq-1.2.6}/levseq/user.py +0 -0
  26. {levseq-1.2.5 → levseq-1.2.6}/levseq/utils.py +0 -0
  27. {levseq-1.2.5 → levseq-1.2.6}/levseq/variantcaller.py +0 -0
  28. {levseq-1.2.5 → levseq-1.2.6}/levseq.egg-info/dependency_links.txt +0 -0
  29. {levseq-1.2.5 → levseq-1.2.6}/levseq.egg-info/entry_points.txt +0 -0
  30. {levseq-1.2.5 → levseq-1.2.6}/levseq.egg-info/requires.txt +0 -0
  31. {levseq-1.2.5 → levseq-1.2.6}/levseq.egg-info/top_level.txt +0 -0
  32. {levseq-1.2.5 → levseq-1.2.6}/setup.cfg +0 -0
  33. {levseq-1.2.5 → levseq-1.2.6}/setup.py +0 -0
  34. {levseq-1.2.5 → levseq-1.2.6}/tests/test_demultiplex_docker.py +0 -0
  35. {levseq-1.2.5 → levseq-1.2.6}/tests/test_opligopools.py +0 -0
  36. {levseq-1.2.5 → levseq-1.2.6}/tests/test_seqfitvis.py +0 -0
  37. {levseq-1.2.5 → levseq-1.2.6}/tests/test_seqs.py +0 -0
  38. {levseq-1.2.5 → levseq-1.2.6}/tests/test_statistics.py +0 -0
  39. {levseq-1.2.5 → levseq-1.2.6}/tests/test_variant_calling.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: levseq
3
- Version: 1.2.5
3
+ Version: 1.2.6
4
4
  Home-page: https://github.com/fhalab/levseq/
5
5
  Author: Yueming Long, Emreay Gursoy, Ariane Mora, Francesca-Zhoufan Li
6
6
  Author-email: ylong@caltech.edu
@@ -80,7 +80,7 @@ and `minimap2` installed on your path. However, if you have issues we recommend
80
80
  We recommend using terminal and a conda environment for installation:
81
81
 
82
82
  ```
83
- conda create --name levseq python=3.10 -y
83
+ conda create --name levseq python=3.12 -y
84
84
  ```
85
85
 
86
86
  ```
@@ -93,7 +93,7 @@ conda activate levseq
93
93
  ```
94
94
  conda install -c bioconda -c conda-forge samtools
95
95
  ```
96
- or for mac users you can use: `brew install samtools`
96
+
97
97
 
98
98
  2. Minimap2: https://github.com/lh3/minimap2
99
99
 
@@ -110,11 +110,46 @@ operating system (https://docs.docker.com/engine/install/).
110
110
  ```
111
111
  levseq <name of the run you can make this whatever> <location to data folder> <location of reference csv file>
112
112
  ```
113
+
113
114
  #### Run via docker
115
+ If using a Linux system:
116
+ ```
117
+ docker pull yueminglong/levseq:levseq-1.2.5-x86
118
+ ```
119
+ If using a Mac with an M-series chip (image tested on M1, M3, and M4):
120
+ ```
121
+ docker pull yueminglong/levseq:levseq-1.2.5-arm64
114
122
  ```
115
- docker run --rm -v "$(pwd):/levseq_results" levseq <name> <location to data folder> <location of reference csv file>
123
+
124
+ ```
125
+ docker run --rm -v "$(pwd):/levseq_results" yueminglong/levseq:levseq-1.2.5-<architecture> <name> <location to data folder> <location of reference csv file>
116
126
  ```
127
+ Explanation:
128
+
129
+ --rm: Automatically removes the container after the command finishes.
130
+
131
+ -v "$(pwd):/levseq\_results": Mounts the current directory ($(pwd)) to /levseq\_results inside the container, ensuring the results are saved to your current directory.
132
+
133
+ yueminglong/levseq:levseq-1.2.5-\<architecture\>: Specifies the Docker image to run. Replace \<architecture\> with the appropriate platform (e.g., x86).
134
+
135
+ \<name\>: The name or identifier for the analysis.
136
+
137
+ \<location to data folder\>: Path to the folder containing input data.
138
+
139
+ \<location of reference csv file\>: Path to the reference .csv file.
140
+
141
+ Important Notes:
142
+
143
+ If the current directory is mounted to the container (via -v "$(pwd):/levseq\_results"), the basecalled result in FASTQ format and the ref.csv file must be located in the current directory.
144
+
145
+ If these files are not present in the current directory, they will not be processed by the tool.
146
+
147
+ Output:
148
+
149
+ The results of the analysis will be saved to your current working directory.
150
+
117
151
  See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb) for an example.
152
+ **Note:** if using Docker, the final HTML and CSV output will be saved in the directory you are running from, rather than in the Platemaps or Results subfolder.
118
153
 
119
154
  #### Required Arguments
120
155
  1. Name of the experiment, this will be the name of the output folder
@@ -33,7 +33,7 @@ and `minimap2` installed on your path. However, if you have issues we recommend
33
33
  We recommend using terminal and a conda environment for installation:
34
34
 
35
35
  ```
36
- conda create --name levseq python=3.10 -y
36
+ conda create --name levseq python=3.12 -y
37
37
  ```
38
38
 
39
39
  ```
@@ -46,7 +46,7 @@ conda activate levseq
46
46
  ```
47
47
  conda install -c bioconda -c conda-forge samtools
48
48
  ```
49
- or for mac users you can use: `brew install samtools`
49
+
50
50
 
51
51
  2. Minimap2: https://github.com/lh3/minimap2
52
52
 
@@ -63,11 +63,46 @@ operating system (https://docs.docker.com/engine/install/).
63
63
  ```
64
64
  levseq <name of the run you can make this whatever> <location to data folder> <location of reference csv file>
65
65
  ```
66
+
66
67
  #### Run via docker
68
+ If using a Linux system:
69
+ ```
70
+ docker pull yueminglong/levseq:levseq-1.2.5-x86
71
+ ```
72
+ If using a Mac with an M-series chip (image tested on M1, M3, and M4):
73
+ ```
74
+ docker pull yueminglong/levseq:levseq-1.2.5-arm64
67
75
  ```
68
- docker run --rm -v "$(pwd):/levseq_results" levseq <name> <location to data folder> <location of reference csv file>
76
+
77
+ ```
78
+ docker run --rm -v "$(pwd):/levseq_results" yueminglong/levseq:levseq-1.2.5-<architecture> <name> <location to data folder> <location of reference csv file>
69
79
  ```
80
+ Explanation:
81
+
82
+ --rm: Automatically removes the container after the command finishes.
83
+
84
+ -v "$(pwd):/levseq\_results": Mounts the current directory ($(pwd)) to /levseq\_results inside the container, ensuring the results are saved to your current directory.
85
+
86
+ yueminglong/levseq:levseq-1.2.5-\<architecture\>: Specifies the Docker image to run. Replace \<architecture\> with the appropriate platform (e.g., x86).
87
+
88
+ \<name\>: The name or identifier for the analysis.
89
+
90
+ \<location to data folder\>: Path to the folder containing input data.
91
+
92
+ \<location of reference csv file\>: Path to the reference .csv file.
93
+
94
+ Important Notes:
95
+
96
+ If the current directory is mounted to the container (via -v "$(pwd):/levseq\_results"), the basecalled result in FASTQ format and the ref.csv file must be located in the current directory.
97
+
98
+ If these files are not present in the current directory, they will not be processed by the tool.
99
+
100
+ Output:
101
+
102
+ The results of the analysis will be saved to your current working directory.
103
+
70
104
  See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb) for an example.
105
+ **Note:** if using Docker, the final HTML and CSV output will be saved in the directory you are running from, rather than in the Platemaps or Results subfolder.
71
106
 
72
107
  #### Required Arguments
73
108
  1. Name of the experiment, this will be the name of the output folder
@@ -89,4 +124,4 @@ For more details or trouble shooting please look at our [computational_protocols
89
124
 
90
125
  #### Citing
91
126
 
92
- If you have found LevSeq useful, please cite our [paper](https://doi.org/10.1101/2024.09.04.611255).
127
+ If you have found LevSeq useful, please cite our [paper](https://doi.org/10.1101/2024.09.04.611255).
@@ -18,7 +18,7 @@
18
18
  __title__ = 'levseq'
19
19
  __description__ = 'LevSeq nanopore sequencing'
20
20
  __url__ = 'https://github.com/fhalab/levseq/'
21
- __version__ = '1.2.5'
21
+ __version__ = '1.2.6'
22
22
  __author__ = 'Yueming Long, Emreay Gursoy, Ariane Mora, Francesca-Zhoufan Li'
23
23
  __author_email__ = 'ylong@caltech.edu'
24
24
  __license__ = 'GPL3'
@@ -0,0 +1,76 @@
1
+ import esm
2
+ import torch
3
+ import pandas as pd
4
+ from sklearn.decomposition import PCA
5
+ import os
6
+ import argparse
7
+
8
+ def preprocess_sequence(sequence):
9
+ """
10
+ Preprocesses the amino acid sequence by removing everything after the first '*' (stop codon).
11
+ """
12
+ if '*' in sequence:
13
+ sequence = sequence.split('*')[0] # Take everything before the first '*'
14
+ return sequence
15
+
16
+ def process_file(input_file, output_file=None):
17
+ # Load the dataset
18
+ data = pd.read_csv(input_file)
19
+
20
+ # Remove the "Unnamed: 0" column if it exists
21
+ if 'Unnamed: 0' in data.columns:
22
+ data = data.drop(columns=['Unnamed: 0'])
23
+
24
+ # Create the ID column as the combination of `Plate` and `Well`
25
+ data['ID'] = data['Plate'] + '-' + data['Well']
26
+ data = data[['ID'] + [col for col in data.columns if col != 'ID']] # Reorder to make ID the first column
27
+
28
+ # Filter valid sequences from the `aa_sequence` column
29
+ valid_sequences = data['aa_sequence'].dropna()
30
+ valid_sequences = valid_sequences[~valid_sequences.str.contains('#N.A.#|Deletion')]
31
+
32
+ # Preprocess sequences to handle stop codons
33
+ valid_sequences = valid_sequences.apply(preprocess_sequence)
34
+
35
+ # Load the ESM-2 model
36
+ model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
37
+ batch_converter = alphabet.get_batch_converter()
38
+
39
+ # Prepare sequences for embedding
40
+ sequences = valid_sequences.tolist()
41
+ sequence_names = [f"Sequence {i}" for i in range(len(sequences))]
42
+ batch_labels, batch_strs, batch_tokens = batch_converter(list(zip(sequence_names, sequences)))
43
+
44
+ # Extract embeddings
45
+ with torch.no_grad():
46
+ results = model(batch_tokens, repr_layers=[33])
47
+ embeddings = results["representations"][33] # Use the top (last) layer representations
48
+
49
+ # Average embeddings across residues for sequence-level representation
50
+ sequence_embeddings = embeddings.mean(1).numpy()
51
+
52
+ # Dimensionality Reduction using PCA
53
+ pca = PCA(n_components=2)
54
+ xy_coordinates = pca.fit_transform(sequence_embeddings)
55
+
56
+ # Add x, y coordinates back to the dataframe
57
+ xy_df = pd.DataFrame(xy_coordinates, columns=['x_coordinate', 'y_coordinate'], index=valid_sequences.index)
58
+ data = pd.concat([data, xy_df], axis=1)
59
+
60
+ # Determine output file location
61
+ if output_file is None:
62
+ input_name, input_ext = os.path.splitext(input_file)
63
+ output_file = f"{input_name}_xy{input_ext}"
64
+
65
+ # Save the updated dataframe to a file
66
+ data.to_csv(output_file, index=False)
67
+ print(f"Processed data with x, y coordinates saved to: {output_file}")
68
+
69
+ if __name__ == "__main__":
70
+ parser = argparse.ArgumentParser(description="Generate x, y coordinates for amino acid sequences")
71
+ parser.add_argument('input_file', type=str, help="Path to the input CSV file")
72
+ parser.add_argument('--output_file', type=str, default=None, help="Path to save the output CSV file (optional)")
73
+ args = parser.parse_args()
74
+
75
+ process_file(args.input_file, args.output_file)
76
+
@@ -321,30 +321,36 @@ def create_df_v(variants_df):
321
321
  df_variants_["Plate"] = df_variants_["Plate"].apply(
322
322
  lambda x: f"0{x}" if len(x) == 1 else x
323
323
  )
324
- # Rename columns as per the request
324
+
325
+ # First rename columns as before
325
326
  df_variants_.rename(columns={
326
327
  "Variant": "nucleotide_mutation",
327
- "Substitutions": "amino-acid_substitutions",
328
+ "Substitutions": "amino_acid_substitutions",
328
329
  "nc_variant": "nt_sequence",
329
330
  "aa_variant": "aa_sequence"
330
- },inplace=True)
331
-
332
- # Select the desired columns in the desired order
333
- restructured_df = df_variants_[[
334
- "barcode_plate",
335
- "Plate",
336
- "Well",
337
- "Alignment Count",
338
- "nucleotide_mutation",
339
- "amino-acid_substitutions",
340
- "Alignment Probability",
341
- "Average mutation frequency",
342
- "P value",
343
- "P adj. value",
344
- "nt_sequence",
345
- "aa_sequence",
346
- ]
347
- ]
331
+ }, inplace=True)
332
+
333
+ # Create a copy for restructuring to avoid affecting the original
334
+ restructured_df = df_variants_.copy()
335
+ restructured_df.columns = restructured_df.columns.str.lower().str.replace('[\s-]', '_', regex=True)
336
+ # Fix the specific column name
337
+ restructured_df.columns = restructured_df.columns.str.replace('p_adj._value', 'p_adj_value')
338
+
339
+ # Select the desired columns in the desired order
340
+ restructured_df = restructured_df[[
341
+ "barcode_plate",
342
+ "plate",
343
+ "well",
344
+ "alignment_count",
345
+ "nucleotide_mutation",
346
+ "amino_acid_substitutions",
347
+ "alignment_probability",
348
+ "average_mutation_frequency",
349
+ "p_value",
350
+ "p_adj_value",
351
+ "nt_sequence",
352
+ "aa_sequence"
353
+ ]]
348
354
 
349
355
  return restructured_df, df_variants_
350
356