levseq 1.2.5__tar.gz → 1.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. {levseq-1.2.5/levseq.egg-info → levseq-1.2.7}/PKG-INFO +44 -5
  2. {levseq-1.2.5 → levseq-1.2.7}/README.md +44 -5
  3. {levseq-1.2.5 → levseq-1.2.7}/levseq/__init__.py +1 -1
  4. levseq-1.2.7/levseq/coordinates.py +76 -0
  5. {levseq-1.2.5 → levseq-1.2.7}/levseq/run_levseq.py +44 -30
  6. {levseq-1.2.5 → levseq-1.2.7}/levseq/seqfit.py +536 -117
  7. {levseq-1.2.5 → levseq-1.2.7}/levseq/variantcaller.py +5 -4
  8. {levseq-1.2.5 → levseq-1.2.7}/levseq/visualization.py +4 -4
  9. {levseq-1.2.5 → levseq-1.2.7/levseq.egg-info}/PKG-INFO +44 -5
  10. {levseq-1.2.5 → levseq-1.2.7}/levseq.egg-info/SOURCES.txt +1 -0
  11. {levseq-1.2.5 → levseq-1.2.7}/LICENSE +0 -0
  12. {levseq-1.2.5 → levseq-1.2.7}/MANIFEST.in +0 -0
  13. {levseq-1.2.5 → levseq-1.2.7}/levseq/IO_processor.py +0 -0
  14. {levseq-1.2.5 → levseq-1.2.7}/levseq/barcoding/__init__.py +0 -0
  15. {levseq-1.2.5 → levseq-1.2.7}/levseq/barcoding/demultiplex +0 -0
  16. {levseq-1.2.5 → levseq-1.2.7}/levseq/barcoding/demultiplex-arm64 +0 -0
  17. {levseq-1.2.5 → levseq-1.2.7}/levseq/barcoding/demultiplex-x86 +0 -0
  18. {levseq-1.2.5 → levseq-1.2.7}/levseq/barcoding/minion_barcodes.fasta +0 -0
  19. {levseq-1.2.5 → levseq-1.2.7}/levseq/basecaller.py +0 -0
  20. {levseq-1.2.5 → levseq-1.2.7}/levseq/cmd.py +0 -0
  21. {levseq-1.2.5 → levseq-1.2.7}/levseq/globals.py +0 -0
  22. {levseq-1.2.5 → levseq-1.2.7}/levseq/interface.py +0 -0
  23. {levseq-1.2.5 → levseq-1.2.7}/levseq/parser.py +0 -0
  24. {levseq-1.2.5 → levseq-1.2.7}/levseq/screen.py +0 -0
  25. {levseq-1.2.5 → levseq-1.2.7}/levseq/simulation.py +0 -0
  26. {levseq-1.2.5 → levseq-1.2.7}/levseq/user.py +0 -0
  27. {levseq-1.2.5 → levseq-1.2.7}/levseq/utils.py +0 -0
  28. {levseq-1.2.5 → levseq-1.2.7}/levseq.egg-info/dependency_links.txt +0 -0
  29. {levseq-1.2.5 → levseq-1.2.7}/levseq.egg-info/entry_points.txt +0 -0
  30. {levseq-1.2.5 → levseq-1.2.7}/levseq.egg-info/requires.txt +0 -0
  31. {levseq-1.2.5 → levseq-1.2.7}/levseq.egg-info/top_level.txt +0 -0
  32. {levseq-1.2.5 → levseq-1.2.7}/setup.cfg +0 -0
  33. {levseq-1.2.5 → levseq-1.2.7}/setup.py +0 -0
  34. {levseq-1.2.5 → levseq-1.2.7}/tests/test_demultiplex_docker.py +0 -0
  35. {levseq-1.2.5 → levseq-1.2.7}/tests/test_opligopools.py +0 -0
  36. {levseq-1.2.5 → levseq-1.2.7}/tests/test_seqfitvis.py +0 -0
  37. {levseq-1.2.5 → levseq-1.2.7}/tests/test_seqs.py +0 -0
  38. {levseq-1.2.5 → levseq-1.2.7}/tests/test_statistics.py +0 -0
  39. {levseq-1.2.5 → levseq-1.2.7}/tests/test_variant_calling.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: levseq
3
- Version: 1.2.5
3
+ Version: 1.2.7
4
4
  Home-page: https://github.com/fhalab/levseq/
5
5
  Author: Yueming Long, Emreay Gursoy, Ariane Mora, Francesca-Zhoufan Li
6
6
  Author-email: ylong@caltech.edu
@@ -54,7 +54,7 @@ Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore tech
54
54
 
55
55
 
56
56
  - Data to reproduce the results and to test are available on zenodo [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13694463.svg)](https://doi.org/10.5281/zenodo.13694463)
57
- - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://github.com/ArianeMora/LevSeq_vis/) and code to host locally at: https://github.com/fhalab/LevSeq_VDB/
57
+ - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://levseqdb.streamlit.app/) and code to host locally [here](https://github.com/fhalab/LevSeq_db)
58
58
 
59
59
  ## Setup
60
60
 
@@ -80,7 +80,7 @@ and `minimap2` installed on your path. However, if you have issues we recommend
80
80
  We recommend using terminal and a conda environment for installation:
81
81
 
82
82
  ```
83
- conda create --name levseq python=3.10 -y
83
+ conda create --name levseq python=3.12 -y
84
84
  ```
85
85
 
86
86
  ```
@@ -93,7 +93,7 @@ conda activate levseq
93
93
  ```
94
94
  conda install -c bioconda -c conda-forge samtools
95
95
  ```
96
- or for mac users you can use: `brew install samtools`
96
+
97
97
 
98
98
  2. Minimap2: https://github.com/lh3/minimap2
99
99
 
@@ -110,11 +110,46 @@ operating system (https://docs.docker.com/engine/install/).
110
110
  ```
111
111
  levseq <name of the run you can make this whatever> <location to data folder> <location of reference csv file>
112
112
  ```
113
+
113
114
  #### Run via docker
115
+ If using linux system
116
+ ```
117
+ docker pull yueminglong/levseq:levseq-1.2.5-x86
114
118
  ```
115
- docker run --rm -v "$(pwd):/levseq_results" levseq <name> <location to data folder> <location of reference csv file>
119
+ If using Mac M chips (image tested on M1, M3, and M4)
120
+ ```
121
+ docker pull yueminglong/levseq:levseq-1.2.5-arm64
122
+ ```
123
+
116
124
  ```
125
+ docker run --rm -v "$(pwd):/levseq_results" yueminglong/levseq:levseq-1.2.5-<architecture> <name> <location to data folder> <location of reference csv file>
126
+ ```
127
+ Explanation:
128
+
129
+ --rm: Automatically removes the container after the command finishes.
130
+
131
+ -v "$(pwd):/levseq\_results": Mounts the current directory ($(pwd)) to /levseq\_results inside the container, ensuring the results are saved to your current directory.
132
+
133
+ yueminglong/levseq:levseq-1.2.5-\<architecture\>: Specifies the Docker image to run. Replace \<architecture\> with the appropriate platform (e.g., x86).
134
+
135
+ \<name\>: The name or identifier for the analysis.
136
+
137
+ \<location to data folder\>: Path to the folder containing input data.
138
+
139
+ \<location of reference csv file\>: Path to the reference .csv file.
140
+
141
+ Important Notes:
142
+
143
+ If the current directory is mounted to the container (via -v "$(pwd):/levseq\_results"), the basecalled result in FASTQ format and the ref.csv file must be located in the current directory.
144
+
145
+ If these files are not present in the current directory, they will not be processed by the tool.
146
+
147
+ Output:
148
+
149
+ The results of the analysis will be saved to your current working directory.
150
+
117
151
  See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb) for an example.
152
+ **Note:** if using Docker, the HTML and CSV final outputs will be saved in the directory that you are running from instead of in the Platemaps or Results subfolders.
118
153
 
119
154
  #### Required Arguments
120
155
  1. Name of the experiment, this will be the name of the output folder
@@ -137,3 +172,7 @@ For more details or trouble shooting please look at our [computational_protocols
137
172
  #### Citing
138
173
 
139
174
  If you have found LevSeq useful, please cite our [paper](https://doi.org/10.1101/2024.09.04.611255).
175
+
176
+ #### Contact
177
+
178
+ Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
@@ -7,7 +7,7 @@ Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore tech
7
7
 
8
8
 
9
9
  - Data to reproduce the results and to test are available on zenodo [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13694463.svg)](https://doi.org/10.5281/zenodo.13694463)
10
- - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://github.com/ArianeMora/LevSeq_vis/) and code to host locally at: https://github.com/fhalab/LevSeq_VDB/
10
+ - A dockerized website and database for labs to locally host and visualize their data: website is available [here](https://levseqdb.streamlit.app/) and code to host locally [here](https://github.com/fhalab/LevSeq_db)
11
11
 
12
12
  ## Setup
13
13
 
@@ -33,7 +33,7 @@ and `minimap2` installed on your path. However, if you have issues we recommend
33
33
  We recommend using terminal and a conda environment for installation:
34
34
 
35
35
  ```
36
- conda create --name levseq python=3.10 -y
36
+ conda create --name levseq python=3.12 -y
37
37
  ```
38
38
 
39
39
  ```
@@ -46,7 +46,7 @@ conda activate levseq
46
46
  ```
47
47
  conda install -c bioconda -c conda-forge samtools
48
48
  ```
49
- or for mac users you can use: `brew install samtools`
49
+
50
50
 
51
51
  2. Minimap2: https://github.com/lh3/minimap2
52
52
 
@@ -63,11 +63,46 @@ operating system (https://docs.docker.com/engine/install/).
63
63
  ```
64
64
  levseq <name of the run you can make this whatever> <location to data folder> <location of reference csv file>
65
65
  ```
66
+
66
67
  #### Run via docker
68
+ If using linux system
69
+ ```
70
+ docker pull yueminglong/levseq:levseq-1.2.5-x86
67
71
  ```
68
- docker run --rm -v "$(pwd):/levseq_results" levseq <name> <location to data folder> <location of reference csv file>
72
+ If using Mac M chips (image tested on M1, M3, and M4)
73
+ ```
74
+ docker pull yueminglong/levseq:levseq-1.2.5-arm64
75
+ ```
76
+
69
77
  ```
78
+ docker run --rm -v "$(pwd):/levseq_results" yueminglong/levseq:levseq-1.2.5-<architecture> <name> <location to data folder> <location of reference csv file>
79
+ ```
80
+ Explanation:
81
+
82
+ --rm: Automatically removes the container after the command finishes.
83
+
84
+ -v "$(pwd):/levseq\_results": Mounts the current directory ($(pwd)) to /levseq\_results inside the container, ensuring the results are saved to your current directory.
85
+
86
+ yueminglong/levseq:levseq-1.2.5-\<architecture\>: Specifies the Docker image to run. Replace \<architecture\> with the appropriate platform (e.g., x86).
87
+
88
+ \<name\>: The name or identifier for the analysis.
89
+
90
+ \<location to data folder\>: Path to the folder containing input data.
91
+
92
+ \<location of reference csv file\>: Path to the reference .csv file.
93
+
94
+ Important Notes:
95
+
96
+ If the current directory is mounted to the container (via -v "$(pwd):/levseq\_results"), the basecalled result in FASTQ format and the ref.csv file must be located in the current directory.
97
+
98
+ If these files are not present in the current directory, they will not be processed by the tool.
99
+
100
+ Output:
101
+
102
+ The results of the analysis will be saved to your current working directory.
103
+
70
104
  See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb) for an example.
105
+ **Note:** if using Docker, the HTML and CSV final outputs will be saved in the directory that you are running from instead of in the Platemaps or Results subfolders.
71
106
 
72
107
  #### Required Arguments
73
108
  1. Name of the experiment, this will be the name of the output folder
@@ -89,4 +124,8 @@ For more details or trouble shooting please look at our [computational_protocols
89
124
 
90
125
  #### Citing
91
126
 
92
- If you have found LevSeq useful, please cite out [paper](https://doi.org/10.1101/2024.09.04.611255).
127
+ If you have found LevSeq useful, please cite our [paper](https://doi.org/10.1101/2024.09.04.611255).
128
+
129
+ #### Contact
130
+
131
+ Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
@@ -18,7 +18,7 @@
18
18
  __title__ = 'levseq'
19
19
  __description__ = 'LevSeq nanopore sequencing'
20
20
  __url__ = 'https://github.com/fhalab/levseq/'
21
- __version__ = '1.2.5'
21
+ __version__ = '1.2.7'
22
22
  __author__ = 'Yueming Long, Emreay Gursoy, Ariane Mora, Francesca-Zhoufan Li'
23
23
  __author_email__ = 'ylong@caltech.edu'
24
24
  __license__ = 'GPL3'
@@ -0,0 +1,76 @@
1
+ import esm
2
+ import torch
3
+ import pandas as pd
4
+ from sklearn.decomposition import PCA
5
+ import os
6
+ import argparse
7
+
8
def preprocess_sequence(sequence: str) -> str:
    """Return the part of an amino-acid sequence before its first stop codon.

    Args:
        sequence: Amino-acid sequence in which ``*`` marks a stop codon.

    Returns:
        Everything before the first ``*``. The sequence is returned unchanged
        when it contains no ``*``; a sequence that starts with ``*`` yields
        an empty string.
    """
    # partition() stops at the first '*', so there is no need for a separate
    # membership check or for splitting the remainder of the string.
    before_stop, _, _ = sequence.partition('*')
    return before_stop
15
+
16
def process_file(input_file, output_file=None, batch_size=8):
    """Add 2-D PCA coordinates of ESM-2 embeddings to a variant table.

    Reads ``input_file``, builds an ``ID`` column from ``Plate`` and ``Well``,
    embeds every usable ``aa_sequence`` with the ESM-2 650M model, projects the
    per-sequence embeddings onto two principal components and writes the table
    with added ``x_coordinate`` / ``y_coordinate`` columns.

    Args:
        input_file: Path to the input CSV. It must contain ``Plate``, ``Well``
            and ``aa_sequence`` columns.
        output_file: Destination CSV path. When ``None`` the output is written
            next to the input as ``<input name>_xy<input extension>``.
        batch_size: Number of sequences sent through the model per forward
            pass; bounds peak memory use.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Load the dataset
    data = pd.read_csv(input_file)

    # Remove the "Unnamed: 0" column if it exists
    if 'Unnamed: 0' in data.columns:
        data = data.drop(columns=['Unnamed: 0'])

    # Create the ID column as the combination of `Plate` and `Well`.
    # Cast to str first: read_csv parses purely numeric plate names as
    # integers, and int + str concatenation raises a TypeError.
    data['ID'] = data['Plate'].astype(str) + '-' + data['Well'].astype(str)
    data = data[['ID'] + [col for col in data.columns if col != 'ID']]  # ID first

    # Keep only rows that hold a real sequence. Both the current placeholders
    # (#N.A.#, #DEL#, #INS#) and the legacy ones (Deletion, Insertion) are
    # rejected so none of them is embedded as if it were a protein.
    placeholder_pattern = r'#N\.A\.#|#DEL#|#INS#|Deletion|Insertion'
    valid_sequences = data['aa_sequence'].dropna().astype(str)
    valid_sequences = valid_sequences[~valid_sequences.str.contains(placeholder_pattern)]

    # Truncate at the first stop codon; drop sequences that become empty
    # because there would be no residues left to average over.
    valid_sequences = valid_sequences.apply(preprocess_sequence)
    valid_sequences = valid_sequences[valid_sequences.str.len() > 0]

    if len(valid_sequences) < 2:
        # A 2-component PCA needs at least two samples; emit empty coordinates
        # instead of failing inside the model or the decomposition.
        print("Fewer than two valid sequences found; coordinates left empty.")
        data['x_coordinate'] = float('nan')
        data['y_coordinate'] = float('nan')
    else:
        # Load the ESM-2 model
        model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
        model.eval()  # disable dropout so the embeddings are deterministic
        batch_converter = alphabet.get_batch_converter()

        sequences = valid_sequences.tolist()
        pooled = []
        with torch.no_grad():
            for start in range(0, len(sequences), batch_size):
                chunk = sequences[start:start + batch_size]
                names = [f"Sequence {start + offset}" for offset in range(len(chunk))]
                _, _, batch_tokens = batch_converter(list(zip(names, chunk)))
                results = model(batch_tokens, repr_layers=[33])
                embeddings = results["representations"][33]  # last layer

                # Average over residues only: position 0 is the start token,
                # the last non-padding position is the end token, and anything
                # after it is padding up to the longest sequence in the batch.
                token_counts = (batch_tokens != alphabet.padding_idx).sum(1)
                for row, n_tokens in zip(embeddings, token_counts):
                    pooled.append(row[1:int(n_tokens) - 1].mean(0))

        sequence_embeddings = torch.stack(pooled).numpy()

        # Dimensionality reduction using PCA
        pca = PCA(n_components=2)
        xy_coordinates = pca.fit_transform(sequence_embeddings)

        # Add x, y coordinates back to the dataframe; rows without a valid
        # sequence get NaN through index alignment.
        xy_df = pd.DataFrame(
            xy_coordinates,
            columns=['x_coordinate', 'y_coordinate'],
            index=valid_sequences.index,
        )
        data = pd.concat([data, xy_df], axis=1)

    # Determine output file location
    if output_file is None:
        input_name, input_ext = os.path.splitext(input_file)
        output_file = f"{input_name}_xy{input_ext}"

    # Save the updated dataframe to a file
    data.to_csv(output_file, index=False)
    print(f"Processed data with x, y coordinates saved to: {output_file}")
68
+
69
if __name__ == "__main__":
    # Command-line entry point: one positional input CSV and an optional
    # output path, both forwarded to process_file().
    cli = argparse.ArgumentParser(
        description="Generate x, y coordinates for amino acid sequences"
    )
    cli.add_argument('input_file', type=str, help="Path to the input CSV file")
    cli.add_argument(
        '--output_file',
        type=str,
        default=None,
        help="Path to save the output CSV file (optional)",
    )
    parsed = cli.parse_args()

    process_file(parsed.input_file, parsed.output_file)
76
+
@@ -275,11 +275,11 @@ def create_df_v(variants_df):
275
275
  )
276
276
  # Fill in 'Deletion' in 'aa_variant' column
277
277
  df_variants_.loc[
278
- df_variants_["nc_variant"] == "Deletion", "aa_variant"
279
- ] = "Deletion"
278
+ df_variants_["nc_variant"] == "#DEL#", "aa_variant"
279
+ ] = "#DEL#"
280
280
  df_variants_.loc[
281
- df_variants_["nc_variant"] == "Insertion", "aa_variant"
282
- ] = "Insertion"
281
+ df_variants_["nc_variant"] == "#INS#", "aa_variant"
282
+ ] = "#INS#"
283
283
 
284
284
  # Compare aa_variant with translated refseq and generate Substitutions column
285
285
  df_variants_["Substitutions"] = df_variants_.apply(get_mutations, axis=1)
@@ -291,7 +291,7 @@ def create_df_v(variants_df):
291
291
  # Fill in Deletion into Substitutions Column, keep #N.A.# unchanged
292
292
  for i in df_variants_.index:
293
293
  if df_variants_["nc_variant"].iloc[i] == "Deletion":
294
- df_variants_.Substitutions.iat[i] = df_variants_.Substitutions.iat[i].replace("", "-")
294
+ df_variants_.Substitutions.iat[i] = df_variants_.Substitutions.iat[i].replace("", "#DEL#")
295
295
  elif df_variants_["nc_variant"].iloc[i] == "#N.A.#":
296
296
  df_variants_.Substitutions.iat[i] = "#N.A.#"
297
297
 
@@ -321,30 +321,36 @@ def create_df_v(variants_df):
321
321
  df_variants_["Plate"] = df_variants_["Plate"].apply(
322
322
  lambda x: f"0{x}" if len(x) == 1 else x
323
323
  )
324
- # Rename columns as per the request
324
+
325
+ # First rename columns as before
325
326
  df_variants_.rename(columns={
326
327
  "Variant": "nucleotide_mutation",
327
- "Substitutions": "amino-acid_substitutions",
328
+ "Substitutions": "amino_acid_substitutions",
328
329
  "nc_variant": "nt_sequence",
329
330
  "aa_variant": "aa_sequence"
330
- },inplace=True)
331
-
332
- # Select the desired columns in the desired order
333
- restructured_df = df_variants_[[
334
- "barcode_plate",
335
- "Plate",
336
- "Well",
337
- "Alignment Count",
338
- "nucleotide_mutation",
339
- "amino-acid_substitutions",
340
- "Alignment Probability",
341
- "Average mutation frequency",
342
- "P value",
343
- "P adj. value",
344
- "nt_sequence",
345
- "aa_sequence",
346
- ]
347
- ]
331
+ }, inplace=True)
332
+
333
+ # Create a copy for restructuring to avoid affecting the original
334
+ restructured_df = df_variants_.copy()
335
+ restructured_df.columns = restructured_df.columns.str.lower().str.replace('[\s-]', '_', regex=True)
336
+ # Fix the specific column name
337
+ restructured_df.columns = restructured_df.columns.str.replace('p_adj._value', 'p_adj_value')
338
+
339
+ # Select the desired columns in the desired order
340
+ restructured_df = restructured_df[[
341
+ "barcode_plate",
342
+ "plate",
343
+ "well",
344
+ "alignment_count",
345
+ "nucleotide_mutation",
346
+ "amino_acid_substitutions",
347
+ "alignment_probability",
348
+ "average_mutation_frequency",
349
+ "p_value",
350
+ "p_adj_value",
351
+ "nt_sequence",
352
+ "aa_sequence"
353
+ ]]
348
354
 
349
355
  return restructured_df, df_variants_
350
356
 
@@ -357,9 +363,9 @@ def create_nc_variant(variant, refseq):
357
363
  elif variant == "#PARENT#":
358
364
  return refseq
359
365
  elif "DEL" in variant:
360
- return "Deletion"
366
+ return "#DEL#"
361
367
  elif variant == '+':
362
- return "Insertion"
368
+ return "#INS#"
363
369
  else:
364
370
  mutations = variant.split("_")
365
371
  nc_variant = list(refseq)
@@ -459,7 +465,7 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
459
465
  logging.info(f"Fasta file for {name} already exists. Skipping write.")
460
466
 
461
467
  barcode_path = filter_bc(cl_args, name_folder, i)
462
- output_dir = Path(result_folder) / "basecalled_reads"
468
+ output_dir = Path(result_folder) / f"{cl_args['name']}_fastq"
463
469
  output_dir.mkdir(parents=True, exist_ok=True)
464
470
 
465
471
  if not cl_args["skip_demultiplexing"]:
@@ -485,17 +491,25 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
485
491
  continue
486
492
 
487
493
  variant_df.to_csv(variant_csv_path, index=False)
488
- return variant_df
494
+ return variant_df, ref_df
489
495
 
490
496
  # Main function to run LevSeq and ensure saving of intermediate results if an error occurs
491
497
  def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
492
498
  result_folder = create_result_folder(cl_args)
499
+ # Ref folder for saving ref csv file
500
+ ref_folder = os.path.join(result_folder, "ref")
501
+ os.makedirs(ref_folder, exist_ok=True)
502
+
493
503
  configure_logging(result_folder)
504
+ logging.info("Logging configured. Starting program.")
494
505
 
495
506
  variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
496
507
 
497
508
  try:
498
- variant_df = process_ref_csv(cl_args, tqdm_fn)
509
+ variant_df, ref_df = process_ref_csv(cl_args, tqdm_fn)
510
+ ref_df_path = os.path.join(ref_folder, cl_args["name"]+".csv")
511
+ ref_df.to_csv(ref_df_path, index=False)
512
+
499
513
  if variant_df.empty:
500
514
  logging.warning("No data found during CSV processing. The CSV is empty.")
501
515
  except Exception as e: