levseq 1.3.3__tar.gz → 1.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45):
  1. levseq-1.4.1/PKG-INFO +187 -0
  2. levseq-1.4.1/README.md +140 -0
  3. {levseq-1.3.3 → levseq-1.4.1}/levseq/__init__.py +1 -1
  4. {levseq-1.3.3 → levseq-1.4.1}/levseq/interface.py +8 -2
  5. {levseq-1.3.3 → levseq-1.4.1}/levseq/run_levseq.py +115 -13
  6. {levseq-1.3.3 → levseq-1.4.1}/levseq/utils.py +14 -13
  7. {levseq-1.3.3 → levseq-1.4.1}/levseq/variantcaller.py +58 -42
  8. {levseq-1.3.3 → levseq-1.4.1}/levseq/visualization.py +52 -0
  9. levseq-1.4.1/levseq.egg-info/PKG-INFO +187 -0
  10. {levseq-1.3.3 → levseq-1.4.1}/levseq.egg-info/SOURCES.txt +1 -0
  11. {levseq-1.3.3 → levseq-1.4.1}/setup.py +1 -0
  12. levseq-1.4.1/tests/test_copy_fastq.py +75 -0
  13. {levseq-1.3.3 → levseq-1.4.1}/tests/test_deploy.py +27 -17
  14. {levseq-1.3.3 → levseq-1.4.1}/tests/test_opligopools.py +7 -33
  15. levseq-1.3.3/PKG-INFO +0 -192
  16. levseq-1.3.3/README.md +0 -145
  17. levseq-1.3.3/levseq.egg-info/PKG-INFO +0 -192
  18. {levseq-1.3.3 → levseq-1.4.1}/LICENSE +0 -0
  19. {levseq-1.3.3 → levseq-1.4.1}/MANIFEST.in +0 -0
  20. {levseq-1.3.3 → levseq-1.4.1}/levseq/IO_processor.py +0 -0
  21. {levseq-1.3.3 → levseq-1.4.1}/levseq/barcoding/__init__.py +0 -0
  22. {levseq-1.3.3 → levseq-1.4.1}/levseq/barcoding/demultiplex +0 -0
  23. {levseq-1.3.3 → levseq-1.4.1}/levseq/barcoding/demultiplex-arm64 +0 -0
  24. {levseq-1.3.3 → levseq-1.4.1}/levseq/barcoding/demultiplex-x86 +0 -0
  25. {levseq-1.3.3 → levseq-1.4.1}/levseq/barcoding/minion_barcodes.fasta +0 -0
  26. {levseq-1.3.3 → levseq-1.4.1}/levseq/basecaller.py +0 -0
  27. {levseq-1.3.3 → levseq-1.4.1}/levseq/cmd.py +0 -0
  28. {levseq-1.3.3 → levseq-1.4.1}/levseq/coordinates.py +0 -0
  29. {levseq-1.3.3 → levseq-1.4.1}/levseq/filter_orientation.py +0 -0
  30. {levseq-1.3.3 → levseq-1.4.1}/levseq/globals.py +0 -0
  31. {levseq-1.3.3 → levseq-1.4.1}/levseq/parser.py +0 -0
  32. {levseq-1.3.3 → levseq-1.4.1}/levseq/screen.py +0 -0
  33. {levseq-1.3.3 → levseq-1.4.1}/levseq/seqfit.py +0 -0
  34. {levseq-1.3.3 → levseq-1.4.1}/levseq/simulation.py +0 -0
  35. {levseq-1.3.3 → levseq-1.4.1}/levseq/user.py +0 -0
  36. {levseq-1.3.3 → levseq-1.4.1}/levseq.egg-info/dependency_links.txt +0 -0
  37. {levseq-1.3.3 → levseq-1.4.1}/levseq.egg-info/entry_points.txt +0 -0
  38. {levseq-1.3.3 → levseq-1.4.1}/levseq.egg-info/requires.txt +0 -0
  39. {levseq-1.3.3 → levseq-1.4.1}/levseq.egg-info/top_level.txt +0 -0
  40. {levseq-1.3.3 → levseq-1.4.1}/setup.cfg +0 -0
  41. {levseq-1.3.3 → levseq-1.4.1}/tests/test_demultiplex_docker.py +0 -0
  42. {levseq-1.3.3 → levseq-1.4.1}/tests/test_seqfitvis.py +0 -0
  43. {levseq-1.3.3 → levseq-1.4.1}/tests/test_seqs.py +0 -0
  44. {levseq-1.3.3 → levseq-1.4.1}/tests/test_statistics.py +0 -0
  45. {levseq-1.3.3 → levseq-1.4.1}/tests/test_variant_calling.py +0 -0
levseq-1.4.1/PKG-INFO ADDED
@@ -0,0 +1,187 @@
1
+ Metadata-Version: 2.1
2
+ Name: levseq
3
+ Version: 1.4.1
4
+ Home-page: https://github.com/fhalab/levseq/
5
+ Author: Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy
6
+ Author-email: ylong@caltech.edu
7
+ License: GPL3
8
+ Project-URL: Bug Tracker, https://github.com/fhalab/levseq/
9
+ Project-URL: Documentation, https://github.com/fhalab/levseq/
10
+ Project-URL: Source Code, https://github.com/fhalab/levseq/
11
+ Keywords: Nanopore,ONT,evSeq
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
15
+ Classifier: Natural Language :: English
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3.6
18
+ Classifier: Programming Language :: Python :: 3.7
19
+ Classifier: Programming Language :: Python :: 3.8
20
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
21
+ Requires-Python: >=3.8
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: Bio
25
+ Requires-Dist: biopython
26
+ Requires-Dist: fsspec
27
+ Requires-Dist: h5py
28
+ Requires-Dist: holoviews
29
+ Requires-Dist: jupyterlab
30
+ Requires-Dist: mappy
31
+ Requires-Dist: matplotlib
32
+ Requires-Dist: ninetysix
33
+ Requires-Dist: numpy
34
+ Requires-Dist: pandas
35
+ Requires-Dist: pybedtools
36
+ Requires-Dist: pycoQC
37
+ Requires-Dist: pyfaidx
38
+ Requires-Dist: pyparsing
39
+ Requires-Dist: pysam
40
+ Requires-Dist: scipy
41
+ Requires-Dist: sciutil
42
+ Requires-Dist: seaborn
43
+ Requires-Dist: scikit-learn
44
+ Requires-Dist: statsmodels
45
+ Requires-Dist: tqdm
46
+ Requires-Dist: biopandas
47
+
48
+ # Variant Sequencing with Nanopore (LevSeq)
49
+
50
+ LevSeq provides a streamlined pipeline for sequencing and analyzing genetic variants using Oxford Nanopore technology. In directed evolution experiments, LevSeq enables sequencing of every variant, enhancing data insight and creating datasets suitable for AI/ML methods. Sequence variants can be generated within a day at an extremely low cost.
51
+
52
+ ![Figure 1: LevSeq Workflow](manuscript/figures/LevSeq_Figure-1.jpeg)
53
+ Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
54
+
55
+ ## Quick Start
56
+
57
+ ### Docker Installation (Recommended)
58
+
59
+ 1. Install Docker: [https://docs.docker.com/engine/install/](https://docs.docker.com/engine/install/)
60
+ 2. Pull the appropriate image:
61
+ ```bash
62
+ # For Linux/Windows x86 systems:
63
+ docker pull yueminglong/levseq:levseq-1.4-x86
64
+
65
+ # For Mac M-series chips (M1, M2, M3, M4):
66
+ docker pull yueminglong/levseq:levseq-1.4-arm64
67
+ ```
68
+ 3. Run LevSeq:
69
+ ```bash
70
+ docker run --rm -v "/full/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
71
+ ```
72
+
73
+ ### Pip Installation (Mac/Linux only)
74
+
75
+ **IMPORTANT**: On Mac M-series chips (M1-M4), gcc 13 and 14 are **REQUIRED**:
76
+ ```bash
77
+ brew install gcc@13 gcc@14
78
+ ```
79
+
80
+ 1. Create and activate conda environment:
81
+ ```bash
82
+ conda create --name levseq python=3.12 -y
83
+ conda activate levseq
84
+ ```
85
+
86
+ 2. Install dependencies:
87
+ ```bash
88
+ conda install -c bioconda -c conda-forge samtools minimap2
89
+ ```
90
+
91
+ 3. Install LevSeq:
92
+ ```bash
93
+ pip install levseq
94
+ ```
95
+
96
+ 4. Run LevSeq:
97
+ ```bash
98
+ levseq my_experiment /path/to/data/ /path/to/ref.csv
99
+ ```
100
+
101
+ ## Data and Visualization
102
+
103
+ - **Test Data**: Sample data is available on Zenodo [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13694463.svg)](https://doi.org/10.5281/zenodo.13694463)
104
+ - **Visualization Tool**: A web application is available at [https://levseqdb.streamlit.app/](https://levseqdb.streamlit.app/) - simply upload your LevSeq output and LCMS results
105
+ - **Self-hosted Solution**: You can deploy your own instance using our [LevSeq_db repository](https://github.com/fhalab/LevSeq_db)
106
+
107
+ ## Reference File Format (ref.csv)
108
+
109
+ Your reference CSV file must contain the following columns:
110
+
111
+ | barcode_plate | name | refseq |
112
+ |---------------|--------|-----------|
113
+ | 33 | Q97A76 | ATGCGC... |
114
+
115
+ For oligopool experiments (multiple proteins per plate), use:
116
+
117
+ | barcode_plate | name | refseq |
118
+ |---------------|--------|-----------|
119
+ | 33 | Q97A76 | ATGCGCAAG |
120
+ | 33 | P96084 | ATGGATCA |
121
+ | 34 | P46209 | ATGGGGCAA |
122
+ | 34 | Q60336 | ATGGGGCC |
123
+
124
+ ## Command Line Arguments
125
+
126
+ ### Required Arguments
127
+ 1. **name**: Name of the experiment (output folder)
128
+ 2. **path**: Location of basecalled fastq files
129
+ 3. **summary**: Path to reference CSV file
130
+
131
+ ### Optional Arguments
132
+ - `--skip_demultiplexing`: Skip the demultiplexing step
133
+ - `--skip_variantcalling`: Skip the variant calling step
134
+ - `--output`: Custom save location (defaults to current directory)
135
+ - `--show_msa`: Show multiple sequence alignment for each well
136
+ - `--oligopool`: Process data as oligopool experiment
137
+
138
+ ## Step-by-Step Tutorial
139
+
140
+ 1. **Prepare your sequencing data**:
141
+ - Your fastq files should be in a directory structure similar to Nanopore's output
142
+ - Prepare a reference CSV file with barcode plates, sample names, and reference sequences
143
+
144
+ 2. **Run LevSeq**:
145
+ ```bash
146
+ # Via Docker
147
+ docker run --rm -v "/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
148
+
149
+ # Via pip
150
+ levseq my_experiment /path/to/data/ /path/to/ref.csv
151
+ ```
152
+
153
+ 3. **Analyze results**:
154
+ - Output includes variant data (CSV) and interactive visualizations (HTML)
155
+ - Upload results to the LevSeq visualization tool for further analysis
156
+
157
+ ## Experimental Setup
158
+
159
+ For the wet lab protocol:
160
+ - Refer to the [wiki](https://github.com/fhalab/LevSeq/wiki/Experimental-protocols)
161
+ - See the methods section of [our paper](https://pubs.acs.org/doi/10.1021/acssynbio.4c00625)
162
+ - Order forward and reverse primers compatible with your plasmid
163
+ - Install Oxford Nanopore's software for basecalling if needed
164
+
165
+ ## Additional Resources
166
+
167
+ - **Example Notebook**: See `example/Example.ipynb` for a walkthrough
168
+ - **Advanced Usage**: See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb)
169
+ - **Troubleshooting**: See our [computational protocols wiki](https://github.com/fhalab/LevSeq/wiki/Computational-protocols)
170
+
171
+ ## Citing LevSeq
172
+
173
+ If you find LevSeq useful, please cite our paper:
174
+
175
+ ```bibtex
176
+ @article{long2024levseq,
177
+ title={LevSeq: Rapid Generation of Sequence-Function Data for Directed Evolution and Machine Learning},
178
+ author={Long, Yueming and Mora, Ariane and Li, Francesca-Zhoufan and Gürsoy, Emre and Johnston, Kadina E and Arnold, Frances H},
179
+ journal={ACS Synthetic Biology},
180
+ year={2024},
181
+ publisher={American Chemical Society}
182
+ }
183
+ ```
184
+
185
+ ## Contact
186
+
187
+ Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
levseq-1.4.1/README.md ADDED
@@ -0,0 +1,140 @@
1
+ # Variant Sequencing with Nanopore (LevSeq)
2
+
3
+ LevSeq provides a streamlined pipeline for sequencing and analyzing genetic variants using Oxford Nanopore technology. In directed evolution experiments, LevSeq enables sequencing of every variant, enhancing data insight and creating datasets suitable for AI/ML methods. Sequence variants can be generated within a day at an extremely low cost.
4
+
5
+ ![Figure 1: LevSeq Workflow](manuscript/figures/LevSeq_Figure-1.jpeg)
6
+ Figure 1: Overview of the LevSeq variant sequencing workflow using Nanopore technology. This diagram illustrates the key steps in the process, from sample preparation to data analysis and visualization.
7
+
8
+ ## Quick Start
9
+
10
+ ### Docker Installation (Recommended)
11
+
12
+ 1. Install Docker: [https://docs.docker.com/engine/install/](https://docs.docker.com/engine/install/)
13
+ 2. Pull the appropriate image:
14
+ ```bash
15
+ # For Linux/Windows x86 systems:
16
+ docker pull yueminglong/levseq:levseq-1.4-x86
17
+
18
+ # For Mac M-series chips (M1, M2, M3, M4):
19
+ docker pull yueminglong/levseq:levseq-1.4-arm64
20
+ ```
21
+ 3. Run LevSeq:
22
+ ```bash
23
+ docker run --rm -v "/full/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
24
+ ```
25
+
26
+ ### Pip Installation (Mac/Linux only)
27
+
28
+ **IMPORTANT**: On Mac M-series chips (M1-M4), gcc 13 and 14 are **REQUIRED**:
29
+ ```bash
30
+ brew install gcc@13 gcc@14
31
+ ```
32
+
33
+ 1. Create and activate conda environment:
34
+ ```bash
35
+ conda create --name levseq python=3.12 -y
36
+ conda activate levseq
37
+ ```
38
+
39
+ 2. Install dependencies:
40
+ ```bash
41
+ conda install -c bioconda -c conda-forge samtools minimap2
42
+ ```
43
+
44
+ 3. Install LevSeq:
45
+ ```bash
46
+ pip install levseq
47
+ ```
48
+
49
+ 4. Run LevSeq:
50
+ ```bash
51
+ levseq my_experiment /path/to/data/ /path/to/ref.csv
52
+ ```
53
+
54
+ ## Data and Visualization
55
+
56
+ - **Test Data**: Sample data is available on Zenodo [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13694463.svg)](https://doi.org/10.5281/zenodo.13694463)
57
+ - **Visualization Tool**: A web application is available at [https://levseqdb.streamlit.app/](https://levseqdb.streamlit.app/) - simply upload your LevSeq output and LCMS results
58
+ - **Self-hosted Solution**: You can deploy your own instance using our [LevSeq_db repository](https://github.com/fhalab/LevSeq_db)
59
+
60
+ ## Reference File Format (ref.csv)
61
+
62
+ Your reference CSV file must contain the following columns:
63
+
64
+ | barcode_plate | name | refseq |
65
+ |---------------|--------|-----------|
66
+ | 33 | Q97A76 | ATGCGC... |
67
+
68
+ For oligopool experiments (multiple proteins per plate), use:
69
+
70
+ | barcode_plate | name | refseq |
71
+ |---------------|--------|-----------|
72
+ | 33 | Q97A76 | ATGCGCAAG |
73
+ | 33 | P96084 | ATGGATCA |
74
+ | 34 | P46209 | ATGGGGCAA |
75
+ | 34 | Q60336 | ATGGGGCC |
76
+
77
+ ## Command Line Arguments
78
+
79
+ ### Required Arguments
80
+ 1. **name**: Name of the experiment (output folder)
81
+ 2. **path**: Location of basecalled fastq files
82
+ 3. **summary**: Path to reference CSV file
83
+
84
+ ### Optional Arguments
85
+ - `--skip_demultiplexing`: Skip the demultiplexing step
86
+ - `--skip_variantcalling`: Skip the variant calling step
87
+ - `--output`: Custom save location (defaults to current directory)
88
+ - `--show_msa`: Show multiple sequence alignment for each well
89
+ - `--oligopool`: Process data as oligopool experiment
90
+
91
+ ## Step-by-Step Tutorial
92
+
93
+ 1. **Prepare your sequencing data**:
94
+ - Your fastq files should be in a directory structure similar to Nanopore's output
95
+ - Prepare a reference CSV file with barcode plates, sample names, and reference sequences
96
+
97
+ 2. **Run LevSeq**:
98
+ ```bash
99
+ # Via Docker
100
+ docker run --rm -v "/path/to/data:/levseq_results" yueminglong/levseq:levseq-1.4-arm64 my_experiment levseq_results/ levseq_results/ref.csv
101
+
102
+ # Via pip
103
+ levseq my_experiment /path/to/data/ /path/to/ref.csv
104
+ ```
105
+
106
+ 3. **Analyze results**:
107
+ - Output includes variant data (CSV) and interactive visualizations (HTML)
108
+ - Upload results to the LevSeq visualization tool for further analysis
109
+
110
+ ## Experimental Setup
111
+
112
+ For the wet lab protocol:
113
+ - Refer to the [wiki](https://github.com/fhalab/LevSeq/wiki/Experimental-protocols)
114
+ - See the methods section of [our paper](https://pubs.acs.org/doi/10.1021/acssynbio.4c00625)
115
+ - Order forward and reverse primers compatible with your plasmid
116
+ - Install Oxford Nanopore's software for basecalling if needed
117
+
118
+ ## Additional Resources
119
+
120
+ - **Example Notebook**: See `example/Example.ipynb` for a walkthrough
121
+ - **Advanced Usage**: See the [manuscript notebook](https://github.com/fhalab/LevSeq/blob/main/manuscript/notebooks/epPCR_10plates.ipynb)
122
+ - **Troubleshooting**: See our [computational protocols wiki](https://github.com/fhalab/LevSeq/wiki/Computational-protocols)
123
+
124
+ ## Citing LevSeq
125
+
126
+ If you find LevSeq useful, please cite our paper:
127
+
128
+ ```bibtex
129
+ @article{long2024levseq,
130
+ title={LevSeq: Rapid Generation of Sequence-Function Data for Directed Evolution and Machine Learning},
131
+ author={Long, Yueming and Mora, Ariane and Li, Francesca-Zhoufan and Gürsoy, Emre and Johnston, Kadina E and Arnold, Frances H},
132
+ journal={ACS Synthetic Biology},
133
+ year={2024},
134
+ publisher={American Chemical Society}
135
+ }
136
+ ```
137
+
138
+ ## Contact
139
+
140
+ Leave a feature request in the issues or reach us via [email](mailto:levseqdb@gmail.com).
@@ -18,7 +18,7 @@
18
18
  __title__ = 'levseq'
19
19
  __description__ = 'LevSeq nanopore sequencing'
20
20
  __url__ = 'https://github.com/fhalab/levseq/'
21
- __version__ = '1.3.3'
21
+ __version__ = '1.4.1'
22
22
  __author__ = 'Yueming Long, Ariane Mora, Francesca-Zhoufan Li, Emre Gursoy'
23
23
  __author_email__ = 'ylong@caltech.edu'
24
24
  __license__ = 'GPL3'
@@ -63,6 +63,9 @@ def build_cli_parser():
63
63
  optional_args_group.add_argument("--skip_variantcalling",
64
64
  action="store_true",
65
65
  help="Skip the variant calling step, default is false")
66
+ optional_args_group.add_argument("--oligopool",
67
+ action="store_true",
68
+ help="Whether this experiment came from an oligopool, default is false.")
66
69
  optional_args_group.add_argument("--show_msa",
67
70
  default=False,
68
71
  help="Skip showing msa")
@@ -79,7 +82,10 @@ def execute_LevSeq():
79
82
  tqdm_fn = tqdm.tqdm
80
83
  # Run LevSeq
81
84
  try:
85
+ from levseq import __version__
86
+ print(f"Starting LevSeq v{__version__}...")
82
87
  run_LevSeq(CL_ARGS, tqdm_fn)
88
+ print(f"Run completed successfully. Results and logs stored in {os.path.join(CL_ARGS.get('output', CWD), CL_ARGS.get('name', ''))}")
83
89
  except Exception as e:
84
- print(e)
85
- print("Run Complete, add log info")
90
+ print(f"Error: {e}")
91
+ print(f"Check error logs for details in {os.path.join(CL_ARGS.get('output', CWD), CL_ARGS.get('name', ''))}")
@@ -66,8 +66,14 @@ from importlib import resources
66
66
  from holoviews.streams import Tap
67
67
 
68
68
  # Utility function to configure logging
69
- def configure_logging(result_folder):
70
- log_format = "%(asctime)s:%(levelname)s:%(message)s"
69
+ def configure_logging(result_folder, cl_args):
70
+ import sys
71
+ from levseq import __version__
72
+
73
+ # Define a more detailed log format with clean separation
74
+ log_format = "%(asctime)s : %(levelname)s : %(message)s"
75
+
76
+ # Create log handlers
71
77
  info_handler = logging.FileHandler(os.path.join(result_folder, "LevSeq_run.log"))
72
78
  info_handler.setLevel(logging.INFO)
73
79
  info_handler.setFormatter(logging.Formatter(log_format))
@@ -76,7 +82,30 @@ def configure_logging(result_folder):
76
82
  error_handler.setLevel(logging.ERROR)
77
83
  error_handler.setFormatter(logging.Formatter(log_format))
78
84
 
85
+ # Set up basic configuration with both handlers
79
86
  logging.basicConfig(level=logging.INFO, handlers=[info_handler, error_handler])
87
+
88
+ # Log version information and command used to run
89
+ command_used = " ".join(sys.argv)
90
+ logging.info(f"LevSeq Version: {__version__}")
91
+ logging.info(f"Command: {command_used}")
92
+
93
+ # Log essential run parameters
94
+ logging.info(f"Run name: {cl_args.get('name', 'Not specified')}")
95
+ logging.info(f"Input path: {cl_args.get('path', 'Not specified')}")
96
+ logging.info(f"Summary file: {cl_args.get('summary', 'Not specified')}")
97
+
98
+ # Log optional parameters if specified
99
+ if cl_args.get('output') and cl_args.get('output') != os.getcwd():
100
+ logging.info(f"Output directory: {cl_args.get('output')}")
101
+ if cl_args.get('oligopool'):
102
+ logging.info("Running in oligopool mode")
103
+ if cl_args.get('skip_demultiplexing'):
104
+ logging.info("Skipping demultiplexing step")
105
+ if cl_args.get('skip_variantcalling'):
106
+ logging.info("Skipping variant calling step")
107
+ if cl_args.get('threshold'):
108
+ logging.info(f"Using variant threshold: {cl_args.get('threshold')}")
80
109
 
81
110
  # Create result folder
82
111
  def create_result_folder(cl_args):
@@ -139,8 +168,15 @@ def cat_fastq_files(folder_path: str, output_path: str, reads_per_file: int = 40
139
168
  else:
140
169
  for fastq_file in fastq_files:
141
170
  destination = output_path / fastq_file.name
142
- shutil.copy(fastq_file, destination)
143
- logging.info("Copied %s to %s", fastq_file, destination)
171
+ # Skip copying if source and destination are identical
172
+ if str(fastq_file) == str(destination):
173
+ logging.info("Skipping copy of %s (source and destination are identical)", fastq_file)
174
+ continue
175
+ try:
176
+ shutil.copy(fastq_file, destination)
177
+ logging.info("Copied %s to %s", fastq_file, destination)
178
+ except shutil.SameFileError:
179
+ logging.info("Skipping copy of %s (source and destination are identical files)", fastq_file)
144
180
  logging.info("All FASTQ files processed successfully to %s", output_path)
145
181
  return str(output_path)
146
182
  except Exception as e:
@@ -221,13 +257,14 @@ def demux_fastq(file_to_fastq, result_folder, barcode_path):
221
257
  executable_path = package_root / "levseq" / "barcoding" / executable_name
222
258
  if not executable_path.exists():
223
259
  raise FileNotFoundError(f"Executable not found: {executable_path}")
224
- seq_min = 200
260
+ seq_min = 200
225
261
  seq_max = 10000
226
262
  prompt = f"{executable_path} -f {file_to_fastq} -d {result_folder} -b {barcode_path} -w 100 -r 100 -m {seq_min} -x {seq_max}"
227
263
  subprocess.run(prompt, shell=True, check=True)
228
264
 
229
265
  # Variant calling using VariantCaller class
230
- def call_variant(experiment_name, experiment_folder, template_fasta, filtered_barcodes):
266
+
267
+ def call_variant(experiment_name, experiment_folder, template_fasta, filtered_barcodes, threshold=0.5, oligopool=False):
231
268
  try:
232
269
  vc = VariantCaller(
233
270
  experiment_name,
@@ -236,8 +273,9 @@ def call_variant(experiment_name, experiment_folder, template_fasta, filtered_ba
236
273
  filtered_barcodes,
237
274
  padding_start=0,
238
275
  padding_end=0,
276
+ oligopool=oligopool
239
277
  )
240
- variant_df = vc.get_variant_df(threshold=0.5, min_depth=5)
278
+ variant_df = vc.get_variant_df(threshold=threshold, min_depth=5)
241
279
  logging.info("Variant calling to create consensus reads successful")
242
280
  return variant_df
243
281
  except Exception as e:
@@ -332,7 +370,7 @@ def create_df_v(variants_df):
332
370
 
333
371
  # Create a copy for restructuring to avoid affecting the original
334
372
  restructured_df = df_variants_.copy()
335
- restructured_df.columns = restructured_df.columns.str.lower().str.replace('[\s-]', '_', regex=True)
373
+ restructured_df.columns = restructured_df.columns.str.lower().str.replace(r'[\s-]', '_', regex=True)
336
374
  # Fix the specific column name
337
375
  restructured_df.columns = restructured_df.columns.str.replace('p_adj._value', 'p_adj_value')
338
376
 
@@ -441,6 +479,63 @@ def save_csv(df, outputdir, name):
441
479
  file_path = os.path.join(outputdir, "Results", name + ".csv")
442
480
  df.to_csv(file_path)
443
481
 
482
+ # Function to process the reference CSV and generate variants
483
+ def process_ref_csv_oligopool(cl_args, tqdm_fn=tqdm.tqdm):
484
+ ref_df = pd.read_csv(cl_args["summary"])
485
+ result_folder = create_result_folder(cl_args)
486
+ variant_csv_path = os.path.join(result_folder, "variants.csv")
487
+ variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
488
+
489
+ # First get the different barcode plates (these will be unique)
490
+ barcode_plates = ref_df["barcode_plate"].unique()
491
+ ref_df["barcode_index"] = [i for i in range(len(ref_df))]
492
+ barcode_to_index = dict(zip(ref_df.barcode_plate, ref_df.barcode_index))
493
+ for barcode_plate in barcode_plates:
494
+ if not cl_args["skip_demultiplexing"]:
495
+ i = barcode_to_index[barcode_plate]
496
+ name_folder = os.path.join(result_folder, f'RB{barcode_plate}')
497
+ os.makedirs(name_folder, exist_ok=True)
498
+ barcode_path = filter_bc(cl_args, name_folder, i)
499
+ output_dir = Path(result_folder) / f"{cl_args['name']}_fastq"
500
+ output_dir.mkdir(parents=True, exist_ok=True)
501
+
502
+ file_to_fastq = cat_fastq_files(cl_args.get("path"), output_dir)
503
+ try:
504
+ demux_fastq(output_dir, name_folder, barcode_path)
505
+ except Exception as e:
506
+ logging.error("An error occurred during demultiplexing for sample {}. Skipping this sample.".format(barcode_plate), exc_info=True)
507
+ continue
508
+ # Check this - need to see if the code works... ToDo: Ariane
509
+ # Now they are all demultiplexed, we can call variants
510
+ if not cl_args["skip_variantcalling"]:
511
+ for i, row in tqdm_fn(ref_df.iterrows(), total=len(ref_df), desc="Processing Samples"):
512
+ barcode_plate = row["barcode_plate"]
513
+ name = row["name"]
514
+ refseq = row["refseq"].upper()
515
+ # Get the name folder and barcode path
516
+ temp_fasta_path = os.path.join(result_folder, f"temp_{name}.fasta")
517
+ if not os.path.exists(temp_fasta_path):
518
+ with open(temp_fasta_path, "w") as f:
519
+ f.write(f">{name}\n{refseq}\n")
520
+ else:
521
+ logging.info(f"Fasta file for {name} already exists. Skipping write.")
522
+ try:
523
+ filtered_barcodes = filter_bc(cl_args, result_folder, i)
524
+ variant_result = call_variant(f"{name}", result_folder, temp_fasta_path, filtered_barcodes,
525
+ oligopool=True)
526
+ variant_result["barcode_plate"] = barcode_plate
527
+ variant_result["name"] = name
528
+ variant_result["refseq"] = refseq
529
+ variant_df = pd.concat([variant_df, variant_result])
530
+ except Exception as e:
531
+ logging.error("An error occurred during variant calling for sample {}. Skipping this sample.".format(name), exc_info=True)
532
+ continue
533
+
534
+ variant_df.to_csv(variant_csv_path, index=False)
535
+ # visualize it as well
536
+ return variant_df, ref_df
537
+
538
+
444
539
  # Function to process the reference CSV and generate variants
445
540
  def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
446
541
  ref_df = pd.read_csv(cl_args["summary"])
@@ -493,8 +588,9 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
493
588
 
494
589
  if not cl_args["skip_variantcalling"]:
495
590
  try:
591
+ threshold = cl_args.get("threshold") if cl_args.get("threshold") is not None else 0.5
496
592
  variant_result = call_variant(
497
- f"{name}", name_folder, temp_fasta_path, barcode_path
593
+ f"{name}", name_folder, temp_fasta_path, barcode_path, threshold=threshold
498
594
  )
499
595
  variant_result["barcode_plate"] = barcode_plate
500
596
  variant_result["name"] = name
@@ -508,6 +604,7 @@ def process_ref_csv(cl_args, tqdm_fn=tqdm.tqdm):
508
604
  variant_df.to_csv(variant_csv_path, index=False)
509
605
  return variant_df, ref_df
510
606
 
607
+
511
608
  # Main function to run LevSeq and ensure saving of intermediate results if an error occurs
512
609
  def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
513
610
  result_folder = create_result_folder(cl_args)
@@ -515,13 +612,16 @@ def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
515
612
  ref_folder = os.path.join(result_folder, "ref")
516
613
  os.makedirs(ref_folder, exist_ok=True)
517
614
 
518
- configure_logging(result_folder)
519
- logging.info("Logging configured. Starting program.")
615
+ configure_logging(result_folder, cl_args)
616
+ logging.info("Logging configured. Starting analysis...")
520
617
 
521
618
  variant_df = pd.DataFrame(columns=["barcode_plate", "name", "refseq", "variant"])
522
-
619
+
523
620
  try:
524
- variant_df, ref_df = process_ref_csv(cl_args, tqdm_fn)
621
+ if cl_args["oligopool"]:
622
+ variant_df, ref_df = process_ref_csv_oligopool(cl_args, tqdm_fn)
623
+ else:
624
+ variant_df, ref_df = process_ref_csv(cl_args, tqdm_fn)
525
625
  ref_df_path = os.path.join(ref_folder, cl_args["name"]+".csv")
526
626
  ref_df.to_csv(ref_df_path, index=False)
527
627
 
@@ -544,6 +644,8 @@ def run_LevSeq(cl_args, tqdm_fn=tqdm.tqdm):
544
644
  df_variants, df_vis = create_df_v(variant_df)
545
645
  processed_csv = os.path.join(result_folder, "visualization_partial.csv")
546
646
  df_vis.to_csv(processed_csv, index=False)
647
+ if cl_args["oligopool"]:
648
+ make_oligopool_plates(df_vis, result_folder=result_folder, save_files=True)
547
649
  except Exception as e:
548
650
  processed_csv = os.path.join(result_folder, "visualization_partial.csv")
549
651
  if 'df_vis' in locals():
@@ -59,12 +59,13 @@ def translate(seq):
59
59
  'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
60
60
  'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*',
61
61
  'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W',
62
+ 'GTS': "X"
62
63
  }
63
64
  protein = ""
64
65
  if len(seq) % 3 == 0:
65
66
  for i in range(0, len(seq), 3):
66
67
  codon = seq[i:i + 3]
67
- protein += table[codon]
68
+ protein += table.get(codon, 'X')
68
69
  return protein
69
70
 
70
71
 
@@ -290,8 +291,7 @@ def get_reads_for_well(parent_name, bam_file_path: str, ref_str: str, msa_path=N
290
291
  insert_map = defaultdict(list)
291
292
  for read in bam.fetch(until_eof=True):
292
293
  # Ensure we have at least 75% coverage
293
- if read.query_sequence is not None and len(read.query_sequence) > 0.75 * len(
294
- ref_str) and read.cigartuples is not None:
294
+ if read.query_sequence is not None and read.cigartuples is not None: # and len(read.query_sequence) > 0.75 * len(ref_str) and read.cigartuples is not None:
295
295
  seq, ref, qual, ins = alignment_from_cigar(read.cigartuples, read.query_sequence, ref_str,
296
296
  read.query_qualities)
297
297
  # Make it totally align
@@ -313,16 +313,17 @@ def get_reads_for_well(parent_name, bam_file_path: str, ref_str: str, msa_path=N
313
313
  # Do this for all wells
314
314
  seq_df = make_well_df_from_reads(seqs, read_ids, read_quals)
315
315
  alignment_count = len(seq_df.values)
316
- rows_all = make_row_from_read_pileup_across_well(seq_df, ref_str, parent_name, insert_map)
317
- bam.close()
318
-
319
- if len(rows_all) > 2: # Check if we have anything to return
320
- seq_df = pd.DataFrame(rows_all)
321
- seq_df.columns = ['gene_name', 'position', 'ref', 'most_frequent', 'freq_non_ref', 'total_other',
322
- 'total_reads', 'p_value', 'percent_most_freq_mutation', 'A', 'p(a)', 'T', 'p(t)', 'G', 'p(g)',
323
- 'C', 'p(c)', 'N', 'p(n)', 'I', 'p(i)', 'Warnings']
324
- return calculate_mutation_significance_across_well(seq_df), alignment_count
325
-
316
+ if alignment_count > 0:
317
+ rows_all = make_row_from_read_pileup_across_well(seq_df, ref_str, parent_name, insert_map)
318
+ bam.close()
319
+
320
+ if len(rows_all) > 2: # Check if we have anything to return
321
+ seq_df = pd.DataFrame(rows_all)
322
+ seq_df.columns = ['gene_name', 'position', 'ref', 'most_frequent', 'freq_non_ref', 'total_other',
323
+ 'total_reads', 'p_value', 'percent_most_freq_mutation', 'A', 'p(a)', 'T', 'p(t)', 'G', 'p(g)',
324
+ 'C', 'p(c)', 'N', 'p(n)', 'I', 'p(i)', 'Warnings']
325
+ return calculate_mutation_significance_across_well(seq_df), alignment_count
326
+ return None, 0
326
327
  def make_row_from_read_pileup_across_well(well_df, ref_str, label, insert_map):
327
328
  """
328
329
  Given a pileup of reads, we want to get some summary information about that sequence