vntyper 2.0.0b0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. vntyper-2.0.0b0/LICENSE +31 -0
  2. vntyper-2.0.0b0/MANIFEST.in +21 -0
  3. vntyper-2.0.0b0/PKG-INFO +364 -0
  4. vntyper-2.0.0b0/README.md +329 -0
  5. vntyper-2.0.0b0/setup.cfg +4 -0
  6. vntyper-2.0.0b0/setup.py +74 -0
  7. vntyper-2.0.0b0/vntyper/__init__.py +3 -0
  8. vntyper-2.0.0b0/vntyper/cli.py +706 -0
  9. vntyper-2.0.0b0/vntyper/config.json +88 -0
  10. vntyper-2.0.0b0/vntyper/dependencies/kestrel/commons-lang3-3.4.jar +0 -0
  11. vntyper-2.0.0b0/vntyper/dependencies/kestrel/java-getopt-1.0.14.jar +0 -0
  12. vntyper-2.0.0b0/vntyper/dependencies/kestrel/kanalyze.jar +0 -0
  13. vntyper-2.0.0b0/vntyper/dependencies/kestrel/kestrel.jar +0 -0
  14. vntyper-2.0.0b0/vntyper/dependencies/kestrel/logback-classic-1.1.3.jar +0 -0
  15. vntyper-2.0.0b0/vntyper/dependencies/kestrel/logback-core-1.1.3.jar +0 -0
  16. vntyper-2.0.0b0/vntyper/dependencies/kestrel/slf4j-api-1.7.12.jar +0 -0
  17. vntyper-2.0.0b0/vntyper/modules/__init__.py +44 -0
  18. vntyper-2.0.0b0/vntyper/modules/advntr/__init__.py +4 -0
  19. vntyper-2.0.0b0/vntyper/modules/advntr/advntr_config.json +16 -0
  20. vntyper-2.0.0b0/vntyper/modules/advntr/advntr_genotyping.py +479 -0
  21. vntyper-2.0.0b0/vntyper/modules/shark/__init__.py +4 -0
  22. vntyper-2.0.0b0/vntyper/modules/shark/shark_config.json +5 -0
  23. vntyper-2.0.0b0/vntyper/modules/shark/shark_filtering.py +83 -0
  24. vntyper-2.0.0b0/vntyper/scripts/__init__.py +1 -0
  25. vntyper-2.0.0b0/vntyper/scripts/alignment_processing.py +141 -0
  26. vntyper-2.0.0b0/vntyper/scripts/cohort_summary.py +892 -0
  27. vntyper-2.0.0b0/vntyper/scripts/confidence_assignment.py +151 -0
  28. vntyper-2.0.0b0/vntyper/scripts/cross_match.py +193 -0
  29. vntyper-2.0.0b0/vntyper/scripts/extract_unmapped_from_offset.py +79 -0
  30. vntyper-2.0.0b0/vntyper/scripts/fastq_bam_processing.py +705 -0
  31. vntyper-2.0.0b0/vntyper/scripts/file_processing.py +72 -0
  32. vntyper-2.0.0b0/vntyper/scripts/flagging.py +257 -0
  33. vntyper-2.0.0b0/vntyper/scripts/generate_report.py +923 -0
  34. vntyper-2.0.0b0/vntyper/scripts/install_references.py +517 -0
  35. vntyper-2.0.0b0/vntyper/scripts/install_references_config.json +45 -0
  36. vntyper-2.0.0b0/vntyper/scripts/kestrel_config.json +57 -0
  37. vntyper-2.0.0b0/vntyper/scripts/kestrel_genotyping.py +647 -0
  38. vntyper-2.0.0b0/vntyper/scripts/motif_processing.py +334 -0
  39. vntyper-2.0.0b0/vntyper/scripts/online_mode.py +242 -0
  40. vntyper-2.0.0b0/vntyper/scripts/pipeline.py +711 -0
  41. vntyper-2.0.0b0/vntyper/scripts/report_config.json +291 -0
  42. vntyper-2.0.0b0/vntyper/scripts/scoring.py +157 -0
  43. vntyper-2.0.0b0/vntyper/scripts/summary.py +358 -0
  44. vntyper-2.0.0b0/vntyper/scripts/utils.py +356 -0
  45. vntyper-2.0.0b0/vntyper/scripts/variant_parsing.py +130 -0
  46. vntyper-2.0.0b0/vntyper/templates/cohort_summary_template.html +193 -0
  47. vntyper-2.0.0b0/vntyper/templates/report_template.html +525 -0
  48. vntyper-2.0.0b0/vntyper/version.py +3 -0
  49. vntyper-2.0.0b0/vntyper.egg-info/PKG-INFO +364 -0
  50. vntyper-2.0.0b0/vntyper.egg-info/SOURCES.txt +52 -0
  51. vntyper-2.0.0b0/vntyper.egg-info/dependency_links.txt +1 -0
  52. vntyper-2.0.0b0/vntyper.egg-info/entry_points.txt +2 -0
  53. vntyper-2.0.0b0/vntyper.egg-info/requires.txt +11 -0
  54. vntyper-2.0.0b0/vntyper.egg-info/top_level.txt +1 -0
@@ -0,0 +1,31 @@
1
+ BSD 3-Clause License
2
+ The Regents of the Université Paris Cité (Imagine Institute)
3
+ Berlin Institute of Health at Charité, Universitätsmedizin Berlin
4
+
5
+ Copyright (c) 2024, Hassan Saei
6
+ Copyright (c) 2024, Bernt Popp
7
+
8
+ Redistribution and use in source and binary forms, with or without
9
+ modification, are permitted provided that the following conditions are met:
10
+
11
+ 1. Redistributions of source code must retain the above copyright notice, this
12
+ list of conditions and the following disclaimer.
13
+
14
+ 2. Redistributions in binary form must reproduce the above copyright notice,
15
+ this list of conditions and the following disclaimer in the documentation
16
+ and/or other materials provided with the distribution.
17
+
18
+ 3. Neither the name of the copyright holder nor the names of its
19
+ contributors may be used to endorse or promote products derived from
20
+ this software without specific prior written permission.
21
+
22
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25
+ DISCLAIMED. IN NO EVENT SHALL THE IMAGINE INSTITUTE OR CONTRIBUTORS BE LIABLE
26
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
28
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,21 @@
1
+ # Include the main documentation and license
2
+ include README.md
3
+ include LICENSE
4
+
5
+ # Include the main config file
6
+ include vntyper/config.json
7
+
8
+ # Include the templates
9
+ include vntyper/templates/report_template.html
10
+ include vntyper/templates/cohort_summary_template.html
11
+
12
+ # Include all JAR files in the dependencies/kestrel directory
13
+ recursive-include vntyper/dependencies/kestrel *.jar
14
+
15
+ # Include the configuration files in the scripts subpackage
16
+ include vntyper/scripts/kestrel_config.json
17
+ include vntyper/scripts/install_references_config.json
18
+
19
+ # Include the configuration files for the modules
20
+ include vntyper/modules/shark/shark_config.json
21
+ include vntyper/modules/advntr/advntr_config.json
@@ -0,0 +1,364 @@
1
+ Metadata-Version: 2.4
2
+ Name: vntyper
3
+ Version: 2.0.0b0
4
+ Summary: VNtyper: A tool for genotyping MUC1-VNTR
5
+ Home-page: https://github.com/hassansaei/vntyper
6
+ Author: Hassan Saei, Bernt Popp
7
+ Author-email: hassan.saei@inserm.fr, bernt.popp.md@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: BSD License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: pandas>=2.2.0
15
+ Requires-Dist: numpy>=2.0.2
16
+ Requires-Dist: regex>=2024.7.24
17
+ Requires-Dist: biopython>=1.84
18
+ Requires-Dist: setuptools>=72.2.0
19
+ Requires-Dist: pysam>=0.22.1
20
+ Provides-Extra: dev
21
+ Requires-Dist: pytest; extra == "dev"
22
+ Requires-Dist: black; extra == "dev"
23
+ Requires-Dist: flake8; extra == "dev"
24
+ Dynamic: author
25
+ Dynamic: author-email
26
+ Dynamic: classifier
27
+ Dynamic: description
28
+ Dynamic: description-content-type
29
+ Dynamic: home-page
30
+ Dynamic: license-file
31
+ Dynamic: provides-extra
32
+ Dynamic: requires-dist
33
+ Dynamic: requires-python
34
+ Dynamic: summary
35
+
36
+ # VNtyper 2.0 - A Pipeline to genotype the MUC1-VNTR
37
+
38
+ **VNtyper 2.0** is an advanced pipeline designed to genotype MUC1 coding Variable Number Tandem Repeats (VNTR) in Autosomal Dominant Tubulointerstitial Kidney Disease (ADTKD-MUC1) using Short-Read Sequencing (SRS) data. This release is a refactored version of VNtyper v1 that integrates enhanced variant calling algorithms, robust logging mechanisms, and streamlined installation processes to provide researchers with a powerful tool for VNTR analysis.
39
+
40
+ - We have developed a web server to provide free access to VNtyper, which runs in the background for ease of use.
41
+ Access it through the following link: [vntyper-online](https://vntyper.org/)
42
+
43
+ ---
44
+
45
+ ## Table of Contents
46
+
47
+ 1. [Features](#features)
48
+ 2. [Installation](#installation)
49
+ 3. [Usage](#usage)
50
+ 4. [Pipeline Overview](#pipeline-overview)
51
+ 5. [Dependencies](#dependencies)
52
+ 6. [Linting and Code Formatting](#linting-and-code-formatting)
53
+ 7. [Pipeline Logic Diagram](#pipeline-logic-diagram)
54
+ 8. [Results](#results)
55
+ 9. [Notes](#notes)
56
+ 10. [Citations](#citations)
57
+ 11. [Contributing](#contributing)
58
+ 12. [License](#license)
59
+ 13. [Contact](#contact)
60
+
61
+ ---
62
+
63
+ ## Features
64
+
65
+ - **Variant Calling Algorithms:**
66
+ - **Kestrel:** Mapping-free genotyping using k-mer frequencies.
67
+ - **code-adVNTR (optional):** Profile-HMM-based method for VNTR genotyping.
68
+ - **SHARK (optional, FASTQ-only):** Rapid filtering and read extraction for MUC1 region in exome/whole-genome data.
69
+
70
+ - **Comprehensive Logging:**
71
+ - Logs both to the console and a dedicated log file.
72
+ - Generates MD5 checksums for all downloaded and processed files.
73
+
74
+ - **Flexible Installation:**
75
+ - Supports installation via `pip` using `setup.py`.
76
+ - Provides Conda environment setup for easy dependency management.
77
+
78
+ - **Subcommands:**
79
+ - `install-references`
80
+ - `pipeline`
81
+ - `fastq`
82
+ - `bam`
83
+ - `kestrel`
84
+ - `report`
85
+ - `cohort`
86
+ - `online`
87
+
88
+ ---
89
+
90
+ ## Installation
91
+
92
+ VNtyper 2.0 can be installed using either `pip` with `setup.py` or via Conda environments for streamlined dependency management.
93
+
94
+ ### Using `setup.py` and `pip`
95
+
96
+ 1. **Clone the Repository:**
97
+
98
+ ```bash
99
+ mkdir vntyper
100
+ git clone https://github.com/hassansaei/vntyper.git
101
+ cd vntyper
102
+ pip install .
103
+ ```
104
+
105
+ ---
106
+
107
+ ## Usage
108
+
109
+ VNtyper 2.0 offers multiple subcommands that can be used depending on your input data and requirements. Below are the main subcommands available:
110
+
111
+ ### 1. Running the Full Pipeline
112
+
113
+ To run the entire pipeline using a BAM file:
114
+
115
+ ```bash
116
+ vntyper --config-path /path/to/config.json pipeline \
117
+ --bam /path/to/sample.bam \
118
+ --output-dir /path/to/output/dir \
119
+ --threads 4 --fast-mode
120
+ ```
121
+
122
+ Alternatively, using paired-end FASTQ files:
123
+
124
+ ```bash
125
+ vntyper --config-path /path/to/config.json pipeline \
126
+ --fastq1 /path/to/sample_R1.fastq.gz \
127
+ --fastq2 /path/to/sample_R2.fastq.gz \
128
+ --output-dir /path/to/output/dir \
129
+ --threads 4 --fast-mode
130
+ ```
131
+
132
+ The adVNTR genotyping is optional and skipped by default. To enable adVNTR genotyping, use the `--extra-modules advntr` option.
133
+
134
+ **New**: To enable SHARK filtering on FASTQ reads *before* the usual QC and alignment (for improved MUC1 detection), add `shark` to the `--extra-modules` flag (e.g., `--extra-modules shark`). This can be done as:
135
+
136
+ ```bash
137
+ vntyper --config-path /path/to/config.json pipeline \
138
+ --fastq1 /path/to/sample_R1.fastq.gz \
139
+ --fastq2 /path/to/sample_R2.fastq.gz \
140
+ --extra-modules shark \
141
+ --threads 4 \
142
+ --output-dir /path/to/output/dir
143
+ ```
144
+
145
+ - SHARK will run first on the raw FASTQ files to extract and filter reads covering the MUC1 VNTR region.
146
+ - **Important**: SHARK is only supported in FASTQ mode. If you try to use `--extra-modules shark` together with `--bam` or `--cram`, VNtyper will exit gracefully with a warning.
147
+
148
+ ### 2. Running VNtyper with Docker
149
+
150
+ A Docker image for VNtyper 2.0 is provided and can be pulled and used as follows:
151
+
152
+ ```bash
153
+ # pull the docker image
154
+ docker pull saei/vntyper:main
155
+
156
+ # run the pipeline using the docker image
157
+ docker run -w /opt/vntyper --rm \
158
+ -v /local/input/folder/:/opt/vntyper/input \
159
+ -v /local/output/folder/:/opt/vntyper/output \
160
+ saei/vntyper:main \
161
+ vntyper pipeline \
162
+ --bam /opt/vntyper/input/filename.bam \
163
+ -o /opt/vntyper/output/filename/
164
+ ```
165
+
166
+ > **Important Host Volume Permissions Note:**
167
+ > When mounting host directories into the container (using the `-v` flag), please ensure that the host directories (e.g., `/local/input/folder/` and `/local/output/folder/`) have the appropriate permissions so that they are writable by the container's non-root user.
168
+ >
169
+ > **Why Non-Root?**
170
+ > VNtyper runs as a non-root user for enhanced security and to avoid file ownership issues on your host. Running as root may create files owned by root, leading to permission problems later.
171
+ >
172
+ > There are two ways to ensure proper permissions:
173
+ >
174
+ > 1. **Adjust Host Directory Permissions:**
175
+ > Change the ownership/permissions on the host directories so that the UID and GID match those expected by VNtyper in the container.
176
+ >
177
+ > 2. **Use the `--user` Flag:**
178
+ > Run the container with the `--user` flag to specify your current user’s UID and GID. For example:
179
+ >
180
+ > ```bash
181
+ > docker run --user $(id -u):$(id -g) -w /opt/vntyper --rm \
182
+ > -v /local/input/folder/:/opt/vntyper/input \
183
+ > -v /local/output/folder/:/opt/vntyper/output \
184
+ > saei/vntyper:main \
185
+ > vntyper pipeline \
186
+ > --bam /opt/vntyper/input/filename.bam \
187
+ > -o /opt/vntyper/output/filename/
188
+ > ```
189
+ >
190
+ > Using either method ensures VNtyper can write its log files (e.g., `pipeline.log`) and other outputs without encountering permission errors.
191
+
192
+ An Apptainer image can be generated from the Docker image as follows:
193
+
194
+ ```bash
195
+ # create the apptainer sif image
196
+ apptainer pull docker://saei/vntyper:main
197
+
198
+ # run the pipeline using the apptainer image
199
+ apptainer run --pwd /opt/vntyper \
200
+ -B /local/input/folder/:/opt/vntyper/input \
201
+ -B /local/output/folder/:/opt/vntyper/output \
202
+ vntyper_main.sif vntyper pipeline \
203
+ --bam /opt/vntyper/input/filename.bam \
204
+ -o /opt/vntyper/output/filename/
205
+ ```
206
+
207
+ ### 3. Installing References
208
+
209
+ ```bash
210
+ vntyper --config-path /path/to/config.json install-references \
211
+ --output-dir /path/to/reference/install \
212
+ --skip-indexing # Optional: skip BWA indexing if needed
213
+ ```
214
+
215
+ ### 4. Generating Reports
216
+
217
+ ```bash
218
+ vntyper --config-path /path/to/config.json report \
219
+ --output-dir /path/to/output/dir
220
+ ```
221
+
222
+ ---
223
+
224
+ ## Pipeline Overview
225
+
226
+ VNtyper 2.0 integrates multiple steps into a streamlined pipeline. The following is an overview of the steps involved:
227
+
228
+ 1. **FASTQ Quality Control**: Raw FASTQ files are checked for quality.
229
+ 2. **(Optional) SHARK Filtering**: If `shark` is specified in `--extra-modules`, raw FASTQ reads are first filtered to extract MUC1-specific reads (especially relevant for exome or large WGS datasets).
230
+ 3. **Alignment**: Reads are aligned using BWA (if FASTQ files are provided).
231
+ 4. **Kestrel Genotyping**: Mapping-free genotyping of VNTRs.
232
+ 5. **(Optional) adVNTR Genotyping**: Profile-HMM-based method for VNTR genotyping (requires additional setup).
233
+ 6. **Summary Report Generation**: A final HTML report is generated to summarize the results.
234
+
235
+ ---
236
+
237
+ ## Dependencies
238
+
239
+ VNtyper 2.0 relies on several tools and Python libraries. Ensure that the following dependencies are available in your environment:
240
+
241
+ - Python >= 3.9
242
+ - BWA
243
+ - Samtools
244
+ - Fastp
245
+ - Pandas
246
+ - Numpy
247
+ - Biopython
248
+ - Pysam
249
+ - Jinja2
250
+ - Matplotlib
251
+ - Seaborn
252
+ - IGV-Reports
253
+
254
+ You can easily set up these dependencies via the provided Conda environment file.
255
+
256
+ ---
257
+
258
+ ## Linting and Code Formatting
259
+
260
+ VNtyper adheres to PEP8 style guidelines to ensure clean, readable, and maintainable code. We recommend the following tools:
261
+
262
+ ### Using flake8 for Linting
263
+
264
+ **flake8** is used to check for style violations. Note that flake8 only reports issues—it does not automatically fix them.
265
+
266
+ 1. **Install flake8:**
267
+ You can install it as part of the development extras:
268
+
269
+ ```bash
270
+ pip install -e .[dev]
271
+ ```
272
+
273
+ *Or install it directly:*
274
+
275
+ ```bash
276
+ pip install flake8
277
+ ```
278
+
279
+ 2. **Run flake8:**
280
+ To check your code, run the following command from the project root:
281
+
282
+ ```bash
283
+ flake8 .
284
+ ```
285
+
286
+ This command will recursively scan your project and report any PEP8 issues.
287
+
288
+ ### Automatic Code Formatting with Black
289
+
290
+ For automatic formatting, we use **Black**, which is already included in the development extras.
291
+
292
+ 1. **Run Black:**
293
+ Simply execute the following command in the project root:
294
+
295
+ ```bash
296
+ black .
297
+ ```
298
+
299
+ Black will automatically reformat your code according to its opinionated style, which is also compliant with PEP8.
300
+
301
+ ---
302
+
303
+ ## Pipeline Logic Diagram
304
+
305
+ Below is a logical overview of the VNtyper pipeline:
306
+
307
+ ```mermaid
308
+ graph TD
309
+ A[Input: FASTQ/BAM] -->|Quality Control| B[Alignment BWA]
310
+ B -->|Genotyping| C[Kestrel]
311
+ C --> D[Optional: adVNTR]
312
+ D --> E[Generate Summary Report]
313
+ E --> F[Output: VCF, Summary HTML]
314
+ ```
315
+
316
+ ---
317
+
318
+ ## Results
319
+
320
+ Once the pipeline completes, you will have:
321
+
322
+ - **BAM or FASTQ** slices containing MUC1-specific reads.
323
+ - **VCF files** or **TSV files** with genotyping results (for Kestrel and optional adVNTR).
324
+ - **HTML summary report** including:
325
+ - **VNTR Region Coverage Statistics**: Detailed coverage metrics specifically for the VNTR region, including mean, median, standard deviation, minimum, and maximum coverage, as well as the percentage of the VNTR region with zero coverage.
326
+ - **Genotyping Calls**: Results from Kestrel and optional adVNTR analyses.
327
+ - **Quality Metrics**: When available, includes duplication rate, Q20/Q30 rates, and other quality indicators.
328
+ - **Pipeline Log**: Comprehensive logging information about the pipeline execution.
329
+
330
+ ---
331
+
332
+ ## Notes
333
+
334
+ 1. This tool is for **research use only**.
335
+ 2. Ensure **high-coverage WES/WGS or targeted data** is used to genotype MUC1 VNTR accurately.
336
+ 3. For questions or issues, refer to the GitHub repository for support.
337
+
338
+ ---
339
+
340
+ ## Citations
341
+
342
+ If you use VNtyper 2.0 in your research, please cite the following:
343
+
344
+ 1. Saei H, Morinière V, Heidet L, et al. VNtyper enables accurate alignment-free genotyping of MUC1 coding VNTR using short-read sequencing data. iScience. 2023.
345
+ 2. Audano PA, Ravishankar S, et al. Mapping-free variant calling using haplotype reconstruction from k-mer frequencies. Bioinformatics. 2018.
346
+ 3. Park J, Bakhtiari M, et al. Detecting tandem repeat variants in coding regions using code-adVNTR. iScience. 2022.
347
+
348
+ ---
349
+
350
+ ## Contributing
351
+
352
+ We welcome contributions to VNtyper. Please refer to the [CONTRIBUTING.md](CONTRIBUTING.md) file for guidelines.
353
+
354
+ ---
355
+
356
+ ## License
357
+
358
+ VNtyper is licensed under the BSD 3-Clause License. See the LICENSE file for more details.
359
+
360
+ ---
361
+
362
+ ## Contact
363
+
364
+ For questions or issues, please open an [issue on GitHub](https://github.com/hassansaei/vntyper/issues) or email the corresponding authors listed in the manuscript.