DAJIN2 0.1.32a0__zip → 0.3.2__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/MANIFEST.in +3 -0
  2. DAJIN2-0.3.2/PKG-INFO +258 -0
  3. DAJIN2-0.3.2/README.md +240 -0
  4. DAJIN2-0.3.2/requirements.txt +21 -0
  5. {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/setup.py +8 -4
  6. DAJIN2-0.3.2/src/DAJIN2/core/classification/__init__.py +1 -0
  7. DAJIN2-0.3.2/src/DAJIN2/core/classification/classifier.py +49 -0
  8. DAJIN2-0.3.2/src/DAJIN2/core/clustering/__init__.py +3 -0
  9. DAJIN2-0.3.2/src/DAJIN2/core/clustering/appender.py +44 -0
  10. DAJIN2-0.3.2/src/DAJIN2/core/clustering/clustering.py +150 -0
  11. DAJIN2-0.3.2/src/DAJIN2/core/clustering/kmer_generator.py +46 -0
  12. DAJIN2-0.3.2/src/DAJIN2/core/clustering/label_extractor.py +112 -0
  13. DAJIN2-0.3.2/src/DAJIN2/core/clustering/label_handler.py +42 -0
  14. DAJIN2-0.3.2/src/DAJIN2/core/clustering/label_merger.py +49 -0
  15. DAJIN2-0.3.2/src/DAJIN2/core/clustering/score_handler.py +142 -0
  16. DAJIN2-0.3.2/src/DAJIN2/core/consensus/__init__.py +5 -0
  17. DAJIN2-0.3.2/src/DAJIN2/core/consensus/consensus.py +132 -0
  18. DAJIN2-0.3.2/src/DAJIN2/core/consensus/name_handler.py +88 -0
  19. DAJIN2-0.3.2/src/DAJIN2/core/core.py +293 -0
  20. DAJIN2-0.3.2/src/DAJIN2/core/preprocess/__init__.py +13 -0
  21. DAJIN2-0.3.2/src/DAJIN2/core/preprocess/cache_checker.py +24 -0
  22. DAJIN2-0.3.2/src/DAJIN2/core/preprocess/directories.py +34 -0
  23. DAJIN2-0.3.2/src/DAJIN2/core/preprocess/fastx_parser.py +59 -0
  24. DAJIN2-0.3.2/src/DAJIN2/core/preprocess/genome_fetcher.py +43 -0
  25. DAJIN2-0.3.2/src/DAJIN2/core/preprocess/homopolymer_handler.py +139 -0
  26. DAJIN2-0.3.2/src/DAJIN2/core/preprocess/insertions_to_fasta.py +346 -0
  27. DAJIN2-0.3.2/src/DAJIN2/core/preprocess/knockin_handler.py +45 -0
  28. DAJIN2-0.3.2/src/DAJIN2/core/preprocess/mapping.py +123 -0
  29. DAJIN2-0.3.2/src/DAJIN2/core/preprocess/midsv_caller.py +138 -0
  30. DAJIN2-0.3.2/src/DAJIN2/core/preprocess/mutation_extractor.py +305 -0
  31. DAJIN2-0.3.2/src/DAJIN2/core/report/__init__.py +3 -0
  32. DAJIN2-0.3.2/src/DAJIN2/core/report/report_bam.py +135 -0
  33. DAJIN2-0.3.2/src/DAJIN2/core/report/report_files.py +35 -0
  34. DAJIN2-0.3.2/src/DAJIN2/core/report/report_mutation.py +198 -0
  35. {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2/gui.py +19 -10
  36. DAJIN2-0.3.2/src/DAJIN2/main.py +223 -0
  37. DAJIN2-0.3.2/src/DAJIN2/utils/config.py +65 -0
  38. DAJIN2-0.3.2/src/DAJIN2/utils/cssplits_handler.py +92 -0
  39. DAJIN2-0.3.2/src/DAJIN2/utils/dna_handler.py +7 -0
  40. DAJIN2-0.3.2/src/DAJIN2/utils/input_validator.py +169 -0
  41. DAJIN2-0.3.2/src/DAJIN2/utils/io.py +131 -0
  42. DAJIN2-0.3.2/src/DAJIN2/utils/multiprocess.py +66 -0
  43. DAJIN2-0.1.32a0/src/DAJIN2/postprocess/report.py → DAJIN2-0.3.2/src/DAJIN2/utils/report_generator.py +5 -4
  44. DAJIN2-0.3.2/src/DAJIN2/utils/sam_handler.py +193 -0
  45. {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2/view.py +5 -3
  46. DAJIN2-0.3.2/src/DAJIN2.egg-info/PKG-INFO +258 -0
  47. DAJIN2-0.3.2/src/DAJIN2.egg-info/SOURCES.txt +57 -0
  48. DAJIN2-0.3.2/src/DAJIN2.egg-info/entry_points.txt +2 -0
  49. {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2.egg-info/requires.txt +7 -5
  50. DAJIN2-0.1.32a0/PKG-INFO +0 -71
  51. DAJIN2-0.1.32a0/README.md +0 -57
  52. DAJIN2-0.1.32a0/src/DAJIN2/DAJIN2.py +0 -266
  53. DAJIN2-0.1.32a0/src/DAJIN2/batch.py +0 -92
  54. DAJIN2-0.1.32a0/src/DAJIN2/core/classification/__init__.py +0 -2
  55. DAJIN2-0.1.32a0/src/DAJIN2/core/classification/classify.py +0 -83
  56. DAJIN2-0.1.32a0/src/DAJIN2/core/classification/detect_sv.py +0 -17
  57. DAJIN2-0.1.32a0/src/DAJIN2/core/clustering/__init__.py +0 -2
  58. DAJIN2-0.1.32a0/src/DAJIN2/core/clustering/clustering.py +0 -171
  59. DAJIN2-0.1.32a0/src/DAJIN2/core/clustering/make_score.py +0 -71
  60. DAJIN2-0.1.32a0/src/DAJIN2/core/clustering/merge_clusters.py +0 -40
  61. DAJIN2-0.1.32a0/src/DAJIN2/core/clustering/return_labels.py +0 -106
  62. DAJIN2-0.1.32a0/src/DAJIN2/core/consensus/__init__.py +0 -5
  63. DAJIN2-0.1.32a0/src/DAJIN2/core/consensus/consensus.py +0 -151
  64. DAJIN2-0.1.32a0/src/DAJIN2/core/core_execute.py +0 -202
  65. DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/__init__.py +0 -12
  66. DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/call_midsv.py +0 -131
  67. DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/correct_knockin.py +0 -161
  68. DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/correct_revititive_deletions.py +0 -138
  69. DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/correct_sequence_error.py +0 -240
  70. DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/extract_knockin_loci.py +0 -31
  71. DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/extract_mutation_loci.py +0 -160
  72. DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/format_inputs.py +0 -119
  73. DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/mappy_align.py +0 -116
  74. DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/replace_NtoD.py +0 -48
  75. DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/validate_inputs.py +0 -150
  76. DAJIN2-0.1.32a0/src/DAJIN2/core/report/__init__.py +0 -1
  77. DAJIN2-0.1.32a0/src/DAJIN2/core/report/report_bam.py +0 -319
  78. DAJIN2-0.1.32a0/src/DAJIN2/core/report/report_files.py +0 -38
  79. DAJIN2-0.1.32a0/src/DAJIN2/postprocess/__init__.py +0 -0
  80. DAJIN2-0.1.32a0/src/DAJIN2/single.py +0 -27
  81. DAJIN2-0.1.32a0/src/DAJIN2.egg-info/PKG-INFO +0 -71
  82. DAJIN2-0.1.32a0/src/DAJIN2.egg-info/SOURCES.txt +0 -48
  83. DAJIN2-0.1.32a0/src/DAJIN2.egg-info/entry_points.txt +0 -2
  84. {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/LICENSE +0 -0
  85. {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/setup.cfg +0 -0
  86. {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2/__init__.py +0 -0
  87. {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2/core/__init__.py +0 -0
  88. /DAJIN2-0.1.32a0/src/DAJIN2/core/consensus/subset.py → /DAJIN2-0.3.2/src/DAJIN2/core/consensus/clust_subsetter.py +0 -0
  89. {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2/static/css/style.css +0 -0
  90. {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2/template_igvjs.html +0 -0
  91. {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2/templates/index.html +0 -0
  92. {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2.egg-info/dependency_links.txt +0 -0
  93. {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2.egg-info/top_level.txt +0 -0
@@ -1,3 +1,6 @@
1
+ include requirements.txt
1
2
  include src/DAJIN2/template_igvjs.html
3
+
2
4
  graft src/DAJIN2/templates
3
5
  graft src/DAJIN2/static
6
+ graft src/DAJIN2/utils
DAJIN2-0.3.2/PKG-INFO ADDED
@@ -0,0 +1,258 @@
1
+ Metadata-Version: 2.1
2
+ Name: DAJIN2
3
+ Version: 0.3.2
4
+ Summary: One-step genotyping tools for targeted long-read sequencing
5
+ Home-page: https://github.com/akikuno/DAJIN2
6
+ Author: Akihiro Kuno
7
+ Author-email: akuno@md.tsukuba.ac.jp
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Environment :: Console
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: POSIX
13
+ Classifier: Operating System :: MacOS
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+
19
+ [![License](https://img.shields.io/badge/License-MIT-9cf.svg?style=flat-square)](https://choosealicense.com/licenses/mit/)
20
+ [![Test](https://img.shields.io/github/actions/workflow/status/akikuno/dajin2/pytest.yml?branch=main&label=Test&color=brightgreen&style=flat-square)](https://github.com/akikuno/dajin2/actions)
21
+ [![Python](https://img.shields.io/pypi/pyversions/DAJIN2.svg?label=Python&color=blue&style=flat-square)](https://pypi.org/project/DAJIN2/)
22
+ [![PyPI](https://img.shields.io/pypi/v/DAJIN2.svg?label=PyPI&color=orange&style=flat-square)](https://pypi.org/project/DAJIN2/)
23
+ [![Bioconda](https://img.shields.io/conda/v/bioconda/dajin2?label=Bioconda&color=orange&style=flat-square)](https://anaconda.org/bioconda/dajin2)
24
+ [![DOI](https://zenodo.org/badge/387721337.svg)](https://zenodo.org/badge/latestdoi/387721337)
25
+
26
+
27
+ <p align="center">
28
+ <img src="https://user-images.githubusercontent.com/15861316/261833016-7f356960-88cf-4574-87e2-36162b174340.png" width="90%">
29
+ </p>
30
+
31
+ [日本語はこちら](https://github.com/akikuno/DAJIN2/blob/main/docs/README_JP.md)
32
+
33
+ DAJIN2 is a genotyping software designed for organisms that have undergone genome editing, utilizing nanopore sequencing technology.
34
+
35
+ The name DAJIN is inspired by the term 一網**打尽** (Ichimou **DAJIN** or Yīwǎng **Dǎjìn**), which signifies capturing everything in a single net.
36
+
37
+ ## 🙏 Feedback
38
+
39
+ DAJIN2 is still in the development phase.
40
+ Basic tests covering point mutations, deletions, and insertion designs have been conducted.
41
+ If you encounter any bugs or issues, please report them via [Issues](https://github.com/akikuno/DAJIN2/issues).
42
+
43
+
44
+
45
+ ## 🛠 Installation
46
+
47
+ ### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
48
+
49
+ ```bash
50
+ conda install -c bioconda DAJIN2
51
+ ```
52
+
53
+ ### From [PyPI](https://pypi.org/project/DAJIN2/)
54
+
55
+ ```bash
56
+ pip install DAJIN2
57
+ ```
58
+
59
+ > **Warning**
60
+ > If you encounter the error **Failed to build mappy** when installing DAJIN2 from pip, please install `gcc` and `zlib`.
61
+ > `sudo apt install gcc zlib1g zlib1g-dev` (Ubuntu)
62
+ > `brew install gcc zlib` (macOS)
63
+
64
+ <!-- ```bash
65
+ # Ubuntu
66
+ sudo apt install gcc zlib1g zlib1g-dev
67
+ ```
68
+
69
+ ```bash
70
+ # macOS
71
+ brew install gcc zlib
72
+ ``` -->
73
+
74
+
75
+ ## 💡 Usage
76
+
77
+ ### Single Sample Analysis
78
+
79
+ DAJIN2 allows for the analysis of single samples (one sample vs one control).
80
+
81
+ ```bash
82
+ DAJIN2 <-s|--sample> <-c|--control> <-a|--allele> <-n|--name> [-g|--genome] [-t|--threads] [-h|--help] [-v|--version]
83
+
84
+ options:
85
+ -s, --sample Path to a sample FASTQ file
86
+ -c, --control Path to a control FASTQ file
87
+ -a, --allele Path to a FASTA file
88
+ -n, --name Output directory name
89
+ -g, --genome (Optional) Reference genome ID (e.g., hg38, mm39) [default: '']
90
+ -t, --threads (Optional) Number of threads [default: 1]
91
+ -h, --help show this help message and exit
92
+ -v, --version show the version number and exit
93
+ ```
94
+
95
+ #### Example
96
+
97
+ ```bash
98
+ # Download the example dataset
99
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example-single.tar.gz
100
+ tar -xf example-single.tar.gz
101
+
102
+ # Run DAJIN2
103
+ DAJIN2 \
104
+ --name stx2-deletion \
105
+ --sample example-single/sample.fq.gz \
106
+ --control example-single/control.fq.gz \
107
+ --allele example-single/design.fa \
108
+ --genome mm39 \
109
+ --threads 10
110
+
111
+ # 2023-06-04 11:30:03: example-single/control.fq.gz is now processing...
112
+ # 2023-06-04 11:30:06: Preprocess example-single/control.fq.gz...
113
+ # 2023-06-04 11:30:06: Mapping example-single/control.fq.gz...
114
+ # 2023-06-04 11:30:21: Call MIDSV example-single/control.fq.gz...
115
+ # 2023-06-04 11:30:31: 🍵 example-single/control.fq.gz is finished!
116
+ # 2023-06-04 11:30:31: example-single/sample.fq.gz is now processing...
117
+ # 2023-06-04 11:30:35: Preprocess example-single/sample.fq.gz...
118
+ # 2023-06-04 11:34:13: Classify example-single/sample.fq.gz...
119
+ # 2023-06-04 11:34:18: Clustering example-single/sample.fq.gz...
120
+ # 2023-06-04 11:35:01: Consensus calling example-single/sample.fq.gz...
121
+ # 2023-06-04 11:35:08: 🍵 example-single/sample.fq.gz is finished!
122
+ # 🎉 Finished! Open DAJIN_Results/stx2-deletion to see the report.
123
+ ```
124
+
125
+ ### Batch Processing
126
+
127
+ By using the `batch` subcommand, you can process multiple FASTQ files simultaneously.
128
+ For this purpose, a CSV or Excel file consolidating the sample information is required.
129
+ For a specific example, please refer to [this link](https://github.com/akikuno/DAJIN2/blob/main/examples/example-batch/batch.csv).
130
+
131
+
132
+ ```bash
133
+ DAJIN2 batch <-f|--file> [-t|--threads] [-h]
134
+
135
+ options:
136
+ -f, --file Path to a CSV or Excel file
137
+ -t, --threads (Optional) Number of threads [default: 1]
138
+ -h, --help Show this help message and exit
139
+ ```
140
+
141
+ #### Example
142
+
143
+ ```bash
144
+ # Download the example dataset
145
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example-batch.tar.gz
146
+ tar -xf example-batch.tar.gz
147
+
148
+ # Run DAJIN2
149
+ DAJIN2 batch --file example-batch/batch.csv --threads 3
150
+
151
+ # 2023-07-31 17:01:10: example-batch/tyr_control.fq.gz is now processing...
152
+ # 2023-07-31 17:01:16: Preprocess example-batch/tyr_control.fq.gz...
153
+ # 2023-07-31 17:01:48: Output BAM files of example-batch/tyr_control.fq.gz...
154
+ # 2023-07-31 17:01:52: 🍵 example-batch/tyr_control.fq.gz is finished!
155
+ # 2023-07-31 17:01:52: example-batch/tyr_c230gt_50%.fq.gz is now processing...
156
+ # 2023-07-31 17:01:52: example-batch/tyr_c230gt_10%.fq.gz is now processing...
157
+ # 2023-07-31 17:01:52: example-batch/tyr_c230gt_01%.fq.gz is now processing...
158
+ # 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_01%.fq.gz...
159
+ # 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_50%.fq.gz...
160
+ # 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_10%.fq.gz...
161
+ # 2023-07-31 17:02:17: Classify example-batch/tyr_c230gt_50%.fq.gz...
162
+ # 2023-07-31 17:02:19: Clustering example-batch/tyr_c230gt_50%.fq.gz...
163
+ # 2023-07-31 17:02:34: Classify example-batch/tyr_c230gt_01%.fq.gz...
164
+ # 2023-07-31 17:02:35: Classify example-batch/tyr_c230gt_10%.fq.gz...
165
+ # 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_01%.fq.gz...
166
+ # 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_10%.fq.gz...
167
+ # 2023-07-31 17:02:53: Consensus calling of example-batch/tyr_c230gt_50%.fq.gz...
168
+ # 2023-07-31 17:02:59: Output reports of example-batch/tyr_c230gt_50%.fq.gz...
169
+ # 2023-07-31 17:03:04: 🍵 example-batch/tyr_c230gt_50%.fq.gz is finished!
170
+ # 2023-07-31 17:03:39: Consensus calling of example-batch/tyr_c230gt_01%.fq.gz...
171
+ # 2023-07-31 17:03:51: Output reports of example-batch/tyr_c230gt_01%.fq.gz...
172
+ # 2023-07-31 17:04:03: 🍵 example-batch/tyr_c230gt_01%.fq.gz is finished!
173
+ # 2023-07-31 17:04:08: Consensus calling of example-batch/tyr_c230gt_10%.fq.gz...
174
+ # 2023-07-31 17:04:16: Output reports of example-batch/tyr_c230gt_10%.fq.gz...
175
+ # 2023-07-31 17:04:24: 🍵 example-batch/tyr_c230gt_10%.fq.gz is finished!
176
+ # 🎉 Finished! Open DAJIN_Results/tyr-substitution to see the report.
177
+ ```
178
+
179
+ ## 📈 Report Contents
180
+
181
+ Upon completion of DAJIN2 processing, a directory named **DAJIN_Results** is generated.
182
+ Inside the **DAJIN_Results** directory, the following files can be found:
183
+
184
+ ```
185
+ DAJIN_Results/tyr-substitution
186
+ ├── BAM
187
+ │ ├── tyr_c230gt_01%
188
+ │ ├── tyr_c230gt_10%
189
+ │ ├── tyr_c230gt_50%
190
+ │ └── tyr_control
191
+ ├── FASTA
192
+ │ ├── tyr_c230gt_01%
193
+ │ ├── tyr_c230gt_10%
194
+ │ └── tyr_c230gt_50%
195
+ ├── HTML
196
+ │ ├── tyr_c230gt_01%
197
+ │ ├── tyr_c230gt_10%
198
+ │ └── tyr_c230gt_50%
199
+ ├── MUTATION_INFO
200
+ │ ├── tyr_c230gt_01%.csv
201
+ │ ├── tyr_c230gt_10%.csv
202
+ │ └── tyr_c230gt_50%.csv
203
+ ├── read_all.csv
204
+ ├── read_plot.html
205
+ ├── read_plot.pdf
206
+ └── read_summary.csv
207
+ ```
208
+
209
+ ### 1. BAM
210
+
211
+ The BAM directory contains the BAM files of reads classified per allele.
212
+
213
+ > **Note**
214
+ > Specifying a reference genome using the `genome` option will align the reads to that genome.
215
+ > Without the `genome` option, the reads will be aligned to the control allele within the input FASTA file.
216
+
217
+ ### 2. FASTA and HTML
218
+
219
+ The FASTA directory stores the FASTA files of each allele.
220
+ The HTML directory contains HTML files for each allele, where mutation sites are color-highlighted.
221
+ For example, a Tyr point mutation is highlighted in **green**.
222
+
223
+ <img src="https://user-images.githubusercontent.com/15861316/274518501-2ca3f442-1b86-4635-be3d-fd37575c4ca2.png" width="75%" />
224
+
225
+ ### 3. MUTATION_INFO
226
+
227
+ The MUTATION_INFO directory saves tables depicting mutation sites for each allele.
228
+ An example of a Tyr point mutation is described by its position on the chromosome and the type of mutation.
229
+
230
+ <img src="https://user-images.githubusercontent.com/15861316/274519342-a613490d-5dbb-4a27-a2cf-bca0686b30f0.png" width="75%">
231
+
232
+ ### 4. read_plot.html and read_plot.pdf
233
+
234
+ Both read_plot.html and read_plot.pdf illustrate the proportions of each allele.
235
+ The chart's **Allele type** indicates the type of allele, and **% of reads** shows the proportion of reads for that allele.
236
+
237
+ Additionally, the values of **Allele type** include:
238
+ - **intact**: Alleles that perfectly match the input FASTA allele.
239
+ - **indels**: Substitutions, deletions, insertions, or inversions within 50 bases.
240
+ - **sv**: Substitutions, deletions, insertions, or inversions beyond 50 bases.
241
+
242
+ <img src="https://user-images.githubusercontent.com/15861316/274521067-4d217251-4c62-4dc9-9c05-7f5377dd3025.png" width="75%">
243
+
244
+ > **Warning**
245
+ > In PCR amplicon sequencing, the % of reads might not match the actual allele proportions due to amplification bias.
246
+ > Especially when large deletions are present, the deletion alleles might be significantly amplified, potentially not reflecting the actual allele proportions.
247
+
248
+ ### 5. read_all.csv and read_summary.csv
249
+
250
+ - read_all.csv: Records which allele each read is classified under.
251
+ - read_summary.csv: Describes the number of reads and presence proportion for each allele.
252
+
253
+
254
+ ## 📄 References
255
+
256
+ For more information, please refer to the following publication:
257
+
258
+ [Kuno A, et al. (2022) DAJIN enables multiplex genotyping to simultaneously validate intended and unintended target genome editing outcomes. *PLoS Biology* 20(1): e3001507.](https://doi.org/10.1371/journal.pbio.3001507)
DAJIN2-0.3.2/README.md ADDED
@@ -0,0 +1,240 @@
1
+ [![License](https://img.shields.io/badge/License-MIT-9cf.svg?style=flat-square)](https://choosealicense.com/licenses/mit/)
2
+ [![Test](https://img.shields.io/github/actions/workflow/status/akikuno/dajin2/pytest.yml?branch=main&label=Test&color=brightgreen&style=flat-square)](https://github.com/akikuno/dajin2/actions)
3
+ [![Python](https://img.shields.io/pypi/pyversions/DAJIN2.svg?label=Python&color=blue&style=flat-square)](https://pypi.org/project/DAJIN2/)
4
+ [![PyPI](https://img.shields.io/pypi/v/DAJIN2.svg?label=PyPI&color=orange&style=flat-square)](https://pypi.org/project/DAJIN2/)
5
+ [![Bioconda](https://img.shields.io/conda/v/bioconda/dajin2?label=Bioconda&color=orange&style=flat-square)](https://anaconda.org/bioconda/dajin2)
6
+ [![DOI](https://zenodo.org/badge/387721337.svg)](https://zenodo.org/badge/latestdoi/387721337)
7
+
8
+
9
+ <p align="center">
10
+ <img src="https://user-images.githubusercontent.com/15861316/261833016-7f356960-88cf-4574-87e2-36162b174340.png" width="90%">
11
+ </p>
12
+
13
+ [日本語はこちら](https://github.com/akikuno/DAJIN2/blob/main/docs/README_JP.md)
14
+
15
+ DAJIN2 is a genotyping software designed for organisms that have undergone genome editing, utilizing nanopore sequencing technology.
16
+
17
+ The name DAJIN is inspired by the term 一網**打尽** (Ichimou **DAJIN** or Yīwǎng **Dǎjìn**), which signifies capturing everything in a single net.
18
+
19
+ ## 🙏 Feedback
20
+
21
+ DAJIN2 is still in the development phase.
22
+ Basic tests covering point mutations, deletions, and insertion designs have been conducted.
23
+ If you encounter any bugs or issues, please report them via [Issues](https://github.com/akikuno/DAJIN2/issues).
24
+
25
+
26
+
27
+ ## 🛠 Installation
28
+
29
+ ### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
30
+
31
+ ```bash
32
+ conda install -c bioconda DAJIN2
33
+ ```
34
+
35
+ ### From [PyPI](https://pypi.org/project/DAJIN2/)
36
+
37
+ ```bash
38
+ pip install DAJIN2
39
+ ```
40
+
41
+ > **Warning**
42
+ > If you encounter the error **Failed to build mappy** when installing DAJIN2 from pip, please install `gcc` and `zlib`.
43
+ > `sudo apt install gcc zlib1g zlib1g-dev` (Ubuntu)
44
+ > `brew install gcc zlib` (macOS)
45
+
46
+ <!-- ```bash
47
+ # Ubuntu
48
+ sudo apt install gcc zlib1g zlib1g-dev
49
+ ```
50
+
51
+ ```bash
52
+ # macOS
53
+ brew install gcc zlib
54
+ ``` -->
55
+
56
+
57
+ ## 💡 Usage
58
+
59
+ ### Single Sample Analysis
60
+
61
+ DAJIN2 allows for the analysis of single samples (one sample vs one control).
62
+
63
+ ```bash
64
+ DAJIN2 <-s|--sample> <-c|--control> <-a|--allele> <-n|--name> [-g|--genome] [-t|--threads] [-h|--help] [-v|--version]
65
+
66
+ options:
67
+ -s, --sample Path to a sample FASTQ file
68
+ -c, --control Path to a control FASTQ file
69
+ -a, --allele Path to a FASTA file
70
+ -n, --name Output directory name
71
+ -g, --genome (Optional) Reference genome ID (e.g., hg38, mm39) [default: '']
72
+ -t, --threads (Optional) Number of threads [default: 1]
73
+ -h, --help show this help message and exit
74
+ -v, --version show the version number and exit
75
+ ```
76
+
77
+ #### Example
78
+
79
+ ```bash
80
+ # Download the example dataset
81
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example-single.tar.gz
82
+ tar -xf example-single.tar.gz
83
+
84
+ # Run DAJIN2
85
+ DAJIN2 \
86
+ --name stx2-deletion \
87
+ --sample example-single/sample.fq.gz \
88
+ --control example-single/control.fq.gz \
89
+ --allele example-single/design.fa \
90
+ --genome mm39 \
91
+ --threads 10
92
+
93
+ # 2023-06-04 11:30:03: example-single/control.fq.gz is now processing...
94
+ # 2023-06-04 11:30:06: Preprocess example-single/control.fq.gz...
95
+ # 2023-06-04 11:30:06: Mapping example-single/control.fq.gz...
96
+ # 2023-06-04 11:30:21: Call MIDSV example-single/control.fq.gz...
97
+ # 2023-06-04 11:30:31: 🍵 example-single/control.fq.gz is finished!
98
+ # 2023-06-04 11:30:31: example-single/sample.fq.gz is now processing...
99
+ # 2023-06-04 11:30:35: Preprocess example-single/sample.fq.gz...
100
+ # 2023-06-04 11:34:13: Classify example-single/sample.fq.gz...
101
+ # 2023-06-04 11:34:18: Clustering example-single/sample.fq.gz...
102
+ # 2023-06-04 11:35:01: Consensus calling example-single/sample.fq.gz...
103
+ # 2023-06-04 11:35:08: 🍵 example-single/sample.fq.gz is finished!
104
+ # 🎉 Finished! Open DAJIN_Results/stx2-deletion to see the report.
105
+ ```
106
+
107
+ ### Batch Processing
108
+
109
+ By using the `batch` subcommand, you can process multiple FASTQ files simultaneously.
110
+ For this purpose, a CSV or Excel file consolidating the sample information is required.
111
+ For a specific example, please refer to [this link](https://github.com/akikuno/DAJIN2/blob/main/examples/example-batch/batch.csv).
112
+
113
+
114
+ ```bash
115
+ DAJIN2 batch <-f|--file> [-t|--threads] [-h]
116
+
117
+ options:
118
+ -f, --file Path to a CSV or Excel file
119
+ -t, --threads (Optional) Number of threads [default: 1]
120
+ -h, --help Show this help message and exit
121
+ ```
122
+
123
+ #### Example
124
+
125
+ ```bash
126
+ # Download the example dataset
127
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example-batch.tar.gz
128
+ tar -xf example-batch.tar.gz
129
+
130
+ # Run DAJIN2
131
+ DAJIN2 batch --file example-batch/batch.csv --threads 3
132
+
133
+ # 2023-07-31 17:01:10: example-batch/tyr_control.fq.gz is now processing...
134
+ # 2023-07-31 17:01:16: Preprocess example-batch/tyr_control.fq.gz...
135
+ # 2023-07-31 17:01:48: Output BAM files of example-batch/tyr_control.fq.gz...
136
+ # 2023-07-31 17:01:52: 🍵 example-batch/tyr_control.fq.gz is finished!
137
+ # 2023-07-31 17:01:52: example-batch/tyr_c230gt_50%.fq.gz is now processing...
138
+ # 2023-07-31 17:01:52: example-batch/tyr_c230gt_10%.fq.gz is now processing...
139
+ # 2023-07-31 17:01:52: example-batch/tyr_c230gt_01%.fq.gz is now processing...
140
+ # 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_01%.fq.gz...
141
+ # 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_50%.fq.gz...
142
+ # 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_10%.fq.gz...
143
+ # 2023-07-31 17:02:17: Classify example-batch/tyr_c230gt_50%.fq.gz...
144
+ # 2023-07-31 17:02:19: Clustering example-batch/tyr_c230gt_50%.fq.gz...
145
+ # 2023-07-31 17:02:34: Classify example-batch/tyr_c230gt_01%.fq.gz...
146
+ # 2023-07-31 17:02:35: Classify example-batch/tyr_c230gt_10%.fq.gz...
147
+ # 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_01%.fq.gz...
148
+ # 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_10%.fq.gz...
149
+ # 2023-07-31 17:02:53: Consensus calling of example-batch/tyr_c230gt_50%.fq.gz...
150
+ # 2023-07-31 17:02:59: Output reports of example-batch/tyr_c230gt_50%.fq.gz...
151
+ # 2023-07-31 17:03:04: 🍵 example-batch/tyr_c230gt_50%.fq.gz is finished!
152
+ # 2023-07-31 17:03:39: Consensus calling of example-batch/tyr_c230gt_01%.fq.gz...
153
+ # 2023-07-31 17:03:51: Output reports of example-batch/tyr_c230gt_01%.fq.gz...
154
+ # 2023-07-31 17:04:03: 🍵 example-batch/tyr_c230gt_01%.fq.gz is finished!
155
+ # 2023-07-31 17:04:08: Consensus calling of example-batch/tyr_c230gt_10%.fq.gz...
156
+ # 2023-07-31 17:04:16: Output reports of example-batch/tyr_c230gt_10%.fq.gz...
157
+ # 2023-07-31 17:04:24: 🍵 example-batch/tyr_c230gt_10%.fq.gz is finished!
158
+ # 🎉 Finished! Open DAJIN_Results/tyr-substitution to see the report.
159
+ ```
160
+
161
+ ## 📈 Report Contents
162
+
163
+ Upon completion of DAJIN2 processing, a directory named **DAJIN_Results** is generated.
164
+ Inside the **DAJIN_Results** directory, the following files can be found:
165
+
166
+ ```
167
+ DAJIN_Results/tyr-substitution
168
+ ├── BAM
169
+ │ ├── tyr_c230gt_01%
170
+ │ ├── tyr_c230gt_10%
171
+ │ ├── tyr_c230gt_50%
172
+ │ └── tyr_control
173
+ ├── FASTA
174
+ │ ├── tyr_c230gt_01%
175
+ │ ├── tyr_c230gt_10%
176
+ │ └── tyr_c230gt_50%
177
+ ├── HTML
178
+ │ ├── tyr_c230gt_01%
179
+ │ ├── tyr_c230gt_10%
180
+ │ └── tyr_c230gt_50%
181
+ ├── MUTATION_INFO
182
+ │ ├── tyr_c230gt_01%.csv
183
+ │ ├── tyr_c230gt_10%.csv
184
+ │ └── tyr_c230gt_50%.csv
185
+ ├── read_all.csv
186
+ ├── read_plot.html
187
+ ├── read_plot.pdf
188
+ └── read_summary.csv
189
+ ```
190
+
191
+ ### 1. BAM
192
+
193
+ The BAM directory contains the BAM files of reads classified per allele.
194
+
195
+ > **Note**
196
+ > Specifying a reference genome using the `genome` option will align the reads to that genome.
197
+ > Without the `genome` option, the reads will be aligned to the control allele within the input FASTA file.
198
+
199
+ ### 2. FASTA and HTML
200
+
201
+ The FASTA directory stores the FASTA files of each allele.
202
+ The HTML directory contains HTML files for each allele, where mutation sites are color-highlighted.
203
+ For example, a Tyr point mutation is highlighted in **green**.
204
+
205
+ <img src="https://user-images.githubusercontent.com/15861316/274518501-2ca3f442-1b86-4635-be3d-fd37575c4ca2.png" width="75%" />
206
+
207
+ ### 3. MUTATION_INFO
208
+
209
+ The MUTATION_INFO directory saves tables depicting mutation sites for each allele.
210
+ An example of a Tyr point mutation is described by its position on the chromosome and the type of mutation.
211
+
212
+ <img src="https://user-images.githubusercontent.com/15861316/274519342-a613490d-5dbb-4a27-a2cf-bca0686b30f0.png" width="75%">
213
+
214
+ ### 4. read_plot.html and read_plot.pdf
215
+
216
+ Both read_plot.html and read_plot.pdf illustrate the proportions of each allele.
217
+ The chart's **Allele type** indicates the type of allele, and **% of reads** shows the proportion of reads for that allele.
218
+
219
+ Additionally, the values of **Allele type** include:
220
+ - **intact**: Alleles that perfectly match the input FASTA allele.
221
+ - **indels**: Substitutions, deletions, insertions, or inversions within 50 bases.
222
+ - **sv**: Substitutions, deletions, insertions, or inversions beyond 50 bases.
223
+
224
+ <img src="https://user-images.githubusercontent.com/15861316/274521067-4d217251-4c62-4dc9-9c05-7f5377dd3025.png" width="75%">
225
+
226
+ > **Warning**
227
+ > In PCR amplicon sequencing, the % of reads might not match the actual allele proportions due to amplification bias.
228
+ > Especially when large deletions are present, the deletion alleles might be significantly amplified, potentially not reflecting the actual allele proportions.
229
+
230
+ ### 5. read_all.csv and read_summary.csv
231
+
232
+ - read_all.csv: Records which allele each read is classified under.
233
+ - read_summary.csv: Describes the number of reads and presence proportion for each allele.
234
+
235
+
236
+ ## 📄 References
237
+
238
+ For more information, please refer to the following publication:
239
+
240
+ [Kuno A, et al. (2022) DAJIN enables multiplex genotyping to simultaneously validate intended and unintended target genome editing outcomes. *PLoS Biology* 20(1): e3001507.](https://doi.org/10.1371/journal.pbio.3001507)
@@ -0,0 +1,21 @@
1
+ numpy >= 1.20.0
2
+ scipy >= 1.6.0
3
+ pandas >= 1.0.0
4
+ openpyxl >= 3.0.0
5
+ rapidfuzz >=3.0.0
6
+ statsmodels >= 0.13.5
7
+ scikit-learn >= 1.0.0
8
+
9
+ mappy >= 2.24
10
+ pysam >= 0.19.0
11
+
12
+ Flask >= 2.2.0
13
+ waitress >= 2.1.0
14
+ Jinja2 >= 3.1.0
15
+
16
+ plotly >= 5.0.0
17
+ kaleido >= 0.2.0
18
+
19
+ cstag >= 0.4.1
20
+ midsv >= 0.10.1
21
+ wslPath >=0.3.0
@@ -9,10 +9,10 @@ with open("requirements.txt") as requirements_file:
9
9
 
10
10
  setuptools.setup(
11
11
  name="DAJIN2",
12
- version="0.1.32.alpha",
12
+ version="0.3.2",
13
13
  author="Akihiro Kuno",
14
14
  author_email="akuno@md.tsukuba.ac.jp",
15
- description="One-step genotyping tools for Nanopore amplicon sequencing",
15
+ description="One-step genotyping tools for targeted long-read sequencing",
16
16
  long_description=long_description,
17
17
  long_description_content_type="text/markdown",
18
18
  url="https://github.com/akikuno/DAJIN2",
@@ -21,12 +21,16 @@ setuptools.setup(
21
21
  where="src",
22
22
  ),
23
23
  package_dir={"": "src"},
24
- entry_points={"console_scripts": ["DAJIN2=DAJIN2.DAJIN2:main"]},
24
+ entry_points={"console_scripts": ["DAJIN2=DAJIN2.main:execute"]},
25
25
  include_package_data=True,
26
26
  classifiers=[
27
+ "Development Status :: 4 - Beta",
28
+ "Environment :: Console",
27
29
  "Programming Language :: Python :: 3",
28
30
  "License :: OSI Approved :: MIT License",
29
31
  "Operating System :: POSIX",
32
+ "Operating System :: MacOS",
33
+ "Intended Audience :: Science/Research",
34
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
30
35
  ],
31
- python_requires=">=3.7",
32
36
  )
@@ -0,0 +1 @@
1
+ from DAJIN2.core.classification.classifier import classify_alleles
@@ -0,0 +1,49 @@
1
+ from __future__ import annotations
2
+
3
+ import midsv
4
+ from pathlib import Path
5
+ from itertools import groupby
6
+
7
+
8
+ def _calc_match(CSSPLIT: str) -> float:
9
+ match_score = CSSPLIT.count("=")
10
+ match_score -= CSSPLIT.count("+") # insertion
11
+ match_score -= sum(cs.islower() for cs in CSSPLIT) # inversion
12
+ cssplit = CSSPLIT.split(",")
13
+
14
+ return match_score / len(cssplit)
15
+
16
+
17
def _score_allele(TEMPDIR: Path, allele: str, SAMPLE_NAME: str) -> list[dict]:
    """Load the MIDSV records of one allele and annotate each with its score.

    Reads ``<TEMPDIR>/<SAMPLE_NAME>/midsv/<allele>.json`` through
    ``midsv.read_jsonl`` and adds two keys to every record: "SCORE" (the value
    of ``_calc_match`` on the record's "CSSPLIT") and "ALLELE" (the allele name).

    Returns:
        The annotated records, in the order they were read.
    """
    path_midsv = Path(TEMPDIR, SAMPLE_NAME, "midsv", f"{allele}.json")
    annotated = []

    for record in midsv.read_jsonl(path_midsv):
        record["SCORE"] = _calc_match(record["CSSPLIT"])
        record["ALLELE"] = allele
        annotated.append(record)

    return annotated
27
+
28
+
29
+ def _extract_alleles_with_max_score(score_of_each_alleles: list[dict]) -> list[dict]:
30
+ alleles_with_max_score = []
31
+ score_of_each_alleles.sort(key=lambda x: x["QNAME"])
32
+ for _, group in groupby(score_of_each_alleles, key=lambda x: x["QNAME"]):
33
+ max_read = max(group, key=lambda x: x["SCORE"])
34
+ del max_read["SCORE"]
35
+ alleles_with_max_score.append(max_read)
36
+ return alleles_with_max_score
37
+
38
+
39
+ ##########################################################
40
+ # main
41
+ ##########################################################
42
+
43
+
44
def classify_alleles(TEMPDIR: Path, FASTA_ALLELES: dict, SAMPLE_NAME: str) -> list[dict]:
    """Assign every read to the allele on which it scores highest.

    Each key of ``FASTA_ALLELES`` is scored with ``_score_allele``; the pooled
    records are then reduced to the best-scoring record per read by
    ``_extract_alleles_with_max_score``.
    """
    scored_reads = [
        read
        for allele in FASTA_ALLELES
        for read in _score_allele(TEMPDIR, allele, SAMPLE_NAME)
    ]

    return _extract_alleles_with_max_score(scored_reads)
@@ -0,0 +1,3 @@
1
+ from DAJIN2.core.clustering.label_extractor import extract_labels
2
+ from DAJIN2.core.clustering.appender import add_labels, add_readnum, add_percent
3
+ from DAJIN2.core.clustering.label_handler import update_labels
@@ -0,0 +1,44 @@
1
+ from __future__ import annotations
2
+
3
+
4
def add_labels(classif_sample: list[dict], labels: list[int]) -> list[dict]:
    """Add a 'LABEL' key to each sample dict, indicating its cluster.

    Samples and labels are paired positionally; pairing stops at the shorter
    of the two sequences. The returned list is a new list, but it holds the
    same dict objects as ``classif_sample``, so the input dicts are updated too.
    """
    labelled = list(classif_sample)

    for read, cluster_label in zip(labelled, labels):
        read.update(LABEL=cluster_label)

    return labelled
12
+
13
+
14
def count_labels(clust_sample: list[dict]) -> dict[int, int]:
    """Tally how many samples carry each 'LABEL' value.

    Returns:
        A dict mapping each label to its number of occurrences, with keys in
        order of first appearance.
    """
    tally = {}
    for read in clust_sample:
        key = read["LABEL"]
        if key in tally:
            tally[key] += 1
        else:
            tally[key] = 1
    return tally
21
+
22
+
23
def add_readnum(clust_sample: list[dict]) -> list[dict]:
    """Add a 'READNUM' key to each sample dict: the number of samples sharing its label.

    The returned list is a new list, but it holds the same dict objects as
    ``clust_sample``, so the input dicts are updated too.
    """
    reads = list(clust_sample)
    reads_per_label = count_labels(reads)

    for read in reads:
        read.update(READNUM=reads_per_label[read["LABEL"]])

    return reads
32
+
33
+
34
def add_percent(clust_sample: list[dict]) -> list[dict]:
    """Add a 'PERCENT' key to each sample dict: the share of samples with its label.

    The value is the label's count divided by the total number of samples,
    times 100, rounded to three decimals. The returned list is a new list, but
    it holds the same dict objects as ``clust_sample``, so the input dicts are
    updated too.
    """
    reads = list(clust_sample)
    n_reads = len(reads)
    reads_per_label = count_labels(reads)

    for read in reads:
        fraction = reads_per_label[read["LABEL"]] / n_reads
        read["PERCENT"] = round(fraction * 100, 3)

    return reads