DAJIN2 0.1.32a0__zip → 0.3.2__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/MANIFEST.in +3 -0
- DAJIN2-0.3.2/PKG-INFO +258 -0
- DAJIN2-0.3.2/README.md +240 -0
- DAJIN2-0.3.2/requirements.txt +21 -0
- {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/setup.py +8 -4
- DAJIN2-0.3.2/src/DAJIN2/core/classification/__init__.py +1 -0
- DAJIN2-0.3.2/src/DAJIN2/core/classification/classifier.py +49 -0
- DAJIN2-0.3.2/src/DAJIN2/core/clustering/__init__.py +3 -0
- DAJIN2-0.3.2/src/DAJIN2/core/clustering/appender.py +44 -0
- DAJIN2-0.3.2/src/DAJIN2/core/clustering/clustering.py +150 -0
- DAJIN2-0.3.2/src/DAJIN2/core/clustering/kmer_generator.py +46 -0
- DAJIN2-0.3.2/src/DAJIN2/core/clustering/label_extractor.py +112 -0
- DAJIN2-0.3.2/src/DAJIN2/core/clustering/label_handler.py +42 -0
- DAJIN2-0.3.2/src/DAJIN2/core/clustering/label_merger.py +49 -0
- DAJIN2-0.3.2/src/DAJIN2/core/clustering/score_handler.py +142 -0
- DAJIN2-0.3.2/src/DAJIN2/core/consensus/__init__.py +5 -0
- DAJIN2-0.3.2/src/DAJIN2/core/consensus/consensus.py +132 -0
- DAJIN2-0.3.2/src/DAJIN2/core/consensus/name_handler.py +88 -0
- DAJIN2-0.3.2/src/DAJIN2/core/core.py +293 -0
- DAJIN2-0.3.2/src/DAJIN2/core/preprocess/__init__.py +13 -0
- DAJIN2-0.3.2/src/DAJIN2/core/preprocess/cache_checker.py +24 -0
- DAJIN2-0.3.2/src/DAJIN2/core/preprocess/directories.py +34 -0
- DAJIN2-0.3.2/src/DAJIN2/core/preprocess/fastx_parser.py +59 -0
- DAJIN2-0.3.2/src/DAJIN2/core/preprocess/genome_fetcher.py +43 -0
- DAJIN2-0.3.2/src/DAJIN2/core/preprocess/homopolymer_handler.py +139 -0
- DAJIN2-0.3.2/src/DAJIN2/core/preprocess/insertions_to_fasta.py +346 -0
- DAJIN2-0.3.2/src/DAJIN2/core/preprocess/knockin_handler.py +45 -0
- DAJIN2-0.3.2/src/DAJIN2/core/preprocess/mapping.py +123 -0
- DAJIN2-0.3.2/src/DAJIN2/core/preprocess/midsv_caller.py +138 -0
- DAJIN2-0.3.2/src/DAJIN2/core/preprocess/mutation_extractor.py +305 -0
- DAJIN2-0.3.2/src/DAJIN2/core/report/__init__.py +3 -0
- DAJIN2-0.3.2/src/DAJIN2/core/report/report_bam.py +135 -0
- DAJIN2-0.3.2/src/DAJIN2/core/report/report_files.py +35 -0
- DAJIN2-0.3.2/src/DAJIN2/core/report/report_mutation.py +198 -0
- {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2/gui.py +19 -10
- DAJIN2-0.3.2/src/DAJIN2/main.py +223 -0
- DAJIN2-0.3.2/src/DAJIN2/utils/config.py +65 -0
- DAJIN2-0.3.2/src/DAJIN2/utils/cssplits_handler.py +92 -0
- DAJIN2-0.3.2/src/DAJIN2/utils/dna_handler.py +7 -0
- DAJIN2-0.3.2/src/DAJIN2/utils/input_validator.py +169 -0
- DAJIN2-0.3.2/src/DAJIN2/utils/io.py +131 -0
- DAJIN2-0.3.2/src/DAJIN2/utils/multiprocess.py +66 -0
- DAJIN2-0.1.32a0/src/DAJIN2/postprocess/report.py → DAJIN2-0.3.2/src/DAJIN2/utils/report_generator.py +5 -4
- DAJIN2-0.3.2/src/DAJIN2/utils/sam_handler.py +193 -0
- {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2/view.py +5 -3
- DAJIN2-0.3.2/src/DAJIN2.egg-info/PKG-INFO +258 -0
- DAJIN2-0.3.2/src/DAJIN2.egg-info/SOURCES.txt +57 -0
- DAJIN2-0.3.2/src/DAJIN2.egg-info/entry_points.txt +2 -0
- {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2.egg-info/requires.txt +7 -5
- DAJIN2-0.1.32a0/PKG-INFO +0 -71
- DAJIN2-0.1.32a0/README.md +0 -57
- DAJIN2-0.1.32a0/src/DAJIN2/DAJIN2.py +0 -266
- DAJIN2-0.1.32a0/src/DAJIN2/batch.py +0 -92
- DAJIN2-0.1.32a0/src/DAJIN2/core/classification/__init__.py +0 -2
- DAJIN2-0.1.32a0/src/DAJIN2/core/classification/classify.py +0 -83
- DAJIN2-0.1.32a0/src/DAJIN2/core/classification/detect_sv.py +0 -17
- DAJIN2-0.1.32a0/src/DAJIN2/core/clustering/__init__.py +0 -2
- DAJIN2-0.1.32a0/src/DAJIN2/core/clustering/clustering.py +0 -171
- DAJIN2-0.1.32a0/src/DAJIN2/core/clustering/make_score.py +0 -71
- DAJIN2-0.1.32a0/src/DAJIN2/core/clustering/merge_clusters.py +0 -40
- DAJIN2-0.1.32a0/src/DAJIN2/core/clustering/return_labels.py +0 -106
- DAJIN2-0.1.32a0/src/DAJIN2/core/consensus/__init__.py +0 -5
- DAJIN2-0.1.32a0/src/DAJIN2/core/consensus/consensus.py +0 -151
- DAJIN2-0.1.32a0/src/DAJIN2/core/core_execute.py +0 -202
- DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/__init__.py +0 -12
- DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/call_midsv.py +0 -131
- DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/correct_knockin.py +0 -161
- DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/correct_revititive_deletions.py +0 -138
- DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/correct_sequence_error.py +0 -240
- DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/extract_knockin_loci.py +0 -31
- DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/extract_mutation_loci.py +0 -160
- DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/format_inputs.py +0 -119
- DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/mappy_align.py +0 -116
- DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/replace_NtoD.py +0 -48
- DAJIN2-0.1.32a0/src/DAJIN2/core/preprocess/validate_inputs.py +0 -150
- DAJIN2-0.1.32a0/src/DAJIN2/core/report/__init__.py +0 -1
- DAJIN2-0.1.32a0/src/DAJIN2/core/report/report_bam.py +0 -319
- DAJIN2-0.1.32a0/src/DAJIN2/core/report/report_files.py +0 -38
- DAJIN2-0.1.32a0/src/DAJIN2/postprocess/__init__.py +0 -0
- DAJIN2-0.1.32a0/src/DAJIN2/single.py +0 -27
- DAJIN2-0.1.32a0/src/DAJIN2.egg-info/PKG-INFO +0 -71
- DAJIN2-0.1.32a0/src/DAJIN2.egg-info/SOURCES.txt +0 -48
- DAJIN2-0.1.32a0/src/DAJIN2.egg-info/entry_points.txt +0 -2
- {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/LICENSE +0 -0
- {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/setup.cfg +0 -0
- {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2/__init__.py +0 -0
- {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2/core/__init__.py +0 -0
- /DAJIN2-0.1.32a0/src/DAJIN2/core/consensus/subset.py → /DAJIN2-0.3.2/src/DAJIN2/core/consensus/clust_subsetter.py +0 -0
- {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2/static/css/style.css +0 -0
- {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2/template_igvjs.html +0 -0
- {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2/templates/index.html +0 -0
- {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2.egg-info/dependency_links.txt +0 -0
- {DAJIN2-0.1.32a0 → DAJIN2-0.3.2}/src/DAJIN2.egg-info/top_level.txt +0 -0
DAJIN2-0.3.2/PKG-INFO
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: DAJIN2
|
|
3
|
+
Version: 0.3.2
|
|
4
|
+
Summary: One-step genotyping tools for targeted long-read sequencing
|
|
5
|
+
Home-page: https://github.com/akikuno/DAJIN2
|
|
6
|
+
Author: Akihiro Kuno
|
|
7
|
+
Author-email: akuno@md.tsukuba.ac.jp
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Environment :: Console
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: POSIX
|
|
13
|
+
Classifier: Operating System :: MacOS
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
|
|
19
|
+
[](https://choosealicense.com/licenses/mit/)
|
|
20
|
+
[](https://github.com/akikuno/dajin2/actions)
|
|
21
|
+
[](https://pypi.org/project/DAJIN2/)
|
|
22
|
+
[](https://pypi.org/project/DAJIN2/)
|
|
23
|
+
[](https://anaconda.org/bioconda/dajin2)
|
|
24
|
+
[](https://zenodo.org/badge/latestdoi/387721337)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
<p align="center">
|
|
28
|
+
<img src="https://user-images.githubusercontent.com/15861316/261833016-7f356960-88cf-4574-87e2-36162b174340.png" width="90%">
|
|
29
|
+
</p>
|
|
30
|
+
|
|
31
|
+
[日本語はこちら](https://github.com/akikuno/DAJIN2/blob/main/docs/README_JP.md)
|
|
32
|
+
|
|
33
|
+
DAJIN2 is a genotyping software designed for organisms that have undergone genome editing, utilizing nanopore sequencing technology.
|
|
34
|
+
|
|
35
|
+
The name DAJIN is inspired by the term 一網**打尽** (Ichimou **DAJIN** or Yīwǎng **Dǎjìn**), which signifies capturing everything in a single net.
|
|
36
|
+
|
|
37
|
+
## 🙏 Feedbacks
|
|
38
|
+
|
|
39
|
+
DAJIN2 is still in the development phase.
|
|
40
|
+
Basic tests covering point mutations, deletions, and insertion designs have been conducted.
|
|
41
|
+
If you encounter any bugs or issues, please report them via [Issues](https://github.com/akikuno/DAJIN2/issues).
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
## 🛠 Installation
|
|
46
|
+
|
|
47
|
+
### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
conda install -c bioconda DAJIN2
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### From [PyPI](https://pypi.org/project/DAJIN2/)
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install DAJIN2
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
> **Warning**
|
|
60
|
+
> If you encounter the error **Failed to build mappy** when installing DAJIN2 from pip, please install `gcc` and `zlib`.
|
|
61
|
+
> `sudo apt install gcc zlib1g zlib1g-dev` (Ubuntu)
|
|
62
|
+
> `brew install gcc zlib` (macOS)
|
|
63
|
+
|
|
64
|
+
<!-- ```bash
|
|
65
|
+
# Ubuntu
|
|
66
|
+
sudo apt install gcc zlib1g zlib1g-dev
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# macOS
|
|
71
|
+
brew install gcc zlib
|
|
72
|
+
``` -->
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
## 💡 Usage
|
|
76
|
+
|
|
77
|
+
### Single Sample Analysis
|
|
78
|
+
|
|
79
|
+
DAJIN2 allows for the analysis of single samples (one sample vs one control).
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
DAJIN2 <-s|--sample> <-c|--control> <-a|--allele> <-n|--name> [-g|--genome] [-t|--threads] [-h|--help] [-v|--version]
|
|
83
|
+
|
|
84
|
+
options:
|
|
85
|
+
-s, --sample Path to a sample FASTQ file
|
|
86
|
+
-c, --control Path to a control FASTQ file
|
|
87
|
+
-a, --allele Path to a FASTA file
|
|
88
|
+
-n, --name Output directory name
|
|
89
|
+
-g, --genome (Optional) Reference genome ID (e.g hg38, mm39) [default: '']
|
|
90
|
+
-t, --threads (Optional) Number of threads [default: 1]
|
|
91
|
+
-h, --help show this help message and exit
|
|
92
|
+
-v, --version show the version number and exit
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
#### Example
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
# Download the example dataset
|
|
99
|
+
wget https://github.com/akikuno/DAJIN2/raw/main/examples/example-single.tar.gz
|
|
100
|
+
tar -xf example-single.tar.gz
|
|
101
|
+
|
|
102
|
+
# Run DAJIN2
|
|
103
|
+
DAJIN2 \
|
|
104
|
+
--name stx2-deletion \
|
|
105
|
+
--sample example-single/sample.fq.gz \
|
|
106
|
+
--control example-single/control.fq.gz \
|
|
107
|
+
--allele example-single/design.fa \
|
|
108
|
+
--genome mm39 \
|
|
109
|
+
--threads 10
|
|
110
|
+
|
|
111
|
+
# 2023-06-04 11:30:03: example-single/control.fq.gz is now processing...
|
|
112
|
+
# 2023-06-04 11:30:06: Preprocess example-single/control.fq.gz...
|
|
113
|
+
# 2023-06-04 11:30:06: Mapping example-single/control.fq.gz...
|
|
114
|
+
# 2023-06-04 11:30:21: Call MIDSV example-single/control.fq.gz...
|
|
115
|
+
# 2023-06-04 11:30:31: 🍵 example-single/control.fq.gz is finished!
|
|
116
|
+
# 2023-06-04 11:30:31: example-single/sample.fq.gz is now processing...
|
|
117
|
+
# 2023-06-04 11:30:35: Preprocess example-single/sample.fq.gz...
|
|
118
|
+
# 2023-06-04 11:34:13: Classify example-single/sample.fq.gz...
|
|
119
|
+
# 2023-06-04 11:34:18: Clustering example-single/sample.fq.gz...
|
|
120
|
+
# 2023-06-04 11:35:01: Consensus calling example-single/sample.fq.gz...
|
|
121
|
+
# 2023-06-04 11:35:08: 🍵 example-single/sample.fq.gz is finished!
|
|
122
|
+
# 🎉 Finished! Open DAJIN_Results/stx2-deletion to see the report.
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Batch Processing
|
|
126
|
+
|
|
127
|
+
By using the `batch` subcommand, you can process multiple FASTQ files simultaneously.
|
|
128
|
+
For this purpose, a CSV or Excel file consolidating the sample information is required.
|
|
129
|
+
For a specific example, please refer to [this link](https://github.com/akikuno/DAJIN2/blob/main/examples/example-batch/batch.csv).
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
DAJIN2 batch <-f|--file> [-t|--threads] [-h]
|
|
134
|
+
|
|
135
|
+
options:
|
|
136
|
+
-f, --file Path to a CSV or Excel file
|
|
137
|
+
-t, --threads (Optional) Number of threads [default: 1]
|
|
138
|
+
-h, --help Show this help message and exit
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
#### Example
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
# Download the example dataset
|
|
145
|
+
wget https://github.com/akikuno/DAJIN2/raw/main/examples/example-batch.tar.gz
|
|
146
|
+
tar -xf example-batch.tar.gz
|
|
147
|
+
|
|
148
|
+
# Run DAJIN2
|
|
149
|
+
DAJIN2 batch --file example-batch/batch.csv --threads 3
|
|
150
|
+
|
|
151
|
+
# 2023-07-31 17:01:10: example-batch/tyr_control.fq.gz is now processing...
|
|
152
|
+
# 2023-07-31 17:01:16: Preprocess example-batch/tyr_control.fq.gz...
|
|
153
|
+
# 2023-07-31 17:01:48: Output BAM files of example-batch/tyr_control.fq.gz...
|
|
154
|
+
# 2023-07-31 17:01:52: 🍵 example-batch/tyr_control.fq.gz is finished!
|
|
155
|
+
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_50%.fq.gz is now processing...
|
|
156
|
+
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_10%.fq.gz is now processing...
|
|
157
|
+
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_01%.fq.gz is now processing...
|
|
158
|
+
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_01%.fq.gz...
|
|
159
|
+
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_50%.fq.gz...
|
|
160
|
+
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_10%.fq.gz...
|
|
161
|
+
# 2023-07-31 17:02:17: Classify example-batch/tyr_c230gt_50%.fq.gz...
|
|
162
|
+
# 2023-07-31 17:02:19: Clustering example-batch/tyr_c230gt_50%.fq.gz...
|
|
163
|
+
# 2023-07-31 17:02:34: Classify example-batch/tyr_c230gt_01%.fq.gz...
|
|
164
|
+
# 2023-07-31 17:02:35: Classify example-batch/tyr_c230gt_10%.fq.gz...
|
|
165
|
+
# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_01%.fq.gz...
|
|
166
|
+
# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_10%.fq.gz...
|
|
167
|
+
# 2023-07-31 17:02:53: Consensus calling of example-batch/tyr_c230gt_50%.fq.gz...
|
|
168
|
+
# 2023-07-31 17:02:59: Output reports of example-batch/tyr_c230gt_50%.fq.gz...
|
|
169
|
+
# 2023-07-31 17:03:04: 🍵 example-batch/tyr_c230gt_50%.fq.gz is finished!
|
|
170
|
+
# 2023-07-31 17:03:39: Consensus calling of example-batch/tyr_c230gt_01%.fq.gz...
|
|
171
|
+
# 2023-07-31 17:03:51: Output reports of example-batch/tyr_c230gt_01%.fq.gz...
|
|
172
|
+
# 2023-07-31 17:04:03: 🍵 example-batch/tyr_c230gt_01%.fq.gz is finished!
|
|
173
|
+
# 2023-07-31 17:04:08: Consensus calling of example-batch/tyr_c230gt_10%.fq.gz...
|
|
174
|
+
# 2023-07-31 17:04:16: Output reports of example-batch/tyr_c230gt_10%.fq.gz...
|
|
175
|
+
# 2023-07-31 17:04:24: 🍵 example-batch/tyr_c230gt_10%.fq.gz is finished!
|
|
176
|
+
# 🎉 Finished! Open DAJIN_Results/tyr-substitution to see the report.
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
## 📈 Report Contents
|
|
180
|
+
|
|
181
|
+
Upon completion of DAJIN2 processing, a directory named **DAJIN_Results** is generated.
|
|
182
|
+
Inside the **DAJIN_Results** directory, the following files can be found:
|
|
183
|
+
|
|
184
|
+
```
|
|
185
|
+
DAJIN_Results/tyr-substitution
|
|
186
|
+
├── BAM
|
|
187
|
+
│ ├── tyr_c230gt_01%
|
|
188
|
+
│ ├── tyr_c230gt_10%
|
|
189
|
+
│ ├── tyr_c230gt_50%
|
|
190
|
+
│ └── tyr_control
|
|
191
|
+
├── FASTA
|
|
192
|
+
│ ├── tyr_c230gt_01%
|
|
193
|
+
│ ├── tyr_c230gt_10%
|
|
194
|
+
│ └── tyr_c230gt_50%
|
|
195
|
+
├── HTML
|
|
196
|
+
│ ├── tyr_c230gt_01%
|
|
197
|
+
│ ├── tyr_c230gt_10%
|
|
198
|
+
│ └── tyr_c230gt_50%
|
|
199
|
+
├── MUTATION_INFO
|
|
200
|
+
│ ├── tyr_c230gt_01%.csv
|
|
201
|
+
│ ├── tyr_c230gt_10%.csv
|
|
202
|
+
│ └── tyr_c230gt_50%.csv
|
|
203
|
+
├── read_all.csv
|
|
204
|
+
├── read_plot.html
|
|
205
|
+
├── read_plot.pdf
|
|
206
|
+
└── read_summary.csv
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### 1. BAM
|
|
210
|
+
|
|
211
|
+
The BAM directory contains the BAM files of reads classified per allele.
|
|
212
|
+
|
|
213
|
+
> **Note**
|
|
214
|
+
> Specifying a reference genome using the `genome` option will align the reads to that genome.
|
|
215
|
+
> Without the `genome` option, the reads will be aligned to the control allele within the input FASTA file.
|
|
216
|
+
|
|
217
|
+
### 2. FASTA and HTML
|
|
218
|
+
|
|
219
|
+
The FASTA directory stores the FASTA files of each allele.
|
|
220
|
+
The HTML directory contains HTML files for each allele, where mutation sites are color-highlighted.
|
|
221
|
+
For example, a Tyr point mutation is highlighted in **green**.
|
|
222
|
+
|
|
223
|
+
<img src="https://user-images.githubusercontent.com/15861316/274518501-2ca3f442-1b86-4635-be3d-fd37575c4ca2.png" width="75%" />
|
|
224
|
+
|
|
225
|
+
### 3. MUTATION_INFO
|
|
226
|
+
|
|
227
|
+
The MUTATION_INFO directory saves tables depicting mutation sites for each allele.
|
|
228
|
+
An example of a Tyr point mutation is described by its position on the chromosome and the type of mutation.
|
|
229
|
+
|
|
230
|
+
<img src="https://user-images.githubusercontent.com/15861316/274519342-a613490d-5dbb-4a27-a2cf-bca0686b30f0.png" width="75%">
|
|
231
|
+
|
|
232
|
+
### 4. read_plot.html and read_plot.pdf
|
|
233
|
+
|
|
234
|
+
Both read_plot.html and read_plot.pdf illustrate the proportions of each allele.
|
|
235
|
+
The chart's **Allele type** indicates the type of allele, and **% of reads** shows the proportion of reads for that allele.
|
|
236
|
+
|
|
237
|
+
Additionally, the types of **Allele type** include:
|
|
238
|
+
- **intact**: Alleles that perfectly match the input FASTA allele.
|
|
239
|
+
- **indels**: Substitutions, deletions, insertions, or inversions within 50 bases.
|
|
240
|
+
- **sv**: Substitutions, deletions, insertions, or inversions beyond 50 bases.
|
|
241
|
+
|
|
242
|
+
<img src="https://user-images.githubusercontent.com/15861316/274521067-4d217251-4c62-4dc9-9c05-7f5377dd3025.png" width="75%">
|
|
243
|
+
|
|
244
|
+
> **Warning**
|
|
245
|
+
> In PCR amplicon sequencing, the % of reads might not match the actual allele proportions due to amplification bias.
|
|
246
|
+
> Especially when large deletions are present, the deletion alleles might be significantly amplified, potentially not reflecting the actual allele proportions.
|
|
247
|
+
|
|
248
|
+
### 5. read_all.csv and read_summary.csv
|
|
249
|
+
|
|
250
|
+
- read_all.csv: Records which allele each read is classified under.
|
|
251
|
+
- read_summary.csv: Describes the number of reads and presence proportion for each allele.
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
## 📄 References
|
|
255
|
+
|
|
256
|
+
For more information, please refer to the following publication:
|
|
257
|
+
|
|
258
|
+
[Kuno A, et al. (2022) DAJIN enables multiplex genotyping to simultaneously validate intended and unintended target genome editing outcomes. *PLoS Biology* 20(1): e3001507.](https://doi.org/10.1371/journal.pbio.3001507)
|
DAJIN2-0.3.2/README.md
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
[](https://choosealicense.com/licenses/mit/)
|
|
2
|
+
[](https://github.com/akikuno/dajin2/actions)
|
|
3
|
+
[](https://pypi.org/project/DAJIN2/)
|
|
4
|
+
[](https://pypi.org/project/DAJIN2/)
|
|
5
|
+
[](https://anaconda.org/bioconda/dajin2)
|
|
6
|
+
[](https://zenodo.org/badge/latestdoi/387721337)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
<p align="center">
|
|
10
|
+
<img src="https://user-images.githubusercontent.com/15861316/261833016-7f356960-88cf-4574-87e2-36162b174340.png" width="90%">
|
|
11
|
+
</p>
|
|
12
|
+
|
|
13
|
+
[日本語はこちら](https://github.com/akikuno/DAJIN2/blob/main/docs/README_JP.md)
|
|
14
|
+
|
|
15
|
+
DAJIN2 is a genotyping software designed for organisms that have undergone genome editing, utilizing nanopore sequencing technology.
|
|
16
|
+
|
|
17
|
+
The name DAJIN is inspired by the term 一網**打尽** (Ichimou **DAJIN** or Yīwǎng **Dǎjìn**), which signifies capturing everything in a single net.
|
|
18
|
+
|
|
19
|
+
## 🙏 Feedbacks
|
|
20
|
+
|
|
21
|
+
DAJIN2 is still in the development phase.
|
|
22
|
+
Basic tests covering point mutations, deletions, and insertion designs have been conducted.
|
|
23
|
+
If you encounter any bugs or issues, please report them via [Issues](https://github.com/akikuno/DAJIN2/issues).
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
## 🛠 Installation
|
|
28
|
+
|
|
29
|
+
### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
conda install -c bioconda DAJIN2
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### From [PyPI](https://pypi.org/project/DAJIN2/)
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install DAJIN2
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
> **Warning**
|
|
42
|
+
> If you encounter the error **Failed to build mappy** when installing DAJIN2 from pip, please install `gcc` and `zlib`.
|
|
43
|
+
> `sudo apt install gcc zlib1g zlib1g-dev` (Ubuntu)
|
|
44
|
+
> `brew install gcc zlib` (macOS)
|
|
45
|
+
|
|
46
|
+
<!-- ```bash
|
|
47
|
+
# Ubuntu
|
|
48
|
+
sudo apt install gcc zlib1g zlib1g-dev
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
# macOS
|
|
53
|
+
brew install gcc zlib
|
|
54
|
+
``` -->
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
## 💡 Usage
|
|
58
|
+
|
|
59
|
+
### Single Sample Analysis
|
|
60
|
+
|
|
61
|
+
DAJIN2 allows for the analysis of single samples (one sample vs one control).
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
DAJIN2 <-s|--sample> <-c|--control> <-a|--allele> <-n|--name> [-g|--genome] [-t|--threads] [-h|--help] [-v|--version]
|
|
65
|
+
|
|
66
|
+
options:
|
|
67
|
+
-s, --sample Path to a sample FASTQ file
|
|
68
|
+
-c, --control Path to a control FASTQ file
|
|
69
|
+
-a, --allele Path to a FASTA file
|
|
70
|
+
-n, --name Output directory name
|
|
71
|
+
-g, --genome (Optional) Reference genome ID (e.g hg38, mm39) [default: '']
|
|
72
|
+
-t, --threads (Optional) Number of threads [default: 1]
|
|
73
|
+
-h, --help show this help message and exit
|
|
74
|
+
-v, --version show the version number and exit
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
#### Example
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# Download the example dataset
|
|
81
|
+
wget https://github.com/akikuno/DAJIN2/raw/main/examples/example-single.tar.gz
|
|
82
|
+
tar -xf example-single.tar.gz
|
|
83
|
+
|
|
84
|
+
# Run DAJIN2
|
|
85
|
+
DAJIN2 \
|
|
86
|
+
--name stx2-deletion \
|
|
87
|
+
--sample example-single/sample.fq.gz \
|
|
88
|
+
--control example-single/control.fq.gz \
|
|
89
|
+
--allele example-single/design.fa \
|
|
90
|
+
--genome mm39 \
|
|
91
|
+
--threads 10
|
|
92
|
+
|
|
93
|
+
# 2023-06-04 11:30:03: example-single/control.fq.gz is now processing...
|
|
94
|
+
# 2023-06-04 11:30:06: Preprocess example-single/control.fq.gz...
|
|
95
|
+
# 2023-06-04 11:30:06: Mapping example-single/control.fq.gz...
|
|
96
|
+
# 2023-06-04 11:30:21: Call MIDSV example-single/control.fq.gz...
|
|
97
|
+
# 2023-06-04 11:30:31: 🍵 example-single/control.fq.gz is finished!
|
|
98
|
+
# 2023-06-04 11:30:31: example-single/sample.fq.gz is now processing...
|
|
99
|
+
# 2023-06-04 11:30:35: Preprocess example-single/sample.fq.gz...
|
|
100
|
+
# 2023-06-04 11:34:13: Classify example-single/sample.fq.gz...
|
|
101
|
+
# 2023-06-04 11:34:18: Clustering example-single/sample.fq.gz...
|
|
102
|
+
# 2023-06-04 11:35:01: Consensus calling example-single/sample.fq.gz...
|
|
103
|
+
# 2023-06-04 11:35:08: 🍵 example-single/sample.fq.gz is finished!
|
|
104
|
+
# 🎉 Finished! Open DAJIN_Results/stx2-deletion to see the report.
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### Batch Processing
|
|
108
|
+
|
|
109
|
+
By using the `batch` subcommand, you can process multiple FASTQ files simultaneously.
|
|
110
|
+
For this purpose, a CSV or Excel file consolidating the sample information is required.
|
|
111
|
+
For a specific example, please refer to [this link](https://github.com/akikuno/DAJIN2/blob/main/examples/example-batch/batch.csv).
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
DAJIN2 batch <-f|--file> [-t|--threads] [-h]
|
|
116
|
+
|
|
117
|
+
options:
|
|
118
|
+
-f, --file Path to a CSV or Excel file
|
|
119
|
+
-t, --threads (Optional) Number of threads [default: 1]
|
|
120
|
+
-h, --help Show this help message and exit
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
#### Example
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
# Download the example dataset
|
|
127
|
+
wget https://github.com/akikuno/DAJIN2/raw/main/examples/example-batch.tar.gz
|
|
128
|
+
tar -xf example-batch.tar.gz
|
|
129
|
+
|
|
130
|
+
# Run DAJIN2
|
|
131
|
+
DAJIN2 batch --file example-batch/batch.csv --threads 3
|
|
132
|
+
|
|
133
|
+
# 2023-07-31 17:01:10: example-batch/tyr_control.fq.gz is now processing...
|
|
134
|
+
# 2023-07-31 17:01:16: Preprocess example-batch/tyr_control.fq.gz...
|
|
135
|
+
# 2023-07-31 17:01:48: Output BAM files of example-batch/tyr_control.fq.gz...
|
|
136
|
+
# 2023-07-31 17:01:52: 🍵 example-batch/tyr_control.fq.gz is finished!
|
|
137
|
+
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_50%.fq.gz is now processing...
|
|
138
|
+
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_10%.fq.gz is now processing...
|
|
139
|
+
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_01%.fq.gz is now processing...
|
|
140
|
+
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_01%.fq.gz...
|
|
141
|
+
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_50%.fq.gz...
|
|
142
|
+
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_10%.fq.gz...
|
|
143
|
+
# 2023-07-31 17:02:17: Classify example-batch/tyr_c230gt_50%.fq.gz...
|
|
144
|
+
# 2023-07-31 17:02:19: Clustering example-batch/tyr_c230gt_50%.fq.gz...
|
|
145
|
+
# 2023-07-31 17:02:34: Classify example-batch/tyr_c230gt_01%.fq.gz...
|
|
146
|
+
# 2023-07-31 17:02:35: Classify example-batch/tyr_c230gt_10%.fq.gz...
|
|
147
|
+
# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_01%.fq.gz...
|
|
148
|
+
# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_10%.fq.gz...
|
|
149
|
+
# 2023-07-31 17:02:53: Consensus calling of example-batch/tyr_c230gt_50%.fq.gz...
|
|
150
|
+
# 2023-07-31 17:02:59: Output reports of example-batch/tyr_c230gt_50%.fq.gz...
|
|
151
|
+
# 2023-07-31 17:03:04: 🍵 example-batch/tyr_c230gt_50%.fq.gz is finished!
|
|
152
|
+
# 2023-07-31 17:03:39: Consensus calling of example-batch/tyr_c230gt_01%.fq.gz...
|
|
153
|
+
# 2023-07-31 17:03:51: Output reports of example-batch/tyr_c230gt_01%.fq.gz...
|
|
154
|
+
# 2023-07-31 17:04:03: 🍵 example-batch/tyr_c230gt_01%.fq.gz is finished!
|
|
155
|
+
# 2023-07-31 17:04:08: Consensus calling of example-batch/tyr_c230gt_10%.fq.gz...
|
|
156
|
+
# 2023-07-31 17:04:16: Output reports of example-batch/tyr_c230gt_10%.fq.gz...
|
|
157
|
+
# 2023-07-31 17:04:24: 🍵 example-batch/tyr_c230gt_10%.fq.gz is finished!
|
|
158
|
+
# 🎉 Finished! Open DAJIN_Results/tyr-substitution to see the report.
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## 📈 Report Contents
|
|
162
|
+
|
|
163
|
+
Upon completion of DAJIN2 processing, a directory named **DAJIN_Results** is generated.
|
|
164
|
+
Inside the **DAJIN_Results** directory, the following files can be found:
|
|
165
|
+
|
|
166
|
+
```
|
|
167
|
+
DAJIN_Results/tyr-substitution
|
|
168
|
+
├── BAM
|
|
169
|
+
│ ├── tyr_c230gt_01%
|
|
170
|
+
│ ├── tyr_c230gt_10%
|
|
171
|
+
│ ├── tyr_c230gt_50%
|
|
172
|
+
│ └── tyr_control
|
|
173
|
+
├── FASTA
|
|
174
|
+
│ ├── tyr_c230gt_01%
|
|
175
|
+
│ ├── tyr_c230gt_10%
|
|
176
|
+
│ └── tyr_c230gt_50%
|
|
177
|
+
├── HTML
|
|
178
|
+
│ ├── tyr_c230gt_01%
|
|
179
|
+
│ ├── tyr_c230gt_10%
|
|
180
|
+
│ └── tyr_c230gt_50%
|
|
181
|
+
├── MUTATION_INFO
|
|
182
|
+
│ ├── tyr_c230gt_01%.csv
|
|
183
|
+
│ ├── tyr_c230gt_10%.csv
|
|
184
|
+
│ └── tyr_c230gt_50%.csv
|
|
185
|
+
├── read_all.csv
|
|
186
|
+
├── read_plot.html
|
|
187
|
+
├── read_plot.pdf
|
|
188
|
+
└── read_summary.csv
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### 1. BAM
|
|
192
|
+
|
|
193
|
+
The BAM directory contains the BAM files of reads classified per allele.
|
|
194
|
+
|
|
195
|
+
> **Note**
|
|
196
|
+
> Specifying a reference genome using the `genome` option will align the reads to that genome.
|
|
197
|
+
> Without the `genome` option, the reads will be aligned to the control allele within the input FASTA file.
|
|
198
|
+
|
|
199
|
+
### 2. FASTA and HTML
|
|
200
|
+
|
|
201
|
+
The FASTA directory stores the FASTA files of each allele.
|
|
202
|
+
The HTML directory contains HTML files for each allele, where mutation sites are color-highlighted.
|
|
203
|
+
For example, a Tyr point mutation is highlighted in **green**.
|
|
204
|
+
|
|
205
|
+
<img src="https://user-images.githubusercontent.com/15861316/274518501-2ca3f442-1b86-4635-be3d-fd37575c4ca2.png" width="75%" />
|
|
206
|
+
|
|
207
|
+
### 3. MUTATION_INFO
|
|
208
|
+
|
|
209
|
+
The MUTATION_INFO directory saves tables depicting mutation sites for each allele.
|
|
210
|
+
An example of a Tyr point mutation is described by its position on the chromosome and the type of mutation.
|
|
211
|
+
|
|
212
|
+
<img src="https://user-images.githubusercontent.com/15861316/274519342-a613490d-5dbb-4a27-a2cf-bca0686b30f0.png" width="75%">
|
|
213
|
+
|
|
214
|
+
### 4. read_plot.html and read_plot.pdf
|
|
215
|
+
|
|
216
|
+
Both read_plot.html and read_plot.pdf illustrate the proportions of each allele.
|
|
217
|
+
The chart's **Allele type** indicates the type of allele, and **% of reads** shows the proportion of reads for that allele.
|
|
218
|
+
|
|
219
|
+
Additionally, the types of **Allele type** include:
|
|
220
|
+
- **intact**: Alleles that perfectly match the input FASTA allele.
|
|
221
|
+
- **indels**: Substitutions, deletions, insertions, or inversions within 50 bases.
|
|
222
|
+
- **sv**: Substitutions, deletions, insertions, or inversions beyond 50 bases.
|
|
223
|
+
|
|
224
|
+
<img src="https://user-images.githubusercontent.com/15861316/274521067-4d217251-4c62-4dc9-9c05-7f5377dd3025.png" width="75%">
|
|
225
|
+
|
|
226
|
+
> **Warning**
|
|
227
|
+
> In PCR amplicon sequencing, the % of reads might not match the actual allele proportions due to amplification bias.
|
|
228
|
+
> Especially when large deletions are present, the deletion alleles might be significantly amplified, potentially not reflecting the actual allele proportions.
|
|
229
|
+
|
|
230
|
+
### 5. read_all.csv and read_summary.csv
|
|
231
|
+
|
|
232
|
+
- read_all.csv: Records which allele each read is classified under.
|
|
233
|
+
- read_summary.csv: Describes the number of reads and presence proportion for each allele.
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
## 📄 References
|
|
237
|
+
|
|
238
|
+
For more information, please refer to the following publication:
|
|
239
|
+
|
|
240
|
+
[Kuno A, et al. (2022) DAJIN enables multiplex genotyping to simultaneously validate intended and unintended target genome editing outcomes. *PLoS Biology* 20(1): e3001507.](https://doi.org/10.1371/journal.pbio.3001507)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
numpy >= 1.20.0
|
|
2
|
+
scipy >= 1.6.0
|
|
3
|
+
pandas >= 1.0.0
|
|
4
|
+
openpyxl >= 3.0.0
|
|
5
|
+
rapidfuzz >=3.0.0
|
|
6
|
+
statsmodels >= 0.13.5
|
|
7
|
+
scikit-learn >= 1.0.0
|
|
8
|
+
|
|
9
|
+
mappy >= 2.24
|
|
10
|
+
pysam >= 0.19.0
|
|
11
|
+
|
|
12
|
+
Flask >= 2.2.0
|
|
13
|
+
waitress >= 2.1.0
|
|
14
|
+
Jinja2 >= 3.1.0
|
|
15
|
+
|
|
16
|
+
plotly >= 5.0.0
|
|
17
|
+
kaleido >= 0.2.0
|
|
18
|
+
|
|
19
|
+
cstag >= 0.4.1
|
|
20
|
+
midsv >= 0.10.1
|
|
21
|
+
wslPath >=0.3.0
|
|
@@ -9,10 +9,10 @@ with open("requirements.txt") as requirements_file:
|
|
|
9
9
|
|
|
10
10
|
setuptools.setup(
|
|
11
11
|
name="DAJIN2",
|
|
12
|
-
version="0.
|
|
12
|
+
version="0.3.2",
|
|
13
13
|
author="Akihiro Kuno",
|
|
14
14
|
author_email="akuno@md.tsukuba.ac.jp",
|
|
15
|
-
description="One-step genotyping tools for
|
|
15
|
+
description="One-step genotyping tools for targeted long-read sequencing",
|
|
16
16
|
long_description=long_description,
|
|
17
17
|
long_description_content_type="text/markdown",
|
|
18
18
|
url="https://github.com/akikuno/DAJIN2",
|
|
@@ -21,12 +21,16 @@ setuptools.setup(
|
|
|
21
21
|
where="src",
|
|
22
22
|
),
|
|
23
23
|
package_dir={"": "src"},
|
|
24
|
-
entry_points={"console_scripts": ["DAJIN2=DAJIN2.
|
|
24
|
+
entry_points={"console_scripts": ["DAJIN2=DAJIN2.main:execute"]},
|
|
25
25
|
include_package_data=True,
|
|
26
26
|
classifiers=[
|
|
27
|
+
"Development Status :: 4 - Beta",
|
|
28
|
+
"Environment :: Console",
|
|
27
29
|
"Programming Language :: Python :: 3",
|
|
28
30
|
"License :: OSI Approved :: MIT License",
|
|
29
31
|
"Operating System :: POSIX",
|
|
32
|
+
"Operating System :: MacOS",
|
|
33
|
+
"Intended Audience :: Science/Research",
|
|
34
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
30
35
|
],
|
|
31
|
-
python_requires=">=3.7",
|
|
32
36
|
)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from DAJIN2.core.classification.classifier import classify_alleles
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import midsv
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from itertools import groupby
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _calc_match(CSSPLIT: str) -> float:
|
|
9
|
+
match_score = CSSPLIT.count("=")
|
|
10
|
+
match_score -= CSSPLIT.count("+") # insertion
|
|
11
|
+
match_score -= sum(cs.islower() for cs in CSSPLIT) # inversion
|
|
12
|
+
cssplit = CSSPLIT.split(",")
|
|
13
|
+
|
|
14
|
+
return match_score / len(cssplit)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _score_allele(TEMPDIR: Path, allele: str, SAMPLE_NAME: str) -> list[dict]:
    """Score every record of one allele's midsv file.

    Reads ``<TEMPDIR>/<SAMPLE_NAME>/midsv/<allele>.json`` with
    ``midsv.read_jsonl`` and returns the records as a list, each one extended
    in place with a "SCORE" key (computed from its "CSSPLIT" value) and an
    "ALLELE" key holding ``allele``.
    """
    path_midsv = Path(TEMPDIR, SAMPLE_NAME, "midsv", f"{allele}.json")
    scored_alleles = []

    for record in midsv.read_jsonl(path_midsv):
        record["SCORE"] = _calc_match(record["CSSPLIT"])
        record["ALLELE"] = allele
        scored_alleles.append(record)

    return scored_alleles
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _extract_alleles_with_max_score(score_of_each_alleles: list[dict]) -> list[dict]:
|
|
30
|
+
alleles_with_max_score = []
|
|
31
|
+
score_of_each_alleles.sort(key=lambda x: x["QNAME"])
|
|
32
|
+
for _, group in groupby(score_of_each_alleles, key=lambda x: x["QNAME"]):
|
|
33
|
+
max_read = max(group, key=lambda x: x["SCORE"])
|
|
34
|
+
del max_read["SCORE"]
|
|
35
|
+
alleles_with_max_score.append(max_read)
|
|
36
|
+
return alleles_with_max_score
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
##########################################################
|
|
40
|
+
# main
|
|
41
|
+
##########################################################
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def classify_alleles(TEMPDIR: Path, FASTA_ALLELES: dict, SAMPLE_NAME: str) -> list[dict]:
    """Assign each read to the allele on which it scores highest.

    Scores the sample's records for every allele in ``FASTA_ALLELES`` and
    returns one record per QNAME: the one with the maximum score.
    """
    scored_reads = [
        read
        for allele in FASTA_ALLELES
        for read in _score_allele(TEMPDIR, allele, SAMPLE_NAME)
    ]

    return _extract_alleles_with_max_score(scored_reads)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def add_labels(classif_sample: list[dict], labels: list[int]) -> list[dict]:
    """Attach a 'LABEL' key (the cluster label) to each sample dict.

    Samples and labels are paired positionally; pairing stops at the shorter
    of the two. The returned list is a shallow copy, so the dicts it holds are
    the same objects as in ``classif_sample`` and are updated in place.
    """
    labelled_reads = classif_sample.copy()

    for read, label in zip(labelled_reads, labels):
        read.update(LABEL=label)

    return labelled_reads
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def count_labels(clust_sample: list[dict]) -> dict[int, int]:
    """Return how many samples carry each 'LABEL' value.

    Keys appear in the order in which each label is first encountered.
    """
    counts = {}
    for read in clust_sample:
        label = read["LABEL"]
        if label in counts:
            counts[label] += 1
        else:
            counts[label] = 1
    return counts
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def add_readnum(clust_sample: list[dict]) -> list[dict]:
    """Attach a 'READNUM' key: the number of samples sharing the sample's label.

    The returned list is a shallow copy, so the dicts it holds are the same
    objects as in ``clust_sample`` and are updated in place.
    """
    clust_result = clust_sample.copy()
    readnum_by_label = count_labels(clust_result)

    for read in clust_result:
        read.update(READNUM=readnum_by_label[read["LABEL"]])

    return clust_result
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def add_percent(clust_sample: list[dict]) -> list[dict]:
    """Attach a 'PERCENT' key: the share of samples that carry the sample's label.

    The value is ``count / total * 100`` rounded to three decimals. The
    returned list is a shallow copy, so the dicts it holds are the same
    objects as in ``clust_sample`` and are updated in place.
    """
    clust_result = clust_sample.copy()
    n_reads = len(clust_result)

    # One percentage per label; with no samples there are no labels, so the
    # division is never evaluated.
    percent_by_label = {
        label: round((count / n_reads) * 100, 3)
        for label, count in count_labels(clust_result).items()
    }

    for read in clust_result:
        read["PERCENT"] = percent_by_label[read["LABEL"]]

    return clust_result
|