DAJIN2 0.4.2__zip → 0.4.4__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {DAJIN2-0.4.2/src/DAJIN2.egg-info → dajin2-0.4.4}/PKG-INFO +38 -63
- {DAJIN2-0.4.2 → dajin2-0.4.4}/README.md +28 -53
- dajin2-0.4.4/requirements.txt +20 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/setup.py +1 -1
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/clustering.py +11 -10
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/label_merger.py +20 -16
- dajin2-0.4.4/src/DAJIN2/core/clustering/strand_bias_handler.py +115 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/core.py +8 -8
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/genome_fetcher.py +11 -3
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/midsv_caller.py +3 -4
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/mutation_extractor.py +7 -7
- dajin2-0.4.4/src/DAJIN2/core/report/__init__.py +3 -0
- DAJIN2-0.4.2/src/DAJIN2/core/report/report_bam.py → dajin2-0.4.4/src/DAJIN2/core/report/bam_exporter.py +64 -50
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/main.py +20 -20
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/io.py +14 -6
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/sam_handler.py +0 -13
- {DAJIN2-0.4.2 → dajin2-0.4.4/src/DAJIN2.egg-info}/PKG-INFO +38 -63
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2.egg-info/SOURCES.txt +3 -3
- dajin2-0.4.4/src/DAJIN2.egg-info/requires.txt +16 -0
- DAJIN2-0.4.2/requirements.txt +0 -20
- DAJIN2-0.4.2/src/DAJIN2/core/clustering/strand_bias_handler.py +0 -113
- DAJIN2-0.4.2/src/DAJIN2/core/report/__init__.py +0 -3
- DAJIN2-0.4.2/src/DAJIN2.egg-info/requires.txt +0 -16
- {DAJIN2-0.4.2 → dajin2-0.4.4}/LICENSE +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/MANIFEST.in +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/setup.cfg +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/__init__.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/__init__.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/classification/__init__.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/classification/allele_merger.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/classification/classifier.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/__init__.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/appender.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/kmer_generator.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/label_extractor.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/label_updator.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/score_handler.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/consensus/__init__.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/consensus/clust_formatter.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/consensus/consensus.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/consensus/mutation_extractor.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/consensus/name_handler.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/consensus/similarity_searcher.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/__init__.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/cache_checker.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/directory_manager.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/homopolymer_handler.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/input_formatter.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/insertions_to_fasta.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/knockin_handler.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/mapping.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/report/insertion_reflector.py +0 -0
- /DAJIN2-0.4.2/src/DAJIN2/core/report/report_mutation.py → /dajin2-0.4.4/src/DAJIN2/core/report/mutation_exporter.py +0 -0
- /DAJIN2-0.4.2/src/DAJIN2/core/report/report_files.py → /dajin2-0.4.4/src/DAJIN2/core/report/sequence_exporter.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/gui.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/static/css/style.css +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/template_igvjs.html +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/templates/index.html +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/config.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/cssplits_handler.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/dna_handler.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/fastx_handler.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/input_validator.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/multiprocess.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/report_generator.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/view.py +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2.egg-info/dependency_links.txt +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2.egg-info/entry_points.txt +0 -0
- {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: DAJIN2
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.4
|
|
4
4
|
Summary: One-step genotyping tools for targeted long-read sequencing
|
|
5
5
|
Home-page: https://github.com/akikuno/DAJIN2
|
|
6
6
|
Author: Akihiro Kuno
|
|
@@ -14,22 +14,22 @@ Classifier: Intended Audience :: Science/Research
|
|
|
14
14
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
15
15
|
Description-Content-Type: text/markdown
|
|
16
16
|
License-File: LICENSE
|
|
17
|
-
Requires-Dist: numpy>=1.
|
|
18
|
-
Requires-Dist: scipy>=1.
|
|
17
|
+
Requires-Dist: numpy>=1.24.0
|
|
18
|
+
Requires-Dist: scipy>=1.10.0
|
|
19
19
|
Requires-Dist: pandas>=1.0.0
|
|
20
|
-
Requires-Dist: openpyxl>=3.
|
|
21
|
-
Requires-Dist: rapidfuzz>=3.
|
|
22
|
-
Requires-Dist: scikit-learn>=1.
|
|
20
|
+
Requires-Dist: openpyxl>=3.1.0
|
|
21
|
+
Requires-Dist: rapidfuzz>=3.6.0
|
|
22
|
+
Requires-Dist: scikit-learn>=1.3.0
|
|
23
23
|
Requires-Dist: mappy>=2.24
|
|
24
|
-
Requires-Dist: pysam>=0.
|
|
24
|
+
Requires-Dist: pysam>=0.21.0
|
|
25
25
|
Requires-Dist: Flask>=2.2.0
|
|
26
26
|
Requires-Dist: waitress>=2.1.0
|
|
27
27
|
Requires-Dist: Jinja2>=3.1.0
|
|
28
|
-
Requires-Dist: plotly>=5.
|
|
28
|
+
Requires-Dist: plotly>=5.19.0
|
|
29
29
|
Requires-Dist: kaleido>=0.2.0
|
|
30
30
|
Requires-Dist: cstag>=1.0.0
|
|
31
|
-
Requires-Dist: midsv>=0.
|
|
32
|
-
Requires-Dist: wslPath>=0.
|
|
31
|
+
Requires-Dist: midsv>=0.11.0
|
|
32
|
+
Requires-Dist: wslPath>=0.4.1
|
|
33
33
|
|
|
34
34
|
[](https://choosealicense.com/licenses/mit/)
|
|
35
35
|
[](https://github.com/akikuno/dajin2/actions)
|
|
@@ -78,6 +78,7 @@ conda activate env-dajin2
|
|
|
78
78
|
> CONDA_SUBDIR=osx-64 conda create -n env-dajin2 -c conda-forge -c bioconda python=3.10 DAJIN2 -y
|
|
79
79
|
> conda activate env-dajin2
|
|
80
80
|
> conda config --env --set subdir osx-64
|
|
81
|
+
> python -c "import platform; print(platform.machine())" # Confirm that the output is 'x86_64', not 'arm64'
|
|
81
82
|
> ```
|
|
82
83
|
|
|
83
84
|
### From [PyPI](https://pypi.org/project/DAJIN2/)
|
|
@@ -164,12 +165,17 @@ Options:
|
|
|
164
165
|
#### Example
|
|
165
166
|
|
|
166
167
|
```bash
|
|
168
|
+
# Download example dataset
|
|
169
|
+
curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
|
|
170
|
+
tar -xf example_single.tar.gz
|
|
171
|
+
|
|
172
|
+
# Run DAJIN2
|
|
167
173
|
DAJIN2 \
|
|
168
|
-
--control
|
|
169
|
-
--sample
|
|
170
|
-
--allele
|
|
171
|
-
--name
|
|
172
|
-
--genome
|
|
174
|
+
--control example_single/control \
|
|
175
|
+
--sample example_single/sample \
|
|
176
|
+
--allele example_single/stx2_deletion.fa \
|
|
177
|
+
--name stx2_deletion \
|
|
178
|
+
--genome mm39 \
|
|
173
179
|
--threads 4
|
|
174
180
|
```
|
|
175
181
|
|
|
@@ -206,7 +212,6 @@ DAJIN2 \
|
|
|
206
212
|
|
|
207
213
|
By using the `batch` subcommand, you can process multiple FASTQ files simultaneously.
|
|
208
214
|
For this purpose, a CSV or Excel file consolidating the sample information is required.
|
|
209
|
-
<!-- For a specific example, please refer to [this link](https://github.com/akikuno/DAJIN2/blob/main/examples/example-batch/batch.csv). -->
|
|
210
215
|
|
|
211
216
|
> [!NOTE]
|
|
212
217
|
> For guidance on how to compile sample information, please refer to [this document](https://docs.google.com/presentation/d/e/2PACX-1vSMEmXJPG2TNjfT66XZJRzqJd82aAqO5gJrdEzyhn15YBBr_Li-j5puOgVChYf3jA/embed?start=false&loop=false&delayms=3000).
|
|
@@ -224,44 +229,14 @@ options:
|
|
|
224
229
|
#### Example
|
|
225
230
|
|
|
226
231
|
```bash
|
|
227
|
-
DAJIN2 --file batch.csv --threads 4
|
|
228
|
-
```
|
|
229
|
-
|
|
230
|
-
<!-- ```bash
|
|
231
232
|
# Donwload the example dataset
|
|
232
|
-
|
|
233
|
-
tar -xf
|
|
233
|
+
curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
|
|
234
|
+
tar -xf example_batch.tar.gz
|
|
234
235
|
|
|
235
236
|
# Run DAJIN2
|
|
236
|
-
DAJIN2 batch --file
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
# 2023-07-31 17:01:16: Preprocess example-batch/tyr_control.fq.gz...
|
|
240
|
-
# 2023-07-31 17:01:48: Output BAM files of example-batch/tyr_control.fq.gz...
|
|
241
|
-
# 2023-07-31 17:01:52: 🍵 example-batch/tyr_control.fq.gz is finished!
|
|
242
|
-
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_50%.fq.gz is now processing...
|
|
243
|
-
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_10%.fq.gz is now processing...
|
|
244
|
-
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_01%.fq.gz is now processing...
|
|
245
|
-
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_01%.fq.gz...
|
|
246
|
-
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_50%.fq.gz...
|
|
247
|
-
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_10%.fq.gz...
|
|
248
|
-
# 2023-07-31 17:02:17: Classify example-batch/tyr_c230gt_50%.fq.gz...
|
|
249
|
-
# 2023-07-31 17:02:19: Clustering example-batch/tyr_c230gt_50%.fq.gz...
|
|
250
|
-
# 2023-07-31 17:02:34: Classify example-batch/tyr_c230gt_01%.fq.gz...
|
|
251
|
-
# 2023-07-31 17:02:35: Classify example-batch/tyr_c230gt_10%.fq.gz...
|
|
252
|
-
# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_01%.fq.gz...
|
|
253
|
-
# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_10%.fq.gz...
|
|
254
|
-
# 2023-07-31 17:02:53: Consensus calling of example-batch/tyr_c230gt_50%.fq.gz...
|
|
255
|
-
# 2023-07-31 17:02:59: Output reports of example-batch/tyr_c230gt_50%.fq.gz...
|
|
256
|
-
# 2023-07-31 17:03:04: 🍵 example-batch/tyr_c230gt_50%.fq.gz is finished!
|
|
257
|
-
# 2023-07-31 17:03:39: Consensus calling of example-batch/tyr_c230gt_01%.fq.gz...
|
|
258
|
-
# 2023-07-31 17:03:51: Output reports of example-batch/tyr_c230gt_01%.fq.gz...
|
|
259
|
-
# 2023-07-31 17:04:03: 🍵 example-batch/tyr_c230gt_01%.fq.gz is finished!
|
|
260
|
-
# 2023-07-31 17:04:08: Consensus calling of example-batch/tyr_c230gt_10%.fq.gz...
|
|
261
|
-
# 2023-07-31 17:04:16: Output reports of example-batch/tyr_c230gt_10%.fq.gz...
|
|
262
|
-
# 2023-07-31 17:04:24: 🍵 example-batch/tyr_c230gt_10%.fq.gz is finished!
|
|
263
|
-
# 🎉 Finished! Open DAJIN_Results/tyr-substitution to see the report.
|
|
264
|
-
``` -->
|
|
237
|
+
DAJIN2 batch --file example_batch/batch.csv --threads 4
|
|
238
|
+
```
|
|
239
|
+
|
|
265
240
|
|
|
266
241
|
## 📈 Report Contents
|
|
267
242
|
|
|
@@ -271,22 +246,22 @@ Inside the **DAJIN_Results** directory, the following files can be found:
|
|
|
271
246
|
```
|
|
272
247
|
DAJIN_Results/tyr-substitution
|
|
273
248
|
├── BAM
|
|
274
|
-
│ ├── tyr_c230gt_01
|
|
275
|
-
│ ├── tyr_c230gt_10
|
|
276
|
-
│ ├── tyr_c230gt_50
|
|
249
|
+
│ ├── tyr_c230gt_01
|
|
250
|
+
│ ├── tyr_c230gt_10
|
|
251
|
+
│ ├── tyr_c230gt_50
|
|
277
252
|
│ └── tyr_control
|
|
278
253
|
├── FASTA
|
|
279
|
-
│ ├── tyr_c230gt_01
|
|
280
|
-
│ ├── tyr_c230gt_10
|
|
281
|
-
│ └── tyr_c230gt_50
|
|
254
|
+
│ ├── tyr_c230gt_01
|
|
255
|
+
│ ├── tyr_c230gt_10
|
|
256
|
+
│ └── tyr_c230gt_50
|
|
282
257
|
├── HTML
|
|
283
|
-
│ ├── tyr_c230gt_01
|
|
284
|
-
│ ├── tyr_c230gt_10
|
|
285
|
-
│ └── tyr_c230gt_50
|
|
258
|
+
│ ├── tyr_c230gt_01
|
|
259
|
+
│ ├── tyr_c230gt_10
|
|
260
|
+
│ └── tyr_c230gt_50
|
|
286
261
|
├── MUTATION_INFO
|
|
287
|
-
│ ├── tyr_c230gt_01
|
|
288
|
-
│ ├── tyr_c230gt_10
|
|
289
|
-
│ └── tyr_c230gt_50
|
|
262
|
+
│ ├── tyr_c230gt_01.csv
|
|
263
|
+
│ ├── tyr_c230gt_10.csv
|
|
264
|
+
│ └── tyr_c230gt_50.csv
|
|
290
265
|
├── read_plot.html
|
|
291
266
|
├── read_plot.pdf
|
|
292
267
|
└── read_summary.xlsx
|
|
@@ -45,6 +45,7 @@ conda activate env-dajin2
|
|
|
45
45
|
> CONDA_SUBDIR=osx-64 conda create -n env-dajin2 -c conda-forge -c bioconda python=3.10 DAJIN2 -y
|
|
46
46
|
> conda activate env-dajin2
|
|
47
47
|
> conda config --env --set subdir osx-64
|
|
48
|
+
> python -c "import platform; print(platform.machine())" # Confirm that the output is 'x86_64', not 'arm64'
|
|
48
49
|
> ```
|
|
49
50
|
|
|
50
51
|
### From [PyPI](https://pypi.org/project/DAJIN2/)
|
|
@@ -131,12 +132,17 @@ Options:
|
|
|
131
132
|
#### Example
|
|
132
133
|
|
|
133
134
|
```bash
|
|
135
|
+
# Download example dataset
|
|
136
|
+
curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
|
|
137
|
+
tar -xf example_single.tar.gz
|
|
138
|
+
|
|
139
|
+
# Run DAJIN2
|
|
134
140
|
DAJIN2 \
|
|
135
|
-
--control
|
|
136
|
-
--sample
|
|
137
|
-
--allele
|
|
138
|
-
--name
|
|
139
|
-
--genome
|
|
141
|
+
--control example_single/control \
|
|
142
|
+
--sample example_single/sample \
|
|
143
|
+
--allele example_single/stx2_deletion.fa \
|
|
144
|
+
--name stx2_deletion \
|
|
145
|
+
--genome mm39 \
|
|
140
146
|
--threads 4
|
|
141
147
|
```
|
|
142
148
|
|
|
@@ -173,7 +179,6 @@ DAJIN2 \
|
|
|
173
179
|
|
|
174
180
|
By using the `batch` subcommand, you can process multiple FASTQ files simultaneously.
|
|
175
181
|
For this purpose, a CSV or Excel file consolidating the sample information is required.
|
|
176
|
-
<!-- For a specific example, please refer to [this link](https://github.com/akikuno/DAJIN2/blob/main/examples/example-batch/batch.csv). -->
|
|
177
182
|
|
|
178
183
|
> [!NOTE]
|
|
179
184
|
> For guidance on how to compile sample information, please refer to [this document](https://docs.google.com/presentation/d/e/2PACX-1vSMEmXJPG2TNjfT66XZJRzqJd82aAqO5gJrdEzyhn15YBBr_Li-j5puOgVChYf3jA/embed?start=false&loop=false&delayms=3000).
|
|
@@ -191,44 +196,14 @@ options:
|
|
|
191
196
|
#### Example
|
|
192
197
|
|
|
193
198
|
```bash
|
|
194
|
-
DAJIN2 --file batch.csv --threads 4
|
|
195
|
-
```
|
|
196
|
-
|
|
197
|
-
<!-- ```bash
|
|
198
199
|
# Donwload the example dataset
|
|
199
|
-
|
|
200
|
-
tar -xf
|
|
200
|
+
curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
|
|
201
|
+
tar -xf example_batch.tar.gz
|
|
201
202
|
|
|
202
203
|
# Run DAJIN2
|
|
203
|
-
DAJIN2 batch --file
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
# 2023-07-31 17:01:16: Preprocess example-batch/tyr_control.fq.gz...
|
|
207
|
-
# 2023-07-31 17:01:48: Output BAM files of example-batch/tyr_control.fq.gz...
|
|
208
|
-
# 2023-07-31 17:01:52: 🍵 example-batch/tyr_control.fq.gz is finished!
|
|
209
|
-
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_50%.fq.gz is now processing...
|
|
210
|
-
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_10%.fq.gz is now processing...
|
|
211
|
-
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_01%.fq.gz is now processing...
|
|
212
|
-
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_01%.fq.gz...
|
|
213
|
-
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_50%.fq.gz...
|
|
214
|
-
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_10%.fq.gz...
|
|
215
|
-
# 2023-07-31 17:02:17: Classify example-batch/tyr_c230gt_50%.fq.gz...
|
|
216
|
-
# 2023-07-31 17:02:19: Clustering example-batch/tyr_c230gt_50%.fq.gz...
|
|
217
|
-
# 2023-07-31 17:02:34: Classify example-batch/tyr_c230gt_01%.fq.gz...
|
|
218
|
-
# 2023-07-31 17:02:35: Classify example-batch/tyr_c230gt_10%.fq.gz...
|
|
219
|
-
# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_01%.fq.gz...
|
|
220
|
-
# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_10%.fq.gz...
|
|
221
|
-
# 2023-07-31 17:02:53: Consensus calling of example-batch/tyr_c230gt_50%.fq.gz...
|
|
222
|
-
# 2023-07-31 17:02:59: Output reports of example-batch/tyr_c230gt_50%.fq.gz...
|
|
223
|
-
# 2023-07-31 17:03:04: 🍵 example-batch/tyr_c230gt_50%.fq.gz is finished!
|
|
224
|
-
# 2023-07-31 17:03:39: Consensus calling of example-batch/tyr_c230gt_01%.fq.gz...
|
|
225
|
-
# 2023-07-31 17:03:51: Output reports of example-batch/tyr_c230gt_01%.fq.gz...
|
|
226
|
-
# 2023-07-31 17:04:03: 🍵 example-batch/tyr_c230gt_01%.fq.gz is finished!
|
|
227
|
-
# 2023-07-31 17:04:08: Consensus calling of example-batch/tyr_c230gt_10%.fq.gz...
|
|
228
|
-
# 2023-07-31 17:04:16: Output reports of example-batch/tyr_c230gt_10%.fq.gz...
|
|
229
|
-
# 2023-07-31 17:04:24: 🍵 example-batch/tyr_c230gt_10%.fq.gz is finished!
|
|
230
|
-
# 🎉 Finished! Open DAJIN_Results/tyr-substitution to see the report.
|
|
231
|
-
``` -->
|
|
204
|
+
DAJIN2 batch --file example_batch/batch.csv --threads 4
|
|
205
|
+
```
|
|
206
|
+
|
|
232
207
|
|
|
233
208
|
## 📈 Report Contents
|
|
234
209
|
|
|
@@ -238,22 +213,22 @@ Inside the **DAJIN_Results** directory, the following files can be found:
|
|
|
238
213
|
```
|
|
239
214
|
DAJIN_Results/tyr-substitution
|
|
240
215
|
├── BAM
|
|
241
|
-
│ ├── tyr_c230gt_01
|
|
242
|
-
│ ├── tyr_c230gt_10
|
|
243
|
-
│ ├── tyr_c230gt_50
|
|
216
|
+
│ ├── tyr_c230gt_01
|
|
217
|
+
│ ├── tyr_c230gt_10
|
|
218
|
+
│ ├── tyr_c230gt_50
|
|
244
219
|
│ └── tyr_control
|
|
245
220
|
├── FASTA
|
|
246
|
-
│ ├── tyr_c230gt_01
|
|
247
|
-
│ ├── tyr_c230gt_10
|
|
248
|
-
│ └── tyr_c230gt_50
|
|
221
|
+
│ ├── tyr_c230gt_01
|
|
222
|
+
│ ├── tyr_c230gt_10
|
|
223
|
+
│ └── tyr_c230gt_50
|
|
249
224
|
├── HTML
|
|
250
|
-
│ ├── tyr_c230gt_01
|
|
251
|
-
│ ├── tyr_c230gt_10
|
|
252
|
-
│ └── tyr_c230gt_50
|
|
225
|
+
│ ├── tyr_c230gt_01
|
|
226
|
+
│ ├── tyr_c230gt_10
|
|
227
|
+
│ └── tyr_c230gt_50
|
|
253
228
|
├── MUTATION_INFO
|
|
254
|
-
│ ├── tyr_c230gt_01
|
|
255
|
-
│ ├── tyr_c230gt_10
|
|
256
|
-
│ └── tyr_c230gt_50
|
|
229
|
+
│ ├── tyr_c230gt_01.csv
|
|
230
|
+
│ ├── tyr_c230gt_10.csv
|
|
231
|
+
│ └── tyr_c230gt_50.csv
|
|
257
232
|
├── read_plot.html
|
|
258
233
|
├── read_plot.pdf
|
|
259
234
|
└── read_summary.xlsx
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
numpy >= 1.24.0
|
|
2
|
+
scipy >= 1.10.0
|
|
3
|
+
pandas >= 1.0.0
|
|
4
|
+
openpyxl >= 3.1.0
|
|
5
|
+
rapidfuzz >=3.6.0
|
|
6
|
+
scikit-learn >= 1.3.0
|
|
7
|
+
|
|
8
|
+
mappy >= 2.24
|
|
9
|
+
pysam >= 0.21.0
|
|
10
|
+
|
|
11
|
+
Flask >= 2.2.0
|
|
12
|
+
waitress >= 2.1.0
|
|
13
|
+
Jinja2 >= 3.1.0
|
|
14
|
+
|
|
15
|
+
plotly >= 5.19.0
|
|
16
|
+
kaleido >= 0.2.0
|
|
17
|
+
|
|
18
|
+
cstag >= 1.0.0
|
|
19
|
+
midsv >= 0.11.0
|
|
20
|
+
wslPath >=0.4.1
|
|
@@ -9,7 +9,7 @@ with open("requirements.txt") as requirements_file:
|
|
|
9
9
|
|
|
10
10
|
setuptools.setup(
|
|
11
11
|
name="DAJIN2",
|
|
12
|
-
version="0.4.
|
|
12
|
+
version="0.4.4",
|
|
13
13
|
author="Akihiro Kuno",
|
|
14
14
|
author_email="akuno@md.tsukuba.ac.jp",
|
|
15
15
|
description="One-step genotyping tools for targeted long-read sequencing",
|
|
@@ -39,17 +39,16 @@ def optimize_labels(X: spmatrix, coverage_sample: int, coverage_control: int) ->
|
|
|
39
39
|
# print(i, Counter(labels_sample), Counter(labels_control), Counter(labels_current)) # ! DEBUG
|
|
40
40
|
|
|
41
41
|
num_labels_control = count_number_of_clusters(labels_control, coverage_control)
|
|
42
|
-
|
|
42
|
+
rand_index = metrics.adjusted_rand_score(labels_previous, labels_current)
|
|
43
43
|
|
|
44
44
|
"""
|
|
45
45
|
Return the number of clusters when:
|
|
46
|
-
|
|
47
|
-
|
|
46
|
+
- the number of clusters in control is split into more than one.
|
|
47
|
+
- the mutual information between the current and previous labels is high enough (= similar).
|
|
48
|
+
To reduce the allele number, previous labels are returned.
|
|
48
49
|
"""
|
|
49
|
-
if num_labels_control >= 2:
|
|
50
|
+
if num_labels_control >= 2 or rand_index >= 0.95:
|
|
50
51
|
return labels_previous
|
|
51
|
-
if 0.95 <= mutual_info <= 1.0:
|
|
52
|
-
return labels_current
|
|
53
52
|
labels_previous = labels_current
|
|
54
53
|
return labels_previous
|
|
55
54
|
|
|
@@ -58,11 +57,13 @@ def get_label_most_common(labels: list[int]) -> int:
|
|
|
58
57
|
return Counter(labels).most_common()[0][0]
|
|
59
58
|
|
|
60
59
|
|
|
61
|
-
def return_labels(
|
|
60
|
+
def return_labels(
|
|
61
|
+
path_score_sample: Path, path_score_control: Path, path_sample: Path, strand_bias_in_control: bool
|
|
62
|
+
) -> list[int]:
|
|
62
63
|
np.random.seed(seed=1)
|
|
63
64
|
score_control = list(io.read_jsonl(path_score_control))
|
|
64
65
|
X_control = csr_matrix(score_control)
|
|
65
|
-
|
|
66
|
+
"""Subset to 1000 reads of controls in the most common cluster to remove outliers and reduce computation time"""
|
|
66
67
|
labels_control = BisectingKMeans(n_clusters=2, random_state=1).fit_predict(X_control)
|
|
67
68
|
label_most_common = get_label_most_common(labels_control)
|
|
68
69
|
scores_control_subset = subset_scores(labels_control, io.read_jsonl(path_score_control), label_most_common, 1000)
|
|
@@ -71,7 +72,7 @@ def return_labels(path_score_sample: Path, path_score_control: Path, path_sample
|
|
|
71
72
|
coverage_sample = io.count_newlines(path_score_sample)
|
|
72
73
|
coverage_control = len(scores_control_subset)
|
|
73
74
|
labels = optimize_labels(X, coverage_sample, coverage_control)
|
|
74
|
-
|
|
75
|
-
if
|
|
75
|
+
"""Re-allocate clusters with strand bias to clusters without strand bias"""
|
|
76
|
+
if strand_bias_in_control is False:
|
|
76
77
|
labels = remove_biased_clusters(path_sample, path_score_sample, labels)
|
|
77
78
|
return labels
|
|
@@ -11,20 +11,6 @@ def calculate_label_percentages(labels: list[int]) -> dict[int, float]:
|
|
|
11
11
|
return {label: (count / total_labels * 100) for label, count in label_counts.items()}
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
def merge_mixed_cluster(labels_control: list[int], labels_sample: list[int], threshold: float = 0.5) -> list[int]:
|
|
15
|
-
"""Merge labels in sample if they appear more than 'threshold' percentage in control."""
|
|
16
|
-
labels_merged = labels_sample.copy()
|
|
17
|
-
label_percentages_control = calculate_label_percentages(labels_control)
|
|
18
|
-
mixed_labels = {label for label, percent in label_percentages_control.items() if percent > threshold}
|
|
19
|
-
|
|
20
|
-
new_label = max(labels_merged) + 1
|
|
21
|
-
for i, label in enumerate(labels_sample):
|
|
22
|
-
if label in mixed_labels:
|
|
23
|
-
labels_merged[i] = new_label
|
|
24
|
-
|
|
25
|
-
return labels_merged
|
|
26
|
-
|
|
27
|
-
|
|
28
14
|
def map_clusters_to_previous(labels_sample: list[int], labels_previous: list[int]) -> dict[int, int]:
|
|
29
15
|
"""
|
|
30
16
|
Determine which cluster in labels_previous corresponds to each cluster in labels_sample.
|
|
@@ -63,6 +49,8 @@ def merge_minor_cluster(
|
|
|
63
49
|
minor_labels_percentage = {label for label, percent in label_percentages.items() if percent < threshold_percentage}
|
|
64
50
|
minor_labels_readnumber = {label for label, num in Counter(labels_sample).items() if num <= threshold_readnumber}
|
|
65
51
|
minor_labels = minor_labels_percentage | minor_labels_readnumber
|
|
52
|
+
if minor_labels == set():
|
|
53
|
+
return labels_sample
|
|
66
54
|
|
|
67
55
|
correspondence = map_clusters_to_previous(labels_sample, labels_previous)
|
|
68
56
|
update_required_labels = get_update_required_labels(correspondence)
|
|
@@ -70,7 +58,23 @@ def merge_minor_cluster(
|
|
|
70
58
|
labels_merged = labels_sample.copy()
|
|
71
59
|
for m in minor_labels:
|
|
72
60
|
new_label = max(labels_merged) + 1
|
|
73
|
-
labels_merged = [
|
|
61
|
+
labels_merged = [
|
|
62
|
+
new_label if label in update_required_labels[correspondence[m]] else label for label in labels_merged
|
|
63
|
+
]
|
|
64
|
+
|
|
65
|
+
return labels_merged
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def merge_mixed_cluster(labels_control: list[int], labels_sample: list[int], threshold: float = 0.5) -> list[int]:
|
|
69
|
+
"""Merge labels in sample if they appear more than 'threshold' percentage in control."""
|
|
70
|
+
labels_merged = labels_sample.copy()
|
|
71
|
+
label_percentages_control = calculate_label_percentages(labels_control)
|
|
72
|
+
mixed_labels = {label for label, percent in label_percentages_control.items() if percent > threshold}
|
|
73
|
+
|
|
74
|
+
new_label = max(labels_merged) + 1
|
|
75
|
+
for i, label in enumerate(labels_sample):
|
|
76
|
+
if label in mixed_labels:
|
|
77
|
+
labels_merged[i] = new_label
|
|
74
78
|
|
|
75
79
|
return labels_merged
|
|
76
80
|
|
|
@@ -82,7 +86,7 @@ def merge_minor_cluster(
|
|
|
82
86
|
|
|
83
87
|
def merge_labels(labels_control: list[int], labels_sample: list[int], labels_previous: list[int]) -> list[int]:
|
|
84
88
|
labels_merged = merge_minor_cluster(
|
|
85
|
-
labels_sample, labels_previous, threshold_percentage=0.5, threshold_readnumber=
|
|
89
|
+
labels_sample, labels_previous, threshold_percentage=0.5, threshold_readnumber=5
|
|
86
90
|
)
|
|
87
91
|
labels_merged = merge_mixed_cluster(labels_control, labels_merged)
|
|
88
92
|
return labels_merged
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Nanopore sequencing results often results in strand specific mutations even though the mutation is not strand specific, thus they are considered as sequencing errors and should be removed.
|
|
5
|
+
|
|
6
|
+
This module provides functions to determine whether each allele obtained after clustering is formed due to sequencing errors caused by strand bias.
|
|
7
|
+
|
|
8
|
+
Re-allocates reads belonging to clusters with strand bias to clusters without strand bias.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from collections import defaultdict
|
|
13
|
+
from sklearn.tree import DecisionTreeClassifier
|
|
14
|
+
|
|
15
|
+
from DAJIN2.utils import io
|
|
16
|
+
|
|
17
|
+
# Constants
|
|
18
|
+
STRAND_BIAS_LOWER_LIMIT = 0.1
|
|
19
|
+
STRAND_BIAS_UPPER_LIMIT = 0.9
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def is_strand_bias(path_control: Path) -> bool:
|
|
23
|
+
"""
|
|
24
|
+
Determines whether there is a strand bias in sequencing data
|
|
25
|
+
based on the distribution of '+' and '-' strands.
|
|
26
|
+
"""
|
|
27
|
+
count_strand = defaultdict(int)
|
|
28
|
+
for sample in io.read_jsonl(path_control):
|
|
29
|
+
count_strand[sample["STRAND"]] += 1
|
|
30
|
+
|
|
31
|
+
total = count_strand["+"] + count_strand["-"]
|
|
32
|
+
percentage_plus = count_strand["+"] / total if total > 0 else 0
|
|
33
|
+
|
|
34
|
+
return not (STRAND_BIAS_LOWER_LIMIT < percentage_plus < STRAND_BIAS_UPPER_LIMIT)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
###############################################################################
|
|
38
|
+
# Handle Strand bias
|
|
39
|
+
# # Clusters of reads with mutations with strand bias are merged into similar clusters without strand bias
|
|
40
|
+
###############################################################################
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def count_strand(labels: list[int], samples: list[dict[str, str]]) -> tuple[dict[str, int], dict[str, int]]:
|
|
44
|
+
"""Count the occurrences of each strand type by label."""
|
|
45
|
+
positive_strand_counts_by_labels = defaultdict(int)
|
|
46
|
+
total_counts_by_labels = defaultdict(int)
|
|
47
|
+
|
|
48
|
+
for label, sample in zip(labels, samples):
|
|
49
|
+
total_counts_by_labels[label] += 1
|
|
50
|
+
if sample["STRAND"] == "+":
|
|
51
|
+
positive_strand_counts_by_labels[label] += 1
|
|
52
|
+
|
|
53
|
+
return dict(positive_strand_counts_by_labels), dict(total_counts_by_labels)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def determine_strand_biases(
|
|
57
|
+
positive_strand_counts_by_labels: defaultdict, total_counts_by_labels: defaultdict
|
|
58
|
+
) -> dict[int, bool]:
|
|
59
|
+
"""Determine strand biases based on positive strand counts."""
|
|
60
|
+
strand_biases = {}
|
|
61
|
+
for label, total in total_counts_by_labels.items():
|
|
62
|
+
positive_strand_count = positive_strand_counts_by_labels[label]
|
|
63
|
+
strand_ratio = positive_strand_count / total
|
|
64
|
+
strand_biases[label] = not (STRAND_BIAS_LOWER_LIMIT < strand_ratio < STRAND_BIAS_UPPER_LIMIT)
|
|
65
|
+
|
|
66
|
+
return strand_biases
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def prepare_training_testing_sets(labels, scores, strand_biases) -> tuple[list, list, list]:
|
|
70
|
+
"""Prepare training and testing datasets based on strand biases."""
|
|
71
|
+
train_data, train_labels, test_data = [], [], []
|
|
72
|
+
for label, score in zip(labels, scores):
|
|
73
|
+
if strand_biases[label]:
|
|
74
|
+
test_data.append(score)
|
|
75
|
+
else:
|
|
76
|
+
train_data.append(score)
|
|
77
|
+
train_labels.append(label)
|
|
78
|
+
return train_data, train_labels, test_data
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def train_decision_tree(train_data, train_labels) -> DecisionTreeClassifier:
|
|
82
|
+
"""Train a decision tree classifier using the provided features and labels."""
|
|
83
|
+
dtree = DecisionTreeClassifier(random_state=1)
|
|
84
|
+
dtree.fit(train_data, train_labels)
|
|
85
|
+
return dtree
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def allocate_labels(labels: list[int], strand_biases: dict[str, bool], dtree, test_data) -> list[int]:
|
|
89
|
+
"""Re-allocates reads belonging to clusters with strand bias to clusters without strand bias."""
|
|
90
|
+
label_predictions = iter(dtree.predict(test_data))
|
|
91
|
+
for i, label in enumerate(labels):
|
|
92
|
+
if strand_biases[label]:
|
|
93
|
+
labels[i] = next(label_predictions)
|
|
94
|
+
return labels
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def remove_biased_clusters(path_sample: Path, path_score_sample: Path, labels: list[int]) -> list[int]:
|
|
98
|
+
"""Remove clusters with strand bias by re-labeling based on decision tree predictions.
|
|
99
|
+
Continue until at least one of the samples exhibits strand bias (i.e., do not calculate if all samples exhibit strand bias, or conversely, if none of the samples exhibit strand bias) or
|
|
100
|
+
1000 iterations are reached, which serves as a safeguard to prevent infinite loops.
|
|
101
|
+
"""
|
|
102
|
+
samples = io.read_jsonl(path_sample)
|
|
103
|
+
positive_strand_counts_by_labels, total_counts_by_labels = count_strand(labels, samples)
|
|
104
|
+
strand_biases = determine_strand_biases(positive_strand_counts_by_labels, total_counts_by_labels)
|
|
105
|
+
|
|
106
|
+
iteration_count = 0
|
|
107
|
+
labels_corrected = labels
|
|
108
|
+
while len(set(strand_biases.values())) > 1 or iteration_count < 1000:
|
|
109
|
+
scores = io.read_jsonl(path_score_sample)
|
|
110
|
+
train_data, train_labels, test_data = prepare_training_testing_sets(labels, scores, strand_biases)
|
|
111
|
+
dtree = train_decision_tree(train_data, train_labels)
|
|
112
|
+
labels_corrected = allocate_labels(labels, strand_biases, dtree, test_data)
|
|
113
|
+
strand_biases = determine_strand_biases(labels_corrected, path_sample)
|
|
114
|
+
iteration_count += 1
|
|
115
|
+
return labels_corrected
|
|
@@ -70,8 +70,8 @@ def execute_control(arguments: dict):
|
|
|
70
70
|
# Output BAM files
|
|
71
71
|
###########################################################
|
|
72
72
|
logger.info(f"Output BAM files of {arguments['control']}...")
|
|
73
|
-
report.
|
|
74
|
-
ARGS.tempdir, ARGS.control_name, ARGS.genome_coordinates, ARGS.threads, is_control=True
|
|
73
|
+
report.bam_exporter.export_to_bam(
|
|
74
|
+
ARGS.tempdir, ARGS.control_name, ARGS.genome_coordinates, ARGS.threads, ARGS.uuid, is_control=True
|
|
75
75
|
)
|
|
76
76
|
###########################################################
|
|
77
77
|
# Finish call
|
|
@@ -204,15 +204,15 @@ def execute_sample(arguments: dict):
|
|
|
204
204
|
# RESULT
|
|
205
205
|
io.write_jsonl(RESULT_SAMPLE, Path(ARGS.tempdir, "result", f"{ARGS.sample_name}.jsonl"))
|
|
206
206
|
# FASTA
|
|
207
|
-
report.
|
|
208
|
-
report.
|
|
207
|
+
report.sequence_exporter.export_to_fasta(ARGS.tempdir, ARGS.sample_name, cons_sequence)
|
|
208
|
+
report.sequence_exporter.export_reference_to_fasta(ARGS.tempdir, ARGS.sample_name)
|
|
209
209
|
# HTML
|
|
210
|
-
report.
|
|
210
|
+
report.sequence_exporter.export_to_html(ARGS.tempdir, ARGS.sample_name, cons_percentage)
|
|
211
211
|
# CSV (Allele Info)
|
|
212
|
-
report.
|
|
212
|
+
report.mutation_exporter.export_to_csv(ARGS.tempdir, ARGS.sample_name, ARGS.genome_coordinates, cons_percentage)
|
|
213
213
|
# BAM
|
|
214
|
-
report.
|
|
215
|
-
ARGS.tempdir, ARGS.sample_name, ARGS.genome_coordinates, ARGS.threads, RESULT_SAMPLE
|
|
214
|
+
report.bam_exporter.export_to_bam(
|
|
215
|
+
ARGS.tempdir, ARGS.sample_name, ARGS.genome_coordinates, ARGS.threads, ARGS.uuid, RESULT_SAMPLE
|
|
216
216
|
)
|
|
217
217
|
for path_bam_igvjs in Path(ARGS.tempdir, "cache", ".igvjs").glob(f"{ARGS.control_name}_control.bam*"):
|
|
218
218
|
shutil.copy(path_bam_igvjs, Path(ARGS.tempdir, "report", ".igvjs", ARGS.sample_name))
|
|
@@ -5,11 +5,19 @@ from urllib.request import urlopen
|
|
|
5
5
|
|
|
6
6
|
def fetch_seq_coordinates(genome: str, blat_url: str, seq: str) -> dict:
    """Query the UCSC BLAT service and return coordinates of a perfect full-length hit.

    Keeps the LAST record that is both a 100.0% identity match and spans the
    entire query sequence. Raises ValueError when no such record exists.
    """
    url = f"{blat_url}?db={genome}&type=BLAT&userSeq={seq}"
    response_lines = urlopen(url).read().decode("utf8").split("\n")

    full_length = str(len(seq))
    matches = []
    for raw_line in response_lines:
        if "100.0%" not in raw_line:
            continue
        fields = [token for token in raw_line.split(" ") if token]
        # The final column is the match span; keep only hits covering the whole query.
        if fields[-1] == full_length:
            matches = fields

    if not matches:
        raise ValueError(f"{seq[:60]}... is not found in {genome}")

    chrom, strand, start, end, _ = matches[-5:]
    return {"chrom": chrom, "strand": strand, "start": int(start), "end": int(end)}
|
|
14
22
|
|
|
15
23
|
|
|
@@ -8,8 +8,7 @@ from itertools import chain, groupby
|
|
|
8
8
|
|
|
9
9
|
from collections import Counter
|
|
10
10
|
|
|
11
|
-
from DAJIN2.utils import sam_handler
|
|
12
|
-
from DAJIN2.utils import cssplits_handler
|
|
11
|
+
from DAJIN2.utils import io, sam_handler, cssplits_handler
|
|
13
12
|
|
|
14
13
|
|
|
15
14
|
def has_inversion_in_splice(CIGAR: str) -> bool:
|
|
@@ -215,8 +214,8 @@ def generate_midsv(ARGS, is_control: bool = False, is_insertion: bool = False) -
|
|
|
215
214
|
path_splice = Path(ARGS.tempdir, name, "sam", f"splice_{allele}.sam")
|
|
216
215
|
path_output_midsv = Path(ARGS.tempdir, name, "midsv", f"{allele}.json")
|
|
217
216
|
|
|
218
|
-
sam_ont = sam_handler.remove_overlapped_reads(list(
|
|
219
|
-
sam_splice = sam_handler.remove_overlapped_reads(list(
|
|
217
|
+
sam_ont = sam_handler.remove_overlapped_reads(list(io.read_sam(path_ont)))
|
|
218
|
+
sam_splice = sam_handler.remove_overlapped_reads(list(io.read_sam(path_splice)))
|
|
220
219
|
qname_of_map_ont = extract_qname_of_map_ont(sam_ont, sam_splice)
|
|
221
220
|
sam_of_map_ont = filter_sam_by_preset(sam_ont, qname_of_map_ont, preset="map-ont")
|
|
222
221
|
sam_of_splice = filter_sam_by_preset(sam_splice, qname_of_map_ont, preset="splice")
|
|
@@ -89,13 +89,13 @@ def cosine_similarity(x, y):
|
|
|
89
89
|
|
|
90
90
|
|
|
91
91
|
def identify_dissimilar_loci(values_sample, values_control, index: int, is_consensus: bool = False) -> bool:
    """Return True when the locus at `index` differs between sample and control.

    Args:
        values_sample: per-locus mutation percentages of the sample.
        values_control: per-locus mutation percentages of the control.
        index: locus to evaluate.
        is_consensus: enable the unconditional >20-point shortcut (consensus mode).

    Returns:
        True when the locus is dissimilar. (Fixed the annotation: the original
        declared `-> int` but always returns a boolean.)
    """
    # If 'sample' has more than 20% variation compared to 'control' in consensus
    # mode, unconditionally treat it as a dissimilar locus. This counteracts
    # cases where, during significant deletions, cosine similarity can be
    # exceedingly close to 1 even if nothing is observed in the control
    # (e.g., control = [1,1,1,1,1], sample = [100,100,100,100,100] -> cosine similarity = 1).
    if is_consensus and values_sample[index] - values_control[index] > 20:
        return True

    # Subset ~10 bases around the index. BUG FIX: clamp the start at 0 — for
    # index < 5 the original negative start wrapped around and sliced from the
    # END of the array, comparing the wrong (or an empty) window.
    start = max(0, index - 5)
    # Add 1e-6 to avoid division by zero when calculating cosine similarity.
    x = np.array(values_sample[start : index + 6]) + 1e-6
    y = np.array(values_control[start : index + 6]) + 1e-6

    return cosine_similarity(x, y) < 0.95
|
|
101
101
|
|
|
@@ -109,8 +109,8 @@ def detect_anomalies(values_sample, values_control, threshold: float, is_consens
|
|
|
109
109
|
|
|
110
110
|
values_subtract_reshaped = values_subtract.reshape(-1, 1)
|
|
111
111
|
kmeans = MiniBatchKMeans(n_clusters=2, random_state=0, n_init="auto").fit(values_subtract_reshaped)
|
|
112
|
-
|
|
113
|
-
candidate_loci = {i for i, v in enumerate(values_subtract_reshaped) if v >
|
|
112
|
+
threshold_kmeans = kmeans.cluster_centers_.mean()
|
|
113
|
+
candidate_loci = {i for i, v in enumerate(values_subtract_reshaped) if v > threshold_kmeans}
|
|
114
114
|
|
|
115
115
|
return {i for i in candidate_loci if identify_dissimilar_loci(values_sample, values_control, i, is_consensus)}
|
|
116
116
|
|