DAJIN2 0.4.3__zip → 0.4.4__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {DAJIN2-0.4.3/src/DAJIN2.egg-info → dajin2-0.4.4}/PKG-INFO +15 -50
- {DAJIN2-0.4.3 → dajin2-0.4.4}/README.md +14 -49
- {DAJIN2-0.4.3 → dajin2-0.4.4}/setup.py +1 -1
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/clustering.py +11 -10
- dajin2-0.4.4/src/DAJIN2/core/clustering/strand_bias_handler.py +115 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/mutation_extractor.py +7 -7
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/main.py +20 -20
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/io.py +8 -6
- {DAJIN2-0.4.3 → dajin2-0.4.4/src/DAJIN2.egg-info}/PKG-INFO +15 -50
- DAJIN2-0.4.3/src/DAJIN2/core/clustering/strand_bias_handler.py +0 -113
- {DAJIN2-0.4.3 → dajin2-0.4.4}/LICENSE +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/MANIFEST.in +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/requirements.txt +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/setup.cfg +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/__init__.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/__init__.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/classification/__init__.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/classification/allele_merger.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/classification/classifier.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/__init__.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/appender.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/kmer_generator.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/label_extractor.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/label_merger.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/label_updator.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/score_handler.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/consensus/__init__.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/consensus/clust_formatter.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/consensus/consensus.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/consensus/mutation_extractor.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/consensus/name_handler.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/consensus/similarity_searcher.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/core.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/__init__.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/cache_checker.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/directory_manager.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/genome_fetcher.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/homopolymer_handler.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/input_formatter.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/insertions_to_fasta.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/knockin_handler.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/mapping.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/midsv_caller.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/report/__init__.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/report/bam_exporter.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/report/insertion_reflector.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/report/mutation_exporter.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/report/sequence_exporter.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/gui.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/static/css/style.css +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/template_igvjs.html +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/templates/index.html +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/config.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/cssplits_handler.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/dna_handler.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/fastx_handler.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/input_validator.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/multiprocess.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/report_generator.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/sam_handler.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/view.py +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2.egg-info/SOURCES.txt +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2.egg-info/dependency_links.txt +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2.egg-info/entry_points.txt +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2.egg-info/requires.txt +0 -0
- {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: DAJIN2
|
|
3
|
-
Version: 0.4.3
|
|
3
|
+
Version: 0.4.4
|
|
4
4
|
Summary: One-step genotyping tools for targeted long-read sequencing
|
|
5
5
|
Home-page: https://github.com/akikuno/DAJIN2
|
|
6
6
|
Author: Akihiro Kuno
|
|
@@ -166,7 +166,7 @@ Options:
|
|
|
166
166
|
|
|
167
167
|
```bash
|
|
168
168
|
# Download example dataset
|
|
169
|
-
|
|
169
|
+
curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
|
|
170
170
|
tar -xf example_single.tar.gz
|
|
171
171
|
|
|
172
172
|
# Run DAJIN2
|
|
@@ -230,48 +230,13 @@ options:
|
|
|
230
230
|
|
|
231
231
|
```bash
|
|
232
232
|
# Download the example dataset
|
|
233
|
-
|
|
233
|
+
curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
|
|
234
234
|
tar -xf example_batch.tar.gz
|
|
235
235
|
|
|
236
236
|
# Run DAJIN2
|
|
237
237
|
DAJIN2 batch --file example_batch/batch.csv --threads 4
|
|
238
238
|
```
|
|
239
239
|
|
|
240
|
-
<!-- ```bash
|
|
241
|
-
# Donwload the example dataset
|
|
242
|
-
wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
|
|
243
|
-
tar -xf example_batch.tar.gz
|
|
244
|
-
|
|
245
|
-
# Run DAJIN2
|
|
246
|
-
DAJIN2 batch --file example-batch/batch.csv --threads 3
|
|
247
|
-
|
|
248
|
-
# 2023-07-31 17:01:10: example-batch/tyr_control.fq.gz is now processing...
|
|
249
|
-
# 2023-07-31 17:01:16: Preprocess example-batch/tyr_control.fq.gz...
|
|
250
|
-
# 2023-07-31 17:01:48: Output BAM files of example-batch/tyr_control.fq.gz...
|
|
251
|
-
# 2023-07-31 17:01:52: 🍵 example-batch/tyr_control.fq.gz is finished!
|
|
252
|
-
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_50%.fq.gz is now processing...
|
|
253
|
-
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_10%.fq.gz is now processing...
|
|
254
|
-
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_01%.fq.gz is now processing...
|
|
255
|
-
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_01%.fq.gz...
|
|
256
|
-
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_50%.fq.gz...
|
|
257
|
-
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_10%.fq.gz...
|
|
258
|
-
# 2023-07-31 17:02:17: Classify example-batch/tyr_c230gt_50%.fq.gz...
|
|
259
|
-
# 2023-07-31 17:02:19: Clustering example-batch/tyr_c230gt_50%.fq.gz...
|
|
260
|
-
# 2023-07-31 17:02:34: Classify example-batch/tyr_c230gt_01%.fq.gz...
|
|
261
|
-
# 2023-07-31 17:02:35: Classify example-batch/tyr_c230gt_10%.fq.gz...
|
|
262
|
-
# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_01%.fq.gz...
|
|
263
|
-
# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_10%.fq.gz...
|
|
264
|
-
# 2023-07-31 17:02:53: Consensus calling of example-batch/tyr_c230gt_50%.fq.gz...
|
|
265
|
-
# 2023-07-31 17:02:59: Output reports of example-batch/tyr_c230gt_50%.fq.gz...
|
|
266
|
-
# 2023-07-31 17:03:04: 🍵 example-batch/tyr_c230gt_50%.fq.gz is finished!
|
|
267
|
-
# 2023-07-31 17:03:39: Consensus calling of example-batch/tyr_c230gt_01%.fq.gz...
|
|
268
|
-
# 2023-07-31 17:03:51: Output reports of example-batch/tyr_c230gt_01%.fq.gz...
|
|
269
|
-
# 2023-07-31 17:04:03: 🍵 example-batch/tyr_c230gt_01%.fq.gz is finished!
|
|
270
|
-
# 2023-07-31 17:04:08: Consensus calling of example-batch/tyr_c230gt_10%.fq.gz...
|
|
271
|
-
# 2023-07-31 17:04:16: Output reports of example-batch/tyr_c230gt_10%.fq.gz...
|
|
272
|
-
# 2023-07-31 17:04:24: 🍵 example-batch/tyr_c230gt_10%.fq.gz is finished!
|
|
273
|
-
# 🎉 Finished! Open DAJIN_Results/tyr-substitution to see the report.
|
|
274
|
-
``` -->
|
|
275
240
|
|
|
276
241
|
## 📈 Report Contents
|
|
277
242
|
|
|
@@ -281,22 +246,22 @@ Inside the **DAJIN_Results** directory, the following files can be found:
|
|
|
281
246
|
```
|
|
282
247
|
DAJIN_Results/tyr-substitution
|
|
283
248
|
├── BAM
|
|
284
|
-
│ ├── tyr_c230gt_01
|
|
285
|
-
│ ├── tyr_c230gt_10
|
|
286
|
-
│ ├── tyr_c230gt_50
|
|
249
|
+
│ ├── tyr_c230gt_01
|
|
250
|
+
│ ├── tyr_c230gt_10
|
|
251
|
+
│ ├── tyr_c230gt_50
|
|
287
252
|
│ └── tyr_control
|
|
288
253
|
├── FASTA
|
|
289
|
-
│ ├── tyr_c230gt_01
|
|
290
|
-
│ ├── tyr_c230gt_10
|
|
291
|
-
│ └── tyr_c230gt_50
|
|
254
|
+
│ ├── tyr_c230gt_01
|
|
255
|
+
│ ├── tyr_c230gt_10
|
|
256
|
+
│ └── tyr_c230gt_50
|
|
292
257
|
├── HTML
|
|
293
|
-
│ ├── tyr_c230gt_01
|
|
294
|
-
│ ├── tyr_c230gt_10
|
|
295
|
-
│ └── tyr_c230gt_50
|
|
258
|
+
│ ├── tyr_c230gt_01
|
|
259
|
+
│ ├── tyr_c230gt_10
|
|
260
|
+
│ └── tyr_c230gt_50
|
|
296
261
|
├── MUTATION_INFO
|
|
297
|
-
│ ├── tyr_c230gt_01
|
|
298
|
-
│ ├── tyr_c230gt_10
|
|
299
|
-
│ └── tyr_c230gt_50
|
|
262
|
+
│ ├── tyr_c230gt_01.csv
|
|
263
|
+
│ ├── tyr_c230gt_10.csv
|
|
264
|
+
│ └── tyr_c230gt_50.csv
|
|
300
265
|
├── read_plot.html
|
|
301
266
|
├── read_plot.pdf
|
|
302
267
|
└── read_summary.xlsx
|
|
@@ -133,7 +133,7 @@ Options:
|
|
|
133
133
|
|
|
134
134
|
```bash
|
|
135
135
|
# Download example dataset
|
|
136
|
-
|
|
136
|
+
curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
|
|
137
137
|
tar -xf example_single.tar.gz
|
|
138
138
|
|
|
139
139
|
# Run DAJIN2
|
|
@@ -197,48 +197,13 @@ options:
|
|
|
197
197
|
|
|
198
198
|
```bash
|
|
199
199
|
# Download the example dataset
|
|
200
|
-
|
|
200
|
+
curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
|
|
201
201
|
tar -xf example_batch.tar.gz
|
|
202
202
|
|
|
203
203
|
# Run DAJIN2
|
|
204
204
|
DAJIN2 batch --file example_batch/batch.csv --threads 4
|
|
205
205
|
```
|
|
206
206
|
|
|
207
|
-
<!-- ```bash
|
|
208
|
-
# Donwload the example dataset
|
|
209
|
-
wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
|
|
210
|
-
tar -xf example_batch.tar.gz
|
|
211
|
-
|
|
212
|
-
# Run DAJIN2
|
|
213
|
-
DAJIN2 batch --file example-batch/batch.csv --threads 3
|
|
214
|
-
|
|
215
|
-
# 2023-07-31 17:01:10: example-batch/tyr_control.fq.gz is now processing...
|
|
216
|
-
# 2023-07-31 17:01:16: Preprocess example-batch/tyr_control.fq.gz...
|
|
217
|
-
# 2023-07-31 17:01:48: Output BAM files of example-batch/tyr_control.fq.gz...
|
|
218
|
-
# 2023-07-31 17:01:52: 🍵 example-batch/tyr_control.fq.gz is finished!
|
|
219
|
-
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_50%.fq.gz is now processing...
|
|
220
|
-
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_10%.fq.gz is now processing...
|
|
221
|
-
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_01%.fq.gz is now processing...
|
|
222
|
-
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_01%.fq.gz...
|
|
223
|
-
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_50%.fq.gz...
|
|
224
|
-
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_10%.fq.gz...
|
|
225
|
-
# 2023-07-31 17:02:17: Classify example-batch/tyr_c230gt_50%.fq.gz...
|
|
226
|
-
# 2023-07-31 17:02:19: Clustering example-batch/tyr_c230gt_50%.fq.gz...
|
|
227
|
-
# 2023-07-31 17:02:34: Classify example-batch/tyr_c230gt_01%.fq.gz...
|
|
228
|
-
# 2023-07-31 17:02:35: Classify example-batch/tyr_c230gt_10%.fq.gz...
|
|
229
|
-
# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_01%.fq.gz...
|
|
230
|
-
# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_10%.fq.gz...
|
|
231
|
-
# 2023-07-31 17:02:53: Consensus calling of example-batch/tyr_c230gt_50%.fq.gz...
|
|
232
|
-
# 2023-07-31 17:02:59: Output reports of example-batch/tyr_c230gt_50%.fq.gz...
|
|
233
|
-
# 2023-07-31 17:03:04: 🍵 example-batch/tyr_c230gt_50%.fq.gz is finished!
|
|
234
|
-
# 2023-07-31 17:03:39: Consensus calling of example-batch/tyr_c230gt_01%.fq.gz...
|
|
235
|
-
# 2023-07-31 17:03:51: Output reports of example-batch/tyr_c230gt_01%.fq.gz...
|
|
236
|
-
# 2023-07-31 17:04:03: 🍵 example-batch/tyr_c230gt_01%.fq.gz is finished!
|
|
237
|
-
# 2023-07-31 17:04:08: Consensus calling of example-batch/tyr_c230gt_10%.fq.gz...
|
|
238
|
-
# 2023-07-31 17:04:16: Output reports of example-batch/tyr_c230gt_10%.fq.gz...
|
|
239
|
-
# 2023-07-31 17:04:24: 🍵 example-batch/tyr_c230gt_10%.fq.gz is finished!
|
|
240
|
-
# 🎉 Finished! Open DAJIN_Results/tyr-substitution to see the report.
|
|
241
|
-
``` -->
|
|
242
207
|
|
|
243
208
|
## 📈 Report Contents
|
|
244
209
|
|
|
@@ -248,22 +213,22 @@ Inside the **DAJIN_Results** directory, the following files can be found:
|
|
|
248
213
|
```
|
|
249
214
|
DAJIN_Results/tyr-substitution
|
|
250
215
|
├── BAM
|
|
251
|
-
│ ├── tyr_c230gt_01
|
|
252
|
-
│ ├── tyr_c230gt_10
|
|
253
|
-
│ ├── tyr_c230gt_50
|
|
216
|
+
│ ├── tyr_c230gt_01
|
|
217
|
+
│ ├── tyr_c230gt_10
|
|
218
|
+
│ ├── tyr_c230gt_50
|
|
254
219
|
│ └── tyr_control
|
|
255
220
|
├── FASTA
|
|
256
|
-
│ ├── tyr_c230gt_01
|
|
257
|
-
│ ├── tyr_c230gt_10
|
|
258
|
-
│ └── tyr_c230gt_50
|
|
221
|
+
│ ├── tyr_c230gt_01
|
|
222
|
+
│ ├── tyr_c230gt_10
|
|
223
|
+
│ └── tyr_c230gt_50
|
|
259
224
|
├── HTML
|
|
260
|
-
│ ├── tyr_c230gt_01
|
|
261
|
-
│ ├── tyr_c230gt_10
|
|
262
|
-
│ └── tyr_c230gt_50
|
|
225
|
+
│ ├── tyr_c230gt_01
|
|
226
|
+
│ ├── tyr_c230gt_10
|
|
227
|
+
│ └── tyr_c230gt_50
|
|
263
228
|
├── MUTATION_INFO
|
|
264
|
-
│ ├── tyr_c230gt_01
|
|
265
|
-
│ ├── tyr_c230gt_10
|
|
266
|
-
│ └── tyr_c230gt_50
|
|
229
|
+
│ ├── tyr_c230gt_01.csv
|
|
230
|
+
│ ├── tyr_c230gt_10.csv
|
|
231
|
+
│ └── tyr_c230gt_50.csv
|
|
267
232
|
├── read_plot.html
|
|
268
233
|
├── read_plot.pdf
|
|
269
234
|
└── read_summary.xlsx
|
|
@@ -9,7 +9,7 @@ with open("requirements.txt") as requirements_file:
|
|
|
9
9
|
|
|
10
10
|
setuptools.setup(
|
|
11
11
|
name="DAJIN2",
|
|
12
|
-
version="0.4.3",
|
|
12
|
+
version="0.4.4",
|
|
13
13
|
author="Akihiro Kuno",
|
|
14
14
|
author_email="akuno@md.tsukuba.ac.jp",
|
|
15
15
|
description="One-step genotyping tools for targeted long-read sequencing",
|
|
@@ -39,17 +39,16 @@ def optimize_labels(X: spmatrix, coverage_sample: int, coverage_control: int) ->
|
|
|
39
39
|
# print(i, Counter(labels_sample), Counter(labels_control), Counter(labels_current)) # ! DEBUG
|
|
40
40
|
|
|
41
41
|
num_labels_control = count_number_of_clusters(labels_control, coverage_control)
|
|
42
|
-
|
|
42
|
+
rand_index = metrics.adjusted_rand_score(labels_previous, labels_current)
|
|
43
43
|
|
|
44
44
|
"""
|
|
45
45
|
Return the number of clusters when:
|
|
46
|
-
|
|
47
|
-
|
|
46
|
+
- the number of clusters in control is split into more than one.
|
|
47
|
+
- the adjusted Rand index between the current and previous labels is high enough (>= 0.95, i.e., the labelings are similar).
|
|
48
|
+
To reduce the allele number, previous labels are returned.
|
|
48
49
|
"""
|
|
49
|
-
if num_labels_control >= 2:
|
|
50
|
+
if num_labels_control >= 2 or rand_index >= 0.95:
|
|
50
51
|
return labels_previous
|
|
51
|
-
if 0.95 <= mutual_info <= 1.0:
|
|
52
|
-
return labels_current
|
|
53
52
|
labels_previous = labels_current
|
|
54
53
|
return labels_previous
|
|
55
54
|
|
|
@@ -58,11 +57,13 @@ def get_label_most_common(labels: list[int]) -> int:
|
|
|
58
57
|
return Counter(labels).most_common()[0][0]
|
|
59
58
|
|
|
60
59
|
|
|
61
|
-
def return_labels(
|
|
60
|
+
def return_labels(
|
|
61
|
+
path_score_sample: Path, path_score_control: Path, path_sample: Path, strand_bias_in_control: bool
|
|
62
|
+
) -> list[int]:
|
|
62
63
|
np.random.seed(seed=1)
|
|
63
64
|
score_control = list(io.read_jsonl(path_score_control))
|
|
64
65
|
X_control = csr_matrix(score_control)
|
|
65
|
-
|
|
66
|
+
"""Subset to 1000 reads of controls in the most common cluster to remove outliers and reduce computation time"""
|
|
66
67
|
labels_control = BisectingKMeans(n_clusters=2, random_state=1).fit_predict(X_control)
|
|
67
68
|
label_most_common = get_label_most_common(labels_control)
|
|
68
69
|
scores_control_subset = subset_scores(labels_control, io.read_jsonl(path_score_control), label_most_common, 1000)
|
|
@@ -71,7 +72,7 @@ def return_labels(path_score_sample: Path, path_score_control: Path, path_sample
|
|
|
71
72
|
coverage_sample = io.count_newlines(path_score_sample)
|
|
72
73
|
coverage_control = len(scores_control_subset)
|
|
73
74
|
labels = optimize_labels(X, coverage_sample, coverage_control)
|
|
74
|
-
|
|
75
|
-
if
|
|
75
|
+
"""Re-allocate clusters with strand bias to clusters without strand bias"""
|
|
76
|
+
if strand_bias_in_control is False:
|
|
76
77
|
labels = remove_biased_clusters(path_sample, path_score_sample, labels)
|
|
77
78
|
return labels
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Nanopore sequencing often yields strand-specific mutations even though the underlying mutation is not strand-specific; such mutations are considered sequencing errors and should be removed.
|
|
5
|
+
|
|
6
|
+
This module provides functions to determine whether each allele obtained after clustering is formed due to sequencing errors caused by strand bias.
|
|
7
|
+
|
|
8
|
+
Re-allocates reads belonging to clusters with strand bias to clusters without strand bias.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from collections import defaultdict
|
|
13
|
+
from sklearn.tree import DecisionTreeClassifier
|
|
14
|
+
|
|
15
|
+
from DAJIN2.utils import io
|
|
16
|
+
|
|
17
|
+
# Constants
|
|
18
|
+
STRAND_BIAS_LOWER_LIMIT = 0.1
|
|
19
|
+
STRAND_BIAS_UPPER_LIMIT = 0.9
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def is_strand_bias(path_control: Path) -> bool:
|
|
23
|
+
"""
|
|
24
|
+
Determines whether there is a strand bias in sequencing data
|
|
25
|
+
based on the distribution of '+' and '-' strands.
|
|
26
|
+
"""
|
|
27
|
+
count_strand = defaultdict(int)
|
|
28
|
+
for sample in io.read_jsonl(path_control):
|
|
29
|
+
count_strand[sample["STRAND"]] += 1
|
|
30
|
+
|
|
31
|
+
total = count_strand["+"] + count_strand["-"]
|
|
32
|
+
percentage_plus = count_strand["+"] / total if total > 0 else 0
|
|
33
|
+
|
|
34
|
+
return not (STRAND_BIAS_LOWER_LIMIT < percentage_plus < STRAND_BIAS_UPPER_LIMIT)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
###############################################################################
|
|
38
|
+
# Handle Strand bias
|
|
39
|
+
# # Clusters of reads with mutations with strand bias are merged into similar clusters without strand bias
|
|
40
|
+
###############################################################################
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def count_strand(labels: list[int], samples: list[dict[str, str]]) -> tuple[dict[str, int], dict[str, int]]:
|
|
44
|
+
"""Count the occurrences of each strand type by label."""
|
|
45
|
+
positive_strand_counts_by_labels = defaultdict(int)
|
|
46
|
+
total_counts_by_labels = defaultdict(int)
|
|
47
|
+
|
|
48
|
+
for label, sample in zip(labels, samples):
|
|
49
|
+
total_counts_by_labels[label] += 1
|
|
50
|
+
if sample["STRAND"] == "+":
|
|
51
|
+
positive_strand_counts_by_labels[label] += 1
|
|
52
|
+
|
|
53
|
+
return dict(positive_strand_counts_by_labels), dict(total_counts_by_labels)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def determine_strand_biases(
|
|
57
|
+
positive_strand_counts_by_labels: defaultdict, total_counts_by_labels: defaultdict
|
|
58
|
+
) -> dict[int, bool]:
|
|
59
|
+
"""Determine strand biases based on positive strand counts."""
|
|
60
|
+
strand_biases = {}
|
|
61
|
+
for label, total in total_counts_by_labels.items():
|
|
62
|
+
positive_strand_count = positive_strand_counts_by_labels[label]
|
|
63
|
+
strand_ratio = positive_strand_count / total
|
|
64
|
+
strand_biases[label] = not (STRAND_BIAS_LOWER_LIMIT < strand_ratio < STRAND_BIAS_UPPER_LIMIT)
|
|
65
|
+
|
|
66
|
+
return strand_biases
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def prepare_training_testing_sets(labels, scores, strand_biases) -> tuple[list, list, list]:
|
|
70
|
+
"""Prepare training and testing datasets based on strand biases."""
|
|
71
|
+
train_data, train_labels, test_data = [], [], []
|
|
72
|
+
for label, score in zip(labels, scores):
|
|
73
|
+
if strand_biases[label]:
|
|
74
|
+
test_data.append(score)
|
|
75
|
+
else:
|
|
76
|
+
train_data.append(score)
|
|
77
|
+
train_labels.append(label)
|
|
78
|
+
return train_data, train_labels, test_data
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def train_decision_tree(train_data, train_labels) -> DecisionTreeClassifier:
|
|
82
|
+
"""Train a decision tree classifier using the provided features and labels."""
|
|
83
|
+
dtree = DecisionTreeClassifier(random_state=1)
|
|
84
|
+
dtree.fit(train_data, train_labels)
|
|
85
|
+
return dtree
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def allocate_labels(labels: list[int], strand_biases: dict[str, bool], dtree, test_data) -> list[int]:
|
|
89
|
+
"""Re-allocates reads belonging to clusters with strand bias to clusters without strand bias."""
|
|
90
|
+
label_predictions = iter(dtree.predict(test_data))
|
|
91
|
+
for i, label in enumerate(labels):
|
|
92
|
+
if strand_biases[label]:
|
|
93
|
+
labels[i] = next(label_predictions)
|
|
94
|
+
return labels
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def remove_biased_clusters(path_sample: Path, path_score_sample: Path, labels: list[int]) -> list[int]:
|
|
98
|
+
"""Remove clusters with strand bias by re-labeling based on decision tree predictions.
|
|
99
|
+
Continue until at least one of the samples exhibits strand bias (i.e., do not calculate if all samples exhibit strand bias, or conversely, if none of the samples exhibit strand bias) or
|
|
100
|
+
1000 iterations are reached, which serves as a safeguard to prevent infinite loops.
|
|
101
|
+
"""
|
|
102
|
+
samples = io.read_jsonl(path_sample)
|
|
103
|
+
positive_strand_counts_by_labels, total_counts_by_labels = count_strand(labels, samples)
|
|
104
|
+
strand_biases = determine_strand_biases(positive_strand_counts_by_labels, total_counts_by_labels)
|
|
105
|
+
|
|
106
|
+
iteration_count = 0
|
|
107
|
+
labels_corrected = labels
|
|
108
|
+
while len(set(strand_biases.values())) > 1 or iteration_count < 1000:
|
|
109
|
+
scores = io.read_jsonl(path_score_sample)
|
|
110
|
+
train_data, train_labels, test_data = prepare_training_testing_sets(labels, scores, strand_biases)
|
|
111
|
+
dtree = train_decision_tree(train_data, train_labels)
|
|
112
|
+
labels_corrected = allocate_labels(labels, strand_biases, dtree, test_data)
|
|
113
|
+
strand_biases = determine_strand_biases(labels_corrected, path_sample)
|
|
114
|
+
iteration_count += 1
|
|
115
|
+
return labels_corrected
|
|
@@ -89,13 +89,13 @@ def cosine_similarity(x, y):
|
|
|
89
89
|
|
|
90
90
|
|
|
91
91
|
def identify_dissimilar_loci(values_sample, values_control, index: int, is_consensus: bool = False) -> int:
|
|
92
|
-
# If 'sample' has more than
|
|
93
|
-
|
|
94
|
-
if values_sample[index] - values_control[index] > threshold:
|
|
92
|
+
# If 'sample' has more than 20% variation compared to 'control' in consensus mode, unconditionally set it to 'dissimilar loci'. This is set to counteract cases where, when evaluating cosine similarity during significant deletions, values exceedingly close to 1 can occur even if not observed in the control (e.g., control = [1,1,1,1,1], sample = [100,100,100,100,100] -> cosine similarity = 1).
|
|
93
|
+
if is_consensus and values_sample[index] - values_control[index] > 20:
|
|
95
94
|
return True
|
|
96
95
|
|
|
97
|
-
|
|
98
|
-
|
|
96
|
+
# Subset 10 bases around index and add 1e-6 to avoid division by zero when calculating cosine similarity.
|
|
97
|
+
x = np.array(values_sample[index - 5 : index + 6]) + 1e-6
|
|
98
|
+
y = np.array(values_control[index - 5 : index + 6]) + 1e-6
|
|
99
99
|
|
|
100
100
|
return cosine_similarity(x, y) < 0.95
|
|
101
101
|
|
|
@@ -109,8 +109,8 @@ def detect_anomalies(values_sample, values_control, threshold: float, is_consens
|
|
|
109
109
|
|
|
110
110
|
values_subtract_reshaped = values_subtract.reshape(-1, 1)
|
|
111
111
|
kmeans = MiniBatchKMeans(n_clusters=2, random_state=0, n_init="auto").fit(values_subtract_reshaped)
|
|
112
|
-
|
|
113
|
-
candidate_loci = {i for i, v in enumerate(values_subtract_reshaped) if v >
|
|
112
|
+
threshold_kmeans = kmeans.cluster_centers_.mean()
|
|
113
|
+
candidate_loci = {i for i, v in enumerate(values_subtract_reshaped) if v > threshold_kmeans}
|
|
114
114
|
|
|
115
115
|
return {i for i in candidate_loci if identify_dissimilar_loci(values_sample, values_control, i, is_consensus)}
|
|
116
116
|
|
|
@@ -20,7 +20,7 @@ from DAJIN2.core import core
|
|
|
20
20
|
from DAJIN2.utils import io, config, report_generator, input_validator, multiprocess
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
DAJIN_VERSION = "0.4.3"
|
|
23
|
+
DAJIN_VERSION = "0.4.4"
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
def generate_report(name: str) -> None:
|
|
@@ -58,21 +58,21 @@ def execute_single_mode(arguments: dict[str]):
|
|
|
58
58
|
################################################################################
|
|
59
59
|
|
|
60
60
|
|
|
61
|
-
def
|
|
62
|
-
"""Validate the
|
|
63
|
-
|
|
64
|
-
|
|
61
|
+
def validate_headers_of_batch_file(headers: list, filepath: str) -> None:
|
|
62
|
+
"""Validate the headers of a batch file."""
|
|
63
|
+
required_headers = ["sample", "control", "allele", "name"]
|
|
64
|
+
accepted_headers = ["sample", "control", "allele", "name", "genome"]
|
|
65
65
|
|
|
66
|
-
if not set(
|
|
67
|
-
raise ValueError(f"{filepath} must contain {', '.join(
|
|
66
|
+
if not set(required_headers).issubset(set(headers)):
|
|
67
|
+
raise ValueError(f"{filepath} must contain {', '.join(required_headers)} in the header")
|
|
68
68
|
|
|
69
|
-
if not set(
|
|
70
|
-
raise ValueError(f"Accepted header names of {filepath} are {', '.join(
|
|
69
|
+
if not set(headers).issubset(accepted_headers):
|
|
70
|
+
raise ValueError(f"Accepted header names of {filepath} are {', '.join(accepted_headers)}.")
|
|
71
71
|
|
|
72
72
|
|
|
73
|
-
def create_argument_dict(
|
|
74
|
-
"""Create a dictionary of arguments from the given
|
|
75
|
-
args = dict(zip(
|
|
73
|
+
def create_argument_dict(headers: list, group: list, cache_urls_genome: dict, is_control: bool) -> dict:
|
|
74
|
+
"""Create a dictionary of arguments from the given headers and group."""
|
|
75
|
+
args = dict(zip(headers, group))
|
|
76
76
|
args["threads"] = 1 # Set the number of threads to 1 for batch mode
|
|
77
77
|
|
|
78
78
|
# Assign the "sample" field depending on whether it's a control or not
|
|
@@ -89,11 +89,11 @@ def create_argument_dict(columns: list, group: list, cache_urls_genome: dict, is
|
|
|
89
89
|
|
|
90
90
|
|
|
91
91
|
def run_DAJIN2(
|
|
92
|
-
groups: list,
|
|
92
|
+
groups: list, headers: list, cache_urls_genome: dict, is_control: bool = True, num_workers: int = 1
|
|
93
93
|
) -> None:
|
|
94
94
|
contents = []
|
|
95
95
|
for group in groups:
|
|
96
|
-
args = create_argument_dict(
|
|
96
|
+
args = create_argument_dict(headers, group, cache_urls_genome, is_control)
|
|
97
97
|
if args: # Add args to contents only if it's not an empty dict
|
|
98
98
|
contents.append(args)
|
|
99
99
|
|
|
@@ -117,17 +117,17 @@ def execute_batch_mode(arguments: dict[str]):
|
|
|
117
117
|
inputs = io.load_batchfile(path_batchfile)
|
|
118
118
|
|
|
119
119
|
# Validate Column of the batch file
|
|
120
|
-
|
|
121
|
-
|
|
120
|
+
headers = inputs[0]
|
|
121
|
+
validate_headers_of_batch_file(headers, path_batchfile)
|
|
122
122
|
|
|
123
123
|
# Validate contents and fetch genome urls
|
|
124
124
|
contents = inputs[1:]
|
|
125
125
|
cache_urls_genome = dict()
|
|
126
|
-
index_of_name =
|
|
126
|
+
index_of_name = headers.index("name")
|
|
127
127
|
contents.sort(key=lambda x: x[index_of_name])
|
|
128
128
|
for _, groups in groupby(contents, key=lambda x: x[index_of_name]):
|
|
129
129
|
for group in groups:
|
|
130
|
-
args = dict(zip(
|
|
130
|
+
args = dict(zip(headers, group))
|
|
131
131
|
# validate contents in the batch file
|
|
132
132
|
input_validator.validate_files(args["sample"], args["control"], args["allele"])
|
|
133
133
|
# validate genome and fetch urls
|
|
@@ -141,8 +141,8 @@ def execute_batch_mode(arguments: dict[str]):
|
|
|
141
141
|
config.set_logging(path_logfile)
|
|
142
142
|
groups = list(groups)
|
|
143
143
|
# Run DAJIN2
|
|
144
|
-
run_DAJIN2(groups,
|
|
145
|
-
run_DAJIN2(groups,
|
|
144
|
+
run_DAJIN2(groups, headers, cache_urls_genome, is_control=True, num_workers=arguments["threads"])
|
|
145
|
+
run_DAJIN2(groups, headers, cache_urls_genome, is_control=False, num_workers=arguments["threads"])
|
|
146
146
|
# Finish
|
|
147
147
|
generate_report(name)
|
|
148
148
|
shutil.move(path_logfile, Path("DAJIN_Results", name))
|
|
@@ -83,8 +83,8 @@ def write_xlsx(data: list[dict[str, str]], file_path: str | Path) -> None:
|
|
|
83
83
|
###########################################################
|
|
84
84
|
|
|
85
85
|
|
|
86
|
-
def
|
|
87
|
-
"""
|
|
86
|
+
def determine_file_type(file_path: str) -> str | None:
|
|
87
|
+
"""Determine if the file is an Excel or CSV file. Raise error for other types."""
|
|
88
88
|
file_extension = Path(file_path).suffix
|
|
89
89
|
if file_extension in [".xlsx", ".xls"]:
|
|
90
90
|
return "excel"
|
|
@@ -112,16 +112,18 @@ def read_xlsx(file_path: str | Path) -> list[dict[str, str]]:
|
|
|
112
112
|
def read_csv(file_path: str) -> list[dict[str, str]]:
|
|
113
113
|
"""Load data from a CSV file and return as a list."""
|
|
114
114
|
with open(file_path, "r") as csvfile:
|
|
115
|
-
|
|
115
|
+
inputs = []
|
|
116
116
|
for row in csv.reader(csvfile):
|
|
117
|
+
if not row: # Skip empty rows
|
|
118
|
+
continue
|
|
117
119
|
trimmed_row = [field.strip() for field in row]
|
|
118
|
-
|
|
119
|
-
return
|
|
120
|
+
inputs.append(trimmed_row)
|
|
121
|
+
return inputs
|
|
120
122
|
|
|
121
123
|
|
|
122
124
|
def load_batchfile(batchfile_path: str) -> list[dict[str, str]]:
|
|
123
125
|
"""Load data from either an Excel or CSV file."""
|
|
124
|
-
file_type =
|
|
126
|
+
file_type = determine_file_type(batchfile_path)
|
|
125
127
|
if file_type == "excel":
|
|
126
128
|
return read_xlsx(batchfile_path)
|
|
127
129
|
elif file_type == "csv":
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: DAJIN2
|
|
3
|
-
Version: 0.4.3
|
|
3
|
+
Version: 0.4.4
|
|
4
4
|
Summary: One-step genotyping tools for targeted long-read sequencing
|
|
5
5
|
Home-page: https://github.com/akikuno/DAJIN2
|
|
6
6
|
Author: Akihiro Kuno
|
|
@@ -166,7 +166,7 @@ Options:
|
|
|
166
166
|
|
|
167
167
|
```bash
|
|
168
168
|
# Download example dataset
|
|
169
|
-
|
|
169
|
+
curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
|
|
170
170
|
tar -xf example_single.tar.gz
|
|
171
171
|
|
|
172
172
|
# Run DAJIN2
|
|
@@ -230,48 +230,13 @@ options:
|
|
|
230
230
|
|
|
231
231
|
```bash
|
|
232
232
|
# Download the example dataset
|
|
233
|
-
|
|
233
|
+
curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
|
|
234
234
|
tar -xf example_batch.tar.gz
|
|
235
235
|
|
|
236
236
|
# Run DAJIN2
|
|
237
237
|
DAJIN2 batch --file example_batch/batch.csv --threads 4
|
|
238
238
|
```
|
|
239
239
|
|
|
240
|
-
<!-- ```bash
|
|
241
|
-
# Download the example dataset
|
|
242
|
-
wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
|
|
243
|
-
tar -xf example_batch.tar.gz
|
|
244
|
-
|
|
245
|
-
# Run DAJIN2
|
|
246
|
-
DAJIN2 batch --file example-batch/batch.csv --threads 3
|
|
247
|
-
|
|
248
|
-
# 2023-07-31 17:01:10: example-batch/tyr_control.fq.gz is now processing...
|
|
249
|
-
# 2023-07-31 17:01:16: Preprocess example-batch/tyr_control.fq.gz...
|
|
250
|
-
# 2023-07-31 17:01:48: Output BAM files of example-batch/tyr_control.fq.gz...
|
|
251
|
-
# 2023-07-31 17:01:52: 🍵 example-batch/tyr_control.fq.gz is finished!
|
|
252
|
-
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_50%.fq.gz is now processing...
|
|
253
|
-
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_10%.fq.gz is now processing...
|
|
254
|
-
# 2023-07-31 17:01:52: example-batch/tyr_c230gt_01%.fq.gz is now processing...
|
|
255
|
-
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_01%.fq.gz...
|
|
256
|
-
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_50%.fq.gz...
|
|
257
|
-
# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_10%.fq.gz...
|
|
258
|
-
# 2023-07-31 17:02:17: Classify example-batch/tyr_c230gt_50%.fq.gz...
|
|
259
|
-
# 2023-07-31 17:02:19: Clustering example-batch/tyr_c230gt_50%.fq.gz...
|
|
260
|
-
# 2023-07-31 17:02:34: Classify example-batch/tyr_c230gt_01%.fq.gz...
|
|
261
|
-
# 2023-07-31 17:02:35: Classify example-batch/tyr_c230gt_10%.fq.gz...
|
|
262
|
-
# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_01%.fq.gz...
|
|
263
|
-
# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_10%.fq.gz...
|
|
264
|
-
# 2023-07-31 17:02:53: Consensus calling of example-batch/tyr_c230gt_50%.fq.gz...
|
|
265
|
-
# 2023-07-31 17:02:59: Output reports of example-batch/tyr_c230gt_50%.fq.gz...
|
|
266
|
-
# 2023-07-31 17:03:04: 🍵 example-batch/tyr_c230gt_50%.fq.gz is finished!
|
|
267
|
-
# 2023-07-31 17:03:39: Consensus calling of example-batch/tyr_c230gt_01%.fq.gz...
|
|
268
|
-
# 2023-07-31 17:03:51: Output reports of example-batch/tyr_c230gt_01%.fq.gz...
|
|
269
|
-
# 2023-07-31 17:04:03: 🍵 example-batch/tyr_c230gt_01%.fq.gz is finished!
|
|
270
|
-
# 2023-07-31 17:04:08: Consensus calling of example-batch/tyr_c230gt_10%.fq.gz...
|
|
271
|
-
# 2023-07-31 17:04:16: Output reports of example-batch/tyr_c230gt_10%.fq.gz...
|
|
272
|
-
# 2023-07-31 17:04:24: 🍵 example-batch/tyr_c230gt_10%.fq.gz is finished!
|
|
273
|
-
# 🎉 Finished! Open DAJIN_Results/tyr-substitution to see the report.
|
|
274
|
-
``` -->
|
|
275
240
|
|
|
276
241
|
## 📈 Report Contents
|
|
277
242
|
|
|
@@ -281,22 +246,22 @@ Inside the **DAJIN_Results** directory, the following files can be found:
|
|
|
281
246
|
```
|
|
282
247
|
DAJIN_Results/tyr-substitution
|
|
283
248
|
├── BAM
|
|
284
|
-
│ ├── tyr_c230gt_01
|
|
285
|
-
│ ├── tyr_c230gt_10
|
|
286
|
-
│ ├── tyr_c230gt_50
|
|
249
|
+
│ ├── tyr_c230gt_01
|
|
250
|
+
│ ├── tyr_c230gt_10
|
|
251
|
+
│ ├── tyr_c230gt_50
|
|
287
252
|
│ └── tyr_control
|
|
288
253
|
├── FASTA
|
|
289
|
-
│ ├── tyr_c230gt_01
|
|
290
|
-
│ ├── tyr_c230gt_10
|
|
291
|
-
│ └── tyr_c230gt_50
|
|
254
|
+
│ ├── tyr_c230gt_01
|
|
255
|
+
│ ├── tyr_c230gt_10
|
|
256
|
+
│ └── tyr_c230gt_50
|
|
292
257
|
├── HTML
|
|
293
|
-
│ ├── tyr_c230gt_01
|
|
294
|
-
│ ├── tyr_c230gt_10
|
|
295
|
-
│ └── tyr_c230gt_50
|
|
258
|
+
│ ├── tyr_c230gt_01
|
|
259
|
+
│ ├── tyr_c230gt_10
|
|
260
|
+
│ └── tyr_c230gt_50
|
|
296
261
|
├── MUTATION_INFO
|
|
297
|
-
│ ├── tyr_c230gt_01
|
|
298
|
-
│ ├── tyr_c230gt_10
|
|
299
|
-
│ └── tyr_c230gt_50
|
|
262
|
+
│ ├── tyr_c230gt_01.csv
|
|
263
|
+
│ ├── tyr_c230gt_10.csv
|
|
264
|
+
│ └── tyr_c230gt_50.csv
|
|
300
265
|
├── read_plot.html
|
|
301
266
|
├── read_plot.pdf
|
|
302
267
|
└── read_summary.xlsx
|
|
@@ -1,113 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from collections import defaultdict, Counter
|
|
5
|
-
from sklearn.tree import DecisionTreeClassifier
|
|
6
|
-
|
|
7
|
-
from DAJIN2.utils import io
|
|
8
|
-
|
|
9
|
-
# Constants
|
|
10
|
-
STRAND_BIAS_LOWER_LIMIT = 0.1
|
|
11
|
-
STRAND_BIAS_UPPER_LIMIT = 0.9
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def is_strand_bias(path_control: Path) -> bool:
|
|
15
|
-
count_strand = defaultdict(int)
|
|
16
|
-
for m in io.read_jsonl(path_control):
|
|
17
|
-
count_strand[m["STRAND"]] += 1
|
|
18
|
-
|
|
19
|
-
total = count_strand["+"] + count_strand["-"]
|
|
20
|
-
percentage_plus = count_strand["+"] / total if total else 0
|
|
21
|
-
|
|
22
|
-
return not (STRAND_BIAS_LOWER_LIMIT < percentage_plus < STRAND_BIAS_UPPER_LIMIT)
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
###############################################################################
|
|
26
|
-
# Handle Strand bias
|
|
27
|
-
# # Clusters of reads with mutations with strand bias are merged into similar clusters without strand bias
|
|
28
|
-
###############################################################################
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def _count_strand(labels: list[int], samples: list[dict[str, str]]) -> tuple[defaultdict, defaultdict]:
|
|
32
|
-
"""Count the occurrences of each strand type by label."""
|
|
33
|
-
count_strand_by_labels = defaultdict(int)
|
|
34
|
-
total_count_by_labels = defaultdict(int)
|
|
35
|
-
|
|
36
|
-
for label, sample in zip(labels, samples):
|
|
37
|
-
total_count_by_labels[label] += 1
|
|
38
|
-
if sample["STRAND"] == "+":
|
|
39
|
-
count_strand_by_labels[label] += 1
|
|
40
|
-
|
|
41
|
-
return count_strand_by_labels, total_count_by_labels
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def _calculate_strand_biases(
|
|
45
|
-
count_strand_by_labels: defaultdict, total_count_by_labels: defaultdict
|
|
46
|
-
) -> dict[int, bool]:
|
|
47
|
-
"""Calculate strand biases based on strand counts."""
|
|
48
|
-
strand_biases = {}
|
|
49
|
-
for label, total in total_count_by_labels.items():
|
|
50
|
-
strand_count = count_strand_by_labels[label]
|
|
51
|
-
strand_ratio = strand_count / total
|
|
52
|
-
strand_biases[label] = not (STRAND_BIAS_LOWER_LIMIT < strand_ratio < STRAND_BIAS_UPPER_LIMIT)
|
|
53
|
-
|
|
54
|
-
return strand_biases
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
def _get_strand_biases_on_each_label(labels: list[int], path_sample: Path | str) -> dict[int, bool]:
|
|
58
|
-
"""Get strand biases for given labels and samples.
|
|
59
|
-
Args:
|
|
60
|
-
labels: A list of integer labels.
|
|
61
|
-
path_sample: The path to the sample file.
|
|
62
|
-
Returns:
|
|
63
|
-
A dictionary containing strand biases by label.
|
|
64
|
-
"""
|
|
65
|
-
samples = io.read_jsonl(path_sample)
|
|
66
|
-
count_strand_by_labels, total_count_by_labels = _count_strand(labels, samples)
|
|
67
|
-
return _calculate_strand_biases(count_strand_by_labels, total_count_by_labels)
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
def _prepare_training_testing_sets(labels, scores, strand_biases) -> tuple[list, list, list]:
|
|
71
|
-
x_train, y_train, x_test = [], [], []
|
|
72
|
-
for label, score in zip(labels, scores):
|
|
73
|
-
if strand_biases[label]:
|
|
74
|
-
x_test.append(score)
|
|
75
|
-
else:
|
|
76
|
-
x_train.append(score)
|
|
77
|
-
y_train.append(label)
|
|
78
|
-
return x_train, y_train, x_test
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
def _train_decision_tree(x_train, y_train) -> DecisionTreeClassifier:
|
|
82
|
-
dtree = DecisionTreeClassifier(random_state=1)
|
|
83
|
-
dtree.fit(x_train, y_train)
|
|
84
|
-
return dtree
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def _allocate_labels(labels, strand_biases, dtree, x_test) -> list[int]:
|
|
88
|
-
label_predictions = dtree.predict(x_test)
|
|
89
|
-
label_predict_iter = iter(label_predictions)
|
|
90
|
-
for i, label in enumerate(labels):
|
|
91
|
-
if strand_biases[label]:
|
|
92
|
-
labels[i] = next(label_predict_iter)
|
|
93
|
-
return labels
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def _correct_clusters_with_strand_bias(path_score_sample, labels, strand_biases) -> list[int]:
|
|
97
|
-
scores = io.read_jsonl(path_score_sample)
|
|
98
|
-
x_train, y_train, x_test = _prepare_training_testing_sets(labels, scores, strand_biases)
|
|
99
|
-
dtree = _train_decision_tree(x_train, y_train)
|
|
100
|
-
return _allocate_labels(labels, strand_biases, dtree, x_test)
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
def remove_biased_clusters(path_sample, path_score_sample, labels) -> list[int]:
|
|
104
|
-
strand_biases = _get_strand_biases_on_each_label(labels, path_sample)
|
|
105
|
-
# Until there is at least one True and one False or
|
|
106
|
-
# 1000 iterations (1000 is a suitable number to exit an infinite loop just in case)
|
|
107
|
-
i = 0
|
|
108
|
-
labels_corrected = labels
|
|
109
|
-
while len(Counter(strand_biases.values())) > 1 and i < 1000:
|
|
110
|
-
labels_corrected = _correct_clusters_with_strand_bias(path_score_sample, labels_corrected, strand_biases)
|
|
111
|
-
strand_biases = _get_strand_biases_on_each_label(labels_corrected, path_sample)
|
|
112
|
-
i += 1
|
|
113
|
-
return labels_corrected
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|