DAJIN2 0.4.2 (zip) → 0.4.4 (zip)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. {DAJIN2-0.4.2/src/DAJIN2.egg-info → dajin2-0.4.4}/PKG-INFO +38 -63
  2. {DAJIN2-0.4.2 → dajin2-0.4.4}/README.md +28 -53
  3. dajin2-0.4.4/requirements.txt +20 -0
  4. {DAJIN2-0.4.2 → dajin2-0.4.4}/setup.py +1 -1
  5. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/clustering.py +11 -10
  6. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/label_merger.py +20 -16
  7. dajin2-0.4.4/src/DAJIN2/core/clustering/strand_bias_handler.py +115 -0
  8. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/core.py +8 -8
  9. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/genome_fetcher.py +11 -3
  10. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/midsv_caller.py +3 -4
  11. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/mutation_extractor.py +7 -7
  12. dajin2-0.4.4/src/DAJIN2/core/report/__init__.py +3 -0
  13. DAJIN2-0.4.2/src/DAJIN2/core/report/report_bam.py → dajin2-0.4.4/src/DAJIN2/core/report/bam_exporter.py +64 -50
  14. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/main.py +20 -20
  15. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/io.py +14 -6
  16. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/sam_handler.py +0 -13
  17. {DAJIN2-0.4.2 → dajin2-0.4.4/src/DAJIN2.egg-info}/PKG-INFO +38 -63
  18. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2.egg-info/SOURCES.txt +3 -3
  19. dajin2-0.4.4/src/DAJIN2.egg-info/requires.txt +16 -0
  20. DAJIN2-0.4.2/requirements.txt +0 -20
  21. DAJIN2-0.4.2/src/DAJIN2/core/clustering/strand_bias_handler.py +0 -113
  22. DAJIN2-0.4.2/src/DAJIN2/core/report/__init__.py +0 -3
  23. DAJIN2-0.4.2/src/DAJIN2.egg-info/requires.txt +0 -16
  24. {DAJIN2-0.4.2 → dajin2-0.4.4}/LICENSE +0 -0
  25. {DAJIN2-0.4.2 → dajin2-0.4.4}/MANIFEST.in +0 -0
  26. {DAJIN2-0.4.2 → dajin2-0.4.4}/setup.cfg +0 -0
  27. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/__init__.py +0 -0
  28. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/__init__.py +0 -0
  29. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/classification/__init__.py +0 -0
  30. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/classification/allele_merger.py +0 -0
  31. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/classification/classifier.py +0 -0
  32. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/__init__.py +0 -0
  33. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/appender.py +0 -0
  34. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/kmer_generator.py +0 -0
  35. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/label_extractor.py +0 -0
  36. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/label_updator.py +0 -0
  37. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/score_handler.py +0 -0
  38. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/consensus/__init__.py +0 -0
  39. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/consensus/clust_formatter.py +0 -0
  40. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/consensus/consensus.py +0 -0
  41. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/consensus/mutation_extractor.py +0 -0
  42. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/consensus/name_handler.py +0 -0
  43. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/consensus/similarity_searcher.py +0 -0
  44. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/__init__.py +0 -0
  45. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/cache_checker.py +0 -0
  46. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/directory_manager.py +0 -0
  47. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/homopolymer_handler.py +0 -0
  48. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/input_formatter.py +0 -0
  49. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/insertions_to_fasta.py +0 -0
  50. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/knockin_handler.py +0 -0
  51. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/mapping.py +0 -0
  52. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/report/insertion_reflector.py +0 -0
  53. DAJIN2-0.4.2/src/DAJIN2/core/report/report_mutation.py → dajin2-0.4.4/src/DAJIN2/core/report/mutation_exporter.py +0 -0
  54. DAJIN2-0.4.2/src/DAJIN2/core/report/report_files.py → dajin2-0.4.4/src/DAJIN2/core/report/sequence_exporter.py +0 -0
  55. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/gui.py +0 -0
  56. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/static/css/style.css +0 -0
  57. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/template_igvjs.html +0 -0
  58. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/templates/index.html +0 -0
  59. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/config.py +0 -0
  60. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/cssplits_handler.py +0 -0
  61. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/dna_handler.py +0 -0
  62. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/fastx_handler.py +0 -0
  63. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/input_validator.py +0 -0
  64. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/multiprocess.py +0 -0
  65. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/utils/report_generator.py +0 -0
  66. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/view.py +0 -0
  67. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2.egg-info/dependency_links.txt +0 -0
  68. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2.egg-info/entry_points.txt +0 -0
  69. {DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2.egg-info/top_level.txt +0 -0

{DAJIN2-0.4.2/src/DAJIN2.egg-info → dajin2-0.4.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: DAJIN2
-Version: 0.4.2
+Version: 0.4.4
 Summary: One-step genotyping tools for targeted long-read sequencing
 Home-page: https://github.com/akikuno/DAJIN2
 Author: Akihiro Kuno
@@ -14,22 +14,22 @@ Classifier: Intended Audience :: Science/Research
 Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy>=1.20.0
-Requires-Dist: scipy>=1.6.0
+Requires-Dist: numpy>=1.24.0
+Requires-Dist: scipy>=1.10.0
 Requires-Dist: pandas>=1.0.0
-Requires-Dist: openpyxl>=3.0.0
-Requires-Dist: rapidfuzz>=3.0.0
-Requires-Dist: scikit-learn>=1.0.0
+Requires-Dist: openpyxl>=3.1.0
+Requires-Dist: rapidfuzz>=3.6.0
+Requires-Dist: scikit-learn>=1.3.0
 Requires-Dist: mappy>=2.24
-Requires-Dist: pysam>=0.19.0
+Requires-Dist: pysam>=0.21.0
 Requires-Dist: Flask>=2.2.0
 Requires-Dist: waitress>=2.1.0
 Requires-Dist: Jinja2>=3.1.0
-Requires-Dist: plotly>=5.0.0
+Requires-Dist: plotly>=5.19.0
 Requires-Dist: kaleido>=0.2.0
 Requires-Dist: cstag>=1.0.0
-Requires-Dist: midsv>=0.10.1
-Requires-Dist: wslPath>=0.3.0
+Requires-Dist: midsv>=0.11.0
+Requires-Dist: wslPath>=0.4.1

 [![License](https://img.shields.io/badge/License-MIT-9cf.svg)](https://choosealicense.com/licenses/mit/)
 [![Test](https://img.shields.io/github/actions/workflow/status/akikuno/dajin2/pytest.yml?branch=main&label=Test&color=brightgreen)](https://github.com/akikuno/dajin2/actions)
@@ -78,6 +78,7 @@ conda activate env-dajin2
 > CONDA_SUBDIR=osx-64 conda create -n env-dajin2 -c conda-forge -c bioconda python=3.10 DAJIN2 -y
 > conda activate env-dajin2
 > conda config --env --set subdir osx-64
+> python -c "import platform; print(platform.machine())" # Confirm that the output is 'x86_64', not 'arm64'
 > ```

 ### From [PyPI](https://pypi.org/project/DAJIN2/)
@@ -164,12 +165,17 @@ Options:
 #### Example

 ```bash
+# Download example dataset
+curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
+tar -xf example_single.tar.gz
+
+# Run DAJIN2
 DAJIN2 \
-    --control example/barcode01 \
-    --sample example/barcode02 \
-    --allele example/design.fa \
-    --name IL6-knockin \
-    --genome hg38 \
+    --control example_single/control \
+    --sample example_single/sample \
+    --allele example_single/stx2_deletion.fa \
+    --name stx2_deletion \
+    --genome mm39 \
     --threads 4
 ```

@@ -206,7 +212,6 @@ DAJIN2 \

 By using the `batch` subcommand, you can process multiple FASTQ files simultaneously.
 For this purpose, a CSV or Excel file consolidating the sample information is required.
-<!-- For a specific example, please refer to [this link](https://github.com/akikuno/DAJIN2/blob/main/examples/example-batch/batch.csv). -->

 > [!NOTE]
 > For guidance on how to compile sample information, please refer to [this document](https://docs.google.com/presentation/d/e/2PACX-1vSMEmXJPG2TNjfT66XZJRzqJd82aAqO5gJrdEzyhn15YBBr_Li-j5puOgVChYf3jA/embed?start=false&loop=false&delayms=3000).
@@ -224,44 +229,14 @@ options:
 #### Example

 ```bash
-DAJIN2 --file batch.csv --threads 4
-```
-
-<!-- ```bash
 # Donwload the example dataset
-wget https://github.com/akikuno/DAJIN2/raw/main/examples/example-batch.tar.gz
-tar -xf example-batch.tar.gz
+curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
+tar -xf example_batch.tar.gz

 # Run DAJIN2
-DAJIN2 batch --file example-batch/batch.csv --threads 3
-
-# 2023-07-31 17:01:10: example-batch/tyr_control.fq.gz is now processing...
-# 2023-07-31 17:01:16: Preprocess example-batch/tyr_control.fq.gz...
-# 2023-07-31 17:01:48: Output BAM files of example-batch/tyr_control.fq.gz...
-# 2023-07-31 17:01:52: 🍵 example-batch/tyr_control.fq.gz is finished!
-# 2023-07-31 17:01:52: example-batch/tyr_c230gt_50%.fq.gz is now processing...
-# 2023-07-31 17:01:52: example-batch/tyr_c230gt_10%.fq.gz is now processing...
-# 2023-07-31 17:01:52: example-batch/tyr_c230gt_01%.fq.gz is now processing...
-# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_01%.fq.gz...
-# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_50%.fq.gz...
-# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_10%.fq.gz...
-# 2023-07-31 17:02:17: Classify example-batch/tyr_c230gt_50%.fq.gz...
-# 2023-07-31 17:02:19: Clustering example-batch/tyr_c230gt_50%.fq.gz...
-# 2023-07-31 17:02:34: Classify example-batch/tyr_c230gt_01%.fq.gz...
-# 2023-07-31 17:02:35: Classify example-batch/tyr_c230gt_10%.fq.gz...
-# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_01%.fq.gz...
-# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_10%.fq.gz...
-# 2023-07-31 17:02:53: Consensus calling of example-batch/tyr_c230gt_50%.fq.gz...
-# 2023-07-31 17:02:59: Output reports of example-batch/tyr_c230gt_50%.fq.gz...
-# 2023-07-31 17:03:04: 🍵 example-batch/tyr_c230gt_50%.fq.gz is finished!
-# 2023-07-31 17:03:39: Consensus calling of example-batch/tyr_c230gt_01%.fq.gz...
-# 2023-07-31 17:03:51: Output reports of example-batch/tyr_c230gt_01%.fq.gz...
-# 2023-07-31 17:04:03: 🍵 example-batch/tyr_c230gt_01%.fq.gz is finished!
-# 2023-07-31 17:04:08: Consensus calling of example-batch/tyr_c230gt_10%.fq.gz...
-# 2023-07-31 17:04:16: Output reports of example-batch/tyr_c230gt_10%.fq.gz...
-# 2023-07-31 17:04:24: 🍵 example-batch/tyr_c230gt_10%.fq.gz is finished!
-# 🎉 Finished! Open DAJIN_Results/tyr-substitution to see the report.
-``` -->
+DAJIN2 batch --file example_batch/batch.csv --threads 4
+```
+

 ## 📈 Report Contents

@@ -271,22 +246,22 @@ Inside the **DAJIN_Results** directory, the following files can be found:
 ```
 DAJIN_Results/tyr-substitution
 ├── BAM
-│   ├── tyr_c230gt_01%
-│   ├── tyr_c230gt_10%
-│   ├── tyr_c230gt_50%
+│   ├── tyr_c230gt_01
+│   ├── tyr_c230gt_10
+│   ├── tyr_c230gt_50
 │   └── tyr_control
 ├── FASTA
-│   ├── tyr_c230gt_01%
-│   ├── tyr_c230gt_10%
-│   └── tyr_c230gt_50%
+│   ├── tyr_c230gt_01
+│   ├── tyr_c230gt_10
+│   └── tyr_c230gt_50
 ├── HTML
-│   ├── tyr_c230gt_01%
-│   ├── tyr_c230gt_10%
-│   └── tyr_c230gt_50%
+│   ├── tyr_c230gt_01
+│   ├── tyr_c230gt_10
+│   └── tyr_c230gt_50
 ├── MUTATION_INFO
-│   ├── tyr_c230gt_01%.csv
-│   ├── tyr_c230gt_10%.csv
-│   └── tyr_c230gt_50%.csv
+│   ├── tyr_c230gt_01.csv
+│   ├── tyr_c230gt_10.csv
+│   └── tyr_c230gt_50.csv
 ├── read_plot.html
 ├── read_plot.pdf
 └── read_summary.xlsx

{DAJIN2-0.4.2 → dajin2-0.4.4}/README.md

@@ -45,6 +45,7 @@ conda activate env-dajin2
 > CONDA_SUBDIR=osx-64 conda create -n env-dajin2 -c conda-forge -c bioconda python=3.10 DAJIN2 -y
 > conda activate env-dajin2
 > conda config --env --set subdir osx-64
+> python -c "import platform; print(platform.machine())" # Confirm that the output is 'x86_64', not 'arm64'
 > ```

 ### From [PyPI](https://pypi.org/project/DAJIN2/)
@@ -131,12 +132,17 @@ Options:
 #### Example

 ```bash
+# Download example dataset
+curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
+tar -xf example_single.tar.gz
+
+# Run DAJIN2
 DAJIN2 \
-    --control example/barcode01 \
-    --sample example/barcode02 \
-    --allele example/design.fa \
-    --name IL6-knockin \
-    --genome hg38 \
+    --control example_single/control \
+    --sample example_single/sample \
+    --allele example_single/stx2_deletion.fa \
+    --name stx2_deletion \
+    --genome mm39 \
     --threads 4
 ```

@@ -173,7 +179,6 @@ DAJIN2 \

 By using the `batch` subcommand, you can process multiple FASTQ files simultaneously.
 For this purpose, a CSV or Excel file consolidating the sample information is required.
-<!-- For a specific example, please refer to [this link](https://github.com/akikuno/DAJIN2/blob/main/examples/example-batch/batch.csv). -->

 > [!NOTE]
 > For guidance on how to compile sample information, please refer to [this document](https://docs.google.com/presentation/d/e/2PACX-1vSMEmXJPG2TNjfT66XZJRzqJd82aAqO5gJrdEzyhn15YBBr_Li-j5puOgVChYf3jA/embed?start=false&loop=false&delayms=3000).
@@ -191,44 +196,14 @@ options:
 #### Example

 ```bash
-DAJIN2 --file batch.csv --threads 4
-```
-
-<!-- ```bash
 # Donwload the example dataset
-wget https://github.com/akikuno/DAJIN2/raw/main/examples/example-batch.tar.gz
-tar -xf example-batch.tar.gz
+curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
+tar -xf example_batch.tar.gz

 # Run DAJIN2
-DAJIN2 batch --file example-batch/batch.csv --threads 3
-
-# 2023-07-31 17:01:10: example-batch/tyr_control.fq.gz is now processing...
-# 2023-07-31 17:01:16: Preprocess example-batch/tyr_control.fq.gz...
-# 2023-07-31 17:01:48: Output BAM files of example-batch/tyr_control.fq.gz...
-# 2023-07-31 17:01:52: 🍵 example-batch/tyr_control.fq.gz is finished!
-# 2023-07-31 17:01:52: example-batch/tyr_c230gt_50%.fq.gz is now processing...
-# 2023-07-31 17:01:52: example-batch/tyr_c230gt_10%.fq.gz is now processing...
-# 2023-07-31 17:01:52: example-batch/tyr_c230gt_01%.fq.gz is now processing...
-# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_01%.fq.gz...
-# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_50%.fq.gz...
-# 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_10%.fq.gz...
-# 2023-07-31 17:02:17: Classify example-batch/tyr_c230gt_50%.fq.gz...
-# 2023-07-31 17:02:19: Clustering example-batch/tyr_c230gt_50%.fq.gz...
-# 2023-07-31 17:02:34: Classify example-batch/tyr_c230gt_01%.fq.gz...
-# 2023-07-31 17:02:35: Classify example-batch/tyr_c230gt_10%.fq.gz...
-# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_01%.fq.gz...
-# 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_10%.fq.gz...
-# 2023-07-31 17:02:53: Consensus calling of example-batch/tyr_c230gt_50%.fq.gz...
-# 2023-07-31 17:02:59: Output reports of example-batch/tyr_c230gt_50%.fq.gz...
-# 2023-07-31 17:03:04: 🍵 example-batch/tyr_c230gt_50%.fq.gz is finished!
-# 2023-07-31 17:03:39: Consensus calling of example-batch/tyr_c230gt_01%.fq.gz...
-# 2023-07-31 17:03:51: Output reports of example-batch/tyr_c230gt_01%.fq.gz...
-# 2023-07-31 17:04:03: 🍵 example-batch/tyr_c230gt_01%.fq.gz is finished!
-# 2023-07-31 17:04:08: Consensus calling of example-batch/tyr_c230gt_10%.fq.gz...
-# 2023-07-31 17:04:16: Output reports of example-batch/tyr_c230gt_10%.fq.gz...
-# 2023-07-31 17:04:24: 🍵 example-batch/tyr_c230gt_10%.fq.gz is finished!
-# 🎉 Finished! Open DAJIN_Results/tyr-substitution to see the report.
-``` -->
+DAJIN2 batch --file example_batch/batch.csv --threads 4
+```
+

 ## 📈 Report Contents

@@ -238,22 +213,22 @@ Inside the **DAJIN_Results** directory, the following files can be found:
 ```
 DAJIN_Results/tyr-substitution
 ├── BAM
-│   ├── tyr_c230gt_01%
-│   ├── tyr_c230gt_10%
-│   ├── tyr_c230gt_50%
+│   ├── tyr_c230gt_01
+│   ├── tyr_c230gt_10
+│   ├── tyr_c230gt_50
 │   └── tyr_control
 ├── FASTA
-│   ├── tyr_c230gt_01%
-│   ├── tyr_c230gt_10%
-│   └── tyr_c230gt_50%
+│   ├── tyr_c230gt_01
+│   ├── tyr_c230gt_10
+│   └── tyr_c230gt_50
 ├── HTML
-│   ├── tyr_c230gt_01%
-│   ├── tyr_c230gt_10%
-│   └── tyr_c230gt_50%
+│   ├── tyr_c230gt_01
+│   ├── tyr_c230gt_10
+│   └── tyr_c230gt_50
 ├── MUTATION_INFO
-│   ├── tyr_c230gt_01%.csv
-│   ├── tyr_c230gt_10%.csv
-│   └── tyr_c230gt_50%.csv
+│   ├── tyr_c230gt_01.csv
+│   ├── tyr_c230gt_10.csv
+│   └── tyr_c230gt_50.csv
 ├── read_plot.html
 ├── read_plot.pdf
 └── read_summary.xlsx

dajin2-0.4.4/requirements.txt (new file)

@@ -0,0 +1,20 @@
+numpy >= 1.24.0
+scipy >= 1.10.0
+pandas >= 1.0.0
+openpyxl >= 3.1.0
+rapidfuzz >=3.6.0
+scikit-learn >= 1.3.0
+
+mappy >= 2.24
+pysam >= 0.21.0
+
+Flask >= 2.2.0
+waitress >= 2.1.0
+Jinja2 >= 3.1.0
+
+plotly >= 5.19.0
+kaleido >= 0.2.0
+
+cstag >= 1.0.0
+midsv >= 0.11.0
+wslPath >=0.4.1

{DAJIN2-0.4.2 → dajin2-0.4.4}/setup.py

@@ -9,7 +9,7 @@ with open("requirements.txt") as requirements_file:

 setuptools.setup(
     name="DAJIN2",
-    version="0.4.2",
+    version="0.4.4",
     author="Akihiro Kuno",
     author_email="akuno@md.tsukuba.ac.jp",
     description="One-step genotyping tools for targeted long-read sequencing",

{DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/clustering.py

@@ -39,17 +39,16 @@ def optimize_labels(X: spmatrix, coverage_sample: int, coverage_control: int) ->
         # print(i, Counter(labels_sample), Counter(labels_control), Counter(labels_current)) # ! DEBUG

         num_labels_control = count_number_of_clusters(labels_control, coverage_control)
-        mutual_info = metrics.adjusted_rand_score(labels_previous, labels_current)
+        rand_index = metrics.adjusted_rand_score(labels_previous, labels_current)

         """
         Return the number of clusters when:
-        - the number of clusters in control is split into more than one.
-        - the mutual information between the current and previous labels is high enough (= similar).
+        - the number of clusters in control is split into more than one.
+        - the mutual information between the current and previous labels is high enough (= similar).
+        To reduce the allele number, previous labels are returned.
         """
-        if num_labels_control >= 2:
+        if num_labels_control >= 2 or rand_index >= 0.95:
             return labels_previous
-        if 0.95 <= mutual_info <= 1.0:
-            return labels_current
         labels_previous = labels_current
     return labels_previous

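Note on the rewritten exit condition: iteration now stops either when the control splits into two or more clusters or when two consecutive clusterings agree (adjusted Rand index ≥ 0.95), and in both cases the previous labels are kept. A minimal sketch of that convergence test, assuming only scikit-learn (the toy labels below are invented):

```python
# Minimal sketch of the convergence test in optimize_labels (toy labels, not DAJIN2 data):
# when two consecutive clusterings agree (adjusted Rand index >= 0.95), iteration stops
# and the previous labels are kept.
from sklearn import metrics

labels_previous = [0, 0, 1, 1, 2, 2]
labels_current = [0, 0, 1, 1, 2, 2]  # identical partition -> ARI == 1.0

rand_index = metrics.adjusted_rand_score(labels_previous, labels_current)
print(rand_index >= 0.95)  # True -> labels_previous would be returned
```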
@@ -58,11 +57,13 @@ def get_label_most_common(labels: list[int]) -> int:
     return Counter(labels).most_common()[0][0]


-def return_labels(path_score_sample: Path, path_score_control: Path, path_sample: Path, strand_bias: bool) -> list[int]:
+def return_labels(
+    path_score_sample: Path, path_score_control: Path, path_sample: Path, strand_bias_in_control: bool
+) -> list[int]:
     np.random.seed(seed=1)
     score_control = list(io.read_jsonl(path_score_control))
     X_control = csr_matrix(score_control)
-    # subset to 1000 reads of controls in the most common cluster to remove outliers and reduce computation time
+    """Subset to 1000 reads of controls in the most common cluster to remove outliers and reduce computation time"""
     labels_control = BisectingKMeans(n_clusters=2, random_state=1).fit_predict(X_control)
     label_most_common = get_label_most_common(labels_control)
     scores_control_subset = subset_scores(labels_control, io.read_jsonl(path_score_control), label_most_common, 1000)
@@ -71,7 +72,7 @@ def return_labels(path_score_sample: Path, path_score_control: Path, path_sample
     coverage_sample = io.count_newlines(path_score_sample)
     coverage_control = len(scores_control_subset)
     labels = optimize_labels(X, coverage_sample, coverage_control)
-    # correct clusters with strand bias
-    if strand_bias is False:
+    """Re-allocate clusters with strand bias to clusters without strand bias"""
+    if strand_bias_in_control is False:
         labels = remove_biased_clusters(path_sample, path_score_sample, labels)
     return labels

{DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/clustering/label_merger.py

@@ -11,20 +11,6 @@ def calculate_label_percentages(labels: list[int]) -> dict[int, float]:
     return {label: (count / total_labels * 100) for label, count in label_counts.items()}


-def merge_mixed_cluster(labels_control: list[int], labels_sample: list[int], threshold: float = 0.5) -> list[int]:
-    """Merge labels in sample if they appear more than 'threshold' percentage in control."""
-    labels_merged = labels_sample.copy()
-    label_percentages_control = calculate_label_percentages(labels_control)
-    mixed_labels = {label for label, percent in label_percentages_control.items() if percent > threshold}
-
-    new_label = max(labels_merged) + 1
-    for i, label in enumerate(labels_sample):
-        if label in mixed_labels:
-            labels_merged[i] = new_label
-
-    return labels_merged
-
-
 def map_clusters_to_previous(labels_sample: list[int], labels_previous: list[int]) -> dict[int, int]:
     """
     Determine which cluster in labels_previous corresponds to each cluster in labels_sample.
@@ -63,6 +49,8 @@ def merge_minor_cluster(
     minor_labels_percentage = {label for label, percent in label_percentages.items() if percent < threshold_percentage}
     minor_labels_readnumber = {label for label, num in Counter(labels_sample).items() if num <= threshold_readnumber}
     minor_labels = minor_labels_percentage | minor_labels_readnumber
+    if minor_labels == set():
+        return labels_sample

     correspondence = map_clusters_to_previous(labels_sample, labels_previous)
     update_required_labels = get_update_required_labels(correspondence)
@@ -70,7 +58,23 @@ def merge_minor_cluster(
     labels_merged = labels_sample.copy()
     for m in minor_labels:
         new_label = max(labels_merged) + 1
-        labels_merged = [new_label if label in update_required_labels[correspondence[m]] else label for label in labels_merged]
+        labels_merged = [
+            new_label if label in update_required_labels[correspondence[m]] else label for label in labels_merged
+        ]
+
+    return labels_merged
+
+
+def merge_mixed_cluster(labels_control: list[int], labels_sample: list[int], threshold: float = 0.5) -> list[int]:
+    """Merge labels in sample if they appear more than 'threshold' percentage in control."""
+    labels_merged = labels_sample.copy()
+    label_percentages_control = calculate_label_percentages(labels_control)
+    mixed_labels = {label for label, percent in label_percentages_control.items() if percent > threshold}
+
+    new_label = max(labels_merged) + 1
+    for i, label in enumerate(labels_sample):
+        if label in mixed_labels:
+            labels_merged[i] = new_label

     return labels_merged

@@ -82,7 +86,7 @@

 def merge_labels(labels_control: list[int], labels_sample: list[int], labels_previous: list[int]) -> list[int]:
     labels_merged = merge_minor_cluster(
-        labels_sample, labels_previous, threshold_percentage=0.5, threshold_readnumber=10
+        labels_sample, labels_previous, threshold_percentage=0.5, threshold_readnumber=5
     )
     labels_merged = merge_mixed_cluster(labels_control, labels_merged)
     return labels_merged
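For context on the reordered functions above: `merge_mixed_cluster` collapses any sample cluster whose label also accounts for more than `threshold` percent of the control reads into one new label, on the assumption that such clusters are control-like noise. A toy illustration of that rule (the counts are invented for the example):

```python
# Toy illustration of merge_mixed_cluster's rule (invented counts, not DAJIN2 data):
# sample labels that exceed 0.5% of the control are treated as "mixed" and
# re-assigned to a single fresh label.
from collections import Counter

labels_control = [0] * 990 + [1] * 10  # label 1 is 1% of control (> 0.5%)
labels_sample = [0, 0, 1, 1, 2, 2]

percent_control = {k: v / len(labels_control) * 100 for k, v in Counter(labels_control).items()}
mixed_labels = {k for k, p in percent_control.items() if p > 0.5}  # {0, 1}
new_label = max(labels_sample) + 1
merged = [new_label if label in mixed_labels else label for label in labels_sample]
print(merged)  # [3, 3, 3, 3, 2, 2] -- only cluster 2 survives as sample-specific
```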

dajin2-0.4.4/src/DAJIN2/core/clustering/strand_bias_handler.py (new file)

@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+"""
+Nanopore sequencing results often results in strand specific mutations even though the mutation is not strand specific, thus they are considered as sequencing errors and should be removed.
+
+This module provides functions to determine whether each allele obtained after clustering is formed due to sequencing errors caused by strand bias.
+
+Re-allocates reads belonging to clusters with strand bias to clusters without strand bias.
+"""
+
+from pathlib import Path
+from collections import defaultdict
+from sklearn.tree import DecisionTreeClassifier
+
+from DAJIN2.utils import io
+
+# Constants
+STRAND_BIAS_LOWER_LIMIT = 0.1
+STRAND_BIAS_UPPER_LIMIT = 0.9
+
+
+def is_strand_bias(path_control: Path) -> bool:
+    """
+    Determines whether there is a strand bias in sequencing data
+    based on the distribution of '+' and '-' strands.
+    """
+    count_strand = defaultdict(int)
+    for sample in io.read_jsonl(path_control):
+        count_strand[sample["STRAND"]] += 1
+
+    total = count_strand["+"] + count_strand["-"]
+    percentage_plus = count_strand["+"] / total if total > 0 else 0
+
+    return not (STRAND_BIAS_LOWER_LIMIT < percentage_plus < STRAND_BIAS_UPPER_LIMIT)
+
+
+###############################################################################
+# Handle Strand bias
+# # Clusters of reads with mutations with strand bias are merged into similar clusters without strand bias
+###############################################################################
+
+
+def count_strand(labels: list[int], samples: list[dict[str, str]]) -> tuple[dict[str, int], dict[str, int]]:
+    """Count the occurrences of each strand type by label."""
+    positive_strand_counts_by_labels = defaultdict(int)
+    total_counts_by_labels = defaultdict(int)
+
+    for label, sample in zip(labels, samples):
+        total_counts_by_labels[label] += 1
+        if sample["STRAND"] == "+":
+            positive_strand_counts_by_labels[label] += 1
+
+    return dict(positive_strand_counts_by_labels), dict(total_counts_by_labels)
+
+
+def determine_strand_biases(
+    positive_strand_counts_by_labels: defaultdict, total_counts_by_labels: defaultdict
+) -> dict[int, bool]:
+    """Determine strand biases based on positive strand counts."""
+    strand_biases = {}
+    for label, total in total_counts_by_labels.items():
+        positive_strand_count = positive_strand_counts_by_labels[label]
+        strand_ratio = positive_strand_count / total
+        strand_biases[label] = not (STRAND_BIAS_LOWER_LIMIT < strand_ratio < STRAND_BIAS_UPPER_LIMIT)
+
+    return strand_biases
+
+
+def prepare_training_testing_sets(labels, scores, strand_biases) -> tuple[list, list, list]:
+    """Prepare training and testing datasets based on strand biases."""
+    train_data, train_labels, test_data = [], [], []
+    for label, score in zip(labels, scores):
+        if strand_biases[label]:
+            test_data.append(score)
+        else:
+            train_data.append(score)
+            train_labels.append(label)
+    return train_data, train_labels, test_data
+
+
+def train_decision_tree(train_data, train_labels) -> DecisionTreeClassifier:
+    """Train a decision tree classifier using the provided features and labels."""
+    dtree = DecisionTreeClassifier(random_state=1)
+    dtree.fit(train_data, train_labels)
+    return dtree
+
+
+def allocate_labels(labels: list[int], strand_biases: dict[str, bool], dtree, test_data) -> list[int]:
+    """Re-allocates reads belonging to clusters with strand bias to clusters without strand bias."""
+    label_predictions = iter(dtree.predict(test_data))
+    for i, label in enumerate(labels):
+        if strand_biases[label]:
+            labels[i] = next(label_predictions)
+    return labels
+
+
+def remove_biased_clusters(path_sample: Path, path_score_sample: Path, labels: list[int]) -> list[int]:
+    """Remove clusters with strand bias by re-labeling based on decision tree predictions.
+    Continue until at least one of the samples exhibits strand bias (i.e., do not calculate if all samples exhibit strand bias, or conversely, if none of the samples exhibit strand bias) or
+    1000 iterations are reached, which serves as a safeguard to prevent infinite loops.
+    """
+    samples = io.read_jsonl(path_sample)
+    positive_strand_counts_by_labels, total_counts_by_labels = count_strand(labels, samples)
+    strand_biases = determine_strand_biases(positive_strand_counts_by_labels, total_counts_by_labels)
+
+    iteration_count = 0
+    labels_corrected = labels
+    while len(set(strand_biases.values())) > 1 or iteration_count < 1000:
+        scores = io.read_jsonl(path_score_sample)
+        train_data, train_labels, test_data = prepare_training_testing_sets(labels, scores, strand_biases)
+        dtree = train_decision_tree(train_data, train_labels)
+        labels_corrected = allocate_labels(labels, strand_biases, dtree, test_data)
+        strand_biases = determine_strand_biases(labels_corrected, path_sample)
+        iteration_count += 1
+    return labels_corrected
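The heart of the new module is the hand-off from `prepare_training_testing_sets` to `train_decision_tree` to `allocate_labels`: reads from unbiased clusters train a decision tree, which then re-labels the reads stranded in biased clusters. A self-contained sketch of that re-allocation step on synthetic score vectors (DAJIN2 itself streams scores and strand flags from JSONL files):

```python
# Sketch of the re-allocation step with synthetic data (not DAJIN2's data model):
# reads in unbiased clusters become training data; reads in a biased cluster
# are re-assigned to whichever unbiased cluster the tree predicts.
from sklearn.tree import DecisionTreeClassifier

scores = [[1, 0], [1, 0], [0, 1], [0, 1], [1, 0]]  # per-read feature vectors
labels = [0, 0, 1, 1, 2]                           # cluster 2 is strand-biased
strand_biases = {0: False, 1: False, 2: True}

train_data = [s for s, l in zip(scores, labels) if not strand_biases[l]]
train_labels = [l for l in labels if not strand_biases[l]]
test_data = [s for s, l in zip(scores, labels) if strand_biases[l]]

dtree = DecisionTreeClassifier(random_state=1)
dtree.fit(train_data, train_labels)

predictions = iter(dtree.predict(test_data))
labels = [int(next(predictions)) if strand_biases[l] else l for l in labels]
print(labels)  # [0, 0, 1, 1, 0] -- the biased read joins cluster 0
```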

{DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/core.py

@@ -70,8 +70,8 @@ def execute_control(arguments: dict):
     # Output BAM files
     ###########################################################
     logger.info(f"Output BAM files of {arguments['control']}...")
-    report.report_bam.export_to_bam(
-        ARGS.tempdir, ARGS.control_name, ARGS.genome_coordinates, ARGS.threads, is_control=True
+    report.bam_exporter.export_to_bam(
+        ARGS.tempdir, ARGS.control_name, ARGS.genome_coordinates, ARGS.threads, ARGS.uuid, is_control=True
     )
     ###########################################################
     # Finish call
@@ -204,15 +204,15 @@ def execute_sample(arguments: dict):
     # RESULT
     io.write_jsonl(RESULT_SAMPLE, Path(ARGS.tempdir, "result", f"{ARGS.sample_name}.jsonl"))
     # FASTA
-    report.report_files.export_to_fasta(ARGS.tempdir, ARGS.sample_name, cons_sequence)
-    report.report_files.export_reference_to_fasta(ARGS.tempdir, ARGS.sample_name)
+    report.sequence_exporter.export_to_fasta(ARGS.tempdir, ARGS.sample_name, cons_sequence)
+    report.sequence_exporter.export_reference_to_fasta(ARGS.tempdir, ARGS.sample_name)
     # HTML
-    report.report_files.export_to_html(ARGS.tempdir, ARGS.sample_name, cons_percentage)
+    report.sequence_exporter.export_to_html(ARGS.tempdir, ARGS.sample_name, cons_percentage)
     # CSV (Allele Info)
-    report.report_mutation.export_to_csv(ARGS.tempdir, ARGS.sample_name, ARGS.genome_coordinates, cons_percentage)
+    report.mutation_exporter.export_to_csv(ARGS.tempdir, ARGS.sample_name, ARGS.genome_coordinates, cons_percentage)
     # BAM
-    report.report_bam.export_to_bam(
-        ARGS.tempdir, ARGS.sample_name, ARGS.genome_coordinates, ARGS.threads, RESULT_SAMPLE
+    report.bam_exporter.export_to_bam(
+        ARGS.tempdir, ARGS.sample_name, ARGS.genome_coordinates, ARGS.threads, ARGS.uuid, RESULT_SAMPLE
     )
     for path_bam_igvjs in Path(ARGS.tempdir, "cache", ".igvjs").glob(f"{ARGS.control_name}_control.bam*"):
         shutil.copy(path_bam_igvjs, Path(ARGS.tempdir, "report", ".igvjs", ARGS.sample_name))

{DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/genome_fetcher.py

@@ -5,11 +5,19 @@ from urllib.request import urlopen

 def fetch_seq_coordinates(genome: str, blat_url: str, seq: str) -> dict:
     url = f"{blat_url}?db={genome}&type=BLAT&userSeq={seq}"
-    response = urlopen(url).read().decode("utf8").split("\n")
-    matches = [x for x in response if "100.0%" in x]
+    records = urlopen(url).read().decode("utf8").split("\n")
+    matches = []
+    for record in records:
+        if "100.0%" not in record:
+            continue
+        record_trim = [r for r in record.split(" ") if r]
+        if record_trim[-1] == str(len(seq)):
+            matches = record_trim
+
     if not matches:
         raise ValueError(f"{seq[:60]}... is not found in {genome}")
-    chrom, strand, start, end, _ = matches[0].split()[-5:]
+
+    chrom, strand, start, end, _ = matches[-5:]
     return {"chrom": chrom, "strand": strand, "start": int(start), "end": int(end)}


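The old one-liner kept the first line containing `100.0%`; the rewritten loop also requires the hit to cover the full query, by checking that the last whitespace-delimited field equals `len(seq)`. A sketch of the filter on fabricated BLAT-like records (the column layout here is illustrative only, not a guaranteed BLAT output format):

```python
# Sketch of the tightened filter on fabricated BLAT-like lines: keep a 100.0%
# match only when its last field (the match span) equals the query length.
seq = "A" * 500
records = [
    "browser details YourSeq 120 1 120 500 100.0% chr3 - 12000000 12000120 120",
    "browser details YourSeq 500 1 500 500 100.0% chr5 + 75000000 75000500 500",
]

matches = []
for record in records:
    if "100.0%" not in record:
        continue
    record_trim = [r for r in record.split(" ") if r]
    if record_trim[-1] == str(len(seq)):  # full-length hits only
        matches = record_trim

chrom, strand, start, end, _ = matches[-5:]
print(chrom, strand, start, end)  # chr5 + 75000000 75000500
```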

{DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/midsv_caller.py

@@ -8,8 +8,7 @@ from itertools import chain, groupby

 from collections import Counter

-from DAJIN2.utils import sam_handler
-from DAJIN2.utils import cssplits_handler
+from DAJIN2.utils import io, sam_handler, cssplits_handler


 def has_inversion_in_splice(CIGAR: str) -> bool:
@@ -215,8 +214,8 @@ def generate_midsv(ARGS, is_control: bool = False, is_insertion: bool = False) -
         path_splice = Path(ARGS.tempdir, name, "sam", f"splice_{allele}.sam")
         path_output_midsv = Path(ARGS.tempdir, name, "midsv", f"{allele}.json")

-        sam_ont = sam_handler.remove_overlapped_reads(list(sam_handler.read_sam(path_ont)))
-        sam_splice = sam_handler.remove_overlapped_reads(list(sam_handler.read_sam(path_splice)))
+        sam_ont = sam_handler.remove_overlapped_reads(list(io.read_sam(path_ont)))
+        sam_splice = sam_handler.remove_overlapped_reads(list(io.read_sam(path_splice)))
         qname_of_map_ont = extract_qname_of_map_ont(sam_ont, sam_splice)
         sam_of_map_ont = filter_sam_by_preset(sam_ont, qname_of_map_ont, preset="map-ont")
         sam_of_splice = filter_sam_by_preset(sam_splice, qname_of_map_ont, preset="splice")

{DAJIN2-0.4.2 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/mutation_extractor.py

@@ -89,13 +89,13 @@ def cosine_similarity(x, y):


 def identify_dissimilar_loci(values_sample, values_control, index: int, is_consensus: bool = False) -> int:
-    # If 'sample' has more than X% variation compared to 'control', unconditionally set it to "dissimilar loci"
-    threshold = 20 if is_consensus else 5
-    if values_sample[index] - values_control[index] > threshold:
+    # If 'sample' has more than 20% variation compared to 'control' in consensus mode, unconditionally set it to 'dissimilar loci'. This is set to counteract cases where, when evaluating cosine similarity during significant deletions, values exceedingly close to 1 can occur even if not observed in the control (e.g., control = [1,1,1,1,1], sample = [100,100,100,100,100] -> cosine similarity = 1).
+    if is_consensus and values_sample[index] - values_control[index] > 20:
         return True

-    x = values_sample[index - 5 : index + 6]
-    y = values_control[index - 5 : index + 6]
+    # Subset 10 bases around index and add 1e-6 to avoid division by zero when calculating cosine similarity.
+    x = np.array(values_sample[index - 5 : index + 6]) + 1e-6
+    y = np.array(values_control[index - 5 : index + 6]) + 1e-6

     return cosine_similarity(x, y) < 0.95

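The added `1e-6` pseudocount keeps the windowed comparison defined when a window is all zeros, where a plain cosine similarity divides by zero. A small sketch, assuming the usual dot-product definition of `cosine_similarity` (the hunk header names the function but its body is not shown here):

```python
# Why the 1e-6 pseudocount matters: an all-zero window has norm 0, so plain
# cosine similarity is 0/0 (nan); the epsilon leaves normal windows unchanged.
import numpy as np


def cosine_similarity(x, y):  # assumed definition, matching the usual formula
    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))


window_sample = np.array([5.0, 6.0, 5.0, 6.0, 5.0])
window_control = np.zeros(5)

print(cosine_similarity(window_sample, window_control))                # nan
print(cosine_similarity(window_sample + 1e-6, window_control + 1e-6))  # ~1.0
```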
@@ -109,8 +109,8 @@ def detect_anomalies(values_sample, values_control, threshold: float, is_consens

     values_subtract_reshaped = values_subtract.reshape(-1, 1)
     kmeans = MiniBatchKMeans(n_clusters=2, random_state=0, n_init="auto").fit(values_subtract_reshaped)
-    threshold = kmeans.cluster_centers_.mean()
-    candidate_loci = {i for i, v in enumerate(values_subtract_reshaped) if v > threshold}
+    threshold_kmeans = kmeans.cluster_centers_.mean()
+    candidate_loci = {i for i, v in enumerate(values_subtract_reshaped) if v > threshold_kmeans}

     return {i for i in candidate_loci if identify_dissimilar_loci(values_sample, values_control, i, is_consensus)}


dajin2-0.4.4/src/DAJIN2/core/report/__init__.py (new file)

@@ -0,0 +1,3 @@
+from DAJIN2.core.report import bam_exporter
+from DAJIN2.core.report import sequence_exporter
+from DAJIN2.core.report import mutation_exporter