DAJIN2 0.4.1__zip → 0.4.3__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. {DAJIN2-0.4.1/src/DAJIN2.egg-info → DAJIN2-0.4.3}/PKG-INFO +41 -36
  2. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/README.md +30 -23
  3. DAJIN2-0.4.3/requirements.txt +20 -0
  4. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/setup.py +1 -1
  5. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/label_merger.py +20 -16
  6. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/consensus/consensus.py +3 -2
  7. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/consensus/name_handler.py +1 -7
  8. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/core.py +20 -123
  9. DAJIN2-0.4.3/src/DAJIN2/core/preprocess/__init__.py +9 -0
  10. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/genome_fetcher.py +11 -3
  11. DAJIN2-0.4.3/src/DAJIN2/core/preprocess/input_formatter.py +109 -0
  12. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/mapping.py +4 -0
  13. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/midsv_caller.py +3 -4
  14. DAJIN2-0.4.3/src/DAJIN2/core/report/__init__.py +3 -0
  15. DAJIN2-0.4.1/src/DAJIN2/core/report/report_bam.py → DAJIN2-0.4.3/src/DAJIN2/core/report/bam_exporter.py +64 -50
  16. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/main.py +1 -1
  17. DAJIN2-0.4.3/src/DAJIN2/utils/fastx_handler.py +94 -0
  18. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/utils/input_validator.py +32 -21
  19. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/utils/io.py +6 -0
  20. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/utils/sam_handler.py +1 -0
  21. {DAJIN2-0.4.1 → DAJIN2-0.4.3/src/DAJIN2.egg-info}/PKG-INFO +41 -36
  22. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2.egg-info/SOURCES.txt +5 -5
  23. DAJIN2-0.4.3/src/DAJIN2.egg-info/requires.txt +16 -0
  24. DAJIN2-0.4.1/requirements.txt +0 -23
  25. DAJIN2-0.4.1/src/DAJIN2/core/preprocess/__init__.py +0 -12
  26. DAJIN2-0.4.1/src/DAJIN2/core/preprocess/fastx_parser.py +0 -59
  27. DAJIN2-0.4.1/src/DAJIN2/core/report/__init__.py +0 -3
  28. DAJIN2-0.4.1/src/DAJIN2/utils/fastx_handler.py +0 -42
  29. DAJIN2-0.4.1/src/DAJIN2.egg-info/requires.txt +0 -18
  30. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/LICENSE +0 -0
  31. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/MANIFEST.in +0 -0
  32. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/setup.cfg +0 -0
  33. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/__init__.py +0 -0
  34. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/__init__.py +0 -0
  35. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/classification/__init__.py +0 -0
  36. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/classification/allele_merger.py +0 -0
  37. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/classification/classifier.py +0 -0
  38. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/__init__.py +0 -0
  39. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/appender.py +0 -0
  40. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/clustering.py +0 -0
  41. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/kmer_generator.py +0 -0
  42. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/label_extractor.py +0 -0
  43. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/label_updator.py +0 -0
  44. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/score_handler.py +0 -0
  45. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/strand_bias_handler.py +0 -0
  46. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/consensus/__init__.py +0 -0
  47. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/consensus/clust_formatter.py +0 -0
  48. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/consensus/mutation_extractor.py +0 -0
  49. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/consensus/similarity_searcher.py +0 -0
  50. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/cache_checker.py +0 -0
  51. /DAJIN2-0.4.1/src/DAJIN2/core/preprocess/directories.py → /DAJIN2-0.4.3/src/DAJIN2/core/preprocess/directory_manager.py +0 -0
  52. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/homopolymer_handler.py +0 -0
  53. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/insertions_to_fasta.py +0 -0
  54. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/knockin_handler.py +0 -0
  55. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/mutation_extractor.py +0 -0
  56. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/core/report/insertion_reflector.py +0 -0
  57. /DAJIN2-0.4.1/src/DAJIN2/core/report/report_mutation.py → /DAJIN2-0.4.3/src/DAJIN2/core/report/mutation_exporter.py +0 -0
  58. /DAJIN2-0.4.1/src/DAJIN2/core/report/report_files.py → /DAJIN2-0.4.3/src/DAJIN2/core/report/sequence_exporter.py +0 -0
  59. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/gui.py +0 -0
  60. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/static/css/style.css +0 -0
  61. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/template_igvjs.html +0 -0
  62. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/templates/index.html +0 -0
  63. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/utils/config.py +0 -0
  64. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/utils/cssplits_handler.py +0 -0
  65. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/utils/dna_handler.py +0 -0
  66. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/utils/multiprocess.py +0 -0
  67. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/utils/report_generator.py +0 -0
  68. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2/view.py +0 -0
  69. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2.egg-info/dependency_links.txt +0 -0
  70. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2.egg-info/entry_points.txt +0 -0
  71. {DAJIN2-0.4.1 → DAJIN2-0.4.3}/src/DAJIN2.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: DAJIN2
3
- Version: 0.4.1
3
+ Version: 0.4.3
4
4
  Summary: One-step genotyping tools for targeted long-read sequencing
5
5
  Home-page: https://github.com/akikuno/DAJIN2
6
6
  Author: Akihiro Kuno
@@ -14,24 +14,22 @@ Classifier: Intended Audience :: Science/Research
14
14
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
15
15
  Description-Content-Type: text/markdown
16
16
  License-File: LICENSE
17
- Requires-Dist: numpy>=1.20.0
18
- Requires-Dist: scipy>=1.6.0
17
+ Requires-Dist: numpy>=1.24.0
18
+ Requires-Dist: scipy>=1.10.0
19
19
  Requires-Dist: pandas>=1.0.0
20
- Requires-Dist: openpyxl>=3.0.0
21
- Requires-Dist: rapidfuzz>=3.0.0
22
- Requires-Dist: statsmodels>=0.13.5
23
- Requires-Dist: scikit-learn>=1.0.0
24
- Requires-Dist: openpyxl>=3.0.0
20
+ Requires-Dist: openpyxl>=3.1.0
21
+ Requires-Dist: rapidfuzz>=3.6.0
22
+ Requires-Dist: scikit-learn>=1.3.0
25
23
  Requires-Dist: mappy>=2.24
26
- Requires-Dist: pysam>=0.19.0
24
+ Requires-Dist: pysam>=0.21.0
27
25
  Requires-Dist: Flask>=2.2.0
28
26
  Requires-Dist: waitress>=2.1.0
29
27
  Requires-Dist: Jinja2>=3.1.0
30
- Requires-Dist: plotly>=5.0.0
28
+ Requires-Dist: plotly>=5.19.0
31
29
  Requires-Dist: kaleido>=0.2.0
32
- Requires-Dist: cstag>=0.4.1
33
- Requires-Dist: midsv>=0.10.1
34
- Requires-Dist: wslPath>=0.3.0
30
+ Requires-Dist: cstag>=1.0.0
31
+ Requires-Dist: midsv>=0.11.0
32
+ Requires-Dist: wslPath>=0.4.1
35
33
 
36
34
  [![License](https://img.shields.io/badge/License-MIT-9cf.svg)](https://choosealicense.com/licenses/mit/)
37
35
  [![Test](https://img.shields.io/github/actions/workflow/status/akikuno/dajin2/pytest.yml?branch=main&label=Test&color=brightgreen)](https://github.com/akikuno/dajin2/actions)
@@ -56,14 +54,14 @@ The name DAJIN is derived from the phrase 一網**打尽** (Ichimou **DAJIN** or
56
54
  + **Comprehensive Mutation Detection**: Equipped with the capability to detect genome editing events over a wide range, it can identify a broad spectrum of mutations, from small changes to large structural variations.
57
55
  + DAJIN2 can also detect complex mutations characteristic of genome editing, such as "insertions occurring in regions where deletions have occurred."
58
56
  + **Intuitive Visualization**: The outcomes of genome editing are visualized intuitively, allowing for the rapid and easy identification and analysis of mutations.
59
- + **Multi-Sample Compatibility**: Accommodates a variety of samples, enabling simultaneous processing of multiple samples. This facilitates efficient progression of large-scale experiments and comparative studies.
57
+ + **Multi-Sample Compatibility**: Enables parallel processing of multiple samples. This facilitates efficient progression of large-scale experiments and comparative studies.
60
58
 
61
59
 
62
60
  ## 🛠 Installation
63
61
 
64
62
  ### Prerequisites
65
63
 
66
- - Python 3.7 or later
64
+ - Python 3.8 or later
67
65
  - Unix-like environment (Linux, macOS, WSL2, etc.)
68
66
 
69
67
  ### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
@@ -80,6 +78,7 @@ conda activate env-dajin2
80
78
  > CONDA_SUBDIR=osx-64 conda create -n env-dajin2 -c conda-forge -c bioconda python=3.10 DAJIN2 -y
81
79
  > conda activate env-dajin2
82
80
  > conda config --env --set subdir osx-64
81
+ > python -c "import platform; print(platform.machine())" # Confirm that the output is 'x86_64', not 'arm64'
83
82
  > ```
84
83
 
85
84
  ### From [PyPI](https://pypi.org/project/DAJIN2/)
@@ -92,7 +91,7 @@ pip install DAJIN2
92
91
  > If you encounter any issues during the installation, please refer to the [Troubleshooting Guide](https://github.com/akikuno/DAJIN2/blob/main/docs/TROUBLESHOOTING.md)
93
92
 
94
93
 
95
- ## 💡 Usage
94
+ ## 💻 Usage
96
95
 
97
96
  ### Required Files
98
97
 
@@ -126,11 +125,11 @@ Assuming barcode01 as the control and barcode02 as the sample, specify each dire
126
125
  The FASTA file should contain descriptions of the alleles anticipated as a result of genome editing.
127
126
 
128
127
  > [!IMPORTANT]
129
- > Specifying the control allele: A header name >control and its sequence are mandatory.
128
+ > **A header name >control and its sequence are mandatory.**
130
129
 
131
130
  If there are anticipated alleles (e.g., knock-ins or knock-outs), include their sequences in the FASTA file too. These anticipated alleles can be named arbitrarily.
132
131
 
133
- Below is a typical example of a FASTA file:
132
+ Below is an example of a FASTA file:
134
133
 
135
134
  ```text
136
135
  >control
@@ -166,12 +165,17 @@ Options:
166
165
  #### Example
167
166
 
168
167
  ```bash
168
+ # Download example dataset
169
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
170
+ tar -xf example_single.tar.gz
171
+
172
+ # Run DAJIN2
169
173
  DAJIN2 \
170
- --control example/barcode01 \
171
- --sample example/barcode02 \
172
- --allele example/design.fa \
173
- --name IL6-knockin \
174
- --genome hg38 \
174
+ --control example_single/control \
175
+ --sample example_single/sample \
176
+ --allele example_single/stx2_deletion.fa \
177
+ --name stx2_deletion \
178
+ --genome mm39 \
175
179
  --threads 4
176
180
  ```
177
181
 
@@ -208,7 +212,6 @@ DAJIN2 \
208
212
 
209
213
  By using the `batch` subcommand, you can process multiple FASTQ files simultaneously.
210
214
  For this purpose, a CSV or Excel file consolidating the sample information is required.
211
- <!-- For a specific example, please refer to [this link](https://github.com/akikuno/DAJIN2/blob/main/examples/example-batch/batch.csv). -->
212
215
 
213
216
  > [!NOTE]
214
217
  > For guidance on how to compile sample information, please refer to [this document](https://docs.google.com/presentation/d/e/2PACX-1vSMEmXJPG2TNjfT66XZJRzqJd82aAqO5gJrdEzyhn15YBBr_Li-j5puOgVChYf3jA/embed?start=false&loop=false&delayms=3000).
@@ -226,13 +229,18 @@ options:
226
229
  #### Example
227
230
 
228
231
  ```bash
229
- DAJIN2 --file batch.csv --threads 4
232
+ # Download the example dataset
233
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
234
+ tar -xf example_batch.tar.gz
235
+
236
+ # Run DAJIN2
237
+ DAJIN2 batch --file example_batch/batch.csv --threads 4
230
238
  ```
231
239
 
232
240
  <!-- ```bash
233
241
  # Download the example dataset
234
- wget https://github.com/akikuno/DAJIN2/raw/main/examples/example-batch.tar.gz
235
- tar -xf example-batch.tar.gz
242
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
243
+ tar -xf example_batch.tar.gz
236
244
 
237
245
  # Run DAJIN2
238
246
  DAJIN2 batch --file example-batch/batch.csv --threads 3
@@ -313,16 +321,17 @@ For example, Tyr point mutation is highlighted in **green**.
313
321
  ### 3. MUTATION_INFO
314
322
 
315
323
  The MUTATION_INFO directory saves tables depicting mutation sites for each allele.
316
- An example of a Tyr point mutation is described by its position on the chromosome and the type of mutation.
324
+ An example of a *Tyr* point mutation is described by its position on the chromosome and the type of mutation.
317
325
 
318
326
  <img src="https://user-images.githubusercontent.com/15861316/274519342-a613490d-5dbb-4a27-a2cf-bca0686b30f0.png" width="75%">
319
327
 
320
- ### 4. read_plot.html and read_plot.pdf
328
+ ### 4. read_summary.xlsx, read_plot.html and read_plot.pdf
321
329
 
330
+ read_summary.xlsx describes the number of reads and presence proportion for each allele.
322
331
  Both read_plot.html and read_plot.pdf illustrate the proportions of each allele.
323
- The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for that allele.
332
+ The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for each allele.
324
333
 
325
- Additionally, the types of **Allele type** include:
334
+ The **Allele type** includes:
326
335
  - **Intact**: Alleles that perfectly match the input FASTA allele.
327
336
  - **Indels**: Substitutions, deletions, insertions, or inversions within 50 bases.
328
337
  - **SV**: Substitutions, deletions, insertions, or inversions beyond 50 bases.
@@ -333,14 +342,10 @@ Additionally, the types of **Allele type** include:
333
342
  > In PCR amplicon sequencing, the % of reads might not match the actual allele proportions due to amplification bias.
334
343
  > Especially when large deletions are present, the deletion alleles might be significantly amplified, potentially not reflecting the actual allele proportions.
335
344
 
336
- ### 5. read_summary.xlsx
337
-
338
- - read_summary.xlsx: Describes the number of reads and presence proportion for each allele.
339
-
340
345
  ## 📣Feedback and Support
341
346
 
342
347
  For questions, bug reports, or other forms of feedback, we'd love to hear from you!
343
- Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues) for all reporting purposes.
348
+ Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues/new/choose) for all reporting purposes.
344
349
 
345
350
  Please refer to [CONTRIBUTING](https://github.com/akikuno/DAJIN2/blob/main/docs/CONTRIBUTING.md) for how to contribute and how to verify your contributions.
346
351
 
@@ -21,14 +21,14 @@ The name DAJIN is derived from the phrase 一網**打尽** (Ichimou **DAJIN** or
21
21
  + **Comprehensive Mutation Detection**: Equipped with the capability to detect genome editing events over a wide range, it can identify a broad spectrum of mutations, from small changes to large structural variations.
22
22
  + DAJIN2 can also detect complex mutations characteristic of genome editing, such as "insertions occurring in regions where deletions have occurred."
23
23
  + **Intuitive Visualization**: The outcomes of genome editing are visualized intuitively, allowing for the rapid and easy identification and analysis of mutations.
24
- + **Multi-Sample Compatibility**: Accommodates a variety of samples, enabling simultaneous processing of multiple samples. This facilitates efficient progression of large-scale experiments and comparative studies.
24
+ + **Multi-Sample Compatibility**: Enables parallel processing of multiple samples. This facilitates efficient progression of large-scale experiments and comparative studies.
25
25
 
26
26
 
27
27
  ## 🛠 Installation
28
28
 
29
29
  ### Prerequisites
30
30
 
31
- - Python 3.7 or later
31
+ - Python 3.8 or later
32
32
  - Unix-like environment (Linux, macOS, WSL2, etc.)
33
33
 
34
34
  ### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
@@ -45,6 +45,7 @@ conda activate env-dajin2
45
45
  > CONDA_SUBDIR=osx-64 conda create -n env-dajin2 -c conda-forge -c bioconda python=3.10 DAJIN2 -y
46
46
  > conda activate env-dajin2
47
47
  > conda config --env --set subdir osx-64
48
+ > python -c "import platform; print(platform.machine())" # Confirm that the output is 'x86_64', not 'arm64'
48
49
  > ```
49
50
 
50
51
  ### From [PyPI](https://pypi.org/project/DAJIN2/)
@@ -57,7 +58,7 @@ pip install DAJIN2
57
58
  > If you encounter any issues during the installation, please refer to the [Troubleshooting Guide](https://github.com/akikuno/DAJIN2/blob/main/docs/TROUBLESHOOTING.md)
58
59
 
59
60
 
60
- ## 💡 Usage
61
+ ## 💻 Usage
61
62
 
62
63
  ### Required Files
63
64
 
@@ -91,11 +92,11 @@ Assuming barcode01 as the control and barcode02 as the sample, specify each dire
91
92
  The FASTA file should contain descriptions of the alleles anticipated as a result of genome editing.
92
93
 
93
94
  > [!IMPORTANT]
94
- > Specifying the control allele: A header name >control and its sequence are mandatory.
95
+ > **A header name >control and its sequence are mandatory.**
95
96
 
96
97
  If there are anticipated alleles (e.g., knock-ins or knock-outs), include their sequences in the FASTA file too. These anticipated alleles can be named arbitrarily.
97
98
 
98
- Below is a typical example of a FASTA file:
99
+ Below is an example of a FASTA file:
99
100
 
100
101
  ```text
101
102
  >control
@@ -131,12 +132,17 @@ Options:
131
132
  #### Example
132
133
 
133
134
  ```bash
135
+ # Download example dataset
136
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
137
+ tar -xf example_single.tar.gz
138
+
139
+ # Run DAJIN2
134
140
  DAJIN2 \
135
- --control example/barcode01 \
136
- --sample example/barcode02 \
137
- --allele example/design.fa \
138
- --name IL6-knockin \
139
- --genome hg38 \
141
+ --control example_single/control \
142
+ --sample example_single/sample \
143
+ --allele example_single/stx2_deletion.fa \
144
+ --name stx2_deletion \
145
+ --genome mm39 \
140
146
  --threads 4
141
147
  ```
142
148
 
@@ -173,7 +179,6 @@ DAJIN2 \
173
179
 
174
180
  By using the `batch` subcommand, you can process multiple FASTQ files simultaneously.
175
181
  For this purpose, a CSV or Excel file consolidating the sample information is required.
176
- <!-- For a specific example, please refer to [this link](https://github.com/akikuno/DAJIN2/blob/main/examples/example-batch/batch.csv). -->
177
182
 
178
183
  > [!NOTE]
179
184
  > For guidance on how to compile sample information, please refer to [this document](https://docs.google.com/presentation/d/e/2PACX-1vSMEmXJPG2TNjfT66XZJRzqJd82aAqO5gJrdEzyhn15YBBr_Li-j5puOgVChYf3jA/embed?start=false&loop=false&delayms=3000).
@@ -191,13 +196,18 @@ options:
191
196
  #### Example
192
197
 
193
198
  ```bash
194
- DAJIN2 --file batch.csv --threads 4
199
+ # Download the example dataset
200
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
201
+ tar -xf example_batch.tar.gz
202
+
203
+ # Run DAJIN2
204
+ DAJIN2 batch --file example_batch/batch.csv --threads 4
195
205
  ```
196
206
 
197
207
  <!-- ```bash
198
208
  # Download the example dataset
199
- wget https://github.com/akikuno/DAJIN2/raw/main/examples/example-batch.tar.gz
200
- tar -xf example-batch.tar.gz
209
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
210
+ tar -xf example_batch.tar.gz
201
211
 
202
212
  # Run DAJIN2
203
213
  DAJIN2 batch --file example-batch/batch.csv --threads 3
@@ -278,16 +288,17 @@ For example, Tyr point mutation is highlighted in **green**.
278
288
  ### 3. MUTATION_INFO
279
289
 
280
290
  The MUTATION_INFO directory saves tables depicting mutation sites for each allele.
281
- An example of a Tyr point mutation is described by its position on the chromosome and the type of mutation.
291
+ An example of a *Tyr* point mutation is described by its position on the chromosome and the type of mutation.
282
292
 
283
293
  <img src="https://user-images.githubusercontent.com/15861316/274519342-a613490d-5dbb-4a27-a2cf-bca0686b30f0.png" width="75%">
284
294
 
285
- ### 4. read_plot.html and read_plot.pdf
295
+ ### 4. read_summary.xlsx, read_plot.html and read_plot.pdf
286
296
 
297
+ read_summary.xlsx describes the number of reads and presence proportion for each allele.
287
298
  Both read_plot.html and read_plot.pdf illustrate the proportions of each allele.
288
- The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for that allele.
299
+ The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for each allele.
289
300
 
290
- Additionally, the types of **Allele type** include:
301
+ The **Allele type** includes:
291
302
  - **Intact**: Alleles that perfectly match the input FASTA allele.
292
303
  - **Indels**: Substitutions, deletions, insertions, or inversions within 50 bases.
293
304
  - **SV**: Substitutions, deletions, insertions, or inversions beyond 50 bases.
@@ -298,14 +309,10 @@ Additionally, the types of **Allele type** include:
298
309
  > In PCR amplicon sequencing, the % of reads might not match the actual allele proportions due to amplification bias.
299
310
  > Especially when large deletions are present, the deletion alleles might be significantly amplified, potentially not reflecting the actual allele proportions.
300
311
 
301
- ### 5. read_summary.xlsx
302
-
303
- - read_summary.xlsx: Describes the number of reads and presence proportion for each allele.
304
-
305
312
  ## 📣Feedback and Support
306
313
 
307
314
  For questions, bug reports, or other forms of feedback, we'd love to hear from you!
308
- Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues) for all reporting purposes.
315
+ Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues/new/choose) for all reporting purposes.
309
316
 
310
317
  Please refer to [CONTRIBUTING](https://github.com/akikuno/DAJIN2/blob/main/docs/CONTRIBUTING.md) for how to contribute and how to verify your contributions.
311
318
 
@@ -0,0 +1,20 @@
1
+ numpy >= 1.24.0
2
+ scipy >= 1.10.0
3
+ pandas >= 1.0.0
4
+ openpyxl >= 3.1.0
5
+ rapidfuzz >=3.6.0
6
+ scikit-learn >= 1.3.0
7
+
8
+ mappy >= 2.24
9
+ pysam >= 0.21.0
10
+
11
+ Flask >= 2.2.0
12
+ waitress >= 2.1.0
13
+ Jinja2 >= 3.1.0
14
+
15
+ plotly >= 5.19.0
16
+ kaleido >= 0.2.0
17
+
18
+ cstag >= 1.0.0
19
+ midsv >= 0.11.0
20
+ wslPath >=0.4.1
@@ -9,7 +9,7 @@ with open("requirements.txt") as requirements_file:
9
9
 
10
10
  setuptools.setup(
11
11
  name="DAJIN2",
12
- version="0.4.1",
12
+ version="0.4.3",
13
13
  author="Akihiro Kuno",
14
14
  author_email="akuno@md.tsukuba.ac.jp",
15
15
  description="One-step genotyping tools for targeted long-read sequencing",
@@ -11,20 +11,6 @@ def calculate_label_percentages(labels: list[int]) -> dict[int, float]:
11
11
  return {label: (count / total_labels * 100) for label, count in label_counts.items()}
12
12
 
13
13
 
14
- def merge_mixed_cluster(labels_control: list[int], labels_sample: list[int], threshold: float = 0.5) -> list[int]:
15
- """Merge labels in sample if they appear more than 'threshold' percentage in control."""
16
- labels_merged = labels_sample.copy()
17
- label_percentages_control = calculate_label_percentages(labels_control)
18
- mixed_labels = {label for label, percent in label_percentages_control.items() if percent > threshold}
19
-
20
- new_label = max(labels_merged) + 1
21
- for i, label in enumerate(labels_sample):
22
- if label in mixed_labels:
23
- labels_merged[i] = new_label
24
-
25
- return labels_merged
26
-
27
-
28
14
  def map_clusters_to_previous(labels_sample: list[int], labels_previous: list[int]) -> dict[int, int]:
29
15
  """
30
16
  Determine which cluster in labels_previous corresponds to each cluster in labels_sample.
@@ -63,6 +49,8 @@ def merge_minor_cluster(
63
49
  minor_labels_percentage = {label for label, percent in label_percentages.items() if percent < threshold_percentage}
64
50
  minor_labels_readnumber = {label for label, num in Counter(labels_sample).items() if num <= threshold_readnumber}
65
51
  minor_labels = minor_labels_percentage | minor_labels_readnumber
52
+ if minor_labels == set():
53
+ return labels_sample
66
54
 
67
55
  correspondence = map_clusters_to_previous(labels_sample, labels_previous)
68
56
  update_required_labels = get_update_required_labels(correspondence)
@@ -70,7 +58,23 @@ def merge_minor_cluster(
70
58
  labels_merged = labels_sample.copy()
71
59
  for m in minor_labels:
72
60
  new_label = max(labels_merged) + 1
73
- labels_merged = [new_label if label in update_required_labels[correspondence[m]] else label for label in labels_merged]
61
+ labels_merged = [
62
+ new_label if label in update_required_labels[correspondence[m]] else label for label in labels_merged
63
+ ]
64
+
65
+ return labels_merged
66
+
67
+
68
+ def merge_mixed_cluster(labels_control: list[int], labels_sample: list[int], threshold: float = 0.5) -> list[int]:
69
+ """Merge labels in sample if they appear more than 'threshold' percentage in control."""
70
+ labels_merged = labels_sample.copy()
71
+ label_percentages_control = calculate_label_percentages(labels_control)
72
+ mixed_labels = {label for label, percent in label_percentages_control.items() if percent > threshold}
73
+
74
+ new_label = max(labels_merged) + 1
75
+ for i, label in enumerate(labels_sample):
76
+ if label in mixed_labels:
77
+ labels_merged[i] = new_label
74
78
 
75
79
  return labels_merged
76
80
 
@@ -82,7 +86,7 @@ def merge_minor_cluster(
82
86
 
83
87
  def merge_labels(labels_control: list[int], labels_sample: list[int], labels_previous: list[int]) -> list[int]:
84
88
  labels_merged = merge_minor_cluster(
85
- labels_sample, labels_previous, threshold_percentage=0.5, threshold_readnumber=10
89
+ labels_sample, labels_previous, threshold_percentage=0.5, threshold_readnumber=5
86
90
  )
87
91
  labels_merged = merge_mixed_cluster(labels_control, labels_merged)
88
92
  return labels_merged
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from pathlib import Path
4
- from typing import NamedTuple
4
+ from dataclasses import dataclass
5
5
  from itertools import groupby
6
6
  from collections import defaultdict
7
7
 
@@ -90,7 +90,8 @@ def call_percentage(cssplits: list[list[str]], mutation_loci: list[set[str]]) ->
90
90
  ###########################################################
91
91
 
92
92
 
93
- class ConsensusKey(NamedTuple):
93
+ @dataclass(frozen=True)
94
+ class ConsensusKey:
94
95
  allele: str
95
96
  label: int
96
97
  percent: float
@@ -1,13 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import re
4
- from typing import NamedTuple
5
-
6
-
7
- class ConsensusKey(NamedTuple):
8
- allele: str
9
- label: int
10
- percent: float
4
+ from DAJIN2.core.consensus.consensus import ConsensusKey
11
5
 
12
6
 
13
7
  def _detect_sv(cons_percentages: dict[ConsensusKey, list], threshold: int = 50) -> list[bool]:
@@ -2,119 +2,16 @@ from __future__ import annotations
2
2
 
3
3
  import shutil
4
4
  import logging
5
- import uuid
6
5
 
7
6
  from pathlib import Path
8
- from typing import NamedTuple
9
- from collections import defaultdict
10
7
 
11
- from DAJIN2.utils import io, config, fastx_handler
8
+ from DAJIN2.utils import io, fastx_handler
12
9
  from DAJIN2.core import classification, clustering, consensus, preprocess, report
10
+ from DAJIN2.core.preprocess.input_formatter import FormattedInputs
13
11
 
14
12
  logger = logging.getLogger(__name__)
15
13
 
16
14
 
17
- def parse_arguments(arguments: dict) -> tuple:
18
- genome_urls = defaultdict(str)
19
- if arguments.get("genome"):
20
- genome_urls.update(
21
- {"genome": arguments["genome"], "blat": arguments["blat"], "goldenpath": arguments["goldenpath"]}
22
- )
23
-
24
- return (
25
- arguments["sample"],
26
- arguments["control"],
27
- arguments["allele"],
28
- arguments["name"],
29
- arguments["threads"],
30
- genome_urls,
31
- uuid.uuid4().hex,
32
- )
33
-
34
-
35
- def convert_input_paths_to_posix(sample: str, control: str, allele: str) -> tuple:
36
- sample = io.convert_to_posix(sample)
37
- control = io.convert_to_posix(control)
38
- allele = io.convert_to_posix(allele)
39
-
40
- return sample, control, allele
41
-
42
-
43
- def create_temporal_directory(name: str, control_name: str) -> Path:
44
- tempdir = Path(config.TEMP_ROOT_DIR, name)
45
- Path(tempdir, "cache", ".igvjs", control_name).mkdir(parents=True, exist_ok=True)
46
-
47
- return tempdir
48
-
49
-
50
- def check_caches(tempdir: Path, path_allele: str, genome_url: str) -> bool:
51
- is_cache_hash = preprocess.cache_checker.exists_cached_hash(tempdir=tempdir, path=path_allele)
52
- is_cache_genome = preprocess.cache_checker.exists_cached_genome(tempdir=tempdir, genome=genome_url)
53
-
54
- return is_cache_hash and is_cache_genome
55
-
56
-
57
- def get_genome_coordinates(genome_urls: dict, fasta_alleles: dict, is_cache_genome: bool, tempdir: Path) -> dict:
58
- genome_coordinates = {
59
- "genome": genome_urls["genome"],
60
- "chrom_size": 0,
61
- "chrom": "control",
62
- "start": 0,
63
- "end": len(fasta_alleles["control"]) - 1,
64
- "strand": "+",
65
- }
66
- if genome_urls["genome"]:
67
- if is_cache_genome:
68
- genome_coordinates = next(io.read_jsonl(Path(tempdir, "cache", "genome_coordinates.jsonl")))
69
- else:
70
- genome_coordinates = preprocess.genome_fetcher.fetch_coordinates(
71
- genome_coordinates, genome_urls, fasta_alleles["control"]
72
- )
73
- genome_coordinates["chrom_size"] = preprocess.genome_fetcher.fetch_chromosome_size(
74
- genome_coordinates, genome_urls
75
- )
76
- io.write_jsonl([genome_coordinates], Path(tempdir, "cache", "genome_coordinates.jsonl"))
77
-
78
- return genome_coordinates
79
-
80
-
81
- class FormattedInputs(NamedTuple):
82
- path_sample: str
83
- path_control: str
84
- path_allele: str
85
- sample_name: str
86
- control_name: str
87
- fasta_alleles: dict[str, str]
88
- tempdir: Path
89
- genome_coordinates: dict[str, str]
90
- threads: int
91
- uuid: str
92
-
93
-
94
- def format_inputs(arguments: dict) -> FormattedInputs:
95
- path_sample, path_control, path_allele, name, threads, genome_urls, uuid = parse_arguments(arguments)
96
- path_sample, path_control, path_allele = convert_input_paths_to_posix(path_sample, path_control, path_allele)
97
- sample_name = preprocess.fastx_parser.extract_basename(path_sample)
98
- control_name = preprocess.fastx_parser.extract_basename(path_control)
99
- fasta_alleles = preprocess.fastx_parser.dictionize_allele(path_allele)
100
- tempdir = create_temporal_directory(name, control_name)
101
- is_cache_genome = check_caches(tempdir, path_allele, genome_urls["genome"])
102
- genome_coordinates = get_genome_coordinates(genome_urls, fasta_alleles, is_cache_genome, tempdir)
103
-
104
- return FormattedInputs(
105
- path_sample,
106
- path_control,
107
- path_allele,
108
- sample_name,
109
- control_name,
110
- fasta_alleles,
111
- tempdir,
112
- genome_coordinates,
113
- threads,
114
- uuid,
115
- )
116
-
117
-
118
15
  ###########################################################
119
16
  # main
120
17
  ###########################################################
@@ -126,9 +23,9 @@ def execute_control(arguments: dict):
126
23
  ###########################################################
127
24
  # Preprocess
128
25
  ###########################################################
129
- ARGS = format_inputs(arguments)
130
- preprocess.directories.create_temporal_directories(ARGS.tempdir, ARGS.control_name, is_control=True)
131
- preprocess.directories.create_report_directories(ARGS.tempdir, ARGS.control_name, is_control=True)
26
+ ARGS: FormattedInputs = preprocess.format_inputs(arguments)
27
+ preprocess.create_temporal_directories(ARGS.tempdir, ARGS.control_name, is_control=True)
28
+ preprocess.create_report_directories(ARGS.tempdir, ARGS.control_name, is_control=True)
132
29
  io.cache_control_hash(ARGS.tempdir, ARGS.path_allele)
133
30
 
134
31
  ###########################################################
@@ -151,7 +48,7 @@ def execute_control(arguments: dict):
151
48
  # ============================================================
152
49
  # Export fasta files as single-FASTA format
153
50
  # ============================================================
154
- preprocess.fastx_parser.export_fasta_files(ARGS.tempdir, ARGS.fasta_alleles, ARGS.control_name)
51
+ fastx_handler.export_fasta_files(ARGS.tempdir, ARGS.fasta_alleles, ARGS.control_name)
155
52
 
156
53
  # ============================================================
157
54
  # Mapping using mappy
@@ -173,8 +70,8 @@ def execute_control(arguments: dict):
173
70
  # Output BAM files
174
71
  ###########################################################
175
72
  logger.info(f"Output BAM files of {arguments['control']}...")
176
- report.report_bam.export_to_bam(
177
- ARGS.tempdir, ARGS.control_name, ARGS.genome_coordinates, ARGS.threads, is_control=True
73
+ report.bam_exporter.export_to_bam(
74
+ ARGS.tempdir, ARGS.control_name, ARGS.genome_coordinates, ARGS.threads, ARGS.uuid, is_control=True
178
75
  )
179
76
  ###########################################################
180
77
  # Finish call
@@ -189,9 +86,9 @@ def execute_sample(arguments: dict):
189
86
  # Preprocess
190
87
  ###########################################################
191
88
 
192
- ARGS = format_inputs(arguments)
193
- preprocess.directories.create_temporal_directories(ARGS.tempdir, ARGS.sample_name, is_control=False)
194
- preprocess.directories.create_report_directories(ARGS.tempdir, ARGS.sample_name, is_control=False)
89
+ ARGS: FormattedInputs = preprocess.format_inputs(arguments)
90
+ preprocess.create_temporal_directories(ARGS.tempdir, ARGS.sample_name, is_control=False)
91
+ preprocess.create_report_directories(ARGS.tempdir, ARGS.sample_name, is_control=False)
195
92
 
196
93
  logger.info(f"Preprocess {arguments['sample']}...")
197
94
 
@@ -209,7 +106,7 @@ def execute_sample(arguments: dict):
209
106
  shutil.copy(path_fasta, Path(ARGS.tempdir, ARGS.sample_name, "fasta"))
210
107
 
211
108
  paths_fasta = Path(ARGS.tempdir, ARGS.sample_name, "fasta").glob("*.fasta")
212
- preprocess.mapping.generate_sam(ARGS, paths_fasta, is_control=False, is_insertion=False)
109
+ preprocess.generate_sam(ARGS, paths_fasta, is_control=False, is_insertion=False)
213
110
 
214
111
  # ============================================================
215
112
  # MIDSV conversion
@@ -234,8 +131,8 @@ def execute_sample(arguments: dict):
234
131
 
235
132
  if paths_insertion_fasta:
236
133
  # mapping to insertion alleles
237
- preprocess.mapping.generate_sam(ARGS, paths_insertion_fasta, is_control=True, is_insertion=True)
238
- preprocess.mapping.generate_sam(ARGS, paths_insertion_fasta, is_control=False, is_insertion=True)
134
+ preprocess.generate_sam(ARGS, paths_insertion_fasta, is_control=True, is_insertion=True)
135
+ preprocess.generate_sam(ARGS, paths_insertion_fasta, is_control=False, is_insertion=True)
239
136
  # add insertions to ARGS.fasta_alleles
240
137
  for path_fasta in paths_insertion_fasta:
241
138
  allele, seq = Path(path_fasta).read_text().strip().split("\n")
@@ -307,15 +204,15 @@ def execute_sample(arguments: dict):
307
204
  # RESULT
308
205
  io.write_jsonl(RESULT_SAMPLE, Path(ARGS.tempdir, "result", f"{ARGS.sample_name}.jsonl"))
309
206
  # FASTA
310
- report.report_files.export_to_fasta(ARGS.tempdir, ARGS.sample_name, cons_sequence)
311
- report.report_files.export_reference_to_fasta(ARGS.tempdir, ARGS.sample_name)
207
+ report.sequence_exporter.export_to_fasta(ARGS.tempdir, ARGS.sample_name, cons_sequence)
208
+ report.sequence_exporter.export_reference_to_fasta(ARGS.tempdir, ARGS.sample_name)
312
209
  # HTML
313
- report.report_files.export_to_html(ARGS.tempdir, ARGS.sample_name, cons_percentage)
210
+ report.sequence_exporter.export_to_html(ARGS.tempdir, ARGS.sample_name, cons_percentage)
314
211
  # CSV (Allele Info)
315
- report.report_mutation.export_to_csv(ARGS.tempdir, ARGS.sample_name, ARGS.genome_coordinates, cons_percentage)
212
+ report.mutation_exporter.export_to_csv(ARGS.tempdir, ARGS.sample_name, ARGS.genome_coordinates, cons_percentage)
316
213
  # BAM
317
- report.report_bam.export_to_bam(
318
- ARGS.tempdir, ARGS.sample_name, ARGS.genome_coordinates, ARGS.threads, RESULT_SAMPLE
214
+ report.bam_exporter.export_to_bam(
215
+ ARGS.tempdir, ARGS.sample_name, ARGS.genome_coordinates, ARGS.threads, ARGS.uuid, RESULT_SAMPLE
319
216
  )
320
217
  for path_bam_igvjs in Path(ARGS.tempdir, "cache", ".igvjs").glob(f"{ARGS.control_name}_control.bam*"):
321
218
  shutil.copy(path_bam_igvjs, Path(ARGS.tempdir, "report", ".igvjs", ARGS.sample_name))
@@ -0,0 +1,9 @@
1
+ from DAJIN2.core.preprocess.cache_checker import exists_cached_hash, exists_cached_genome
2
+ from DAJIN2.core.preprocess.genome_fetcher import fetch_coordinates, fetch_chromosome_size
3
+ from DAJIN2.core.preprocess.mapping import generate_sam
4
+ from DAJIN2.core.preprocess.directory_manager import create_temporal_directories, create_report_directories
5
+ from DAJIN2.core.preprocess.input_formatter import format_inputs
6
+ from DAJIN2.core.preprocess.midsv_caller import generate_midsv
7
+ from DAJIN2.core.preprocess.knockin_handler import extract_knockin_loci
8
+ from DAJIN2.core.preprocess.mutation_extractor import cache_mutation_loci
9
+ from DAJIN2.core.preprocess.insertions_to_fasta import generate_insertion_fasta