DAJIN2 0.4.1__zip → 0.4.2__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {DAJIN2-0.4.1/src/DAJIN2.egg-info → DAJIN2-0.4.2}/PKG-INFO +13 -18
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/README.md +11 -14
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/requirements.txt +1 -4
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/setup.py +1 -1
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/consensus/consensus.py +3 -2
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/consensus/name_handler.py +1 -7
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/core.py +12 -115
- DAJIN2-0.4.2/src/DAJIN2/core/preprocess/__init__.py +9 -0
- DAJIN2-0.4.2/src/DAJIN2/core/preprocess/input_formatter.py +109 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/mapping.py +4 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/midsv_caller.py +2 -2
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/main.py +1 -1
- DAJIN2-0.4.2/src/DAJIN2/utils/fastx_handler.py +94 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/input_validator.py +32 -21
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/sam_handler.py +14 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2/src/DAJIN2.egg-info}/PKG-INFO +13 -18
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2.egg-info/SOURCES.txt +2 -2
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2.egg-info/requires.txt +1 -3
- DAJIN2-0.4.1/src/DAJIN2/core/preprocess/__init__.py +0 -12
- DAJIN2-0.4.1/src/DAJIN2/core/preprocess/fastx_parser.py +0 -59
- DAJIN2-0.4.1/src/DAJIN2/utils/fastx_handler.py +0 -42
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/LICENSE +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/MANIFEST.in +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/setup.cfg +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/__init__.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/__init__.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/classification/__init__.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/classification/allele_merger.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/classification/classifier.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/__init__.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/appender.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/clustering.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/kmer_generator.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/label_extractor.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/label_merger.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/label_updator.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/score_handler.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/strand_bias_handler.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/consensus/__init__.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/consensus/clust_formatter.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/consensus/mutation_extractor.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/consensus/similarity_searcher.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/cache_checker.py +0 -0
- /DAJIN2-0.4.1/src/DAJIN2/core/preprocess/directories.py → /DAJIN2-0.4.2/src/DAJIN2/core/preprocess/directory_manager.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/genome_fetcher.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/homopolymer_handler.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/insertions_to_fasta.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/knockin_handler.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/mutation_extractor.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/report/__init__.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/report/insertion_reflector.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/report/report_bam.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/report/report_files.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/report/report_mutation.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/gui.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/static/css/style.css +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/template_igvjs.html +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/templates/index.html +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/config.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/cssplits_handler.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/dna_handler.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/io.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/multiprocess.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/report_generator.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/view.py +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2.egg-info/dependency_links.txt +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2.egg-info/entry_points.txt +0 -0
- {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: DAJIN2
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: One-step genotyping tools for targeted long-read sequencing
|
|
5
5
|
Home-page: https://github.com/akikuno/DAJIN2
|
|
6
6
|
Author: Akihiro Kuno
|
|
@@ -19,9 +19,7 @@ Requires-Dist: scipy>=1.6.0
|
|
|
19
19
|
Requires-Dist: pandas>=1.0.0
|
|
20
20
|
Requires-Dist: openpyxl>=3.0.0
|
|
21
21
|
Requires-Dist: rapidfuzz>=3.0.0
|
|
22
|
-
Requires-Dist: statsmodels>=0.13.5
|
|
23
22
|
Requires-Dist: scikit-learn>=1.0.0
|
|
24
|
-
Requires-Dist: openpyxl>=3.0.0
|
|
25
23
|
Requires-Dist: mappy>=2.24
|
|
26
24
|
Requires-Dist: pysam>=0.19.0
|
|
27
25
|
Requires-Dist: Flask>=2.2.0
|
|
@@ -29,7 +27,7 @@ Requires-Dist: waitress>=2.1.0
|
|
|
29
27
|
Requires-Dist: Jinja2>=3.1.0
|
|
30
28
|
Requires-Dist: plotly>=5.0.0
|
|
31
29
|
Requires-Dist: kaleido>=0.2.0
|
|
32
|
-
Requires-Dist: cstag>=0.
|
|
30
|
+
Requires-Dist: cstag>=1.0.0
|
|
33
31
|
Requires-Dist: midsv>=0.10.1
|
|
34
32
|
Requires-Dist: wslPath>=0.3.0
|
|
35
33
|
|
|
@@ -56,14 +54,14 @@ The name DAJIN is derived from the phrase 一網**打尽** (Ichimou **DAJIN** or
|
|
|
56
54
|
+ **Comprehensive Mutation Detection**: Equipped with the capability to detect genome editing events over a wide range, it can identify a broad spectrum of mutations, from small changes to large structural variations.
|
|
57
55
|
+ DAJIN2 can also detect complex mutations characteristic of genome editing, such as "insertions occurring in regions where deletions have occurred."
|
|
58
56
|
+ **Intuitive Visualization**: The outcomes of genome editing are visualized intuitively, allowing for the rapid and easy identification and analysis of mutations.
|
|
59
|
-
+ **Multi-Sample Compatibility**:
|
|
57
|
+
+ **Multi-Sample Compatibility**: Enabling parallel processing of multiple samples. This facilitates efficient progression of large-scale experiments and comparative studies.
|
|
60
58
|
|
|
61
59
|
|
|
62
60
|
## 🛠 Installation
|
|
63
61
|
|
|
64
62
|
### Prerequisites
|
|
65
63
|
|
|
66
|
-
- Python 3.
|
|
64
|
+
- Python 3.8 or later
|
|
67
65
|
- Unix-like environment (Linux, macOS, WSL2, etc.)
|
|
68
66
|
|
|
69
67
|
### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
|
|
@@ -92,7 +90,7 @@ pip install DAJIN2
|
|
|
92
90
|
> If you encounter any issues during the installation, please refer to the [Troubleshooting Guide](https://github.com/akikuno/DAJIN2/blob/main/docs/TROUBLESHOOTING.md)
|
|
93
91
|
|
|
94
92
|
|
|
95
|
-
##
|
|
93
|
+
## 💻 Usage
|
|
96
94
|
|
|
97
95
|
### Required Files
|
|
98
96
|
|
|
@@ -126,11 +124,11 @@ Assuming barcode01 as the control and barcode02 as the sample, specify each dire
|
|
|
126
124
|
The FASTA file should contain descriptions of the alleles anticipated as a result of genome editing.
|
|
127
125
|
|
|
128
126
|
> [!IMPORTANT]
|
|
129
|
-
>
|
|
127
|
+
> **A header name >control and its sequence are mandatory.**
|
|
130
128
|
|
|
131
129
|
If there are anticipated alleles (e.g., knock-ins or knock-outs), include their sequences in the FASTA file too. These anticipated alleles can be named arbitrarily.
|
|
132
130
|
|
|
133
|
-
Below is
|
|
131
|
+
Below is an example of a FASTA file:
|
|
134
132
|
|
|
135
133
|
```text
|
|
136
134
|
>control
|
|
@@ -313,16 +311,17 @@ For example, Tyr point mutation is highlighted in **green**.
|
|
|
313
311
|
### 3. MUTATION_INFO
|
|
314
312
|
|
|
315
313
|
The MUTATION_INFO directory saves tables depicting mutation sites for each allele.
|
|
316
|
-
An example of a Tyr point mutation is described by its position on the chromosome and the type of mutation.
|
|
314
|
+
An example of a *Tyr* point mutation is described by its position on the chromosome and the type of mutation.
|
|
317
315
|
|
|
318
316
|
<img src="https://user-images.githubusercontent.com/15861316/274519342-a613490d-5dbb-4a27-a2cf-bca0686b30f0.png" width="75%">
|
|
319
317
|
|
|
320
|
-
### 4. read_plot.html and read_plot.pdf
|
|
318
|
+
### 4. read_summary.xlsx, read_plot.html and read_plot.pdf
|
|
321
319
|
|
|
320
|
+
read_summary.xlsx describes the number of reads and presence proportion for each allele.
|
|
322
321
|
Both read_plot.html and read_plot.pdf illustrate the proportions of each allele.
|
|
323
|
-
The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for
|
|
322
|
+
The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for each allele.
|
|
324
323
|
|
|
325
|
-
|
|
324
|
+
The **Allele type** includes:
|
|
326
325
|
- **Intact**: Alleles that perfectly match the input FASTA allele.
|
|
327
326
|
- **Indels**: Substitutions, deletions, insertions, or inversions within 50 bases.
|
|
328
327
|
- **SV**: Substitutions, deletions, insertions, or inversions beyond 50 bases.
|
|
@@ -333,14 +332,10 @@ Additionally, the types of **Allele type** include:
|
|
|
333
332
|
> In PCR amplicon sequencing, the % of reads might not match the actual allele proportions due to amplification bias.
|
|
334
333
|
> Especially when large deletions are present, the deletion alleles might be significantly amplified, potentially not reflecting the actual allele proportions.
|
|
335
334
|
|
|
336
|
-
### 5. read_summary.xlsx
|
|
337
|
-
|
|
338
|
-
- read_summary.xlsx: Describes the number of reads and presence proportion for each allele.
|
|
339
|
-
|
|
340
335
|
## 📣Feedback and Support
|
|
341
336
|
|
|
342
337
|
For questions, bug reports, or other forms of feedback, we'd love to hear from you!
|
|
343
|
-
Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues) for all reporting purposes.
|
|
338
|
+
Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues/new/choose) for all reporting purposes.
|
|
344
339
|
|
|
345
340
|
Please refer to [CONTRIBUTING](https://github.com/akikuno/DAJIN2/blob/main/docs/CONTRIBUTING.md) for how to contribute and how to verify your contributions.
|
|
346
341
|
|
|
@@ -21,14 +21,14 @@ The name DAJIN is derived from the phrase 一網**打尽** (Ichimou **DAJIN** or
|
|
|
21
21
|
+ **Comprehensive Mutation Detection**: Equipped with the capability to detect genome editing events over a wide range, it can identify a broad spectrum of mutations, from small changes to large structural variations.
|
|
22
22
|
+ DAJIN2 can also detect complex mutations characteristic of genome editing, such as "insertions occurring in regions where deletions have occurred."
|
|
23
23
|
+ **Intuitive Visualization**: The outcomes of genome editing are visualized intuitively, allowing for the rapid and easy identification and analysis of mutations.
|
|
24
|
-
+ **Multi-Sample Compatibility**:
|
|
24
|
+
+ **Multi-Sample Compatibility**: Enabling parallel processing of multiple samples. This facilitates efficient progression of large-scale experiments and comparative studies.
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
## 🛠 Installation
|
|
28
28
|
|
|
29
29
|
### Prerequisites
|
|
30
30
|
|
|
31
|
-
- Python 3.
|
|
31
|
+
- Python 3.8 or later
|
|
32
32
|
- Unix-like environment (Linux, macOS, WSL2, etc.)
|
|
33
33
|
|
|
34
34
|
### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
|
|
@@ -57,7 +57,7 @@ pip install DAJIN2
|
|
|
57
57
|
> If you encounter any issues during the installation, please refer to the [Troubleshooting Guide](https://github.com/akikuno/DAJIN2/blob/main/docs/TROUBLESHOOTING.md)
|
|
58
58
|
|
|
59
59
|
|
|
60
|
-
##
|
|
60
|
+
## 💻 Usage
|
|
61
61
|
|
|
62
62
|
### Required Files
|
|
63
63
|
|
|
@@ -91,11 +91,11 @@ Assuming barcode01 as the control and barcode02 as the sample, specify each dire
|
|
|
91
91
|
The FASTA file should contain descriptions of the alleles anticipated as a result of genome editing.
|
|
92
92
|
|
|
93
93
|
> [!IMPORTANT]
|
|
94
|
-
>
|
|
94
|
+
> **A header name >control and its sequence are mandatory.**
|
|
95
95
|
|
|
96
96
|
If there are anticipated alleles (e.g., knock-ins or knock-outs), include their sequences in the FASTA file too. These anticipated alleles can be named arbitrarily.
|
|
97
97
|
|
|
98
|
-
Below is
|
|
98
|
+
Below is an example of a FASTA file:
|
|
99
99
|
|
|
100
100
|
```text
|
|
101
101
|
>control
|
|
@@ -278,16 +278,17 @@ For example, Tyr point mutation is highlighted in **green**.
|
|
|
278
278
|
### 3. MUTATION_INFO
|
|
279
279
|
|
|
280
280
|
The MUTATION_INFO directory saves tables depicting mutation sites for each allele.
|
|
281
|
-
An example of a Tyr point mutation is described by its position on the chromosome and the type of mutation.
|
|
281
|
+
An example of a *Tyr* point mutation is described by its position on the chromosome and the type of mutation.
|
|
282
282
|
|
|
283
283
|
<img src="https://user-images.githubusercontent.com/15861316/274519342-a613490d-5dbb-4a27-a2cf-bca0686b30f0.png" width="75%">
|
|
284
284
|
|
|
285
|
-
### 4. read_plot.html and read_plot.pdf
|
|
285
|
+
### 4. read_summary.xlsx, read_plot.html and read_plot.pdf
|
|
286
286
|
|
|
287
|
+
read_summary.xlsx describes the number of reads and presence proportion for each allele.
|
|
287
288
|
Both read_plot.html and read_plot.pdf illustrate the proportions of each allele.
|
|
288
|
-
The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for
|
|
289
|
+
The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for each allele.
|
|
289
290
|
|
|
290
|
-
|
|
291
|
+
The **Allele type** includes:
|
|
291
292
|
- **Intact**: Alleles that perfectly match the input FASTA allele.
|
|
292
293
|
- **Indels**: Substitutions, deletions, insertions, or inversions within 50 bases.
|
|
293
294
|
- **SV**: Substitutions, deletions, insertions, or inversions beyond 50 bases.
|
|
@@ -298,14 +299,10 @@ Additionally, the types of **Allele type** include:
|
|
|
298
299
|
> In PCR amplicon sequencing, the % of reads might not match the actual allele proportions due to amplification bias.
|
|
299
300
|
> Especially when large deletions are present, the deletion alleles might be significantly amplified, potentially not reflecting the actual allele proportions.
|
|
300
301
|
|
|
301
|
-
### 5. read_summary.xlsx
|
|
302
|
-
|
|
303
|
-
- read_summary.xlsx: Describes the number of reads and presence proportion for each allele.
|
|
304
|
-
|
|
305
302
|
## 📣Feedback and Support
|
|
306
303
|
|
|
307
304
|
For questions, bug reports, or other forms of feedback, we'd love to hear from you!
|
|
308
|
-
Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues) for all reporting purposes.
|
|
305
|
+
Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues/new/choose) for all reporting purposes.
|
|
309
306
|
|
|
310
307
|
Please refer to [CONTRIBUTING](https://github.com/akikuno/DAJIN2/blob/main/docs/CONTRIBUTING.md) for how to contribute and how to verify your contributions.
|
|
311
308
|
|
|
@@ -3,11 +3,8 @@ scipy >= 1.6.0
|
|
|
3
3
|
pandas >= 1.0.0
|
|
4
4
|
openpyxl >= 3.0.0
|
|
5
5
|
rapidfuzz >=3.0.0
|
|
6
|
-
statsmodels >= 0.13.5
|
|
7
6
|
scikit-learn >= 1.0.0
|
|
8
7
|
|
|
9
|
-
openpyxl >= 3.0.0
|
|
10
|
-
|
|
11
8
|
mappy >= 2.24
|
|
12
9
|
pysam >= 0.19.0
|
|
13
10
|
|
|
@@ -18,6 +15,6 @@ Jinja2 >= 3.1.0
|
|
|
18
15
|
plotly >= 5.0.0
|
|
19
16
|
kaleido >= 0.2.0
|
|
20
17
|
|
|
21
|
-
cstag >= 0.
|
|
18
|
+
cstag >= 1.0.0
|
|
22
19
|
midsv >= 0.10.1
|
|
23
20
|
wslPath >=0.3.0
|
|
@@ -9,7 +9,7 @@ with open("requirements.txt") as requirements_file:
|
|
|
9
9
|
|
|
10
10
|
setuptools.setup(
|
|
11
11
|
name="DAJIN2",
|
|
12
|
-
version="0.4.
|
|
12
|
+
version="0.4.2",
|
|
13
13
|
author="Akihiro Kuno",
|
|
14
14
|
author_email="akuno@md.tsukuba.ac.jp",
|
|
15
15
|
description="One-step genotyping tools for targeted long-read sequencing",
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from
|
|
4
|
+
from dataclasses import dataclass
|
|
5
5
|
from itertools import groupby
|
|
6
6
|
from collections import defaultdict
|
|
7
7
|
|
|
@@ -90,7 +90,8 @@ def call_percentage(cssplits: list[list[str]], mutation_loci: list[set[str]]) ->
|
|
|
90
90
|
###########################################################
|
|
91
91
|
|
|
92
92
|
|
|
93
|
-
|
|
93
|
+
@dataclass(frozen=True)
|
|
94
|
+
class ConsensusKey:
|
|
94
95
|
allele: str
|
|
95
96
|
label: int
|
|
96
97
|
percent: float
|
|
@@ -1,13 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import re
|
|
4
|
-
from
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class ConsensusKey(NamedTuple):
|
|
8
|
-
allele: str
|
|
9
|
-
label: int
|
|
10
|
-
percent: float
|
|
4
|
+
from DAJIN2.core.consensus.consensus import ConsensusKey
|
|
11
5
|
|
|
12
6
|
|
|
13
7
|
def _detect_sv(cons_percentages: dict[ConsensusKey, list], threshold: int = 50) -> list[bool]:
|
|
@@ -2,119 +2,16 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import shutil
|
|
4
4
|
import logging
|
|
5
|
-
import uuid
|
|
6
5
|
|
|
7
6
|
from pathlib import Path
|
|
8
|
-
from typing import NamedTuple
|
|
9
|
-
from collections import defaultdict
|
|
10
7
|
|
|
11
|
-
from DAJIN2.utils import io,
|
|
8
|
+
from DAJIN2.utils import io, fastx_handler
|
|
12
9
|
from DAJIN2.core import classification, clustering, consensus, preprocess, report
|
|
10
|
+
from DAJIN2.core.preprocess.input_formatter import FormattedInputs
|
|
13
11
|
|
|
14
12
|
logger = logging.getLogger(__name__)
|
|
15
13
|
|
|
16
14
|
|
|
17
|
-
def parse_arguments(arguments: dict) -> tuple:
|
|
18
|
-
genome_urls = defaultdict(str)
|
|
19
|
-
if arguments.get("genome"):
|
|
20
|
-
genome_urls.update(
|
|
21
|
-
{"genome": arguments["genome"], "blat": arguments["blat"], "goldenpath": arguments["goldenpath"]}
|
|
22
|
-
)
|
|
23
|
-
|
|
24
|
-
return (
|
|
25
|
-
arguments["sample"],
|
|
26
|
-
arguments["control"],
|
|
27
|
-
arguments["allele"],
|
|
28
|
-
arguments["name"],
|
|
29
|
-
arguments["threads"],
|
|
30
|
-
genome_urls,
|
|
31
|
-
uuid.uuid4().hex,
|
|
32
|
-
)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def convert_input_paths_to_posix(sample: str, control: str, allele: str) -> tuple:
|
|
36
|
-
sample = io.convert_to_posix(sample)
|
|
37
|
-
control = io.convert_to_posix(control)
|
|
38
|
-
allele = io.convert_to_posix(allele)
|
|
39
|
-
|
|
40
|
-
return sample, control, allele
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def create_temporal_directory(name: str, control_name: str) -> Path:
|
|
44
|
-
tempdir = Path(config.TEMP_ROOT_DIR, name)
|
|
45
|
-
Path(tempdir, "cache", ".igvjs", control_name).mkdir(parents=True, exist_ok=True)
|
|
46
|
-
|
|
47
|
-
return tempdir
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
def check_caches(tempdir: Path, path_allele: str, genome_url: str) -> bool:
|
|
51
|
-
is_cache_hash = preprocess.cache_checker.exists_cached_hash(tempdir=tempdir, path=path_allele)
|
|
52
|
-
is_cache_genome = preprocess.cache_checker.exists_cached_genome(tempdir=tempdir, genome=genome_url)
|
|
53
|
-
|
|
54
|
-
return is_cache_hash and is_cache_genome
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
def get_genome_coordinates(genome_urls: dict, fasta_alleles: dict, is_cache_genome: bool, tempdir: Path) -> dict:
|
|
58
|
-
genome_coordinates = {
|
|
59
|
-
"genome": genome_urls["genome"],
|
|
60
|
-
"chrom_size": 0,
|
|
61
|
-
"chrom": "control",
|
|
62
|
-
"start": 0,
|
|
63
|
-
"end": len(fasta_alleles["control"]) - 1,
|
|
64
|
-
"strand": "+",
|
|
65
|
-
}
|
|
66
|
-
if genome_urls["genome"]:
|
|
67
|
-
if is_cache_genome:
|
|
68
|
-
genome_coordinates = next(io.read_jsonl(Path(tempdir, "cache", "genome_coordinates.jsonl")))
|
|
69
|
-
else:
|
|
70
|
-
genome_coordinates = preprocess.genome_fetcher.fetch_coordinates(
|
|
71
|
-
genome_coordinates, genome_urls, fasta_alleles["control"]
|
|
72
|
-
)
|
|
73
|
-
genome_coordinates["chrom_size"] = preprocess.genome_fetcher.fetch_chromosome_size(
|
|
74
|
-
genome_coordinates, genome_urls
|
|
75
|
-
)
|
|
76
|
-
io.write_jsonl([genome_coordinates], Path(tempdir, "cache", "genome_coordinates.jsonl"))
|
|
77
|
-
|
|
78
|
-
return genome_coordinates
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
class FormattedInputs(NamedTuple):
|
|
82
|
-
path_sample: str
|
|
83
|
-
path_control: str
|
|
84
|
-
path_allele: str
|
|
85
|
-
sample_name: str
|
|
86
|
-
control_name: str
|
|
87
|
-
fasta_alleles: dict[str, str]
|
|
88
|
-
tempdir: Path
|
|
89
|
-
genome_coordinates: dict[str, str]
|
|
90
|
-
threads: int
|
|
91
|
-
uuid: str
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
def format_inputs(arguments: dict) -> FormattedInputs:
|
|
95
|
-
path_sample, path_control, path_allele, name, threads, genome_urls, uuid = parse_arguments(arguments)
|
|
96
|
-
path_sample, path_control, path_allele = convert_input_paths_to_posix(path_sample, path_control, path_allele)
|
|
97
|
-
sample_name = preprocess.fastx_parser.extract_basename(path_sample)
|
|
98
|
-
control_name = preprocess.fastx_parser.extract_basename(path_control)
|
|
99
|
-
fasta_alleles = preprocess.fastx_parser.dictionize_allele(path_allele)
|
|
100
|
-
tempdir = create_temporal_directory(name, control_name)
|
|
101
|
-
is_cache_genome = check_caches(tempdir, path_allele, genome_urls["genome"])
|
|
102
|
-
genome_coordinates = get_genome_coordinates(genome_urls, fasta_alleles, is_cache_genome, tempdir)
|
|
103
|
-
|
|
104
|
-
return FormattedInputs(
|
|
105
|
-
path_sample,
|
|
106
|
-
path_control,
|
|
107
|
-
path_allele,
|
|
108
|
-
sample_name,
|
|
109
|
-
control_name,
|
|
110
|
-
fasta_alleles,
|
|
111
|
-
tempdir,
|
|
112
|
-
genome_coordinates,
|
|
113
|
-
threads,
|
|
114
|
-
uuid,
|
|
115
|
-
)
|
|
116
|
-
|
|
117
|
-
|
|
118
15
|
###########################################################
|
|
119
16
|
# main
|
|
120
17
|
###########################################################
|
|
@@ -126,9 +23,9 @@ def execute_control(arguments: dict):
|
|
|
126
23
|
###########################################################
|
|
127
24
|
# Preprocess
|
|
128
25
|
###########################################################
|
|
129
|
-
ARGS = format_inputs(arguments)
|
|
130
|
-
preprocess.
|
|
131
|
-
preprocess.
|
|
26
|
+
ARGS: FormattedInputs = preprocess.format_inputs(arguments)
|
|
27
|
+
preprocess.create_temporal_directories(ARGS.tempdir, ARGS.control_name, is_control=True)
|
|
28
|
+
preprocess.create_report_directories(ARGS.tempdir, ARGS.control_name, is_control=True)
|
|
132
29
|
io.cache_control_hash(ARGS.tempdir, ARGS.path_allele)
|
|
133
30
|
|
|
134
31
|
###########################################################
|
|
@@ -151,7 +48,7 @@ def execute_control(arguments: dict):
|
|
|
151
48
|
# ============================================================
|
|
152
49
|
# Export fasta files as single-FASTA format
|
|
153
50
|
# ============================================================
|
|
154
|
-
|
|
51
|
+
fastx_handler.export_fasta_files(ARGS.tempdir, ARGS.fasta_alleles, ARGS.control_name)
|
|
155
52
|
|
|
156
53
|
# ============================================================
|
|
157
54
|
# Mapping using mappy
|
|
@@ -189,9 +86,9 @@ def execute_sample(arguments: dict):
|
|
|
189
86
|
# Preprocess
|
|
190
87
|
###########################################################
|
|
191
88
|
|
|
192
|
-
ARGS = format_inputs(arguments)
|
|
193
|
-
preprocess.
|
|
194
|
-
preprocess.
|
|
89
|
+
ARGS: FormattedInputs = preprocess.format_inputs(arguments)
|
|
90
|
+
preprocess.create_temporal_directories(ARGS.tempdir, ARGS.sample_name, is_control=False)
|
|
91
|
+
preprocess.create_report_directories(ARGS.tempdir, ARGS.sample_name, is_control=False)
|
|
195
92
|
|
|
196
93
|
logger.info(f"Preprocess {arguments['sample']}...")
|
|
197
94
|
|
|
@@ -209,7 +106,7 @@ def execute_sample(arguments: dict):
|
|
|
209
106
|
shutil.copy(path_fasta, Path(ARGS.tempdir, ARGS.sample_name, "fasta"))
|
|
210
107
|
|
|
211
108
|
paths_fasta = Path(ARGS.tempdir, ARGS.sample_name, "fasta").glob("*.fasta")
|
|
212
|
-
preprocess.
|
|
109
|
+
preprocess.generate_sam(ARGS, paths_fasta, is_control=False, is_insertion=False)
|
|
213
110
|
|
|
214
111
|
# ============================================================
|
|
215
112
|
# MIDSV conversion
|
|
@@ -234,8 +131,8 @@ def execute_sample(arguments: dict):
|
|
|
234
131
|
|
|
235
132
|
if paths_insertion_fasta:
|
|
236
133
|
# mapping to insertion alleles
|
|
237
|
-
preprocess.
|
|
238
|
-
preprocess.
|
|
134
|
+
preprocess.generate_sam(ARGS, paths_insertion_fasta, is_control=True, is_insertion=True)
|
|
135
|
+
preprocess.generate_sam(ARGS, paths_insertion_fasta, is_control=False, is_insertion=True)
|
|
239
136
|
# add insertions to ARGS.fasta_alleles
|
|
240
137
|
for path_fasta in paths_insertion_fasta:
|
|
241
138
|
allele, seq = Path(path_fasta).read_text().strip().split("\n")
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
from DAJIN2.core.preprocess.cache_checker import exists_cached_hash, exists_cached_genome
|
|
2
|
+
from DAJIN2.core.preprocess.genome_fetcher import fetch_coordinates, fetch_chromosome_size
|
|
3
|
+
from DAJIN2.core.preprocess.mapping import generate_sam
|
|
4
|
+
from DAJIN2.core.preprocess.directory_manager import create_temporal_directories, create_report_directories
|
|
5
|
+
from DAJIN2.core.preprocess.input_formatter import format_inputs
|
|
6
|
+
from DAJIN2.core.preprocess.midsv_caller import generate_midsv
|
|
7
|
+
from DAJIN2.core.preprocess.knockin_handler import extract_knockin_loci
|
|
8
|
+
from DAJIN2.core.preprocess.mutation_extractor import cache_mutation_loci
|
|
9
|
+
from DAJIN2.core.preprocess.insertions_to_fasta import generate_insertion_fasta
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import uuid
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
|
|
9
|
+
from DAJIN2.utils import io, config, fastx_handler
|
|
10
|
+
|
|
11
|
+
from DAJIN2.core import preprocess
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def parse_arguments(arguments: dict) -> tuple:
|
|
15
|
+
genome_urls = defaultdict(str)
|
|
16
|
+
if arguments.get("genome"):
|
|
17
|
+
genome_urls.update(
|
|
18
|
+
{"genome": arguments["genome"], "blat": arguments["blat"], "goldenpath": arguments["goldenpath"]}
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
return (
|
|
22
|
+
arguments["sample"],
|
|
23
|
+
arguments["control"],
|
|
24
|
+
arguments["allele"],
|
|
25
|
+
arguments["name"],
|
|
26
|
+
arguments["threads"],
|
|
27
|
+
genome_urls,
|
|
28
|
+
uuid.uuid4().hex,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def convert_input_paths_to_posix(sample: str, control: str, allele: str) -> tuple:
|
|
33
|
+
sample = io.convert_to_posix(sample)
|
|
34
|
+
control = io.convert_to_posix(control)
|
|
35
|
+
allele = io.convert_to_posix(allele)
|
|
36
|
+
|
|
37
|
+
return sample, control, allele
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def create_temporal_directory(name: str, control_name: str) -> Path:
|
|
41
|
+
tempdir = Path(config.TEMP_ROOT_DIR, name)
|
|
42
|
+
Path(tempdir, "cache", ".igvjs", control_name).mkdir(parents=True, exist_ok=True)
|
|
43
|
+
|
|
44
|
+
return tempdir
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def check_caches(tempdir: Path, path_allele: str, genome_url: str) -> bool:
|
|
48
|
+
is_cache_hash = preprocess.exists_cached_hash(tempdir=tempdir, path=path_allele)
|
|
49
|
+
is_cache_genome = preprocess.exists_cached_genome(tempdir=tempdir, genome=genome_url)
|
|
50
|
+
|
|
51
|
+
return is_cache_hash and is_cache_genome
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def get_genome_coordinates(genome_urls: dict, fasta_alleles: dict, is_cache_genome: bool, tempdir: Path) -> dict:
|
|
55
|
+
genome_coordinates = {
|
|
56
|
+
"genome": genome_urls["genome"],
|
|
57
|
+
"chrom_size": 0,
|
|
58
|
+
"chrom": "control",
|
|
59
|
+
"start": 0,
|
|
60
|
+
"end": len(fasta_alleles["control"]) - 1,
|
|
61
|
+
"strand": "+",
|
|
62
|
+
}
|
|
63
|
+
if genome_urls["genome"]:
|
|
64
|
+
if is_cache_genome:
|
|
65
|
+
genome_coordinates = next(io.read_jsonl(Path(tempdir, "cache", "genome_coordinates.jsonl")))
|
|
66
|
+
else:
|
|
67
|
+
genome_coordinates = preprocess.fetch_coordinates(genome_coordinates, genome_urls, fasta_alleles["control"])
|
|
68
|
+
genome_coordinates["chrom_size"] = preprocess.fetch_chromosome_size(genome_coordinates, genome_urls)
|
|
69
|
+
io.write_jsonl([genome_coordinates], Path(tempdir, "cache", "genome_coordinates.jsonl"))
|
|
70
|
+
|
|
71
|
+
return genome_coordinates
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass(frozen=True)
|
|
75
|
+
class FormattedInputs:
|
|
76
|
+
path_sample: str
|
|
77
|
+
path_control: str
|
|
78
|
+
path_allele: str
|
|
79
|
+
sample_name: str
|
|
80
|
+
control_name: str
|
|
81
|
+
fasta_alleles: dict[str, str]
|
|
82
|
+
tempdir: Path
|
|
83
|
+
genome_coordinates: dict[str, str]
|
|
84
|
+
threads: int
|
|
85
|
+
uuid: str
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def format_inputs(arguments: dict) -> FormattedInputs:
|
|
89
|
+
path_sample, path_control, path_allele, name, threads, genome_urls, uuid = parse_arguments(arguments)
|
|
90
|
+
path_sample, path_control, path_allele = convert_input_paths_to_posix(path_sample, path_control, path_allele)
|
|
91
|
+
sample_name = fastx_handler.extract_filename(path_sample)
|
|
92
|
+
control_name = fastx_handler.extract_filename(path_control)
|
|
93
|
+
fasta_alleles = fastx_handler.dictionize_allele(path_allele)
|
|
94
|
+
tempdir = create_temporal_directory(name, control_name)
|
|
95
|
+
is_cache_genome = check_caches(tempdir, path_allele, genome_urls["genome"])
|
|
96
|
+
genome_coordinates = get_genome_coordinates(genome_urls, fasta_alleles, is_cache_genome, tempdir)
|
|
97
|
+
|
|
98
|
+
return FormattedInputs(
|
|
99
|
+
path_sample,
|
|
100
|
+
path_control,
|
|
101
|
+
path_allele,
|
|
102
|
+
sample_name,
|
|
103
|
+
control_name,
|
|
104
|
+
fasta_alleles,
|
|
105
|
+
tempdir,
|
|
106
|
+
genome_coordinates,
|
|
107
|
+
threads,
|
|
108
|
+
uuid,
|
|
109
|
+
)
|
|
@@ -215,8 +215,8 @@ def generate_midsv(ARGS, is_control: bool = False, is_insertion: bool = False) -
|
|
|
215
215
|
path_splice = Path(ARGS.tempdir, name, "sam", f"splice_{allele}.sam")
|
|
216
216
|
path_output_midsv = Path(ARGS.tempdir, name, "midsv", f"{allele}.json")
|
|
217
217
|
|
|
218
|
-
sam_ont = sam_handler.remove_overlapped_reads(list(
|
|
219
|
-
sam_splice = sam_handler.remove_overlapped_reads(list(
|
|
218
|
+
sam_ont = sam_handler.remove_overlapped_reads(list(sam_handler.read_sam(path_ont)))
|
|
219
|
+
sam_splice = sam_handler.remove_overlapped_reads(list(sam_handler.read_sam(path_splice)))
|
|
220
220
|
qname_of_map_ont = extract_qname_of_map_ont(sam_ont, sam_splice)
|
|
221
221
|
sam_of_map_ont = filter_sam_by_preset(sam_ont, qname_of_map_ont, preset="map-ont")
|
|
222
222
|
sam_of_splice = filter_sam_by_preset(sam_splice, qname_of_map_ont, preset="splice")
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import gzip
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import mappy
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
#################################################
|
|
11
|
+
# Helper function
|
|
12
|
+
#################################################
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def sanitize_filename(path_file: Path | str) -> str:
|
|
16
|
+
"""
|
|
17
|
+
Sanitize the path_file by replacing invalid characters on Windows OS with '-'
|
|
18
|
+
"""
|
|
19
|
+
path_file = str(path_file).lstrip()
|
|
20
|
+
if not path_file:
|
|
21
|
+
raise ValueError("Provided FASTA/FASTQ is empty or consists only of whitespace")
|
|
22
|
+
return re.sub(r'[\\/:?.,\'"<>| ]', "-", path_file)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
#################################################
|
|
26
|
+
# Extract filename
|
|
27
|
+
#################################################
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def extract_filename(path_fasta: Path | str) -> str:
|
|
31
|
+
filename = Path(path_fasta).name
|
|
32
|
+
filename = re.sub(r"\..*$", "", filename) # Remove file extension
|
|
33
|
+
return sanitize_filename(filename)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
#################################################
|
|
37
|
+
# Convert allele file to dictionary type fasta format
|
|
38
|
+
#################################################
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def dictionize_allele(path_fasta: str | Path) -> dict[str, str]:
|
|
42
|
+
return {sanitize_filename(name): seq.upper() for name, seq, _ in mappy.fastx_read(str(path_fasta))}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
#################################################
|
|
46
|
+
# Export fasta files as single-FASTA format
|
|
47
|
+
#################################################
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def export_fasta_files(TEMPDIR: Path, FASTA_ALLELES: dict, NAME: str) -> None:
|
|
51
|
+
"""+ Save multiple FASTAs in separate single-FASTA format files."""
|
|
52
|
+
for identifier, sequence in FASTA_ALLELES.items():
|
|
53
|
+
contents = "\n".join([">" + identifier, sequence]) + "\n"
|
|
54
|
+
output_fasta = Path(TEMPDIR, NAME, "fasta", f"{identifier}.fasta")
|
|
55
|
+
output_fasta.write_text(contents)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
#################################################
|
|
59
|
+
# save_concatenated_fastx
|
|
60
|
+
#################################################
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def extract_extention(path_file: Path) -> str:
|
|
64
|
+
suffixes = path_file.suffixes
|
|
65
|
+
return "".join(suffixes)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def is_gzip_file(path_file: Path) -> bool:
|
|
69
|
+
"""Check if a file is a GZip compressed file."""
|
|
70
|
+
try:
|
|
71
|
+
with path_file.open("rb") as f:
|
|
72
|
+
return f.read(2) == b"\x1f\x8b"
|
|
73
|
+
except IOError:
|
|
74
|
+
return False
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def save_fastq_as_gzip(TEMPDIR: Path, path_fastx: list[Path], barcode: str) -> None:
|
|
78
|
+
"""Merge gzip and non-gzip files into a single gzip file."""
|
|
79
|
+
with gzip.open(Path(TEMPDIR, barcode, "fastq", f"{barcode}.fastq.gz"), "wb") as merged_file:
|
|
80
|
+
for path_file in path_fastx:
|
|
81
|
+
if is_gzip_file(path_file):
|
|
82
|
+
with gzip.open(path_file, "rb") as f:
|
|
83
|
+
merged_file.write(f.read())
|
|
84
|
+
else:
|
|
85
|
+
with open(path_file, "r") as f:
|
|
86
|
+
merged_file.write(f.read().encode())
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def save_concatenated_fastx(TEMPDIR: Path, directory: str) -> None:
|
|
90
|
+
fastx_suffix = {".fa", ".fq", ".fasta", ".fastq", ".fa.gz", ".fq.gz", ".fasta.gz", ".fastq.gz"}
|
|
91
|
+
path_directory = Path(directory)
|
|
92
|
+
barcode = path_directory.stem
|
|
93
|
+
path_fastx = [path for path in path_directory.iterdir() if extract_extention(path) in fastx_suffix]
|
|
94
|
+
save_fastq_as_gzip(TEMPDIR, path_fastx, barcode)
|
|
@@ -23,40 +23,51 @@ def update_threads(threads: int) -> int:
|
|
|
23
23
|
########################################################################
|
|
24
24
|
|
|
25
25
|
|
|
26
|
-
def validate_file_existence(
|
|
27
|
-
if not Path(
|
|
28
|
-
raise FileNotFoundError(f"{
|
|
26
|
+
def validate_file_existence(path_file: str):
|
|
27
|
+
if not Path(path_file).exists():
|
|
28
|
+
raise FileNotFoundError(f"{path_file} is not found")
|
|
29
29
|
|
|
30
30
|
|
|
31
|
-
def validate_fastq_extension(
|
|
32
|
-
if not re.search(r".fastq$|.fastq.gz$|.fq$|.fq.gz$",
|
|
33
|
-
raise ValueError(f"{
|
|
31
|
+
def validate_fastq_extension(path_fastq: str):
|
|
32
|
+
if not re.search(r".fastq$|.fastq.gz$|.fq$|.fq.gz$", path_fastq):
|
|
33
|
+
raise ValueError(f"{path_fastq} requires extensions either 'fastq', 'fastq.gz', 'fq' or 'fq.gz'")
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
# Varidate if the file is in the proper format
|
|
37
|
-
|
|
38
|
-
def validate_fastq_content(fastq_path: str):
|
|
36
|
+
# Varidate if the file is in the proper format viewing top 100 lines
|
|
37
|
+
def validate_fastq_content(path_fastq: str):
|
|
39
38
|
try:
|
|
40
|
-
|
|
41
|
-
|
|
39
|
+
headers, seqs, quals = zip(*[(n, s, q) for i, (n, s, q) in enumerate(mappy.fastx_read(path_fastq)) if i < 100])
|
|
40
|
+
# Remove empty elements
|
|
41
|
+
headers = [header for header in headers if header]
|
|
42
|
+
seqs = [seq for seq in seqs if seq]
|
|
43
|
+
quals = [qual for qual in quals if qual]
|
|
44
|
+
|
|
45
|
+
if not (len(headers) == len(seqs) == len(quals) > 0):
|
|
42
46
|
raise ValueError
|
|
47
|
+
|
|
43
48
|
except ValueError:
|
|
44
|
-
raise ValueError(f"{
|
|
49
|
+
raise ValueError(f"{path_fastq} is not a proper FASTQ format")
|
|
45
50
|
|
|
46
51
|
|
|
47
|
-
def validate_fasta_content(
|
|
52
|
+
def validate_fasta_content(path_fasta: str):
|
|
48
53
|
try:
|
|
49
|
-
|
|
50
|
-
|
|
54
|
+
headers, seqs = zip(*[(n, s) for n, s, _ in mappy.fastx_read(path_fasta)])
|
|
55
|
+
# Remove empty elements
|
|
56
|
+
headers = [header for header in headers if header]
|
|
57
|
+
seqs = [seq for seq in seqs if seq]
|
|
58
|
+
|
|
59
|
+
if len(headers) != len(seqs) or not headers:
|
|
51
60
|
raise ValueError
|
|
61
|
+
|
|
52
62
|
except ValueError:
|
|
53
|
-
raise ValueError(f"{
|
|
54
|
-
|
|
55
|
-
|
|
63
|
+
raise ValueError(f"{path_fasta} is not a proper FASTA format")
|
|
64
|
+
|
|
65
|
+
if len(headers) != len(set(headers)):
|
|
66
|
+
raise ValueError(f"{path_fasta} must include unique identifiers")
|
|
56
67
|
if len(seqs) != len(set(seqs)):
|
|
57
|
-
raise ValueError(f"{
|
|
58
|
-
if "control" not in
|
|
59
|
-
raise ValueError(f"One of the headers in the {
|
|
68
|
+
raise ValueError(f"{path_fasta} must include unique DNA sequences")
|
|
69
|
+
if "control" not in headers:
|
|
70
|
+
raise ValueError(f"One of the headers in the {path_fasta} must be '>control'")
|
|
60
71
|
|
|
61
72
|
|
|
62
73
|
def validate_files(SAMPLE: str, CONTROL: str, ALLELE: str) -> None:
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import re
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Generator
|
|
4
7
|
from itertools import groupby
|
|
5
8
|
from DAJIN2.utils.dna_handler import revcomp
|
|
6
9
|
|
|
@@ -22,6 +25,17 @@ def is_mapped(s: list[str]) -> bool:
|
|
|
22
25
|
return not s[0].startswith("@") and s[9] != "*"
|
|
23
26
|
|
|
24
27
|
|
|
28
|
+
###########################################################
|
|
29
|
+
# Read sam
|
|
30
|
+
###########################################################
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def read_sam(path_of_sam: str | Path) -> Generator[list]:
|
|
34
|
+
with open(path_of_sam) as f:
|
|
35
|
+
for line in f:
|
|
36
|
+
yield line.strip().split("\t")
|
|
37
|
+
|
|
38
|
+
|
|
25
39
|
###########################################################
|
|
26
40
|
# remove_overlapped_reads
|
|
27
41
|
###########################################################
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: DAJIN2
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: One-step genotyping tools for targeted long-read sequencing
|
|
5
5
|
Home-page: https://github.com/akikuno/DAJIN2
|
|
6
6
|
Author: Akihiro Kuno
|
|
@@ -19,9 +19,7 @@ Requires-Dist: scipy>=1.6.0
|
|
|
19
19
|
Requires-Dist: pandas>=1.0.0
|
|
20
20
|
Requires-Dist: openpyxl>=3.0.0
|
|
21
21
|
Requires-Dist: rapidfuzz>=3.0.0
|
|
22
|
-
Requires-Dist: statsmodels>=0.13.5
|
|
23
22
|
Requires-Dist: scikit-learn>=1.0.0
|
|
24
|
-
Requires-Dist: openpyxl>=3.0.0
|
|
25
23
|
Requires-Dist: mappy>=2.24
|
|
26
24
|
Requires-Dist: pysam>=0.19.0
|
|
27
25
|
Requires-Dist: Flask>=2.2.0
|
|
@@ -29,7 +27,7 @@ Requires-Dist: waitress>=2.1.0
|
|
|
29
27
|
Requires-Dist: Jinja2>=3.1.0
|
|
30
28
|
Requires-Dist: plotly>=5.0.0
|
|
31
29
|
Requires-Dist: kaleido>=0.2.0
|
|
32
|
-
Requires-Dist: cstag>=0.
|
|
30
|
+
Requires-Dist: cstag>=1.0.0
|
|
33
31
|
Requires-Dist: midsv>=0.10.1
|
|
34
32
|
Requires-Dist: wslPath>=0.3.0
|
|
35
33
|
|
|
@@ -56,14 +54,14 @@ The name DAJIN is derived from the phrase 一網**打尽** (Ichimou **DAJIN** or
|
|
|
56
54
|
+ **Comprehensive Mutation Detection**: Equipped with the capability to detect genome editing events over a wide range, it can identify a broad spectrum of mutations, from small changes to large structural variations.
|
|
57
55
|
+ DAJIN2 is also possible to detect complex mutations characteristic of genome editing, such as "insertions occurring in regions where deletions have occurred."
|
|
58
56
|
+ **Intuitive Visualization**: The outcomes of genome editing are visualized intuitively, allowing for the rapid and easy identification and analysis of mutations.
|
|
59
|
-
+ **Multi-Sample Compatibility**:
|
|
57
|
+
+ **Multi-Sample Compatibility**: Enabling parallel processing of multiple samples. This facilitates efficient progression of large-scale experiments and comparative studies.
|
|
60
58
|
|
|
61
59
|
|
|
62
60
|
## 🛠 Installation
|
|
63
61
|
|
|
64
62
|
### Prerequisites
|
|
65
63
|
|
|
66
|
-
- Python 3.
|
|
64
|
+
- Python 3.8 or later
|
|
67
65
|
- Unix-like environment (Linux, macOS, WSL2, etc.)
|
|
68
66
|
|
|
69
67
|
### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
|
|
@@ -92,7 +90,7 @@ pip install DAJIN2
|
|
|
92
90
|
> If you encounter any issues during the installation, please refer to the [Troubleshooting Guide](https://github.com/akikuno/DAJIN2/blob/main/docs/TROUBLESHOOTING.md)
|
|
93
91
|
|
|
94
92
|
|
|
95
|
-
##
|
|
93
|
+
## 💻 Usage
|
|
96
94
|
|
|
97
95
|
### Required Files
|
|
98
96
|
|
|
@@ -126,11 +124,11 @@ Assuming barcode01 as the control and barcode02 as the sample, specify each dire
|
|
|
126
124
|
The FASTA file should contain descriptions of the alleles anticipated as a result of genome editing.
|
|
127
125
|
|
|
128
126
|
> [!IMPORTANT]
|
|
129
|
-
>
|
|
127
|
+
> **A header name >control and its sequence are mandatory.**
|
|
130
128
|
|
|
131
129
|
If there are anticipated alleles (e.g., knock-ins or knock-outs), include their sequences in the FASTA file too. These anticipated alleles can be named arbitrarily.
|
|
132
130
|
|
|
133
|
-
Below is
|
|
131
|
+
Below is an example of a FASTA file:
|
|
134
132
|
|
|
135
133
|
```text
|
|
136
134
|
>control
|
|
@@ -313,16 +311,17 @@ For example, Tyr point mutation is highlighted in **green**.
|
|
|
313
311
|
### 3. MUTATION_INFO
|
|
314
312
|
|
|
315
313
|
The MUTATION_INFO directory saves tables depicting mutation sites for each allele.
|
|
316
|
-
An example of a Tyr point mutation is described by its position on the chromosome and the type of mutation.
|
|
314
|
+
An example of a *Tyr* point mutation is described by its position on the chromosome and the type of mutation.
|
|
317
315
|
|
|
318
316
|
<img src="https://user-images.githubusercontent.com/15861316/274519342-a613490d-5dbb-4a27-a2cf-bca0686b30f0.png" width="75%">
|
|
319
317
|
|
|
320
|
-
### 4. read_plot.html and read_plot.pdf
|
|
318
|
+
### 4. resd_summary.xlsx, read_plot.html and read_plot.pdf
|
|
321
319
|
|
|
320
|
+
read_summary.xlsx describes the number of reads and presence proportion for each allele.
|
|
322
321
|
Both read_plot.html and read_plot.pdf illustrate the proportions of each allele.
|
|
323
|
-
The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for
|
|
322
|
+
The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for each allele.
|
|
324
323
|
|
|
325
|
-
|
|
324
|
+
The **Allele type** includes:
|
|
326
325
|
- **Intact**: Alleles that perfectly match the input FASTA allele.
|
|
327
326
|
- **Indels**: Substitutions, deletions, insertions, or inversions within 50 bases.
|
|
328
327
|
- **SV**: Substitutions, deletions, insertions, or inversions beyond 50 bases.
|
|
@@ -333,14 +332,10 @@ Additionally, the types of **Allele type** include:
|
|
|
333
332
|
> In PCR amplicon sequencing, the % of reads might not match the actual allele proportions due to amplification bias.
|
|
334
333
|
> Especially when large deletions are present, the deletion alleles might be significantly amplified, potentially not reflecting the actual allele proportions.
|
|
335
334
|
|
|
336
|
-
### 5. read_summary.xlsx
|
|
337
|
-
|
|
338
|
-
- read_summary.xlsx: Describes the number of reads and presence proportion for each allele.
|
|
339
|
-
|
|
340
335
|
## 📣Feedback and Support
|
|
341
336
|
|
|
342
337
|
For questions, bug reports, or other forms of feedback, we'd love to hear from you!
|
|
343
|
-
Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues) for all reporting purposes.
|
|
338
|
+
Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues/new/choose) for all reporting purposes.
|
|
344
339
|
|
|
345
340
|
Please refer to [CONTRIBUTING](https://github.com/akikuno/DAJIN2/blob/main/docs/CONTRIBUTING.md) for how to contribute and how to verify your contributions.
|
|
346
341
|
|
|
@@ -36,10 +36,10 @@ src/DAJIN2/core/consensus/name_handler.py
|
|
|
36
36
|
src/DAJIN2/core/consensus/similarity_searcher.py
|
|
37
37
|
src/DAJIN2/core/preprocess/__init__.py
|
|
38
38
|
src/DAJIN2/core/preprocess/cache_checker.py
|
|
39
|
-
src/DAJIN2/core/preprocess/
|
|
40
|
-
src/DAJIN2/core/preprocess/fastx_parser.py
|
|
39
|
+
src/DAJIN2/core/preprocess/directory_manager.py
|
|
41
40
|
src/DAJIN2/core/preprocess/genome_fetcher.py
|
|
42
41
|
src/DAJIN2/core/preprocess/homopolymer_handler.py
|
|
42
|
+
src/DAJIN2/core/preprocess/input_formatter.py
|
|
43
43
|
src/DAJIN2/core/preprocess/insertions_to_fasta.py
|
|
44
44
|
src/DAJIN2/core/preprocess/knockin_handler.py
|
|
45
45
|
src/DAJIN2/core/preprocess/mapping.py
|
|
@@ -3,9 +3,7 @@ scipy>=1.6.0
|
|
|
3
3
|
pandas>=1.0.0
|
|
4
4
|
openpyxl>=3.0.0
|
|
5
5
|
rapidfuzz>=3.0.0
|
|
6
|
-
statsmodels>=0.13.5
|
|
7
6
|
scikit-learn>=1.0.0
|
|
8
|
-
openpyxl>=3.0.0
|
|
9
7
|
mappy>=2.24
|
|
10
8
|
pysam>=0.19.0
|
|
11
9
|
Flask>=2.2.0
|
|
@@ -13,6 +11,6 @@ waitress>=2.1.0
|
|
|
13
11
|
Jinja2>=3.1.0
|
|
14
12
|
plotly>=5.0.0
|
|
15
13
|
kaleido>=0.2.0
|
|
16
|
-
cstag>=0.
|
|
14
|
+
cstag>=1.0.0
|
|
17
15
|
midsv>=0.10.1
|
|
18
16
|
wslPath>=0.3.0
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
from DAJIN2.core.preprocess import (
|
|
2
|
-
fastx_parser,
|
|
3
|
-
genome_fetcher,
|
|
4
|
-
cache_checker,
|
|
5
|
-
directories,
|
|
6
|
-
)
|
|
7
|
-
|
|
8
|
-
from DAJIN2.core.preprocess.mapping import generate_sam
|
|
9
|
-
from DAJIN2.core.preprocess.midsv_caller import generate_midsv
|
|
10
|
-
from DAJIN2.core.preprocess.knockin_handler import extract_knockin_loci
|
|
11
|
-
from DAJIN2.core.preprocess.mutation_extractor import cache_mutation_loci
|
|
12
|
-
from DAJIN2.core.preprocess.insertions_to_fasta import generate_insertion_fasta
|
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import re
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
|
|
6
|
-
import mappy
|
|
7
|
-
|
|
8
|
-
########################################################################
|
|
9
|
-
# Helper function
|
|
10
|
-
########################################################################
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def _sanitize_name(name: str) -> str:
|
|
14
|
-
"""
|
|
15
|
-
Sanitize the name by replacing invalid characters with '-'
|
|
16
|
-
"""
|
|
17
|
-
name = name.lstrip()
|
|
18
|
-
if not name:
|
|
19
|
-
raise ValueError("Provided FASTA/FASTQ is empty or consists only of whitespace")
|
|
20
|
-
return re.sub(r'[\\/:?.,\'"<>| ]', "-", name)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
########################################################################
|
|
24
|
-
# Extract basename
|
|
25
|
-
########################################################################
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def extract_basename(fastq_path: str) -> str:
|
|
29
|
-
name = Path(fastq_path).name
|
|
30
|
-
name = re.sub(r"\..*$", "", name) # Remove file extension
|
|
31
|
-
return _sanitize_name(name)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
########################################################################
|
|
35
|
-
# Convert allele file to dictionary type fasta format
|
|
36
|
-
########################################################################
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def dictionize_allele(path_fasta: str | Path) -> dict[str, str]:
|
|
40
|
-
return {_sanitize_name(name): seq.upper() for name, seq, _ in mappy.fastx_read(str(path_fasta))}
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
########################################################################
|
|
44
|
-
# Export fasta files as single-FASTA format
|
|
45
|
-
########################################################################
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def export_fasta_files(TEMPDIR: Path, FASTA_ALLELES: dict, NAME: str) -> None:
|
|
49
|
-
"""
|
|
50
|
-
This function exports FASTA files in single-FASTA format.
|
|
51
|
-
|
|
52
|
-
:param TEMPDIR: Temporary directory Path object where the output files will be saved.
|
|
53
|
-
:param FASTA_ALLELES: Dictionary containing identifier and sequence pairs.
|
|
54
|
-
:param NAME: Name to be included in the output path.
|
|
55
|
-
"""
|
|
56
|
-
for identifier, sequence in FASTA_ALLELES.items():
|
|
57
|
-
contents = "\n".join([">" + identifier, sequence]) + "\n"
|
|
58
|
-
output_fasta = Path(TEMPDIR, NAME, "fasta", f"{identifier}.fasta")
|
|
59
|
-
output_fasta.write_text(contents)
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import gzip
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
|
|
6
|
-
#################################################
|
|
7
|
-
# save_concatenated_fastx
|
|
8
|
-
#################################################
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def extract_extention(file_path: Path) -> str:
|
|
12
|
-
suffixes = file_path.suffixes
|
|
13
|
-
return "".join(suffixes[-2:]) if len(suffixes) >= 2 else suffixes[0]
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
def is_gzip_file(file_name: Path) -> bool:
|
|
17
|
-
"""Check if a file is a GZip compressed file."""
|
|
18
|
-
try:
|
|
19
|
-
with file_name.open("rb") as f:
|
|
20
|
-
return f.read(2) == b"\x1f\x8b"
|
|
21
|
-
except IOError:
|
|
22
|
-
return False
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def save_fastq_as_gzip(TEMPDIR: Path, path_fastx: list[Path], barcode: str) -> None:
|
|
26
|
-
"""Merge gzip and non-gzip files into a single gzip file."""
|
|
27
|
-
with gzip.open(Path(TEMPDIR, barcode, "fastq", f"{barcode}.fastq.gz"), "wb") as merged_file:
|
|
28
|
-
for file_name in path_fastx:
|
|
29
|
-
if is_gzip_file(file_name):
|
|
30
|
-
with gzip.open(file_name, "rb") as f:
|
|
31
|
-
merged_file.write(f.read())
|
|
32
|
-
else:
|
|
33
|
-
with open(file_name, "r") as f:
|
|
34
|
-
merged_file.write(f.read().encode())
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def save_concatenated_fastx(TEMPDIR: Path, directory: str) -> None:
|
|
38
|
-
fastx_suffix = {".fa", ".fq", ".fasta", ".fastq", ".fa.gz", ".fq.gz", ".fasta.gz", ".fastq.gz"}
|
|
39
|
-
path_directory = Path(directory)
|
|
40
|
-
barcode = path_directory.stem
|
|
41
|
-
path_fastx = [path for path in path_directory.iterdir() if extract_extention(path) in fastx_suffix]
|
|
42
|
-
save_fastq_as_gzip(TEMPDIR, path_fastx, barcode)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|