DAJIN2 0.4.1__zip → 0.4.2__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {DAJIN2-0.4.1/src/DAJIN2.egg-info → DAJIN2-0.4.2}/PKG-INFO +13 -18
  2. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/README.md +11 -14
  3. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/requirements.txt +1 -4
  4. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/setup.py +1 -1
  5. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/consensus/consensus.py +3 -2
  6. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/consensus/name_handler.py +1 -7
  7. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/core.py +12 -115
  8. DAJIN2-0.4.2/src/DAJIN2/core/preprocess/__init__.py +9 -0
  9. DAJIN2-0.4.2/src/DAJIN2/core/preprocess/input_formatter.py +109 -0
  10. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/mapping.py +4 -0
  11. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/midsv_caller.py +2 -2
  12. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/main.py +1 -1
  13. DAJIN2-0.4.2/src/DAJIN2/utils/fastx_handler.py +94 -0
  14. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/input_validator.py +32 -21
  15. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/sam_handler.py +14 -0
  16. {DAJIN2-0.4.1 → DAJIN2-0.4.2/src/DAJIN2.egg-info}/PKG-INFO +13 -18
  17. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2.egg-info/SOURCES.txt +2 -2
  18. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2.egg-info/requires.txt +1 -3
  19. DAJIN2-0.4.1/src/DAJIN2/core/preprocess/__init__.py +0 -12
  20. DAJIN2-0.4.1/src/DAJIN2/core/preprocess/fastx_parser.py +0 -59
  21. DAJIN2-0.4.1/src/DAJIN2/utils/fastx_handler.py +0 -42
  22. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/LICENSE +0 -0
  23. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/MANIFEST.in +0 -0
  24. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/setup.cfg +0 -0
  25. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/__init__.py +0 -0
  26. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/__init__.py +0 -0
  27. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/classification/__init__.py +0 -0
  28. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/classification/allele_merger.py +0 -0
  29. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/classification/classifier.py +0 -0
  30. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/__init__.py +0 -0
  31. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/appender.py +0 -0
  32. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/clustering.py +0 -0
  33. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/kmer_generator.py +0 -0
  34. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/label_extractor.py +0 -0
  35. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/label_merger.py +0 -0
  36. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/label_updator.py +0 -0
  37. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/score_handler.py +0 -0
  38. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/clustering/strand_bias_handler.py +0 -0
  39. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/consensus/__init__.py +0 -0
  40. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/consensus/clust_formatter.py +0 -0
  41. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/consensus/mutation_extractor.py +0 -0
  42. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/consensus/similarity_searcher.py +0 -0
  43. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/cache_checker.py +0 -0
  44. /DAJIN2-0.4.1/src/DAJIN2/core/preprocess/directories.py → /DAJIN2-0.4.2/src/DAJIN2/core/preprocess/directory_manager.py +0 -0
  45. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/genome_fetcher.py +0 -0
  46. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/homopolymer_handler.py +0 -0
  47. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/insertions_to_fasta.py +0 -0
  48. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/knockin_handler.py +0 -0
  49. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/preprocess/mutation_extractor.py +0 -0
  50. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/report/__init__.py +0 -0
  51. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/report/insertion_reflector.py +0 -0
  52. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/report/report_bam.py +0 -0
  53. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/report/report_files.py +0 -0
  54. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/core/report/report_mutation.py +0 -0
  55. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/gui.py +0 -0
  56. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/static/css/style.css +0 -0
  57. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/template_igvjs.html +0 -0
  58. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/templates/index.html +0 -0
  59. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/config.py +0 -0
  60. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/cssplits_handler.py +0 -0
  61. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/dna_handler.py +0 -0
  62. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/io.py +0 -0
  63. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/multiprocess.py +0 -0
  64. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/utils/report_generator.py +0 -0
  65. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2/view.py +0 -0
  66. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2.egg-info/dependency_links.txt +0 -0
  67. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2.egg-info/entry_points.txt +0 -0
  68. {DAJIN2-0.4.1 → DAJIN2-0.4.2}/src/DAJIN2.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: DAJIN2
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: One-step genotyping tools for targeted long-read sequencing
5
5
  Home-page: https://github.com/akikuno/DAJIN2
6
6
  Author: Akihiro Kuno
@@ -19,9 +19,7 @@ Requires-Dist: scipy>=1.6.0
19
19
  Requires-Dist: pandas>=1.0.0
20
20
  Requires-Dist: openpyxl>=3.0.0
21
21
  Requires-Dist: rapidfuzz>=3.0.0
22
- Requires-Dist: statsmodels>=0.13.5
23
22
  Requires-Dist: scikit-learn>=1.0.0
24
- Requires-Dist: openpyxl>=3.0.0
25
23
  Requires-Dist: mappy>=2.24
26
24
  Requires-Dist: pysam>=0.19.0
27
25
  Requires-Dist: Flask>=2.2.0
@@ -29,7 +27,7 @@ Requires-Dist: waitress>=2.1.0
29
27
  Requires-Dist: Jinja2>=3.1.0
30
28
  Requires-Dist: plotly>=5.0.0
31
29
  Requires-Dist: kaleido>=0.2.0
32
- Requires-Dist: cstag>=0.4.1
30
+ Requires-Dist: cstag>=1.0.0
33
31
  Requires-Dist: midsv>=0.10.1
34
32
  Requires-Dist: wslPath>=0.3.0
35
33
 
@@ -56,14 +54,14 @@ The name DAJIN is derived from the phrase 一網**打尽** (Ichimou **DAJIN** or
56
54
  + **Comprehensive Mutation Detection**: Equipped with the capability to detect genome editing events over a wide range, it can identify a broad spectrum of mutations, from small changes to large structural variations.
57
55
  + DAJIN2 is also possible to detect complex mutations characteristic of genome editing, such as "insertions occurring in regions where deletions have occurred."
58
56
  + **Intuitive Visualization**: The outcomes of genome editing are visualized intuitively, allowing for the rapid and easy identification and analysis of mutations.
59
- + **Multi-Sample Compatibility**: Accommodates a variety of samples, enabling simultaneous processing of multiple samples. This facilitates efficient progression of large-scale experiments and comparative studies.
57
+ + **Multi-Sample Compatibility**: Enabling parallel processing of multiple samples. This facilitates efficient progression of large-scale experiments and comparative studies.
60
58
 
61
59
 
62
60
  ## 🛠 Installation
63
61
 
64
62
  ### Prerequisites
65
63
 
66
- - Python 3.7 or later
64
+ - Python 3.8 or later
67
65
  - Unix-like environment (Linux, macOS, WSL2, etc.)
68
66
 
69
67
  ### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
@@ -92,7 +90,7 @@ pip install DAJIN2
92
90
  > If you encounter any issues during the installation, please refer to the [Troubleshooting Guide](https://github.com/akikuno/DAJIN2/blob/main/docs/TROUBLESHOOTING.md)
93
91
 
94
92
 
95
- ## 💡 Usage
93
+ ## 💻 Usage
96
94
 
97
95
  ### Required Files
98
96
 
@@ -126,11 +124,11 @@ Assuming barcode01 as the control and barcode02 as the sample, specify each dire
126
124
  The FASTA file should contain descriptions of the alleles anticipated as a result of genome editing.
127
125
 
128
126
  > [!IMPORTANT]
129
- > Specifying the control allele: A header name >control and its sequence are mandatory.
127
+ > **A header name >control and its sequence are mandatory.**
130
128
 
131
129
  If there are anticipated alleles (e.g., knock-ins or knock-outs), include their sequences in the FASTA file too. These anticipated alleles can be named arbitrarily.
132
130
 
133
- Below is a typical example of a FASTA file:
131
+ Below is an example of a FASTA file:
134
132
 
135
133
  ```text
136
134
  >control
@@ -313,16 +311,17 @@ For example, Tyr point mutation is highlighted in **green**.
313
311
  ### 3. MUTATION_INFO
314
312
 
315
313
  The MUTATION_INFO directory saves tables depicting mutation sites for each allele.
316
- An example of a Tyr point mutation is described by its position on the chromosome and the type of mutation.
314
+ An example of a *Tyr* point mutation is described by its position on the chromosome and the type of mutation.
317
315
 
318
316
  <img src="https://user-images.githubusercontent.com/15861316/274519342-a613490d-5dbb-4a27-a2cf-bca0686b30f0.png" width="75%">
319
317
 
320
- ### 4. read_plot.html and read_plot.pdf
318
+ ### 4. resd_summary.xlsx, read_plot.html and read_plot.pdf
321
319
 
320
+ read_summary.xlsx describes the number of reads and presence proportion for each allele.
322
321
  Both read_plot.html and read_plot.pdf illustrate the proportions of each allele.
323
- The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for that allele.
322
+ The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for each allele.
324
323
 
325
- Additionally, the types of **Allele type** include:
324
+ The **Allele type** includes:
326
325
  - **Intact**: Alleles that perfectly match the input FASTA allele.
327
326
  - **Indels**: Substitutions, deletions, insertions, or inversions within 50 bases.
328
327
  - **SV**: Substitutions, deletions, insertions, or inversions beyond 50 bases.
@@ -333,14 +332,10 @@ Additionally, the types of **Allele type** include:
333
332
  > In PCR amplicon sequencing, the % of reads might not match the actual allele proportions due to amplification bias.
334
333
  > Especially when large deletions are present, the deletion alleles might be significantly amplified, potentially not reflecting the actual allele proportions.
335
334
 
336
- ### 5. read_summary.xlsx
337
-
338
- - read_summary.xlsx: Describes the number of reads and presence proportion for each allele.
339
-
340
335
  ## 📣Feedback and Support
341
336
 
342
337
  For questions, bug reports, or other forms of feedback, we'd love to hear from you!
343
- Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues) for all reporting purposes.
338
+ Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues/new/choose) for all reporting purposes.
344
339
 
345
340
  Please refer to [CONTRIBUTING](https://github.com/akikuno/DAJIN2/blob/main/docs/CONTRIBUTING.md) for how to contribute and how to verify your contributions.
346
341
 
@@ -21,14 +21,14 @@ The name DAJIN is derived from the phrase 一網**打尽** (Ichimou **DAJIN** or
21
21
  + **Comprehensive Mutation Detection**: Equipped with the capability to detect genome editing events over a wide range, it can identify a broad spectrum of mutations, from small changes to large structural variations.
22
22
  + DAJIN2 is also possible to detect complex mutations characteristic of genome editing, such as "insertions occurring in regions where deletions have occurred."
23
23
  + **Intuitive Visualization**: The outcomes of genome editing are visualized intuitively, allowing for the rapid and easy identification and analysis of mutations.
24
- + **Multi-Sample Compatibility**: Accommodates a variety of samples, enabling simultaneous processing of multiple samples. This facilitates efficient progression of large-scale experiments and comparative studies.
24
+ + **Multi-Sample Compatibility**: Enabling parallel processing of multiple samples. This facilitates efficient progression of large-scale experiments and comparative studies.
25
25
 
26
26
 
27
27
  ## 🛠 Installation
28
28
 
29
29
  ### Prerequisites
30
30
 
31
- - Python 3.7 or later
31
+ - Python 3.8 or later
32
32
  - Unix-like environment (Linux, macOS, WSL2, etc.)
33
33
 
34
34
  ### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
@@ -57,7 +57,7 @@ pip install DAJIN2
57
57
  > If you encounter any issues during the installation, please refer to the [Troubleshooting Guide](https://github.com/akikuno/DAJIN2/blob/main/docs/TROUBLESHOOTING.md)
58
58
 
59
59
 
60
- ## 💡 Usage
60
+ ## 💻 Usage
61
61
 
62
62
  ### Required Files
63
63
 
@@ -91,11 +91,11 @@ Assuming barcode01 as the control and barcode02 as the sample, specify each dire
91
91
  The FASTA file should contain descriptions of the alleles anticipated as a result of genome editing.
92
92
 
93
93
  > [!IMPORTANT]
94
- > Specifying the control allele: A header name >control and its sequence are mandatory.
94
+ > **A header name >control and its sequence are mandatory.**
95
95
 
96
96
  If there are anticipated alleles (e.g., knock-ins or knock-outs), include their sequences in the FASTA file too. These anticipated alleles can be named arbitrarily.
97
97
 
98
- Below is a typical example of a FASTA file:
98
+ Below is an example of a FASTA file:
99
99
 
100
100
  ```text
101
101
  >control
@@ -278,16 +278,17 @@ For example, Tyr point mutation is highlighted in **green**.
278
278
  ### 3. MUTATION_INFO
279
279
 
280
280
  The MUTATION_INFO directory saves tables depicting mutation sites for each allele.
281
- An example of a Tyr point mutation is described by its position on the chromosome and the type of mutation.
281
+ An example of a *Tyr* point mutation is described by its position on the chromosome and the type of mutation.
282
282
 
283
283
  <img src="https://user-images.githubusercontent.com/15861316/274519342-a613490d-5dbb-4a27-a2cf-bca0686b30f0.png" width="75%">
284
284
 
285
- ### 4. read_plot.html and read_plot.pdf
285
+ ### 4. resd_summary.xlsx, read_plot.html and read_plot.pdf
286
286
 
287
+ read_summary.xlsx describes the number of reads and presence proportion for each allele.
287
288
  Both read_plot.html and read_plot.pdf illustrate the proportions of each allele.
288
- The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for that allele.
289
+ The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for each allele.
289
290
 
290
- Additionally, the types of **Allele type** include:
291
+ The **Allele type** includes:
291
292
  - **Intact**: Alleles that perfectly match the input FASTA allele.
292
293
  - **Indels**: Substitutions, deletions, insertions, or inversions within 50 bases.
293
294
  - **SV**: Substitutions, deletions, insertions, or inversions beyond 50 bases.
@@ -298,14 +299,10 @@ Additionally, the types of **Allele type** include:
298
299
  > In PCR amplicon sequencing, the % of reads might not match the actual allele proportions due to amplification bias.
299
300
  > Especially when large deletions are present, the deletion alleles might be significantly amplified, potentially not reflecting the actual allele proportions.
300
301
 
301
- ### 5. read_summary.xlsx
302
-
303
- - read_summary.xlsx: Describes the number of reads and presence proportion for each allele.
304
-
305
302
  ## 📣Feedback and Support
306
303
 
307
304
  For questions, bug reports, or other forms of feedback, we'd love to hear from you!
308
- Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues) for all reporting purposes.
305
+ Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues/new/choose) for all reporting purposes.
309
306
 
310
307
  Please refer to [CONTRIBUTING](https://github.com/akikuno/DAJIN2/blob/main/docs/CONTRIBUTING.md) for how to contribute and how to verify your contributions.
311
308
 
@@ -3,11 +3,8 @@ scipy >= 1.6.0
3
3
  pandas >= 1.0.0
4
4
  openpyxl >= 3.0.0
5
5
  rapidfuzz >=3.0.0
6
- statsmodels >= 0.13.5
7
6
  scikit-learn >= 1.0.0
8
7
 
9
- openpyxl >= 3.0.0
10
-
11
8
  mappy >= 2.24
12
9
  pysam >= 0.19.0
13
10
 
@@ -18,6 +15,6 @@ Jinja2 >= 3.1.0
18
15
  plotly >= 5.0.0
19
16
  kaleido >= 0.2.0
20
17
 
21
- cstag >= 0.4.1
18
+ cstag >= 1.0.0
22
19
  midsv >= 0.10.1
23
20
  wslPath >=0.3.0
@@ -9,7 +9,7 @@ with open("requirements.txt") as requirements_file:
9
9
 
10
10
  setuptools.setup(
11
11
  name="DAJIN2",
12
- version="0.4.1",
12
+ version="0.4.2",
13
13
  author="Akihiro Kuno",
14
14
  author_email="akuno@md.tsukuba.ac.jp",
15
15
  description="One-step genotyping tools for targeted long-read sequencing",
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from pathlib import Path
4
- from typing import NamedTuple
4
+ from dataclasses import dataclass
5
5
  from itertools import groupby
6
6
  from collections import defaultdict
7
7
 
@@ -90,7 +90,8 @@ def call_percentage(cssplits: list[list[str]], mutation_loci: list[set[str]]) ->
90
90
  ###########################################################
91
91
 
92
92
 
93
- class ConsensusKey(NamedTuple):
93
+ @dataclass(frozen=True)
94
+ class ConsensusKey:
94
95
  allele: str
95
96
  label: int
96
97
  percent: float
@@ -1,13 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import re
4
- from typing import NamedTuple
5
-
6
-
7
- class ConsensusKey(NamedTuple):
8
- allele: str
9
- label: int
10
- percent: float
4
+ from DAJIN2.core.consensus.consensus import ConsensusKey
11
5
 
12
6
 
13
7
  def _detect_sv(cons_percentages: dict[ConsensusKey, list], threshold: int = 50) -> list[bool]:
@@ -2,119 +2,16 @@ from __future__ import annotations
2
2
 
3
3
  import shutil
4
4
  import logging
5
- import uuid
6
5
 
7
6
  from pathlib import Path
8
- from typing import NamedTuple
9
- from collections import defaultdict
10
7
 
11
- from DAJIN2.utils import io, config, fastx_handler
8
+ from DAJIN2.utils import io, fastx_handler
12
9
  from DAJIN2.core import classification, clustering, consensus, preprocess, report
10
+ from DAJIN2.core.preprocess.input_formatter import FormattedInputs
13
11
 
14
12
  logger = logging.getLogger(__name__)
15
13
 
16
14
 
17
- def parse_arguments(arguments: dict) -> tuple:
18
- genome_urls = defaultdict(str)
19
- if arguments.get("genome"):
20
- genome_urls.update(
21
- {"genome": arguments["genome"], "blat": arguments["blat"], "goldenpath": arguments["goldenpath"]}
22
- )
23
-
24
- return (
25
- arguments["sample"],
26
- arguments["control"],
27
- arguments["allele"],
28
- arguments["name"],
29
- arguments["threads"],
30
- genome_urls,
31
- uuid.uuid4().hex,
32
- )
33
-
34
-
35
- def convert_input_paths_to_posix(sample: str, control: str, allele: str) -> tuple:
36
- sample = io.convert_to_posix(sample)
37
- control = io.convert_to_posix(control)
38
- allele = io.convert_to_posix(allele)
39
-
40
- return sample, control, allele
41
-
42
-
43
- def create_temporal_directory(name: str, control_name: str) -> Path:
44
- tempdir = Path(config.TEMP_ROOT_DIR, name)
45
- Path(tempdir, "cache", ".igvjs", control_name).mkdir(parents=True, exist_ok=True)
46
-
47
- return tempdir
48
-
49
-
50
- def check_caches(tempdir: Path, path_allele: str, genome_url: str) -> bool:
51
- is_cache_hash = preprocess.cache_checker.exists_cached_hash(tempdir=tempdir, path=path_allele)
52
- is_cache_genome = preprocess.cache_checker.exists_cached_genome(tempdir=tempdir, genome=genome_url)
53
-
54
- return is_cache_hash and is_cache_genome
55
-
56
-
57
- def get_genome_coordinates(genome_urls: dict, fasta_alleles: dict, is_cache_genome: bool, tempdir: Path) -> dict:
58
- genome_coordinates = {
59
- "genome": genome_urls["genome"],
60
- "chrom_size": 0,
61
- "chrom": "control",
62
- "start": 0,
63
- "end": len(fasta_alleles["control"]) - 1,
64
- "strand": "+",
65
- }
66
- if genome_urls["genome"]:
67
- if is_cache_genome:
68
- genome_coordinates = next(io.read_jsonl(Path(tempdir, "cache", "genome_coordinates.jsonl")))
69
- else:
70
- genome_coordinates = preprocess.genome_fetcher.fetch_coordinates(
71
- genome_coordinates, genome_urls, fasta_alleles["control"]
72
- )
73
- genome_coordinates["chrom_size"] = preprocess.genome_fetcher.fetch_chromosome_size(
74
- genome_coordinates, genome_urls
75
- )
76
- io.write_jsonl([genome_coordinates], Path(tempdir, "cache", "genome_coordinates.jsonl"))
77
-
78
- return genome_coordinates
79
-
80
-
81
- class FormattedInputs(NamedTuple):
82
- path_sample: str
83
- path_control: str
84
- path_allele: str
85
- sample_name: str
86
- control_name: str
87
- fasta_alleles: dict[str, str]
88
- tempdir: Path
89
- genome_coordinates: dict[str, str]
90
- threads: int
91
- uuid: str
92
-
93
-
94
- def format_inputs(arguments: dict) -> FormattedInputs:
95
- path_sample, path_control, path_allele, name, threads, genome_urls, uuid = parse_arguments(arguments)
96
- path_sample, path_control, path_allele = convert_input_paths_to_posix(path_sample, path_control, path_allele)
97
- sample_name = preprocess.fastx_parser.extract_basename(path_sample)
98
- control_name = preprocess.fastx_parser.extract_basename(path_control)
99
- fasta_alleles = preprocess.fastx_parser.dictionize_allele(path_allele)
100
- tempdir = create_temporal_directory(name, control_name)
101
- is_cache_genome = check_caches(tempdir, path_allele, genome_urls["genome"])
102
- genome_coordinates = get_genome_coordinates(genome_urls, fasta_alleles, is_cache_genome, tempdir)
103
-
104
- return FormattedInputs(
105
- path_sample,
106
- path_control,
107
- path_allele,
108
- sample_name,
109
- control_name,
110
- fasta_alleles,
111
- tempdir,
112
- genome_coordinates,
113
- threads,
114
- uuid,
115
- )
116
-
117
-
118
15
  ###########################################################
119
16
  # main
120
17
  ###########################################################
@@ -126,9 +23,9 @@ def execute_control(arguments: dict):
126
23
  ###########################################################
127
24
  # Preprocess
128
25
  ###########################################################
129
- ARGS = format_inputs(arguments)
130
- preprocess.directories.create_temporal_directories(ARGS.tempdir, ARGS.control_name, is_control=True)
131
- preprocess.directories.create_report_directories(ARGS.tempdir, ARGS.control_name, is_control=True)
26
+ ARGS: FormattedInputs = preprocess.format_inputs(arguments)
27
+ preprocess.create_temporal_directories(ARGS.tempdir, ARGS.control_name, is_control=True)
28
+ preprocess.create_report_directories(ARGS.tempdir, ARGS.control_name, is_control=True)
132
29
  io.cache_control_hash(ARGS.tempdir, ARGS.path_allele)
133
30
 
134
31
  ###########################################################
@@ -151,7 +48,7 @@ def execute_control(arguments: dict):
151
48
  # ============================================================
152
49
  # Export fasta files as single-FASTA format
153
50
  # ============================================================
154
- preprocess.fastx_parser.export_fasta_files(ARGS.tempdir, ARGS.fasta_alleles, ARGS.control_name)
51
+ fastx_handler.export_fasta_files(ARGS.tempdir, ARGS.fasta_alleles, ARGS.control_name)
155
52
 
156
53
  # ============================================================
157
54
  # Mapping using mappy
@@ -189,9 +86,9 @@ def execute_sample(arguments: dict):
189
86
  # Preprocess
190
87
  ###########################################################
191
88
 
192
- ARGS = format_inputs(arguments)
193
- preprocess.directories.create_temporal_directories(ARGS.tempdir, ARGS.sample_name, is_control=False)
194
- preprocess.directories.create_report_directories(ARGS.tempdir, ARGS.sample_name, is_control=False)
89
+ ARGS: FormattedInputs = preprocess.format_inputs(arguments)
90
+ preprocess.create_temporal_directories(ARGS.tempdir, ARGS.sample_name, is_control=False)
91
+ preprocess.create_report_directories(ARGS.tempdir, ARGS.sample_name, is_control=False)
195
92
 
196
93
  logger.info(f"Preprocess {arguments['sample']}...")
197
94
 
@@ -209,7 +106,7 @@ def execute_sample(arguments: dict):
209
106
  shutil.copy(path_fasta, Path(ARGS.tempdir, ARGS.sample_name, "fasta"))
210
107
 
211
108
  paths_fasta = Path(ARGS.tempdir, ARGS.sample_name, "fasta").glob("*.fasta")
212
- preprocess.mapping.generate_sam(ARGS, paths_fasta, is_control=False, is_insertion=False)
109
+ preprocess.generate_sam(ARGS, paths_fasta, is_control=False, is_insertion=False)
213
110
 
214
111
  # ============================================================
215
112
  # MIDSV conversion
@@ -234,8 +131,8 @@ def execute_sample(arguments: dict):
234
131
 
235
132
  if paths_insertion_fasta:
236
133
  # mapping to insertion alleles
237
- preprocess.mapping.generate_sam(ARGS, paths_insertion_fasta, is_control=True, is_insertion=True)
238
- preprocess.mapping.generate_sam(ARGS, paths_insertion_fasta, is_control=False, is_insertion=True)
134
+ preprocess.generate_sam(ARGS, paths_insertion_fasta, is_control=True, is_insertion=True)
135
+ preprocess.generate_sam(ARGS, paths_insertion_fasta, is_control=False, is_insertion=True)
239
136
  # add insertions to ARGS.fasta_alleles
240
137
  for path_fasta in paths_insertion_fasta:
241
138
  allele, seq = Path(path_fasta).read_text().strip().split("\n")
@@ -0,0 +1,9 @@
1
+ from DAJIN2.core.preprocess.cache_checker import exists_cached_hash, exists_cached_genome
2
+ from DAJIN2.core.preprocess.genome_fetcher import fetch_coordinates, fetch_chromosome_size
3
+ from DAJIN2.core.preprocess.mapping import generate_sam
4
+ from DAJIN2.core.preprocess.directory_manager import create_temporal_directories, create_report_directories
5
+ from DAJIN2.core.preprocess.input_formatter import format_inputs
6
+ from DAJIN2.core.preprocess.midsv_caller import generate_midsv
7
+ from DAJIN2.core.preprocess.knockin_handler import extract_knockin_loci
8
+ from DAJIN2.core.preprocess.mutation_extractor import cache_mutation_loci
9
+ from DAJIN2.core.preprocess.insertions_to_fasta import generate_insertion_fasta
@@ -0,0 +1,109 @@
1
+ from __future__ import annotations
2
+
3
+ import uuid
4
+
5
+ from pathlib import Path
6
+ from dataclasses import dataclass
7
+ from collections import defaultdict
8
+
9
+ from DAJIN2.utils import io, config, fastx_handler
10
+
11
+ from DAJIN2.core import preprocess
12
+
13
+
14
+ def parse_arguments(arguments: dict) -> tuple:
15
+ genome_urls = defaultdict(str)
16
+ if arguments.get("genome"):
17
+ genome_urls.update(
18
+ {"genome": arguments["genome"], "blat": arguments["blat"], "goldenpath": arguments["goldenpath"]}
19
+ )
20
+
21
+ return (
22
+ arguments["sample"],
23
+ arguments["control"],
24
+ arguments["allele"],
25
+ arguments["name"],
26
+ arguments["threads"],
27
+ genome_urls,
28
+ uuid.uuid4().hex,
29
+ )
30
+
31
+
32
+ def convert_input_paths_to_posix(sample: str, control: str, allele: str) -> tuple:
33
+ sample = io.convert_to_posix(sample)
34
+ control = io.convert_to_posix(control)
35
+ allele = io.convert_to_posix(allele)
36
+
37
+ return sample, control, allele
38
+
39
+
40
+ def create_temporal_directory(name: str, control_name: str) -> Path:
41
+ tempdir = Path(config.TEMP_ROOT_DIR, name)
42
+ Path(tempdir, "cache", ".igvjs", control_name).mkdir(parents=True, exist_ok=True)
43
+
44
+ return tempdir
45
+
46
+
47
+ def check_caches(tempdir: Path, path_allele: str, genome_url: str) -> bool:
48
+ is_cache_hash = preprocess.exists_cached_hash(tempdir=tempdir, path=path_allele)
49
+ is_cache_genome = preprocess.exists_cached_genome(tempdir=tempdir, genome=genome_url)
50
+
51
+ return is_cache_hash and is_cache_genome
52
+
53
+
54
+ def get_genome_coordinates(genome_urls: dict, fasta_alleles: dict, is_cache_genome: bool, tempdir: Path) -> dict:
55
+ genome_coordinates = {
56
+ "genome": genome_urls["genome"],
57
+ "chrom_size": 0,
58
+ "chrom": "control",
59
+ "start": 0,
60
+ "end": len(fasta_alleles["control"]) - 1,
61
+ "strand": "+",
62
+ }
63
+ if genome_urls["genome"]:
64
+ if is_cache_genome:
65
+ genome_coordinates = next(io.read_jsonl(Path(tempdir, "cache", "genome_coordinates.jsonl")))
66
+ else:
67
+ genome_coordinates = preprocess.fetch_coordinates(genome_coordinates, genome_urls, fasta_alleles["control"])
68
+ genome_coordinates["chrom_size"] = preprocess.fetch_chromosome_size(genome_coordinates, genome_urls)
69
+ io.write_jsonl([genome_coordinates], Path(tempdir, "cache", "genome_coordinates.jsonl"))
70
+
71
+ return genome_coordinates
72
+
73
+
74
+ @dataclass(frozen=True)
75
+ class FormattedInputs:
76
+ path_sample: str
77
+ path_control: str
78
+ path_allele: str
79
+ sample_name: str
80
+ control_name: str
81
+ fasta_alleles: dict[str, str]
82
+ tempdir: Path
83
+ genome_coordinates: dict[str, str]
84
+ threads: int
85
+ uuid: str
86
+
87
+
88
+ def format_inputs(arguments: dict) -> FormattedInputs:
89
+ path_sample, path_control, path_allele, name, threads, genome_urls, uuid = parse_arguments(arguments)
90
+ path_sample, path_control, path_allele = convert_input_paths_to_posix(path_sample, path_control, path_allele)
91
+ sample_name = fastx_handler.extract_filename(path_sample)
92
+ control_name = fastx_handler.extract_filename(path_control)
93
+ fasta_alleles = fastx_handler.dictionize_allele(path_allele)
94
+ tempdir = create_temporal_directory(name, control_name)
95
+ is_cache_genome = check_caches(tempdir, path_allele, genome_urls["genome"])
96
+ genome_coordinates = get_genome_coordinates(genome_urls, fasta_alleles, is_cache_genome, tempdir)
97
+
98
+ return FormattedInputs(
99
+ path_sample,
100
+ path_control,
101
+ path_allele,
102
+ sample_name,
103
+ control_name,
104
+ fasta_alleles,
105
+ tempdir,
106
+ genome_coordinates,
107
+ threads,
108
+ uuid,
109
+ )
@@ -43,6 +43,10 @@ def to_sam(
43
43
  query_seq = QUERY_SEQ.upper()
44
44
  query_qual = QUERY_QUAL
45
45
 
46
+ # Skip multi-mapping reads
47
+ if hit.mapq == 0:
48
+ continue
49
+
46
50
  # Report flag
47
51
  if hit.is_primary:
48
52
  flag = 0 if hit.strand == 1 else 16
@@ -215,8 +215,8 @@ def generate_midsv(ARGS, is_control: bool = False, is_insertion: bool = False) -
215
215
  path_splice = Path(ARGS.tempdir, name, "sam", f"splice_{allele}.sam")
216
216
  path_output_midsv = Path(ARGS.tempdir, name, "midsv", f"{allele}.json")
217
217
 
218
- sam_ont = sam_handler.remove_overlapped_reads(list(midsv.read_sam(path_ont)))
219
- sam_splice = sam_handler.remove_overlapped_reads(list(midsv.read_sam(path_splice)))
218
+ sam_ont = sam_handler.remove_overlapped_reads(list(sam_handler.read_sam(path_ont)))
219
+ sam_splice = sam_handler.remove_overlapped_reads(list(sam_handler.read_sam(path_splice)))
220
220
  qname_of_map_ont = extract_qname_of_map_ont(sam_ont, sam_splice)
221
221
  sam_of_map_ont = filter_sam_by_preset(sam_ont, qname_of_map_ont, preset="map-ont")
222
222
  sam_of_splice = filter_sam_by_preset(sam_splice, qname_of_map_ont, preset="splice")
@@ -20,7 +20,7 @@ from DAJIN2.core import core
20
20
  from DAJIN2.utils import io, config, report_generator, input_validator, multiprocess
21
21
 
22
22
 
23
- DAJIN_VERSION = "0.4.1"
23
+ DAJIN_VERSION = "0.4.2"
24
24
 
25
25
 
26
26
  def generate_report(name: str) -> None:
@@ -0,0 +1,94 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import gzip
5
+ from pathlib import Path
6
+
7
+ import mappy
8
+
9
+
10
+ #################################################
11
+ # Helper function
12
+ #################################################
13
+
14
+
15
def sanitize_filename(path_file: Path | str) -> str:
    """Replace characters that are invalid in Windows filenames with '-'.

    Leading whitespace is removed first; an empty result raises ValueError.
    """
    stripped = str(path_file).lstrip()
    if stripped == "":
        raise ValueError("Provided FASTA/FASTQ is empty or consists only of whitespace")
    return re.sub(r'[\\/:?.,\'"<>| ]', "-", stripped)
23
+
24
+
25
+ #################################################
26
+ # Extract filename
27
+ #################################################
28
+
29
+
30
def extract_filename(path_fasta: Path | str) -> str:
    """Return the sanitized basename of a FASTA/FASTQ path, extension removed."""
    basename = Path(path_fasta).name
    basename = re.sub(r"\..*$", "", basename)  # drop everything from the first dot onward
    return sanitize_filename(basename)
34
+
35
+
36
+ #################################################
37
+ # Convert allele file to dictionary type fasta format
38
+ #################################################
39
+
40
+
41
def dictionize_allele(path_fasta: str | Path) -> dict[str, str]:
    """Map each sanitized FASTA header to its uppercased sequence."""
    alleles: dict[str, str] = {}
    for header, sequence, _ in mappy.fastx_read(str(path_fasta)):
        alleles[sanitize_filename(header)] = sequence.upper()
    return alleles
43
+
44
+
45
+ #################################################
46
+ # Export fasta files as single-FASTA format
47
+ #################################################
48
+
49
+
50
def export_fasta_files(TEMPDIR: Path, FASTA_ALLELES: dict, NAME: str) -> None:
    """Write each allele as its own single-record FASTA file under <TEMPDIR>/<NAME>/fasta/."""
    fasta_dir = Path(TEMPDIR, NAME, "fasta")
    for identifier, sequence in FASTA_ALLELES.items():
        record = f">{identifier}\n{sequence}\n"
        Path(fasta_dir, f"{identifier}.fasta").write_text(record)
56
+
57
+
58
+ #################################################
59
+ # save_concatenated_fastx
60
+ #################################################
61
+
62
+
63
def extract_extention(path_file: Path) -> str:
    """Return the trailing extension of *path_file*, keeping a '.gz' pair intact.

    Only the last two suffixes are considered so that filenames containing
    extra dots (e.g. 'sample.v1.fastq.gz') still resolve to a recognizable
    extension ('.fastq.gz') instead of the whole dotted tail, which would make
    save_concatenated_fastx silently skip the file. Returns an empty string
    for names without any extension.
    """
    return "".join(path_file.suffixes[-2:])
66
+
67
+
68
def is_gzip_file(path_file: Path) -> bool:
    """Return True when the file starts with the GZip magic number (0x1f 0x8b)."""
    try:
        with path_file.open("rb") as handle:
            magic = handle.read(2)
    except OSError:  # unreadable or missing file is treated as "not gzip"
        return False
    return magic == b"\x1f\x8b"
75
+
76
+
77
def save_fastq_as_gzip(TEMPDIR: Path, path_fastx: list[Path], barcode: str) -> None:
    """Concatenate gzip and plain FASTQ files into one gzip archive.

    Output goes to <TEMPDIR>/<barcode>/fastq/<barcode>.fastq.gz; gzip inputs
    are decompressed before being recompressed into the merged archive.
    """
    destination = Path(TEMPDIR, barcode, "fastq", f"{barcode}.fastq.gz")
    with gzip.open(destination, "wb") as merged:
        for source in path_fastx:
            if is_gzip_file(source):
                with gzip.open(source, "rb") as handle:
                    payload = handle.read()
            else:
                with open(source, "r") as handle:
                    payload = handle.read().encode()
            merged.write(payload)
87
+
88
+
89
def save_concatenated_fastx(TEMPDIR: Path, directory: str) -> None:
    """Collect every FASTA/FASTQ file in *directory* and merge them into one gzip.

    The directory's stem is used as the barcode name for the merged output.
    """
    valid_extensions = {".fa", ".fq", ".fasta", ".fastq", ".fa.gz", ".fq.gz", ".fasta.gz", ".fastq.gz"}
    path_directory = Path(directory)
    targets = [entry for entry in path_directory.iterdir() if extract_extention(entry) in valid_extensions]
    save_fastq_as_gzip(TEMPDIR, targets, path_directory.stem)
@@ -23,40 +23,51 @@ def update_threads(threads: int) -> int:
23
23
  ########################################################################
24
24
 
25
25
 
26
def validate_file_existence(path_file: str):
    """Raise FileNotFoundError when *path_file* does not exist on disk."""
    if Path(path_file).exists():
        return
    raise FileNotFoundError(f"{path_file} is not found")
29
29
 
30
30
 
31
def validate_fastq_extension(path_fastq: str):
    """Raise ValueError unless the path ends with a FASTQ extension.

    Accepted extensions: .fastq, .fastq.gz, .fq, .fq.gz
    """
    # Dots are escaped: the previous pattern (".fastq$" etc.) used '.' as a
    # wildcard, so names like "sample_fastq" were wrongly accepted.
    if not re.search(r"\.(fastq|fq)(\.gz)?$", path_fastq):
        raise ValueError(f"{path_fastq} requires extensions either 'fastq', 'fastq.gz', 'fq' or 'fq.gz'")
34
34
 
35
35
 
36
- # Varidate if the file is in the proper format.
37
- # See top 100 lines
38
- def validate_fastq_content(fastq_path: str):
36
+ # Validate if the file is in the proper format viewing top 100 lines
37
def validate_fastq_content(path_fastq: str):
    """Inspect up to the first 100 records and raise ValueError unless they form proper FASTQ."""
    try:
        records = [(n, s, q) for i, (n, s, q) in enumerate(mappy.fastx_read(path_fastq)) if i < 100]
        headers, seqs, quals = zip(*records)
        # Drop empty fields; a well-formed FASTQ record has none.
        headers = [h for h in headers if h]
        seqs = [s for s in seqs if s]
        quals = [q for q in quals if q]

        if not (len(headers) == len(seqs) == len(quals) > 0):
            raise ValueError

    except ValueError:
        raise ValueError(f"{path_fastq} is not a proper FASTQ format")
45
50
 
46
51
 
47
def validate_fasta_content(path_fasta: str):
    """Raise ValueError unless *path_fasta* is proper FASTA with unique headers,
    unique sequences, and a mandatory 'control' entry."""
    try:
        headers, seqs = zip(*[(n, s) for n, s, _ in mappy.fastx_read(path_fasta)])
        # Drop empty fields; a well-formed FASTA record has none.
        headers = [h for h in headers if h]
        seqs = [s for s in seqs if s]

        if len(headers) != len(seqs) or not headers:
            raise ValueError

    except ValueError:
        raise ValueError(f"{path_fasta} is not a proper FASTA format")

    if len(headers) != len(set(headers)):
        raise ValueError(f"{path_fasta} must include unique identifiers")
    if len(seqs) != len(set(seqs)):
        raise ValueError(f"{path_fasta} must include unique DNA sequences")
    if "control" not in headers:
        raise ValueError(f"One of the headers in the {path_fasta} must be '>control'")
60
71
 
61
72
 
62
73
  def validate_files(SAMPLE: str, CONTROL: str, ALLELE: str) -> None:
@@ -1,6 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import re
4
+
5
+ from pathlib import Path
6
+ from typing import Generator
4
7
  from itertools import groupby
5
8
  from DAJIN2.utils.dna_handler import revcomp
6
9
 
@@ -22,6 +25,17 @@ def is_mapped(s: list[str]) -> bool:
22
25
  return not s[0].startswith("@") and s[9] != "*"
23
26
 
24
27
 
28
+ ###########################################################
29
+ # Read sam
30
+ ###########################################################
31
+
32
+
33
def read_sam(path_of_sam: str | Path) -> Generator[list]:
    """Lazily yield each SAM line as a list of tab-separated fields."""
    with open(path_of_sam) as handle:
        yield from (record.strip().split("\t") for record in handle)
37
+
38
+
25
39
  ###########################################################
26
40
  # remove_overlapped_reads
27
41
  ###########################################################
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: DAJIN2
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: One-step genotyping tools for targeted long-read sequencing
5
5
  Home-page: https://github.com/akikuno/DAJIN2
6
6
  Author: Akihiro Kuno
@@ -19,9 +19,7 @@ Requires-Dist: scipy>=1.6.0
19
19
  Requires-Dist: pandas>=1.0.0
20
20
  Requires-Dist: openpyxl>=3.0.0
21
21
  Requires-Dist: rapidfuzz>=3.0.0
22
- Requires-Dist: statsmodels>=0.13.5
23
22
  Requires-Dist: scikit-learn>=1.0.0
24
- Requires-Dist: openpyxl>=3.0.0
25
23
  Requires-Dist: mappy>=2.24
26
24
  Requires-Dist: pysam>=0.19.0
27
25
  Requires-Dist: Flask>=2.2.0
@@ -29,7 +27,7 @@ Requires-Dist: waitress>=2.1.0
29
27
  Requires-Dist: Jinja2>=3.1.0
30
28
  Requires-Dist: plotly>=5.0.0
31
29
  Requires-Dist: kaleido>=0.2.0
32
- Requires-Dist: cstag>=0.4.1
30
+ Requires-Dist: cstag>=1.0.0
33
31
  Requires-Dist: midsv>=0.10.1
34
32
  Requires-Dist: wslPath>=0.3.0
35
33
 
@@ -56,14 +54,14 @@ The name DAJIN is derived from the phrase 一網**打尽** (Ichimou **DAJIN** or
56
54
  + **Comprehensive Mutation Detection**: Equipped with the capability to detect genome editing events over a wide range, it can identify a broad spectrum of mutations, from small changes to large structural variations.
57
55
  + DAJIN2 is also possible to detect complex mutations characteristic of genome editing, such as "insertions occurring in regions where deletions have occurred."
58
56
  + **Intuitive Visualization**: The outcomes of genome editing are visualized intuitively, allowing for the rapid and easy identification and analysis of mutations.
59
- + **Multi-Sample Compatibility**: Accommodates a variety of samples, enabling simultaneous processing of multiple samples. This facilitates efficient progression of large-scale experiments and comparative studies.
57
+ + **Multi-Sample Compatibility**: Enables parallel processing of multiple samples. This facilitates efficient progression of large-scale experiments and comparative studies.
60
58
 
61
59
 
62
60
  ## 🛠 Installation
63
61
 
64
62
  ### Prerequisites
65
63
 
66
- - Python 3.7 or later
64
+ - Python 3.8 or later
67
65
  - Unix-like environment (Linux, macOS, WSL2, etc.)
68
66
 
69
67
  ### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
@@ -92,7 +90,7 @@ pip install DAJIN2
92
90
  > If you encounter any issues during the installation, please refer to the [Troubleshooting Guide](https://github.com/akikuno/DAJIN2/blob/main/docs/TROUBLESHOOTING.md)
93
91
 
94
92
 
95
- ## 💡 Usage
93
+ ## 💻 Usage
96
94
 
97
95
  ### Required Files
98
96
 
@@ -126,11 +124,11 @@ Assuming barcode01 as the control and barcode02 as the sample, specify each dire
126
124
  The FASTA file should contain descriptions of the alleles anticipated as a result of genome editing.
127
125
 
128
126
  > [!IMPORTANT]
129
- > Specifying the control allele: A header name >control and its sequence are mandatory.
127
+ > **A header name >control and its sequence are mandatory.**
130
128
 
131
129
  If there are anticipated alleles (e.g., knock-ins or knock-outs), include their sequences in the FASTA file too. These anticipated alleles can be named arbitrarily.
132
130
 
133
- Below is a typical example of a FASTA file:
131
+ Below is an example of a FASTA file:
134
132
 
135
133
  ```text
136
134
  >control
@@ -313,16 +311,17 @@ For example, Tyr point mutation is highlighted in **green**.
313
311
  ### 3. MUTATION_INFO
314
312
 
315
313
  The MUTATION_INFO directory saves tables depicting mutation sites for each allele.
316
- An example of a Tyr point mutation is described by its position on the chromosome and the type of mutation.
314
+ An example of a *Tyr* point mutation is described by its position on the chromosome and the type of mutation.
317
315
 
318
316
  <img src="https://user-images.githubusercontent.com/15861316/274519342-a613490d-5dbb-4a27-a2cf-bca0686b30f0.png" width="75%">
319
317
 
320
- ### 4. read_plot.html and read_plot.pdf
318
+ ### 4. read_summary.xlsx, read_plot.html and read_plot.pdf
321
319
 
320
+ read_summary.xlsx describes the number of reads and presence proportion for each allele.
322
321
  Both read_plot.html and read_plot.pdf illustrate the proportions of each allele.
323
- The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for that allele.
322
+ The chart's **Allele type** indicates the type of allele, and **Percent of reads** shows the proportion of reads for each allele.
324
323
 
325
- Additionally, the types of **Allele type** include:
324
+ The **Allele type** includes:
326
325
  - **Intact**: Alleles that perfectly match the input FASTA allele.
327
326
  - **Indels**: Substitutions, deletions, insertions, or inversions within 50 bases.
328
327
  - **SV**: Substitutions, deletions, insertions, or inversions beyond 50 bases.
@@ -333,14 +332,10 @@ Additionally, the types of **Allele type** include:
333
332
  > In PCR amplicon sequencing, the % of reads might not match the actual allele proportions due to amplification bias.
334
333
  > Especially when large deletions are present, the deletion alleles might be significantly amplified, potentially not reflecting the actual allele proportions.
335
334
 
336
- ### 5. read_summary.xlsx
337
-
338
- - read_summary.xlsx: Describes the number of reads and presence proportion for each allele.
339
-
340
335
  ## 📣Feedback and Support
341
336
 
342
337
  For questions, bug reports, or other forms of feedback, we'd love to hear from you!
343
- Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues) for all reporting purposes.
338
+ Please use [GitHub Issues](https://github.com/akikuno/DAJIN2/issues/new/choose) for all reporting purposes.
344
339
 
345
340
  Please refer to [CONTRIBUTING](https://github.com/akikuno/DAJIN2/blob/main/docs/CONTRIBUTING.md) for how to contribute and how to verify your contributions.
346
341
 
@@ -36,10 +36,10 @@ src/DAJIN2/core/consensus/name_handler.py
36
36
  src/DAJIN2/core/consensus/similarity_searcher.py
37
37
  src/DAJIN2/core/preprocess/__init__.py
38
38
  src/DAJIN2/core/preprocess/cache_checker.py
39
- src/DAJIN2/core/preprocess/directories.py
40
- src/DAJIN2/core/preprocess/fastx_parser.py
39
+ src/DAJIN2/core/preprocess/directory_manager.py
41
40
  src/DAJIN2/core/preprocess/genome_fetcher.py
42
41
  src/DAJIN2/core/preprocess/homopolymer_handler.py
42
+ src/DAJIN2/core/preprocess/input_formatter.py
43
43
  src/DAJIN2/core/preprocess/insertions_to_fasta.py
44
44
  src/DAJIN2/core/preprocess/knockin_handler.py
45
45
  src/DAJIN2/core/preprocess/mapping.py
@@ -3,9 +3,7 @@ scipy>=1.6.0
3
3
  pandas>=1.0.0
4
4
  openpyxl>=3.0.0
5
5
  rapidfuzz>=3.0.0
6
- statsmodels>=0.13.5
7
6
  scikit-learn>=1.0.0
8
- openpyxl>=3.0.0
9
7
  mappy>=2.24
10
8
  pysam>=0.19.0
11
9
  Flask>=2.2.0
@@ -13,6 +11,6 @@ waitress>=2.1.0
13
11
  Jinja2>=3.1.0
14
12
  plotly>=5.0.0
15
13
  kaleido>=0.2.0
16
- cstag>=0.4.1
14
+ cstag>=1.0.0
17
15
  midsv>=0.10.1
18
16
  wslPath>=0.3.0
@@ -1,12 +0,0 @@
1
- from DAJIN2.core.preprocess import (
2
- fastx_parser,
3
- genome_fetcher,
4
- cache_checker,
5
- directories,
6
- )
7
-
8
- from DAJIN2.core.preprocess.mapping import generate_sam
9
- from DAJIN2.core.preprocess.midsv_caller import generate_midsv
10
- from DAJIN2.core.preprocess.knockin_handler import extract_knockin_loci
11
- from DAJIN2.core.preprocess.mutation_extractor import cache_mutation_loci
12
- from DAJIN2.core.preprocess.insertions_to_fasta import generate_insertion_fasta
@@ -1,59 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import re
4
- from pathlib import Path
5
-
6
- import mappy
7
-
8
- ########################################################################
9
- # Helper function
10
- ########################################################################
11
-
12
-
13
- def _sanitize_name(name: str) -> str:
14
- """
15
- Sanitize the name by replacing invalid characters with '-'
16
- """
17
- name = name.lstrip()
18
- if not name:
19
- raise ValueError("Provided FASTA/FASTQ is empty or consists only of whitespace")
20
- return re.sub(r'[\\/:?.,\'"<>| ]', "-", name)
21
-
22
-
23
- ########################################################################
24
- # Extract basename
25
- ########################################################################
26
-
27
-
28
- def extract_basename(fastq_path: str) -> str:
29
- name = Path(fastq_path).name
30
- name = re.sub(r"\..*$", "", name) # Remove file extension
31
- return _sanitize_name(name)
32
-
33
-
34
- ########################################################################
35
- # Convert allele file to dictionary type fasta format
36
- ########################################################################
37
-
38
-
39
- def dictionize_allele(path_fasta: str | Path) -> dict[str, str]:
40
- return {_sanitize_name(name): seq.upper() for name, seq, _ in mappy.fastx_read(str(path_fasta))}
41
-
42
-
43
- ########################################################################
44
- # Export fasta files as single-FASTA format
45
- ########################################################################
46
-
47
-
48
- def export_fasta_files(TEMPDIR: Path, FASTA_ALLELES: dict, NAME: str) -> None:
49
- """
50
- This function exports FASTA files in single-FASTA format.
51
-
52
- :param TEMPDIR: Temporary directory Path object where the output files will be saved.
53
- :param FASTA_ALLELES: Dictionary containing identifier and sequence pairs.
54
- :param NAME: Name to be included in the output path.
55
- """
56
- for identifier, sequence in FASTA_ALLELES.items():
57
- contents = "\n".join([">" + identifier, sequence]) + "\n"
58
- output_fasta = Path(TEMPDIR, NAME, "fasta", f"{identifier}.fasta")
59
- output_fasta.write_text(contents)
@@ -1,42 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import gzip
4
- from pathlib import Path
5
-
6
- #################################################
7
- # save_concatenated_fastx
8
- #################################################
9
-
10
-
11
- def extract_extention(file_path: Path) -> str:
12
- suffixes = file_path.suffixes
13
- return "".join(suffixes[-2:]) if len(suffixes) >= 2 else suffixes[0]
14
-
15
-
16
- def is_gzip_file(file_name: Path) -> bool:
17
- """Check if a file is a GZip compressed file."""
18
- try:
19
- with file_name.open("rb") as f:
20
- return f.read(2) == b"\x1f\x8b"
21
- except IOError:
22
- return False
23
-
24
-
25
- def save_fastq_as_gzip(TEMPDIR: Path, path_fastx: list[Path], barcode: str) -> None:
26
- """Merge gzip and non-gzip files into a single gzip file."""
27
- with gzip.open(Path(TEMPDIR, barcode, "fastq", f"{barcode}.fastq.gz"), "wb") as merged_file:
28
- for file_name in path_fastx:
29
- if is_gzip_file(file_name):
30
- with gzip.open(file_name, "rb") as f:
31
- merged_file.write(f.read())
32
- else:
33
- with open(file_name, "r") as f:
34
- merged_file.write(f.read().encode())
35
-
36
-
37
- def save_concatenated_fastx(TEMPDIR: Path, directory: str) -> None:
38
- fastx_suffix = {".fa", ".fq", ".fasta", ".fastq", ".fa.gz", ".fq.gz", ".fasta.gz", ".fastq.gz"}
39
- path_directory = Path(directory)
40
- barcode = path_directory.stem
41
- path_fastx = [path for path in path_directory.iterdir() if extract_extention(path) in fastx_suffix]
42
- save_fastq_as_gzip(TEMPDIR, path_fastx, barcode)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes