DAJIN2 0.4.2__zip → 0.4.3__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. {DAJIN2-0.4.2/src/DAJIN2.egg-info → DAJIN2-0.4.3}/PKG-INFO +29 -19
  2. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/README.md +19 -9
  3. DAJIN2-0.4.3/requirements.txt +20 -0
  4. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/setup.py +1 -1
  5. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/label_merger.py +20 -16
  6. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/core.py +8 -8
  7. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/genome_fetcher.py +11 -3
  8. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/midsv_caller.py +3 -4
  9. DAJIN2-0.4.3/src/DAJIN2/core/report/__init__.py +3 -0
  10. DAJIN2-0.4.2/src/DAJIN2/core/report/report_bam.py → DAJIN2-0.4.3/src/DAJIN2/core/report/bam_exporter.py +64 -50
  11. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/main.py +1 -1
  12. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/utils/io.py +6 -0
  13. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/utils/sam_handler.py +0 -13
  14. {DAJIN2-0.4.2 → DAJIN2-0.4.3/src/DAJIN2.egg-info}/PKG-INFO +29 -19
  15. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2.egg-info/SOURCES.txt +3 -3
  16. DAJIN2-0.4.3/src/DAJIN2.egg-info/requires.txt +16 -0
  17. DAJIN2-0.4.2/requirements.txt +0 -20
  18. DAJIN2-0.4.2/src/DAJIN2/core/report/__init__.py +0 -3
  19. DAJIN2-0.4.2/src/DAJIN2.egg-info/requires.txt +0 -16
  20. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/LICENSE +0 -0
  21. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/MANIFEST.in +0 -0
  22. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/setup.cfg +0 -0
  23. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/__init__.py +0 -0
  24. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/__init__.py +0 -0
  25. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/classification/__init__.py +0 -0
  26. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/classification/allele_merger.py +0 -0
  27. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/classification/classifier.py +0 -0
  28. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/__init__.py +0 -0
  29. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/appender.py +0 -0
  30. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/clustering.py +0 -0
  31. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/kmer_generator.py +0 -0
  32. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/label_extractor.py +0 -0
  33. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/label_updator.py +0 -0
  34. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/score_handler.py +0 -0
  35. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/clustering/strand_bias_handler.py +0 -0
  36. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/consensus/__init__.py +0 -0
  37. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/consensus/clust_formatter.py +0 -0
  38. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/consensus/consensus.py +0 -0
  39. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/consensus/mutation_extractor.py +0 -0
  40. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/consensus/name_handler.py +0 -0
  41. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/consensus/similarity_searcher.py +0 -0
  42. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/__init__.py +0 -0
  43. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/cache_checker.py +0 -0
  44. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/directory_manager.py +0 -0
  45. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/homopolymer_handler.py +0 -0
  46. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/input_formatter.py +0 -0
  47. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/insertions_to_fasta.py +0 -0
  48. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/knockin_handler.py +0 -0
  49. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/mapping.py +0 -0
  50. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/preprocess/mutation_extractor.py +0 -0
  51. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/core/report/insertion_reflector.py +0 -0
  52. /DAJIN2-0.4.2/src/DAJIN2/core/report/report_mutation.py → /DAJIN2-0.4.3/src/DAJIN2/core/report/mutation_exporter.py +0 -0
  53. /DAJIN2-0.4.2/src/DAJIN2/core/report/report_files.py → /DAJIN2-0.4.3/src/DAJIN2/core/report/sequence_exporter.py +0 -0
  54. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/gui.py +0 -0
  55. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/static/css/style.css +0 -0
  56. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/template_igvjs.html +0 -0
  57. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/templates/index.html +0 -0
  58. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/utils/config.py +0 -0
  59. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/utils/cssplits_handler.py +0 -0
  60. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/utils/dna_handler.py +0 -0
  61. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/utils/fastx_handler.py +0 -0
  62. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/utils/input_validator.py +0 -0
  63. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/utils/multiprocess.py +0 -0
  64. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/utils/report_generator.py +0 -0
  65. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2/view.py +0 -0
  66. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2.egg-info/dependency_links.txt +0 -0
  67. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2.egg-info/entry_points.txt +0 -0
  68. {DAJIN2-0.4.2 → DAJIN2-0.4.3}/src/DAJIN2.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: DAJIN2
3
- Version: 0.4.2
3
+ Version: 0.4.3
4
4
  Summary: One-step genotyping tools for targeted long-read sequencing
5
5
  Home-page: https://github.com/akikuno/DAJIN2
6
6
  Author: Akihiro Kuno
@@ -14,22 +14,22 @@ Classifier: Intended Audience :: Science/Research
14
14
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
15
15
  Description-Content-Type: text/markdown
16
16
  License-File: LICENSE
17
- Requires-Dist: numpy>=1.20.0
18
- Requires-Dist: scipy>=1.6.0
17
+ Requires-Dist: numpy>=1.24.0
18
+ Requires-Dist: scipy>=1.10.0
19
19
  Requires-Dist: pandas>=1.0.0
20
- Requires-Dist: openpyxl>=3.0.0
21
- Requires-Dist: rapidfuzz>=3.0.0
22
- Requires-Dist: scikit-learn>=1.0.0
20
+ Requires-Dist: openpyxl>=3.1.0
21
+ Requires-Dist: rapidfuzz>=3.6.0
22
+ Requires-Dist: scikit-learn>=1.3.0
23
23
  Requires-Dist: mappy>=2.24
24
- Requires-Dist: pysam>=0.19.0
24
+ Requires-Dist: pysam>=0.21.0
25
25
  Requires-Dist: Flask>=2.2.0
26
26
  Requires-Dist: waitress>=2.1.0
27
27
  Requires-Dist: Jinja2>=3.1.0
28
- Requires-Dist: plotly>=5.0.0
28
+ Requires-Dist: plotly>=5.19.0
29
29
  Requires-Dist: kaleido>=0.2.0
30
30
  Requires-Dist: cstag>=1.0.0
31
- Requires-Dist: midsv>=0.10.1
32
- Requires-Dist: wslPath>=0.3.0
31
+ Requires-Dist: midsv>=0.11.0
32
+ Requires-Dist: wslPath>=0.4.1
33
33
 
34
34
  [![License](https://img.shields.io/badge/License-MIT-9cf.svg)](https://choosealicense.com/licenses/mit/)
35
35
  [![Test](https://img.shields.io/github/actions/workflow/status/akikuno/dajin2/pytest.yml?branch=main&label=Test&color=brightgreen)](https://github.com/akikuno/dajin2/actions)
@@ -78,6 +78,7 @@ conda activate env-dajin2
78
78
  > CONDA_SUBDIR=osx-64 conda create -n env-dajin2 -c conda-forge -c bioconda python=3.10 DAJIN2 -y
79
79
  > conda activate env-dajin2
80
80
  > conda config --env --set subdir osx-64
81
+ > python -c "import platform; print(platform.machine())" # Confirm that the output is 'x86_64', not 'arm64'
81
82
  > ```
82
83
 
83
84
  ### From [PyPI](https://pypi.org/project/DAJIN2/)
@@ -164,12 +165,17 @@ Options:
164
165
  #### Example
165
166
 
166
167
  ```bash
168
+ # Download example dataset
169
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
170
+ tar -xf example_single.tar.gz
171
+
172
+ # Run DAJIN2
167
173
  DAJIN2 \
168
- --control example/barcode01 \
169
- --sample example/barcode02 \
170
- --allele example/design.fa \
171
- --name IL6-knockin \
172
- --genome hg38 \
174
+ --control example_single/control \
175
+ --sample example_single/sample \
176
+ --allele example_single/stx2_deletion.fa \
177
+ --name stx2_deletion \
178
+ --genome mm39 \
173
179
  --threads 4
174
180
  ```
175
181
 
@@ -206,7 +212,6 @@ DAJIN2 \
206
212
 
207
213
  By using the `batch` subcommand, you can process multiple FASTQ files simultaneously.
208
214
  For this purpose, a CSV or Excel file consolidating the sample information is required.
209
- <!-- For a specific example, please refer to [this link](https://github.com/akikuno/DAJIN2/blob/main/examples/example-batch/batch.csv). -->
210
215
 
211
216
  > [!NOTE]
212
217
  > For guidance on how to compile sample information, please refer to [this document](https://docs.google.com/presentation/d/e/2PACX-1vSMEmXJPG2TNjfT66XZJRzqJd82aAqO5gJrdEzyhn15YBBr_Li-j5puOgVChYf3jA/embed?start=false&loop=false&delayms=3000).
@@ -224,13 +229,18 @@ options:
224
229
  #### Example
225
230
 
226
231
  ```bash
227
- DAJIN2 --file batch.csv --threads 4
232
+ # Download the example dataset
233
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
234
+ tar -xf example_batch.tar.gz
235
+
236
+ # Run DAJIN2
237
+ DAJIN2 batch --file example_batch/batch.csv --threads 4
228
238
  ```
229
239
 
230
240
  <!-- ```bash
231
241
  # Download the example dataset
232
- wget https://github.com/akikuno/DAJIN2/raw/main/examples/example-batch.tar.gz
233
- tar -xf example-batch.tar.gz
242
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
243
+ tar -xf example_batch.tar.gz
234
244
 
235
245
  # Run DAJIN2
236
246
  DAJIN2 batch --file example-batch/batch.csv --threads 3
@@ -45,6 +45,7 @@ conda activate env-dajin2
45
45
  > CONDA_SUBDIR=osx-64 conda create -n env-dajin2 -c conda-forge -c bioconda python=3.10 DAJIN2 -y
46
46
  > conda activate env-dajin2
47
47
  > conda config --env --set subdir osx-64
48
+ > python -c "import platform; print(platform.machine())" # Confirm that the output is 'x86_64', not 'arm64'
48
49
  > ```
49
50
 
50
51
  ### From [PyPI](https://pypi.org/project/DAJIN2/)
@@ -131,12 +132,17 @@ Options:
131
132
  #### Example
132
133
 
133
134
  ```bash
135
+ # Download example dataset
136
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
137
+ tar -xf example_single.tar.gz
138
+
139
+ # Run DAJIN2
134
140
  DAJIN2 \
135
- --control example/barcode01 \
136
- --sample example/barcode02 \
137
- --allele example/design.fa \
138
- --name IL6-knockin \
139
- --genome hg38 \
141
+ --control example_single/control \
142
+ --sample example_single/sample \
143
+ --allele example_single/stx2_deletion.fa \
144
+ --name stx2_deletion \
145
+ --genome mm39 \
140
146
  --threads 4
141
147
  ```
142
148
 
@@ -173,7 +179,6 @@ DAJIN2 \
173
179
 
174
180
  By using the `batch` subcommand, you can process multiple FASTQ files simultaneously.
175
181
  For this purpose, a CSV or Excel file consolidating the sample information is required.
176
- <!-- For a specific example, please refer to [this link](https://github.com/akikuno/DAJIN2/blob/main/examples/example-batch/batch.csv). -->
177
182
 
178
183
  > [!NOTE]
179
184
  > For guidance on how to compile sample information, please refer to [this document](https://docs.google.com/presentation/d/e/2PACX-1vSMEmXJPG2TNjfT66XZJRzqJd82aAqO5gJrdEzyhn15YBBr_Li-j5puOgVChYf3jA/embed?start=false&loop=false&delayms=3000).
@@ -191,13 +196,18 @@ options:
191
196
  #### Example
192
197
 
193
198
  ```bash
194
- DAJIN2 --file batch.csv --threads 4
199
+ # Download the example dataset
200
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
201
+ tar -xf example_batch.tar.gz
202
+
203
+ # Run DAJIN2
204
+ DAJIN2 batch --file example_batch/batch.csv --threads 4
195
205
  ```
196
206
 
197
207
  <!-- ```bash
198
208
  # Download the example dataset
199
- wget https://github.com/akikuno/DAJIN2/raw/main/examples/example-batch.tar.gz
200
- tar -xf example-batch.tar.gz
209
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
210
+ tar -xf example_batch.tar.gz
201
211
 
202
212
  # Run DAJIN2
203
213
  DAJIN2 batch --file example-batch/batch.csv --threads 3
@@ -0,0 +1,20 @@
1
+ numpy >= 1.24.0
2
+ scipy >= 1.10.0
3
+ pandas >= 1.0.0
4
+ openpyxl >= 3.1.0
5
+ rapidfuzz >=3.6.0
6
+ scikit-learn >= 1.3.0
7
+
8
+ mappy >= 2.24
9
+ pysam >= 0.21.0
10
+
11
+ Flask >= 2.2.0
12
+ waitress >= 2.1.0
13
+ Jinja2 >= 3.1.0
14
+
15
+ plotly >= 5.19.0
16
+ kaleido >= 0.2.0
17
+
18
+ cstag >= 1.0.0
19
+ midsv >= 0.11.0
20
+ wslPath >=0.4.1
@@ -9,7 +9,7 @@ with open("requirements.txt") as requirements_file:
9
9
 
10
10
  setuptools.setup(
11
11
  name="DAJIN2",
12
- version="0.4.2",
12
+ version="0.4.3",
13
13
  author="Akihiro Kuno",
14
14
  author_email="akuno@md.tsukuba.ac.jp",
15
15
  description="One-step genotyping tools for targeted long-read sequencing",
@@ -11,20 +11,6 @@ def calculate_label_percentages(labels: list[int]) -> dict[int, float]:
11
11
  return {label: (count / total_labels * 100) for label, count in label_counts.items()}
12
12
 
13
13
 
14
- def merge_mixed_cluster(labels_control: list[int], labels_sample: list[int], threshold: float = 0.5) -> list[int]:
15
- """Merge labels in sample if they appear more than 'threshold' percentage in control."""
16
- labels_merged = labels_sample.copy()
17
- label_percentages_control = calculate_label_percentages(labels_control)
18
- mixed_labels = {label for label, percent in label_percentages_control.items() if percent > threshold}
19
-
20
- new_label = max(labels_merged) + 1
21
- for i, label in enumerate(labels_sample):
22
- if label in mixed_labels:
23
- labels_merged[i] = new_label
24
-
25
- return labels_merged
26
-
27
-
28
14
  def map_clusters_to_previous(labels_sample: list[int], labels_previous: list[int]) -> dict[int, int]:
29
15
  """
30
16
  Determine which cluster in labels_previous corresponds to each cluster in labels_sample.
@@ -63,6 +49,8 @@ def merge_minor_cluster(
63
49
  minor_labels_percentage = {label for label, percent in label_percentages.items() if percent < threshold_percentage}
64
50
  minor_labels_readnumber = {label for label, num in Counter(labels_sample).items() if num <= threshold_readnumber}
65
51
  minor_labels = minor_labels_percentage | minor_labels_readnumber
52
+ if minor_labels == set():
53
+ return labels_sample
66
54
 
67
55
  correspondence = map_clusters_to_previous(labels_sample, labels_previous)
68
56
  update_required_labels = get_update_required_labels(correspondence)
@@ -70,7 +58,23 @@ def merge_minor_cluster(
70
58
  labels_merged = labels_sample.copy()
71
59
  for m in minor_labels:
72
60
  new_label = max(labels_merged) + 1
73
- labels_merged = [new_label if label in update_required_labels[correspondence[m]] else label for label in labels_merged]
61
+ labels_merged = [
62
+ new_label if label in update_required_labels[correspondence[m]] else label for label in labels_merged
63
+ ]
64
+
65
+ return labels_merged
66
+
67
+
68
+ def merge_mixed_cluster(labels_control: list[int], labels_sample: list[int], threshold: float = 0.5) -> list[int]:
69
+ """Merge labels in sample if they appear more than 'threshold' percentage in control."""
70
+ labels_merged = labels_sample.copy()
71
+ label_percentages_control = calculate_label_percentages(labels_control)
72
+ mixed_labels = {label for label, percent in label_percentages_control.items() if percent > threshold}
73
+
74
+ new_label = max(labels_merged) + 1
75
+ for i, label in enumerate(labels_sample):
76
+ if label in mixed_labels:
77
+ labels_merged[i] = new_label
74
78
 
75
79
  return labels_merged
76
80
 
@@ -82,7 +86,7 @@ def merge_minor_cluster(
82
86
 
83
87
  def merge_labels(labels_control: list[int], labels_sample: list[int], labels_previous: list[int]) -> list[int]:
84
88
  labels_merged = merge_minor_cluster(
85
- labels_sample, labels_previous, threshold_percentage=0.5, threshold_readnumber=10
89
+ labels_sample, labels_previous, threshold_percentage=0.5, threshold_readnumber=5
86
90
  )
87
91
  labels_merged = merge_mixed_cluster(labels_control, labels_merged)
88
92
  return labels_merged
@@ -70,8 +70,8 @@ def execute_control(arguments: dict):
70
70
  # Output BAM files
71
71
  ###########################################################
72
72
  logger.info(f"Output BAM files of {arguments['control']}...")
73
- report.report_bam.export_to_bam(
74
- ARGS.tempdir, ARGS.control_name, ARGS.genome_coordinates, ARGS.threads, is_control=True
73
+ report.bam_exporter.export_to_bam(
74
+ ARGS.tempdir, ARGS.control_name, ARGS.genome_coordinates, ARGS.threads, ARGS.uuid, is_control=True
75
75
  )
76
76
  ###########################################################
77
77
  # Finish call
@@ -204,15 +204,15 @@ def execute_sample(arguments: dict):
204
204
  # RESULT
205
205
  io.write_jsonl(RESULT_SAMPLE, Path(ARGS.tempdir, "result", f"{ARGS.sample_name}.jsonl"))
206
206
  # FASTA
207
- report.report_files.export_to_fasta(ARGS.tempdir, ARGS.sample_name, cons_sequence)
208
- report.report_files.export_reference_to_fasta(ARGS.tempdir, ARGS.sample_name)
207
+ report.sequence_exporter.export_to_fasta(ARGS.tempdir, ARGS.sample_name, cons_sequence)
208
+ report.sequence_exporter.export_reference_to_fasta(ARGS.tempdir, ARGS.sample_name)
209
209
  # HTML
210
- report.report_files.export_to_html(ARGS.tempdir, ARGS.sample_name, cons_percentage)
210
+ report.sequence_exporter.export_to_html(ARGS.tempdir, ARGS.sample_name, cons_percentage)
211
211
  # CSV (Allele Info)
212
- report.report_mutation.export_to_csv(ARGS.tempdir, ARGS.sample_name, ARGS.genome_coordinates, cons_percentage)
212
+ report.mutation_exporter.export_to_csv(ARGS.tempdir, ARGS.sample_name, ARGS.genome_coordinates, cons_percentage)
213
213
  # BAM
214
- report.report_bam.export_to_bam(
215
- ARGS.tempdir, ARGS.sample_name, ARGS.genome_coordinates, ARGS.threads, RESULT_SAMPLE
214
+ report.bam_exporter.export_to_bam(
215
+ ARGS.tempdir, ARGS.sample_name, ARGS.genome_coordinates, ARGS.threads, ARGS.uuid, RESULT_SAMPLE
216
216
  )
217
217
  for path_bam_igvjs in Path(ARGS.tempdir, "cache", ".igvjs").glob(f"{ARGS.control_name}_control.bam*"):
218
218
  shutil.copy(path_bam_igvjs, Path(ARGS.tempdir, "report", ".igvjs", ARGS.sample_name))
@@ -5,11 +5,19 @@ from urllib.request import urlopen
5
5
 
6
6
  def fetch_seq_coordinates(genome: str, blat_url: str, seq: str) -> dict:
7
7
  url = f"{blat_url}?db={genome}&type=BLAT&userSeq={seq}"
8
- response = urlopen(url).read().decode("utf8").split("\n")
9
- matches = [x for x in response if "100.0%" in x]
8
+ records = urlopen(url).read().decode("utf8").split("\n")
9
+ matches = []
10
+ for record in records:
11
+ if "100.0%" not in record:
12
+ continue
13
+ record_trim = [r for r in record.split(" ") if r]
14
+ if record_trim[-1] == str(len(seq)):
15
+ matches = record_trim
16
+
10
17
  if not matches:
11
18
  raise ValueError(f"{seq[:60]}... is not found in {genome}")
12
- chrom, strand, start, end, _ = matches[0].split()[-5:]
19
+
20
+ chrom, strand, start, end, _ = matches[-5:]
13
21
  return {"chrom": chrom, "strand": strand, "start": int(start), "end": int(end)}
14
22
 
15
23
 
@@ -8,8 +8,7 @@ from itertools import chain, groupby
8
8
 
9
9
  from collections import Counter
10
10
 
11
- from DAJIN2.utils import sam_handler
12
- from DAJIN2.utils import cssplits_handler
11
+ from DAJIN2.utils import io, sam_handler, cssplits_handler
13
12
 
14
13
 
15
14
  def has_inversion_in_splice(CIGAR: str) -> bool:
@@ -215,8 +214,8 @@ def generate_midsv(ARGS, is_control: bool = False, is_insertion: bool = False) -
215
214
  path_splice = Path(ARGS.tempdir, name, "sam", f"splice_{allele}.sam")
216
215
  path_output_midsv = Path(ARGS.tempdir, name, "midsv", f"{allele}.json")
217
216
 
218
- sam_ont = sam_handler.remove_overlapped_reads(list(sam_handler.read_sam(path_ont)))
219
- sam_splice = sam_handler.remove_overlapped_reads(list(sam_handler.read_sam(path_splice)))
217
+ sam_ont = sam_handler.remove_overlapped_reads(list(io.read_sam(path_ont)))
218
+ sam_splice = sam_handler.remove_overlapped_reads(list(io.read_sam(path_splice)))
220
219
  qname_of_map_ont = extract_qname_of_map_ont(sam_ont, sam_splice)
221
220
  sam_of_map_ont = filter_sam_by_preset(sam_ont, qname_of_map_ont, preset="map-ont")
222
221
  sam_of_splice = filter_sam_by_preset(sam_splice, qname_of_map_ont, preset="splice")
@@ -0,0 +1,3 @@
1
+ from DAJIN2.core.report import bam_exporter
2
+ from DAJIN2.core.report import sequence_exporter
3
+ from DAJIN2.core.report import mutation_exporter
@@ -1,17 +1,16 @@
1
1
  from __future__ import annotations
2
2
 
3
- import random
4
3
  from collections import defaultdict
5
4
  from itertools import groupby
6
5
  from pathlib import Path
7
6
 
8
- import midsv
9
7
  import pysam
10
8
 
11
- from DAJIN2.utils import sam_handler
9
+ from DAJIN2.utils import io, sam_handler
12
10
 
13
11
 
14
- def realign(sam: list[list[str]], GENOME_COODINATES: dict) -> list[str]:
12
+ def recalculate_sam_coodinates_to_reference(sam: list[list[str]], GENOME_COODINATES: dict) -> list[str]:
13
+ """Recalculate SAM genomic coordinates with the reference genome, not with the FASTA_ALLELE"""
15
14
  sam_headers = [s for s in sam if s[0].startswith("@")]
16
15
  sam_contents = [s for s in sam if not s[0].startswith("@")]
17
16
  for s in sam_headers:
@@ -29,31 +28,44 @@ def realign(sam: list[list[str]], GENOME_COODINATES: dict) -> list[str]:
29
28
  return sam_headers + sam_contents
30
29
 
31
30
 
31
+ def convert_pos_to_one_indexed(sam_lines: list[list[str]]) -> list[list[str]]:
32
+ """Convert SAM POS from 0-indexed to 1-indexed"""
33
+
34
+ def convert_line(line: list[str]) -> list[str]:
35
+ if not line[0].startswith("@") and line[3] == "0":
36
+ line[3] = "1"
37
+ return line
38
+
39
+ return [convert_line(line) for line in sam_lines]
40
+
41
+
32
42
  def group_by_name(sam_contents: list[str], clust_sample: list[dict]) -> dict[list]:
43
+ """Group alignments in map-ont.sam by allele name (NAME)"""
33
44
  sam_contents.sort()
34
- clust_sample_qname = sorted(clust_sample, key=lambda x: x["QNAME"])
35
- clust_sample_qname_set = set()
36
- for qnames in clust_sample_qname:
37
- qname = qnames["QNAME"]
38
- clust_sample_qname_set.add(qname)
45
+ clust_sample_sorted = sorted(clust_sample, key=lambda x: x["QNAME"])
46
+
47
+ qnames: set[str] = {c["QNAME"] for c in clust_sample_sorted}
48
+
39
49
  sam_groups = defaultdict(list)
40
- idx_left = 0
41
- idx_right = 0
42
- while idx_left < len(sam_contents) and idx_right < len(clust_sample_qname):
43
- read_left = sam_contents[idx_left][:-1]
44
- read_right = clust_sample_qname[idx_right]
45
- qname_left = read_left[0]
46
- qname_right = read_right["QNAME"]
47
- if qname_left not in clust_sample_qname_set:
48
- idx_left += 1
50
+ idx_sam_contents = 0
51
+ idx_clust_sample = 0
52
+ while idx_sam_contents < len(sam_contents) and idx_clust_sample < len(clust_sample_sorted):
53
+ alignments_sam = sam_contents[idx_sam_contents][:-1] # Discard CS tags to reduce file size
54
+ alignments_clsut_sample = clust_sample_sorted[idx_clust_sample]
55
+ qname_sam = alignments_sam[0]
56
+
57
+ if qname_sam not in qnames:
58
+ idx_sam_contents += 1
49
59
  continue
50
- if qname_left == qname_right:
51
- key = read_right["NAME"]
52
- sam_groups[key].append(read_left)
53
- idx_left += 1
60
+
61
+ if qname_sam == alignments_clsut_sample["QNAME"]:
62
+ key = alignments_clsut_sample["NAME"]
63
+ sam_groups[key].append(alignments_sam)
64
+ idx_sam_contents += 1
54
65
  else:
55
- idx_right += 1
56
- return sam_groups
66
+ idx_clust_sample += 1
67
+
68
+ return dict(sam_groups)
57
69
 
58
70
 
59
71
  ###############################################################################
@@ -67,13 +79,11 @@ def subset_qnames(RESULT_SAMPLE, readnum: int = 100) -> dict[set[str]]:
67
79
  group = list(group)
68
80
  qnames = [res["QNAME"] for res in group[:readnum]]
69
81
  qnames_by_name[name] = set(qnames)
70
- return qnames_by_name
82
+ return dict(qnames_by_name)
71
83
 
72
84
 
73
- def subset_reads(name, sam_content, qnames_by_name):
74
- qnames = qnames_by_name[name]
75
- sam_subset = [sam for sam in sam_content if sam[0] in qnames]
76
- return sam_subset
85
+ def subset_reads(sam_content: list[str], qnames: set[str]) -> list[str]:
86
+ return [sam for sam in sam_content if sam[0] in qnames]
77
87
 
78
88
 
79
89
  ###############################################################################
@@ -89,31 +99,34 @@ def write_sam_to_bam(sam: list[list[str]], path_sam: str | Path, path_bam: str |
89
99
 
90
100
 
91
101
  def update_sam(sam: list, GENOME_COODINATES: dict = {}) -> list:
92
- sam_update = sam.copy()
93
- sam_update = sam_handler.remove_overlapped_reads(sam_update)
94
- sam_update = sam_handler.remove_microhomology(sam_update)
95
- if "genome" in GENOME_COODINATES:
96
- sam_update = realign(sam_update, GENOME_COODINATES)
97
- return sam_update
102
+ sam_records = sam.copy()
103
+ sam_records = sam_handler.remove_overlapped_reads(sam_records)
104
+ sam_records = sam_handler.remove_microhomology(sam_records)
105
+ if GENOME_COODINATES["genome"]:
106
+ return recalculate_sam_coodinates_to_reference(sam_records, GENOME_COODINATES)
107
+ else:
108
+ return convert_pos_to_one_indexed(sam_records)
98
109
 
99
110
 
100
- def export_to_bam(TEMPDIR, NAME, GENOME_COODINATES, THREADS, RESULT_SAMPLE=None, is_control=False) -> None:
101
- randomnum = random.randint(100_000, 999_999)
111
+ def export_to_bam(TEMPDIR, NAME, GENOME_COODINATES, THREADS, UUID, RESULT_SAMPLE=None, is_control=False) -> None:
102
112
  path_sam_input = Path(TEMPDIR, NAME, "sam", "map-ont_control.sam")
103
- sam = list(midsv.read_sam(path_sam_input))
113
+ sam_records = list(io.read_sam(path_sam_input))
114
+
104
115
  # Update sam
105
- sam_update = update_sam(sam, GENOME_COODINATES)
116
+ sam_updated = update_sam(sam_records, GENOME_COODINATES)
117
+
106
118
  # Output SAM and BAM
107
- path_sam_output = Path(TEMPDIR, "report", "BAM", f"tmp{randomnum}_{NAME}_control.sam")
119
+ path_sam_output = Path(TEMPDIR, "report", "BAM", f"temp_{UUID}_{NAME}_control.sam")
108
120
  path_bam_output = Path(TEMPDIR, "report", "BAM", NAME, f"{NAME}.bam")
109
- write_sam_to_bam(sam_update, path_sam_output, path_bam_output, THREADS)
121
+ write_sam_to_bam(sam_updated, path_sam_output, path_bam_output, THREADS)
122
+
110
123
  # Prepare SAM headers and contents
111
- sam_headers = [s for s in sam_update if s[0].startswith("@")]
112
- sam_contents = [s for s in sam_update if not s[0].startswith("@")]
124
+ sam_headers = [s for s in sam_updated if s[0].startswith("@")]
125
+ sam_contents = [s for s in sam_updated if not s[0].startswith("@")]
113
126
  if is_control:
114
- qnames = set(list(set(s[0] for s in sam_contents[:10000]))[:100])
115
- sam_subset = [s for s in sam_update if s[0] in qnames]
116
- path_sam_output = Path(TEMPDIR, "report", "BAM", f"tmp{randomnum}_{NAME}_control_cache.sam")
127
+ qnames: set[str] = set(list(set(s[0] for s in sam_contents[:10000]))[:100])
128
+ sam_subset = [s for s in sam_updated if s[0] in qnames]
129
+ path_sam_output = Path(TEMPDIR, "report", "BAM", f"temp_{UUID}_{NAME}_control_cache.sam")
117
130
  path_bam_output = Path(TEMPDIR, "cache", ".igvjs", NAME, "control.bam")
118
131
  write_sam_to_bam(sam_headers + sam_subset, path_sam_output, path_bam_output, THREADS)
119
132
  else:
@@ -122,14 +135,15 @@ def export_to_bam(TEMPDIR, NAME, GENOME_COODINATES, THREADS, RESULT_SAMPLE=None,
122
135
  # Output SAM and BAM
123
136
  for name, sam_content in sam_groups.items():
124
137
  # BAM
125
- path_sam_output = Path(TEMPDIR, "report", "bam", f"tmp{randomnum}_{name}.sam")
138
+ path_sam_output = Path(TEMPDIR, "report", "BAM", f"temp_{UUID}_{name}.sam")
126
139
  path_bam_output = Path(TEMPDIR, "report", "BAM", NAME, f"{NAME}_{name}.bam")
127
140
  write_sam_to_bam(sam_headers + sam_content, path_sam_output, path_bam_output, THREADS)
128
141
  # igvjs
129
- sam_subset = subset_reads(name, sam_content, qnames_by_name)
130
- path_sam_output = Path(TEMPDIR, "report", "bam", f"tmp{randomnum}_{name}_subset.sam")
142
+ sam_subset = subset_reads(sam_content, qnames_by_name[name])
143
+ path_sam_output = Path(TEMPDIR, "report", "BAM", f"temp_{UUID}_{name}_subset.sam")
131
144
  path_bam_output = Path(TEMPDIR, "report", ".igvjs", NAME, f"{name}.bam")
132
145
  write_sam_to_bam(sam_headers + sam_subset, path_sam_output, path_bam_output, THREADS)
146
+
133
147
  # Remove temporary files
134
- sam_temp = Path(TEMPDIR, "report", "BAM").glob(f"tmp{randomnum}*.sam")
148
+ sam_temp = Path(TEMPDIR, "report", "BAM").glob(f"temp_{UUID}*.sam")
135
149
  [s.unlink() for s in sam_temp]
@@ -20,7 +20,7 @@ from DAJIN2.core import core
20
20
  from DAJIN2.utils import io, config, report_generator, input_validator, multiprocess
21
21
 
22
22
 
23
- DAJIN_VERSION = "0.4.2"
23
+ DAJIN_VERSION = "0.4.3"
24
24
 
25
25
 
26
26
  def generate_report(name: str) -> None:
@@ -19,6 +19,12 @@ from openpyxl import load_workbook, Workbook
19
19
  ###########################################################
20
20
 
21
21
 
22
+ def read_sam(path_of_sam: str | Path) -> Generator[list]:
23
+ with open(path_of_sam) as f:
24
+ for line in f:
25
+ yield line.strip().split("\t")
26
+
27
+
22
28
  def load_pickle(file_path: Path):
23
29
  with open(file_path, "rb") as f:
24
30
  return pickle.load(f)
@@ -2,8 +2,6 @@ from __future__ import annotations
2
2
 
3
3
  import re
4
4
 
5
- from pathlib import Path
6
- from typing import Generator
7
5
  from itertools import groupby
8
6
  from DAJIN2.utils.dna_handler import revcomp
9
7
 
@@ -25,17 +23,6 @@ def is_mapped(s: list[str]) -> bool:
25
23
  return not s[0].startswith("@") and s[9] != "*"
26
24
 
27
25
 
28
- ###########################################################
29
- # Read sam
30
- ###########################################################
31
-
32
-
33
- def read_sam(path_of_sam: str | Path) -> Generator[list]:
34
- with open(path_of_sam) as f:
35
- for line in f:
36
- yield line.strip().split("\t")
37
-
38
-
39
26
  ###########################################################
40
27
  # remove_overlapped_reads
41
28
  ###########################################################
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: DAJIN2
3
- Version: 0.4.2
3
+ Version: 0.4.3
4
4
  Summary: One-step genotyping tools for targeted long-read sequencing
5
5
  Home-page: https://github.com/akikuno/DAJIN2
6
6
  Author: Akihiro Kuno
@@ -14,22 +14,22 @@ Classifier: Intended Audience :: Science/Research
14
14
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
15
15
  Description-Content-Type: text/markdown
16
16
  License-File: LICENSE
17
- Requires-Dist: numpy>=1.20.0
18
- Requires-Dist: scipy>=1.6.0
17
+ Requires-Dist: numpy>=1.24.0
18
+ Requires-Dist: scipy>=1.10.0
19
19
  Requires-Dist: pandas>=1.0.0
20
- Requires-Dist: openpyxl>=3.0.0
21
- Requires-Dist: rapidfuzz>=3.0.0
22
- Requires-Dist: scikit-learn>=1.0.0
20
+ Requires-Dist: openpyxl>=3.1.0
21
+ Requires-Dist: rapidfuzz>=3.6.0
22
+ Requires-Dist: scikit-learn>=1.3.0
23
23
  Requires-Dist: mappy>=2.24
24
- Requires-Dist: pysam>=0.19.0
24
+ Requires-Dist: pysam>=0.21.0
25
25
  Requires-Dist: Flask>=2.2.0
26
26
  Requires-Dist: waitress>=2.1.0
27
27
  Requires-Dist: Jinja2>=3.1.0
28
- Requires-Dist: plotly>=5.0.0
28
+ Requires-Dist: plotly>=5.19.0
29
29
  Requires-Dist: kaleido>=0.2.0
30
30
  Requires-Dist: cstag>=1.0.0
31
- Requires-Dist: midsv>=0.10.1
32
- Requires-Dist: wslPath>=0.3.0
31
+ Requires-Dist: midsv>=0.11.0
32
+ Requires-Dist: wslPath>=0.4.1
33
33
 
34
34
  [![License](https://img.shields.io/badge/License-MIT-9cf.svg)](https://choosealicense.com/licenses/mit/)
35
35
  [![Test](https://img.shields.io/github/actions/workflow/status/akikuno/dajin2/pytest.yml?branch=main&label=Test&color=brightgreen)](https://github.com/akikuno/dajin2/actions)
@@ -78,6 +78,7 @@ conda activate env-dajin2
78
78
  > CONDA_SUBDIR=osx-64 conda create -n env-dajin2 -c conda-forge -c bioconda python=3.10 DAJIN2 -y
79
79
  > conda activate env-dajin2
80
80
  > conda config --env --set subdir osx-64
81
+ > python -c "import platform; print(platform.machine())" # Confirm that the output is 'x86_64', not 'arm64'
81
82
  > ```
82
83
 
83
84
  ### From [PyPI](https://pypi.org/project/DAJIN2/)
@@ -164,12 +165,17 @@ Options:
164
165
  #### Example
165
166
 
166
167
  ```bash
168
+ # Download example dataset
169
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
170
+ tar -xf example_single.tar.gz
171
+
172
+ # Run DAJIN2
167
173
  DAJIN2 \
168
- --control example/barcode01 \
169
- --sample example/barcode02 \
170
- --allele example/design.fa \
171
- --name IL6-knockin \
172
- --genome hg38 \
174
+ --control example_single/control \
175
+ --sample example_single/sample \
176
+ --allele example_single/stx2_deletion.fa \
177
+ --name stx2_deletion \
178
+ --genome mm39 \
173
179
  --threads 4
174
180
  ```
175
181
 
@@ -206,7 +212,6 @@ DAJIN2 \
206
212
 
207
213
  By using the `batch` subcommand, you can process multiple FASTQ files simultaneously.
208
214
  For this purpose, a CSV or Excel file consolidating the sample information is required.
209
- <!-- For a specific example, please refer to [this link](https://github.com/akikuno/DAJIN2/blob/main/examples/example-batch/batch.csv). -->
210
215
 
211
216
  > [!NOTE]
212
217
  > For guidance on how to compile sample information, please refer to [this document](https://docs.google.com/presentation/d/e/2PACX-1vSMEmXJPG2TNjfT66XZJRzqJd82aAqO5gJrdEzyhn15YBBr_Li-j5puOgVChYf3jA/embed?start=false&loop=false&delayms=3000).
@@ -224,13 +229,18 @@ options:
224
229
  #### Example
225
230
 
226
231
  ```bash
227
- DAJIN2 --file batch.csv --threads 4
232
+ # Download the example dataset
233
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
234
+ tar -xf example_batch.tar.gz
235
+
236
+ # Run DAJIN2
237
+ DAJIN2 batch --file example_batch/batch.csv --threads 4
228
238
  ```
229
239
 
230
240
  <!-- ```bash
231
241
# Download the example dataset
232
- wget https://github.com/akikuno/DAJIN2/raw/main/examples/example-batch.tar.gz
233
- tar -xf example-batch.tar.gz
242
+ wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
243
+ tar -xf example_batch.tar.gz
234
244
 
235
245
  # Run DAJIN2
236
246
  DAJIN2 batch --file example-batch/batch.csv --threads 3
@@ -46,10 +46,10 @@ src/DAJIN2/core/preprocess/mapping.py
46
46
  src/DAJIN2/core/preprocess/midsv_caller.py
47
47
  src/DAJIN2/core/preprocess/mutation_extractor.py
48
48
  src/DAJIN2/core/report/__init__.py
49
+ src/DAJIN2/core/report/bam_exporter.py
49
50
  src/DAJIN2/core/report/insertion_reflector.py
50
- src/DAJIN2/core/report/report_bam.py
51
- src/DAJIN2/core/report/report_files.py
52
- src/DAJIN2/core/report/report_mutation.py
51
+ src/DAJIN2/core/report/mutation_exporter.py
52
+ src/DAJIN2/core/report/sequence_exporter.py
53
53
  src/DAJIN2/static/css/style.css
54
54
  src/DAJIN2/templates/index.html
55
55
  src/DAJIN2/utils/config.py
@@ -0,0 +1,16 @@
1
+ numpy>=1.24.0
2
+ scipy>=1.10.0
3
+ pandas>=1.0.0
4
+ openpyxl>=3.1.0
5
+ rapidfuzz>=3.6.0
6
+ scikit-learn>=1.3.0
7
+ mappy>=2.24
8
+ pysam>=0.21.0
9
+ Flask>=2.2.0
10
+ waitress>=2.1.0
11
+ Jinja2>=3.1.0
12
+ plotly>=5.19.0
13
+ kaleido>=0.2.0
14
+ cstag>=1.0.0
15
+ midsv>=0.11.0
16
+ wslPath>=0.4.1
@@ -1,20 +0,0 @@
1
- numpy >= 1.20.0
2
- scipy >= 1.6.0
3
- pandas >= 1.0.0
4
- openpyxl >= 3.0.0
5
- rapidfuzz >=3.0.0
6
- scikit-learn >= 1.0.0
7
-
8
- mappy >= 2.24
9
- pysam >= 0.19.0
10
-
11
- Flask >= 2.2.0
12
- waitress >= 2.1.0
13
- Jinja2 >= 3.1.0
14
-
15
- plotly >= 5.0.0
16
- kaleido >= 0.2.0
17
-
18
- cstag >= 1.0.0
19
- midsv >= 0.10.1
20
- wslPath >=0.3.0
@@ -1,3 +0,0 @@
1
- from DAJIN2.core.report import report_bam
2
- from DAJIN2.core.report import report_files
3
- from DAJIN2.core.report import report_mutation
@@ -1,16 +0,0 @@
1
- numpy>=1.20.0
2
- scipy>=1.6.0
3
- pandas>=1.0.0
4
- openpyxl>=3.0.0
5
- rapidfuzz>=3.0.0
6
- scikit-learn>=1.0.0
7
- mappy>=2.24
8
- pysam>=0.19.0
9
- Flask>=2.2.0
10
- waitress>=2.1.0
11
- Jinja2>=3.1.0
12
- plotly>=5.0.0
13
- kaleido>=0.2.0
14
- cstag>=1.0.0
15
- midsv>=0.10.1
16
- wslPath>=0.3.0
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes