DAJIN2 0.4.5__zip → 0.4.6__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dajin2-0.4.5/src/DAJIN2.egg-info → dajin2-0.4.6}/PKG-INFO +7 -2
- {dajin2-0.4.5 → dajin2-0.4.6}/README.md +5 -1
- {dajin2-0.4.5 → dajin2-0.4.6}/requirements.txt +2 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/setup.py +1 -1
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/strand_bias_handler.py +19 -8
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/consensus/mutation_extractor.py +3 -1
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/consensus/similarity_searcher.py +14 -4
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/core.py +4 -1
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/genome_fetcher.py +11 -4
- dajin2-0.4.6/src/DAJIN2/core/preprocess/homopolymer_handler.py +70 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/insertions_to_fasta.py +136 -78
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/knockin_handler.py +11 -11
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/midsv_caller.py +3 -3
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/mutation_extractor.py +33 -18
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/main.py +39 -39
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/config.py +2 -2
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/cssplits_handler.py +71 -45
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/input_validator.py +22 -15
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/io.py +19 -10
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/sam_handler.py +1 -1
- {dajin2-0.4.5 → dajin2-0.4.6/src/DAJIN2.egg-info}/PKG-INFO +7 -2
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2.egg-info/requires.txt +1 -0
- dajin2-0.4.5/src/DAJIN2/core/preprocess/homopolymer_handler.py +0 -51
- {dajin2-0.4.5 → dajin2-0.4.6}/LICENSE +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/MANIFEST.in +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/setup.cfg +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/__init__.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/__init__.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/classification/__init__.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/classification/allele_merger.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/classification/classifier.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/__init__.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/appender.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/clustering.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/kmer_generator.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/label_extractor.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/label_merger.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/label_updator.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/score_handler.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/consensus/__init__.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/consensus/clust_formatter.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/consensus/consensus.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/consensus/name_handler.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/__init__.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/cache_checker.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/directory_manager.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/input_formatter.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/mapping.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/report/__init__.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/report/bam_exporter.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/report/insertion_reflector.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/report/mutation_exporter.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/report/sequence_exporter.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/gui.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/static/css/style.css +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/template_igvjs.html +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/templates/index.html +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/dna_handler.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/fastx_handler.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/multiprocess.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/report_generator.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/view.py +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2.egg-info/SOURCES.txt +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2.egg-info/dependency_links.txt +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2.egg-info/entry_points.txt +0 -0
- {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: DAJIN2
|
|
3
|
-
Version: 0.4.5
|
|
3
|
+
Version: 0.4.6
|
|
4
4
|
Summary: One-step genotyping tools for targeted long-read sequencing
|
|
5
5
|
Home-page: https://github.com/akikuno/DAJIN2
|
|
6
6
|
Author: Akihiro Kuno
|
|
@@ -14,6 +14,7 @@ Classifier: Intended Audience :: Science/Research
|
|
|
14
14
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
15
15
|
Description-Content-Type: text/markdown
|
|
16
16
|
License-File: LICENSE
|
|
17
|
+
Requires-Dist: certifi
|
|
17
18
|
Requires-Dist: numpy>=1.24.0
|
|
18
19
|
Requires-Dist: scipy>=1.10.0
|
|
19
20
|
Requires-Dist: pandas>=1.0.0
|
|
@@ -61,7 +62,7 @@ The name DAJIN is derived from the phrase 一網**打尽** (Ichimou **DAJIN** or
|
|
|
61
62
|
|
|
62
63
|
### Prerequisites
|
|
63
64
|
|
|
64
|
-
- Python 3.8
|
|
65
|
+
- Python 3.8 to 3.10
|
|
65
66
|
- Unix-like environment (Linux, macOS, WSL2, etc.)
|
|
66
67
|
|
|
67
68
|
### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
|
|
@@ -71,6 +72,10 @@ conda create -n env-dajin2 -c conda-forge -c bioconda python=3.10 DAJIN2 -y
|
|
|
71
72
|
conda activate env-dajin2
|
|
72
73
|
```
|
|
73
74
|
|
|
75
|
+
> [!IMPORTANT]
|
|
76
|
+
> DAJIN2 supports Python versions 3.8 to 3.10, but not Python 3.11 yet due to a [Bioconda issue](https://github.com/bioconda/bioconda-recipes/issues/37805).
|
|
77
|
+
|
|
78
|
+
|
|
74
79
|
> [!NOTE]
|
|
75
80
|
> To Apple Silicon (ARM64) users:
|
|
76
81
|
> [Since the Bioconda channel does not yet support Apple Silicon](https://github.com/bioconda/bioconda-recipes/issues/37068#issuecomment-1257790919), please use the following command to install `DAJIN2` through Rosetta.
|
|
@@ -28,7 +28,7 @@ The name DAJIN is derived from the phrase 一網**打尽** (Ichimou **DAJIN** or
|
|
|
28
28
|
|
|
29
29
|
### Prerequisites
|
|
30
30
|
|
|
31
|
-
- Python 3.8
|
|
31
|
+
- Python 3.8 to 3.10
|
|
32
32
|
- Unix-like environment (Linux, macOS, WSL2, etc.)
|
|
33
33
|
|
|
34
34
|
### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
|
|
@@ -38,6 +38,10 @@ conda create -n env-dajin2 -c conda-forge -c bioconda python=3.10 DAJIN2 -y
|
|
|
38
38
|
conda activate env-dajin2
|
|
39
39
|
```
|
|
40
40
|
|
|
41
|
+
> [!IMPORTANT]
|
|
42
|
+
> DAJIN2 supports Python versions 3.8 to 3.10, but not Python 3.11 yet due to a [Bioconda issue](https://github.com/bioconda/bioconda-recipes/issues/37805).
|
|
43
|
+
|
|
44
|
+
|
|
41
45
|
> [!NOTE]
|
|
42
46
|
> To Apple Silicon (ARM64) users:
|
|
43
47
|
> [Since the Bioconda channel does not yet support Apple Silicon](https://github.com/bioconda/bioconda-recipes/issues/37068#issuecomment-1257790919), please use the following command to install `DAJIN2` through Rosetta.
|
|
@@ -9,7 +9,7 @@ with open("requirements.txt") as requirements_file:
|
|
|
9
9
|
|
|
10
10
|
setuptools.setup(
|
|
11
11
|
name="DAJIN2",
|
|
12
|
-
version="0.4.
|
|
12
|
+
version="0.4.6",
|
|
13
13
|
author="Akihiro Kuno",
|
|
14
14
|
author_email="akuno@md.tsukuba.ac.jp",
|
|
15
15
|
description="One-step genotyping tools for targeted long-read sequencing",
|
|
@@ -9,6 +9,7 @@ Re-allocates reads belonging to clusters with strand bias to clusters without st
|
|
|
9
9
|
"""
|
|
10
10
|
|
|
11
11
|
from pathlib import Path
|
|
12
|
+
from typing import Generator
|
|
12
13
|
from collections import defaultdict
|
|
13
14
|
from sklearn.tree import DecisionTreeClassifier
|
|
14
15
|
|
|
@@ -40,7 +41,7 @@ def is_strand_bias(path_control: Path) -> bool:
|
|
|
40
41
|
###############################################################################
|
|
41
42
|
|
|
42
43
|
|
|
43
|
-
def count_strand(labels: list[int], samples:
|
|
44
|
+
def count_strand(labels: list[int], samples: Generator[dict[str, str]]) -> tuple[dict[str, int], dict[str, int]]:
|
|
44
45
|
"""Count the occurrences of each strand type by label."""
|
|
45
46
|
positive_strand_counts_by_labels = defaultdict(int)
|
|
46
47
|
total_counts_by_labels = defaultdict(int)
|
|
@@ -54,18 +55,25 @@ def count_strand(labels: list[int], samples: list[dict[str, str]]) -> tuple[dict
|
|
|
54
55
|
|
|
55
56
|
|
|
56
57
|
def determine_strand_biases(
    positive_strand_counts_by_labels: dict[str, int], total_counts_by_labels: dict[str, int]
) -> dict[int, bool]:
    """Flag every label whose positive-strand ratio lies outside the accepted range.

    For each label in `total_counts_by_labels` the ratio
    (positive-strand count / total count) is computed; a label that is absent
    from `positive_strand_counts_by_labels` counts as zero positive-strand reads.
    The label is marked True (biased) unless
    STRAND_BIAS_LOWER_LIMIT < ratio < STRAND_BIAS_UPPER_LIMIT.
    """

    def _is_biased(label, total) -> bool:
        ratio = positive_strand_counts_by_labels.get(label, 0) / total
        return not (STRAND_BIAS_LOWER_LIMIT < ratio < STRAND_BIAS_UPPER_LIMIT)

    return {label: _is_biased(label, total) for label, total in total_counts_by_labels.items()}
|
|
67
68
|
|
|
68
69
|
|
|
70
|
+
def annotate_strand_bias_by_labels(path_sample: Path, labels: list[int]) -> dict[int, bool]:
    """Report, for each label, whether its reads show strand bias.

    The sample records are read from `path_sample` with `io.read_jsonl`,
    strands are tallied per label by `count_strand`, and the tallies are
    converted into per-label flags by `determine_strand_biases`.

    Returns:
        A mapping from label to True when that label is strand-biased.
        (The previous `-> bool` annotation did not match the returned mapping.)
    """
    samples = io.read_jsonl(path_sample)
    positive_strand_counts_by_labels, total_counts_by_labels = count_strand(labels, samples)
    return determine_strand_biases(positive_strand_counts_by_labels, total_counts_by_labels)
|
|
75
|
+
|
|
76
|
+
|
|
69
77
|
def prepare_training_testing_sets(labels, scores, strand_biases) -> tuple[list, list, list]:
|
|
70
78
|
"""Prepare training and testing datasets based on strand biases."""
|
|
71
79
|
train_data, train_labels, test_data = [], [], []
|
|
@@ -96,20 +104,23 @@ def allocate_labels(labels: list[int], strand_biases: dict[str, bool], dtree, te
|
|
|
96
104
|
|
|
97
105
|
def remove_biased_clusters(path_sample: Path, path_score_sample: Path, labels: list[int]) -> list[int]:
    """Re-label reads of strand-biased clusters using decision-tree predictions.

    The correction loop runs only while the clusters are a mix of biased and
    unbiased ones. If every cluster is biased, or none is, the labels are
    returned unchanged. A cap of 1000 iterations guards against an infinite
    loop.

    Args:
        path_sample: JSONL file of sample reads used to tally strands per label.
        path_score_sample: JSONL file of per-read scores used as classifier input.
        labels: Cluster label of each read.

    Returns:
        The corrected labels (the input `labels` object itself when no
        correction round was needed).
    """
    strand_biases = annotate_strand_bias_by_labels(path_sample, labels)

    iteration_count = 0
    labels_corrected = labels
    while len(set(strand_biases.values())) > 1 and iteration_count < 1000:
        # Re-allocation of labels of biased clusters to unbiased clusters.
        # Each round starts from the labels produced by the previous round so
        # that they stay consistent with the keys of `strand_biases`.
        scores = io.read_jsonl(path_score_sample)
        train_data, train_labels, test_data = prepare_training_testing_sets(labels_corrected, scores, strand_biases)
        dtree = train_decision_tree(train_data, train_labels)
        labels_corrected = allocate_labels(labels_corrected, strand_biases, dtree, test_data)

        # Re-calculate strand biases based on the corrected labels. Using the
        # original `labels` here would leave the loop condition unchanged and
        # repeat identical work until the iteration cap is hit.
        strand_biases = annotate_strand_bias_by_labels(path_sample, labels_corrected)

        iteration_count += 1

    return labels_corrected
|
|
@@ -75,7 +75,9 @@ def cache_normalized_indels(ARGS, path_midsv_sample: Path) -> None:
|
|
|
75
75
|
path_midsv_control = extract_path_control(ARGS, allele)
|
|
76
76
|
path_midsv_n_filtered_control = extract_path_n_filtered_control(ARGS, path_midsv_control)
|
|
77
77
|
|
|
78
|
-
cache_selected_control_by_similarity(
|
|
78
|
+
cache_selected_control_by_similarity(
|
|
79
|
+
ARGS, path_midsv_n_filtered_control, path_midsv_sample, path_midsv_sample.parent
|
|
80
|
+
)
|
|
79
81
|
|
|
80
82
|
path_midsv_similar_control = Path(path_midsv_sample.parent, f"{allele}_{label}_control.jsonl")
|
|
81
83
|
|
|
@@ -72,14 +72,22 @@ def identify_normal_reads(
|
|
|
72
72
|
###########################################################
|
|
73
73
|
|
|
74
74
|
|
|
75
|
-
def filter_control(path_midsv_control: Path, path_midsv_sample: Path) -> list[bool]:
|
|
75
|
+
def filter_control(ARGS, path_midsv_control: Path, path_midsv_sample: Path) -> list[bool]:
|
|
76
76
|
"""
|
|
77
77
|
find similar control reads compared to sample reads
|
|
78
78
|
"""
|
|
79
79
|
cssplits = (m["CSSPLIT"].split(",") for m in io.read_jsonl(path_midsv_sample))
|
|
80
80
|
coverage_match = np.array([sum(1 for cs in cssplit if cs.startswith("=")) for cssplit in zip(*cssplits)])
|
|
81
81
|
mut_onehot_sample = onehot_by_mutations(io.read_jsonl(path_midsv_sample))
|
|
82
|
-
|
|
82
|
+
|
|
83
|
+
path_mut_onehot_control = Path(
|
|
84
|
+
ARGS.tempdir, ARGS.control_name, "consensus", f"{path_midsv_control.stem}_onehot.pickle"
|
|
85
|
+
)
|
|
86
|
+
if path_mut_onehot_control.exists():
|
|
87
|
+
mut_onehot_control = io.load_pickle(path_mut_onehot_control)
|
|
88
|
+
else:
|
|
89
|
+
mut_onehot_control = onehot_by_mutations(io.read_jsonl(path_midsv_control))
|
|
90
|
+
io.save_pickle(mut_onehot_control, path_mut_onehot_control)
|
|
83
91
|
|
|
84
92
|
mut_percentage_sample = calculate_percentage(mut_onehot_sample, coverage_match)
|
|
85
93
|
values_mask = get_values_to_mask(mut_percentage_sample)
|
|
@@ -90,8 +98,10 @@ def filter_control(path_midsv_control: Path, path_midsv_sample: Path) -> list[bo
|
|
|
90
98
|
return identify_normal_reads(mut_onehot_sample_masked, mut_onehot_control_masked)
|
|
91
99
|
|
|
92
100
|
|
|
93
|
-
def cache_selected_control_by_similarity(
|
|
94
|
-
|
|
101
|
+
def cache_selected_control_by_similarity(
|
|
102
|
+
ARGS, path_midsv_control: Path, path_midsv_sample: Path, path_output: Path
|
|
103
|
+
) -> None:
|
|
104
|
+
normal_reads_flags = filter_control(ARGS, path_midsv_control, path_midsv_sample)
|
|
95
105
|
midsv_control = io.read_jsonl(path_midsv_control)
|
|
96
106
|
midsv_filtered = (m for m, flag in zip(midsv_control, normal_reads_flags) if flag is True)
|
|
97
107
|
|
|
@@ -5,7 +5,7 @@ import logging
|
|
|
5
5
|
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
|
|
8
|
-
from DAJIN2.utils import io, fastx_handler
|
|
8
|
+
from DAJIN2.utils import io, config, fastx_handler
|
|
9
9
|
from DAJIN2.core import classification, clustering, consensus, preprocess, report
|
|
10
10
|
from DAJIN2.core.preprocess.input_formatter import FormattedInputs
|
|
11
11
|
|
|
@@ -18,6 +18,9 @@ logger = logging.getLogger(__name__)
|
|
|
18
18
|
|
|
19
19
|
|
|
20
20
|
def execute_control(arguments: dict):
|
|
21
|
+
|
|
22
|
+
logger.info(f"\N{runner} Start running DAJIN2 version {config.DAJIN_VERSION}")
|
|
23
|
+
|
|
21
24
|
logger.info(f"{arguments['control']} is now processing...")
|
|
22
25
|
|
|
23
26
|
###########################################################
|
|
@@ -1,11 +1,18 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import ssl
|
|
3
4
|
from urllib.request import urlopen
|
|
4
5
|
|
|
5
6
|
|
|
7
|
+
def fetch_html_without_verification(url: str) -> list[str]:
    """Download `url` and return the response body as a list of lines.

    The body is decoded as UTF-8 and split on newline characters, so a body
    that ends with a newline yields a final empty string. The request times
    out after 10 seconds.

    SECURITY: TLS certificate verification is disabled for this request, so
    the server's identity is not checked. Only use this for public,
    non-sensitive data.
    """
    # NOTE(review): ssl._create_unverified_context is a private CPython helper;
    # consider a verified context (e.g. ssl.create_default_context with a CA
    # bundle) once the certificate problem that motivated this is resolved.
    context = ssl._create_unverified_context()
    with urlopen(url, context=context, timeout=10) as response:
        return response.read().decode("utf-8").split("\n")
|
|
11
|
+
|
|
12
|
+
|
|
6
13
|
def fetch_seq_coordinates(genome: str, blat_url: str, seq: str) -> dict:
|
|
7
14
|
url = f"{blat_url}?db={genome}&type=BLAT&userSeq={seq}"
|
|
8
|
-
records =
|
|
15
|
+
records = fetch_html_without_verification(url)
|
|
9
16
|
matches = []
|
|
10
17
|
for record in records:
|
|
11
18
|
if "100.0%" not in record:
|
|
@@ -43,9 +50,9 @@ def fetch_chromosome_size(genome_coordinates: dict, genome_urls: dict) -> int:
|
|
|
43
50
|
genome = genome_coordinates["genome"]
|
|
44
51
|
url = f"{genome_urls['goldenpath']}/{genome}/bigZips/{genome}.chrom.sizes"
|
|
45
52
|
|
|
46
|
-
|
|
47
|
-
for
|
|
48
|
-
chrom_name, size =
|
|
53
|
+
records = fetch_html_without_verification(url)
|
|
54
|
+
for record in records:
|
|
55
|
+
chrom_name, size = record.split("\t")
|
|
49
56
|
if chrom == chrom_name:
|
|
50
57
|
return int(size)
|
|
51
58
|
raise ValueError(f"Chromosome {chrom} size not found.")
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_repeat_regions(sequence: str, loci: set[int]) -> list[tuple[int, int]]:
    """Locate homopolymer runs that are not flanked by candidate mutation loci.

    A homopolymer is a run of four or more identical A, C, G, T or N
    characters. Each run is reported as a half-open ``(start, end)`` span as
    given by ``re.Match.span``. A run is discarded only when both
    ``start - 1`` and ``end + 1`` are members of `loci`, because such a run is
    likely to be covered by real mutations.
    """
    homopolymer = re.compile(r"A{4,}|C{4,}|G{4,}|T{4,}|N{4,}")
    spans = (found.span() for found in homopolymer.finditer(sequence))
    return [(start, end) for start, end in spans if (start - 1) not in loci or (end + 1) not in loci]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def cosine_similarity(X, Y) -> float:
    """Return the cosine similarity of two equal-length numeric vectors.

    A small offset (1e-6) is added to every element so that all-zero vectors
    do not cause a division by zero. The offset is applied to float copies:
    the caller's arrays are left untouched, and integer arrays or plain
    sequences are accepted (an in-place ``+=`` would mutate the inputs and
    fail on integer arrays).
    """
    x = np.asarray(X, dtype=float) + 1e-6
    y = np.asarray(Y, dtype=float) + 1e-6
    return float(np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)))
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
###########################################################
|
|
30
|
+
# main
|
|
31
|
+
###########################################################
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _min_max_scale(values: np.ndarray) -> np.ndarray:
    """Rescale `values` to the range [0, 1]; a constant array maps to all zeros."""
    lowest, highest = values.min(), values.max()
    if highest - lowest == 0:
        # Force a float result: an integer zero array would break the float
        # offset that cosine_similarity adds to its inputs.
        return np.zeros_like(values, dtype=float)
    return (values - lowest) / (highest - lowest)


def extract_sequence_errors_in_homopolymer_loci(
    sequence: str,
    indels_normalized_sample: dict[str, np.ndarray],
    indels_normalized_control: dict[str, np.ndarray],
    anomal_loci: dict[str, set[int]],
) -> dict[str, set[int]]:
    """Collect homopolymer loci whose sample profile closely matches the control.

    For each mutation key ("+", "-", "*") the homopolymer regions of `sequence`
    are obtained from `get_repeat_regions`. Within every region the sample and
    control values are min-max scaled and compared; when their cosine
    similarity exceeds 0.95 the region is treated as a sequencing error.

    Args:
        sequence: Reference sequence searched for homopolymers.
        indels_normalized_sample: Per-position values of the sample, keyed by mutation key.
        indels_normalized_control: Per-position values of the control, keyed by mutation key.
        anomal_loci: Candidate mutation loci, keyed by mutation key.

    Returns:
        For every mutation key, the set of loci judged to be sequencing errors
        (an empty set when nothing qualifies).
    """
    sequence_errors_in_homopolymer = {}
    for mut in ["+", "-", "*"]:
        sequence_errors = set()
        for start, end in get_repeat_regions(sequence, anomal_loci[mut]):
            x = np.array(indels_normalized_sample[mut][start:end])
            y = np.array(indels_normalized_control[mut][start:end])

            # Scaling data to [0, 1] for cosine similarity
            if cosine_similarity(_min_max_scale(x), _min_max_scale(y)) > 0.95:
                # NOTE(review): `end` is already exclusive, so `end + 1` also
                # flags the locus right after the homopolymer. This looks
                # intentional, but confirm.
                sequence_errors.update(range(start, end + 1))

        sequence_errors_in_homopolymer[mut] = sequence_errors

    return sequence_errors_in_homopolymer
|