DAJIN2 0.4.5__zip → 0.4.6__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. {dajin2-0.4.5/src/DAJIN2.egg-info → dajin2-0.4.6}/PKG-INFO +7 -2
  2. {dajin2-0.4.5 → dajin2-0.4.6}/README.md +5 -1
  3. {dajin2-0.4.5 → dajin2-0.4.6}/requirements.txt +2 -0
  4. {dajin2-0.4.5 → dajin2-0.4.6}/setup.py +1 -1
  5. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/strand_bias_handler.py +19 -8
  6. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/consensus/mutation_extractor.py +3 -1
  7. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/consensus/similarity_searcher.py +14 -4
  8. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/core.py +4 -1
  9. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/genome_fetcher.py +11 -4
  10. dajin2-0.4.6/src/DAJIN2/core/preprocess/homopolymer_handler.py +70 -0
  11. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/insertions_to_fasta.py +136 -78
  12. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/knockin_handler.py +11 -11
  13. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/midsv_caller.py +3 -3
  14. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/mutation_extractor.py +33 -18
  15. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/main.py +39 -39
  16. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/config.py +2 -2
  17. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/cssplits_handler.py +71 -45
  18. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/input_validator.py +22 -15
  19. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/io.py +19 -10
  20. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/sam_handler.py +1 -1
  21. {dajin2-0.4.5 → dajin2-0.4.6/src/DAJIN2.egg-info}/PKG-INFO +7 -2
  22. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2.egg-info/requires.txt +1 -0
  23. dajin2-0.4.5/src/DAJIN2/core/preprocess/homopolymer_handler.py +0 -51
  24. {dajin2-0.4.5 → dajin2-0.4.6}/LICENSE +0 -0
  25. {dajin2-0.4.5 → dajin2-0.4.6}/MANIFEST.in +0 -0
  26. {dajin2-0.4.5 → dajin2-0.4.6}/setup.cfg +0 -0
  27. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/__init__.py +0 -0
  28. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/__init__.py +0 -0
  29. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/classification/__init__.py +0 -0
  30. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/classification/allele_merger.py +0 -0
  31. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/classification/classifier.py +0 -0
  32. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/__init__.py +0 -0
  33. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/appender.py +0 -0
  34. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/clustering.py +0 -0
  35. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/kmer_generator.py +0 -0
  36. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/label_extractor.py +0 -0
  37. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/label_merger.py +0 -0
  38. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/label_updator.py +0 -0
  39. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/score_handler.py +0 -0
  40. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/consensus/__init__.py +0 -0
  41. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/consensus/clust_formatter.py +0 -0
  42. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/consensus/consensus.py +0 -0
  43. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/consensus/name_handler.py +0 -0
  44. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/__init__.py +0 -0
  45. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/cache_checker.py +0 -0
  46. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/directory_manager.py +0 -0
  47. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/input_formatter.py +0 -0
  48. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/mapping.py +0 -0
  49. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/report/__init__.py +0 -0
  50. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/report/bam_exporter.py +0 -0
  51. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/report/insertion_reflector.py +0 -0
  52. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/report/mutation_exporter.py +0 -0
  53. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/report/sequence_exporter.py +0 -0
  54. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/gui.py +0 -0
  55. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/static/css/style.css +0 -0
  56. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/template_igvjs.html +0 -0
  57. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/templates/index.html +0 -0
  58. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/dna_handler.py +0 -0
  59. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/fastx_handler.py +0 -0
  60. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/multiprocess.py +0 -0
  61. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/utils/report_generator.py +0 -0
  62. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/view.py +0 -0
  63. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2.egg-info/SOURCES.txt +0 -0
  64. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2.egg-info/dependency_links.txt +0 -0
  65. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2.egg-info/entry_points.txt +0 -0
  66. {dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: DAJIN2
3
- Version: 0.4.5
3
+ Version: 0.4.6
4
4
  Summary: One-step genotyping tools for targeted long-read sequencing
5
5
  Home-page: https://github.com/akikuno/DAJIN2
6
6
  Author: Akihiro Kuno
@@ -14,6 +14,7 @@ Classifier: Intended Audience :: Science/Research
14
14
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
15
15
  Description-Content-Type: text/markdown
16
16
  License-File: LICENSE
17
+ Requires-Dist: certifi
17
18
  Requires-Dist: numpy>=1.24.0
18
19
  Requires-Dist: scipy>=1.10.0
19
20
  Requires-Dist: pandas>=1.0.0
@@ -61,7 +62,7 @@ The name DAJIN is derived from the phrase 一網**打尽** (Ichimou **DAJIN** or
61
62
 
62
63
  ### Prerequisites
63
64
 
64
- - Python 3.8 or later
65
+ - Python 3.8 to 3.10
65
66
  - Unix-like environment (Linux, macOS, WSL2, etc.)
66
67
 
67
68
  ### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
@@ -71,6 +72,10 @@ conda create -n env-dajin2 -c conda-forge -c bioconda python=3.10 DAJIN2 -y
71
72
  conda activate env-dajin2
72
73
  ```
73
74
 
75
+ > [!IMPORTANT]
76
+ > DAJIN2 supports Python versions 3.8 to 3.10, but not Python 3.11 yet due to a [Bioconda issue](https://github.com/bioconda/bioconda-recipes/issues/37805).
77
+
78
+
74
79
  > [!NOTE]
75
80
  > To Apple Silicon (ARM64) users:
76
81
  > [Since the Bioconda channel does not yet support Apple Silicon](https://github.com/bioconda/bioconda-recipes/issues/37068#issuecomment-1257790919), please use the following command to install `DAJIN2` through Rosetta.
@@ -28,7 +28,7 @@ The name DAJIN is derived from the phrase 一網**打尽** (Ichimou **DAJIN** or
28
28
 
29
29
  ### Prerequisites
30
30
 
31
- - Python 3.8 or later
31
+ - Python 3.8 to 3.10
32
32
  - Unix-like environment (Linux, macOS, WSL2, etc.)
33
33
 
34
34
  ### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
@@ -38,6 +38,10 @@ conda create -n env-dajin2 -c conda-forge -c bioconda python=3.10 DAJIN2 -y
38
38
  conda activate env-dajin2
39
39
  ```
40
40
 
41
+ > [!IMPORTANT]
42
+ > DAJIN2 supports Python versions 3.8 to 3.10, but not Python 3.11 yet due to a [Bioconda issue](https://github.com/bioconda/bioconda-recipes/issues/37805).
43
+
44
+
41
45
  > [!NOTE]
42
46
  > To Apple Silicon (ARM64) users:
43
47
  > [Since the Bioconda channel does not yet support Apple Silicon](https://github.com/bioconda/bioconda-recipes/issues/37068#issuecomment-1257790919), please use the following command to install `DAJIN2` through Rosetta.
@@ -1,3 +1,5 @@
1
+ certifi
2
+
1
3
  numpy >= 1.24.0
2
4
  scipy >= 1.10.0
3
5
  pandas >= 1.0.0
@@ -9,7 +9,7 @@ with open("requirements.txt") as requirements_file:
9
9
 
10
10
  setuptools.setup(
11
11
  name="DAJIN2",
12
- version="0.4.5",
12
+ version="0.4.6",
13
13
  author="Akihiro Kuno",
14
14
  author_email="akuno@md.tsukuba.ac.jp",
15
15
  description="One-step genotyping tools for targeted long-read sequencing",
@@ -9,6 +9,7 @@ Re-allocates reads belonging to clusters with strand bias to clusters without st
9
9
  """
10
10
 
11
11
  from pathlib import Path
12
+ from typing import Generator
12
13
  from collections import defaultdict
13
14
  from sklearn.tree import DecisionTreeClassifier
14
15
 
@@ -40,7 +41,7 @@ def is_strand_bias(path_control: Path) -> bool:
40
41
  ###############################################################################
41
42
 
42
43
 
43
- def count_strand(labels: list[int], samples: list[dict[str, str]]) -> tuple[dict[str, int], dict[str, int]]:
44
+ def count_strand(labels: list[int], samples: Generator[dict[str, str]]) -> tuple[dict[str, int], dict[str, int]]:
44
45
  """Count the occurrences of each strand type by label."""
45
46
  positive_strand_counts_by_labels = defaultdict(int)
46
47
  total_counts_by_labels = defaultdict(int)
@@ -54,18 +55,25 @@ def count_strand(labels: list[int], samples: list[dict[str, str]]) -> tuple[dict
54
55
 
55
56
 
56
57
  def determine_strand_biases(
57
- positive_strand_counts_by_labels: defaultdict, total_counts_by_labels: defaultdict
58
+ positive_strand_counts_by_labels: dict[str, int], total_counts_by_labels: dict[str, int]
58
59
  ) -> dict[int, bool]:
59
60
  """Determine strand biases based on positive strand counts."""
60
61
  strand_biases = {}
61
62
  for label, total in total_counts_by_labels.items():
62
- positive_strand_count = positive_strand_counts_by_labels[label]
63
+ positive_strand_count = positive_strand_counts_by_labels.get(label, 0)
63
64
  strand_ratio = positive_strand_count / total
64
65
  strand_biases[label] = not (STRAND_BIAS_LOWER_LIMIT < strand_ratio < STRAND_BIAS_UPPER_LIMIT)
65
66
 
66
67
  return strand_biases
67
68
 
68
69
 
70
def annotate_strand_bias_by_labels(path_sample: Path, labels: list[int]) -> dict[int, bool]:
    """Flag, for each cluster label, whether its reads show strand bias.

    Args:
        path_sample: JSONL file of sample reads, loaded with ``io.read_jsonl``.
        labels: Cluster labels passed to ``count_strand`` together with the reads
            (presumably one label per read, in file order; verify against caller).

    Returns:
        The mapping produced by ``determine_strand_biases``:
        label -> True when that label is strand-biased.
    """
    samples = io.read_jsonl(path_sample)
    positive_strand_counts_by_labels, total_counts_by_labels = count_strand(labels, samples)
    return determine_strand_biases(positive_strand_counts_by_labels, total_counts_by_labels)
75
+
76
+
69
77
  def prepare_training_testing_sets(labels, scores, strand_biases) -> tuple[list, list, list]:
70
78
  """Prepare training and testing datasets based on strand biases."""
71
79
  train_data, train_labels, test_data = [], [], []
@@ -96,20 +104,23 @@ def allocate_labels(labels: list[int], strand_biases: dict[str, bool], dtree, te
96
104
 
97
105
  def remove_biased_clusters(path_sample: Path, path_score_sample: Path, labels: list[int]) -> list[int]:
98
106
  """Remove clusters with strand bias by re-labeling based on decision tree predictions.
99
- Continue until at least one of the samples exhibits strand bias (i.e., do not calculate if all samples exhibit strand bias, or conversely, if none of the samples exhibit strand bias) or
107
+ Continue until at least one of the samples exhibits strand bias (i.e., do not calculate if all samples exhibit strand bias or, if none of the samples exhibit strand bias) and
100
108
  1000 iterations are reached, which serves as a safeguard to prevent infinite loops.
101
109
  """
102
- samples = io.read_jsonl(path_sample)
103
- positive_strand_counts_by_labels, total_counts_by_labels = count_strand(labels, samples)
104
- strand_biases = determine_strand_biases(positive_strand_counts_by_labels, total_counts_by_labels)
110
+ strand_biases = annotate_strand_bias_by_labels(path_sample, labels)
105
111
 
106
112
  iteration_count = 0
107
113
  labels_corrected = labels
108
114
  while len(set(strand_biases.values())) > 1 and iteration_count < 1000:
115
+ # Re-allocation of labels of biased clusters to unbiased clusters
109
116
  scores = io.read_jsonl(path_score_sample)
110
117
  train_data, train_labels, test_data = prepare_training_testing_sets(labels, scores, strand_biases)
111
118
  dtree = train_decision_tree(train_data, train_labels)
112
119
  labels_corrected = allocate_labels(labels, strand_biases, dtree, test_data)
113
- strand_biases = determine_strand_biases(labels_corrected, path_sample)
120
+
121
+ # Re-calculate strand biases based on the corrected labels
122
+ strand_biases = annotate_strand_bias_by_labels(path_sample, labels)
123
+
114
124
  iteration_count += 1
125
+
115
126
  return labels_corrected
@@ -75,7 +75,9 @@ def cache_normalized_indels(ARGS, path_midsv_sample: Path) -> None:
75
75
  path_midsv_control = extract_path_control(ARGS, allele)
76
76
  path_midsv_n_filtered_control = extract_path_n_filtered_control(ARGS, path_midsv_control)
77
77
 
78
- cache_selected_control_by_similarity(path_midsv_n_filtered_control, path_midsv_sample, path_midsv_sample.parent)
78
+ cache_selected_control_by_similarity(
79
+ ARGS, path_midsv_n_filtered_control, path_midsv_sample, path_midsv_sample.parent
80
+ )
79
81
 
80
82
  path_midsv_similar_control = Path(path_midsv_sample.parent, f"{allele}_{label}_control.jsonl")
81
83
 
@@ -72,14 +72,22 @@ def identify_normal_reads(
72
72
  ###########################################################
73
73
 
74
74
 
75
- def filter_control(path_midsv_control: Path, path_midsv_sample: Path) -> list[bool]:
75
+ def filter_control(ARGS, path_midsv_control: Path, path_midsv_sample: Path) -> list[bool]:
76
76
  """
77
77
  find similar control reads compared to sample reads
78
78
  """
79
79
  cssplits = (m["CSSPLIT"].split(",") for m in io.read_jsonl(path_midsv_sample))
80
80
  coverage_match = np.array([sum(1 for cs in cssplit if cs.startswith("=")) for cssplit in zip(*cssplits)])
81
81
  mut_onehot_sample = onehot_by_mutations(io.read_jsonl(path_midsv_sample))
82
- mut_onehot_control = onehot_by_mutations(io.read_jsonl(path_midsv_control))
82
+
83
+ path_mut_onehot_control = Path(
84
+ ARGS.tempdir, ARGS.control_name, "consensus", f"{path_midsv_control.stem}_onehot.pickle"
85
+ )
86
+ if path_mut_onehot_control.exists():
87
+ mut_onehot_control = io.load_pickle(path_mut_onehot_control)
88
+ else:
89
+ mut_onehot_control = onehot_by_mutations(io.read_jsonl(path_midsv_control))
90
+ io.save_pickle(mut_onehot_control, path_mut_onehot_control)
83
91
 
84
92
  mut_percentage_sample = calculate_percentage(mut_onehot_sample, coverage_match)
85
93
  values_mask = get_values_to_mask(mut_percentage_sample)
@@ -90,8 +98,10 @@ def filter_control(path_midsv_control: Path, path_midsv_sample: Path) -> list[bo
90
98
  return identify_normal_reads(mut_onehot_sample_masked, mut_onehot_control_masked)
91
99
 
92
100
 
93
- def cache_selected_control_by_similarity(path_midsv_control: Path, path_midsv_sample: Path, path_output: Path) -> None:
94
- normal_reads_flags = filter_control(path_midsv_control, path_midsv_sample)
101
+ def cache_selected_control_by_similarity(
102
+ ARGS, path_midsv_control: Path, path_midsv_sample: Path, path_output: Path
103
+ ) -> None:
104
+ normal_reads_flags = filter_control(ARGS, path_midsv_control, path_midsv_sample)
95
105
  midsv_control = io.read_jsonl(path_midsv_control)
96
106
  midsv_filtered = (m for m, flag in zip(midsv_control, normal_reads_flags) if flag is True)
97
107
 
@@ -5,7 +5,7 @@ import logging
5
5
 
6
6
  from pathlib import Path
7
7
 
8
- from DAJIN2.utils import io, fastx_handler
8
+ from DAJIN2.utils import io, config, fastx_handler
9
9
  from DAJIN2.core import classification, clustering, consensus, preprocess, report
10
10
  from DAJIN2.core.preprocess.input_formatter import FormattedInputs
11
11
 
@@ -18,6 +18,9 @@ logger = logging.getLogger(__name__)
18
18
 
19
19
 
20
20
  def execute_control(arguments: dict):
21
+
22
+ logger.info(f"\N{runner} Start running DAJIN2 version {config.DAJIN_VERSION}")
23
+
21
24
  logger.info(f"{arguments['control']} is now processing...")
22
25
 
23
26
  ###########################################################
@@ -1,11 +1,18 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import ssl
3
4
  from urllib.request import urlopen
4
5
 
5
6
 
7
def fetch_html_without_verification(url: str) -> list[str]:
    """Fetch ``url`` and return the UTF-8 decoded response body split on newlines.

    The request times out after 10 seconds.

    NOTE(security): TLS certificate verification is disabled for this request,
    so the identity of the remote server is not checked. The context comes from
    ``ssl._create_unverified_context``, a private helper of the ``ssl`` module.
    Prefer a verified context (``ssl.create_default_context``) once the remote
    certificates validate.
    """
    context = ssl._create_unverified_context()  # Create an SSL context that disables verification
    with urlopen(url, context=context, timeout=10) as response:
        return response.read().decode("utf-8").split("\n")
11
+
12
+
6
13
  def fetch_seq_coordinates(genome: str, blat_url: str, seq: str) -> dict:
7
14
  url = f"{blat_url}?db={genome}&type=BLAT&userSeq={seq}"
8
- records = urlopen(url).read().decode("utf8").split("\n")
15
+ records = fetch_html_without_verification(url)
9
16
  matches = []
10
17
  for record in records:
11
18
  if "100.0%" not in record:
@@ -43,9 +50,9 @@ def fetch_chromosome_size(genome_coordinates: dict, genome_urls: dict) -> int:
43
50
  genome = genome_coordinates["genome"]
44
51
  url = f"{genome_urls['goldenpath']}/{genome}/bigZips/{genome}.chrom.sizes"
45
52
 
46
- response = urlopen(url).read().decode("utf8").split("\n")
47
- for line in response:
48
- chrom_name, size = line.split("\t")
53
+ records = fetch_html_without_verification(url)
54
+ for record in records:
55
+ chrom_name, size = record.split("\t")
49
56
  if chrom == chrom_name:
50
57
  return int(size)
51
58
  raise ValueError(f"Chromosome {chrom} size not found.")
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ import numpy as np
6
+
7
+
8
def get_repeat_regions(sequence: str, loci: set[int]) -> list[tuple[int, int]]:
    """
    Find homopolymers (runs of four or more identical A, C, G, T or N) in the
    sequence, discarding those flanked by candidate mutation loci because they
    are likely to be covered by the real mutations.

    A run is discarded only when both ``start - 1`` and ``end + 1`` are in
    ``loci``. Each returned span is ``(start, end)`` as reported by
    ``re.Match.span()``, i.e. ``end`` is exclusive.

    NOTE(review): since ``end`` is exclusive, the base immediately after a run
    is ``end`` rather than ``end + 1``; confirm the intended flank position.
    """
    pattern = r"A{4,}|C{4,}|G{4,}|T{4,}|N{4,}"
    return [
        (start, end)
        for start, end in (match.span() for match in re.finditer(pattern, sequence))
        if not (start - 1 in loci and end + 1 in loci)
    ]
20
+
21
+
22
def cosine_similarity(X, Y) -> float:
    """Return the cosine similarity of two equal-length numeric vectors.

    A small offset (1e-6) is added to every element of both vectors to avoid
    division by zero when a vector is all zeros. The offset is applied to float
    copies, so the caller's arrays are never modified and integer inputs are
    accepted.
    """
    X = np.asarray(X, dtype=float) + 1e-6
    Y = np.asarray(Y, dtype=float) + 1e-6
    return float(np.dot(X, Y) / (np.linalg.norm(X) * np.linalg.norm(Y)))
27
+
28
+
29
+ ###########################################################
30
+ # main
31
+ ###########################################################
32
+
33
+
34
def _scale_to_unit_range(values) -> np.ndarray:
    """Min-max scale ``values`` to [0, 1]; a constant array maps to all zeros."""
    values = np.asarray(values, dtype=float)
    lowest, highest = values.min(), values.max()
    # A zero range would divide by zero, so such arrays are represented as zeros.
    if highest - lowest == 0:
        return np.zeros_like(values)
    return (values - lowest) / (highest - lowest)


def extract_sequence_errors_in_homopolymer_loci(
    sequence: str,
    indels_normalized_sample: dict[str, np.ndarray],
    indels_normalized_control: dict[str, np.ndarray],
    anomal_loci: dict[str, set[int]],
) -> dict[str, set[int]]:
    """Collect homopolymer positions whose sample profile matches the control.

    For each mutation type ("+", "-", "*"), every homopolymer region returned by
    ``get_repeat_regions`` is compared between sample and control: both slices
    are min-max scaled to [0, 1] and, when their cosine similarity exceeds 0.95,
    the positions ``start`` through ``end`` inclusive are recorded as sequence
    errors.

    Args:
        sequence: Reference sequence searched for homopolymers.
        indels_normalized_sample: Per-mutation-type values for the sample,
            indexable by sequence position.
        indels_normalized_control: Same as above for the control.
        anomal_loci: Per-mutation-type candidate mutation loci.

    Returns:
        Mapping of mutation type -> set of positions judged to be sequence
        errors (an empty set when no region qualifies).
    """
    sequence_errors_in_homopolymer = {}
    for mut in ["+", "-", "*"]:
        sequence_errors = set()
        for start, end in get_repeat_regions(sequence, anomal_loci[mut]):
            # Scaling data to [0, 1] for cosine similarity
            x_scaled = _scale_to_unit_range(indels_normalized_sample[mut][start:end])
            y_scaled = _scale_to_unit_range(indels_normalized_control[mut][start:end])

            if cosine_similarity(x_scaled, y_scaled) > 0.95:
                sequence_errors.update(range(start, end + 1))

        sequence_errors_in_homopolymer[mut] = sequence_errors

    return sequence_errors_in_homopolymer