PyPI - DAJIN2 - Versions diffs - 0.4.5__zip → 0.4.6__zip - Mend

DAJIN2 0.4.5zip → 0.4.6zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

{dajin2-0.4.5/src/DAJIN2.egg-info → dajin2-0.4.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: DAJIN2
-Version: 0.4.5
+Version: 0.4.6
 Summary: One-step genotyping tools for targeted long-read sequencing
 Home-page: https://github.com/akikuno/DAJIN2
 Author: Akihiro Kuno
@@ -14,6 +14,7 @@ Classifier: Intended Audience :: Science/Research
 Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: certifi
 Requires-Dist: numpy>=1.24.0
 Requires-Dist: scipy>=1.10.0
 Requires-Dist: pandas>=1.0.0
@@ -61,7 +62,7 @@ The name DAJIN is derived from the phrase 一網**打尽** (Ichimou **DAJIN** or
 ### Prerequisites
-- Python 3.8 or later
+- Python 3.8 to 3.10
 - Unix-like environment (Linux, macOS, WSL2, etc.)
 ### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
@@ -71,6 +72,10 @@ conda create -n env-dajin2 -c conda-forge -c bioconda python=3.10 DAJIN2 -y
 conda activate env-dajin2
 ```
+> [!IMPORTANT]
+> DAJIN2 supports Python 3.8 to 3.10. Python 3.11 is not yet supported because of a [Bioconda issue](https://github.com/bioconda/bioconda-recipes/issues/37805).
 > [!NOTE]
 > To Apple Silicon (ARM64) users:
 > [Since the Bioconda channel does not yet support Apple Silicon](https://github.com/bioconda/bioconda-recipes/issues/37068#issuecomment-1257790919), please use the following command to install `DAJIN2` through Rosetta.

{dajin2-0.4.5 → dajin2-0.4.6}/README.md RENAMED Viewed

@@ -28,7 +28,7 @@ The name DAJIN is derived from the phrase 一網**打尽** (Ichimou **DAJIN** or
 ### Prerequisites
-- Python 3.8 or later
+- Python 3.8 to 3.10
 - Unix-like environment (Linux, macOS, WSL2, etc.)
 ### From [Bioconda](https://anaconda.org/bioconda/DAJIN2) (Recommended)
@@ -38,6 +38,10 @@ conda create -n env-dajin2 -c conda-forge -c bioconda python=3.10 DAJIN2 -y
 conda activate env-dajin2
 ```
+> [!IMPORTANT]
+> DAJIN2 supports Python 3.8 to 3.10. Python 3.11 is not yet supported because of a [Bioconda issue](https://github.com/bioconda/bioconda-recipes/issues/37805).
 > [!NOTE]
 > To Apple Silicon (ARM64) users:
 > [Since the Bioconda channel does not yet support Apple Silicon](https://github.com/bioconda/bioconda-recipes/issues/37068#issuecomment-1257790919), please use the following command to install `DAJIN2` through Rosetta.

{dajin2-0.4.5 → dajin2-0.4.6}/requirements.txt RENAMED Viewed

@@ -1,3 +1,5 @@
+certifi
 numpy >= 1.24.0
 scipy >= 1.10.0
 pandas >= 1.0.0

{dajin2-0.4.5 → dajin2-0.4.6}/setup.py RENAMED Viewed

@@ -9,7 +9,7 @@ with open("requirements.txt") as requirements_file:
 setuptools.setup(
     name="DAJIN2",
-    version="0.4.5",
+    version="0.4.6",
     author="Akihiro Kuno",
     author_email="akuno@md.tsukuba.ac.jp",
     description="One-step genotyping tools for targeted long-read sequencing",

{dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/clustering/strand_bias_handler.py RENAMED Viewed

@@ -9,6 +9,7 @@ Re-allocates reads belonging to clusters with strand bias to clusters without st
 """
 from pathlib import Path
+from typing import Generator
 from collections import defaultdict
 from sklearn.tree import DecisionTreeClassifier
@@ -40,7 +41,7 @@ def is_strand_bias(path_control: Path) -> bool:
 ###############################################################################
-def count_strand(labels: list[int], samples: list[dict[str, str]]) -> tuple[dict[str, int], dict[str, int]]:
+def count_strand(labels: list[int], samples: Generator[dict[str, str]]) -> tuple[dict[str, int], dict[str, int]]:
     """Count the occurrences of each strand type by label."""
     positive_strand_counts_by_labels = defaultdict(int)
     total_counts_by_labels = defaultdict(int)
@@ -54,18 +55,25 @@ def count_strand(labels: list[int], samples: list[dict[str, str]]) -> tuple[dict
 def determine_strand_biases(
-    positive_strand_counts_by_labels: defaultdict, total_counts_by_labels: defaultdict
+    positive_strand_counts_by_labels: dict[str, int], total_counts_by_labels: dict[str, int]
 ) -> dict[int, bool]:
     """Determine strand biases based on positive strand counts."""
     strand_biases = {}
     for label, total in total_counts_by_labels.items():
-        positive_strand_count = positive_strand_counts_by_labels[label]
+        positive_strand_count = positive_strand_counts_by_labels.get(label, 0)
         strand_ratio = positive_strand_count / total
         strand_biases[label] = not (STRAND_BIAS_LOWER_LIMIT < strand_ratio < STRAND_BIAS_UPPER_LIMIT)
     return strand_biases
+def annotate_strand_bias_by_labels(path_sample: Path, labels: list[int]) -> bool:
+    """Determine whether there is strand bias in the samples based on the provided labels."""
+    samples = io.read_jsonl(path_sample)
+    positive_strand_counts_by_labels, total_counts_by_labels = count_strand(labels, samples)
+    return determine_strand_biases(positive_strand_counts_by_labels, total_counts_by_labels)
 def prepare_training_testing_sets(labels, scores, strand_biases) -> tuple[list, list, list]:
     """Prepare training and testing datasets based on strand biases."""
     train_data, train_labels, test_data = [], [], []
@@ -96,20 +104,23 @@ def allocate_labels(labels: list[int], strand_biases: dict[str, bool], dtree, te
 def remove_biased_clusters(path_sample: Path, path_score_sample: Path, labels: list[int]) -> list[int]:
     """Remove clusters with strand bias by re-labeling based on decision tree predictions.
-    Continue until at least one of the samples exhibits strand bias (i.e., do not calculate if all samples exhibit strand bias, or conversely, if none of the samples exhibit strand bias) or
+    The loop runs while some, but not all, clusters exhibit strand bias (i.e., nothing is calculated if all clusters exhibit strand bias or if none of them do) and stops once
     1000 iterations are reached, which serves as a safeguard to prevent infinite loops.
     """
-    samples = io.read_jsonl(path_sample)
-    positive_strand_counts_by_labels, total_counts_by_labels = count_strand(labels, samples)
-    strand_biases = determine_strand_biases(positive_strand_counts_by_labels, total_counts_by_labels)
+    strand_biases = annotate_strand_bias_by_labels(path_sample, labels)
     iteration_count = 0
     labels_corrected = labels
     while len(set(strand_biases.values())) > 1 and iteration_count < 1000:
+        # Re-allocation of labels of biased clusters to unbiased clusters
         scores = io.read_jsonl(path_score_sample)
         train_data, train_labels, test_data = prepare_training_testing_sets(labels, scores, strand_biases)
         dtree = train_decision_tree(train_data, train_labels)
         labels_corrected = allocate_labels(labels, strand_biases, dtree, test_data)
-        strand_biases = determine_strand_biases(labels_corrected, path_sample)
+        # Re-calculate strand biases based on the corrected labels
+        strand_biases = annotate_strand_bias_by_labels(path_sample, labels)
         iteration_count += 1
     return labels_corrected

{dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/consensus/mutation_extractor.py RENAMED Viewed

@@ -75,7 +75,9 @@ def cache_normalized_indels(ARGS, path_midsv_sample: Path) -> None:
     path_midsv_control = extract_path_control(ARGS, allele)
     path_midsv_n_filtered_control = extract_path_n_filtered_control(ARGS, path_midsv_control)
-    cache_selected_control_by_similarity(path_midsv_n_filtered_control, path_midsv_sample, path_midsv_sample.parent)
+    cache_selected_control_by_similarity(
+        ARGS, path_midsv_n_filtered_control, path_midsv_sample, path_midsv_sample.parent
+    )
     path_midsv_similar_control = Path(path_midsv_sample.parent, f"{allele}_{label}_control.jsonl")

{dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/consensus/similarity_searcher.py RENAMED Viewed

@@ -72,14 +72,22 @@ def identify_normal_reads(
 ###########################################################
-def filter_control(path_midsv_control: Path, path_midsv_sample: Path) -> list[bool]:
+def filter_control(ARGS, path_midsv_control: Path, path_midsv_sample: Path) -> list[bool]:
     """
     find similar control reads compared to sample reads
     """
     cssplits = (m["CSSPLIT"].split(",") for m in io.read_jsonl(path_midsv_sample))
     coverage_match = np.array([sum(1 for cs in cssplit if cs.startswith("=")) for cssplit in zip(*cssplits)])
     mut_onehot_sample = onehot_by_mutations(io.read_jsonl(path_midsv_sample))
-    mut_onehot_control = onehot_by_mutations(io.read_jsonl(path_midsv_control))
+    path_mut_onehot_control = Path(
+        ARGS.tempdir, ARGS.control_name, "consensus", f"{path_midsv_control.stem}_onehot.pickle"
+    )
+    if path_mut_onehot_control.exists():
+        mut_onehot_control = io.load_pickle(path_mut_onehot_control)
+    else:
+        mut_onehot_control = onehot_by_mutations(io.read_jsonl(path_midsv_control))
+        io.save_pickle(mut_onehot_control, path_mut_onehot_control)
     mut_percentage_sample = calculate_percentage(mut_onehot_sample, coverage_match)
     values_mask = get_values_to_mask(mut_percentage_sample)
@@ -90,8 +98,10 @@ def filter_control(path_midsv_control: Path, path_midsv_sample: Path) -> list[bo
     return identify_normal_reads(mut_onehot_sample_masked, mut_onehot_control_masked)
-def cache_selected_control_by_similarity(path_midsv_control: Path, path_midsv_sample: Path, path_output: Path) -> None:
-    normal_reads_flags = filter_control(path_midsv_control, path_midsv_sample)
+def cache_selected_control_by_similarity(
+    ARGS, path_midsv_control: Path, path_midsv_sample: Path, path_output: Path
+) -> None:
+    normal_reads_flags = filter_control(ARGS, path_midsv_control, path_midsv_sample)
     midsv_control = io.read_jsonl(path_midsv_control)
     midsv_filtered = (m for m, flag in zip(midsv_control, normal_reads_flags) if flag is True)

{dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/core.py RENAMED Viewed

@@ -5,7 +5,7 @@ import logging
 from pathlib import Path
-from DAJIN2.utils import io, fastx_handler
+from DAJIN2.utils import io, config, fastx_handler
 from DAJIN2.core import classification, clustering, consensus, preprocess, report
 from DAJIN2.core.preprocess.input_formatter import FormattedInputs
@@ -18,6 +18,9 @@ logger = logging.getLogger(__name__)
 def execute_control(arguments: dict):
+    logger.info(f"\N{runner} Start running DAJIN2 version {config.DAJIN_VERSION}")
     logger.info(f"{arguments['control']} is now processing...")
     ###########################################################

{dajin2-0.4.5 → dajin2-0.4.6}/src/DAJIN2/core/preprocess/genome_fetcher.py RENAMED Viewed

@@ -1,11 +1,18 @@
 from __future__ import annotations
+import ssl
 from urllib.request import urlopen
+def fetch_html_without_verification(url: str) -> str:
+    context = ssl._create_unverified_context()  # Create an SSL context that temporarily disables verification
+    with urlopen(url, context=context, timeout=10) as response:
+        return response.read().decode("utf-8").split("\n")
 def fetch_seq_coordinates(genome: str, blat_url: str, seq: str) -> dict:
     url = f"{blat_url}?db={genome}&type=BLAT&userSeq={seq}"
-    records = urlopen(url).read().decode("utf8").split("\n")
+    records = fetch_html_without_verification(url)
     matches = []
     for record in records:
         if "100.0%" not in record:
@@ -43,9 +50,9 @@ def fetch_chromosome_size(genome_coordinates: dict, genome_urls: dict) -> int:
     genome = genome_coordinates["genome"]
     url = f"{genome_urls['goldenpath']}/{genome}/bigZips/{genome}.chrom.sizes"
-    response = urlopen(url).read().decode("utf8").split("\n")
-    for line in response:
-        chrom_name, size = line.split("\t")
+    records = fetch_html_without_verification(url)
+    for record in records:
+        chrom_name, size = record.split("\t")
         if chrom == chrom_name:
             return int(size)
     raise ValueError(f"Chromosome {chrom} size not found.")

dajin2-0.4.6/src/DAJIN2/core/preprocess/homopolymer_handler.py ADDED Viewed

@@ -0,0 +1,70 @@
+from __future__ import annotations
+import re
+import numpy as np
+def get_repeat_regions(sequence: str, loci: set[int]) -> list[tuple[int, int]]:
+    """
+    Find homopolymers in the sequence, but discard those that
+    are adjacent to candidate mutation loci because they are
+    likely to be covered by real mutations
+    """
+    pattern = r"A{4,}|C{4,}|G{4,}|T{4,}|N{4,}"
+    repeat_regions = []
+    for start, end in (match.span() for match in re.finditer(pattern, sequence)):
+        if not (start - 1 in loci and end + 1 in loci):
+            repeat_regions.append((start, end))
+    return repeat_regions
+def cosine_similarity(X, Y) -> float:
+    # Add 1e-6 to avoid division by zero when calculating cosine similarity
+    X += 1e-6
+    Y += 1e-6
+    return float(np.dot(X, Y) / (np.linalg.norm(X) * np.linalg.norm(Y)))
+###########################################################
+# main
+###########################################################
+def extract_sequence_errors_in_homopolymer_loci(
+    sequence: str,
+    indels_normalized_sample: dict[str, np.array],
+    indels_normalized_control: dict[str, np.array],
+    anomal_loci: dict[set],
+) -> dict[str, set[int]]:
+    sequence_errors_in_homopolymer = dict()
+    for mut in ["+", "-", "*"]:
+        repeat_regions = get_repeat_regions(sequence, anomal_loci[mut])
+        if len(repeat_regions) == 0:
+            sequence_errors_in_homopolymer[mut] = set()
+            continue
+        sequence_errors = set()
+        for start, end in repeat_regions:
+            x = np.array(indels_normalized_sample[mut][start:end])
+            y = np.array(indels_normalized_control[mut][start:end])
+            # Scaling data to [0, 1] for cosine similarity
+            # Check if the range of x is zero
+            if x.max() - x.min() == 0:
+                x_scaled = np.zeros_like(x)
+            else:
+                x_scaled = (x - x.min()) / (x.max() - x.min())
+            # Check if the range of y is zero
+            if y.max() - y.min() == 0:
+                y_scaled = np.zeros_like(y)
+            else:
+                y_scaled = (y - y.min()) / (y.max() - y.min())
+            if cosine_similarity(x_scaled, y_scaled) > 0.95:
+                sequence_errors.update(range(start, end + 1))
+        sequence_errors_in_homopolymer[mut] = sequence_errors
+    return sequence_errors_in_homopolymer

DAJIN2 0.4.5__zip → 0.4.6__zip

DAJIN2 0.4.5zip → 0.4.6zip