PyPI - sai-pg - Versions diffs - 1.0.0__py3-none-any.whl - Mend

sai-pg 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

sai/__init__.py +18 -0
sai/__main__.py +73 -0
sai/parsers/__init__.py +18 -0
sai/parsers/argument_validation.py +169 -0
sai/parsers/outlier_parser.py +76 -0
sai/parsers/plot_parser.py +152 -0
sai/parsers/score_parser.py +241 -0
sai/sai.py +315 -0
sai/stats/__init__.py +18 -0
sai/stats/features.py +302 -0
sai/utils/__init__.py +22 -0
sai/utils/generators/__init__.py +23 -0
sai/utils/generators/chunk_generator.py +148 -0
sai/utils/generators/data_generator.py +49 -0
sai/utils/generators/window_generator.py +250 -0
sai/utils/genomic_dataclasses.py +46 -0
sai/utils/multiprocessing/__init__.py +22 -0
sai/utils/multiprocessing/mp_manager.py +251 -0
sai/utils/multiprocessing/mp_pool.py +73 -0
sai/utils/preprocessors/__init__.py +23 -0
sai/utils/preprocessors/chunk_preprocessor.py +152 -0
sai/utils/preprocessors/data_preprocessor.py +94 -0
sai/utils/preprocessors/feature_preprocessor.py +211 -0
sai/utils/utils.py +689 -0
sai_pg-1.0.0.dist-info/METADATA +44 -0
sai_pg-1.0.0.dist-info/RECORD +30 -0
sai_pg-1.0.0.dist-info/WHEEL +5 -0
sai_pg-1.0.0.dist-info/entry_points.txt +2 -0
sai_pg-1.0.0.dist-info/licenses/LICENSE +674 -0
sai_pg-1.0.0.dist-info/top_level.txt +1 -0

sai/stats/features.py ADDED Viewed

@@ -0,0 +1,302 @@
+# Copyright 2025 Xin Huang
+#
+# GNU General Public License v3.0
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, please see
+#
+#    https://www.gnu.org/licenses/gpl-3.0.en.html
+import numpy as np
+def calc_freq(gts: np.ndarray, ploidy: int = 1) -> np.ndarray:
+    """
+    Calculates allele frequencies, supporting both phased and unphased data.
+    Parameters
+    ----------
+    gts : np.ndarray
+        A 2D numpy array where each row represents a locus and each column represents an individual.
+    ploidy : int, optional
+        Ploidy level of the organism. If ploidy=1, the function assumes phased data and calculates
+        frequency by taking the mean across individuals. For unphased data, it calculates frequency by
+        dividing the sum across individuals by the total number of alleles. Default is 1.
+    Returns
+    -------
+    np.ndarray
+        An array of allele frequencies for each locus.
+    """
+    return np.sum(gts, axis=1) / (gts.shape[1] * ploidy)
+def compute_matching_loci(
+    ref_gts: np.ndarray,
+    tgt_gts: np.ndarray,
+    src_gts_list: list[np.ndarray],
+    w: float,
+    y_list: list[tuple[str, float]],
+    ploidy: int,
+    anc_allele_available: bool,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Computes loci that meet specified allele frequency conditions across reference, target, and source genotypes.
+    Parameters
+    ----------
+    ref_gts : np.ndarray
+        A 2D numpy array where each row represents a locus and each column represents an individual in the reference group.
+    tgt_gts : np.ndarray
+        A 2D numpy array where each row represents a locus and each column represents an individual in the target group.
+    src_gts_list : list of np.ndarray
+        A list of 2D numpy arrays for each source population, where each row represents a locus and each column
+        represents an individual in that source population.
+    w : float
+        Threshold for the allele frequency in `ref_gts`. Only loci with frequencies less than `w` are counted.
+        Must be within the range [0, 1].
+    y_list : list of tuple[str, float]
+        List of allele frequency conditions for each source population in `src_gts_list`.
+        Each entry is a tuple (operator, threshold), where:
+        - `operator` can be '=', '<', '>', '<=', '>='
+        - `threshold` is a float within [0, 1]
+        The length must match `src_gts_list`.
+    ploidy : int
+        The ploidy level of the organism.
+    anc_allele_available : bool
+        If True, checks only for matches with `y` (assuming `1` represents the derived allele).
+        If False, checks both matches with `y` and `1 - y`, taking the dominant allele in the source as the reference.
+    Returns
+    -------
+    tuple[np.ndarray, np.ndarray, np.ndarray]
+        - Adjusted reference allele frequencies (`ref_freq`).
+        - Adjusted target allele frequencies (`tgt_freq`).
+        - Boolean array indicating loci that meet the specified frequency conditions (`condition`).
+    """
+    # Validate input parameters
+    if not (0 <= w <= 1):
+        raise ValueError("Parameters w must be within the range [0, 1].")
+    for op, y in y_list:
+        if not (0 <= y <= 1):
+            raise ValueError(f"Invalid value in y_list: {y}. within the range [0, 1].")
+        if op not in ("=", "<", ">", "<=", ">="):
+            raise ValueError(
+                f"Invalid operator in y_list: {op}. Must be '=', '<', '>', '<=', or '>='."
+            )
+    if len(src_gts_list) != len(y_list):
+        raise ValueError("The length of src_gts_list and y_list must match.")
+    # Compute allele frequencies
+    ref_freq = calc_freq(ref_gts, ploidy)
+    tgt_freq = calc_freq(tgt_gts, ploidy)
+    src_freq_list = [calc_freq(src_gts, ploidy) for src_gts in src_gts_list]
+    # Check match for each `y`
+    op_funcs = {
+        "=": lambda src_freq, y: src_freq == y,
+        "<": lambda src_freq, y: src_freq < y,
+        ">": lambda src_freq, y: src_freq > y,
+        "<=": lambda src_freq, y: src_freq <= y,
+        ">=": lambda src_freq, y: src_freq >= y,
+    }
+    match_conditions = [
+        op_funcs[op](src_freq, y) for src_freq, (op, y) in zip(src_freq_list, y_list)
+    ]
+    all_match_y = np.all(match_conditions, axis=0)
+    if not anc_allele_available:
+        # Check if all source populations match `1 - y`
+        match_conditions_1_minus_y = [
+            op_funcs[op](src_freq, 1 - y)
+            for src_freq, (op, y) in zip(src_freq_list, y_list)
+        ]
+        all_match_1_minus_y = np.all(match_conditions_1_minus_y, axis=0)
+        all_match = all_match_y | all_match_1_minus_y
+        # Identify loci where all sources match `1 - y` for frequency inversion
+        inverted = all_match_1_minus_y
+        # Invert frequencies for these loci
+        ref_freq[inverted] = 1 - ref_freq[inverted]
+        tgt_freq[inverted] = 1 - tgt_freq[inverted]
+    else:
+        all_match = all_match_y
+    # Final condition: locus must satisfy source matching and have `ref_freq < w`
+    condition = all_match & (ref_freq < w)
+    return ref_freq, tgt_freq, condition
+def calc_u(
+    ref_gts: np.ndarray,
+    tgt_gts: np.ndarray,
+    src_gts_list: list[np.ndarray],
+    pos: np.ndarray,
+    w: float,
+    x: float,
+    y_list: list[float],
+    ploidy: int = 1,
+    anc_allele_available: bool = False,
+) -> tuple[int, np.ndarray]:
+    """
+    Calculates the count of genetic loci that meet specified allele frequency conditions
+    across reference, target, and multiple source genotypes, with adjustments based on src_freq consistency.
+    Parameters
+    ----------
+    ref_gts : np.ndarray
+        A 2D numpy array where each row represents a locus and each column represents an individual in the reference group.
+    tgt_gts : np.ndarray
+        A 2D numpy array where each row represents a locus and each column represents an individual in the target group.
+    src_gts_list : list of np.ndarray
+        A list of 2D numpy arrays for each source population, where each row represents a locus and each column
+        represents an individual in that source population.
+    pos : np.ndarray
+        A 1D numpy array where each element represents the genomic position.
+    w : float
+        Threshold for the allele frequency in `ref_gts`. Only loci with frequencies less than `w` are counted.
+        Must be within the range [0, 1].
+    x : float
+        Threshold for the allele frequency in `tgt_gts`. Only loci with frequencies greater than `x` are counted.
+        Must be within the range [0, 1].
+    y_list : list of float
+        List of exact allele frequency thresholds for each source population in `src_gts_list`.
+        Must be within the range [0, 1] and have the same length as `src_gts_list`.
+    ploidy : int, optional
+        The ploidy level of the organism. Default is 1, which assumes phased data.
+    anc_allele_available : bool
+        If True, checks only for matches with `y` (assuming `1` represents the derived allele).
+        If False, checks both matches with `y` and `1 - y`, taking the major allele in the source as the reference.
+    Returns
+    -------
+    tuple[int, np.ndarray]
+        - The count of loci that meet all specified frequency conditions.
+        - A 1D numpy array containing the genomic positions of the loci that meet the conditions.
+    Raises
+    ------
+    ValueError
+        If `x` is outside the range [0, 1].
+    """
+    # Validate input parameters
+    if not (0 <= x <= 1):
+        raise ValueError("Parameter x must be within the range [0, 1].")
+    ref_freq, tgt_freq, condition = compute_matching_loci(
+        ref_gts,
+        tgt_gts,
+        src_gts_list,
+        w,
+        y_list,
+        ploidy,
+        anc_allele_available,
+    )
+    # Apply final conditions
+    condition &= tgt_freq > x
+    loci_indices = np.where(condition)[0]
+    loci_positions = pos[loci_indices]
+    count = loci_indices.size
+    # Return count of matching loci
+    return count, loci_positions
+def calc_q(
+    ref_gts: np.ndarray,
+    tgt_gts: np.ndarray,
+    src_gts_list: list[np.ndarray],
+    pos: np.ndarray,
+    w: float,
+    y_list: list[float],
+    quantile: float = 0.95,
+    ploidy: int = 1,
+    anc_allele_available: bool = False,
+) -> float:
+    """
+    Calculates a specified quantile of derived allele frequencies in `tgt_gts` for loci that meet specific conditions
+    across reference and multiple source genotypes, with adjustments based on src_freq consistency.
+    Parameters
+    ----------
+    ref_gts : np.ndarray
+        A 2D numpy array where each row represents a locus and each column represents an individual in the reference group.
+    tgt_gts : np.ndarray
+        A 2D numpy array where each row represents a locus and each column represents an individual in the target group.
+    src_gts_list : list of np.ndarray
+        A list of 2D numpy arrays for each source population, where each row represents a locus and each column
+        represents an individual in that source population.
+    pos: np.ndarray
+        A 1D numpy array where each element represents the genomic position.
+    w : float
+        Frequency threshold for the derived allele in `ref_gts`. Only loci with frequencies lower than `w` are included.
+        Must be within the range [0, 1].
+    y_list : list of float
+        List of exact frequency thresholds for each source population in `src_gts_list`.
+        Must be within the range [0, 1] and have the same length as `src_gts_list`.
+    quantile : float, optional
+        The quantile to compute for the filtered `tgt_gts` frequencies. Must be within the range [0, 1].
+        Default is 0.95 (95% quantile).
+    ploidy : int, optional
+        The ploidy level of the organism. Default is 1, which assumes phased data.
+    anc_allele_available : bool
+        If True, checks only for matches with `y` (assuming `1` represents the derived allele).
+        If False, checks both matches with `y` and `1 - y`, taking the major allele in the source as the reference.
+    Returns
+    -------
+    tuple[float, np.ndarray]
+        - The specified quantile of the derived allele frequencies in `tgt_gts` for loci meeting the specified conditions,
+          or NaN if no loci meet the criteria.
+        - A 1D numpy array containing the genomic positions of the loci that meet the conditions.
+    Raises
+    ------
+    ValueError
+        If `quantile` is outside the range [0, 1].
+    """
+    # Validate input parameters
+    if not (0 <= quantile <= 1):
+        raise ValueError("Parameter quantile must be within the range [0, 1].")
+    ref_freq, tgt_freq, condition = compute_matching_loci(
+        ref_gts,
+        tgt_gts,
+        src_gts_list,
+        w,
+        y_list,
+        ploidy,
+        anc_allele_available,
+    )
+    # Filter `tgt_gts` frequencies based on the combined condition
+    filtered_tgt_freq = tgt_freq[condition]
+    filtered_positions = pos[condition]
+    # Return NaN if no loci meet the criteria
+    if filtered_tgt_freq.size == 0:
+        return np.nan, np.array([])
+    threshold = np.nanquantile(filtered_tgt_freq, quantile)
+    loci_positions = filtered_positions[filtered_tgt_freq >= threshold]
+    # Calculate and return the specified quantile of the filtered `tgt_gts` frequencies
+    return threshold, loci_positions

sai/utils/__init__.py ADDED Viewed

@@ -0,0 +1,22 @@
+# Copyright 2025 Xin Huang
+#
+# GNU General Public License v3.0
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, please see
+#
+#    https://www.gnu.org/licenses/gpl-3.0.en.html
+from .genomic_dataclasses import *
+from .utils import *

sai/utils/generators/__init__.py ADDED Viewed

@@ -0,0 +1,23 @@
+# Copyright 2025 Xin Huang
+#
+# GNU General Public License v3.0
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, please see
+#
+#    https://www.gnu.org/licenses/gpl-3.0.en.html
+from .data_generator import DataGenerator
+from .chunk_generator import ChunkGenerator
+from .window_generator import WindowGenerator

sai/utils/generators/chunk_generator.py ADDED Viewed

@@ -0,0 +1,148 @@
+# Copyright 2025 Xin Huang
+#
+# GNU General Public License v3.0
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, please see
+#
+#    https://www.gnu.org/licenses/gpl-3.0.en.html
+import pysam
+from typing import Iterator
+from sai.utils import split_genome
+from sai.utils.generators import DataGenerator
+class ChunkGenerator(DataGenerator):
+    """
+    Generates genome chunks from VCF windows for parallel processing.
+    This class splits genomic windows into non-overlapping chunks assigned to workers,
+    based on the VCF file length and a user-defined window and step size.
+    """
+    def __init__(
+        self,
+        vcf_file: str,
+        chr_name: str,
+        step_size: int,
+        window_size: int,
+        num_chunks: int,
+    ):
+        """
+        Initializes a new instance of ChunkGenerator.
+        Parameters
+        ----------
+        vcf_file : str
+            Path to the VCF file to process.
+        chr_name: str
+            Name of the chromosome to process.
+        step_size : int
+            Step size for generating windows.
+        window_size : int
+            Window size for generating windows.
+        num_chunks : int
+            Number of chunks to split the windows among.
+        Raises
+        ------
+        ValueError
+            If the specified chromosome is not found in the VCF file.
+        """
+        with pysam.VariantFile(vcf_file) as vcf:
+            first_pos = last_pos = None
+            for rec in vcf:
+                if rec.chrom != chr_name:
+                    if first_pos is not None:
+                        break
+                    continue
+                if first_pos is None:
+                    first_pos = rec.pos
+                last_pos = rec.pos
+        if first_pos is None:
+            raise ValueError(f"Chromosome {chr_name} not found in VCF.")
+        windows = split_genome([first_pos, last_pos], window_size, step_size)
+        self.chunks = self._split_windows_ranges(windows, num_chunks)
+        self.num_chunks = len(self.chunks)
+        self.chr_name = chr_name
+    def get(self) -> Iterator[tuple[str, int, int]]:
+        """
+        Yields a tuple representing the chunk assigned to each worker.
+        Yields
+        ------
+        tuple of int
+            A tuple representing the range (chr_name, start, end) assigned to each worker.
+        """
+        for chunk in self.chunks:
+            yield {
+                "chr_name": self.chr_name,
+                "start": chunk[0],
+                "end": chunk[1],
+            }
+    def __len__(self) -> int:
+        """
+        Returns the number of chunks.
+        Returns
+        -------
+        int
+            Number of chunks.
+        """
+        return self.num_chunks
+    def _split_windows_ranges(self, windows: list, num_chunks: int) -> list:
+        """
+        Splits the list of windows into ranges assigned to each chunk.
+        Each range is defined by the first window's start and the last window's end
+        within that split.
+        Parameters
+        ----------
+        windows : list of tuple
+            List of (start, end) tuples representing windows.
+        num_chunks : int
+            Number of chunks to divide the windows among.
+        Returns
+        -------
+        list of tuple
+            List of (start, end) tuples representing the ranges for each chunk.
+        """
+        avg = len(windows) // num_chunks
+        remainder = len(windows) % num_chunks
+        result = []
+        start_idx = 0
+        prev_end = None
+        for i in range(num_chunks):
+            end_idx = start_idx + avg + (1 if i < remainder else 0)
+            sub = windows[start_idx:end_idx]
+            if sub:
+                min_start = sub[0][0]
+                max_end = sub[-1][1]
+                if (prev_end is not None) and (min_start < prev_end):
+                    min_start = prev_end + 1
+                result.append((min_start, max_end))
+                prev_end = max_end
+            start_idx = end_idx
+        return result

sai/utils/generators/data_generator.py ADDED Viewed

@@ -0,0 +1,49 @@
+# Copyright 2025 Xin Huang
+#
+# GNU General Public License v3.0
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, please see
+#
+#    https://www.gnu.org/licenses/gpl-3.0.en.html
+from abc import ABC, abstractmethod
+class DataGenerator(ABC):
+    """
+    Abstract base class for generating data.
+    This class defines a common interface for data generation. Subclasses
+    must implement the get method to generate data according to specific
+    requirements or configurations provided via keyword arguments.
+    """
+    @abstractmethod
+    def get(self, **kwargs):
+        """
+        Generates data based on the provided keyword arguments.
+        Subclasses should implement this method to generate and return data
+        according to the requirements described by the keyword arguments.
+        Parameters:
+        **kwargs: Arbitrary keyword arguments specific to the data generation
+        implementation in subclasses.
+        Returns:
+        The generated data, the format and type of which are determined by the
+        subclass implementation.
+        """
+        pass