sai-pg 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,241 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html
19
+
20
+
21
+ import argparse
22
+ import re
23
+ from sai.parsers.argument_validation import positive_int
24
+ from sai.parsers.argument_validation import existed_file
25
+ from sai.parsers.argument_validation import between_zero_and_one
26
+ from sai.parsers.argument_validation import validate_stat_type
27
+ from sai.sai import score
28
+ from sai.utils.utils import parse_ind_file
29
+
30
+
31
def _run_score(args: argparse.Namespace) -> None:
    """
    Executes the score command using arguments parsed from the command line.

    Parameters
    ----------
    args : argparse.Namespace
        Namespace produced by the `score` subcommand parser. Expected
        attributes: `vcf`, `chr_name`, `ref`, `tgt`, `src`, `win_len`,
        `win_step`, `anc_alleles`, `w`, `y`, `output`, and `stat`.
        `args.y` is a list of (operator, threshold) tuples — one per
        source population — as produced by `_parse_y_thresholds`.

    Raises
    ------
    ValueError
        If the number of conditions in `args.y` does not match the number
        of source populations found in the file given by `args.src`.
    """
    source_groups = parse_ind_file(args.src)
    n_sources = len(source_groups)

    # One frequency condition is required per source population.
    if len(args.y) != n_sources:
        raise ValueError(
            f"The length of y ({len(args.y)}) does not match the number of source populations ({n_sources}) found in {args.src}."
        )

    score(
        vcf_file=args.vcf,
        chr_name=args.chr_name,
        ref_ind_file=args.ref,
        tgt_ind_file=args.tgt,
        src_ind_file=args.src,
        win_len=args.win_len,
        win_step=args.win_step,
        num_src=n_sources,
        anc_allele_file=args.anc_alleles,
        w=args.w,
        y=args.y,
        output_file=args.output,
        stat_type=args.stat,
        num_workers=1,  # this CLI entry point always runs single-process
    )
97
+
98
+
99
+ def _parse_y_thresholds(value: str) -> tuple[str, float]:
100
+ """
101
+ Parses the --y parameter value to extract an operator and a numerical threshold.
102
+
103
+ This function ensures that the input is correctly formatted as one of the following:
104
+ - `=X` (equality condition)
105
+ - `>X` (greater than condition)
106
+ - `<X` (less than condition)
107
+ - `>=X` (greater than or equal to condition)
108
+ - `<=X` (less than or equal to condition)
109
+
110
+ The numerical value `X` must be within the range [0, 1].
111
+
112
+ Parameters
113
+ ----------
114
+ value : str
115
+ A string representing the allele frequency threshold condition, e.g., "=0.7", ">0.8", "<=0.2".
116
+
117
+ Returns
118
+ -------
119
+ tuple[str, float]
120
+ A tuple containing:
121
+ - A string representing the comparison operator (`=`, `<`, `>`, `<=`, `>=`).
122
+ - A float representing the threshold value.
123
+
124
+ Raises
125
+ ------
126
+ argparse.ArgumentTypeError
127
+ If the input format is invalid or the numerical threshold is outside the range [0, 1].
128
+ """
129
+ match = re.match(r"^(=|<|>|<=|>=)(\d*\.?\d+)$", value)
130
+ if not match:
131
+ raise argparse.ArgumentTypeError(
132
+ f"Invalid format for --y: {value}. Must be in the form =X, >X, <X, >=X, or <=X "
133
+ f"(e.g., =0.7, >0.8, <0.1, >=0.5, <=0.2)."
134
+ )
135
+
136
+ operator, num_str = match.groups()
137
+ num = float(num_str)
138
+
139
+ if not (0 <= num <= 1):
140
+ raise argparse.ArgumentTypeError(
141
+ f"Value for --y must be between 0 and 1, got {num}."
142
+ )
143
+
144
+ return operator, num
145
+
146
+
147
def add_score_parser(subparsers: argparse.ArgumentParser) -> None:
    """
    Initializes and configures the command-line interface parser
    for the score subcommand.

    Registers the `score` subcommand, declares all of its arguments, and
    wires the subcommand to `_run_score` via `set_defaults(runner=...)`.

    Parameters
    ----------
    subparsers : argparse.ArgumentParser
        A command-line interface parser to be configured.
        NOTE(review): this is actually the object returned by
        `ArgumentParser.add_subparsers()` (an `argparse._SubParsersAction`),
        not an `ArgumentParser` itself — confirm and consider retyping.
    """
    parser = subparsers.add_parser(
        "score", help="Run the score command based on specified parameters."
    )
    # Required input files; `existed_file` validates the path at parse time.
    parser.add_argument(
        "--vcf",
        type=existed_file,
        required=True,
        help="Path to the VCF file containing variant data.",
    )
    parser.add_argument(
        "--chr-name",
        dest="chr_name",
        type=str,
        required=True,
        help="Chromosome name to analyze from the VCF file.",
    )
    parser.add_argument(
        "--ref",
        type=existed_file,
        required=True,
        help="Path to the file with reference population identifiers.",
    )
    parser.add_argument(
        "--tgt",
        type=existed_file,
        required=True,
        help="Path to the file with target population identifiers.",
    )
    parser.add_argument(
        "--src",
        type=existed_file,
        required=True,
        help="Path to the file with source population identifiers.",
    )
    # Sliding-window geometry; `positive_int` rejects zero/negative values.
    parser.add_argument(
        "--win-len",
        dest="win_len",
        type=positive_int,
        default=50000,
        help="Length of each genomic window in base pairs. Default: 50,000.",
    )
    parser.add_argument(
        "--win-step",
        dest="win_step",
        type=positive_int,
        default=10000,
        help="Step size in base pairs between consecutive windows. Default: 10,000.",
    )
    parser.add_argument(
        "--anc-alleles",
        dest="anc_alleles",
        type=existed_file,
        default=None,
        help="Path to the BED file with ancestral allele information. If ancestral allele information is not provided, filtering will be performed for each variant based on whether the allele frequency of any allele (assuming biallelic) meets the specified condition during the calculation of the statistics. Default: None.",
    )
    # Allele frequency thresholds: `--w` for the reference group,
    # `--y` for each source population (parsed into (operator, value) tuples).
    parser.add_argument(
        "--w",
        type=between_zero_and_one,
        default=0.01,
        help="Frequency threshold for variants in the reference population; only variants with frequencies below this threshold are included in the analysis. Default: 0.01.",
    )
    parser.add_argument(
        "--y",
        type=_parse_y_thresholds,
        nargs="+",
        default=[("=", 1.0)],
        help="List of allele frequency conditions for the source populations. "
        "Each value must be in the form =X, >X, <X, >=X, or <=X "
        "(e.g., =0.7, >0.8, <0.1, >=0.5, <=0.2). "
        "The number of values must match the number of source populations in the file specified by `--src`; "
        "the order of the allele frequency conditions should also correspond to the order of source populations in that file. Default: =1",
    )
    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="Output file path for saving results.",
    )
    parser.add_argument(
        "--stat",
        type=validate_stat_type,
        required=True,
        help="Type of statistic to compute: UXX or QXX, where XX is a percentage-like index indicating a threshold in the target population. For example, `U50` means the allele frequency is greater than 0.5, and `Q95` means the allele frequency is greater than or equal to the 95th percentile among sites meeting the specified conditions.",
    )
    # Dispatch: the main CLI looks up `args.runner` and calls it.
    parser.set_defaults(runner=_run_score)
sai/sai.py ADDED
@@ -0,0 +1,315 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html
19
+
20
+
21
+ import os
22
+ import warnings
23
+ import pandas as pd
24
+ import matplotlib.pyplot as plt
25
+ from matplotlib.ticker import MaxNLocator
26
+ from sai.utils.generators import ChunkGenerator
27
+ from sai.utils.preprocessors import ChunkPreprocessor
28
+ from sai.utils.utils import natsorted_df
29
+
30
+
31
def score(
    vcf_file: str,
    chr_name: str,
    ref_ind_file: str,
    tgt_ind_file: str,
    src_ind_file: str,
    win_len: int,
    win_step: int,
    num_src: int,
    anc_allele_file: str,
    w: float,
    y: list[tuple[str, float]],
    output_file: str,
    stat_type: str,
    num_workers: int,
) -> None:
    """
    Processes and scores genomic data by generating windowed data and feature vectors.

    Parameters
    ----------
    vcf_file : str
        Path to the VCF file containing variant data.
    chr_name : str
        The chromosome name to be analyzed from the VCF file.
    ref_ind_file : str
        Path to the file containing reference population identifiers.
    tgt_ind_file : str
        Path to the file containing target population identifiers.
    src_ind_file : str
        Path to the file containing source population identifiers.
    win_len : int
        Length of each genomic window in base pairs.
    win_step : int
        Step size in base pairs between consecutive windows.
    num_src : int
        Number of source populations to include in each windowed analysis.
    anc_allele_file : str
        Path to the file containing ancestral allele information.
    w : float
        Frequency threshold for calculating feature vectors.
    y : list[tuple[str, float]]
        List of (operator, threshold) allele frequency conditions for the
        source populations, e.g., [("=", 1.0), (">", 0.8)]; one entry per
        source population. (The body unpacks each entry as `op, val`, so a
        plain list of floats is not accepted.)
    output_file : str
        File path to save the output of processed feature vectors.
    stat_type: str
        Specifies the type of statistic to compute.
    num_workers : int
        Number of parallel processes for multiprocessing.
    """
    generator = ChunkGenerator(
        vcf_file=vcf_file,
        chr_name=chr_name,
        window_size=win_len,
        step_size=win_step,
        # Oversubscribe chunks relative to workers for better load balancing.
        num_chunks=num_workers * 8,
    )

    preprocessor = ChunkPreprocessor(
        vcf_file=vcf_file,
        ref_ind_file=ref_ind_file,
        tgt_ind_file=tgt_ind_file,
        src_ind_file=src_ind_file,
        win_len=win_len,
        win_step=win_step,
        w=w,
        y=y,
        output_file=output_file,
        stat_type=stat_type,
        anc_allele_file=anc_allele_file,
        num_src=num_src,
    )

    # Header column for the statistic records the thresholds used,
    # e.g. "Q95(w<0.01,y=(=1.0))".
    y_desc = ",".join(f"{op}{val}" for op, val in y)
    header = (
        f"Chrom\tStart\tEnd\tRef\tTgt\tSrc\tN(Variants)\t"
        f"{stat_type}(w<{w},y=({y_desc}))\tCandidate\n"
    )

    # Create the output directory if needed, then truncate/write the header;
    # downstream processing appends result rows to this file.
    directory = os.path.dirname(output_file)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(output_file, "w") as f:
        f.write(header)

    items = []

    for params in generator.get():
        items.extend(preprocessor.run(**params))

    preprocessor.process_items(items)
118
+
119
+
120
def outlier(score_file: str, output: str, quantile: float) -> None:
    """
    Writes rows whose statistic exceeds the given quantile to `output`.

    The statistic column is taken to be the second-to-last column of the
    input (the last column is 'Candidate'). Rows are sorted naturally by
    'Chrom', 'Start', 'End' before writing.

    Parameters
    ----------
    score_file : str
        Path to the input file, in tab-separated format.
    output : str
        Path to the output file.
    quantile : float
        Quantile threshold used to filter rows.
    """
    scores = pd.read_csv(
        score_file,
        sep="\t",
        na_values=["nan"],
        dtype={"Candidate": str},
        index_col=False,
    )

    # Second-to-last column holds the statistic (e.g., "U50(...)" or "Q95(...)").
    stat_col = scores.columns[-2]
    scores[stat_col] = pd.to_numeric(scores[stat_col], errors="coerce")

    cutoff = scores[stat_col].quantile(quantile)

    if scores[stat_col].nunique() == 1:
        # A constant column makes quantile filtering meaningless: warn and
        # emit an empty result rather than returning arbitrary rows.
        warnings.warn(
            f"Column '{stat_col}' contains only one unique value ({cutoff}), making quantile filtering meaningless.",
            UserWarning,
        )
        selected = pd.DataFrame(columns=scores.columns)
    else:
        # For Q statistics saturated at 1, keep rows equal to the cutoff;
        # otherwise require strictly greater values.
        if cutoff == 1 and stat_col.startswith("Q"):
            selected = scores[scores[stat_col] >= cutoff]
        else:
            selected = scores[scores[stat_col] > cutoff]

    if selected.empty:
        ordered = selected
    else:
        ordered = natsorted_df(selected.reset_index(drop=True))

    # Stringify every column so the output formatting is uniform.
    ordered.astype(str).to_csv(output, index=False, sep="\t")
174
+
175
+
176
def plot(
    u_file: str,
    q_file: str,
    output: str,
    xlabel: str,
    ylabel: str,
    title: str,
    figsize_x: float = 6,
    figsize_y: float = 6,
    dpi: int = 300,
    alpha: float = 0.6,
    marker_size: float = 20,
    marker_color: str = "blue",
    marker_style: str = "o",
) -> None:
    """
    Reads two score/outlier files (U and Q), finds common candidate positions, and plots U vs. Q.

    Also writes a companion table of the overlapping windows (with their U
    and Q values and any shared candidate variants) next to the plot, as
    `<output without extension>.overlap.tsv`.

    Parameters
    ----------
    u_file : str
        Path to the input file containing U score/outlier data.
    q_file : str
        Path to the input file containing Q score/outlier data.
    output : str
        Path to save the output plot.
    xlabel : str
        Label for the X-axis.
    ylabel : str
        Label for the Y-axis.
    title : str
        Title of the plot.
    figsize_x : float, optional
        Width of the figure (default: 6).
    figsize_y : float, optional
        Height of the figure (default: 6).
    dpi : int, optional
        Resolution of the saved plot (default: 300).
    alpha : float, optional
        Transparency level of scatter points (default: 0.6).
    marker_size : float, optional
        Size of the scatter plot markers (default: 20).
    marker_color : str, optional
        Color of the markers (default: "blue").
    marker_style : str, optional
        Shape of the marker (default: "o").

    Raises
    ------
    ValueError
        If the two files share no genomic interval.
    """
    u_data = pd.read_csv(u_file, sep="\t")
    q_data = pd.read_csv(q_file, sep="\t")

    # The statistic column is second-to-last; the last column is "Candidate"
    # (matching the layout written by `score`/`outlier`).
    u_column = u_data.columns[-2]
    q_column = q_data.columns[-2]

    # Build "chrom:start-end" keys so windows can be matched across files.
    u_data["interval"] = (
        u_data["Chrom"].astype(str)
        + ":"
        + u_data["Start"].astype(str)
        + "-"
        + u_data["End"].astype(str)
    )
    q_data["interval"] = (
        q_data["Chrom"].astype(str)
        + ":"
        + q_data["Start"].astype(str)
        + "-"
        + q_data["End"].astype(str)
    )

    # Drop windows whose statistic is missing/non-numeric.
    u_data[u_column] = pd.to_numeric(u_data[u_column], errors="coerce")
    q_data[q_column] = pd.to_numeric(q_data[q_column], errors="coerce")
    u_data = u_data.dropna(subset=[u_column])
    q_data = q_data.dropna(subset=[q_column])

    # Interval -> statistic value; if an interval appears more than once,
    # the last row wins.
    u_interval_dict = {row["interval"]: row[u_column] for _, row in u_data.iterrows()}
    q_interval_dict = {row["interval"]: row[q_column] for _, row in q_data.iterrows()}
    # Interval -> set of candidate IDs (the "Candidate" column is assumed to
    # be comma-separated — TODO confirm against the writer).
    u_candidate_dict = {
        row["interval"]: set(str(row["Candidate"]).split(","))
        for _, row in u_data.iterrows()
    }
    q_candidate_dict = {
        row["interval"]: set(str(row["Candidate"]).split(","))
        for _, row in q_data.iterrows()
    }

    common_intervals = set(u_interval_dict.keys()) & set(q_interval_dict.keys())
    if not common_intervals:
        raise ValueError(
            "No common genomic intervals found between U and Q score/outlier files."
        )

    # Helper: candidate IDs present in both windows, joined by commas,
    # or "NA" if there is no overlap.
    def get_candidate_overlap(interval):
        u_set = u_candidate_dict.get(interval, set())
        q_set = q_candidate_dict.get(interval, set())
        overlap = sorted(u_set & q_set)
        return ",".join(overlap) if overlap else "NA"

    # One row per shared window. All list comprehensions iterate the same
    # set object within this call, so the columns stay aligned.
    overlap_df = pd.DataFrame(
        {
            "Chrom": [interval.split(":")[0] for interval in common_intervals],
            "Start": [
                int(interval.split(":")[1].split("-")[0])
                for interval in common_intervals
            ],
            "End": [
                int(interval.split(":")[1].split("-")[1])
                for interval in common_intervals
            ],
            u_column: [u_interval_dict[c] for c in common_intervals],
            q_column: [q_interval_dict[c] for c in common_intervals],
            "Overlapping Candidate": [
                get_candidate_overlap(c) for c in common_intervals
            ],
        }
    )

    # Save the naturally sorted overlap table alongside the plot.
    overlap_df_sorted = natsorted_df(overlap_df)
    overlap_output = os.path.splitext(output)[0] + ".overlap.tsv"
    pd.DataFrame(overlap_df_sorted).to_csv(overlap_output, sep="\t", index=False)

    # Scatter Q (x) against U (y); U counts are integers, hence the
    # integer-only y-axis locator.
    plt.figure(figsize=(figsize_x, figsize_y))
    plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
    plt.scatter(
        x=overlap_df[q_column],
        y=overlap_df[u_column],
        alpha=alpha,
        s=marker_size,
        c=marker_color,
        marker=marker_style,
    )
    # Clamp both axes to start at zero or above (statistics are non-negative).
    xmin, xmax = plt.gca().get_xlim()
    ymin, ymax = plt.gca().get_ylim()
    plt.xlim(left=max(0, xmin))
    plt.ylim(bottom=max(0, ymin))
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(alpha=0.5, linestyle="--")
    plt.savefig(output, bbox_inches="tight", dpi=dpi)
    plt.close()
sai/stats/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html