sai-pg 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sai/__init__.py +2 -0
- sai/__main__.py +6 -3
- sai/configs/__init__.py +24 -0
- sai/configs/global_config.py +83 -0
- sai/configs/ploidy_config.py +94 -0
- sai/configs/pop_config.py +82 -0
- sai/configs/stat_config.py +220 -0
- sai/{utils/generators → generators}/chunk_generator.py +2 -8
- sai/{utils/generators → generators}/window_generator.py +82 -37
- sai/{utils/multiprocessing → multiprocessing}/mp_manager.py +2 -2
- sai/{utils/multiprocessing → multiprocessing}/mp_pool.py +2 -2
- sai/parsers/outlier_parser.py +4 -3
- sai/parsers/score_parser.py +8 -119
- sai/{utils/preprocessors → preprocessors}/chunk_preprocessor.py +21 -15
- sai/preprocessors/feature_preprocessor.py +236 -0
- sai/registries/__init__.py +22 -0
- sai/registries/generic_registry.py +89 -0
- sai/registries/stat_registry.py +30 -0
- sai/sai.py +124 -220
- sai/stats/__init__.py +11 -0
- sai/stats/danc_statistic.py +83 -0
- sai/stats/dd_statistic.py +77 -0
- sai/stats/df_statistic.py +84 -0
- sai/stats/dplus_statistic.py +86 -0
- sai/stats/fd_statistic.py +92 -0
- sai/stats/generic_statistic.py +93 -0
- sai/stats/q_statistic.py +104 -0
- sai/stats/stat_utils.py +259 -0
- sai/stats/u_statistic.py +99 -0
- sai/utils/utils.py +220 -143
- {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/METADATA +3 -14
- sai_pg-1.1.0.dist-info/RECORD +70 -0
- {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/WHEEL +1 -1
- sai_pg-1.1.0.dist-info/top_level.txt +2 -0
- tests/configs/test_global_config.py +163 -0
- tests/configs/test_ploidy_config.py +93 -0
- tests/configs/test_pop_config.py +90 -0
- tests/configs/test_stat_config.py +171 -0
- tests/generators/test_chunk_generator.py +51 -0
- tests/generators/test_window_generator.py +164 -0
- tests/multiprocessing/test_mp_manager.py +92 -0
- tests/multiprocessing/test_mp_pool.py +79 -0
- tests/parsers/test_argument_validation.py +133 -0
- tests/parsers/test_outlier_parser.py +53 -0
- tests/parsers/test_score_parser.py +63 -0
- tests/preprocessors/test_chunk_preprocessor.py +79 -0
- tests/preprocessors/test_feature_preprocessor.py +223 -0
- tests/registries/test_registries.py +74 -0
- tests/stats/test_danc_statistic.py +51 -0
- tests/stats/test_dd_statistic.py +45 -0
- tests/stats/test_df_statistic.py +73 -0
- tests/stats/test_dplus_statistic.py +79 -0
- tests/stats/test_fd_statistic.py +68 -0
- tests/stats/test_q_statistic.py +268 -0
- tests/stats/test_stat_utils.py +354 -0
- tests/stats/test_u_statistic.py +233 -0
- tests/test___main__.py +51 -0
- tests/test_sai.py +102 -0
- tests/utils/test_utils.py +511 -0
- sai/parsers/plot_parser.py +0 -152
- sai/stats/features.py +0 -302
- sai/utils/preprocessors/feature_preprocessor.py +0 -211
- sai_pg-1.0.0.dist-info/RECORD +0 -30
- sai_pg-1.0.0.dist-info/top_level.txt +0 -1
- /sai/{utils/generators → generators}/__init__.py +0 -0
- /sai/{utils/generators → generators}/data_generator.py +0 -0
- /sai/{utils/multiprocessing → multiprocessing}/__init__.py +0 -0
- /sai/{utils/preprocessors → preprocessors}/__init__.py +0 -0
- /sai/{utils/preprocessors → preprocessors}/data_preprocessor.py +0 -0
- {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/entry_points.txt +0 -0
- {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/licenses/LICENSE +0 -0
sai/sai.py
CHANGED
@@ -20,28 +20,23 @@
 
 import os
 import warnings
+import yaml
 import pandas as pd
-
-from
-from sai.
-from sai.
+from pathlib import Path
+from sai.generators import ChunkGenerator
+from sai.preprocessors import ChunkPreprocessor
+from sai.configs import GlobalConfig
 from sai.utils.utils import natsorted_df
 
 
 def score(
     vcf_file: str,
     chr_name: str,
-    ref_ind_file: str,
-    tgt_ind_file: str,
-    src_ind_file: str,
     win_len: int,
     win_step: int,
-    num_src: int,
     anc_allele_file: str,
-    w: float,
-    y: list[float],
     output_file: str,
-
+    config: str,
     num_workers: int,
 ) -> None:
     """
@@ -53,55 +48,76 @@ def score(
         Path to the VCF file containing variant data.
     chr_name : str
         The chromosome name to be analyzed from the VCF file.
-    ref_ind_file : str
-        Path to the file containing reference population identifiers.
-    tgt_ind_file : str
-        Path to the file containing target population identifiers.
-    src_ind_file : str
-        Path to the file containing source population identifiers.
     win_len : int
         Length of each genomic window in base pairs.
     win_step : int
         Step size in base pairs between consecutive windows.
-    num_src : int
-        Number of source populations to include in each windowed analysis.
     anc_allele_file : str
        Path to the file containing ancestral allele information.
-    w : float
-        Frequency threshold for calculating feature vectors.
-    y : list[float]
-        List of frequency thresholds used for various calculations in feature vector processing.
     output_file : str
         File path to save the output of processed feature vectors.
-
-
+    config : str
+        Path to the YAML configuration file specifying the statistics and ploidies to compute.
     num_workers : int
         Number of parallel processes for multiprocessing.
     """
+    try:
+        with open(config, "r") as f:
+            config_dict = yaml.safe_load(f)
+    except FileNotFoundError:
+        raise FileNotFoundError(f"Configuration file '{config}' not found.")
+    except yaml.YAMLError as e:
+        raise ValueError(f"Error parsing YAML configuration file '{config}': {e}")
+
+    required_fields = ["statistics", "ploidies", "populations"]
+    missing_fields = [field for field in required_fields if field not in config_dict]
+
+    if missing_fields:
+        raise ValueError(
+            f"Missing required fields in configuration file '{config}': {', '.join(missing_fields)}"
+        )
+
+    global_config = GlobalConfig(**config_dict)
+
+    stat_config = global_config.statistics
+    ploidy_config = global_config.ploidies
+    pop_config = global_config.populations
+
     generator = ChunkGenerator(
         vcf_file=vcf_file,
         chr_name=chr_name,
         window_size=win_len,
         step_size=win_step,
-        num_chunks=num_workers * 8,
+        # num_chunks=num_workers * 8,
+        num_chunks=1,
     )
 
     preprocessor = ChunkPreprocessor(
         vcf_file=vcf_file,
-        ref_ind_file=
-        tgt_ind_file=
-        src_ind_file=
+        ref_ind_file=pop_config.get_population("ref"),
+        tgt_ind_file=pop_config.get_population("tgt"),
+        src_ind_file=pop_config.get_population("src"),
+        out_ind_file=pop_config.get_population("outgroup"),
         win_len=win_len,
         win_step=win_step,
-        w=w,
-        y=y,
         output_file=output_file,
-
+        ploidy_config=ploidy_config,
+        stat_config=stat_config,
         anc_allele_file=anc_allele_file,
-        num_src=num_src,
     )
 
-
+    src_pops = list(ploidy_config.root["src"].keys())
+
+    header_parts = ["Chrom", "Start", "End", "Ref", "Tgt", "Src", "N(Variants)"]
+
+    for stat_name in stat_config.root.keys():
+        if stat_name in ("U", "Q") or len(src_pops) <= 1:
+            header_parts.append(stat_name)
+        else:
+            for sp in src_pops:
+                header_parts.append(f"{stat_name}.{sp}")
+
+    header = "\t".join(header_parts) + "\n"
 
     directory = os.path.dirname(output_file)
     if directory:
@@ -109,6 +125,13 @@ def score(
     with open(output_file, "w") as f:
         f.write(header)
 
+    for key in ("U", "Q"):
+        if key in stat_config.root:
+            path = Path(output_file)
+            log_file = path.with_suffix(f".{key}.log")
+            with open(log_file, "w") as f:
+                f.write(f"Chrom\tStart\tEnd\t{key}_SNP\n")
+
     items = []
 
     for params in generator.get():
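A side note on the log-file naming above: `Path.with_suffix()` replaces the path's final suffix, so the per-statistic logs share the output file's stem rather than stacking suffixes. A quick standalone check:

```python
from pathlib import Path

# with_suffix() swaps the final suffix; since f".{key}.log" starts with a dot,
# "scores.tsv" becomes "scores.U.log", not "scores.tsv.U.log".
path = Path("results/scores.tsv")
for key in ("U", "Q"):
    print(path.with_suffix(f".{key}.log"))
# results/scores.U.log
# results/scores.Q.log
```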
@@ -117,199 +140,80 @@ def score(
     preprocessor.process_items(items)
 
 
-def outlier(score_file: str,
+def outlier(score_file: str, output_prefix: str, quantile: float) -> None:
     """
-
-
+    Identifies outlier windows for each statistic column in a score file and
+    writes them to separate output files.
+
+    This function reads a tab-delimited score file, determines which columns
+    contain statistics (e.g., U, Q, D+), computes the specified quantile
+    threshold for each statistic, and outputs rows exceeding that threshold.
+    Results for each statistic are written to a separate TSV file, sorted by
+    Chrom, Start, and End when available.
 
     Parameters
     ----------
     score_file : str
-        Path to the input file
-
-
+        Path to the input score file (tab-delimited).
+    output_prefix : str
+        Prefix for the output files. Each output file is named
+        "{output_prefix}.{stat}.{quantile}.outliers.tsv".
     quantile : float
-        Quantile threshold to
+        Quantile threshold (between 0 and 1) used to define outliers.
     """
-
-    data = pd.read_csv(
-        score_file,
-        sep="\t",
-        na_values=["nan"],
-        dtype={"Candidate": str},
-        index_col=False,
-    )
+    df = pd.read_csv(score_file, sep="\t", na_values=["nan"], index_col=False)
 
-
-
-
-
-
-    # Calculate quantile threshold for the chosen column
-    threshold = data[column].quantile(quantile)
-
-    if data[column].nunique() == 1:
-        warnings.warn(
-            f"Column '{column}' contains only one unique value ({threshold}), making quantile filtering meaningless.",
-            UserWarning,
-        )
-        outliers = pd.DataFrame(columns=data.columns)
-    elif (threshold == 1) and (column.startswith("Q")):
-        outliers = data[data[column] >= threshold]
-    else:
-        outliers = data[data[column] > threshold]
-
-    # Sort the filtered data by 'Chrom', 'Start', 'End' columns
-    if not outliers.empty:
-        outliers = outliers.reset_index(drop=True)
-        outliers_sorted = natsorted_df(outliers)
+    cols = list(df.columns)
+    if "N(Variants)" in cols:
+        start_idx = cols.index("N(Variants)") + 1
+        metric_cols = cols[start_idx:]
     else:
-    [... 48 removed lines (old 167-214) not rendered in this diff view ...]
-        Transparency level of scatter points (default: 0.6).
-    marker_size : float, optional
-        Size of the scatter plot markers (default: 20).
-    marker_color : str, optional
-        Color of the markers (default: "blue").
-    marker_style : str, optional
-        Shape of the marker (default: "o").
-    """
-    u_data = pd.read_csv(u_file, sep="\t")
-    q_data = pd.read_csv(q_file, sep="\t")
-
-    u_column = u_data.columns[-2]
-    q_column = q_data.columns[-2]
-
-    u_data["interval"] = (
-        u_data["Chrom"].astype(str)
-        + ":"
-        + u_data["Start"].astype(str)
-        + "-"
-        + u_data["End"].astype(str)
-    )
-    q_data["interval"] = (
-        q_data["Chrom"].astype(str)
-        + ":"
-        + q_data["Start"].astype(str)
-        + "-"
-        + q_data["End"].astype(str)
-    )
-
-    u_data[u_column] = pd.to_numeric(u_data[u_column], errors="coerce")
-    q_data[q_column] = pd.to_numeric(q_data[q_column], errors="coerce")
-    u_data = u_data.dropna(subset=[u_column])
-    q_data = q_data.dropna(subset=[q_column])
-
-    u_interval_dict = {row["interval"]: row[u_column] for _, row in u_data.iterrows()}
-    q_interval_dict = {row["interval"]: row[q_column] for _, row in q_data.iterrows()}
-    u_candidate_dict = {
-        row["interval"]: set(str(row["Candidate"]).split(","))
-        for _, row in u_data.iterrows()
-    }
-    q_candidate_dict = {
-        row["interval"]: set(str(row["Candidate"]).split(","))
-        for _, row in q_data.iterrows()
-    }
-
-    common_intervals = set(u_interval_dict.keys()) & set(q_interval_dict.keys())
-    if not common_intervals:
-        raise ValueError(
-            "No common genomic intervals found between U and Q score/outlier files."
+        # fallback: exclude common non-metric columns, keep numeric ones
+        non_metrics = {"Chrom", "Start", "End", "Ref", "Tgt", "Src"}
+        candidate = [c for c in cols if c not in non_metrics]
+        metric_cols = [
+            c for c in candidate if pd.to_numeric(df[c], errors="coerce").notna().any()
+        ]
+
+    if not metric_cols:
+        raise ValueError("No metric columns found.")
+
+    for col in metric_cols:
+        s_num = pd.to_numeric(df[col], errors="coerce").dropna()
+
+        if s_num.empty:
+            warnings.warn(
+                f"Column '{col}' has no numeric values; writing empty result.",
+                UserWarning,
+            )
+            out_sorted = pd.DataFrame(columns=df.columns)
+        elif s_num.nunique() == 1:
+            thr = s_num.iloc[0]
+            warnings.warn(
+                f"Column '{col}' has only one unique value ({thr}); writing empty result.",
+                UserWarning,
+            )
+            out_sorted = pd.DataFrame(columns=df.columns)
+        else:
+            thr = s_num.quantile(quantile)
+            col_num = pd.to_numeric(df[col], errors="coerce")
+            if not col.startswith("U"):
+                out = df[col_num >= thr]
+            else:
+                out = df[col_num > thr]
+
+            if not out.empty:
+                out = out.reset_index(drop=True)
+                try:
+                    out_sorted = natsorted_df(out)  # natural sort by Chrom/Start/End
+                except NameError:
+                    keys = [k for k in ("Chrom", "Start", "End") if k in out.columns]
+                    out_sorted = (
+                        out.sort_values(by=keys, kind="mergesort") if keys else out
+                    )
+            else:
+                out_sorted = out
+
+        out_sorted.astype(str).to_csv(
+            f"{output_prefix}.{col}.{quantile}.outliers.tsv", index=False, sep="\t"
        )
-
-    # Helper: get candidate overlap or "."
-    def get_candidate_overlap(interval):
-        u_set = u_candidate_dict.get(interval, set())
-        q_set = q_candidate_dict.get(interval, set())
-        overlap = sorted(u_set & q_set)
-        return ",".join(overlap) if overlap else "NA"
-
-    overlap_df = pd.DataFrame(
-        {
-            "Chrom": [interval.split(":")[0] for interval in common_intervals],
-            "Start": [
-                int(interval.split(":")[1].split("-")[0])
-                for interval in common_intervals
-            ],
-            "End": [
-                int(interval.split(":")[1].split("-")[1])
-                for interval in common_intervals
-            ],
-            u_column: [u_interval_dict[c] for c in common_intervals],
-            q_column: [q_interval_dict[c] for c in common_intervals],
-            "Overlapping Candidate": [
-                get_candidate_overlap(c) for c in common_intervals
-            ],
-        }
-    )
-
-    overlap_df_sorted = natsorted_df(overlap_df)
-    overlap_output = os.path.splitext(output)[0] + ".overlap.tsv"
-    pd.DataFrame(overlap_df_sorted).to_csv(overlap_output, sep="\t", index=False)
-
-    plt.figure(figsize=(figsize_x, figsize_y))
-    plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
-    plt.scatter(
-        x=overlap_df[q_column],
-        y=overlap_df[u_column],
-        alpha=alpha,
-        s=marker_size,
-        c=marker_color,
-        marker=marker_style,
-    )
-    xmin, xmax = plt.gca().get_xlim()
-    ymin, ymax = plt.gca().get_ylim()
-    plt.xlim(left=max(0, xmin))
-    plt.ylim(bottom=max(0, ymin))
-    plt.xlabel(xlabel)
-    plt.ylabel(ylabel)
-    plt.title(title)
-    plt.grid(alpha=0.5, linestyle="--")
-    plt.savefig(output, bbox_inches="tight", dpi=dpi)
-    plt.close()
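The rewritten `outlier()` keeps rows at or above the per-column quantile threshold, except for columns whose names start with `U`, which use a strict greater-than. A small worked example of the thresholding step:

```python
import pandas as pd

# pandas interpolates quantiles linearly: for these five scores the 0.99
# quantile sits between the two largest values, 0.4 + 0.96 * (1.0 - 0.4) ≈ 0.976.
s = pd.Series([0.1, 0.2, 0.3, 0.4, 1.0])
thr = s.quantile(0.99)
print(round(thr, 3))   # 0.976
print(s[s >= thr])     # only the 1.0 row survives (non-U columns)
print(s[s > thr])      # same result here; ties at the threshold would be dropped (U columns)
```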
sai/stats/__init__.py
CHANGED
@@ -16,3 +16,14 @@
 # along with this program. If not, please see
 #
 # https://www.gnu.org/licenses/gpl-3.0.en.html
+
+
+from .generic_statistic import GenericStatistic
+from .danc_statistic import DancStatistic
+from .dd_statistic import DdStatistic
+from .df_statistic import DfStatistic
+from .dplus_statistic import DplusStatistic
+from .fd_statistic import FdStatistic
+from .q_statistic import QStatistic
+from .u_statistic import UStatistic
+from .stat_utils import *
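The statistic modules below register their classes with `STAT_REGISTRY` via a decorator. The registry itself lives in the new `sai/registries/generic_registry.py` and `sai/registries/stat_registry.py`, which this excerpt does not show; the following is only a minimal sketch of the pattern, with every name and behavior assumed for illustration:

```python
from typing import Callable, Dict

class GenericRegistry:
    """Minimal name-to-class registry sketch; not the package's actual code."""

    def __init__(self) -> None:
        self._items: Dict[str, type] = {}

    def register(self, name: str) -> Callable[[type], type]:
        # Used as a class decorator: @STAT_REGISTRY.register("Danc")
        def decorator(cls: type) -> type:
            self._items[name] = cls
            return cls
        return decorator

    def get(self, name: str) -> type:
        return self._items[name]

STAT_REGISTRY = GenericRegistry()

@STAT_REGISTRY.register("Danc")
class DancStatistic:
    pass

assert STAT_REGISTRY.get("Danc") is DancStatistic
```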
sai/stats/danc_statistic.py
ADDED
@@ -0,0 +1,83 @@
+# Copyright 2025 Xin Huang
+#
+# GNU General Public License v3.0
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, please see
+#
+# https://www.gnu.org/licenses/gpl-3.0.en.html
+
+
+import numpy as np
+from typing import Dict, Any
+from sai.registries.stat_registry import STAT_REGISTRY
+from sai.stats import GenericStatistic
+from sai.stats.stat_utils import calc_four_pops_freq, calc_pattern_sum
+
+
+@STAT_REGISTRY.register("Danc")
+class DancStatistic(GenericStatistic):
+    """
+    Class for computing the Danc statistic (Fang et al. 2024. PLoS Genet).
+
+    The Danc statistic detects asymmetric ancestry contribution by comparing
+    excess BAAA and ABAA site patterns in a four-population framework.
+    """
+
+    STAT_NAME = "Danc"
+
+    def compute(self, **kwargs) -> Dict[str, Any]:
+        """
+        Computes the Danc statistic for each source population.
+
+        This method computes the statistic per source population using four-population
+        site pattern counts.
+
+        Parameters
+        ----------
+        **kwargs : dict
+            Unused. Present to maintain compatibility with the base class interface.
+
+        Returns
+        -------
+        dict
+            A dictionary containing:
+            - 'name' : str
+                The name of the statistic ("Danc").
+            - 'value' : list[float]
+                A list of Danc values, one for each source population.
+        """
+        danc_results = []
+
+        for i in range(len(self.src_gts_list)):
+            ref_freq, tgt_freq, src_freq, out_freq = calc_four_pops_freq(
+                ref_gts=self.ref_gts,
+                tgt_gts=self.tgt_gts,
+                src_gts=self.src_gts_list[i],
+                out_gts=self.out_gts,
+                ref_ploidy=self.ref_ploidy,
+                tgt_ploidy=self.tgt_ploidy,
+                src_ploidy=self.src_ploidy_list[i],
+                out_ploidy=self.out_ploidy,
+            )
+
+            baaa = calc_pattern_sum(ref_freq, tgt_freq, src_freq, out_freq, "baaa")
+            abaa = calc_pattern_sum(ref_freq, tgt_freq, src_freq, out_freq, "abaa")
+
+            numerator = baaa - abaa
+            denominator = baaa + abaa
+
+            danc = numerator / denominator if denominator != 0 else np.nan
+            danc_results.append(danc)
+
+        return {"name": self.STAT_NAME, "value": danc_results}
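In equation form, the loop above computes, for each source population,

```latex
D_{\mathrm{anc}} =
  \frac{\sum_{\mathrm{sites}} \mathrm{BAAA} - \sum_{\mathrm{sites}} \mathrm{ABAA}}
       {\sum_{\mathrm{sites}} \mathrm{BAAA} + \sum_{\mathrm{sites}} \mathrm{ABAA}}
```

returning `np.nan` when the denominator is zero.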
sai/stats/dd_statistic.py
ADDED
@@ -0,0 +1,77 @@
+# Copyright 2025 Xin Huang
+#
+# GNU General Public License v3.0
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, please see
+#
+# https://www.gnu.org/licenses/gpl-3.0.en.html
+
+
+import numpy as np
+from scipy.spatial.distance import cdist
+from typing import Dict, Any
+from sai.registries.stat_registry import STAT_REGISTRY
+from sai.stats import GenericStatistic
+
+
+@STAT_REGISTRY.register("DD")
+class DdStatistic(GenericStatistic):
+    """
+    Class for computing the average difference of the sequence divergence.
+
+    The DD statistic quantifies the difference in average pairwise sequence
+    divergence between a source population and two target populations (reference
+    and target), using Manhattan (cityblock) distance.
+    """
+
+    STAT_NAME = "DD"
+
+    def compute(self, **kwargs) -> Dict[str, Any]:
+        """
+        Computes the DD statistic for each source population.
+
+        For each source population, the method calculates pairwise Manhattan distances
+        between the source and both the target and reference populations, averages the
+        distances per genome, and computes the difference in mean divergence.
+
+        Parameters
+        ----------
+        **kwargs : dict
+            Unused. Present to maintain compatibility with the base class interface.
+
+        Returns
+        -------
+        dict
+            A dictionary containing:
+            - 'name' : str
+                The name of the statistic ("DD").
+            - 'value' : list[float]
+                A list of DD values, one for each source population.
+        """
+        dd_results = []
+
+        for i in range(len(self.src_gts_list)):
+            # pairwise distances
+            src_gts = self.src_gts_list[i]
+            seq_divs_src_tgt = cdist(src_gts.T, self.tgt_gts.T, metric="cityblock")
+            seq_divs_src_ref = cdist(src_gts.T, self.ref_gts.T, metric="cityblock")
+
+            # mean of each row
+            mean_src_tgt = np.mean(seq_divs_src_tgt, axis=1)
+            mean_src_ref = np.mean(seq_divs_src_ref, axis=1)
+
+            dd = np.mean(mean_src_ref - mean_src_tgt)
+            dd_results.append(dd)
+
+        return {"name": self.STAT_NAME, "value": dd_results}
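Written out, with d1(x, y) the Manhattan (cityblock) distance between two genomes and n_s, n_r, n_t the numbers of source, reference, and target genomes, the loop above computes

```latex
\mathrm{DD} = \frac{1}{n_s} \sum_{i=1}^{n_s}
  \left( \frac{1}{n_r} \sum_{j=1}^{n_r} d_1(s_i, r_j)
       - \frac{1}{n_t} \sum_{k=1}^{n_t} d_1(s_i, t_k) \right)
```

so positive values indicate that the source is, on average, closer to the target than to the reference.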
sai/stats/df_statistic.py
ADDED
@@ -0,0 +1,84 @@
+# Copyright 2025 Xin Huang
+#
+# GNU General Public License v3.0
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, please see
+#
+# https://www.gnu.org/licenses/gpl-3.0.en.html
+
+
+import numpy as np
+from typing import Dict, Any
+from sai.registries.stat_registry import STAT_REGISTRY
+from sai.stats import GenericStatistic
+from sai.stats.stat_utils import calc_four_pops_freq, calc_pattern_sum
+
+
+@STAT_REGISTRY.register("df")
+class DfStatistic(GenericStatistic):
+    """
+    Class for computing the distance fraction (df) statistic (Pfeifer and Kapan. 2019. BMC Bioinformatics).
+
+    The df statistic quantifies the relative excess of shared derived alleles
+    using ABBA, BABA, and BBAA site patterns across a four-population test.
+    """
+
+    STAT_NAME = "df"
+
+    def compute(self, **kwargs) -> Dict[str, Any]:
+        """
+        Computes the df statistic for each source population.
+
+        This method computes df for each source population using site pattern
+        counts based on allele frequency input.
+
+        Parameters
+        ----------
+        **kwargs : dict
+            Unused. Present to maintain compatibility with the base class interface.
+
+        Returns
+        -------
+        dict
+            A dictionary containing:
+            - 'name' : str
+                The name of the statistic ("df").
+            - 'value' : list[float]
+                A list of df values, one per source population.
+        """
+        df_results = []
+
+        for i in range(len(self.src_gts_list)):
+            ref_freq, tgt_freq, src_freq, out_freq = calc_four_pops_freq(
+                ref_gts=self.ref_gts,
+                tgt_gts=self.tgt_gts,
+                src_gts=self.src_gts_list[i],
+                out_gts=self.out_gts,
+                ref_ploidy=self.ref_ploidy,
+                tgt_ploidy=self.tgt_ploidy,
+                src_ploidy=self.src_ploidy_list[i],
+                out_ploidy=self.out_ploidy,
+            )
+
+            abba = calc_pattern_sum(ref_freq, tgt_freq, src_freq, out_freq, "abba")
+            baba = calc_pattern_sum(ref_freq, tgt_freq, src_freq, out_freq, "baba")
+            bbaa = calc_pattern_sum(ref_freq, tgt_freq, src_freq, out_freq, "bbaa")
+
+            numerator = abba - baba
+            denominator = abba + baba + 2 * bbaa
+
+            df = numerator / denominator if denominator != 0 else np.nan
+            df_results.append(df)
+
+        return {"name": self.STAT_NAME, "value": df_results}
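Per source population, the computation above reduces to

```latex
d_f = \frac{\sum \mathrm{ABBA} - \sum \mathrm{BABA}}
           {\sum \mathrm{ABBA} + \sum \mathrm{BABA} + 2\sum \mathrm{BBAA}}
```

with `np.nan` returned when the denominator is zero.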