PyPI - sai-pg - Versions diffs - 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

sai-pg: 1.0.1 (py3-none-any.whl) → 1.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71)

sai/__init__.py +2 -0
sai/__main__.py +6 -3
sai/configs/__init__.py +24 -0
sai/configs/global_config.py +83 -0
sai/configs/ploidy_config.py +94 -0
sai/configs/pop_config.py +82 -0
sai/configs/stat_config.py +220 -0
sai/{utils/generators → generators}/chunk_generator.py +1 -1
sai/{utils/generators → generators}/window_generator.py +81 -37
sai/{utils/multiprocessing → multiprocessing}/mp_manager.py +2 -2
sai/{utils/multiprocessing → multiprocessing}/mp_pool.py +2 -2
sai/parsers/outlier_parser.py +4 -3
sai/parsers/score_parser.py +8 -119
sai/{utils/preprocessors → preprocessors}/chunk_preprocessor.py +21 -15
sai/preprocessors/feature_preprocessor.py +236 -0
sai/registries/__init__.py +22 -0
sai/registries/generic_registry.py +89 -0
sai/registries/stat_registry.py +30 -0
sai/sai.py +124 -220
sai/stats/__init__.py +11 -0
sai/stats/danc_statistic.py +83 -0
sai/stats/dd_statistic.py +77 -0
sai/stats/df_statistic.py +84 -0
sai/stats/dplus_statistic.py +86 -0
sai/stats/fd_statistic.py +92 -0
sai/stats/generic_statistic.py +93 -0
sai/stats/q_statistic.py +104 -0
sai/stats/stat_utils.py +259 -0
sai/stats/u_statistic.py +99 -0
sai/utils/utils.py +213 -142
{sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/METADATA +3 -14
sai_pg-1.1.0.dist-info/RECORD +70 -0
{sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/WHEEL +1 -1
sai_pg-1.1.0.dist-info/top_level.txt +2 -0
tests/configs/test_global_config.py +163 -0
tests/configs/test_ploidy_config.py +93 -0
tests/configs/test_pop_config.py +90 -0
tests/configs/test_stat_config.py +171 -0
tests/generators/test_chunk_generator.py +51 -0
tests/generators/test_window_generator.py +164 -0
tests/multiprocessing/test_mp_manager.py +92 -0
tests/multiprocessing/test_mp_pool.py +79 -0
tests/parsers/test_argument_validation.py +133 -0
tests/parsers/test_outlier_parser.py +53 -0
tests/parsers/test_score_parser.py +63 -0
tests/preprocessors/test_chunk_preprocessor.py +79 -0
tests/preprocessors/test_feature_preprocessor.py +223 -0
tests/registries/test_registries.py +74 -0
tests/stats/test_danc_statistic.py +51 -0
tests/stats/test_dd_statistic.py +45 -0
tests/stats/test_df_statistic.py +73 -0
tests/stats/test_dplus_statistic.py +79 -0
tests/stats/test_fd_statistic.py +68 -0
tests/stats/test_q_statistic.py +268 -0
tests/stats/test_stat_utils.py +354 -0
tests/stats/test_u_statistic.py +233 -0
tests/test___main__.py +51 -0
tests/test_sai.py +102 -0
tests/utils/test_utils.py +511 -0
sai/parsers/plot_parser.py +0 -152
sai/stats/features.py +0 -302
sai/utils/preprocessors/feature_preprocessor.py +0 -211
sai_pg-1.0.1.dist-info/RECORD +0 -30
sai_pg-1.0.1.dist-info/top_level.txt +0 -1
sai/{utils/generators → generators}/__init__.py +0 -0
sai/{utils/generators → generators}/data_generator.py +0 -0
sai/{utils/multiprocessing → multiprocessing}/__init__.py +0 -0
sai/{utils/preprocessors → preprocessors}/__init__.py +0 -0
sai/{utils/preprocessors → preprocessors}/data_preprocessor.py +0 -0
{sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/entry_points.txt +0 -0
{sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/licenses/LICENSE +0 -0

sai/__init__.py CHANGED

@@ -16,3 +16,5 @@
 # along with this program. If not, please see
 #
 #    https://www.gnu.org/licenses/gpl-3.0.en.html
+__version__ = "1.1.0"

sai/__main__.py CHANGED

@@ -19,9 +19,10 @@
 import argparse
+import sai.stats
+from sai import __version__
 from sai.parsers.score_parser import add_score_parser
 from sai.parsers.outlier_parser import add_outlier_parser
-from sai.parsers.plot_parser import add_plot_parser
 def _set_sigpipe_handler() -> None:
@@ -47,13 +48,15 @@ def _sai_cli_parser() -> argparse.ArgumentParser:
     top_parser : argparse.ArgumentParser
         A configured command-line interface parser.
     """
-    top_parser = argparse.ArgumentParser()
+    top_parser = argparse.ArgumentParser(
+        description="SAI: Statistics for Adaptive Introgression"
+    )
+    top_parser.add_argument("--version", action="version", version=f"{__version__}")
     subparsers = top_parser.add_subparsers(dest="subcommand")
     subparsers.required = True
     add_score_parser(subparsers)
     add_outlier_parser(subparsers)
-    add_plot_parser(subparsers)
     return top_parser

sai/configs/__init__.py ADDED

@@ -0,0 +1,24 @@
+# Copyright 2025 Xin Huang
+#
+# GNU General Public License v3.0
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, please see
+#
+#    https://www.gnu.org/licenses/gpl-3.0.en.html
+from .global_config import GlobalConfig
+from .ploidy_config import PloidyConfig
+from .pop_config import PopConfig
+from .stat_config import StatConfig

sai/configs/global_config.py ADDED

@@ -0,0 +1,83 @@
+# Copyright 2025 Xin Huang
+#
+# GNU General Public License v3.0
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, please see
+#
+#    https://www.gnu.org/licenses/gpl-3.0.en.html
+from pydantic import BaseModel
+from pydantic import model_validator
+from typing import Optional
+from sai.configs.stat_config import StatConfig
+from sai.configs.ploidy_config import PloidyConfig
+from sai.configs.pop_config import PopConfig
+class GlobalConfig(BaseModel):
+    statistics: StatConfig
+    ploidies: PloidyConfig
+    populations: PopConfig
+    @model_validator(mode="after")
+    def validate_population_in_ploidies(self) -> "GlobalConfig":
+        """
+        Cross-validates that every population in statistics also appears
+        in the corresponding group in ploidies.
+        """
+        stat_data = (
+            self.statistics.root
+        )  # Dict[str, Dict[str, Dict[str, Union[float, tuple]]]]
+        ploidy_data = self.ploidies.root  # Dict[str, Dict[str, int]]
+        for stat_name, params in stat_data.items():
+            for group in ("ref", "tgt", "src"):
+                pop_dict = params.get(group, {})
+                for pop in pop_dict:
+                    if pop not in ploidy_data.get(group, {}):
+                        raise ValueError(
+                            f"Population '{pop}' used in statistics[{stat_name}][{group}] "
+                            f"is not defined in ploidies[{group}]"
+                        )
+        return self
+    @model_validator(mode="after")
+    def validate_population_in_populations(self) -> "GlobalConfig":
+        """
+        Cross-validates that every population in statistics also appears
+        in the corresponding group in sample files.
+        """
+        from sai.utils import parse_ind_file
+        stat_data = self.statistics.root  # Dict[stat_name][group][pop] = ...
+        population_paths = self.populations.root  # Dict[group] = path
+        categories_per_group = {
+            group: set(parse_ind_file(path).keys())
+            for group, path in population_paths.items()
+        }
+        for stat_name, params in stat_data.items():
+            for group in ("ref", "tgt", "src"):
+                pop_dict = params.get(group, {})
+                expected_categories = categories_per_group.get(group, set())
+                for pop in pop_dict:
+                    if pop not in expected_categories:
+                        raise ValueError(
+                            f"Population '{pop}' used in statistics[{stat_name}][{group}] "
+                            f"is not found in the population file for group '{group}'."
+                        )
+        return self

sai/configs/ploidy_config.py ADDED

@@ -0,0 +1,94 @@
+# Copyright 2025 Xin Huang
+#
+# GNU General Public License v3.0
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, please see
+#
+#    https://www.gnu.org/licenses/gpl-3.0.en.html
+from pydantic import RootModel, field_validator
+from typing import Dict, Union
+class PloidyConfig(RootModel[Dict[str, Dict[str, int]]]):
+    """
+    Configuration for specifying per-population ploidy levels
+    under categories like 'ref', 'tgt', 'src', and 'outgroup'.
+    Ensures:
+    - Only allowed keys are present
+    - Each sub-dictionary maps to positive integers
+    - Required keys ("ref", "tgt", "src") are present
+    """
+    @field_validator("root")
+    def validate_ploidy_dict(
+        cls, v: Dict[str, Dict[str, int]]
+    ) -> Dict[str, Dict[str, int]]:
+        allowed_keys = {"ref", "tgt", "src", "outgroup"}
+        required_keys = {"ref", "tgt", "src"}
+        extra_keys = set(v.keys()) - allowed_keys
+        if extra_keys:
+            raise ValueError(
+                f"Unsupported ploidy keys: {extra_keys}. Allowed keys are {allowed_keys}."
+            )
+        missing_keys = required_keys - set(v.keys())
+        if missing_keys:
+            raise ValueError(f"Missing required ploidy keys: {missing_keys}.")
+        for group, subdict in v.items():
+            if not isinstance(subdict, dict):
+                raise ValueError(
+                    f"Value for '{group}' must be a dictionary of population -> ploidy."
+                )
+            for pop, ploidy in subdict.items():
+                if not isinstance(ploidy, int) or ploidy <= 0:
+                    raise ValueError(
+                        f"Ploidy for '{group}:{pop}' must be a positive integer."
+                    )
+        return v
+    def get_ploidy(self, group: str, population: str = None) -> Union[int, list[int]]:
+        """
+        Returns the ploidy for a given population under a given group.
+        Parameters
+        ----------
+        group : str
+            One of "ref", "tgt", "src", or "outgroup".
+        population : str, optional
+            The name of the population within the group. If None, return all ploidies as a list.
+        Returns
+        -------
+        int or list[int]
+            - If population is given: returns the ploidy for that population.
+            - If population is None: returns a list of ploidies for all populations in the group.
+        """
+        if group not in self.root:
+            raise KeyError(f"Group '{group}' not found in configuration.")
+        if population is None:
+            return list(self.root[group].values())
+        if population not in self.root[group]:
+            raise KeyError(
+                f"Population '{population}' not found under group '{group}'."
+            )
+        return self.root[group][population]

sai/configs/pop_config.py ADDED

@@ -0,0 +1,82 @@
+# Copyright 2025 Xin Huang
+#
+# GNU General Public License v3.0
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, please see
+#
+#    https://www.gnu.org/licenses/gpl-3.0.en.html
+import os
+from typing import Dict
+from pydantic import RootModel, field_validator
+REQUIRED_KEYS = {"ref", "tgt", "src"}
+OPTIONAL_KEYS = {"outgroup"}
+ALLOWED_KEYS = REQUIRED_KEYS | OPTIONAL_KEYS
+class PopConfig(RootModel[Dict[str, str]]):
+    """
+    Configuration for population sample name files.
+    Required:
+        - ref: Path to file containing reference population sample names.
+        - tgt: Path to file containing target population sample names.
+        - src: Path to file containing source population sample names.
+    Optional:
+        - outgroup: Path to file containing outgroup sample names.
+    """
+    @field_validator("root")
+    def validate_population_keys_and_paths(cls, v: Dict[str, str]) -> Dict[str, str]:
+        keys = set(v.keys())
+        missing = REQUIRED_KEYS - keys
+        invalid = keys - ALLOWED_KEYS
+        if missing:
+            raise ValueError(f"Missing required population keys: {missing}")
+        if invalid:
+            raise ValueError(f"Unsupported population keys: {invalid}")
+        for name, path in v.items():
+            if not os.path.isfile(path):
+                raise ValueError(f"{name} file does not exist: {path}")
+        return v
+    def get_population(self, group: str) -> str:
+        """
+        Retrieves the file path for a given population group.
+        Parameters
+        ----------
+        group : str
+            The population group name (e.g., 'ref', 'tgt', 'src', or 'outgroup').
+        Returns
+        -------
+        str
+            The file path corresponding to the group.
+        Raises
+        ------
+        ValueError
+            If the requested group is not present in the configuration.
+        """
+        if group not in self.root:
+            if group == "outgroup":
+                return None
+            else:
+                raise ValueError(f"Population group '{group}' not found in config.")
+        return self.root[group]

sai/configs/stat_config.py ADDED

@@ -0,0 +1,220 @@
+# Copyright 2025 Xin Huang
+#
+# GNU General Public License v3.0
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, please see
+#
+#    https://www.gnu.org/licenses/gpl-3.0.en.html
+SUPPORTED_STATISTICS = [
+    "Danc",
+    "DD",
+    "df",
+    "Dplus",
+    "fd",
+    "U",
+    "Q",
+]
+from pydantic import RootModel, field_validator, ValidationError
+from typing import Dict, Literal, List, Optional, Union
+class StatConfig(
+    RootModel[
+        Dict[
+            str,
+            Dict[str, Dict[str, Union[float, str]]],
+        ]
+    ]
+):
+    """
+    A class to represent the configuration for various statistics used in the analysis.
+    This class manages the configuration of statistical parameters for different
+    statistical tests (e.g., "U", "Q"). It validates the range of parameters
+    such as `ref`, `tgt`, and `src`, where `ref` and `tgt` are numerical values
+    representing frequencies between 0 and 1, and `src` can be a list of strings with
+    comparison operators (e.g., "=0.5", ">0.2").
+    """
+    @field_validator("root")
+    def check_valid_stat_types(
+        cls, v: Dict[str, Dict[str, Dict[str, Union[float, str]]]]
+    ) -> Dict[
+        str,
+        Dict[str, Dict[str, Union[float, tuple[str, float]]]],
+    ]:
+        """
+        Validates statistics parameters, specifically for U and Q types.
+        Parameters
+        ----------
+        v : Dict[str, Dict[str, Dict[str, Union[float, str]]]]
+            A dictionary mapping statistic names (e.g., "U", "Q") to parameter groups ("ref", "tgt", "src"),
+            where each group is a mapping of population names to values.
+            - Outer dict key: statistic name (e.g., "U", "Q", "fd")
+            - Middle dict key: parameter group ("ref", "tgt", or "src")
+            - Inner dict key: population name (e.g., "AFR", "CHB")
+            - Inner dict value:
+                - For "ref" and "tgt": float (frequency between 0 and 1)
+                - For "src": string comparator expression (e.g., ">=0.2", "=1")
+        Returns
+        -------
+        Dict[str, Dict[str, Dict[str, Union[float, tuple[str, float]]]]]
+            A validated and normalized statistics dictionary.
+            - Outer dict key: statistic name (e.g., "U", "Q")
+            - Middle dict key: parameter group ("ref", "tgt", "src")
+            - Inner dict key: population name (e.g., "AFR", "CHB")
+            - Inner dict value:
+                - For "ref" and "tgt": float (validated to be between 0 and 1)
+                - For "src": tuple (comparator operator, float), e.g., (">=", 0.2)
+        Raises
+        ------
+        ValueError
+            If any name of statistics is not supported.
+        """
+        for stat_name, params in v.items():
+            if stat_name not in SUPPORTED_STATISTICS:
+                raise ValueError(f"The {stat_name} statistic is not supported.")
+            if stat_name in ["U", "Q"]:
+                # Validate U and Q statistics parameters
+                cls.check_range_for_u_q(stat_name, params)
+        return v
+    @staticmethod
+    def check_range_for_u_q(
+        stat_name: str, params: Dict[str, Dict[str, Union[float, str]]]
+    ) -> None:
+        """
+        Validates the parameters for U and Q statistics.
+        ref and tgt must be between 0 and 1, and src must contain a valid comparator
+        with a frequency value.
+        Parameters
+        ----------
+        stat_name : str
+            The name of the statistic (e.g., "U" or "Q").
+        params : Dict[str, Dict[str, Union[float, str]]]
+            A dictionary containing the parameters for the statistic, such as ref,
+            tgt, and src.
+        Raises
+        ------
+        ValueError
+            If any of the parameters are outside the valid range or in an incorrect
+            format.
+        """
+        if stat_name in ["U", "Q"]:
+            required_keys = {"ref", "tgt", "src"}
+            param_keys = set(params.keys())
+            if param_keys != required_keys:
+                raise ValueError(
+                    f"{stat_name} must have exactly the keys: {required_keys}, but got {param_keys}."
+                )
+        for param, pop_values in params.items():
+            if param in ["ref", "tgt"]:
+                for pop, value in pop_values.items():
+                    num = float(value)
+                    if not (0 <= num <= 1):
+                        raise ValueError(
+                            f"{param}[{pop}] value must be between 0 and 1 for {stat_name}, got {val}."
+                        )
+            elif param == "src":
+                new_src: Dict[str, tuple[str, float]] = {}
+                for pop, expr in pop_values.items():
+                    if not isinstance(expr, str):
+                        raise ValueError(
+                            f"{param}[{pop}] value must be a comparator string for {stat_name}."
+                        )
+                    new_src[pop] = StatConfig.check_comparator(
+                        expr, stat_name, f"src[{pop}]"
+                    )
+                params["src"] = new_src
+    @staticmethod
+    def check_comparator(value: str, stat_name: str, param: str) -> tuple[str, float]:
+        """
+        Validates that the src parameter contains a valid comparator (e.g., "=0.5", ">=0.2"),
+        and ensure the number is between 0 and 1.
+        Parameters
+        ----------
+        value : str
+            The value of the src parameter, which should contain a comparator (e.g., "=0.5").
+        stat_name : str
+            The name of the statistic (e.g., "U" or "Q").
+        param : str
+            The parameter name ("src").
+        Returns
+        -------
+        tuple[str, float]
+            A tuple containing:
+            - A string representing the comparison operator (`=`, `<`, `>`, `<=`, `>=`).
+            - A float representing the threshold value.
+        Raises
+        ------
+        ValueError
+            If the value does not contain a valid comparator or the number is not in
+            the range 0-1.
+        """
+        valid_comparators = ["<=", ">=", "=", "<", ">"]
+        if not any(comp in value for comp in valid_comparators):
+            raise ValueError(
+                f"{param} for {stat_name} must contain a valid comparator (e.g., '=0.5', '>=0.2')."
+            )
+        # Extract the numeric value after the comparator
+        comparator = next(comp for comp in valid_comparators if comp in value)
+        try:
+            num = float(value[len(comparator) :])
+        except ValueError:
+            raise ValueError(
+                f"{param} value for {stat_name} must be a valid number after the comparator."
+            )
+        if not (0 <= num <= 1):
+            raise ValueError(
+                f"{param} value must be between 0 and 1 for {stat_name}, but got {num}."
+            )
+        return comparator, num
+    def get_parameters(
+        self, stat_name: str
+    ) -> Optional[Dict[str, Dict[str, Union[float, tuple[str, float]]]]]:
+        """
+        Retrieves the parameters for a specific statistic.
+        Parameters
+        ----------
+        stat_name : str
+            The name of the statistic whose parameters are to be retrieved.
+        Returns
+        -------
+        Optional[Dict[str, Dict[str, Union[float, tuple[str, float]]]]]
+            A dictionary containing the parameters for the specified statistic,
+            or None if not found.
+        """
+        return self.root.get(stat_name, None)

sai/{utils/generators → generators}/chunk_generator.py RENAMED

@@ -21,7 +21,7 @@
 import pysam
 from typing import Iterator
 from sai.utils import split_genome
-from sai.utils.generators import DataGenerator
+from sai.generators import DataGenerator
 class ChunkGenerator(DataGenerator):

sai-pg 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

sai-pg: 1.0.1 (py3-none-any.whl) → 1.1.0 (py3-none-any.whl)