PyPI - sai-pg - Versions diffs - 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

sai-pg 1.0.1 (py3-none-any.whl) → 1.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71)

sai/__init__.py +2 -0
sai/__main__.py +6 -3
sai/configs/__init__.py +24 -0
sai/configs/global_config.py +83 -0
sai/configs/ploidy_config.py +94 -0
sai/configs/pop_config.py +82 -0
sai/configs/stat_config.py +220 -0
sai/{utils/generators → generators}/chunk_generator.py +1 -1
sai/{utils/generators → generators}/window_generator.py +81 -37
sai/{utils/multiprocessing → multiprocessing}/mp_manager.py +2 -2
sai/{utils/multiprocessing → multiprocessing}/mp_pool.py +2 -2
sai/parsers/outlier_parser.py +4 -3
sai/parsers/score_parser.py +8 -119
sai/{utils/preprocessors → preprocessors}/chunk_preprocessor.py +21 -15
sai/preprocessors/feature_preprocessor.py +236 -0
sai/registries/__init__.py +22 -0
sai/registries/generic_registry.py +89 -0
sai/registries/stat_registry.py +30 -0
sai/sai.py +124 -220
sai/stats/__init__.py +11 -0
sai/stats/danc_statistic.py +83 -0
sai/stats/dd_statistic.py +77 -0
sai/stats/df_statistic.py +84 -0
sai/stats/dplus_statistic.py +86 -0
sai/stats/fd_statistic.py +92 -0
sai/stats/generic_statistic.py +93 -0
sai/stats/q_statistic.py +104 -0
sai/stats/stat_utils.py +259 -0
sai/stats/u_statistic.py +99 -0
sai/utils/utils.py +213 -142
{sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/METADATA +3 -14
sai_pg-1.1.0.dist-info/RECORD +70 -0
{sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/WHEEL +1 -1
sai_pg-1.1.0.dist-info/top_level.txt +2 -0
tests/configs/test_global_config.py +163 -0
tests/configs/test_ploidy_config.py +93 -0
tests/configs/test_pop_config.py +90 -0
tests/configs/test_stat_config.py +171 -0
tests/generators/test_chunk_generator.py +51 -0
tests/generators/test_window_generator.py +164 -0
tests/multiprocessing/test_mp_manager.py +92 -0
tests/multiprocessing/test_mp_pool.py +79 -0
tests/parsers/test_argument_validation.py +133 -0
tests/parsers/test_outlier_parser.py +53 -0
tests/parsers/test_score_parser.py +63 -0
tests/preprocessors/test_chunk_preprocessor.py +79 -0
tests/preprocessors/test_feature_preprocessor.py +223 -0
tests/registries/test_registries.py +74 -0
tests/stats/test_danc_statistic.py +51 -0
tests/stats/test_dd_statistic.py +45 -0
tests/stats/test_df_statistic.py +73 -0
tests/stats/test_dplus_statistic.py +79 -0
tests/stats/test_fd_statistic.py +68 -0
tests/stats/test_q_statistic.py +268 -0
tests/stats/test_stat_utils.py +354 -0
tests/stats/test_u_statistic.py +233 -0
tests/test___main__.py +51 -0
tests/test_sai.py +102 -0
tests/utils/test_utils.py +511 -0
sai/parsers/plot_parser.py +0 -152
sai/stats/features.py +0 -302
sai/utils/preprocessors/feature_preprocessor.py +0 -211
sai_pg-1.0.1.dist-info/RECORD +0 -30
sai_pg-1.0.1.dist-info/top_level.txt +0 -1
/sai/{utils/generators → generators}/__init__.py +0 -0
/sai/{utils/generators → generators}/data_generator.py +0 -0
/sai/{utils/multiprocessing → multiprocessing}/__init__.py +0 -0
/sai/{utils/preprocessors → preprocessors}/__init__.py +0 -0
/sai/{utils/preprocessors → preprocessors}/data_preprocessor.py +0 -0
{sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/entry_points.txt +0 -0
{sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/licenses/LICENSE +0 -0

sai/{utils/generators → generators}/window_generator.py RENAMED Viewed

@@ -18,10 +18,12 @@
 #    https://www.gnu.org/licenses/gpl-3.0.en.html
+import numpy as np
 from itertools import combinations, product
 from typing import Iterator, Any
 from sai.utils import read_data, split_genome
-from sai.utils.generators import DataGenerator
+from sai.generators import DataGenerator
+from sai.configs import PloidyConfig
 class WindowGenerator(DataGenerator):
@@ -37,8 +39,10 @@ class WindowGenerator(DataGenerator):
         ref_ind_file: str,
         tgt_ind_file: str,
         src_ind_file: str,
+        out_ind_file: str,
         win_len: int,
         win_step: int,
+        ploidy_config: PloidyConfig,
         start: int = None,
         end: int = None,
         anc_allele_file: str = None,
@@ -59,10 +63,14 @@ class WindowGenerator(DataGenerator):
             The path to the file containing identifiers for target populations.
         src_ind_file : str
             The path to the file containing identifiers for source populations.
+        out_ind_file : str
+            The path to the file containing identifiers for outgroup populations.
         win_len : int
             The length of each window in base pairs.
         win_step : int
             The step size between windows in base pairs.
+        ploidy_config : PloidyConfig
+            Configuration specifying ploidy levels for each population involved in the analysis.
         start: int, optional
             The starting position (1-based, inclusive) on the chromosome. Default: None.
         end: int, optional
@@ -88,17 +96,10 @@ class WindowGenerator(DataGenerator):
         self.win_step = win_step
         self.num_src = num_src
         self.chr_name = chr_name
+        self.ploidy_config = ploidy_config
         # Load data
-        (
-            self.ref_data,
-            self.ref_samples,
-            self.tgt_data,
-            self.tgt_samples,
-            self.src_data,
-            self.src_samples,
-            self.ploidy,
-        ) = read_data(
+        results = read_data(
             vcf_file=vcf_file,
             chr_name=self.chr_name,
             start=start,
@@ -106,13 +107,25 @@ class WindowGenerator(DataGenerator):
             ref_ind_file=ref_ind_file,
             tgt_ind_file=tgt_ind_file,
             src_ind_file=src_ind_file,
+            out_ind_file=out_ind_file,
+            ploidy_config=ploidy_config,
             anc_allele_file=anc_allele_file,
             is_phased=False,
             filter_ref=False,
             filter_tgt=False,
             filter_src=False,
+            filter_missing=True,
         )
+        self.ref_data = results["ref"][0]
+        self.tgt_data = results["tgt"][0]
+        self.src_data = results["src"][0]
+        self.out_data = results["outgroup"][0]
+        self.ref_samples = results["ref"][1]
+        self.tgt_samples = results["tgt"][1]
+        self.src_samples = results["src"][1]
+        self.out_samples = results["outgroup"][1]
         self.src_combinations = list(
             combinations(self.src_samples.keys(), self.num_src)
         )
@@ -149,39 +162,70 @@ class WindowGenerator(DataGenerator):
         for ref_pop, tgt_pop, src_comb in product(
             self.ref_samples, self.tgt_samples, self.src_combinations
         ):
-            tgt_pos = self.tgt_data[tgt_pop].POS
             for start, end in self.tgt_windows[tgt_pop]:
-                ref_gts = self.ref_data[ref_pop].GT[
-                    (self.ref_data[ref_pop].POS >= start)
-                    & (self.ref_data[ref_pop].POS <= end)
+                ref_data = self.ref_data[ref_pop]
+                tgt_data = self.tgt_data[tgt_pop]
+                src_data_list = [self.src_data[src_pop] for src_pop in src_comb]
+                ref_mask = (ref_data.POS >= start) & (ref_data.POS <= end)
+                tgt_mask = (tgt_data.POS >= start) & (tgt_data.POS <= end)
+                src_masks = [
+                    (src_data.POS >= start) & (src_data.POS <= end)
+                    for src_data in src_data_list
                 ]
-                tgt_gts = self.tgt_data[tgt_pop].GT[
-                    (self.tgt_data[tgt_pop].POS >= start)
-                    & (self.tgt_data[tgt_pop].POS <= end)
+                ref_pos = ref_data.POS[ref_mask]
+                tgt_pos = tgt_data.POS[tgt_mask]
+                src_pos_list = [
+                    src_data.POS[mask]
+                    for src_data, mask in zip(src_data_list, src_masks)
                 ]
+                common_pos = np.intersect1d(ref_pos, tgt_pos)
+                for src_pos in src_pos_list:
+                    common_pos = np.intersect1d(common_pos, src_pos)
+                ref_gts = ref_data.GT.compress(
+                    np.isin(ref_data.POS, common_pos), axis=0
+                )
+                tgt_gts = tgt_data.GT.compress(
+                    np.isin(tgt_data.POS, common_pos), axis=0
+                )
                 src_gts_list = [
-                    self.src_data[src_pop].GT[
-                        (self.src_data[src_pop].POS >= start)
-                        & (self.src_data[src_pop].POS <= end)
-                    ]
-                    for src_pop in src_comb
+                    src_data.GT.compress(np.isin(src_data.POS, common_pos), axis=0)
+                    for src_data in src_data_list
                 ]
-                sub_pos = tgt_pos[(tgt_pos >= start) & (tgt_pos <= end)]
+                sub_pos = common_pos
-                yield {
-                    "chr_name": self.chr_name,
-                    "ref_pop": ref_pop,
-                    "tgt_pop": tgt_pop,
-                    "src_pop_list": src_comb,  # List of source populations in this combination
-                    "start": start,
-                    "end": end,
-                    "pos": sub_pos,
-                    "ref_gts": ref_gts,
-                    "tgt_gts": tgt_gts,
-                    "src_gts_list": src_gts_list,  # List of genotypes for each source population in src_comb
-                    "ploidy": self.ploidy,
-                }
+                if len(sub_pos) == 0:
+                    yield {
+                        "chr_name": self.chr_name,
+                        "ref_pop": ref_pop,
+                        "tgt_pop": tgt_pop,
+                        "src_pop_list": src_comb,
+                        "start": start,
+                        "end": end,
+                        "pos": [],
+                        "ref_gts": None,
+                        "tgt_gts": None,
+                        "src_gts_list": None,
+                        "ploidy_config": self.ploidy_config,
+                    }
+                else:
+                    yield {
+                        "chr_name": self.chr_name,
+                        "ref_pop": ref_pop,
+                        "tgt_pop": tgt_pop,
+                        "src_pop_list": src_comb,  # List of source populations in this combination
+                        "start": start,
+                        "end": end,
+                        "pos": sub_pos,
+                        "ref_gts": ref_gts,
+                        "tgt_gts": tgt_gts,
+                        "src_gts_list": src_gts_list,  # List of genotypes for each source population in src_comb
+                        "ploidy_config": self.ploidy_config,
+                    }
     def _none_window_generator(self) -> Iterator[dict[str, Any]]:
         """
@@ -218,7 +262,7 @@ class WindowGenerator(DataGenerator):
                     "ref_gts": None,
                     "tgt_gts": None,
                     "src_gts_list": None,
-                    "ploidy": None,
+                    "ploidy_config": self.ploidy_config,
                 }
     def get(self) -> Iterator[dict[str, Any]]:

sai/{utils/multiprocessing → multiprocessing}/mp_manager.py RENAMED Viewed

@@ -25,8 +25,8 @@ from multiprocessing import current_process
 from multiprocessing import Manager
 from multiprocessing import Process
 from threading import Thread
-from sai.utils.generators import DataGenerator
-from sai.utils.preprocessors import DataPreprocessor
+from sai.generators import DataGenerator
+from sai.preprocessors import DataPreprocessor
 def monitor(shared_dict: dict, workers: list[multiprocessing.Process]) -> None:

sai/{utils/multiprocessing → multiprocessing}/mp_pool.py RENAMED Viewed

@@ -20,8 +20,8 @@
 from multiprocessing import Pool
 from typing import Any
-from sai.utils.generators import DataGenerator
-from sai.utils.preprocessors import DataPreprocessor
+from sai.generators import DataGenerator
+from sai.preprocessors import DataPreprocessor
 def mp_worker(params: tuple[DataPreprocessor, dict]) -> Any:

sai/parsers/outlier_parser.py CHANGED Viewed

@@ -37,7 +37,7 @@ def _run_outlier(args: argparse.Namespace) -> None:
     # Call the outlier function with parsed arguments
     outlier(
         score_file=args.score,
-        output=args.output,
+        output_prefix=args.output_prefix,
         quantile=args.quantile,
     )
@@ -62,10 +62,11 @@ def add_outlier_parser(subparsers: argparse.ArgumentParser) -> None:
         help="Path to the input score file.",
     )
     parser.add_argument(
-        "--output",
+        "--output-prefix",
+        dest="output_prefix",
         type=str,
         required=True,
-        help="Path to save the output file.",
+        help="Prefix of the output files.",
     )
     parser.add_argument(
         "--quantile",

sai/parsers/score_parser.py CHANGED Viewed

@@ -19,13 +19,9 @@
 import argparse
-import re
 from sai.parsers.argument_validation import positive_int
 from sai.parsers.argument_validation import existed_file
-from sai.parsers.argument_validation import between_zero_and_one
-from sai.parsers.argument_validation import validate_stat_type
 from sai.sai import score
-from sai.utils.utils import parse_ind_file
 def _run_score(args: argparse.Namespace) -> None:
@@ -42,108 +38,36 @@ def _run_score(args: argparse.Namespace) -> None:
             Path to the VCF file containing variant data.
         - chr_name : str
             Name of the chromosome to be analyzed.
-        - ref : str
-            Path to the reference group individual file.
-        - tgt : str
-            Path to the target group individual file.
-        - src : str
-            Path to the source population individual file.
         - win_len : int
             Length of each analysis window.
         - win_step : int
             Step size for moving the window along the sequence.
-        - num_src : int
-            Number of source populations. The length of `args.y` should match `num_src`.
         - anc_alleles : str
             Path to the ancestral allele file.
-        - w : float
-            Allele frequency threshold for the reference group.
-        - y : list of float
-            List of allele frequency thresholds for each source population. Its length must match `num_src`.
         - output : str
             Path to the output file for storing results.
-        - stat_type: str
-            Specifies the type of statistic to compute.
+        - stat_config: str
+            Path to the YAML configuration file specifying the statistics, ploidy levels, and populations to compute.
     Raises
     ------
     ValueError
-        If the length of `args.y` does not match the expected number of source populations (`args.num_src`),
+        If fewer than three ploidy values are provided,
+        or if the number of ploidy values for source populations does not match `num_src`.
         or if other input parameters do not meet expected conditions.
     """
-    src_samples = parse_ind_file(args.src)
-    num_src = len(src_samples.keys())
-    if len(args.y) != num_src:
-        raise ValueError(
-            f"The length of y ({len(args.y)}) does not match the number of source populations ({num_src}) found in {args.src}."
-        )
     score(
         vcf_file=args.vcf,
         chr_name=args.chr_name,
-        ref_ind_file=args.ref,
-        tgt_ind_file=args.tgt,
-        src_ind_file=args.src,
         win_len=args.win_len,
         win_step=args.win_step,
-        num_src=num_src,
         anc_allele_file=args.anc_alleles,
-        w=args.w,
-        y=args.y,
         output_file=args.output,
-        stat_type=args.stat,
+        config=args.config,
         num_workers=1,
     )
-def _parse_y_thresholds(value: str) -> tuple[str, float]:
-    """
-    Parses the --y parameter value to extract an operator and a numerical threshold.
-    This function ensures that the input is correctly formatted as one of the following:
-    - `=X`  (equality condition)
-    - `>X`  (greater than condition)
-    - `<X`  (less than condition)
-    - `>=X` (greater than or equal to condition)
-    - `<=X` (less than or equal to condition)
-    The numerical value `X` must be within the range [0, 1].
-    Parameters
-    ----------
-    value : str
-        A string representing the allele frequency threshold condition, e.g., "=0.7", ">0.8", "<=0.2".
-    Returns
-    -------
-    tuple[str, float]
-        A tuple containing:
-        - A string representing the comparison operator (`=`, `<`, `>`, `<=`, `>=`).
-        - A float representing the threshold value.
-    Raises
-    ------
-    argparse.ArgumentTypeError
-        If the input format is invalid or the numerical threshold is outside the range [0, 1].
-    """
-    match = re.match(r"^(=|<|>|<=|>=)(\d*\.?\d+)$", value)
-    if not match:
-        raise argparse.ArgumentTypeError(
-            f"Invalid format for --y: {value}. Must be in the form =X, >X, <X, >=X, or <=X "
-            f"(e.g., =0.7, >0.8, <0.1, >=0.5, <=0.2)."
-        )
-    operator, num_str = match.groups()
-    num = float(num_str)
-    if not (0 <= num <= 1):
-        raise argparse.ArgumentTypeError(
-            f"Value for --y must be between 0 and 1, got {num}."
-        )
-    return operator, num
 def add_score_parser(subparsers: argparse.ArgumentParser) -> None:
     """
     Initializes and configures the command-line interface parser
@@ -170,24 +94,6 @@ def add_score_parser(subparsers: argparse.ArgumentParser) -> None:
         required=True,
         help="Chromosome name to analyze from the VCF file.",
     )
-    parser.add_argument(
-        "--ref",
-        type=existed_file,
-        required=True,
-        help="Path to the file with reference population identifiers.",
-    )
-    parser.add_argument(
-        "--tgt",
-        type=existed_file,
-        required=True,
-        help="Path to the file with target population identifiers.",
-    )
-    parser.add_argument(
-        "--src",
-        type=existed_file,
-        required=True,
-        help="Path to the file with source population identifiers.",
-    )
     parser.add_argument(
         "--win-len",
         dest="win_len",
@@ -209,23 +115,6 @@ def add_score_parser(subparsers: argparse.ArgumentParser) -> None:
         default=None,
         help="Path to the BED file with ancestral allele information. If ancestral allele information is not provided, filtering will be performed for each variant based on whether the allele frequency of any allele (assuming biallelic) meets the specified condition during the calculation of the statistics. Default: None.",
     )
-    parser.add_argument(
-        "--w",
-        type=between_zero_and_one,
-        default=0.01,
-        help="Frequency threshold for variants in the reference population; only variants with frequencies below this threshold are included in the analysis. Default: 0.01.",
-    )
-    parser.add_argument(
-        "--y",
-        type=_parse_y_thresholds,
-        nargs="+",
-        default=[("=", 1.0)],
-        help="List of allele frequency conditions for the source populations. "
-        "Each value must be in the form =X, >X, <X, >=X, or <=X "
-        "(e.g., =0.7, >0.8, <0.1, >=0.5, <=0.2). "
-        "The number of values must match the number of source populations in the file specified by `--src`; "
-        "the order of the allele frequency conditions should also correspond to the order of source populations in that file. Default: =1",
-    )
     parser.add_argument(
         "--output",
         type=str,
@@ -233,9 +122,9 @@ def add_score_parser(subparsers: argparse.ArgumentParser) -> None:
         help="Output file path for saving results.",
     )
     parser.add_argument(
-        "--stat",
-        type=validate_stat_type,
+        "--config",
+        type=existed_file,
         required=True,
-        help="Type of statistic to compute: UXX or QXX, where XX is a percentage-like index indicating a threshold in the target population. For example, `U50` means the allele frequency is greater than 0.5, and `Q95` means the allele frequency is greater than or equal to the 95th percentile among sites meeting the specified conditions.",
+        help="Path to the YAML configuration file specifying the statistics to compute, ploidy settings, and population group file paths.",
     )
     parser.set_defaults(runner=_run_score)

sai/{utils/preprocessors → preprocessors}/chunk_preprocessor.py RENAMED Viewed

@@ -19,8 +19,10 @@
 from typing import Any
-from sai.utils.generators import WindowGenerator
-from sai.utils.preprocessors import DataPreprocessor
+from sai.utils import parse_ind_file
+from sai.generators import WindowGenerator
+from sai.preprocessors import DataPreprocessor
+from sai.configs import PloidyConfig, StatConfig
 from .feature_preprocessor import FeaturePreprocessor
@@ -39,12 +41,12 @@ class ChunkPreprocessor(DataPreprocessor):
         ref_ind_file: str,
         tgt_ind_file: str,
         src_ind_file: str,
+        out_ind_file: str,
         win_len: int,
         win_step: int,
-        w: float,
-        y: list[float],
         output_file: str,
-        stat_type: str,
+        ploidy_config: PloidyConfig,
+        stat_config: StatConfig,
         anc_allele_file: str = None,
         num_src: int = 1,
     ):
@@ -61,18 +63,18 @@ class ChunkPreprocessor(DataPreprocessor):
             Path to the file containing target individual IDs.
         src_ind_file : str
             Path to the file containing source individual IDs.
+        out_ind_file : str
+            Path to the file containing outgroup individual IDs.
         win_len : int
             Window length for generating genomic windows.
         win_step : int
             Step size for sliding windows across the genome.
-        w : float
-            Parameter w for feature vector computation.
-        y : list of float
-            List of y parameters for feature vector computation.
         output_file : str
             Path to the output file for storing feature vectors.
-        stat_type : str
-            Type of statistic to compute for feature vectors.
+        ploidy_config : PloidyConfig
+            Configuration specifying ploidy levels for each population involved in the analysis.
+        stat_config : StatConfig
+            Configuration of statistics to compute for feature vectors.
         anc_allele_file : str, optional
             Path to the ancestral allele file. If None, ancestral allele
             information is considered unavailable.
@@ -83,18 +85,20 @@ class ChunkPreprocessor(DataPreprocessor):
         self.ref_ind_file = ref_ind_file
         self.tgt_ind_file = tgt_ind_file
         self.src_ind_file = src_ind_file
+        self.out_ind_file = out_ind_file
         self.win_len = win_len
         self.win_step = win_step
+        self.ploidy_config = ploidy_config
         self.anc_allele_file = anc_allele_file
-        self.num_src = num_src
+        src_samples = parse_ind_file(src_ind_file)
+        self.num_src = len(src_samples.keys())
         anc_allele_available = anc_allele_file is not None
         self.feature_preprocessor = FeaturePreprocessor(
-            w=w,
-            y=y,
             output_file=output_file,
-            stat_type=stat_type,
+            stat_config=stat_config,
             anc_allele_available=anc_allele_available,
         )
@@ -127,8 +131,10 @@ class ChunkPreprocessor(DataPreprocessor):
             ref_ind_file=self.ref_ind_file,
             tgt_ind_file=self.tgt_ind_file,
             src_ind_file=self.src_ind_file,
+            out_ind_file=self.out_ind_file,
             win_len=self.win_len,
             win_step=self.win_step,
+            ploidy_config=self.ploidy_config,
             anc_allele_file=self.anc_allele_file,
             num_src=self.num_src,
         )

sai-pg 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

sai-pg 1.0.1 (py3-none-any.whl) → 1.1.0 (py3-none-any.whl)