PyPI - sai-pg - Versions diffs - 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

sai-pg 1.0.0-py3-none-any.whl → 1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71) hide show

sai/__init__.py +2 -0
sai/__main__.py +6 -3
sai/configs/__init__.py +24 -0
sai/configs/global_config.py +83 -0
sai/configs/ploidy_config.py +94 -0
sai/configs/pop_config.py +82 -0
sai/configs/stat_config.py +220 -0
sai/{utils/generators → generators}/chunk_generator.py +2 -8
sai/{utils/generators → generators}/window_generator.py +82 -37
sai/{utils/multiprocessing → multiprocessing}/mp_manager.py +2 -2
sai/{utils/multiprocessing → multiprocessing}/mp_pool.py +2 -2
sai/parsers/outlier_parser.py +4 -3
sai/parsers/score_parser.py +8 -119
sai/{utils/preprocessors → preprocessors}/chunk_preprocessor.py +21 -15
sai/preprocessors/feature_preprocessor.py +236 -0
sai/registries/__init__.py +22 -0
sai/registries/generic_registry.py +89 -0
sai/registries/stat_registry.py +30 -0
sai/sai.py +124 -220
sai/stats/__init__.py +11 -0
sai/stats/danc_statistic.py +83 -0
sai/stats/dd_statistic.py +77 -0
sai/stats/df_statistic.py +84 -0
sai/stats/dplus_statistic.py +86 -0
sai/stats/fd_statistic.py +92 -0
sai/stats/generic_statistic.py +93 -0
sai/stats/q_statistic.py +104 -0
sai/stats/stat_utils.py +259 -0
sai/stats/u_statistic.py +99 -0
sai/utils/utils.py +220 -143
{sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/METADATA +3 -14
sai_pg-1.1.0.dist-info/RECORD +70 -0
{sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/WHEEL +1 -1
sai_pg-1.1.0.dist-info/top_level.txt +2 -0
tests/configs/test_global_config.py +163 -0
tests/configs/test_ploidy_config.py +93 -0
tests/configs/test_pop_config.py +90 -0
tests/configs/test_stat_config.py +171 -0
tests/generators/test_chunk_generator.py +51 -0
tests/generators/test_window_generator.py +164 -0
tests/multiprocessing/test_mp_manager.py +92 -0
tests/multiprocessing/test_mp_pool.py +79 -0
tests/parsers/test_argument_validation.py +133 -0
tests/parsers/test_outlier_parser.py +53 -0
tests/parsers/test_score_parser.py +63 -0
tests/preprocessors/test_chunk_preprocessor.py +79 -0
tests/preprocessors/test_feature_preprocessor.py +223 -0
tests/registries/test_registries.py +74 -0
tests/stats/test_danc_statistic.py +51 -0
tests/stats/test_dd_statistic.py +45 -0
tests/stats/test_df_statistic.py +73 -0
tests/stats/test_dplus_statistic.py +79 -0
tests/stats/test_fd_statistic.py +68 -0
tests/stats/test_q_statistic.py +268 -0
tests/stats/test_stat_utils.py +354 -0
tests/stats/test_u_statistic.py +233 -0
tests/test___main__.py +51 -0
tests/test_sai.py +102 -0
tests/utils/test_utils.py +511 -0
sai/parsers/plot_parser.py +0 -152
sai/stats/features.py +0 -302
sai/utils/preprocessors/feature_preprocessor.py +0 -211
sai_pg-1.0.0.dist-info/RECORD +0 -30
sai_pg-1.0.0.dist-info/top_level.txt +0 -1
/sai/{utils/generators → generators}/__init__.py +0 -0
/sai/{utils/generators → generators}/data_generator.py +0 -0
/sai/{utils/multiprocessing → multiprocessing}/__init__.py +0 -0
/sai/{utils/preprocessors → preprocessors}/__init__.py +0 -0
/sai/{utils/preprocessors → preprocessors}/data_preprocessor.py +0 -0
{sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/entry_points.txt +0 -0
{sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/licenses/LICENSE +0 -0

sai/stats/u_statistic.py ADDED Viewed

@@ -0,0 +1,99 @@
+# Copyright 2025 Xin Huang
+#
+# GNU General Public License v3.0
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, please see
+#
+#    https://www.gnu.org/licenses/gpl-3.0.en.html
+import numpy as np
+from typing import Dict, Any
+from sai.registries.stat_registry import STAT_REGISTRY
+from sai.stats import GenericStatistic
+from sai.stats.stat_utils import compute_matching_loci
+@STAT_REGISTRY.register("U")
+class UStatistic(GenericStatistic):
+    """
+    Class for computing the number of uniquely shared sites between the target and source populations (Racimo et al. 2017. Mol Biol Evol),
+    conditional on allele frequency patterns in the reference and source populations.
+    The class is registered in `STAT_REGISTRY` under the key "U".
+    """
+    # Name reported in the 'name' field of the dictionary returned by `compute`.
+    STAT_NAME = "U"
+    def compute(self, **kwargs) -> Dict[str, Any]:
+        """
+        Computes the count of genetic loci that meet specified allele frequency conditions
+        across reference, target, and multiple source genotypes.
+        The genotypes and ploidies are read from the instance (`self.ref_gts`, `self.tgt_gts`,
+        `self.src_gts_list`, `self.ref_ploidy`, `self.tgt_ploidy`, `self.src_ploidy_list`);
+        all other inputs are passed as keyword arguments, and every one of them is required.
+        The reference/source conditions are evaluated by `compute_matching_loci`; this method
+        then additionally requires the target frequency to be greater than `x`.
+        Parameters
+        ----------
+        pos : np.ndarray
+            A 1D numpy array where each element represents the genomic position.
+        w : float
+            Threshold for the allele frequency in `ref_gts`. Only loci with frequencies less than `w` are counted.
+            Must be within the range [0, 1].
+        x : float
+            Threshold for the allele frequency in `tgt_gts`. Only loci with frequencies greater than `x` are counted.
+            Must be within the range [0, 1].
+        y_list : list[float]
+            List of exact allele frequency thresholds for each source population in `src_gts_list`.
+            Must be within the range [0, 1] and have the same length as `src_gts_list`.
+        anc_allele_available : bool
+            If True, checks only for matches with `y` (assuming `1` represents the derived allele).
+            If False, checks both matches with `y` and `1 - y`, taking the major allele in the source as the reference.
+        Returns
+        -------
+        dict
+            A dictionary containing:
+            - 'name' : str
+                The name of the statistic ("U").
+            - 'value' : int
+                The count of loci that meet all specified frequency conditions.
+            - 'cdd_pos' : np.ndarray
+                A 1D numpy array containing the genomic positions of the loci that meet the conditions.
+        Raises
+        ------
+        ValueError
+            If any of `pos`, `w`, `x`, `y_list`, or `anc_allele_available` is missing from `kwargs`.
+        """
+        required_keys = ["pos", "w", "x", "y_list", "anc_allele_available"]
+        # Fail early with one message that lists every missing keyword argument.
+        if missing := [k for k in required_keys if k not in kwargs]:
+            raise ValueError(f"Missing required argument(s): {', '.join(missing)}")
+        pos = kwargs["pos"]
+        w = kwargs["w"]
+        x = kwargs["x"]
+        y_list = kwargs["y_list"]
+        anc_allele_available = kwargs["anc_allele_available"]
+        # Ploidies in the order reference, target, then each source, i.e. the same order
+        # as the genotype arguments passed to compute_matching_loci below.
+        ploidy = [self.ref_ploidy, self.tgt_ploidy] + self.src_ploidy_list
+        # ref_freq is returned by the helper but is not used in this method.
+        ref_freq, tgt_freq, condition = compute_matching_loci(
+            self.ref_gts,
+            self.tgt_gts,
+            self.src_gts_list,
+            w,
+            y_list,
+            ploidy,
+            anc_allele_available,
+        )
+        # Apply final conditions: the target frequency must be strictly greater than x.
+        condition &= tgt_freq > x
+        loci_indices = np.where(condition)[0]
+        loci_positions = pos[loci_indices]
+        count = loci_indices.size
+        # Return count of matching loci together with their positions (key "cdd_pos").
+        return {"name": self.STAT_NAME, "value": count, "cdd_pos": loci_positions}

sai/utils/utils.py CHANGED Viewed

@@ -19,11 +19,13 @@
 import allel
+import warnings
 import numpy as np
 import pandas as pd
-from typing import Optional, Union
 from natsort import natsorted
+from typing import Optional, Union
 from sai.utils.genomic_dataclasses import ChromosomeData
+from sai.configs import PloidyConfig
 def parse_ind_file(filename: str) -> dict[str, list[str]]:
@@ -37,7 +39,7 @@ def parse_ind_file(filename: str) -> dict[str, list[str]]:
     Returns
     -------
-    samples : dict of str to list of str
+    samples : dict[str, list[str]]
         A dictionary where the keys represent categories, and the values are lists of samples
         associated with those categories.
@@ -77,11 +79,12 @@ def read_geno_data(
     vcf: str,
     ind_samples: dict[str, list[str]],
     chr_name: str,
+    ploidy: int = 2,
     start: int = None,
     end: int = None,
     anc_allele_file: Optional[str] = None,
     filter_missing: bool = True,
-) -> tuple[ChromosomeData, list[str], int]:
+) -> dict[str, ChromosomeData]:
     """
     Read genotype data from a VCF file efficiently for a specified chromosome.
@@ -93,9 +96,11 @@ def read_geno_data(
         A dictionary where keys are categories (e.g., different sample groups), and values are lists of sample names.
     chr_name : str
         The name of the chromosome to read.
-    start: int, optional
+    ploidy : int, optional
+        Ploidy level of the genome.
+    start : int, optional
         The starting position (1-based, inclusive) on the chromosome. Default: None.
-    end: int, optional
+    end : int, optional
         The ending position (1-based, inclusive) on the chromosome. Default: None.
     anc_allele_file : str, optional
         The name of the BED file containing ancestral allele information, or None if not provided.
@@ -104,12 +109,7 @@ def read_geno_data(
     Returns
     -------
-    chrom_data: ChromosomeData
-        A ChromosomeData instance for the specified chromosome in the VCF.
-    samples: list
-        A list of samples in the data.
-    ploidy: int
-        Ploidy level of the organism.
+    A dictionary mapping each population name to its ChromosomeData.
     """
     try:
         # Load all samples from the VCF file
@@ -132,21 +132,21 @@ def read_geno_data(
             ],
             alt_number=1,
             samples=all_samples,
+            numbers={"GT": ploidy},
             region=region,  # Specify the chromosome region
             tabix=None,
         )
     except Exception as e:
         raise ValueError(f"Failed to read VCF file {vcf} from {region}: {e}") from e
-    # Convert genotype data to a more efficient GenotypeArray
     if vcf_data is None:
-        return None, all_samples, None
+        return None
     gt = allel.GenotypeArray(vcf_data.get("calldata/GT"))
     pos = vcf_data.get("variants/POS")
     ref = vcf_data.get("variants/REF")
     alt = vcf_data.get("variants/ALT")
-    ploidy = gt.shape[2]
+    sample_names = list(vcf_data.get("samples"))
     if gt is None or pos is None or ref is None or alt is None:
         raise ValueError("Invalid VCF file: Missing essential genotype data fields.")
@@ -162,27 +162,33 @@ def read_geno_data(
     else:
         anc_alleles = None
-    sample_indices = [all_samples.index(s) for s in all_samples]
+    chrom_data_dict = {}
-    chrom_data = ChromosomeData(
-        POS=pos, REF=ref, ALT=alt, GT=gt.take(sample_indices, axis=1)
-    )
+    for pop, pop_samples in ind_samples.items():
+        indices = [sample_names.index(s) for s in pop_samples]
+        pop_gt = gt.take(indices, axis=1)
-    # Remove missing data if specified
-    if filter_missing:
-        non_missing_index = chrom_data.GT.count_missing(axis=1) == 0
-        num_missing = len(non_missing_index) - np.sum(non_missing_index)
-        if num_missing != 0:
-            print(
-                f"Found {num_missing} variants with missing genotypes, removing them ..."
-            )
-        chrom_data = filter_geno_data(chrom_data, non_missing_index)
+        chrom_data = ChromosomeData(
+            POS=pos.copy(), REF=ref.copy(), ALT=alt.copy(), GT=pop_gt
+        )
-    # Check and incorporate ancestral alleles if the file is provided
-    if anc_alleles:
-        chrom_data = check_anc_allele(chrom_data, anc_alleles, chr_name)
+        missing_mask = chrom_data.GT.count_missing(axis=1) != 0
-    return chrom_data, vcf_data.get("samples"), ploidy
+        if filter_missing:
+            if np.any(missing_mask):
+                chrom_data = filter_geno_data(chrom_data, ~missing_mask)
+        else:
+            if np.any(missing_mask):
+                raise ValueError(
+                    "Missing data is found. Please remove variants with missing data or enable filtering."
+                )
+        if anc_alleles:
+            chrom_data = check_anc_allele(chrom_data, anc_alleles, chr_name)
+        chrom_data_dict[pop] = chrom_data
+    return chrom_data_dict
 def filter_geno_data(
@@ -214,9 +220,11 @@ def filter_geno_data(
 def read_data(
     vcf_file: str,
     chr_name: str,
+    ploidy_config: PloidyConfig,
     ref_ind_file: Optional[str],
     tgt_ind_file: Optional[str],
     src_ind_file: Optional[str],
+    out_ind_file: Optional[str],
     anc_allele_file: Optional[str],
     start: int = None,
     end: int = None,
@@ -224,15 +232,18 @@ def read_data(
     filter_ref: bool = True,
     filter_tgt: bool = True,
     filter_src: bool = False,
+    filter_out: bool = False,
     filter_missing: bool = True,
-) -> tuple[
-    Optional[dict[str, dict[str, ChromosomeData]]],
-    Optional[dict[str, list[str]]],
-    Optional[dict[str, dict[str, ChromosomeData]]],
-    Optional[dict[str, list[str]]],
-    Optional[dict[str, dict[str, ChromosomeData]]],
-    Optional[dict[str, list[str]]],
-    Optional[int],
+) -> dict[
+    str,
+    tuple[
+        Optional[dict[str, dict[str, ChromosomeData]]],
+        Optional[dict[str, list[str]]],
+        Optional[dict[str, dict[str, ChromosomeData]]],
+        Optional[dict[str, list[str]]],
+        Optional[dict[str, dict[str, ChromosomeData]]],
+        Optional[dict[str, list[str]]],
+    ],
 ]:
     """
     Helper function for reading data from reference, target, and source populations.
@@ -241,14 +252,18 @@ def read_data(
     ----------
     vcf_file : str
         Name of the VCF file containing genotype data.
-    chr_name: str
+    chr_name : str
         Name of the chromosome to read.
+    ploidy_config : PloidyConfig
+        Configuration specifying ploidy levels for each population involved in the analysis.
     ref_ind_file : str or None
         File with reference population sample information. None if not provided.
     tgt_ind_file : str or None
         File with target population sample information. None if not provided.
     src_ind_file : str or None
         File with source population sample information. None if not provided.
+    out_ind_file : str or None
+        File with outgroup population sample information. None if not provided.
     anc_allele_file : str or None
         File with ancestral allele information. None if not provided.
     start: int, optional
@@ -263,11 +278,21 @@ def read_data(
         Whether to filter fixed variants for target data. Default: True.
     filter_src : bool, optional
         Whether to filter fixed variants for source data. Default: False.
+    filter_out : bool, optional
+        Whether to filter fixed variants for outgroup data. Default: False.
     filter_missing : bool, optional
         Whether to filter out missing data. Default: True.
     Returns
     -------
+    result : dict
+        {
+            "ref": (ref_data, ref_samples),
+            "tgt": (tgt_data, tgt_samples),
+            "src": (src_data, src_samples),
+            "outgroup": (out_data, out_samples)  # optional
+        }
     ref_data : dict or None
         Genotype data from reference populations, organized by category and chromosome.
     ref_samples : dict or None
@@ -280,12 +305,14 @@ def read_data(
         Genotype data from source populations, organized by category and chromosome.
     src_samples : dict or None
         Sample information from source populations.
-    ploidy: int or None
-        Ploidy level of the organism.
+    out_data : dict or None
+        Genotype data from outgroup populations, organized by category and chromosome.
+    out_samples : dict or None
+        Sample information from outgroup populations.
     Notes
     -----
-    The `ref_data`, `tgt_data`, and `src_data` are organized as nested dictionaries where:
+    The `ref_data`, `tgt_data`, `src_data`, `out_data` are organized as nested dictionaries where:
         - The outermost keys represent different populations or sample categories.
         - The second-level keys represent different chromosomes.
@@ -298,111 +325,40 @@ def read_data(
     This organization allows easy access and manipulation of genotype data by category and chromosome,
     enabling flexible processing across different populations and chromosomal regions.
     """
-    ref_data = ref_samples = tgt_data = tgt_samples = src_data = src_samples = None
-    # Parse sample information
-    if ref_ind_file:
-        ref_samples = parse_ind_file(ref_ind_file)
-    if tgt_ind_file:
-        tgt_samples = parse_ind_file(tgt_ind_file)
-    if src_ind_file:
-        src_samples = parse_ind_file(src_ind_file)
-    # Combine all samples for a single VCF read
-    all_samples = {}
-    if ref_samples:
-        all_samples.update(ref_samples)
-    if tgt_samples:
-        all_samples.update(tgt_samples)
-    if src_samples:
-        all_samples.update(src_samples)
-    try:
-        # Read VCF data
-        geno_data, all_samples, ploidy = read_geno_data(
-            vcf=vcf_file,
-            ind_samples=all_samples,
+    group_params = [
+        ("ref", ref_ind_file, filter_ref),
+        ("tgt", tgt_ind_file, filter_tgt),
+        ("src", src_ind_file, filter_src),
+        ("outgroup", out_ind_file, filter_out),
+    ]
+    results = {}
+    for group, ind_file, filter_flag in group_params:
+        if ind_file is None:
+            results[group] = (None, None)
+            continue
+        if group == "outgroup" and group not in ploidy_config.root:
+            results[group] = (None, None)
+            continue
+        data, samples = _load_population_data(
+            vcf_file=vcf_file,
             chr_name=chr_name,
+            sample_file=ind_file,
+            anc_allele_file=anc_allele_file,
             start=start,
             end=end,
-            anc_allele_file=anc_allele_file,
+            is_phased=is_phased,
+            filter_flag=filter_flag,
             filter_missing=filter_missing,
+            ploidy_config=ploidy_config,
+            group=group,
         )
-    except Exception as e:
-        raise ValueError(f"Failed to read VCF data: {e}")
-    if geno_data is None:
-        return None, ref_samples, None, tgt_samples, None, src_samples, None
-    # Separate reference, target, and source data
-    ref_data = extract_group_data(geno_data, all_samples, ref_samples)
-    tgt_data = extract_group_data(geno_data, all_samples, tgt_samples)
-    src_data = extract_group_data(geno_data, all_samples, src_samples)
-    # Apply fixed variant filtering conditionally
-    if filter_ref and ref_data and ref_samples:
-        ref_data = filter_fixed_variants(ref_data, ref_samples)
-    if filter_tgt and tgt_data and tgt_samples:
-        tgt_data = filter_fixed_variants(tgt_data, tgt_samples)
-    if filter_src and src_data and src_samples:
-        src_data = filter_fixed_variants(src_data, src_samples)
-    # Adjust genotypes based on phased/unphased requirement
-    reshape_genotypes(ref_data, is_phased)
-    reshape_genotypes(tgt_data, is_phased)
-    reshape_genotypes(src_data, is_phased)
-    return ref_data, ref_samples, tgt_data, tgt_samples, src_data, src_samples, ploidy
-def extract_group_data(
-    geno_data: dict[str, ChromosomeData],
-    all_samples: list[str],
-    sample_groups: Optional[dict[str, list[str]]] = None,
-) -> Optional[dict[str, ChromosomeData]]:
-    """
-    Extract genotype data from geno_data based on the sample groups.
-    Parameters
-    ----------
-    geno_data : dict of str to ChromosomeData
-        Contains genotype data, where each value is a ChromosomeData instance.
-    all_samples: list
-        A list of all sample names in the dataset.
-    sample_groups : dict of str to list of str, optional
-        Contains sample group information, where each key is a group name and the value is a list of samples.
-        If None, the function returns None.
-    Returns
-    -------
-    extracted_data : dict or None
-        Genotype data organized by sample group, or None if no sample groups are provided.
-        The structure is as follows:
-        - Keys represent sample group names.
-        - Values are ChromosomeData instances, filtered to include only the samples in the specified group.
-    """
-    if sample_groups is None:
-        return None
+        results[group] = (data, samples)
-    sample_indices = {sample: idx for idx, sample in enumerate(all_samples)}
-    extracted_data = {}
-    for group, samples in sample_groups.items():
-        indices = [sample_indices[s] for s in samples if s in sample_indices]
-        # Extract ChromosomeData for the selected samples in each group
-        extracted_data[group] = ChromosomeData(
-            GT=geno_data.GT[:, indices, :],
-            POS=geno_data.POS,
-            REF=geno_data.REF,
-            ALT=geno_data.ALT,
-        )
-    return extracted_data
+    return results
 def filter_fixed_variants(
@@ -608,6 +564,7 @@ def split_genome(
     pos: np.ndarray,
     window_size: int,
     step_size: int,
+    start: int = None,
 ) -> list[tuple]:
     """
     Creates sliding windows along the genome based on variant positions.
@@ -620,6 +577,9 @@ def split_genome(
         Length of each sliding window.
     step_size : int
         Step size of the sliding windows.
+    start: int, optional
+        Minimum starting coordinate for the first window. The first window will start
+        no smaller than this value. Default is None.
     Returns
     -------
@@ -644,7 +604,9 @@ def split_genome(
     window_positions = []
     win_start = (pos[0] + step_size) // step_size * step_size - window_size + 1
-    win_start = max(win_start, 1)
+    if start is None:
+        start = 1
+    win_start = max(win_start, start)
     # Create windows based on step size and window size
     while win_start <= pos[-1]:
@@ -687,3 +649,118 @@ def natsorted_df(df: pd.DataFrame) -> pd.DataFrame:
     )
     return df.loc[sorted_indices].reset_index(drop=True)
+def _load_population_data(
+    vcf_file: str,
+    chr_name: str,
+    sample_file: Optional[str],
+    anc_allele_file: Optional[str],
+    start: Optional[int],
+    end: Optional[int],
+    is_phased: bool,
+    filter_flag: bool,
+    filter_missing: bool,
+    ploidy_config: PloidyConfig,
+    group: str,  # e.g., "ref", "tgt", "src", "outgroup"
+) -> tuple[
+    Optional[dict[str, dict[str, ChromosomeData]]], Optional[dict[str, list[str]]]
+]:
+    """
+    Loads genotype data and sample information for a population group (e.g., reference) from a VCF file,
+    handling multiple populations with potentially different ploidy.
+    Each population listed in the sample file is read with its own `read_geno_data` call,
+    using the ploidy configured for it in `ploidy_config.root[group]`.
+    Parameters
+    ----------
+    vcf_file : str
+        Path to the VCF file containing variant data.
+    chr_name : str
+        Chromosome name to extract from the VCF.
+    sample_file : str or None
+        Path to the file containing sample IDs grouped by population.
+        If None, no data is loaded and (None, None) is returned.
+    anc_allele_file : str or None
+        Path to the BED file with ancestral allele annotations.
+    start : int or None
+        Start position on the chromosome (1-based, inclusive). If None, starts at the beginning.
+    end : int or None
+        End position on the chromosome (1-based, inclusive). If None, reads to the end.
+    is_phased : bool
+        Whether the genotypes are phased. Forwarded to `reshape_genotypes`.
+    filter_flag : bool
+        Whether to remove variants fixed in all samples of each population
+        (applies `filter_fixed_variants` to each population's data).
+    filter_missing : bool
+        Whether to filter out variants with missing genotypes. Forwarded to `read_geno_data`
+        for each population separately.
+    ploidy_config : PloidyConfig
+        Configuration containing ploidy for all populations in all groups.
+    group : str
+        The group label (e.g., "ref", "tgt", "src", "outgroup") used to extract populations from ploidy_config.
+    Returns
+    -------
+    data : dict or None
+        Dictionary keyed by population name, holding the data loaded for that population.
+        None if no population produced any data.
+        NOTE(review): the return annotation describes population -> chromosome -> ChromosomeData,
+        but the values stored here are `geno_data[population]`, and the local variable is
+        annotated `dict[str, ChromosomeData]`; confirm which shape callers expect.
+    samples : dict[str, list[str]] or None
+        Dictionary mapping population -> list of sample IDs, as parsed from `sample_file`.
+        Populations skipped below because they lack a ploidy entry are still included.
+    Raises
+    ------
+    ValueError
+        If `group` is not a key of `ploidy_config.root`, if a population configured for
+        `group` is absent from the sample file, or if reading the VCF data fails.
+    Warns
+    -----
+    RuntimeWarning
+        If a population appears in the sample file but has no ploidy configured for
+        `group`; that population is skipped.
+    """
+    if sample_file is None:
+        return None, None
+    samples = parse_ind_file(sample_file)
+    if group not in ploidy_config.root:
+        raise ValueError(f"Ploidy configuration missing group '{group}'.")
+    group_ploidies = ploidy_config.root[group]
+    # Ensure all populations in ploidy_config[group] are in sample_file
+    for population in group_ploidies:
+        if population not in samples:
+            raise ValueError(
+                f"Population '{population}' in ploidy_config[{group}] not found in sample file: {sample_file}"
+            )
+    data: dict[str, ChromosomeData] = {}
+    for population, sample_list in samples.items():
+        # The reverse mismatch (sampled but no configured ploidy) is only a warning.
+        if population not in group_ploidies:
+            warnings.warn(
+                f"Population '{population}' found in sample file but not in ploidy_config[{group}]; skipping.",
+                RuntimeWarning,
+            )
+            continue
+        ploidy = group_ploidies[population]
+        # One VCF read per population, so each read can use that population's ploidy.
+        try:
+            geno_data = read_geno_data(
+                vcf=vcf_file,
+                ind_samples={population: sample_list},
+                chr_name=chr_name,
+                start=start,
+                end=end,
+                anc_allele_file=anc_allele_file,
+                filter_missing=filter_missing,
+                ploidy=ploidy,
+            )
+        except Exception as e:
+            # NOTE(review): re-raised without `from e`, unlike the re-raise in read_geno_data;
+            # the original error is only kept as implicit exception context.
+            raise ValueError(
+                f"Failed to read VCF data for {sample_file}, population '{population}': {e}"
+            )
+        # read_geno_data returns None when nothing was read; skip this population.
+        if geno_data is None:
+            continue
+        if filter_flag:
+            geno_data = filter_fixed_variants(geno_data, {population: sample_list})
+        # Return value is ignored; presumably reshapes geno_data in place (TODO confirm).
+        reshape_genotypes(geno_data, is_phased)
+        data[population] = geno_data[
+            population
+        ]  # geno_data: dict[population -> ChromosomeData]
+    # No population yielded data: report None for data but still return the parsed samples.
+    if not data:
+        return None, samples
+    return data, samples

{sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/METADATA RENAMED Viewed

@@ -1,11 +1,10 @@
 Metadata-Version: 2.4
 Name: sai-pg
-Version: 1.0.0
+Version: 1.1.0
 Summary: A Python Package for Statistics for Adaptive Introgression
-Home-page: https://github.com/xin-huang/sai
-Author: Xin Huang
-Author-email: xinhuang.res@gmail.com
+Author-email: Xin Huang <xinhuang.res@gmail.com>
 License: GPLv3
+Project-URL: Homepage, https://github.com/xin-huang/sai
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3.9
@@ -19,17 +18,7 @@ Requires-Dist: pandas==2.2.1
 Requires-Dist: pysam==0.23.0
 Requires-Dist: scikit-allel==1.3.7
 Requires-Dist: scipy==1.12.0
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license
 Dynamic: license-file
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
 # SAI

sai-pg 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

sai-pg 1.0.0-py3-none-any.whl → 1.1.0-py3-none-any.whl