gsMap: gsmap-1.73.2-py3-none-any.whl → gsmap-1.73.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/__init__.py +1 -1
- gsMap/config.py +2 -9
- gsMap/diagnosis.py +4 -3
- gsMap/generate_ldscore.py +115 -453
- gsMap/utils/generate_r2_matrix.py +455 -352
- gsMap/utils/regression_read.py +131 -157
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/METADATA +1 -1
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/RECORD +11 -11
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/WHEEL +0 -0
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/entry_points.txt +0 -0
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/licenses/LICENSE +0 -0
gsMap/generate_ldscore.py
CHANGED
@@ -13,12 +13,11 @@ from pathlib import Path
 import numpy as np
 import pandas as pd
 import pyranges as pr
-import zarr
 from scipy.sparse import csr_matrix
 from tqdm import trange

 from gsMap.config import GenerateLDScoreConfig
-from gsMap.utils.generate_r2_matrix import
+from gsMap.utils.generate_r2_matrix import PlinkBEDFile

 # Configure warning behavior more precisely
 warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")
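Taken together, the two import changes above summarize the refactor in this file: the `zarr` output path is dropped, and LD-score computation is routed through a single `PlinkBEDFile` class instead of the module-level helpers deleted below. A minimal sketch of the new call pattern, pieced together from calls that appear later in this diff (the PLINK bfile prefix is a placeholder, and the exact signatures and return types of `PlinkBEDFile`, `get_snps_by_maf`, and `get_ldscore` are assumptions, not verified against the 1.73.4 API):

```python
import numpy as np

from gsMap.utils.generate_r2_matrix import PlinkBEDFile

# One PlinkBEDFile per chromosome; the bfile prefix below is hypothetical
plink_bed = PlinkBEDFile("path/to/bfile.22")

# SNPs passing a 5% MAF filter (method name as used in this diff)
snps_maf_5 = plink_bed.get_snps_by_maf(0.05)

# LD scores for a unit annotation over the SNPs in the BIM table,
# mirroring the calls made in process_chromosome below
unit_annotation = np.ones((len(plink_bed.bim_df), 1))
w_ld = plink_bed.get_ldscore(
    annot_matrix=unit_annotation,
    ld_wind=1,
    ld_unit="CM",
)
```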
@@ -57,7 +56,8 @@ def load_gtf(
     gtf = gtf[gtf["Feature"] == "gene"]

     # Find common genes between GTF and marker scores
-    common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
+    # common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
+    common_gene = list(set(mk_score.index) & set(gtf.gene_name))
     logger.info(f"Found {len(common_gene)} common genes between GTF and marker scores")

     # Filter GTF and marker scores to common genes
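The behavioural note on this change: `np.intersect1d` returned a sorted, de-duplicated NumPy array, while the set intersection returns an unordered Python list. A self-contained comparison with toy gene names (independent of gsMap):

```python
import numpy as np
import pandas as pd

mk_index = pd.Index(["GeneC", "GeneA", "GeneB"])
gtf_genes = pd.Series(["GeneB", "GeneD", "GeneA"])

# Old behaviour: sorted, de-duplicated NumPy array
print(np.intersect1d(mk_index, gtf_genes))   # ['GeneA' 'GeneB']

# New behaviour: plain Python list, arbitrary order
print(list(set(mk_index) & set(gtf_genes)))  # e.g. ['GeneA', 'GeneB'] or ['GeneB', 'GeneA']
```

Since the surrounding code only logs the count and uses the result for membership filtering, the loss of ordering should not matter.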
@@ -69,6 +69,9 @@ def load_gtf(

     # Process the GTF (open window around gene coordinates)
     gtf_bed = gtf[["Chromosome", "Start", "End", "gene_name", "Strand"]].copy()
+    gtf_bed["Chromosome"] = gtf_bed["Chromosome"].apply(
+        lambda x: f"chr{x}" if not str(x).startswith("chr") else x
+    )
     gtf_bed.loc[:, "TSS"] = gtf_bed["Start"]
     gtf_bed.loc[:, "TED"] = gtf_bed["End"]

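The three added lines normalize chromosome names in the GTF so they carry a `chr` prefix, matching the `chr`-prefixed names used for the BIM ranges elsewhere in this module. The same transformation in isolation:

```python
import pandas as pd

chroms = pd.Series(["1", "chr2", 3, "chrX"])
normalized = chroms.apply(lambda x: f"chr{x}" if not str(x).startswith("chr") else x)
print(normalized.tolist())  # ['chr1', 'chr2', 'chr3', 'chrX']
```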
@@ -109,44 +112,6 @@ def load_marker_score(mk_score_file: str) -> pd.DataFrame:
     return mk_score


-def load_bim(bfile_root: str, chrom: int) -> tuple[pd.DataFrame, pr.PyRanges]:
-    """
-    Load PLINK BIM file and convert to a PyRanges object.
-
-    Parameters
-    ----------
-    bfile_root : str
-        Root path for PLINK bfiles
-    chrom : int
-        Chromosome number
-
-    Returns
-    -------
-    tuple
-        A tuple containing (bim_df, bim_pr) where:
-        - bim_df is a pandas DataFrame with BIM data
-        - bim_pr is a PyRanges object with BIM data
-    """
-    bim_file = f"{bfile_root}.{chrom}.bim"
-    logger.debug(f"Loading BIM file: {bim_file}")
-
-    bim = pd.read_csv(bim_file, sep="\t", header=None)
-    bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]
-
-    # Convert to PyRanges
-    bim_pr = bim.copy()
-    bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
-
-    # Adjust coordinates (BIM is 1-based, PyRanges uses 0-based)
-    bim_pr["End"] = bim_pr["Start"].copy()
-    bim_pr["Start"] = bim_pr["Start"] - 1
-
-    bim_pr = pr.PyRanges(bim_pr)
-    bim_pr.Chromosome = f"chr{chrom}"
-
-    return bim, bim_pr
-
-
 def overlaps_gtf_bim(gtf_pr: pr.PyRanges, bim_pr: pr.PyRanges) -> pd.DataFrame:
     """
     Find overlaps between GTF and BIM data, and select nearest gene for each SNP.
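Note that the BIM-to-PyRanges conversion done by the deleted `load_bim` does not disappear: the final hunk of this diff shows it coming from `plink_bed.convert_bim_to_pyrange` instead. The coordinate detail it handled (BIM positions are 1-based, PyRanges intervals are 0-based, half-open) can be reproduced with a standalone sketch that mirrors the deleted helper on toy data:

```python
import pandas as pd
import pyranges as pr

bim = pd.DataFrame(
    {
        "CHR": [22, 22],
        "SNP": ["rs1", "rs2"],
        "CM": [0.0, 0.1],
        "BP": [100, 250],
        "A1": ["A", "C"],
        "A2": ["G", "T"],
    }
)

bim_pr = bim.copy()
bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
bim_pr["End"] = bim_pr["Start"].copy()   # 1-based position becomes the interval end
bim_pr["Start"] = bim_pr["Start"] - 1    # shift to a 0-based start
bim_pr = pr.PyRanges(bim_pr)
bim_pr.Chromosome = "chr22"              # prefix to match the GTF ranges
print(bim_pr)
```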
@@ -176,282 +141,13 @@ def overlaps_gtf_bim(gtf_pr: pr.PyRanges, bim_pr: pr.PyRanges) -> pd.DataFrame:
     return nearest_genes


-def filter_snps_by_keep_snp(bim_df: pd.DataFrame, keep_snp_file: str) -> pd.DataFrame:
-    """
-    Filter BIM DataFrame to keep only SNPs in a provided list.
-
-    Parameters
-    ----------
-    bim_df : pd.DataFrame
-        DataFrame with BIM data
-    keep_snp_file : str
-        Path to a file with SNP IDs to keep
-
-    Returns
-    -------
-    pd.DataFrame
-        Filtered BIM DataFrame
-    """
-    # Read SNPs to keep
-    keep_snp = pd.read_csv(keep_snp_file, header=None)[0].to_list()
-
-    # Filter the BIM DataFrame
-    filtered_bim_df = bim_df[bim_df["SNP"].isin(keep_snp)]
-
-    logger.info(f"Kept {len(filtered_bim_df)} SNPs out of {len(bim_df)} after filtering")
-
-    return filtered_bim_df
-
-
-def get_snp_counts(config: GenerateLDScoreConfig) -> dict:
-    """
-    Count SNPs per chromosome and calculate start positions for zarr arrays.
-
-    Parameters
-    ----------
-    config : GenerateLDScoreConfig
-        Configuration object
-
-    Returns
-    -------
-    dict
-        Dictionary with SNP counts and start positions
-    """
-    snp_counts = {}
-    total_snp = 0
-
-    for chrom in range(1, 23):
-        bim_df, _ = load_bim(config.bfile_root, chrom)
-
-        if config.keep_snp_root:
-            keep_snp_file = f"{config.keep_snp_root}.{chrom}.snp"
-            filtered_bim_df = filter_snps_by_keep_snp(bim_df, keep_snp_file)
-        else:
-            filtered_bim_df = bim_df
-
-        snp_counts[chrom] = filtered_bim_df.shape[0]
-        total_snp += snp_counts[chrom]
-
-    snp_counts["total"] = total_snp
-
-    # Calculate cumulative SNP counts for zarr array indexing
-    chrom_snp_length_array = np.array([snp_counts[chrom] for chrom in range(1, 23)]).cumsum()
-    snp_counts["chrom_snp_start_point"] = [0] + chrom_snp_length_array.tolist()
-
-    return snp_counts
-
-
-def get_snp_pass_maf(bfile_root: str, chrom: int, maf_min: float = 0.05) -> list[str]:
-    """
-    Get SNPs that pass the minimum minor allele frequency (MAF) threshold.
-
-    Parameters
-    ----------
-    bfile_root : str
-        Root path for PLINK bfiles
-    chrom : int
-        Chromosome number
-    maf_min : float, optional
-        Minimum MAF threshold, by default 0.05
-
-    Returns
-    -------
-    list
-        List of SNP IDs that pass the MAF threshold
-    """
-    array_snps, array_indivs, geno_array = load_bfile(
-        bfile_chr_prefix=f"{bfile_root}.{chrom}", mafMin=maf_min
-    )
-
-    m = len(array_snps.IDList)
-    n = len(array_indivs.IDList)
-    logger.info(
-        f"Loading genotype data for {m} SNPs and {n} individuals from {bfile_root}.{chrom}"
-    )
-
-    # Filter SNPs by MAF
-    snp_pass_maf = array_snps.IDList.iloc[geno_array.kept_snps]
-    logger.info(f"After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain")
-
-    return snp_pass_maf.SNP.to_list()
-
-
-def get_ldscore(
-    bfile_root: str,
-    chrom: int,
-    annot_matrix: np.ndarray,
-    ld_wind: float,
-    ld_unit: str = "CM",
-    keep_snps_index: list[int] = None,
-) -> pd.DataFrame:
-    """
-    Calculate LD scores using PLINK data and an annotation matrix.
-
-    Parameters
-    ----------
-    bfile_root : str
-        Root path for PLINK bfiles
-    chrom : int
-        Chromosome number
-    annot_matrix : np.ndarray
-        Annotation matrix
-    ld_wind : float
-        LD window size
-    ld_unit : str, optional
-        Unit for the LD window, by default "CM"
-    keep_snps_index : list[int], optional
-        Indices of SNPs to keep, by default None
-
-    Returns
-    -------
-    pd.DataFrame
-        DataFrame with calculated LD scores
-    """
-    array_snps, array_indivs, geno_array = load_bfile(
-        bfile_chr_prefix=f"{bfile_root}.{chrom}", keep_snps=keep_snps_index
-    )
-
-    # Configure LD window based on specified unit
-    if ld_unit == "SNP":
-        max_dist = ld_wind
-        coords = np.array(range(geno_array.m))
-    elif ld_unit == "KB":
-        max_dist = ld_wind * 1000
-        coords = np.array(array_snps.df["BP"])[geno_array.kept_snps]
-    elif ld_unit == "CM":
-        max_dist = ld_wind
-        coords = np.array(array_snps.df["CM"])[geno_array.kept_snps]
-        # Check if the CM is all 0
-        if np.all(coords == 0):
-            logger.warning(
-                "All CM values are 0 in the BIM file. Using 1MB window size for LD score calculation."
-            )
-            max_dist = 1_000_000
-            coords = np.array(array_snps.df["BP"])[geno_array.kept_snps]
-    else:
-        raise ValueError(f"Invalid ld_wind_unit: {ld_unit}. Must be one of: SNP, KB, CM")
-
-    # Calculate blocks for LD computation
-    block_left = getBlockLefts(coords, max_dist)
-    assert block_left.sum() > 0, "Invalid window size, please check the ld_wind parameter."
-
-    # Calculate LD scores
-    ld_scores = pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix))
-
-    return ld_scores
-
-
-def calculate_ldscore_from_annotation(
-    snp_annotation_df: pd.DataFrame,
-    chrom: int,
-    bfile_root: str,
-    ld_wind: float = 1,
-    ld_unit: str = "CM",
-) -> pd.DataFrame:
-    """
-    Calculate LD scores from SNP annotation DataFrame.
-
-    Parameters
-    ----------
-    snp_annotation_df : pd.DataFrame
-        DataFrame with SNP annotations
-    chrom : int
-        Chromosome number
-    bfile_root : str
-        Root path for PLINK bfiles
-    ld_wind : float, optional
-        LD window size, by default 1
-    ld_unit : str, optional
-        Unit for the LD window, by default "CM"
-
-    Returns
-    -------
-    pd.DataFrame
-        DataFrame with calculated LD scores
-    """
-    # Calculate LD scores
-    snp_gene_weight_matrix = get_ldscore(
-        bfile_root, chrom, snp_annotation_df.values, ld_wind=ld_wind, ld_unit=ld_unit
-    )
-
-    # Set proper data types and indices
-    snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
-    snp_gene_weight_matrix.index = snp_annotation_df.index
-    snp_gene_weight_matrix.columns = snp_annotation_df.columns
-
-    return snp_gene_weight_matrix
-
-
-def calculate_ldscore_from_multiple_annotation(
-    snp_annotation_df_list: list[pd.DataFrame],
-    chrom: int,
-    bfile_root: str,
-    ld_wind: float = 1,
-    ld_unit: str = "CM",
-) -> list[pd.DataFrame]:
-    """
-    Calculate LD scores from multiple SNP annotation DataFrames.
-
-    Parameters
-    ----------
-    snp_annotation_df_list : list
-        List of DataFrames with SNP annotations
-    chrom : int
-        Chromosome number
-    bfile_root : str
-        Root path for PLINK bfiles
-    ld_wind : float, optional
-        LD window size, by default 1
-    ld_unit : str, optional
-        Unit for the LD window, by default "CM"
-
-    Returns
-    -------
-    list
-        List of DataFrames with calculated LD scores
-    """
-    # Combine annotations
-    combined_annotations = pd.concat(snp_annotation_df_list, axis=1).astype(np.float32, copy=False)
-
-    # Calculate LD scores
-    combined_ld_scores = get_ldscore(
-        bfile_root, chrom, combined_annotations.values, ld_wind=ld_wind, ld_unit=ld_unit
-    )
-
-    # Apply proper indices and columns
-    combined_ld_scores.index = combined_annotations.index
-    combined_ld_scores.columns = combined_annotations.columns
-
-    # Split back into separate DataFrames
-    annotation_lengths = [len(df.columns) for df in snp_annotation_df_list]
-    result_dataframes = []
-    start_col = 0
-
-    for length in annotation_lengths:
-        end_col = start_col + length
-        result_dataframes.append(combined_ld_scores.iloc[:, start_col:end_col])
-        start_col = end_col
-
-    return result_dataframes
-
-
 class LDScoreCalculator:
     """
     Class for calculating LD scores from gene specificity scores.
-
-    This class handles the assignment of gene specificity scores to SNPs
-    and the calculation of LD scores.
     """

     def __init__(self, config: GenerateLDScoreConfig):
-        """
-        Initialize LDScoreCalculator.
-
-        Parameters
-        ----------
-        config : GenerateLDScoreConfig
-            Configuration object
-        """
+        """Initialize LDScoreCalculator."""
         self.config = config
         self.validate_config()

@@ -466,9 +162,6 @@ class LDScoreCalculator:
         # Initialize enhancer data if provided
         self.enhancer_pr = self._initialize_enhancer() if config.enhancer_annotation_file else None

-        # Initialize zarr file if needed
-        self._initialize_zarr_if_needed()
-
     def validate_config(self):
         """Validate configuration parameters."""
         if not Path(self.config.mkscore_feather_path).exists():
@@ -519,33 +212,6 @@ class LDScoreCalculator:
         # Convert to PyRanges
         return pr.PyRanges(enhancer_df.reset_index())

-    def _initialize_zarr_if_needed(self):
-        """Initialize zarr file if zarr format is specified."""
-        if self.config.ldscore_save_format == "zarr":
-            chrom_snp_length_dict = get_snp_counts(self.config)
-            self.chrom_snp_start_point = chrom_snp_length_dict["chrom_snp_start_point"]
-
-            zarr_path = (
-                Path(self.config.ldscore_save_dir) / f"{self.config.sample_name}.ldscore.zarr"
-            )
-
-            if not zarr_path.exists():
-                self.zarr_file = zarr.open(
-                    zarr_path.as_posix(),
-                    mode="a",
-                    dtype=np.float16,
-                    chunks=self.config.zarr_chunk_size,
-                    shape=(chrom_snp_length_dict["total"], self.mk_score_common.shape[1]),
-                )
-                zarr_path.parent.mkdir(parents=True, exist_ok=True)
-
-                # Save metadata
-                self.zarr_file.attrs["spot_names"] = self.mk_score_common.columns.to_list()
-                self.zarr_file.attrs["chrom_snp_start_point"] = self.chrom_snp_start_point
-
-            else:
-                self.zarr_file = zarr.open(zarr_path.as_posix(), mode="a")
-
     def process_chromosome(self, chrom: int):
         """
         Process a single chromosome to calculate LD scores.
@@ -557,35 +223,42 @@ class LDScoreCalculator:
         """
         logger.info(f"Processing chromosome {chrom}")

-        #
-
+        # Initialize PlinkBEDFile once for this chromosome
+        plink_bed = PlinkBEDFile(f"{self.config.bfile_root}.{chrom}")
+
+        # Get SNPs passing MAF filter using built-in method
+        self.snp_pass_maf = plink_bed.get_snps_by_maf(0.05)

         # Get SNP-gene dummy pairs
-        self.snp_gene_pair_dummy = self._get_snp_gene_dummy(chrom)
+        self.snp_gene_pair_dummy = self._get_snp_gene_dummy(chrom, plink_bed)

         # Apply SNP filter if provided
         self._apply_snp_filter(chrom)

         # Process additional baseline annotations if provided
         if self.config.additional_baseline_annotation:
-            self._process_additional_baseline(chrom)
+            self._process_additional_baseline(chrom, plink_bed)
         else:
-            # Calculate SNP-gene weight matrix
-
-                self.snp_gene_pair_dummy,
-                chrom,
-                self.config.bfile_root,
+            # Calculate SNP-gene weight matrix using built-in methods
+            ld_scores = plink_bed.get_ldscore(
+                annot_matrix=self.snp_gene_pair_dummy.values,
                 ld_wind=self.config.ld_wind,
                 ld_unit=self.config.ld_unit,
             )

+            self.snp_gene_weight_matrix = pd.DataFrame(
+                ld_scores,
+                index=self.snp_gene_pair_dummy.index,
+                columns=self.snp_gene_pair_dummy.columns,
+            )
+
         # Apply SNP filter if needed
         if self.keep_snp_mask is not None:
             self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]

         # Generate w_ld file if keep_snp_root is provided
         if self.config.keep_snp_root:
-            self._generate_w_ld(chrom)
+            self._generate_w_ld(chrom, plink_bed)

         # Save pre-calculated SNP-gene weight matrix if requested
         self._save_snp_gene_weight_matrix_if_needed(chrom)
@@ -596,16 +269,16 @@ class LDScoreCalculator:

         # Calculate baseline LD scores
         logger.info(f"Calculating baseline LD scores for chr{chrom}")
-        self._calculate_baseline_ldscores(chrom)
+        self._calculate_baseline_ldscores(chrom, plink_bed)

         # Calculate LD scores for annotation
         logger.info(f"Calculating annotation LD scores for chr{chrom}")
-        self._calculate_annotation_ldscores(chrom)
+        self._calculate_annotation_ldscores(chrom, plink_bed)

         # Clear memory
         self._clear_memory()

-    def _generate_w_ld(self, chrom: int):
+    def _generate_w_ld(self, chrom: int, plink_bed):
         """
         Generate w_ld file for the chromosome using filtered SNPs.

@@ -613,6 +286,8 @@ class LDScoreCalculator:
         ----------
         chrom : int
             Chromosome number
+        plink_bed : PlinkBEDFile
+            Initialized PlinkBEDFile object
         """
         if not self.config.keep_snp_root:
             logger.info(
@@ -622,48 +297,38 @@ class LDScoreCalculator:

         logger.info(f"Generating w_ld for chr{chrom}")

-        # Get the indices of SNPs to keep based on the
-
+        # Get the indices of SNPs to keep based on the keep_snp
+        keep_snps_indices = plink_bed.bim_df[
+            plink_bed.bim_df.SNP.isin(self.snp_name)
+        ].index.tolist()

         # Create a simple unit annotation (all ones) for the filtered SNPs
-        unit_annotation = np.ones((len(
+        unit_annotation = np.ones((len(keep_snps_indices), 1))

-        # Calculate LD scores
-        w_ld_scores = get_ldscore(
-
-            chrom,
-            unit_annotation,
+        # Calculate LD scores
+        w_ld_scores = plink_bed.get_ldscore(
+            annot_matrix=unit_annotation,
             ld_wind=self.config.ld_wind,
             ld_unit=self.config.ld_unit,
-            keep_snps_index=
-        )
-
-        # Load the BIM file to get SNP information
-        bim_data = pd.read_csv(
-            f"{self.config.bfile_root}.{chrom}.bim",
-            sep="\t",
-            header=None,
-            names=["CHR", "SNP", "CM", "BP", "A1", "A2"],
+            keep_snps_index=keep_snps_indices,
         )

-        # Get SNP names for the kept indices
-        kept_snp_names = bim_data.iloc[keep_snps_index].SNP.tolist()
-
         # Create the w_ld DataFrame
+        bim_subset = plink_bed.bim_df.loc[keep_snps_indices]
         w_ld_df = pd.DataFrame(
             {
-                "SNP":
-                "L2": w_ld_scores.
-                "CHR":
-                "BP":
-                "CM":
+                "SNP": bim_subset.SNP,
+                "L2": w_ld_scores.flatten(),
+                "CHR": bim_subset.CHR,
+                "BP": bim_subset.BP,
+                "CM": bim_subset.CM,
             }
         )

         # Reorder columns
         w_ld_df = w_ld_df[["CHR", "SNP", "BP", "CM", "L2"]]

-        # Save to
+        # Save to file
         w_ld_dir = Path(self.config.ldscore_save_dir) / "w_ld"
         w_ld_dir.mkdir(parents=True, exist_ok=True)
         w_ld_file = w_ld_dir / f"weights.{chrom}.l2.ldscore.gz"
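The rewritten `_generate_w_ld` produces the same five-column weights table as before, but sources SNP metadata from `plink_bed.bim_df` instead of re-reading the `.bim` file from disk. A toy sketch of the expected table layout and file name; the gzip, tab-separated write is an assumption based on the `.l2.ldscore.gz` extension (the save call itself sits outside this hunk):

```python
import pandas as pd

w_ld_df = pd.DataFrame(
    {
        "CHR": [22, 22, 22],
        "SNP": ["rs1", "rs2", "rs3"],
        "BP": [100, 250, 400],
        "CM": [0.0, 0.1, 0.2],
        "L2": [1.7, 2.3, 1.1],  # one LD score per kept SNP
    }
)

# Same naming scheme as the diff: <ldscore_save_dir>/w_ld/weights.<chrom>.l2.ldscore.gz
w_ld_df.to_csv("weights.22.l2.ldscore.gz", sep="\t", index=False, compression="gzip")
```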
@@ -693,7 +358,7 @@ class LDScoreCalculator:
             logger.info(f"Using all {len(self.snp_name)} SNPs (no filter applied)")
             logger.warning("No keep_snp_root provided, all SNPs will be used to calculate w_ld.")

-    def _process_additional_baseline(self, chrom: int):
+    def _process_additional_baseline(self, chrom: int, plink_bed):
         """
         Process additional baseline annotations.

@@ -701,6 +366,8 @@ class LDScoreCalculator:
         ----------
         chrom : int
             Chromosome number
+        plink_bed : PlinkBEDFile
+            Initialized PlinkBEDFile object
         """
         # Load additional baseline annotations
         additional_baseline_path = Path(self.config.additional_baseline_annotation)
@@ -730,25 +397,44 @@ class LDScoreCalculator:
                 f"{missing_count} SNPs not found in additional baseline annotations. "
                 "Setting their values to 0."
             )
-
-
-
-
-
-
-
-        self.snp_gene_weight_matrix, additional_ldscore = (
-            calculate_ldscore_from_multiple_annotation(
-                [self.snp_gene_pair_dummy, additional_baseline_df],
-                chrom,
-                self.config.bfile_root,
-                ld_wind=self.config.ld_wind,
-                ld_unit=self.config.ld_unit,
-            )
+        additional_baseline_df = additional_baseline_df.reindex(
+            self.snp_gene_pair_dummy.index, fill_value=0
+        )
+
+        # Combine annotations into a single matrix
+        combined_annotations = pd.concat(
+            [self.snp_gene_pair_dummy, additional_baseline_df], axis=1
         )

-        #
-
+        # Calculate LD scores
+        ld_scores = plink_bed.get_ldscore(
+            annot_matrix=combined_annotations.values.astype(np.float32, copy=False),
+            ld_wind=self.config.ld_wind,
+            ld_unit=self.config.ld_unit,
+        )
+
+        # Split results
+        # total_cols = combined_annotations.shape[1]
+        gene_cols = self.snp_gene_pair_dummy.shape[1]
+        # baseline_cols = additional_baseline_df.shape[1]
+
+        # Create DataFrames with proper indices and columns
+        self.snp_gene_weight_matrix = pd.DataFrame(
+            ld_scores[:, :gene_cols],
+            index=combined_annotations.index,
+            columns=self.snp_gene_pair_dummy.columns,
+        )
+
+        additional_ldscore = pd.DataFrame(
+            ld_scores[:, gene_cols:],
+            index=combined_annotations.index,
+            columns=additional_baseline_df.columns,
+        )
+
+        # Filter by keep_snp_mask if specified
+        if self.keep_snp_mask is not None:
+            additional_ldscore = additional_ldscore[self.keep_snp_mask]
+            self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]

         # Save additional baseline LD scores
         ld_score_file = f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather"
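The `_process_additional_baseline` rewrite runs a single LD-score pass over the concatenated gene and baseline annotations and then splits the result by column count. The splitting logic in isolation, with a random matrix standing in for the `plink_bed.get_ldscore` output (the real call needs genotype data):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
snp_index = [f"rs{i}" for i in range(5)]

gene_dummy = pd.DataFrame(
    rng.integers(0, 2, size=(5, 3)).astype(np.float32),
    index=snp_index, columns=["GeneA", "GeneB", "Dummy"],
)
baseline = pd.DataFrame(
    rng.random((5, 2)).astype(np.float32),
    index=snp_index, columns=["base1", "base2"],
)

combined = pd.concat([gene_dummy, baseline], axis=1)

# Stand-in for: plink_bed.get_ldscore(annot_matrix=combined.values, ...)
ld_scores = rng.random(combined.shape)

gene_cols = gene_dummy.shape[1]
gene_ld = pd.DataFrame(ld_scores[:, :gene_cols], index=combined.index, columns=gene_dummy.columns)
baseline_ld = pd.DataFrame(ld_scores[:, gene_cols:], index=combined.index, columns=baseline.columns)

print(gene_ld.shape, baseline_ld.shape)  # (5, 3) (5, 2)
```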
@@ -793,7 +479,7 @@ class LDScoreCalculator:
         save_path = save_dir / f"{chrom}.snp_gene_weight_matrix.feather"
         self.snp_gene_weight_matrix.reset_index().to_feather(save_path)

-    def _calculate_baseline_ldscores(self, chrom: int):
+    def _calculate_baseline_ldscores(self, chrom: int, plink_bed):
         """
         Calculate and save baseline LD scores.

@@ -801,6 +487,8 @@ class LDScoreCalculator:
         ----------
         chrom : int
             Chromosome number
+        plink_bed : PlinkBEDFile
+            Initialized PlinkBEDFile object
         """
         # Create baseline scores
         baseline_mk_score = np.ones((self.snp_gene_pair_dummy.shape[1], 2))
@@ -818,7 +506,9 @@ class LDScoreCalculator:
         m_5_file = f"{self.config.ldscore_save_dir}/baseline/baseline.{chrom}.l2.M_5_50"

         # Calculate LD scores
-        ldscore_chunk = self._calculate_ldscore_from_weights(
+        ldscore_chunk = self._calculate_ldscore_from_weights(
+            baseline_df, plink_bed, drop_dummy_na=False
+        )

         # Save LD scores and M values
         self._save_ldscore_to_feather(
@@ -836,9 +526,9 @@ class LDScoreCalculator:

         # If keep_snp_root is not provided, use the first column of baseline ldscore as w_ld
         if not self.config.keep_snp_root:
-            self._save_baseline_as_w_ld(chrom, ldscore_chunk)
+            self._save_baseline_as_w_ld(chrom, ldscore_chunk, plink_bed)

-    def _save_baseline_as_w_ld(self, chrom: int, ldscore_chunk: np.ndarray):
+    def _save_baseline_as_w_ld(self, chrom: int, ldscore_chunk: np.ndarray, plink_bed):
         """
         Save the first column of baseline ldscore as w_ld.

@@ -848,6 +538,8 @@ class LDScoreCalculator:
             Chromosome number
         ldscore_chunk : np.ndarray
             Array with baseline LD scores
+        plink_bed : PlinkBEDFile
+            Initialized PlinkBEDFile object
         """
         logger.info(f"Using first column of baseline ldscore as w_ld for chr{chrom}")

@@ -861,23 +553,24 @@ class LDScoreCalculator:
         # Extract the first column
         w_ld_values = ldscore_chunk[:, 0]

-        # Create a DataFrame
-
-
-
-
-            names=["CHR", "SNP", "CM", "BP", "A1", "A2"],
+        # Create a DataFrame with SNP information from the BIM file
+        snp_indices = (
+            plink_bed.kept_snps
+            if hasattr(plink_bed, "kept_snps")
+            else np.arange(len(self.snp_name))
         )
+        bim_subset = plink_bed.bim_df.iloc[snp_indices]
+
         w_ld_df = pd.DataFrame(
             {
                 "SNP": self.snp_name,
                 "L2": w_ld_values,
+                "CHR": bim_subset.CHR.values[: len(self.snp_name)],  # Ensure length matches
+                "BP": bim_subset.BP.values[: len(self.snp_name)],
+                "CM": bim_subset.CM.values[: len(self.snp_name)],
             }
         )

-        # Add CHR, BP, and CM information
-        w_ld_df = w_ld_df.merge(bim_data[["SNP", "CHR", "BP", "CM"]], on="SNP", how="left")
-
         # Reorder columns
         w_ld_df = w_ld_df[["CHR", "SNP", "BP", "CM", "L2"]]

@@ -885,7 +578,7 @@ class LDScoreCalculator:

         logger.info(f"Saved w_ld for chr{chrom} to {w_ld_file}")

-    def _calculate_annotation_ldscores(self, chrom: int):
+    def _calculate_annotation_ldscores(self, chrom: int, plink_bed):
         """
         Calculate and save LD scores for spatial annotations.

@@ -893,6 +586,8 @@ class LDScoreCalculator:
         ----------
         chrom : int
             Chromosome number
+        plink_bed : PlinkBEDFile
+            Initialized PlinkBEDFile object
         """
         # Get marker scores for gene columns (excluding dummy NA column)
         mk_scores = self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]]
@@ -915,7 +610,7 @@ class LDScoreCalculator:
             m_5_file = f"{self.config.ldscore_save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50"

             # Calculate LD scores
-            ldscore_chunk = self._calculate_ldscore_from_weights(mk_score_chunk)
+            ldscore_chunk = self._calculate_ldscore_from_weights(mk_score_chunk, plink_bed)

             # Save LD scores based on format
             if self.config.ldscore_save_format == "feather":
@@ -924,12 +619,6 @@ class LDScoreCalculator:
                     column_names=mk_score_chunk.columns,
                     save_file_name=ld_score_file,
                 )
-            elif self.config.ldscore_save_format == "zarr":
-                self._save_ldscore_chunk_to_zarr(
-                    ldscore_chunk,
-                    chrom=chrom,
-                    start_col_index=i,
-                )
             else:
                 raise ValueError(f"Invalid ldscore_save_format: {self.config.ldscore_save_format}")

@@ -948,7 +637,7 @@ class LDScoreCalculator:
         gc.collect()

     def _calculate_ldscore_from_weights(
-        self, marker_scores: pd.DataFrame, drop_dummy_na: bool = True
+        self, marker_scores: pd.DataFrame, plink_bed, drop_dummy_na: bool = True
     ) -> np.ndarray:
         """
         Calculate LD scores using SNP-gene weight matrix.
@@ -957,6 +646,8 @@ class LDScoreCalculator:
         ----------
         marker_scores : pd.DataFrame
             DataFrame with marker scores
+        plink_bed : PlinkBEDFile
+            Initialized PlinkBEDFile object
         drop_dummy_na : bool, optional
             Whether to drop the dummy NA column, by default True

@@ -1009,37 +700,6 @@ class LDScoreCalculator:
         df.index.name = "SNP"
         df.reset_index().to_feather(save_file_name)

-    def _save_ldscore_chunk_to_zarr(
-        self, ldscore_data: np.ndarray, chrom: int, start_col_index: int
-    ):
-        """
-        Save LD scores to a zarr array.
-
-        Parameters
-        ----------
-        ldscore_data : np.ndarray
-            Array with LD scores
-        chrom : int
-            Chromosome number
-        start_col_index : int
-            Starting column index in the zarr array
-        """
-        # Convert to float16 for storage efficiency
-        ldscore_data = ldscore_data.astype(np.float16, copy=False)
-
-        # Handle numerical overflow
-        ldscore_data[np.isinf(ldscore_data)] = np.finfo(np.float16).max
-
-        # Get start and end indices for this chromosome
-        chrom_start = self.chrom_snp_start_point[chrom - 1]
-        chrom_end = self.chrom_snp_start_point[chrom]
-
-        # Save to zarr array
-        self.zarr_file[
-            chrom_start:chrom_end,
-            start_col_index : start_col_index + ldscore_data.shape[1],
-        ] = ldscore_data
-
     def _calculate_and_save_m_values(
         self,
         marker_scores: pd.DataFrame,
@@ -1084,7 +744,7 @@ class LDScoreCalculator:
         np.savetxt(m_file_path, m_values, delimiter="\t")
         np.savetxt(m_5_file_path, m_5_values, delimiter="\t")

-    def _get_snp_gene_dummy(self, chrom: int) -> pd.DataFrame:
+    def _get_snp_gene_dummy(self, chrom: int, plink_bed) -> pd.DataFrame:
         """
         Get dummy matrix for SNP-gene pairs.

@@ -1092,6 +752,7 @@ class LDScoreCalculator:
         ----------
         chrom : int
             Chromosome number
+        plink_bed : PlinkBEDFile

         Returns
         -------
@@ -1101,7 +762,8 @@ class LDScoreCalculator:
         logger.info(f"Creating SNP-gene mappings for chromosome {chrom}")

         # Load BIM file
-        bim
+        bim = plink_bed.bim_df
+        bim_pr = plink_bed.convert_bim_to_pyrange(bim)

         # Determine mapping strategy
         if self.config.gene_window_enhancer_priority in ["gene_window_first", "enhancer_first"]: