PyPI - gsMap - Versions diffs - 1.73.3__py3-none-any.whl → 1.73.5__py3-none-any.whl - Mend

gsMap 1.73.3__py3-none-any.whl → 1.73.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

gsMap/__init__.py +2 -2
gsMap/config.py +2 -9
gsMap/diagnosis.py +4 -3
gsMap/generate_ldscore.py +110 -454
gsMap/utils/generate_r2_matrix.py +453 -352
gsMap/utils/regression_read.py +131 -157
{gsmap-1.73.3.dist-info → gsmap-1.73.5.dist-info}/METADATA +2 -2
{gsmap-1.73.3.dist-info → gsmap-1.73.5.dist-info}/RECORD +11 -11
{gsmap-1.73.3.dist-info → gsmap-1.73.5.dist-info}/WHEEL +0 -0
{gsmap-1.73.3.dist-info → gsmap-1.73.5.dist-info}/entry_points.txt +0 -0
{gsmap-1.73.3.dist-info → gsmap-1.73.5.dist-info}/licenses/LICENSE +0 -0

gsMap/utils/generate_r2_matrix.py CHANGED Viewed

@@ -1,67 +1,27 @@
+"""
+Module for reading and processing PLINK genotype data and calculating LD scores.
+Note:
+This code is adapted and modified from:
+https://github.com/bulik/ldsc/blob/master/ldsc/ldscore.py
+"""
+import logging
 import bitarray as ba
 import numpy as np
 import pandas as pd
+import pyranges as pr
 from tqdm import tqdm
-# Define the reading functions
-def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
-    # -
-    class IDContainer:
-        """
-        A class to read data from a file, store it as a DataFrame, and provide a method for a left outer join operation.
-        """
-        def __init__(self, fname):
-            """
-            Initialize the IDContainer with the given filename and reading options.
-            """
-            self.usecols = usecols
-            self.colnames = colnames
-            self.keepcol = keepcol
-            self.fname_end = fname_end
-            self.header = header
-            self.read(fname)
-            self.n = len(self.df)
-        # -
-        def read(self, fname):
-            """
-            Read data from the given file and store it as a DataFrame.
-            """
-            end = self.fname_end
-            if end and not fname.endswith(end):
-                raise ValueError(f"{end} filename must end in {end}")
-            self.df = pd.read_csv(
-                fname,
-                header=self.header,
-                usecols=self.usecols,
-                sep=r"\s+",
-            )
-            if self.colnames:
-                self.df.columns = self.colnames
-            if self.keepcol is not None:
-                self.IDList = self.df.iloc[:, [self.keepcol]].astype("object")
-    return IDContainer
+# Configure logger
+logger = logging.getLogger("gsMap.utils.plink_ldscore_tool")
 def getBlockLefts(coords, max_dist):
     """
-    Converts coordinates + max block length to the a list of coordinates of the leftmost
+    Converts coordinates + max block length to a list of coordinates of the leftmost
     SNPs to be included in blocks.
-    Parameters
-    ----------
-    coords : array
-        Array of coordinates. Must be sorted.
-    max_dist : float
-        Maximum distance between SNPs included in the same window.
-    Returns
-    -------
-    block_left : 1D np.ndarray with same length as block_left
-        block_left[j] :=  min{k | dist(j, k) < max_dist}.
     """
     M = len(coords)
     j = 0
@@ -77,16 +37,6 @@ def getBlockLefts(coords, max_dist):
 def block_left_to_right(block_left):
     """
     Converts block lefts to block rights.
-    Parameters
-    ----------
-    block_left : array
-        Array of block lefts.
-    Returns
-    -------
-    block_right : 1D np.ndarray with same length as block_left
-        block_right[j] := max {k | block_left[k] <= j}
     """
     M = len(block_left)
     j = 0
@@ -99,223 +49,149 @@ def block_left_to_right(block_left):
     return block_right
-class GenotypeArrayInMemory:
+class PlinkBEDFile:
     """
-    Parent class for various classes containing interfaces for files with genotype
-    matrices, e.g., plink .bed files, etc
+    Interface for Plink .bed format for reading and processing genotype data.
     """
-    def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
-        self.m = len(snp_list.IDList)
-        self.n = n
-        self.keep_snps = keep_snps
-        self.keep_indivs = keep_indivs
-        self.df = np.array(snp_list.df[["CHR", "SNP", "BP", "CM"]])
-        self.colnames = ["CHR", "SNP", "BP", "CM"]
-        self.mafMin = mafMin if mafMin is not None else 0
-        self._currentSNP = 0
-        (self.nru, self.geno) = self.__read__(fname, self.m, n)
-        # filter individuals
-        if keep_indivs is not None:
-            keep_indivs = np.array(keep_indivs, dtype="int")
-            if np.any(keep_indivs > self.n):
-                raise ValueError("keep_indivs indices out of bounds")
-            # -
-            (self.geno, self.m, self.n) = self.__filter_indivs__(
-                self.geno, keep_indivs, self.m, self.n
-            )
-            # -
-            if self.n > 0:
-                print(f"After filtering, {self.n} individuals remain")
-            else:
-                raise ValueError("After filtering, no individuals remain")
-        # -
-        # filter SNPs
-        if keep_snps is not None:
-            keep_snps = np.array(keep_snps, dtype="int")
-            if np.any(keep_snps > self.m):  # if keep_snps is None, this returns False
-                raise ValueError("keep_snps indices out of bounds")
-        # -
-        (self.geno, self.m, self.n, self.kept_snps, self.freq) = self.__filter_snps_maf__(
-            self.geno, self.m, self.n, self.mafMin, keep_snps
+    def __init__(self, bfile_prefix):
+        """
+        Initialize the PlinkBEDFile from a PLINK file prefix.
+        Parameters
+        ----------
+        bfile_prefix : str
+            PLINK file prefix (without .bed/.bim/.fam extension)
+        """
+        # Initialize bitarray for bed code mapping
+        self._bedcode = {
+            2: ba.bitarray("11"),
+            9: ba.bitarray("10"),
+            1: ba.bitarray("01"),
+            0: ba.bitarray("00"),
+        }
+        # Load BIM file
+        self.bim_df = self.load_bim(f"{bfile_prefix}.bim")
+        # Load FAM file
+        self.fam_df = self.load_fam(f"{bfile_prefix}.fam")
+        # Set up initial parameters
+        self.m_original = len(self.bim_df)
+        self.n_original = len(self.fam_df)
+        # Read the bed file
+        logger.info(f"Loading Plink genotype data from {bfile_prefix}.bed")
+        (self.nru_original, self.geno_original) = self._read(
+            f"{bfile_prefix}.bed", self.m_original, self.n_original
         )
-        # -
-        if self.m > 0:
-            print(f"After filtering, {self.m} SNPs remain")
+        # Pre-calculate MAF for all SNPs
+        logger.info("Calculating MAF and QC for all SNPs")
+        self.all_snp_info = self._calculate_all_snp_info()
+        # Filter out invalid SNPs
+        valid_mask = self.all_snp_info["valid_snp"]
+        if num_invalid := np.sum(~valid_mask):
+            logger.warning(
+                f"Filtering out {num_invalid} bad quality SNPs: {self.bim_df.loc[~valid_mask, 'SNP'].tolist()}"
+            )
         else:
-            raise ValueError("After filtering, no SNPs remain")
-        # -
-        self.df = self.df[self.kept_snps, :]
-        self.maf = np.minimum(self.freq, np.ones(self.m) - self.freq)
-        self.sqrtpq = np.sqrt(self.freq * (np.ones(self.m) - self.freq))
-        self.df = np.c_[self.df, self.maf]
-        self.colnames.append("MAF")
-    # -
-    def __read__(self, fname, m, n):
-        raise NotImplementedError
-    def __restart__(self):
-        self._currentSNP = 0
+            logger.info("All SNPs passed the basic quality check")
-    # -
-    def __filter_indivs__(geno, keep_indivs, m, n):
-        raise NotImplementedError
+        # Create new genotype data with only the valid SNPs
+        new_geno = ba.bitarray()
+        for j in np.arange(self.m_original)[valid_mask]:
+            new_geno += self.geno_original[
+                2 * self.nru_original * j : 2 * self.nru_original * (j + 1)
+            ]
-    # -
-    def __filter_maf_(geno, m, n, maf):
-        raise NotImplementedError
+        # Update original data to only include valid SNPs
+        self.geno_original = new_geno
-    # -
-    def ldScoreVarBlocks(self, block_left, c, annot=None):
-        """Computes an unbiased estimate of L2(j) for j=1,..,M."""
+        # Only keep valid SNPs
+        self.bim_df = self.bim_df.loc[valid_mask].reset_index(drop=True)
+        self.m_original = len(self.bim_df)
+        self.kept_snps = np.arange(self.m_original)
-        def func(x):
-            return self.__l2_unbiased__(x, self.n)
+        # Initialize current state variables
+        self._currentSNP = 0
+        self.m = self.m_original
+        self.n = self.n_original
+        self.nru = self.nru_original
+        self.geno = self.geno_original.copy()
-        snp_getter = self.nextSNPs
-        return self.__corSumVarBlocks__(block_left, c, func, snp_getter, annot)
+        # Update frequency info based on valid SNPs
+        self.freq = self.all_snp_info["freq"][valid_mask]
+        self.maf = np.minimum(self.freq, 1 - self.freq)
+        self.sqrtpq = np.sqrt(self.freq * (1 - self.freq))
-    # -
-    # In small samples, the observed r^2 tends to be higher than the true r^2 due to sampling variability.
-    # The bias correction term (1-sq) / denom adjusts for this bias by subtracting a small value that depends on the sample size and the observed r^2.
-    def __l2_unbiased__(self, x, n):
-        denom = n - 2 if n > 2 else n  # allow n<2 for testing purposes
-        sq = np.square(x)
-        return sq - (1 - sq) / denom
+        # Add MAF to the BIM dataframe
+        self.bim_df["MAF"] = self.maf
+        logger.info(f"Loaded genotype data with {self.m} SNPs and {self.n} individuals")
-    # -
-    # Methods for calculating sums of Pearson correlation coefficients (i.e.,ld-score)
-    # c stands for the chunk size (default = 50)
-    def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None):
+    @staticmethod
+    def load_bim(bim_file):
         """
+        Load a BIM file into a pandas DataFrame.
         Parameters
         ----------
-        block_left : np.ndarray with shape (M, )
-            block_left[i] = index of leftmost SNP included in LD Score of SNP i.
-            if c > 1, then only entries that are multiples of c are examined, and it is
-            assumed that block_left[a*c+i] = block_left[a*c], except at
-            the beginning of the chromosome where the 0th SNP is included in the window.
-        c : int
-            Chunk size.
-        func : function
-            Function to be applied to the genotype correlation matrix. Before dotting with
-            annot. Examples: for biased L2, np.square. For biased L4,
-            lambda x: np.square(np.square(x)). For L1, lambda x: x.
-        snp_getter : function(int)
-            The method to be used to get the next SNPs
-        annot: numpy array with shape (m,n_a)
-            SNP annotations.
+        bim_file : str
+            Path to the BIM file
         Returns
         -------
-        cor_sum : np.ndarray with shape (M, num_annots)
-            Estimates.
+        pd.DataFrame
+            DataFrame containing BIM data
         """
-        m, n = self.m, self.n
-        block_sizes = np.array(np.arange(m) - block_left)
-        block_sizes = np.ceil(block_sizes / c) * c
-        if annot is None:
-            annot = np.ones((m, 1))
-        else:
-            annot_m = annot.shape[0]
-            if annot_m != self.m:
-                raise ValueError("Incorrect number of SNPs in annot")
-        # -
-        n_a = annot.shape[1]  # number of annotations
-        cor_sum = np.zeros((m, n_a))
-        # b = index of first SNP for which SNP 0 is not included in LD Score
-        b = np.nonzero(block_left > 0)
-        if np.any(b):
-            b = b[0][0]
-        else:
-            b = m
-        b = int(np.ceil(b / c) * c)  # round up to a multiple of c
-        if b > m:
-            c = 1
-            b = m
+        df = pd.read_csv(
+            bim_file, sep="\t", header=None, names=["CHR", "SNP", "CM", "BP", "A1", "A2"]
+        )
+        return df
-        l_A = 0  # l_A := index of leftmost SNP in matrix A
-        A = snp_getter(b)
-        rfuncAB = np.zeros((b, c))
-        rfuncBB = np.zeros((c, c))
-        # chunk inside of block
-        for l_B in np.arange(0, b, c):  # l_B := index of leftmost SNP in matrix B
-            B = A[:, l_B : l_B + c]
-            # ld matrix
-            np.dot(A.T, B / n, out=rfuncAB)
-            # ld matrix square
-            rfuncAB = func(rfuncAB)
-            cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
+    @staticmethod
+    def convert_bim_to_pyrange(bim_df) -> pr.PyRanges:
+        bim_pr = bim_df.copy()
+        bim_pr.drop(columns=["MAF"], inplace=True)
+        bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
+        bim_pr.Chromosome = "chr" + bim_pr["Chromosome"].astype(str)
-        # chunk to right of block
-        b0 = b
-        md = int(c * np.floor(m / c))
-        end = md + 1 if md != m else md
-        for l_B in tqdm(np.arange(b0, end, c), desc="Compute SNP Gene Weight"):
-            # check if the annot matrix is all zeros for this block + chunk
-            # this happens w/ sparse categories (i.e., pathways)
-            # update the block
-            old_b = b
-            b = int(block_sizes[l_B])
-            if l_B > b0 and b > 0:
-                # block_size can't increase more than c
-                # block_size can't be less than c unless it is zero
-                # both of these things make sense
-                A = np.hstack((A[:, old_b - b + c : old_b], B))
-                l_A += old_b - b + c
-            elif l_B == b0 and b > 0:
-                A = A[:, b0 - b : b0]
-                l_A = b0 - b
-            elif b == 0:  # no SNPs to left in window, e.g., after a sequence gap
-                A = np.array(()).reshape((n, 0))
-                l_A = l_B
-            if l_B == md:
-                c = m - md
-                rfuncAB = np.zeros((b, c))
-                rfuncBB = np.zeros((c, c))
-            if b != old_b:
-                rfuncAB = np.zeros((b, c))
-            # -
-            B = snp_getter(c)
-            p1 = np.all(annot[l_A : l_A + b, :] == 0)
-            p2 = np.all(annot[l_B : l_B + c, :] == 0)
-            if p1 and p2:
-                continue
-            # -
-            np.dot(A.T, B / n, out=rfuncAB)
-            rfuncAB = func(rfuncAB)
-            cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
-            cor_sum[l_B : l_B + c, :] += np.dot(annot[l_A : l_A + b, :].T, rfuncAB).T
-            np.dot(B.T, B / n, out=rfuncBB)
-            rfuncBB = func(rfuncBB)
-            cor_sum[l_B : l_B + c, :] += np.dot(rfuncBB, annot[l_B : l_B + c, :])
-        # -
-        return cor_sum
+        # Adjust coordinates (BIM is 1-based, PyRanges uses 0-based)
+        bim_pr["End"] = bim_pr["Start"].copy()
+        bim_pr["Start"] = bim_pr["Start"] - 1
+        bim_pr = pr.PyRanges(bim_pr)
-class PlinkBEDFile(GenotypeArrayInMemory):
-    """
-    Interface for Plink .bed format
-    """
+        return bim_pr
-    def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=None):
-        self._bedcode = {
-            2: ba.bitarray("11"),
-            9: ba.bitarray("10"),
-            1: ba.bitarray("01"),
-            0: ba.bitarray("00"),
-        }
-        # -
-        GenotypeArrayInMemory.__init__(
-            self, fname, n, snp_list, keep_snps=keep_snps, keep_indivs=keep_indivs, mafMin=mafMin
-        )
+    @staticmethod
+    def load_fam(fam_file):
+        """
+        Read the individual IDs from a PLINK FAM file.
+        The file is parsed as whitespace-separated text without a header row;
+        only the second column is read and it is named ``IID``.
+        Parameters
+        ----------
+        fam_file : str
+            Path to the FAM file
+        Returns
+        -------
+        pd.DataFrame
+            Single-column DataFrame (``IID``) with one row per line of the file
+        """
+        return pd.read_csv(
+            fam_file,
+            sep=r"\s+",
+            header=None,
+            usecols=[1],
+            names=["IID"],
+        )
-    # -
-    def __read__(self, fname, m, n):
+    def _read(self, fname, m, n):
+        """
+        Read the bed file and return the genotype data.
+        """
         if not fname.endswith(".bed"):
             raise ValueError(".bed filename must end in .bed")
-        # -
         fh = open(fname, "rb")
         magicNumber = ba.bitarray(endian="little")
         magicNumber.fromfile(fh, 2)
@@ -323,29 +199,150 @@ class PlinkBEDFile(GenotypeArrayInMemory):
         bedMode.fromfile(fh, 1)
         e = (4 - n % 4) if n % 4 != 0 else 0
         nru = n + e
-        self.nru = nru
-        # check magic number
+        # Check magic number
         if magicNumber != ba.bitarray("0011011011011000"):
             raise OSError("Magic number from Plink .bed file not recognized")
-        # -
         if bedMode != ba.bitarray("10000000"):
             raise OSError("Plink .bed file must be in default SNP-major mode")
-        # check file length
-        self.geno = ba.bitarray(endian="little")
-        self.geno.fromfile(fh)
-        self.__test_length__(self.geno, self.m, self.nru)
-        return (self.nru, self.geno)
-    # -
-    def __test_length__(self, geno, m, nru):
+        # Check file length
+        geno = ba.bitarray(endian="little")
+        geno.fromfile(fh)
+        self._test_length(geno, m, nru)
+        return (nru, geno)
+    def _test_length(self, geno, m, nru):
+        """
+        Test if the genotype data has the expected length.
+        """
         exp_len = 2 * m * nru
         real_len = len(geno)
         if real_len != exp_len:
             s = "Plink .bed file has {n1} bits, expected {n2}"
             raise OSError(s.format(n1=real_len, n2=exp_len))
-    # -
-    def __filter_indivs__(self, geno, keep_indivs, m, n):
+    def _calculate_all_snp_info(self):
+        """
+        Pre-calculate allele frequency and a basic quality flag for all SNPs.
+        Each SNP occupies ``2 * nru`` consecutive bits of ``self.geno_original``
+        (two bits per individual slot). For one SNP, with A the even-position
+        bits and B the odd-position bits:
+        a := A.count()       = missing ct + hom major ct
+        b := B.count()       = het ct + hom major ct
+        c := (A & B).count() = hom major ct
+        which implies
+        # of individuals with nonmissing genotype = n - a + c
+        major allele ct = b + c
+        major allele frequency = (b + c) / (2 * (n - a + c))
+        het ct + missing ct = a + b - 2 * c
+        Returns
+        -------
+        dict
+            Dictionary with three arrays of length ``self.m_original``:
+            ``freq`` (major allele frequency, 0 when every genotype is missing),
+            ``het_miss_count`` (het ct + missing ct) and ``valid_snp`` (bool,
+            True when ``het_miss_count < n``).
+        """
+        nru = self.nru_original
+        n = self.n_original
+        m = self.m_original
+        geno = self.geno_original
+        snp_info = {
+            "freq": np.zeros(m),  # Allele frequencies
+            "het_miss_count": np.zeros(m),  # Count of het or missing genotypes
+            "valid_snp": np.zeros(m, dtype=bool),  # Whether SNP passes basic criteria
+        }
+        # For each SNP, calculate statistics
+        for j in range(m):
+            # Bits belonging to SNP j
+            z = geno[2 * nru * j : 2 * nru * (j + 1)]
+            A = z[0::2]
+            a = A.count()
+            B = z[1::2]
+            b = B.count()
+            c = (A & B).count()
+            major_ct = b + c  # number of copies of the major allele
+            n_nomiss = n - a + c  # number of individuals with nonmissing genotypes
+            # Guard against division by zero when all genotypes are missing
+            f = major_ct / (2 * n_nomiss) if n_nomiss > 0 else 0
+            het_miss_ct = a + b - 2 * c  # count of individuals that are het or missing
+            snp_info["freq"][j] = f
+            snp_info["het_miss_count"][j] = het_miss_ct
+            # A SNP whose genotypes are all het or missing fails the check
+            snp_info["valid_snp"][j] = het_miss_ct < n  # Basic validity check
+        return snp_info
+    def apply_filters(self, keep_snps=None, keep_indivs=None, mafMin=None):
+        """
+        Apply filters to the genotype data without reloading the bed file.
+        Every call starts again from the genotypes stored at load time, so
+        filters from earlier calls do not accumulate.
+        Parameters
+        ----------
+        keep_snps : array-like, optional
+            Indices of SNPs to keep, in the range 0 .. m_original - 1.
+        keep_indivs : array-like, optional
+            Indices of individuals to keep, in the range 0 .. n - 1.
+        mafMin : float, optional
+            Minimum minor allele frequency; only SNPs with MAF > mafMin are kept.
+        Returns
+        -------
+        self
+            Returns self for method chaining.
+        Raises
+        ------
+        ValueError
+            If keep_snps or keep_indivs contains an out-of-bounds index, or if
+            no individuals remain after filtering.
+        """
+        # Reset to original state first
+        self.geno = self.geno_original.copy()
+        self.m = self.m_original
+        self.n = self.n_original
+        self.nru = self.nru_original
+        self._currentSNP = 0
+        # Frequencies aligned with the m_original SNPs stored in geno_original.
+        # all_snp_info can still hold entries for SNPs that failed the basic
+        # quality check; drop those so positions match kept_snps indices.
+        freq_all = self.all_snp_info["freq"]
+        if len(freq_all) != self.m_original:
+            freq_all = freq_all[self.all_snp_info["valid_snp"]]
+        # Initialize with all SNPs
+        kept_snps = np.arange(self.m_original)
+        # Apply MAF filter using pre-calculated values
+        if mafMin is not None and mafMin > 0:
+            # Build the mask from the full-length frequencies: self.maf may have
+            # been shortened by an earlier call and would not line up here.
+            maf_all = np.minimum(freq_all, 1 - freq_all)
+            kept_snps = kept_snps[maf_all > mafMin]
+            logger.info(f"After MAF filtering (>{mafMin}), {len(kept_snps)} SNPs remain")
+        # Apply SNP filter if specified
+        if keep_snps is not None:
+            keep_snps = np.array(keep_snps, dtype="int")
+            # Valid indices are 0 .. m_original - 1
+            if np.any((keep_snps < 0) | (keep_snps >= self.m_original)):
+                raise ValueError("keep_snps indices out of bounds")
+            # Intersect with current kept_snps
+            kept_snps = np.intersect1d(kept_snps, keep_snps)
+            logger.info(f"After keep_snps filtering, {len(kept_snps)} SNPs remain")
+        # Filter SNPs in the genotype data
+        if len(kept_snps) < self.m_original:
+            # Create new genotype data with only the kept SNPs
+            new_geno = ba.bitarray()
+            for j in kept_snps:
+                new_geno += self.geno_original[2 * self.nru * j : 2 * self.nru * (j + 1)]
+            self.geno = new_geno
+            self.m = len(kept_snps)
+        # Filter individuals if specified
+        if keep_indivs is not None:
+            keep_indivs = np.array(keep_indivs, dtype="int")
+            # Valid indices are 0 .. n - 1
+            if np.any((keep_indivs < 0) | (keep_indivs >= self.n)):
+                raise ValueError("keep_indivs indices out of bounds")
+            (self.geno, self.m, self.n) = self._filter_indivs(
+                self.geno, keep_indivs, self.m, self.n
+            )
+            if self.n > 0:
+                logger.info(f"After filtering, {self.n} individuals remain")
+            else:
+                raise ValueError("After filtering, no individuals remain")
+        # Update kept_snps and other attributes.
+        # NOTE: frequencies are the ones pre-calculated at load time; they are
+        # not recomputed after individuals are filtered.
+        self.kept_snps = kept_snps
+        self.freq = freq_all[kept_snps]
+        self.maf = np.minimum(self.freq, 1 - self.freq)
+        self.sqrtpq = np.sqrt(self.freq * (1 - self.freq))
+        return self
+    def _filter_indivs(self, geno, keep_indivs, m, n):
+        """
+        Filter individuals based on the keep_indivs parameter.
+        """
         n_new = len(keep_indivs)
         e = (4 - n_new % 4) if n_new % 4 != 0 else 0
         nru_new = n_new + e
@@ -358,95 +355,118 @@ class PlinkBEDFile(GenotypeArrayInMemory):
         self.nru = nru_new
         return (z, m, n_new)
-    # -
-    def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
+    def get_snps_by_maf(self, mafMin):
         """
-        Credit to Chris Chang and the Plink2 developers for this algorithm
-        Modified from plink_filter.c
-        https://github.com/chrchang/plink-ng/blob/master/plink_filter.c
-        Genotypes are read forwards (since we are cheating and using endian="little")
-        A := (genotype) & 1010...
-        B := (genotype) & 0101...
-        C := (A >> 1) & B
-        Then
-        a := A.count() = missing ct + hom major ct
-        b := B.count() = het ct + hom major ct
-        c := C.count() = hom major ct
-        Which implies that
-        missing ct = a - c
-        # of indivs with nonmissing genotype = n - a + c
-        major allele ct = b + c
-        major allele frequency = (b+c)/(2*(n-a+c))
-        het ct + missing ct = a + b - 2*c
-        Why does bitarray not have >> ????
+        Get the list of SNPs that pass the MAF threshold.
+        Parameters
+        ----------
+        mafMin : float
+            Minimum MAF threshold; a SNP passes when its MAF is strictly greater
+        Returns
+        -------
+        list
+            List of SNP IDs that pass the MAF threshold
         """
-        nru = self.nru
-        m_poly = 0
-        y = ba.bitarray()
-        if keep_snps is None:
-            keep_snps = range(m)
-        kept_snps = []
-        freq = []
-        for e, j in enumerate(keep_snps):
-            z = geno[2 * nru * j : 2 * nru * (j + 1)]
-            A = z[0::2]
-            a = A.count()
-            B = z[1::2]
-            b = B.count()
-            c = (A & B).count()
-            major_ct = b + c  # number of copies of the major allele
-            n_nomiss = n - a + c  # number of individuals with nonmissing genotypes
-            f = major_ct / (2 * n_nomiss) if n_nomiss > 0 else 0
-            het_miss_ct = a + b - 2 * c  # remove SNPs that are only either het or missing
-            if np.minimum(f, 1 - f) > mafMin and het_miss_ct < n:
-                freq.append(f)
-                y += z
-                m_poly += 1
-                kept_snps.append(j)
-        # -
-        return (y, m_poly, n, kept_snps, freq)
-    # -
-    def nextSNPs(self, b, minorRef=None):
+        # NOTE(review): self.maf must have one entry per row of self.bim_df for
+        # this boolean mask to line up - confirm no SNP filter is active here.
+        maf_mask = self.maf > mafMin
+        # Get SNP names from the BIM dataframe
+        snp_pass_maf = self.bim_df.loc[maf_mask, "SNP"].tolist()
+        # The threshold is interpolated directly; a stray literal "f" used to
+        # be printed in front of it.
+        logger.info(f"{len(snp_pass_maf)} SNPs with MAF > {mafMin}")
+        return snp_pass_maf
+    def get_ldscore(self, annot_matrix=None, ld_wind=1.0, ld_unit="CM", keep_snps_index=None):
         """
-        Unpacks the binary array of genotypes and returns an n x b matrix of floats of
-        normalized genotypes for the next b SNPs, where n := number of samples.
+        Calculate LD scores using an annotation matrix.
         Parameters
         ----------
-        b : int
-            Number of SNPs to return.
-        minorRef: bool, default None
-            Should we flip reference alleles so that the minor allele is the reference?
-            (This is useful for computing l1 w.r.t. minor allele).
+        annot_matrix : np.ndarray, optional
+            Annotation matrix. If None, uses a matrix of all ones.
+        ld_wind : float, optional
+            LD window size, by default 1.0
+        ld_unit : str, optional
+            Unit for the LD window, by default "CM"
+        keep_snps_index : list[int], optional
+            Indices of SNPs to keep, by default None
         Returns
         -------
-        X : np.array with dtype float64 with shape (n, b), where n := number of samples
-            Matrix of genotypes normalized to mean zero and variance one. If minorRef is
-            not None, then the minor allele will be the positive allele (i.e., two copies
-            of the minor allele --> a positive number).
+        np.ndarray
+            Array with calculated LD scores
+        """
+        # Apply filters if needed
+        if keep_snps_index is not None:
+            original_kept_snps = self.kept_snps.copy()
+            self.apply_filters(keep_snps=keep_snps_index)
+        # Configure LD window based on specified unit
+        if ld_unit == "SNP":
+            max_dist = ld_wind
+            coords = np.array(range(self.m))
+        elif ld_unit == "KB":
+            max_dist = ld_wind * 1000
+            coords = np.array(self.bim_df.loc[self.kept_snps, "BP"])
+        elif ld_unit == "CM":
+            max_dist = ld_wind
+            coords = np.array(self.bim_df.loc[self.kept_snps, "CM"])
+            # Check if the CM is all 0
+            if np.all(coords == 0):
+                logger.warning(
+                    "All CM values are 0. Using 1MB window size for LD score calculation."
+                )
+                max_dist = 1_000_000
+                coords = np.array(self.bim_df.loc[self.kept_snps, "BP"])
+        else:
+            raise ValueError(f"Invalid ld_wind_unit: {ld_unit}. Must be one of: SNP, KB, CM")
+        # Calculate blocks for LD computation
+        block_left = getBlockLefts(coords, max_dist)
+        assert block_left.sum() > 0, "Invalid window size, please check the ld_wind parameter."
+        # Calculate LD scores
+        ld_scores = self.ldScoreVarBlocks(block_left, 100, annot=annot_matrix)
+        # Restore original state if filters were applied
+        if keep_snps_index is not None:
+            self.apply_filters(keep_snps=original_kept_snps)
+        return ld_scores
+    def restart(self):
+        """
+        Reset the current SNP index to 0.
+        The next call to ``nextSNPs`` then reads from the first SNP of
+        ``self.geno`` again.
+        """
+        self._currentSNP = 0
+    def nextSNPs(self, b, minorRef=None):
+        """
+        Unpacks the binary array of genotypes and returns an n x b matrix of floats of
+        normalized genotypes for the next b SNPs.
         """
-        # -
         try:
             b = int(b)
             if b <= 0:
                 raise ValueError("b must be > 0")
         except TypeError as e:
             raise TypeError("b must be an integer") from e
-        # -
         if self._currentSNP + b > self.m:
             s = "{b} SNPs requested, {k} SNPs remain"
             raise ValueError(s.format(b=b, k=(self.m - self._currentSNP)))
-        # -
         c = self._currentSNP
         n = self.n
         nru = self.nru
         slice = self.geno[2 * c * nru : 2 * (c + b) * nru]
-        X = np.array(slice.decode(self._bedcode), dtype="float64").reshape((b, nru)).T
+        X = np.array(slice.decode(self._bedcode), dtype="float32").reshape((b, nru)).T
         X = X[0:n, :]
-        Y = np.zeros(X.shape)
-        # normalize the SNPs and impute the missing one with the mean
+        Y = np.zeros(X.shape, dtype="float32")
+        # Normalize the SNPs and impute the missing ones with the mean
         for j in range(0, b):
             newsnp = X[:, j]
             ii = newsnp != 9
@@ -455,35 +475,116 @@ class PlinkBEDFile(GenotypeArrayInMemory):
             denom = np.std(newsnp)
             if denom == 0:
                 denom = 1
-            # -
             if minorRef is not None and self.freq[self._currentSNP + j] > 0.5:
                 denom = denom * -1
-            # -
             Y[:, j] = (newsnp - avg) / denom
-        # -
         self._currentSNP += b
         return Y
+    def _l2_unbiased(self, x, n):
+        """
+        Return the bias-corrected squared correlation for ``x``.
+        In small samples the observed r^2 tends to be higher than the true r^2;
+        the term ``(1 - r^2) / denom`` is subtracted to adjust for this, where
+        ``denom`` is ``n - 2`` when ``n > 2`` and ``n`` otherwise.
+        """
+        r_squared = np.square(x)
+        # allow n<2 for testing purposes
+        if n > 2:
+            correction_denom = n - 2
+        else:
+            correction_denom = n
+        return r_squared - (1 - r_squared) / correction_denom
+    def ldScoreVarBlocks(self, block_left, c, annot=None):
+        """
+        Computes an unbiased estimate of L2(j) for j=1,..,M.
+        """
-def load_bfile(bfile_chr_prefix, keep_snps=None, keep_indivs=None, mafMin=None):
-    PlinkBIMFile = ID_List_Factory(
-        ["CHR", "SNP", "CM", "BP", "A1", "A2"], 1, ".bim", usecols=[0, 1, 2, 3, 4, 5]
-    )
-    PlinkFAMFile = ID_List_Factory(["IID"], 0, ".fam", usecols=[1])
+        def func(x):
+            return self._l2_unbiased(x, self.n)
-    snp_file = bfile_chr_prefix + ".bim"
-    array_snps = PlinkBIMFile(snp_file)
+        snp_getter = self.nextSNPs
+        return self._corSumVarBlocks(block_left, c, func, snp_getter, annot)
-    # Load fam
-    ind_file = bfile_chr_prefix + ".fam"
-    array_indivs = PlinkFAMFile(ind_file)
+    def _corSumVarBlocks(self, block_left, c, func, snp_getter, annot=None):
+        """
+        Calculate the sum of correlation coefficients.
+        """
+        m, n = self.m, self.n
+        block_sizes = np.array(np.arange(m) - block_left)
+        block_sizes = np.ceil(block_sizes / c) * c
+        if annot is None:
+            annot = np.ones((m, 1), dtype="float32")
+        else:
+            # annot = annot.astype("float32")  # Ensure annot is float32
+            annot_m = annot.shape[0]
+            if annot_m != self.m:
+                raise ValueError("Incorrect number of SNPs in annot")
-    n = len(array_indivs.IDList)
+        n_a = annot.shape[1]  # number of annotations
+        cor_sum = np.zeros((m, n_a), dtype="float32")
+        # b = index of first SNP for which SNP 0 is not included in LD Score
+        b = np.nonzero(block_left > 0)
+        if np.any(b):
+            b = b[0][0]
+        else:
+            b = m
+        b = int(np.ceil(b / c) * c)  # round up to a multiple of c
+        if b > m:
+            c = 1
+            b = m
-    # Load genotype array
-    array_file = bfile_chr_prefix + ".bed"
-    geno_array = PlinkBEDFile(
-        array_file, n, array_snps, keep_snps=keep_snps, keep_indivs=keep_indivs, mafMin=mafMin
-    )
+        l_A = 0  # l_A := index of leftmost SNP in matrix A
+        A = snp_getter(b)  # This now returns float32 data
+        rfuncAB = np.zeros((b, c), dtype="float32")
+        rfuncBB = np.zeros((c, c), dtype="float32")
+        # chunk inside of block
+        for l_B in np.arange(0, b, c):  # l_B := index of leftmost SNP in matrix B
+            B = A[:, l_B : l_B + c]
+            # ld matrix
+            np.dot(A.T, B / n, out=rfuncAB)
+            # ld matrix square
+            rfuncAB = func(rfuncAB)
+            cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
-    return array_snps, array_indivs, geno_array
+        # chunk to right of block
+        b0 = b
+        md = int(c * np.floor(m / c))
+        end = md + 1 if md != m else md
+        for l_B in tqdm(np.arange(b0, end, c), desc="Compute SNP Gene Weight"):
+            # check if the annot matrix is all zeros for this block + chunk
+            # this happens w/ sparse categories (i.e., pathways)
+            # update the block
+            old_b = b
+            b = int(block_sizes[l_B])
+            if l_B > b0 and b > 0:
+                # block_size can't increase more than c
+                # block_size can't be less than c unless it is zero
+                # both of these things make sense
+                A = np.hstack((A[:, old_b - b + c : old_b], B))
+                l_A += old_b - b + c
+            elif l_B == b0 and b > 0:
+                A = A[:, b0 - b : b0]
+                l_A = b0 - b
+            elif b == 0:  # no SNPs to left in window, e.g., after a sequence gap
+                A = np.array((), dtype="float32").reshape((n, 0))
+                l_A = l_B
+            if l_B == md:
+                c = m - md
+                rfuncAB = np.zeros((b, c), dtype="float32")
+                rfuncBB = np.zeros((c, c), dtype="float32")
+            if b != old_b:
+                rfuncAB = np.zeros((b, c), dtype="float32")
+            B = snp_getter(c)  # This now returns float32 data
+            p1 = np.all(annot[l_A : l_A + b, :] == 0)
+            p2 = np.all(annot[l_B : l_B + c, :] == 0)
+            if p1 and p2:
+                continue
+            np.dot(A.T, B / n, out=rfuncAB)
+            rfuncAB = func(rfuncAB)
+            cor_sum[l_A : l_A + b, :] += np.dot(rfuncAB, annot[l_B : l_B + c, :])
+            cor_sum[l_B : l_B + c, :] += np.dot(annot[l_A : l_A + b, :].T, rfuncAB).T
+            np.dot(B.T, B / n, out=rfuncBB)
+            rfuncBB = func(rfuncBB)
+            cor_sum[l_B : l_B + c, :] += np.dot(rfuncBB, annot[l_B : l_B + c, :])
+        return cor_sum

gsMap 1.73.3__py3-none-any.whl → 1.73.5__py3-none-any.whl

gsMap 1.73.3__py3-none-any.whl → 1.73.5__py3-none-any.whl