gsMap 1.73.3 → 1.73.4 (py3-none-any wheel)

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
--- a/gsMap/utils/regression_read.py
+++ b/gsMap/utils/regression_read.py
@@ -1,201 +1,175 @@
+import glob
+import logging
 import os
 
-import numpy as np
 import pandas as pd
 
+logger = logging.getLogger("gsMap.utils.regression_read")
 
-# Fun for reading gwas data
-def _read_sumstats(fh, alleles=False, dropna=False):
-    """
-    Parse gwas summary statistics.
-    """
-    print(f"Reading summary statistics from {fh} ...")
-    sumstats = ps_sumstats(fh, alleles=alleles, dropna=dropna)
-    print(f"Read summary statistics for {len(sumstats)} SNPs.")
-
-    m = len(sumstats)
-    sumstats = sumstats.drop_duplicates(subset="SNP")
-    if m > len(sumstats):
-        print(f"Dropped {m - len(sumstats)} SNPs with duplicated rs numbers.")
 
-    return sumstats
+def _read_sumstats(fh, alleles=False, dropna=False):
+    """Parse GWAS summary statistics."""
+    logger.info(f"Reading summary statistics from {fh} ...")
 
+    # Determine compression type
+    compression = None
+    if fh.endswith("gz"):
+        compression = "gzip"
+    elif fh.endswith("bz2"):
+        compression = "bz2"
 
-def ps_sumstats(fh, alleles=False, dropna=True):
-    """
-    Parses .sumstats files. See docs/file_formats_sumstats.txt.
-    """
+    # Define columns and dtypes
     dtype_dict = {"SNP": str, "Z": float, "N": float, "A1": str, "A2": str}
-    compression = get_compression(fh)
     usecols = ["SNP", "Z", "N"]
     if alleles:
         usecols += ["A1", "A2"]
 
+    # Read the file
     try:
-        x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression)
+        sumstats = pd.read_csv(
+            fh,
+            sep=r"\s+",
+            na_values=".",
+            usecols=usecols,
+            dtype=dtype_dict,
+            compression=compression,
+        )
     except (AttributeError, ValueError) as e:
+        logger.error(f"Failed to parse sumstats file: {str(e.args)}")
         raise ValueError("Improperly formatted sumstats file: " + str(e.args)) from e
 
+    # Drop NA values if specified
     if dropna:
-        x = x.dropna(how="any")
+        sumstats = sumstats.dropna(how="any")
 
-    return x
+    logger.info(f"Read summary statistics for {len(sumstats)} SNPs.")
 
+    # Drop duplicates
+    m = len(sumstats)
+    sumstats = sumstats.drop_duplicates(subset="SNP")
+    if m > len(sumstats):
+        logger.info(f"Dropped {m - len(sumstats)} SNPs with duplicated rs numbers.")
 
-def get_compression(fh):
-    """
-    Determin the format of compression used with read_csv?
-    """
-    if fh.endswith("gz"):
-        compression = "gzip"
-    elif fh.endswith("bz2"):
-        compression = "bz2"
-    else:
-        compression = None
-    # -
-    return compression
-
-
-def read_csv(fh, **kwargs):
-    """
-    Read the csv data
-    """
-    return pd.read_csv(fh, sep=r"\s+", na_values=".", **kwargs)
-
-
-# Fun for reading loading LD scores
-def which_compression(fh):
-    """
-    Given a file prefix, figure out what sort of compression to use.
-    """
-    if os.access(fh + ".bz2", 4):
-        suffix = ".bz2"
-        compression = "bz2"
-    elif os.access(fh + ".gz", 4):
-        suffix = ".gz"
-        compression = "gzip"
-    elif os.access(fh + ".parquet", 4):
-        suffix = ".parquet"
-        compression = "parquet"
-    elif os.access(fh + ".feather", 4):
-        suffix = ".feather"
-        compression = "feather"
-    elif os.access(fh, 4):
-        suffix = ""
-        compression = None
-    else:
-        raise OSError(f"Could not open {fh}[./gz/bz2/parquet/feather]")
-    # -
-    return suffix, compression
+    return sumstats
+
+
+def _read_chr_files(base_path, suffix, expected_count=22):
+    """Read chromosome files using glob pattern matching."""
+    # Create the pattern to search for files
+    file_pattern = f"{base_path}[1-9]*{suffix}*"
+
+    # Find all matching files
+    all_files = glob.glob(file_pattern)
+
+    # Extract chromosome numbers
+    chr_files = []
+    for file in all_files:
+        try:
+            # Extract the chromosome number from filename
+            file_name = os.path.basename(file)
+            base_name = os.path.basename(base_path)
+            chr_part = file_name.replace(base_name, "").split(suffix)[0]
+            chr_num = int(chr_part)
+            if 1 <= chr_num <= expected_count:
+                chr_files.append((chr_num, file))
+        except (ValueError, IndexError):
+            continue
+
+    # Check if we have the expected number of chromosome files
+    if len(chr_files) != expected_count:
+        logger.warning(
+            f"❗ SEVERE WARNING ❗ Expected {expected_count} chromosome files, but found {len(chr_files)}! "
+            f"⚠️ For human GWAS data, all 22 autosomes must be present. Please verify your input files."
+        )
+
+    # Sort by chromosome number and return file paths
+    chr_files.sort()
+    return [file for _, file in chr_files]
+
+
+def _read_file(file_path):
+    """Read a file based on its format/extension."""
+    try:
+        if file_path.endswith(".feather"):
+            return pd.read_feather(file_path)
+        elif file_path.endswith(".parquet"):
+            return pd.read_parquet(file_path)
+        elif file_path.endswith(".gz"):
+            return pd.read_csv(file_path, compression="gzip", sep="\t")
+        elif file_path.endswith(".bz2"):
+            return pd.read_csv(file_path, compression="bz2", sep="\t")
+        else:
+            return pd.read_csv(file_path, sep="\t")
+    except Exception as e:
+        logger.error(f"Failed to read file {file_path}: {str(e)}")
+        raise
 
 
 def _read_ref_ld_v2(ld_file):
+    """Read reference LD scores for all chromosomes."""
     suffix = ".l2.ldscore"
-    file = ld_file
-    first_fh = f"{file}1{suffix}"
-    s, compression = which_compression(first_fh)
-    print(f"Reading ld score annotations from {file}[1-22]{suffix}.{compression}")
-    ref_ld = pd.concat(
-        [pd.read_feather(f"{file}{chr}{suffix}{s}") for chr in range(1, 23)], axis=0
-    )
-    # set first column as index
-    ref_ld.rename(columns={"index": "SNP"}, inplace=True)
-    ref_ld.set_index("SNP", inplace=True)
-    return ref_ld
+    logger.info(f"Reading LD score annotations from {ld_file}[1-22]{suffix}...")
 
+    # Get the chromosome files
+    chr_files = _read_chr_files(ld_file, suffix)
 
-def _read_M_v2(ld_file, n_annot, not_M_5_50):
-    suffix = ".l2.M"
-    if not not_M_5_50:
-        suffix += "_5_50"
-    M_annot = np.array(
-        [
-            np.loadtxt(
-                f"{ld_file}{chr}{suffix}",
-            )
-            for chr in range(1, 23)
-        ]
-    )
-    assert M_annot.shape == (22, n_annot)
-    return M_annot.sum(axis=0).reshape((1, n_annot))
+    # Read and concatenate all files
+    df_list = [_read_file(file) for file in chr_files]
 
+    if not df_list:
+        logger.error(f"No LD score files found matching pattern: {ld_file}*{suffix}*")
+        raise FileNotFoundError(f"No LD score files found matching pattern: {ld_file}*{suffix}*")
 
-# Fun for reading M annotations
-def _read_M(ld_file, n_annot, not_M_5_50):
-    """
-    Read M (--M, --M-file, etc).
-    """
-    M_annot = M(ld_file, common=(not not_M_5_50))
+    ref_ld = pd.concat(df_list, axis=0)
+    logger.info(f"Loaded {len(ref_ld)} SNPs from LD score files")
 
-    try:
-        M_annot = np.array(M_annot).reshape((1, n_annot))
-    except ValueError as e:
-        raise ValueError(
-            "# terms in --M must match # of LD Scores in --ref-ld.\n" + str(e.args)
-        ) from e
-    return M_annot
-
-
-def M(fh, common=False):
-    """
-    Parses .l{N}.M files, split across num chromosomes.
-    """
-    suffix = ".l2.M"
-    if common:
-        suffix += "_5_50"
-    # -
-    M_array = []
-    for i in range(1, 23):
-        M_current = pd.read_csv(f"{fh}{i}" + suffix, header=None)
-        M_array.append(M_current)
-
-    M_array = pd.concat(M_array, axis=1).sum(axis=1)
-    # -
-    return np.array(M_array).reshape((1, len(M_array)))
-
-
-def _check_variance_v2(M_annot, ref_ld):
-    ii = ref_ld.var() == 0
-    if ii.all():
-        raise ValueError("All LD Scores have zero variance.")
-    elif not ii.any():
-        print("No partitioned LD Scores have zero variance.")
-    else:
-        ii_snp = ii_m = np.array(~ii)
-        print(f"Removing {sum(ii)} partitioned LD Scores with zero variance.")
-        ref_ld = ref_ld.iloc[:, ii_snp]
-        M_annot = M_annot[:, ii_m]
-    return M_annot, ref_ld
+    # Set SNP as index
+    if "index" in ref_ld.columns:
+        ref_ld.rename(columns={"index": "SNP"}, inplace=True)
+    if "SNP" in ref_ld.columns:
+        ref_ld.set_index("SNP", inplace=True)
+
+    return ref_ld
 
 
 def _read_w_ld(w_file):
+    """Read LD weights for all chromosomes."""
     suffix = ".l2.ldscore"
-    file = w_file
-    first_fh = f"{file}1{suffix}"
-    s, compression = which_compression(first_fh)
-    #
+    logger.info(f"Reading LD score annotations from {w_file}[1-22]{suffix}...")
+
+    # Get the chromosome files
+    chr_files = _read_chr_files(w_file, suffix)
+
+    if not chr_files:
+        logger.error(f"No LD score files found matching pattern: {w_file}*{suffix}*")
+        raise FileNotFoundError(f"No LD score files found matching pattern: {w_file}*{suffix}*")
+
+    # Read and process each file
     w_array = []
-    print(f"Reading ld score annotations from {file}[1-22]{suffix}.{compression}")
-
-    for chr in range(1, 23):
-        file_chr = f"{file}{chr}{suffix}{s}"
-        #
-        if compression == "parquet":
-            x = pd.read_parquet(file_chr)
-        elif compression == "feather":
-            x = pd.read_feather(file_chr)
-        else:
-            x = pd.read_csv(file_chr, compression=compression, sep="\t")
+    for file in chr_files:
+        x = _read_file(file)
 
-        x = x.sort_values(by=["CHR", "BP"])
+        # Sort if possible
+        if "CHR" in x.columns and "BP" in x.columns:
+            x = x.sort_values(by=["CHR", "BP"])
 
+        # Drop unnecessary columns
         columns_to_drop = ["MAF", "CM", "Gene", "TSS", "CHR", "BP"]
         columns_to_drop = [col for col in columns_to_drop if col in x.columns]
-        x = x.drop(columns_to_drop, axis=1)
+        if columns_to_drop:
+            x = x.drop(columns=columns_to_drop, axis=1)
 
         w_array.append(x)
-    #
+
+    # Concatenate and set column names
     w_ld = pd.concat(w_array, axis=0)
-    w_ld.columns = ["SNP", "LD_weights"]
+    logger.info(f"Loaded {len(w_ld)} SNPs from LD weight files")
+
+    # Set column names
+    w_ld.columns = (
+        ["SNP", "LD_weights"] + list(w_ld.columns[2:])
+        if len(w_ld.columns) > 2
+        else ["SNP", "LD_weights"]
+    )
 
     return w_ld
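Net effect of the refactor above: the `which_compression`/`read_csv` probing and the numpy-based `_read_M_v2`/`_read_M`/`M`/`_check_variance_v2` helpers are removed, and per-chromosome LD score files are now discovered via glob (`{prefix}[1-9]*{suffix}*`) and dispatched on file extension. A minimal usage sketch — the file prefixes here are hypothetical, and per-chromosome files are assumed to be named `<prefix><chr>.l2.ldscore[.gz|.bz2|.parquet|.feather]`:

    from gsMap.utils.regression_read import _read_sumstats, _read_ref_ld_v2, _read_w_ld

    # Whitespace-delimited sumstats with SNP/Z/N columns; gzip inferred from the name.
    sumstats = _read_sumstats("trait.sumstats.gz", alleles=False, dropna=True)

    # Globs baseline/baseline.[1-9]*.l2.ldscore*, sorts numerically by chromosome,
    # concatenates chromosomes 1-22, and indexes the result by SNP.
    ref_ld = _read_ref_ld_v2("baseline/baseline.")

    # Same discovery for regression weights; logs a warning if fewer than 22
    # autosome files match, and raises FileNotFoundError if none do.
    w_ld = _read_w_ld("weights/weights.")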
--- a/gsmap-1.73.3.dist-info/METADATA
+++ b/gsmap-1.73.4.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gsMap
-Version: 1.73.3
+Version: 1.73.4
 Summary: Genetics-informed pathogenic spatial mapping
 Author-email: liyang <songliyang@westlake.edu.cn>, wenhao <chenwenhao@westlake.edu.cn>
 Requires-Python: >=3.10
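Only the version field changes in METADATA; Requires-Python stays at >=3.10. To pick up the new reader code, pinning the release is enough (assuming it is published to PyPI under the same name):

    pip install gsMap==1.73.4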
@@ -1,12 +1,12 @@
1
- gsMap/__init__.py,sha256=0XtiYZAbXor3EAyHAebfh1qGJuKOgeB3h1MPE6ukNNY,77
1
+ gsMap/__init__.py,sha256=hRDqmAAKm9MYDtkkkVvoAJQDZAlG2yZ5nafywVU2Ufo,77
2
2
  gsMap/__main__.py,sha256=Vdhw8YA1K3wPMlbJQYL5WqvRzAKVeZ16mZQFO9VRmCo,62
3
3
  gsMap/cauchy_combination_test.py,sha256=SiUyqJKr4ATFtRgsCEJ43joGcSagCOnnurkB1FlQiB4,5105
4
- gsMap/config.py,sha256=LmBVMb0eda6bfrKkQuh7eZnZdvgecjCnozRd_clqvlY,51584
4
+ gsMap/config.py,sha256=xQQJKqe-ZLohxzEZ0L_CEXXbbUK-U6-H6BnISteqrHs,51316
5
5
  gsMap/create_slice_mean.py,sha256=Nnmb7ACtS-9TurW5xQ4TqCinejPsYcvuT5Oxqa5Uges,5723
6
- gsMap/diagnosis.py,sha256=YyT_TkPbb3c22DLpRYu9yynbNGrhytcCgxCoPwz9Bpc,12962
6
+ gsMap/diagnosis.py,sha256=Z-zJriPge0_kUbU-S41w7cPT2xYFlDVzbp6p6QMoKQc,13025
7
7
  gsMap/find_latent_representation.py,sha256=aZ5fFY2RhAsNaDeoehd5lN28556d6GGHK9xEUTvo6G4,5365
8
8
  gsMap/format_sumstats.py,sha256=1c9OgbqDQWOgXeSrbAhbJfChv_2IwXIgLE6Pbw2sx0s,13778
9
- gsMap/generate_ldscore.py,sha256=G108fVVdGj0Pn50TqFmAXLjQ7OTY9BWnilHoDeIn2D8,45348
9
+ gsMap/generate_ldscore.py,sha256=9Qlx8na0w82U8UsSvdPCsDbNAxNFPHKYuUjY4M04fOg,35363
10
10
  gsMap/latent_to_gene.py,sha256=sDPvOU4iF-HkfQY0nnkIVXpjyTQ9-PjQflwEFWrPg-A,12869
11
11
  gsMap/main.py,sha256=SzfAXhrlr4LXnSD4gkvAtUUPYXyra6a_MzVCxDBZjr0,1170
12
12
  gsMap/report.py,sha256=_1FYkzGhVGMnvHgEQ8z51iMrVEVlh48a31jLqbV2o9w,6953
@@ -20,12 +20,12 @@ gsMap/GNN/model.py,sha256=75In9sxBkaqqpCQSrQEUO-zsQQVQnkXVbKsAgyAZjiQ,2918
20
20
  gsMap/GNN/train.py,sha256=4qipaxaz3rQOtlRpTYCfl1Oz4kz_A6vNB1aw8_gGK_k,3076
21
21
  gsMap/templates/report_template.html,sha256=QODZEbVxpW1xsLz7lDrD_DyUfzYoi9E17o2tLJlf8OQ,8016
22
22
  gsMap/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
- gsMap/utils/generate_r2_matrix.py,sha256=0zyoJDWUVavlQtR6_XXb7Ah9UhPyT3n0t6XCqlI1HXQ,17354
23
+ gsMap/utils/generate_r2_matrix.py,sha256=0FEbSEiZhNj3nnnt9V-fp7WWPLpfBci3tP4ydBbG280,20114
24
24
  gsMap/utils/jackknife.py,sha256=w_qMj9GlqViouHuOw1U80N6doWuCTXuPoAVU4P-5mm8,17673
25
25
  gsMap/utils/manhattan_plot.py,sha256=4ok5CHAaT_MadyMPnFZMR_llmE8Vf4-KiEfametgHq0,25480
26
- gsMap/utils/regression_read.py,sha256=rKA0nkUpTJf6WuGddhKrsBCExchDNEyojOWu_qddZNw,5474
27
- gsmap-1.73.3.dist-info/entry_points.txt,sha256=s_P2Za22O077tc1FPLKMinbdRVXaN_HTcDBgWMYpqA4,41
28
- gsmap-1.73.3.dist-info/licenses/LICENSE,sha256=fb5WP6qQytSKO5rM0ZSqQXg_92Fdt0aAeFNwSi3Lpmc,1069
29
- gsmap-1.73.3.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
30
- gsmap-1.73.3.dist-info/METADATA,sha256=-MD9qe4n_qOVF1dAQ6gcSLtCl1DZDMeoRw2EVijGDms,8196
31
- gsmap-1.73.3.dist-info/RECORD,,
26
+ gsMap/utils/regression_read.py,sha256=uBSKlvYVhUKmDSCBvKHQrE1wLNyvK-rbzc5TJV51oDI,5649
27
+ gsmap-1.73.4.dist-info/entry_points.txt,sha256=s_P2Za22O077tc1FPLKMinbdRVXaN_HTcDBgWMYpqA4,41
28
+ gsmap-1.73.4.dist-info/licenses/LICENSE,sha256=fb5WP6qQytSKO5rM0ZSqQXg_92Fdt0aAeFNwSi3Lpmc,1069
29
+ gsmap-1.73.4.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
30
+ gsmap-1.73.4.dist-info/METADATA,sha256=fyLpDSS5SEIyPj9rZ7ymcXPIOCLcAU2j-OW0D5xC2GA,8196
31
+ gsmap-1.73.4.dist-info/RECORD,,
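The RECORD hashes above follow the wheel convention (PEP 376/427): `sha256=` followed by the urlsafe-base64-encoded SHA-256 digest with trailing `=` padding stripped. A minimal sketch to recompute an entry locally, e.g. for the updated regression_read.py:

    import base64
    import hashlib

    def record_hash(path):
        # Hash the file and encode the digest the way wheel RECORD files do.
        digest = hashlib.sha256(open(path, "rb").read()).digest()
        return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

    # Expect sha256=uBSKlvYVhUKmDSCBvKHQrE1wLNyvK-rbzc5TJV51oDI for the 1.73.4 file.
    print(record_hash("gsMap/utils/regression_read.py"))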