gsMap 1.73.3 → 1.73.4 (py3-none-any wheel)

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
--- a/gsMap/utils/regression_read.py
+++ b/gsMap/utils/regression_read.py
@@ -1,201 +1,175 @@
+import glob
+import logging
 import os
 
-import numpy as np
 import pandas as pd
 
+logger = logging.getLogger("gsMap.utils.regression_read")
 
-# Fun for reading gwas data
-def _read_sumstats(fh, alleles=False, dropna=False):
-    """
-    Parse gwas summary statistics.
-    """
-    print(f"Reading summary statistics from {fh} ...")
-    sumstats = ps_sumstats(fh, alleles=alleles, dropna=dropna)
-    print(f"Read summary statistics for {len(sumstats)} SNPs.")
-
-    m = len(sumstats)
-    sumstats = sumstats.drop_duplicates(subset="SNP")
-    if m > len(sumstats):
-        print(f"Dropped {m - len(sumstats)} SNPs with duplicated rs numbers.")
 
-    return sumstats
+def _read_sumstats(fh, alleles=False, dropna=False):
+    """Parse GWAS summary statistics."""
+    logger.info(f"Reading summary statistics from {fh} ...")
 
+    # Determine compression type
+    compression = None
+    if fh.endswith("gz"):
+        compression = "gzip"
+    elif fh.endswith("bz2"):
+        compression = "bz2"
 
-def ps_sumstats(fh, alleles=False, dropna=True):
-    """
-    Parses .sumstats files. See docs/file_formats_sumstats.txt.
-    """
+    # Define columns and dtypes
     dtype_dict = {"SNP": str, "Z": float, "N": float, "A1": str, "A2": str}
-    compression = get_compression(fh)
     usecols = ["SNP", "Z", "N"]
     if alleles:
         usecols += ["A1", "A2"]
 
+    # Read the file
     try:
-        x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression)
+        sumstats = pd.read_csv(
+            fh,
+            sep=r"\s+",
+            na_values=".",
+            usecols=usecols,
+            dtype=dtype_dict,
+            compression=compression,
+        )
     except (AttributeError, ValueError) as e:
+        logger.error(f"Failed to parse sumstats file: {str(e.args)}")
         raise ValueError("Improperly formatted sumstats file: " + str(e.args)) from e
 
+    # Drop NA values if specified
     if dropna:
-        x = x.dropna(how="any")
+        sumstats = sumstats.dropna(how="any")
 
-    return x
+    logger.info(f"Read summary statistics for {len(sumstats)} SNPs.")
 
+    # Drop duplicates
+    m = len(sumstats)
+    sumstats = sumstats.drop_duplicates(subset="SNP")
+    if m > len(sumstats):
+        logger.info(f"Dropped {m - len(sumstats)} SNPs with duplicated rs numbers.")
 
-def get_compression(fh):
-    """
-    Determin the format of compression used with read_csv?
-    """
-    if fh.endswith("gz"):
-        compression = "gzip"
-    elif fh.endswith("bz2"):
-        compression = "bz2"
-    else:
-        compression = None
-    # -
-    return compression
-
-
-def read_csv(fh, **kwargs):
-    """
-    Read the csv data
-    """
-    return pd.read_csv(fh, sep=r"\s+", na_values=".", **kwargs)
-
-
-# Fun for reading loading LD scores
-def which_compression(fh):
-    """
-    Given a file prefix, figure out what sort of compression to use.
-    """
-    if os.access(fh + ".bz2", 4):
-        suffix = ".bz2"
-        compression = "bz2"
-    elif os.access(fh + ".gz", 4):
-        suffix = ".gz"
-        compression = "gzip"
-    elif os.access(fh + ".parquet", 4):
-        suffix = ".parquet"
-        compression = "parquet"
-    elif os.access(fh + ".feather", 4):
-        suffix = ".feather"
-        compression = "feather"
-    elif os.access(fh, 4):
-        suffix = ""
-        compression = None
-    else:
-        raise OSError(f"Could not open {fh}[./gz/bz2/parquet/feather]")
-    # -
-    return suffix, compression
+    return sumstats
+
+
+def _read_chr_files(base_path, suffix, expected_count=22):
+    """Read chromosome files using glob pattern matching."""
+    # Create the pattern to search for files
+    file_pattern = f"{base_path}[1-9]*{suffix}*"
+
+    # Find all matching files
+    all_files = glob.glob(file_pattern)
+
+    # Extract chromosome numbers
+    chr_files = []
+    for file in all_files:
+        try:
+            # Extract the chromosome number from filename
+            file_name = os.path.basename(file)
+            base_name = os.path.basename(base_path)
+            chr_part = file_name.replace(base_name, "").split(suffix)[0]
+            chr_num = int(chr_part)
+            if 1 <= chr_num <= expected_count:
+                chr_files.append((chr_num, file))
+        except (ValueError, IndexError):
+            continue
+
+    # Check if we have the expected number of chromosome files
+    if len(chr_files) != expected_count:
+        logger.warning(
+            f"❗ SEVERE WARNING ❗ Expected {expected_count} chromosome files, but found {len(chr_files)}! "
+            f"⚠️ For human GWAS data, all 22 autosomes must be present. Please verify your input files."
+        )
+
+    # Sort by chromosome number and return file paths
+    chr_files.sort()
+    return [file for _, file in chr_files]
+
+
+def _read_file(file_path):
+    """Read a file based on its format/extension."""
+    try:
+        if file_path.endswith(".feather"):
+            return pd.read_feather(file_path)
+        elif file_path.endswith(".parquet"):
+            return pd.read_parquet(file_path)
+        elif file_path.endswith(".gz"):
+            return pd.read_csv(file_path, compression="gzip", sep="\t")
+        elif file_path.endswith(".bz2"):
+            return pd.read_csv(file_path, compression="bz2", sep="\t")
+        else:
+            return pd.read_csv(file_path, sep="\t")
+    except Exception as e:
+        logger.error(f"Failed to read file {file_path}: {str(e)}")
+        raise
 
 
 def _read_ref_ld_v2(ld_file):
+    """Read reference LD scores for all chromosomes."""
     suffix = ".l2.ldscore"
-    file = ld_file
-    first_fh = f"{file}1{suffix}"
-    s, compression = which_compression(first_fh)
-    print(f"Reading ld score annotations from {file}[1-22]{suffix}.{compression}")
-    ref_ld = pd.concat(
-        [pd.read_feather(f"{file}{chr}{suffix}{s}") for chr in range(1, 23)], axis=0
-    )
-    # set first column as index
-    ref_ld.rename(columns={"index": "SNP"}, inplace=True)
-    ref_ld.set_index("SNP", inplace=True)
-    return ref_ld
+    logger.info(f"Reading LD score annotations from {ld_file}[1-22]{suffix}...")
 
+    # Get the chromosome files
+    chr_files = _read_chr_files(ld_file, suffix)
 
-def _read_M_v2(ld_file, n_annot, not_M_5_50):
-    suffix = ".l2.M"
-    if not not_M_5_50:
-        suffix += "_5_50"
-    M_annot = np.array(
-        [
-            np.loadtxt(
-                f"{ld_file}{chr}{suffix}",
-            )
-            for chr in range(1, 23)
-        ]
-    )
-    assert M_annot.shape == (22, n_annot)
-    return M_annot.sum(axis=0).reshape((1, n_annot))
+    # Read and concatenate all files
+    df_list = [_read_file(file) for file in chr_files]
 
+    if not df_list:
+        logger.error(f"No LD score files found matching pattern: {ld_file}*{suffix}*")
+        raise FileNotFoundError(f"No LD score files found matching pattern: {ld_file}*{suffix}*")
 
-# Fun for reading M annotations
-def _read_M(ld_file, n_annot, not_M_5_50):
-    """
-    Read M (--M, --M-file, etc).
-    """
-    M_annot = M(ld_file, common=(not not_M_5_50))
+    ref_ld = pd.concat(df_list, axis=0)
+    logger.info(f"Loaded {len(ref_ld)} SNPs from LD score files")
 
-    try:
-        M_annot = np.array(M_annot).reshape((1, n_annot))
-    except ValueError as e:
-        raise ValueError(
-            "# terms in --M must match # of LD Scores in --ref-ld.\n" + str(e.args)
-        ) from e
-    return M_annot
-
-
-def M(fh, common=False):
-    """
-    Parses .l{N}.M files, split across num chromosomes.
-    """
-    suffix = ".l2.M"
-    if common:
-        suffix += "_5_50"
-    # -
-    M_array = []
-    for i in range(1, 23):
-        M_current = pd.read_csv(f"{fh}{i}" + suffix, header=None)
-        M_array.append(M_current)
-
-    M_array = pd.concat(M_array, axis=1).sum(axis=1)
-    # -
-    return np.array(M_array).reshape((1, len(M_array)))
-
-
-def _check_variance_v2(M_annot, ref_ld):
-    ii = ref_ld.var() == 0
-    if ii.all():
-        raise ValueError("All LD Scores have zero variance.")
-    elif not ii.any():
-        print("No partitioned LD Scores have zero variance.")
-    else:
-        ii_snp = ii_m = np.array(~ii)
-        print(f"Removing {sum(ii)} partitioned LD Scores with zero variance.")
-        ref_ld = ref_ld.iloc[:, ii_snp]
-        M_annot = M_annot[:, ii_m]
-    return M_annot, ref_ld
+    # Set SNP as index
+    if "index" in ref_ld.columns:
+        ref_ld.rename(columns={"index": "SNP"}, inplace=True)
+    if "SNP" in ref_ld.columns:
+        ref_ld.set_index("SNP", inplace=True)
+
+    return ref_ld
 
 
 def _read_w_ld(w_file):
+    """Read LD weights for all chromosomes."""
     suffix = ".l2.ldscore"
-    file = w_file
-    first_fh = f"{file}1{suffix}"
-    s, compression = which_compression(first_fh)
-    #
+    logger.info(f"Reading LD score annotations from {w_file}[1-22]{suffix}...")
+
+    # Get the chromosome files
+    chr_files = _read_chr_files(w_file, suffix)
+
+    if not chr_files:
+        logger.error(f"No LD score files found matching pattern: {w_file}*{suffix}*")
+        raise FileNotFoundError(f"No LD score files found matching pattern: {w_file}*{suffix}*")
+
+    # Read and process each file
     w_array = []
-    print(f"Reading ld score annotations from {file}[1-22]{suffix}.{compression}")
-
-    for chr in range(1, 23):
-        file_chr = f"{file}{chr}{suffix}{s}"
-        #
-        if compression == "parquet":
-            x = pd.read_parquet(file_chr)
-        elif compression == "feather":
-            x = pd.read_feather(file_chr)
-        else:
-            x = pd.read_csv(file_chr, compression=compression, sep="\t")
+    for file in chr_files:
+        x = _read_file(file)
 
-        x = x.sort_values(by=["CHR", "BP"])
+        # Sort if possible
+        if "CHR" in x.columns and "BP" in x.columns:
+            x = x.sort_values(by=["CHR", "BP"])
 
+        # Drop unnecessary columns
         columns_to_drop = ["MAF", "CM", "Gene", "TSS", "CHR", "BP"]
         columns_to_drop = [col for col in columns_to_drop if col in x.columns]
-        x = x.drop(columns_to_drop, axis=1)
+        if columns_to_drop:
+            x = x.drop(columns=columns_to_drop, axis=1)
 
         w_array.append(x)
-    #
+
+    # Concatenate and set column names
     w_ld = pd.concat(w_array, axis=0)
-    w_ld.columns = ["SNP", "LD_weights"]
+    logger.info(f"Loaded {len(w_ld)} SNPs from LD weight files")
+
+    # Set column names
+    w_ld.columns = (
+        ["SNP", "LD_weights"] + list(w_ld.columns[2:])
+        if len(w_ld.columns) > 2
+        else ["SNP", "LD_weights"]
+    )
 
     return w_ld
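Net effect of the refactor above: the `which_compression`/`read_csv` probing and the numpy-based `_read_M_v2`/`_read_M`/`M`/`_check_variance_v2` helpers are removed, and per-chromosome LD score files are now discovered via glob (`{prefix}[1-9]*{suffix}*`) and dispatched on file extension. A minimal usage sketch — the file prefixes here are hypothetical, and per-chromosome files are assumed to be named `<prefix><chr>.l2.ldscore[.gz|.bz2|.parquet|.feather]`:

    from gsMap.utils.regression_read import _read_sumstats, _read_ref_ld_v2, _read_w_ld

    # Whitespace-delimited sumstats with SNP/Z/N columns; gzip inferred from the name.
    sumstats = _read_sumstats("trait.sumstats.gz", alleles=False, dropna=True)

    # Globs baseline/baseline.[1-9]*.l2.ldscore*, sorts numerically by chromosome,
    # concatenates chromosomes 1-22, and indexes the result by SNP.
    ref_ld = _read_ref_ld_v2("baseline/baseline.")

    # Same discovery for regression weights; logs a warning if fewer than 22
    # autosome files match, and raises FileNotFoundError if none do.
    w_ld = _read_w_ld("weights/weights.")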
--- a/gsmap-1.73.3.dist-info/METADATA
+++ b/gsmap-1.73.4.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gsMap
-Version: 1.73.3
+Version: 1.73.4
 Summary: Genetics-informed pathogenic spatial mapping
 Author-email: liyang <songliyang@westlake.edu.cn>, wenhao <chenwenhao@westlake.edu.cn>
 Requires-Python: >=3.10
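Only the version field changes in METADATA; Requires-Python stays at >=3.10. To pick up the new reader code, pinning the release is enough (assuming it is published to PyPI under the same name):

    pip install gsMap==1.73.4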
@@ -1,12 +1,12 @@
1
- gsMap/__init__.py,sha256=0XtiYZAbXor3EAyHAebfh1qGJuKOgeB3h1MPE6ukNNY,77
1
+ gsMap/__init__.py,sha256=hRDqmAAKm9MYDtkkkVvoAJQDZAlG2yZ5nafywVU2Ufo,77
2
2
  gsMap/__main__.py,sha256=Vdhw8YA1K3wPMlbJQYL5WqvRzAKVeZ16mZQFO9VRmCo,62
3
3
  gsMap/cauchy_combination_test.py,sha256=SiUyqJKr4ATFtRgsCEJ43joGcSagCOnnurkB1FlQiB4,5105
4
- gsMap/config.py,sha256=LmBVMb0eda6bfrKkQuh7eZnZdvgecjCnozRd_clqvlY,51584
4
+ gsMap/config.py,sha256=xQQJKqe-ZLohxzEZ0L_CEXXbbUK-U6-H6BnISteqrHs,51316
5
5
  gsMap/create_slice_mean.py,sha256=Nnmb7ACtS-9TurW5xQ4TqCinejPsYcvuT5Oxqa5Uges,5723
6
- gsMap/diagnosis.py,sha256=YyT_TkPbb3c22DLpRYu9yynbNGrhytcCgxCoPwz9Bpc,12962
6
+ gsMap/diagnosis.py,sha256=Z-zJriPge0_kUbU-S41w7cPT2xYFlDVzbp6p6QMoKQc,13025
7
7
  gsMap/find_latent_representation.py,sha256=aZ5fFY2RhAsNaDeoehd5lN28556d6GGHK9xEUTvo6G4,5365
8
8
  gsMap/format_sumstats.py,sha256=1c9OgbqDQWOgXeSrbAhbJfChv_2IwXIgLE6Pbw2sx0s,13778
9
- gsMap/generate_ldscore.py,sha256=G108fVVdGj0Pn50TqFmAXLjQ7OTY9BWnilHoDeIn2D8,45348
9
+ gsMap/generate_ldscore.py,sha256=9Qlx8na0w82U8UsSvdPCsDbNAxNFPHKYuUjY4M04fOg,35363
10
10
  gsMap/latent_to_gene.py,sha256=sDPvOU4iF-HkfQY0nnkIVXpjyTQ9-PjQflwEFWrPg-A,12869
11
11
  gsMap/main.py,sha256=SzfAXhrlr4LXnSD4gkvAtUUPYXyra6a_MzVCxDBZjr0,1170
12
12
  gsMap/report.py,sha256=_1FYkzGhVGMnvHgEQ8z51iMrVEVlh48a31jLqbV2o9w,6953
@@ -20,12 +20,12 @@ gsMap/GNN/model.py,sha256=75In9sxBkaqqpCQSrQEUO-zsQQVQnkXVbKsAgyAZjiQ,2918
20
20
  gsMap/GNN/train.py,sha256=4qipaxaz3rQOtlRpTYCfl1Oz4kz_A6vNB1aw8_gGK_k,3076
21
21
  gsMap/templates/report_template.html,sha256=QODZEbVxpW1xsLz7lDrD_DyUfzYoi9E17o2tLJlf8OQ,8016
22
22
  gsMap/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
- gsMap/utils/generate_r2_matrix.py,sha256=0zyoJDWUVavlQtR6_XXb7Ah9UhPyT3n0t6XCqlI1HXQ,17354
23
+ gsMap/utils/generate_r2_matrix.py,sha256=0FEbSEiZhNj3nnnt9V-fp7WWPLpfBci3tP4ydBbG280,20114
24
24
  gsMap/utils/jackknife.py,sha256=w_qMj9GlqViouHuOw1U80N6doWuCTXuPoAVU4P-5mm8,17673
25
25
  gsMap/utils/manhattan_plot.py,sha256=4ok5CHAaT_MadyMPnFZMR_llmE8Vf4-KiEfametgHq0,25480
26
- gsMap/utils/regression_read.py,sha256=rKA0nkUpTJf6WuGddhKrsBCExchDNEyojOWu_qddZNw,5474
27
- gsmap-1.73.3.dist-info/entry_points.txt,sha256=s_P2Za22O077tc1FPLKMinbdRVXaN_HTcDBgWMYpqA4,41
28
- gsmap-1.73.3.dist-info/licenses/LICENSE,sha256=fb5WP6qQytSKO5rM0ZSqQXg_92Fdt0aAeFNwSi3Lpmc,1069
29
- gsmap-1.73.3.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
30
- gsmap-1.73.3.dist-info/METADATA,sha256=-MD9qe4n_qOVF1dAQ6gcSLtCl1DZDMeoRw2EVijGDms,8196
31
- gsmap-1.73.3.dist-info/RECORD,,
26
+ gsMap/utils/regression_read.py,sha256=uBSKlvYVhUKmDSCBvKHQrE1wLNyvK-rbzc5TJV51oDI,5649
27
+ gsmap-1.73.4.dist-info/entry_points.txt,sha256=s_P2Za22O077tc1FPLKMinbdRVXaN_HTcDBgWMYpqA4,41
28
+ gsmap-1.73.4.dist-info/licenses/LICENSE,sha256=fb5WP6qQytSKO5rM0ZSqQXg_92Fdt0aAeFNwSi3Lpmc,1069
29
+ gsmap-1.73.4.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
30
+ gsmap-1.73.4.dist-info/METADATA,sha256=fyLpDSS5SEIyPj9rZ7ymcXPIOCLcAU2j-OW0D5xC2GA,8196
31
+ gsmap-1.73.4.dist-info/RECORD,,
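The RECORD hashes above follow the wheel convention (PEP 376/427): `sha256=` followed by the urlsafe-base64-encoded SHA-256 digest with trailing `=` padding stripped. A minimal sketch to recompute an entry locally, e.g. for the updated regression_read.py:

    import base64
    import hashlib

    def record_hash(path):
        # Hash the file and encode the digest the way wheel RECORD files do.
        digest = hashlib.sha256(open(path, "rb").read()).digest()
        return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

    # Expect sha256=uBSKlvYVhUKmDSCBvKHQrE1wLNyvK-rbzc5TJV51oDI for the 1.73.4 file.
    print(record_hash("gsMap/utils/regression_read.py"))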