gsMap 1.73.2-py3-none-any.whl → 1.73.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/__init__.py +1 -1
- gsMap/config.py +2 -9
- gsMap/diagnosis.py +4 -3
- gsMap/generate_ldscore.py +115 -453
- gsMap/utils/generate_r2_matrix.py +455 -352
- gsMap/utils/regression_read.py +131 -157
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/METADATA +1 -1
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/RECORD +11 -11
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/WHEEL +0 -0
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/entry_points.txt +0 -0
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/licenses/LICENSE +0 -0
gsMap/utils/regression_read.py
CHANGED
@@ -1,201 +1,175 @@
+import glob
+import logging
 import os
 
-import numpy as np
 import pandas as pd
 
+logger = logging.getLogger("gsMap.utils.regression_read")
 
-# Fun for reading gwas data
-def _read_sumstats(fh, alleles=False, dropna=False):
-    """
-    Parse gwas summary statistics.
-    """
-    print(f"Reading summary statistics from {fh} ...")
-    sumstats = ps_sumstats(fh, alleles=alleles, dropna=dropna)
-    print(f"Read summary statistics for {len(sumstats)} SNPs.")
-
-    m = len(sumstats)
-    sumstats = sumstats.drop_duplicates(subset="SNP")
-    if m > len(sumstats):
-        print(f"Dropped {m - len(sumstats)} SNPs with duplicated rs numbers.")
 
-
+def _read_sumstats(fh, alleles=False, dropna=False):
+    """Parse GWAS summary statistics."""
+    logger.info(f"Reading summary statistics from {fh} ...")
 
+    # Determine compression type
+    compression = None
+    if fh.endswith("gz"):
+        compression = "gzip"
+    elif fh.endswith("bz2"):
+        compression = "bz2"
 
-
-    """
-    Parses .sumstats files. See docs/file_formats_sumstats.txt.
-    """
+    # Define columns and dtypes
     dtype_dict = {"SNP": str, "Z": float, "N": float, "A1": str, "A2": str}
-    compression = get_compression(fh)
     usecols = ["SNP", "Z", "N"]
     if alleles:
         usecols += ["A1", "A2"]
 
+    # Read the file
     try:
-        [removed line not rendered in the source diff]
+        sumstats = pd.read_csv(
+            fh,
+            sep=r"\s+",
+            na_values=".",
+            usecols=usecols,
+            dtype=dtype_dict,
+            compression=compression,
+        )
     except (AttributeError, ValueError) as e:
+        logger.error(f"Failed to parse sumstats file: {str(e.args)}")
         raise ValueError("Improperly formatted sumstats file: " + str(e.args)) from e
 
+    # Drop NA values if specified
     if dropna:
-        [removed line not rendered in the source diff]
+        sumstats = sumstats.dropna(how="any")
 
-    [removed line not rendered in the source diff]
+    logger.info(f"Read summary statistics for {len(sumstats)} SNPs.")
 
+    # Drop duplicates
+    m = len(sumstats)
+    sumstats = sumstats.drop_duplicates(subset="SNP")
+    if m > len(sumstats):
+        logger.info(f"Dropped {m - len(sumstats)} SNPs with duplicated rs numbers.")
 
-[old lines 45-89: removed code not rendered in the source diff]
+    return sumstats
+
+
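The rewrite folds the old `ps_sumstats` helper into `_read_sumstats` itself: compression is now inferred from the file extension rather than via `get_compression`, and logging replaces `print`. A minimal call sketch, assuming a hypothetical file path (the function and its keyword arguments come from the diff above; the path and printed output are illustrative only):

    # Hypothetical usage; "trait.sumstats.gz" is an illustrative path.
    from gsMap.utils.regression_read import _read_sumstats

    # Expects a whitespace-delimited table with SNP, Z, N columns
    # (plus A1/A2 when alleles=True); ".gz"/".bz2" suffixes select compression.
    sumstats = _read_sumstats("trait.sumstats.gz", alleles=True, dropna=True)
    print(sumstats.columns.tolist())  # e.g. ['SNP', 'Z', 'N', 'A1', 'A2'], order follows the input file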
+def _read_chr_files(base_path, suffix, expected_count=22):
+    """Read chromosome files using glob pattern matching."""
+    # Create the pattern to search for files
+    file_pattern = f"{base_path}[1-9]*{suffix}*"
+
+    # Find all matching files
+    all_files = glob.glob(file_pattern)
+
+    # Extract chromosome numbers
+    chr_files = []
+    for file in all_files:
+        try:
+            # Extract the chromosome number from filename
+            file_name = os.path.basename(file)
+            base_name = os.path.basename(base_path)
+            chr_part = file_name.replace(base_name, "").split(suffix)[0]
+            chr_num = int(chr_part)
+            if 1 <= chr_num <= expected_count:
+                chr_files.append((chr_num, file))
+        except (ValueError, IndexError):
+            continue
+
+    # Check if we have the expected number of chromosome files
+    if len(chr_files) != expected_count:
+        logger.warning(
+            f"❗ SEVERE WARNING ❗ Expected {expected_count} chromosome files, but found {len(chr_files)}! "
+            f"⚠️ For human GWAS data, all 22 autosomes must be present. Please verify your input files."
+        )
+
+    # Sort by chromosome number and return file paths
+    chr_files.sort()
+    return [file for _, file in chr_files]
+
+
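To make the glob pattern concrete: with a `base_path` that ends just before the chromosome number, `{base_path}[1-9]*{suffix}*` matches per-chromosome files in any supported format, and the loop recovers the chromosome number by stripping the prefix and suffix. A small sketch of that parsing step on hypothetical names (no files are touched; paths are illustrative):

    # Illustrative re-run of the name parsing used by _read_chr_files.
    import os

    base_path = "/data/baseline."   # hypothetical prefix
    suffix = ".l2.ldscore"
    for path in [
        "/data/baseline.1.l2.ldscore.feather",
        "/data/baseline.22.l2.ldscore.gz",
    ]:
        name = os.path.basename(path)
        chr_part = name.replace(os.path.basename(base_path), "").split(suffix)[0]
        print(name, "->", int(chr_part))   # -> 1, then 22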
+def _read_file(file_path):
+    """Read a file based on its format/extension."""
+    try:
+        if file_path.endswith(".feather"):
+            return pd.read_feather(file_path)
+        elif file_path.endswith(".parquet"):
+            return pd.read_parquet(file_path)
+        elif file_path.endswith(".gz"):
+            return pd.read_csv(file_path, compression="gzip", sep="\t")
+        elif file_path.endswith(".bz2"):
+            return pd.read_csv(file_path, compression="bz2", sep="\t")
+        else:
+            return pd.read_csv(file_path, sep="\t")
+    except Exception as e:
+        logger.error(f"Failed to read file {file_path}: {str(e)}")
+        raise
 
 
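`_read_file` centralizes format dispatch for both LD-score readers below; the extension alone decides between feather, parquet, and (optionally compressed) tab-separated text. A sketch with illustrative paths:

    # Illustrative dispatch; each path routes to a different pandas reader.
    from gsMap.utils.regression_read import _read_file

    df1 = _read_file("baseline.1.l2.ldscore.feather")  # pd.read_feather
    df2 = _read_file("baseline.1.l2.ldscore.parquet")  # pd.read_parquet
    df3 = _read_file("weights.1.l2.ldscore.gz")        # pd.read_csv(..., compression="gzip", sep="\t")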
 def _read_ref_ld_v2(ld_file):
+    """Read reference LD scores for all chromosomes."""
     suffix = ".l2.ldscore"
-
-    first_fh = f"{file}1{suffix}"
-    s, compression = which_compression(first_fh)
-    print(f"Reading ld score annotations from {file}[1-22]{suffix}.{compression}")
-    ref_ld = pd.concat(
-        [pd.read_feather(f"{file}{chr}{suffix}{s}") for chr in range(1, 23)], axis=0
-    )
-    # set first column as index
-    ref_ld.rename(columns={"index": "SNP"}, inplace=True)
-    ref_ld.set_index("SNP", inplace=True)
-    return ref_ld
+    logger.info(f"Reading LD score annotations from {ld_file}[1-22]{suffix}...")
 
+    # Get the chromosome files
+    chr_files = _read_chr_files(ld_file, suffix)
 
-[old lines 107-108: removed code not rendered in the source diff]
-    if not not_M_5_50:
-        suffix += "_5_50"
-    M_annot = np.array(
-        [
-            np.loadtxt(
-                f"{ld_file}{chr}{suffix}",
-            )
-            for chr in range(1, 23)
-        ]
-    )
-    assert M_annot.shape == (22, n_annot)
-    return M_annot.sum(axis=0).reshape((1, n_annot))
+    # Read and concatenate all files
+    df_list = [_read_file(file) for file in chr_files]
 
+    if not df_list:
+        logger.error(f"No LD score files found matching pattern: {ld_file}*{suffix}*")
+        raise FileNotFoundError(f"No LD score files found matching pattern: {ld_file}*{suffix}*")
 
-[old lines 123-124: removed code not rendered in the source diff]
-    """
-    Read M (--M, --M-file, etc).
-    """
-    M_annot = M(ld_file, common=(not not_M_5_50))
+    ref_ld = pd.concat(df_list, axis=0)
+    logger.info(f"Loaded {len(ref_ld)} SNPs from LD score files")
 
-[old lines 130-135: removed code not rendered in the source diff]
-    return
-
-
-def M(fh, common=False):
-    """
-    Parses .l{N}.M files, split across num chromosomes.
-    """
-    suffix = ".l2.M"
-    if common:
-        suffix += "_5_50"
-    # -
-    M_array = []
-    for i in range(1, 23):
-        M_current = pd.read_csv(f"{fh}{i}" + suffix, header=None)
-        M_array.append(M_current)
-
-    M_array = pd.concat(M_array, axis=1).sum(axis=1)
-    # -
-    return np.array(M_array).reshape((1, len(M_array)))
-
-
-def _check_variance_v2(M_annot, ref_ld):
-    ii = ref_ld.var() == 0
-    if ii.all():
-        raise ValueError("All LD Scores have zero variance.")
-    elif not ii.any():
-        print("No partitioned LD Scores have zero variance.")
-    else:
-        ii_snp = ii_m = np.array(~ii)
-        print(f"Removing {sum(ii)} partitioned LD Scores with zero variance.")
-        ref_ld = ref_ld.iloc[:, ii_snp]
-        M_annot = M_annot[:, ii_m]
-    return M_annot, ref_ld
+    # Set SNP as index
+    if "index" in ref_ld.columns:
+        ref_ld.rename(columns={"index": "SNP"}, inplace=True)
+    if "SNP" in ref_ld.columns:
+        ref_ld.set_index("SNP", inplace=True)
+
+    return ref_ld
 
 
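Compared with the old version, which hard-coded `range(1, 23)` and feather-only input, `_read_ref_ld_v2` now discovers whatever per-chromosome files exist and raises `FileNotFoundError` when none match. A call sketch under an assumed file layout (the prefix is illustrative):

    from gsMap.utils.regression_read import _read_ref_ld_v2

    # Assumes /data/baseline.{1..22}.l2.ldscore.feather exist (illustrative layout).
    ref_ld = _read_ref_ld_v2("/data/baseline.")
    # One row per SNP (indexed by "SNP"), one column per LD annotation.
    print(ref_ld.shape)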
 def _read_w_ld(w_file):
+    """Read LD weights for all chromosomes."""
     suffix = ".l2.ldscore"
-[old lines 173-176: removed code not rendered in the source diff]
+    logger.info(f"Reading LD score annotations from {w_file}[1-22]{suffix}...")
+
+    # Get the chromosome files
+    chr_files = _read_chr_files(w_file, suffix)
+
+    if not chr_files:
+        logger.error(f"No LD score files found matching pattern: {w_file}*{suffix}*")
+        raise FileNotFoundError(f"No LD score files found matching pattern: {w_file}*{suffix}*")
+
+    # Read and process each file
     w_array = []
-
-
-    for chr in range(1, 23):
-        file_chr = f"{file}{chr}{suffix}{s}"
-        #
-        if compression == "parquet":
-            x = pd.read_parquet(file_chr)
-        elif compression == "feather":
-            x = pd.read_feather(file_chr)
-        else:
-            x = pd.read_csv(file_chr, compression=compression, sep="\t")
+    for file in chr_files:
+        x = _read_file(file)
 
-
+        # Sort if possible
+        if "CHR" in x.columns and "BP" in x.columns:
+            x = x.sort_values(by=["CHR", "BP"])
 
+        # Drop unnecessary columns
         columns_to_drop = ["MAF", "CM", "Gene", "TSS", "CHR", "BP"]
         columns_to_drop = [col for col in columns_to_drop if col in x.columns]
-
+        if columns_to_drop:
+            x = x.drop(columns=columns_to_drop, axis=1)
 
         w_array.append(x)
-
+
+    # Concatenate and set column names
     w_ld = pd.concat(w_array, axis=0)
-
+    logger.info(f"Loaded {len(w_ld)} SNPs from LD weight files")
+
+    # Set column names
+    w_ld.columns = (
+        ["SNP", "LD_weights"] + list(w_ld.columns[2:])
+        if len(w_ld.columns) > 2
+        else ["SNP", "LD_weights"]
+    )
 
     return w_ld
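`_read_w_ld` follows the same discovery path but additionally sorts by CHR/BP when present, drops positional helper columns, and renames the first two columns to SNP and LD_weights. A call sketch (prefix illustrative):

    from gsMap.utils.regression_read import _read_w_ld

    # Assumes /data/weights.{1..22}.l2.ldscore.gz exist (illustrative layout).
    w_ld = _read_w_ld("/data/weights.")
    print(w_ld.columns[:2].tolist())  # ['SNP', 'LD_weights']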
{gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
-gsMap/__init__.py,sha256=
+gsMap/__init__.py,sha256=hRDqmAAKm9MYDtkkkVvoAJQDZAlG2yZ5nafywVU2Ufo,77
 gsMap/__main__.py,sha256=Vdhw8YA1K3wPMlbJQYL5WqvRzAKVeZ16mZQFO9VRmCo,62
 gsMap/cauchy_combination_test.py,sha256=SiUyqJKr4ATFtRgsCEJ43joGcSagCOnnurkB1FlQiB4,5105
-gsMap/config.py,sha256=
+gsMap/config.py,sha256=xQQJKqe-ZLohxzEZ0L_CEXXbbUK-U6-H6BnISteqrHs,51316
 gsMap/create_slice_mean.py,sha256=Nnmb7ACtS-9TurW5xQ4TqCinejPsYcvuT5Oxqa5Uges,5723
-gsMap/diagnosis.py,sha256=
+gsMap/diagnosis.py,sha256=Z-zJriPge0_kUbU-S41w7cPT2xYFlDVzbp6p6QMoKQc,13025
 gsMap/find_latent_representation.py,sha256=aZ5fFY2RhAsNaDeoehd5lN28556d6GGHK9xEUTvo6G4,5365
 gsMap/format_sumstats.py,sha256=1c9OgbqDQWOgXeSrbAhbJfChv_2IwXIgLE6Pbw2sx0s,13778
-gsMap/generate_ldscore.py,sha256=
+gsMap/generate_ldscore.py,sha256=9Qlx8na0w82U8UsSvdPCsDbNAxNFPHKYuUjY4M04fOg,35363
 gsMap/latent_to_gene.py,sha256=sDPvOU4iF-HkfQY0nnkIVXpjyTQ9-PjQflwEFWrPg-A,12869
 gsMap/main.py,sha256=SzfAXhrlr4LXnSD4gkvAtUUPYXyra6a_MzVCxDBZjr0,1170
 gsMap/report.py,sha256=_1FYkzGhVGMnvHgEQ8z51iMrVEVlh48a31jLqbV2o9w,6953
@@ -20,12 +20,12 @@ gsMap/GNN/model.py,sha256=75In9sxBkaqqpCQSrQEUO-zsQQVQnkXVbKsAgyAZjiQ,2918
 gsMap/GNN/train.py,sha256=4qipaxaz3rQOtlRpTYCfl1Oz4kz_A6vNB1aw8_gGK_k,3076
 gsMap/templates/report_template.html,sha256=QODZEbVxpW1xsLz7lDrD_DyUfzYoi9E17o2tLJlf8OQ,8016
 gsMap/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-gsMap/utils/generate_r2_matrix.py,sha256=
+gsMap/utils/generate_r2_matrix.py,sha256=0FEbSEiZhNj3nnnt9V-fp7WWPLpfBci3tP4ydBbG280,20114
 gsMap/utils/jackknife.py,sha256=w_qMj9GlqViouHuOw1U80N6doWuCTXuPoAVU4P-5mm8,17673
 gsMap/utils/manhattan_plot.py,sha256=4ok5CHAaT_MadyMPnFZMR_llmE8Vf4-KiEfametgHq0,25480
-gsMap/utils/regression_read.py,sha256=
-gsmap-1.73.
-gsmap-1.73.
-gsmap-1.73.
-gsmap-1.73.
-gsmap-1.73.
+gsMap/utils/regression_read.py,sha256=uBSKlvYVhUKmDSCBvKHQrE1wLNyvK-rbzc5TJV51oDI,5649
+gsmap-1.73.4.dist-info/entry_points.txt,sha256=s_P2Za22O077tc1FPLKMinbdRVXaN_HTcDBgWMYpqA4,41
+gsmap-1.73.4.dist-info/licenses/LICENSE,sha256=fb5WP6qQytSKO5rM0ZSqQXg_92Fdt0aAeFNwSi3Lpmc,1069
+gsmap-1.73.4.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+gsmap-1.73.4.dist-info/METADATA,sha256=fyLpDSS5SEIyPj9rZ7ymcXPIOCLcAU2j-OW0D5xC2GA,8196
+gsmap-1.73.4.dist-info/RECORD,,
{gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/WHEEL
File without changes
{gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/entry_points.txt
File without changes
{gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/licenses/LICENSE
File without changes