gsMap-1.60-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,294 @@
+ import numpy as np
+ import pandas as pd
+ import os
+ 
+ 
+ # Functions for reading GWAS summary statistics
+ def _read_sumstats(fh, alleles=False, dropna=False):
+     '''
+     Parse GWAS summary statistics.
+     '''
+     print('Reading summary statistics from {S} ...'.format(S=fh))
+     sumstats = ps_sumstats(fh, alleles=alleles, dropna=dropna)
+     print('Read summary statistics for {N} SNPs.'.format(N=len(sumstats)))
+ 
+     m = len(sumstats)
+     sumstats = sumstats.drop_duplicates(subset='SNP')
+     if m > len(sumstats):
+         print('Dropped {M} SNPs with duplicated rs numbers.'.format(M=m - len(sumstats)))
+ 
+     return sumstats
+ 
+ 
+ def ps_sumstats(fh, alleles=False, dropna=True):
+     '''
+     Parses .sumstats files. See docs/file_formats_sumstats.txt.
+     '''
+ 
+     dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str}
+     compression = get_compression(fh)
+     usecols = ['SNP', 'Z', 'N']
+     if alleles:
+         usecols += ['A1', 'A2']
+ 
+     try:
+         x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression)
+     except (AttributeError, ValueError) as e:
+         raise ValueError('Improperly formatted sumstats file: ' + str(e.args))
+ 
+     if dropna:
+         x = x.dropna(how='any')
+ 
+     return x
+ 
+ 
+ def get_compression(fh):
+     '''
+     Determine the compression format to pass to read_csv.
+     '''
+     if fh.endswith('gz'):
+         compression = 'gzip'
+     elif fh.endswith('bz2'):
+         compression = 'bz2'
+     else:
+         compression = None
+ 
+     return compression
+ 
+ 
+ def read_csv(fh, **kwargs):
+     '''
+     Read whitespace-delimited summary statistics, treating '.' as missing.
+     '''
+     return pd.read_csv(fh, delim_whitespace=True, na_values='.', **kwargs)
+ 
+ 
+ # Functions for reading LD scores
+ def which_compression(fh):
+     '''
+     Given a file prefix, figure out what sort of compression to use.
+     '''
+     if os.access(fh + '.bz2', os.R_OK):
+         suffix = '.bz2'
+         compression = 'bz2'
+     elif os.access(fh + '.gz', os.R_OK):
+         suffix = '.gz'
+         compression = 'gzip'
+     elif os.access(fh + '.parquet', os.R_OK):
+         suffix = '.parquet'
+         compression = 'parquet'
+     elif os.access(fh + '.feather', os.R_OK):
+         suffix = '.feather'
+         compression = 'feather'
+     elif os.access(fh, os.R_OK):
+         suffix = ''
+         compression = None
+     else:
+         raise IOError('Could not open {F}[./gz/bz2/parquet/feather]'.format(F=fh))
+ 
+     return suffix, compression
+ 
+ 
+ def _read_ref_ld(ld_file):
+     suffix = '.l2.ldscore'
+     file = ld_file
+     first_fh = f'{file}1{suffix}'
+     s, compression = which_compression(first_fh)
+ 
+     ldscore_array = []
+     print(f'Reading LD score annotations from {file}[1-22]{suffix}{s}')
+ 
+     for chr in range(1, 23):
+         file_chr = f'{file}{chr}{suffix}{s}'
+ 
+         if compression == 'parquet':
+             x = pd.read_parquet(file_chr)
+         elif compression == 'feather':
+             x = pd.read_feather(file_chr)
+         else:
+             x = pd.read_csv(file_chr, compression=compression, sep='\t')
+ 
+         x = x.sort_values(by=['CHR', 'BP'])  # SEs will be wrong unless sorted
+ 
+         columns_to_drop = ['MAF', 'CM', 'Gene', 'TSS', 'CHR', 'BP']
+         columns_to_drop = [col for col in columns_to_drop if col in x.columns]
+         x = x.drop(columns_to_drop, axis=1)
+ 
+         ldscore_array.append(x)
+ 
+     ref_ld = pd.concat(ldscore_array, axis=0)
+     return ref_ld
+ 
+ 
+ def _read_ref_ld_v2(ld_file):
+     suffix = '.l2.ldscore'
+     file = ld_file
+     first_fh = f'{file}1{suffix}'
+     s, compression = which_compression(first_fh)
+     print(f'Reading LD score annotations from {file}[1-22]{suffix}{s}')
+     ref_ld = pd.concat(
+         [pd.read_feather(f'{file}{chr}{suffix}{s}') for chr in range(1, 23)], axis=0
+     )
+     # use the SNP identifier column as the index
+     ref_ld.rename(columns={'index': 'SNP'}, inplace=True)
+     ref_ld.set_index('SNP', inplace=True)
+     return ref_ld
+ 
+ 
+ def _read_M_v2(ld_file, n_annot, not_M_5_50):
+     suffix = '.l2.M'
+     if not not_M_5_50:
+         suffix += '_5_50'
+     M_annot = np.array(
+         [np.loadtxt(f'{ld_file}{chr}{suffix}') for chr in range(1, 23)]
+     )
+     assert M_annot.shape == (22, n_annot)
+     return M_annot.sum(axis=0).reshape((1, n_annot))
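+ 
+ # Note: each {ld_file}{chr}.l2.M (or .l2.M_5_50) file is assumed to hold a single
+ # whitespace-separated row of per-annotation SNP counts (the _5_50 variant counts
+ # only SNPs with MAF between 5% and 50%), so summing over the 22 chromosomes gives
+ # the total M for each annotation.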
+ 
+ 
+ # Functions for reading M annotations
+ def _read_M(ld_file, n_annot, not_M_5_50):
+     '''
+     Read M (--M, --M-file, etc).
+     '''
+     M_annot = M(ld_file, common=(not not_M_5_50))
+ 
+     try:
+         M_annot = np.array(M_annot).reshape((1, n_annot))
+     except ValueError as e:
+         raise ValueError('# terms in --M must match # of LD Scores in --ref-ld.\n' + str(e.args))
+     return M_annot
+ 
+ 
+ def M(fh, common=False):
+     '''
+     Parses .l{N}.M files, split across the 22 chromosomes.
+     '''
+     suffix = '.l2.M'
+     if common:
+         suffix += '_5_50'
+ 
+     M_array = []
+     for i in range(1, 23):
+         M_current = pd.read_csv(f'{fh}{i}{suffix}', header=None)
+         M_array.append(M_current)
+ 
+     M_array = pd.concat(M_array, axis=1).sum(axis=1)
+ 
+     return np.array(M_array).reshape((1, len(M_array)))
+ 
+ 
+ def _check_variance(M_annot, ref_ld):
+     '''
+     Remove zero-variance LD Scores.
+     '''
+     ii = ref_ld.iloc[:, 1:].var() == 0  # NB there is a SNP column here
+     if ii.all():
+         raise ValueError('All LD Scores have zero variance.')
+     else:
+         print('Removing partitioned LD Scores with zero variance.')
+         ii_snp = np.array([True] + list(~ii))
+         ii_m = np.array(~ii)
+         ref_ld = ref_ld.iloc[:, ii_snp]
+         M_annot = M_annot[:, ii_m]
+ 
+     return M_annot, ref_ld, ii
+ 
+ 
+ def _check_variance_v2(M_annot, ref_ld):
+     ii = ref_ld.var() == 0
+     if ii.all():
+         raise ValueError('All LD Scores have zero variance.')
+     elif not ii.any():
+         print('No partitioned LD Scores have zero variance.')
+     else:
+         ii_snp = ii_m = np.array(~ii)
+         print(f'Removing {sum(ii)} partitioned LD Scores with zero variance.')
+         ref_ld = ref_ld.iloc[:, ii_snp]
+         M_annot = M_annot[:, ii_m]
+     return M_annot, ref_ld
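+ 
+ # Zero-variance LD Score columns carry no information and would make the regression
+ # design matrix rank-deficient, so both _check_variance variants drop them from
+ # ref_ld together with the matching columns of M_annot.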
+ 
+ 
+ # Functions for reading regression weights
+ # (_read_w_ld below reuses which_compression, defined above)
+ 
+ 
+ def _read_w_ld(w_file):
+     suffix = '.l2.ldscore'
+     file = w_file
+     first_fh = f'{file}1{suffix}'
+     s, compression = which_compression(first_fh)
+ 
+     w_array = []
+     print(f'Reading LD score annotations from {file}[1-22]{suffix}{s}')
+ 
+     for chr in range(1, 23):
+         file_chr = f'{file}{chr}{suffix}{s}'
+ 
+         if compression == 'parquet':
+             x = pd.read_parquet(file_chr)
+         elif compression == 'feather':
+             x = pd.read_feather(file_chr)
+         else:
+             x = pd.read_csv(file_chr, compression=compression, sep='\t')
+ 
+         x = x.sort_values(by=['CHR', 'BP'])
+ 
+         columns_to_drop = ['MAF', 'CM', 'Gene', 'TSS', 'CHR', 'BP']
+         columns_to_drop = [col for col in columns_to_drop if col in x.columns]
+         x = x.drop(columns_to_drop, axis=1)
+ 
+         w_array.append(x)
+ 
+     w_ld = pd.concat(w_array, axis=0)
+     w_ld.columns = ['SNP', 'LD_weights']
+ 
+     return w_ld
+ 
+ 
+ # Functions for merging
+ def _merge_and_log(ld, sumstats, noun):
+     '''
+     Wrap smart_merge with log messages about the number of SNPs.
+     '''
+     sumstats = smart_merge(ld, sumstats)
+     msg = 'After merging with {F}, {N} SNPs remain.'
+     if len(sumstats) == 0:
+         raise ValueError(msg.format(N=len(sumstats), F=noun))
+     else:
+         print(msg.format(N=len(sumstats), F=noun))
+ 
+     return sumstats
+ 
+ 
+ def smart_merge(x, y):
+     '''
+     Check if SNP columns are equal. If so, save time by using concat instead of merge.
+     '''
+     if len(x) == len(y) and (x.index == y.index).all() and (x.SNP == y.SNP).all():
+         x = x.reset_index(drop=True)
+         y = y.reset_index(drop=True).drop('SNP', axis=1)
+         out = pd.concat([x, y], axis=1)
+     else:
+         out = pd.merge(x, y, how='inner', on='SNP')
+     return out
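+ 
+ 
+ # Illustrative sketch only (not executed in this module); the file name and
+ # <w_prefix> below are placeholders for a real sumstats file and a per-chromosome
+ # .l2.ldscore prefix. A caller is expected to combine these helpers roughly as:
+ #     sumstats = _read_sumstats('trait.sumstats.gz')
+ #     w_ld = _read_w_ld('<w_prefix>')
+ #     sumstats = _merge_and_log(w_ld, sumstats, 'regression SNP LD')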
@@ -0,0 +1,307 @@
+ import os
+ import numpy as np
+ import pandas as pd
+ 
+ import argparse
+ import logging
+ import multiprocessing
+ from collections import defaultdict
+ from pathlib import Path
+ 
+ from scipy.stats import norm
+ from tqdm.contrib.concurrent import process_map
+ 
+ import gsMap.jackknife as jk
+ from gsMap.config import add_spatial_ldsc_args, SpatialLDSCConfig
+ from gsMap.regression_read import _read_sumstats, _read_w_ld, _read_ref_ld_v2, _read_M_v2
+ 
+ logger = logging.getLogger(__name__)
+ 
+ 
+ # %%
+ def _coef_new(jknife):
+     # return the jackknife point estimate and SE of the first (spatial) coefficient,
+     # rescaled by the mean GWAS sample size Nbar (a module-level global set in run_spatial_ldsc)
+     est_ = jknife.est[0, 0] / Nbar
+     se_ = jknife.jknife_se[0, 0] / Nbar
+     return est_, se_
+ 
+ 
+ def append_intercept(x):
+     # append a column of ones so the regression includes an intercept term
+     n_row = x.shape[0]
+     intercept = np.ones((n_row, 1))
+     x_new = np.concatenate((x, intercept), axis=1)
+     return x_new
+ 
+ 
+ def filter_sumstats_by_chisq(sumstats, chisq_max):
+     before_len = len(sumstats)
+     if chisq_max is None:
+         chisq_max = max(0.001 * sumstats.N.max(), 80)
+         logger.info(f'No chi^2 threshold provided, using {chisq_max} as default')
+     sumstats['chisq'] = sumstats.Z ** 2
+     sumstats = sumstats[sumstats.chisq < chisq_max]
+     after_len = len(sumstats)
+     if after_len < before_len:
+         logger.info(f'Removed {before_len - after_len} SNPs with chi^2 > {chisq_max} ({after_len} SNPs remain)')
+     else:
+         logger.info(f'No SNPs removed with chi^2 > {chisq_max} ({after_len} SNPs remain)')
+     return sumstats
+ 
+ 
+ def aggregate(y, x, N, M, intercept=1):
+     # method-of-moments estimate of the total h2, used to initialise the regression weights
+     num = M * (np.mean(y) - intercept)
+     denom = np.mean(np.multiply(x, N))
+     return num / denom
+ 
+ 
+ def weights(ld, w_ld, N, M, hsq, intercept=1):
+     M = float(M)
+     hsq = np.clip(hsq, 0.0, 1.0)
+     ld = np.maximum(ld, 1.0)
+     w_ld = np.maximum(w_ld, 1.0)
+     c = hsq * N / M
+     het_w = 1.0 / (2 * np.square(intercept + np.multiply(c, ld)))
+     oc_w = 1.0 / w_ld
+     w = np.multiply(het_w, oc_w)
+     return w
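+ 
+ # weights() follows the standard LDSC weighting scheme: het_w approximates the
+ # inverse variance of chi^2 for SNP j, 1 / (2 * (intercept + N * h2 / M * l_j)^2),
+ # while oc_w = 1 / w_ld_j downweights SNPs whose signal is counted repeatedly
+ # because many regression SNPs tag them; the product of the two is the regression weight.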
+ 
+ 
+ def jackknife_for_processmap(spot_id):
+     # calculate the initial weight for each spot
+     initial_w = (
+         get_weight_optimized(sumstats, x_tot_precomputed_common_snp[:, spot_id], 10000, w_ld_common_snp, intercept=1)
+         .astype(np.float32)
+         .reshape((-1, 1)))
+ 
+     # apply the weight to the baseline annotation, the spatial annotation and chi^2
+     initial_w_scaled = initial_w / np.sum(initial_w)
+     baseline_annotation_spot = baseline_annotation * initial_w_scaled
+     spatial_annotation_spot = spatial_annotation.iloc[:, spot_id].values.reshape((-1, 1)) * initial_w_scaled
+     CHISQ = sumstats.chisq.to_numpy(dtype=np.float32).reshape((-1, 1)).copy()
+     y = CHISQ * initial_w_scaled
+ 
+     # run the jackknife
+     x_focal = np.concatenate((spatial_annotation_spot,
+                               baseline_annotation_spot), axis=1)
+     try:
+         jknife = jk.LstsqJackknifeFast(x_focal, y, n_blocks)
+     except np.linalg.LinAlgError as e:
+         logger.warning(f'LinAlgError: {e}')
+         return np.nan, np.nan
+     return _coef_new(jknife)
+ 
+ 
+ def get_weight_optimized(sumstats, x_tot_precomputed, M_tot, w_ld, intercept=1):
+     tot_agg = aggregate(sumstats.chisq, x_tot_precomputed, sumstats.N, M_tot, intercept)
+     initial_w = weights(x_tot_precomputed, w_ld.LD_weights.values, sumstats.N.values, M_tot, tot_agg, intercept)
+     initial_w = np.sqrt(initial_w)
+     return initial_w
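+ 
+ # get_weight_optimized returns the square root of the LDSC weight: multiplying both
+ # the annotation matrix and the chi^2 vector by it (as done in jackknife_for_processmap)
+ # is equivalent to fitting the weighted regression with the weight itself.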
+ 
+ 
+ def _preprocess_sumstats(trait_name, sumstat_file_path, baseline_and_w_ld_common_snp: pd.Index, chisq_max=None):
+     # Load the GWAS summary statistics
+     sumstats = _read_sumstats(fh=sumstat_file_path, alleles=False, dropna=False)
+     sumstats.set_index('SNP', inplace=True)
+     sumstats = sumstats.astype(np.float32)
+     sumstats = filter_sumstats_by_chisq(sumstats, chisq_max)
+ 
+     # NB: the intersection order is essential to keep the SNPs ordered by their BP location
+     common_snp = baseline_and_w_ld_common_snp.intersection(sumstats.index)
+     if len(common_snp) < 200000:
+         logger.warning(f'WARNING: number of SNPs is less than 200k; for {trait_name} this is almost always bad.')
+ 
+     sumstats = sumstats.loc[common_snp]
+     return sumstats
+ 
+ 
+ def _get_sumstats_from_sumstats_dict(sumstats_config_dict: dict, baseline_and_w_ld_common_snp: pd.Index,
+                                      chisq_max=None):
+     # first validate that all sumstats files exist
+     logger.info('Validating sumstats files...')
+     for trait_name, sumstat_file_path in sumstats_config_dict.items():
+         if not os.path.exists(sumstat_file_path):
+             raise FileNotFoundError(f'{sumstat_file_path} not found')
+     # then load all sumstats
+     sumstats_cleaned_dict = {}
+     for trait_name, sumstat_file_path in sumstats_config_dict.items():
+         sumstats_cleaned_dict[trait_name] = _preprocess_sumstats(trait_name, sumstat_file_path,
+                                                                  baseline_and_w_ld_common_snp, chisq_max)
+     logger.info('Cleaned sumstats loaded')
+     return sumstats_cleaned_dict
+ 
+ 
+ def run_spatial_ldsc(config: SpatialLDSCConfig):
+     global spatial_annotation, baseline_annotation, n_blocks, Nbar, sumstats, x_tot_precomputed_common_snp, w_ld_common_snp
+     # config
+     n_blocks = config.n_blocks
+     sample_name = config.sample_name
+ 
+     print(f'------Running Spatial LDSC for {sample_name}...')
+     # Load the regression weights
+     w_ld = _read_w_ld(config.w_file)
+     w_ld_cname = w_ld.columns[1]
+     w_ld.set_index('SNP', inplace=True)
+ 
+     # Load the baseline annotations
+     ld_file_baseline = f'{config.ldscore_input_dir}/baseline/baseline.'
+     ref_ld_baseline = _read_ref_ld_v2(ld_file_baseline)
+     n_annot_baseline = len(ref_ld_baseline.columns)
+     M_annot_baseline = _read_M_v2(ld_file_baseline, n_annot_baseline, config.not_M_5_50)
+ 
+     # keep the SNPs shared between the baseline annotations and the regression weights
+     baseline_and_w_ld_common_snp = ref_ld_baseline.index.intersection(w_ld.index)
+     if len(baseline_and_w_ld_common_snp) < 200000:
+         logger.warning(f'WARNING: number of SNPs is less than 200k; for {sample_name} this is almost always bad.')
+     ref_ld_baseline = ref_ld_baseline.loc[baseline_and_w_ld_common_snp]
+ 
+     # load additional baseline annotations
+     if config.use_additional_baseline_annotation:
+         ld_file_baseline_additional = f'{config.ldscore_input_dir}/additional_baseline/baseline.'
+         ref_ld_baseline_additional = _read_ref_ld_v2(ld_file_baseline_additional)
+         n_annot_baseline_additional = len(ref_ld_baseline_additional.columns)
+         logger.info(f'{len(ref_ld_baseline_additional.columns)} additional baseline annotations loaded')
+         # M_annot_baseline_additional = _read_M_v2(ld_file_baseline_additional, n_annot_baseline_additional,
+         #                                          config.not_M_5_50)
+         ref_ld_baseline_additional = ref_ld_baseline_additional.loc[baseline_and_w_ld_common_snp]
+         ref_ld_baseline = pd.concat([ref_ld_baseline, ref_ld_baseline_additional], axis=1)
+         del ref_ld_baseline_additional
+ 
+     w_ld = w_ld.loc[baseline_and_w_ld_common_snp]
+ 
+     # Clean the sumstats
+     sumstats_cleaned_dict = _get_sumstats_from_sumstats_dict(config.sumstats_config_dict, baseline_and_w_ld_common_snp,
+                                                              chisq_max=config.chisq_max)
+ 
+     # Detect the available chunk files
+     all_file = os.listdir(config.ldscore_input_dir)
+     if config.all_chunk is None:
+         all_chunk = sum('chunk' in name for name in all_file)
+         print(f'Found {all_chunk} chunked files')
+     else:
+         all_chunk = config.all_chunk
+         print(f'Using {all_chunk} chunked files, as specified by the provided argument')
+ 
+     # Process each chunk
+     output_dict = defaultdict(list)
+     for chunk_index in range(1, all_chunk + 1):
+         print(f'------Processing chunk-{chunk_index}')
+ 
+         # Load the spatial annotations for this chunk
+         ld_file_spatial = f'{config.ldscore_input_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.'
+         ref_ld_spatial = _read_ref_ld_v2(ld_file_spatial)
+         ref_ld_spatial = ref_ld_spatial.loc[baseline_and_w_ld_common_snp]
+         ref_ld_spatial = ref_ld_spatial.astype(np.float32, copy=False)
+ 
+         # precompute x_tot by adding each spatial annotation to the summed baseline annotations
+         x_tot_precomputed = ref_ld_spatial + ref_ld_baseline.sum(axis=1).values.reshape((-1, 1))
+ 
+         for trait_name, sumstats in sumstats_cleaned_dict.items():
+             logger.info(f'Processing {trait_name}...')
+ 
+             # restrict the LD scores to the SNPs present in this trait's sumstats
+             common_snp = sumstats.index
+             spatial_annotation = ref_ld_spatial.loc[common_snp].astype(np.float32, copy=False)
+             spatial_annotation_cnames = spatial_annotation.columns
+             baseline_annotation = ref_ld_baseline.loc[common_snp].astype(np.float32, copy=False)
+             w_ld_common_snp = w_ld.loc[common_snp].astype(np.float32, copy=False)
+             x_tot_precomputed_common_snp = x_tot_precomputed.loc[common_snp].values
+ 
+             # weight the baseline annotation by N
+             baseline_annotation = baseline_annotation * sumstats.N.values.reshape((-1, 1)) / sumstats.N.mean()
+             # append an intercept column
+             baseline_annotation = append_intercept(baseline_annotation)
+ 
+             # Run the jackknife over every spot in this chunk
+             Nbar = sumstats.N.mean()
+             chunk_size = spatial_annotation.shape[1]
+             out_chunk = process_map(jackknife_for_processmap, range(chunk_size),
+                                     max_workers=config.num_processes,
+                                     chunksize=10,
+                                     desc=f'LDSC chunk-{chunk_index}: {trait_name}')
+ 
+             # cache the results
+             out_chunk = pd.DataFrame.from_records(out_chunk,
+                                                   columns=['beta', 'se'],
+                                                   index=spatial_annotation_cnames)
+             # identify the spots whose regression failed (NaN estimates)
+             nan_spots = out_chunk[out_chunk.isna().any(axis=1)].index
+             logger.info(f'NaN spots: {nan_spots} in chunk-{chunk_index} for {trait_name}. They are removed.')
+             # drop the NaN spots
+             out_chunk = out_chunk.dropna()
+ 
+             out_chunk['z'] = out_chunk.beta / out_chunk.se
+             out_chunk['p'] = norm.sf(out_chunk['z'])
+             output_dict[trait_name].append(out_chunk)
+ 
+         # free the spatial annotations of this chunk before loading the next one
+         del spatial_annotation
+ 
+     # Save the results
+     out_dir = Path(config.ldsc_save_dir)
+     out_dir.mkdir(parents=True, exist_ok=True, mode=0o777)
+     for trait_name, out_chunk_list in output_dict.items():
+         out_all = pd.concat(out_chunk_list, axis=0)
+         out_file_name = out_dir / f'{sample_name}_{trait_name}.csv.gz'
+         out_all['spot'] = out_all.index
+         out_all = out_all[['spot', 'beta', 'se', 'z', 'p']]
+         out_all.to_csv(out_file_name, compression='gzip', index=False)
+         logger.info(f'Output saved to {out_file_name} for {trait_name}')
+     logger.info(f'------Spatial LDSC for {sample_name} finished!')
+ 
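+ # Each {sample_name}_{trait_name}.csv.gz written above holds one row per spot with
+ # columns spot, beta, se, z and p, where z = beta / se and p = norm.sf(z) is the
+ # one-sided normal tail probability.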
+ 
+ 
+ # %%
+ if __name__ == '__main__':
+     # Main entry point of the analysis
+     parser = argparse.ArgumentParser(
+         description="Run Spatial LD Score Regression (LDSC) analysis for GWAS and spatial transcriptomic data."
+     )
+     parser = add_spatial_ldsc_args(parser)
+     TEST = True
+     if TEST:
+         gwas_root = "/storage/yangjianLab/songliyang/GWAS_trait/LDSC"
+         gwas_trait = "/storage/yangjianLab/songliyang/GWAS_trait/GWAS_Public_Use_MaxPower.csv"
+         root = "/storage/yangjianLab/songliyang/SpatialData/Data/Brain/Human/Nature_Neuroscience_2021/processed/h5ad"
+ 
+         name = 'Cortex_151507'
+         spe_name = name
+         # ld_pth = f"/storage/yangjianLab/songliyang/SpatialData/Data/Brain/Human/Nature_Neuroscience_2021/annotation/{spe_name}/snp_annotation"
+         ld_pth = f"/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/snake_workdir/{name}/generate_ldscore"
+         out_pth = f"/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/snake_workdir/{name}/ldsc"
+         gwas_file = "ADULT1_ADULT2_ONSET_ASTHMA"
+         # Prepare the argument list using f-strings
+         args_list = [
+             "--h2", f"{gwas_root}/{gwas_file}.sumstats.gz",
+             "--w_file", "/storage/yangjianLab/sharedata/LDSC_resource/LDSC_SEG_ldscores/weights_hm3_no_hla/weights.",
+             "--sample_name", spe_name,
+             "--num_processes", '4',
+             "--ldscore_input_dir", ld_pth,
+             "--ldsc_save_dir", out_pth,
+             '--trait_name', 'adult1_adult2_onset_asthma'
+         ]
+         # args = parser.parse_args(args_list)
+     else:
+         args = parser.parse_args()
+ 
+     os.chdir('/storage/yangjianLab/chenwenhao/tmp/gsMap_Height_debug')
+     TASK_ID = 16
+     spe_name = f'E{TASK_ID}.5_E1S1'
+     config = SpatialLDSCConfig(**{'all_chunk': None,
+                                   'chisq_max': None,
+                                   # 'sumstats_file': '/storage/yangjianLab/songliyang/GWAS_trait/LDSC/GIANT_EUR_Height_2022_Nature.sumstats.gz',
+                                   'ldsc_save_dir': f'{spe_name}/ldsc_results_three_row_sum_sub_config_traits',
+                                   'ldscore_input_dir': '/storage/yangjianLab/songliyang/SpatialData/Data/Embryo/Mice/Cell_MOSTA/annotation/E16.5_E1S1/generate_ldscore_new',
+                                   'n_blocks': 200,
+                                   'not_M_5_50': False,
+                                   'num_processes': 15,
+                                   'sample_name': spe_name,
+                                   # 'trait_name': 'GIANT_EUR_Height_2022_Nature',
+                                   'sumstats_config_file': '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/src/gsMap/example/sumstats_config_sub.yaml',
+                                   'w_file': '/storage/yangjianLab/sharedata/LDSC_resource/LDSC_SEG_ldscores/weights_hm3_no_hla/weights.'
+                                   })
+     # config = SpatialLDSCConfig(**vars(args))
+     run_spatial_ldsc(config)